#!/usr/bin/env python

from __future__ import print_function
import os
import re
import string
import sys

from lxml import etree, html

################################################################
### main program
################################################################

def print_usage():
    """Write the tool description and a usage line to standard error."""
    description = "combine multiple hOCR documents into one\n\n"
    synopsis = "usage: %s file1.html file2.html...\n" % sys.argv[0]
    for text in (description, synopsis):
        sys.stderr.write(text)

# An explicit help request as the first argument prints the usage text and
# exits successfully.
if sys.argv[1:2] in (['-h'], ['--help']):
    print_usage()
    sys.exit(0)

# Running without any input file is a usage error.
if len(sys.argv) <= 1:
    print_usage()
    sys.exit(1)

# The first document is the merge target: pages from every later file are
# moved into the element that holds its last ocr_page.
doc = html.parse(sys.argv[1])

pages = doc.xpath("//*[@class='ocr_page']")
if not pages:
    # Without an ocr_page there is no container to append to; report that
    # instead of failing with an IndexError on pages[-1].
    sys.stderr.write("%s: no ocr_page element found in %s\n"
                     % (sys.argv[0], sys.argv[1]))
    sys.exit(1)
container = pages[-1].getparent()

for fname in sys.argv[2:]:
    doc2 = html.parse(fname)
    pages = doc2.xpath("//*[@class='ocr_page']")
    for page in pages:
        # lxml trees have no DOM-style importNode(); appending an element
        # that belongs to another tree moves it into this one.
        container.append(page)

# tostring() returns bytes (ASCII by default, with non-ASCII text written as
# character references); decode so Python 3 prints markup, not a b'...' repr.
print(etree.tostring(doc, pretty_print=True).decode('ascii'))
