#!/usr/bin/env python

from __future__ import print_function
import os
import re
import sys
import xml

from lxml import etree, html

# Dublin Core element names (with the "dc:" prefix) that are copied into
# the hOCR header; any other dc:* element in the input is ignored.
dcknown = [
    "dc:title",
    "dc:creator",
    "dc:subject",
    "dc:description",
    "dc:publisher",
    "dc:contributor",
    "dc:date",
    "dc:type",
    "dc:format",
    "dc:identifier",
    "dc:source",
    "dc:language",
    "dc:relation",
    "dc:coverage",
    "dc:rights",
]

def get_text(node):
    """Return the text content found below *node* as a single string.

    All text nodes selected by the XPath ``.//text()`` are concatenated in
    document order, and every run of whitespace is collapsed to one space.
    Leading/trailing whitespace is collapsed but not removed.
    """
    joined = "".join(node.xpath(".//text()"))
    return re.sub(r'\s+', ' ', joined)

def print_usage():
    """Write a short description and the command-line usage to stderr."""
    emit = sys.stderr.write
    program = sys.argv[0]
    emit("merge Dublin Core metadata into hOCR header files\n\n")
    emit("usage: %s dc.xml hocr.html\n" % program)

### handle command line arguments

if len(sys.argv) > 1 and sys.argv[1] in ('-h', '--help'):
    print_usage()
    sys.exit(0)

# Both the Dublin Core file (argv[1]) and the hOCR file (argv[2]) are
# required, so anything shorter than three entries is a usage error.
if len(sys.argv) < 3:
    print_usage()
    sys.exit(1)

dc_doc = etree.parse(sys.argv[1], html.XHTMLParser())
hocr_doc = html.parse(sys.argv[2])


### remove all existing META tags representing Dublin Core metadata

hocr_heads = hocr_doc.xpath("//HEAD|//head")
if not hocr_heads:
    # Explicit check instead of ``assert``: assertions are stripped when
    # Python runs with -O, and a plain message is more useful to the user.
    sys.stderr.write("%s: no <head> element found in %s\n"
                     % (sys.argv[0], sys.argv[2]))
    sys.exit(1)
# New DC.* meta tags are appended to the first <head> element found.
hocr_meta = hocr_heads[0]

hocr_nodes = hocr_doc.xpath("//head//meta[starts-with(@name,'DC.')]")
for node in hocr_nodes:
    node.getparent().remove(node)

### find all the Dublin Core tags in the Dublin Core metadata

dc_nodes = dc_doc.xpath("//dc:*", namespaces={"dc":"http://purl.org/dc/elements/1.1/"})
for node in dc_nodes:
    # lxml reports namespaced tags as "{uri}local"; turn that into "dc:local"
    # so it can be compared against the names listed in dcknown.
    nt = re.sub(r'^{http://purl.org/dc/elements/1.1/}', 'dc:', node.tag)
    if nt in dcknown:
        # "dc:title" becomes the meta name "DC.title".
        name = re.sub(r'^dc:','DC.',nt)
        value = get_text(node)
        # Replace quotes and control whitespace with spaces, trim the ends,
        # and cap the attribute value at 500 characters.
        value = re.sub("[\t\r\n'\"]"," ",value).strip()
        value = value[:500]
        # NOTE(review): this namespace URI has no trailing slash, unlike the
        # one used in the XPath lookup above -- confirm whether intended.
        hnode = etree.Element("meta", nsmap = {'DC': 'http://purl.org/dc/elements/1.1'})
        hnode.attrib['name'] = name
        hnode.attrib['content'] = value
        hocr_meta.append(hnode)

### write the merged document to standard output

# etree.tostring() returns a byte string by default.  On Python 3, passing
# that to print() would emit its repr (b'...'), so the bytes are written to
# the binary layer of stdout instead (plain sys.stdout on Python 2).  The
# trailing newline that print() used to add is kept.
output = etree.tostring(hocr_doc, pretty_print=True)
stdout_bytes = getattr(sys.stdout, "buffer", sys.stdout)
stdout_bytes.write(output)
stdout_bytes.write(b"\n")
