#!/usr/bin/env python

# Extract the image and the text of every ocr_line element (or of the element
# class selected with -e) found in an hOCR file.

from __future__ import print_function
import codecs
import getopt
import os
import re
import sys

from lxml import html
from PIL import Image


def assoc(key, list):
    """Look up *key* in an association list.

    *list* is an iterable of ``(key, value)`` pairs (for example the option
    list returned by ``getopt.getopt``).  The value of the first pair whose
    key equals *key* is returned; ``None`` is returned when no pair matches.
    """
    matching_values = (value for name, value in list if name == key)
    return next(matching_values, None)

def get_text(node):
    """Return all text found below *node* as a single string.

    Every text node selected by the XPath ``.//text()`` is concatenated in
    the order returned, and each run of whitespace in the result is replaced
    by one space.
    """
    joined = ''.join(node.xpath('.//text()'))
    return re.sub(r'\s+', ' ', joined)

def get_prop(node, name):
    """Return the value of property *name* from *node*'s ``title`` attribute.

    The title is read as a ``;``-separated list of entries, each of the form
    ``key value...``.  The value of the first entry whose key equals *name*
    is returned with surrounding whitespace and double quotes removed.

    Returns ``None`` when the node has no (or an empty) title or when no
    entry has the requested key, and ``''`` when the key is present without
    a value.
    """
    title = node.get("title")
    if not title:
        return None
    for prop in title.split(';'):
        parts = prop.split(None, 1)
        if not parts:
            # Empty entry, e.g. the one produced by a trailing ';'.
            continue
        if parts[0] != name:
            continue
        if len(parts) == 1:
            # Key given without any value.
            return ''
        # Strip whitespace first so that a closing quote followed by a space
        # (as in 'image "a.png" ; bbox ...') is removed as well.
        return parts[1].strip().strip('"')
    return None

def print_usage():
    """Print the command synopsis, prefixed with the program name, to stdout."""
    synopsis = "Usage: %s [-b image-dir] [-p file-pattern] [-e element-name] [hocr-file]"
    print(synopsis % sys.argv[0])

# --- command-line handling ---------------------------------------------------
# Explicit help request: show the synopsis and exit successfully.
if len(sys.argv) > 1 and sys.argv[1] in ('-h', '--help'):
    print_usage()
    sys.exit(0)
# No arguments and stdin attached to a terminal: there is no hOCR input to
# read, so show the synopsis instead of waiting for input.
if len(sys.argv) < 2 and sys.stdin.isatty():
    print_usage()
    sys.exit(0)
try:
    optlist, args = getopt.getopt(sys.argv[1:], "b:p:e:")
except getopt.GetoptError as err:
    # Unknown option or missing option argument: report it together with the
    # synopsis instead of ending in a traceback.
    print("Error: %s" % err)
    print_usage()
    sys.exit(1)

# -b: directory in which the page images are looked up (optional)
basename = assoc('-b', optlist)
# -p: %-format pattern for the line image file names
pattern = assoc('-p', optlist) or 'line-%03d.png'
# -e: class attribute value of the elements to extract
element = assoc('-e', optlist) or 'ocr_line'

# Derive the text file pattern from the image pattern: a three-character
# extension is replaced by "txt", otherwise ".txt" is appended.  The length
# check keeps patterns shorter than four characters from raising IndexError.
tpattern = pattern + '.txt'
if len(pattern) >= 4 and pattern[-4] == '.':
    tpattern = pattern[:-3] + 'txt'

if len(args) > 1:
    print("Error: Too many args")
    print_usage()
    sys.exit(1)
# Read the hOCR document from the named file, or from stdin when none is given.
if len(args) == 1:
    doc = html.parse(args[0])
else:
    doc = html.parse(sys.stdin)

# --- extraction ----------------------------------------------------------------
pages = doc.xpath('//*[@class="ocr_page"]')
# Output files are numbered continuously across all pages so that the files
# of a later page never overwrite those written for an earlier page.
lcount = 1
for page in pages:
    # The page image is named by the 'file' property, falling back to 'image'.
    iname = get_prop(page, 'file')
    if not iname:
        iname = get_prop(page, 'image')
    if not iname:
        print("not found: page has no 'file' or 'image' property")
        sys.exit(1)
    if basename:
        # -b given: look the image up in that directory by its file name only.
        iname = os.path.join(basename, os.path.basename(iname))
    if not os.path.exists(iname):
        print("not found:", iname)
        sys.exit(1)
    image = Image.open(iname)
    # Relative path (".//"): select only the elements inside this page.  An
    # absolute "//" path would return the matching elements of every page.
    lines = page.xpath(".//*[@class='%s']" % element)
    for line in lines:
        bbox = [int(x) for x in (get_prop(line, 'bbox') or '').split()]
        # Explicit check rather than assert, which is stripped under -O.
        if len(bbox) != 4 or bbox[0] >= bbox[2] or bbox[1] >= bbox[3]:
            print("invalid bbox:", bbox)
            sys.exit(1)
        # Save the cropped region and, under the same number, its text.
        lineimage = image.crop(bbox)
        lineimage.save(pattern % lcount)
        # The with-statement guarantees the text file is flushed and closed.
        with codecs.open(tpattern % lcount, 'w', 'utf-8') as f:
            f.write(get_text(line))
        lcount += 1
