#!/usr/bin/env python

# check the given file for conformance with the hOCR format spec

from __future__ import print_function
import getopt
import os
import re
import string
import sys

from lxml import html

################################################################
### misc library code
################################################################

# Running number of the most recently reported check (first check is 1).
TEST_COUNTER = 0
def test_ok(v, msg):
    """Report the outcome of one check on stderr.

    Increments the module-wide TEST_COUNTER and writes a single line of the
    form "ok N - msg" when `v` is truthy, or "not ok N - msg" otherwise.
    """
    global TEST_COUNTER
    TEST_COUNTER += 1
    status = "ok" if v else "not ok"
    sys.stderr.write("%s %d - " % (status, TEST_COUNTER) + msg + "\n")

def assoc(key,list):
    """Look up `key` in a sequence of (key, value) pairs.

    Returns the value of the first pair whose key equals `key`, or None
    when no pair matches.
    """
    return next((value for name, value in list if name == key), None)

### node properties

def get_prop(node,name):
    """Return the argument string of property `name` from a node's title.

    The title attribute is read with node.get('title') and split on ';'
    into properties of the form "key args...".

    Returns:
        The text following the key for the first property whose key equals
        `name`; '' if that property has a key but no arguments; None if the
        node has no (or an empty) title or no property matches.
    """
    title = node.get('title')
    if not title:
        return None
    for prop in title.split(';'):
        parts = prop.split(None, 1)
        if not parts:
            # empty segment (e.g. a trailing ';'): nothing to compare
            continue
        if parts[0] == name:
            # a bare key without arguments yields an empty argument string
            return parts[1] if len(parts) > 1 else ''
    return None
def get_bbox(node):
    """Return the node's 'bbox' property as a tuple of ints.

    Returns None when get_prop() yields no (or an empty) bbox value.
    """
    raw = get_prop(node, 'bbox')
    if raw:
        return tuple(int(token) for token in raw.split())
    return None

### rectangle properties

def intersect(u,v):
    """Return the intersection of two rectangles as a 4-tuple.

    The first two coordinates are the component-wise maxima and the last
    two the component-wise minima, so for non-overlapping inputs the result
    can have its third/fourth coordinate smaller than its first/second.
    """
    low_0 = max(u[0], v[0])
    low_1 = max(u[1], v[1])
    high_0 = min(u[2], v[2])
    high_1 = min(u[3], v[3])
    return (low_0, low_1, high_0, high_1)
def area(u):
    """Return the area of rectangle `u`.

    Each side length (u[2]-u[0] and u[3]-u[1]) is clamped at zero before
    multiplying, so rectangles with a negative extent have area 0.
    """
    extent_0 = u[2] - u[0]
    extent_1 = u[3] - u[1]
    return max(0, extent_0) * max(0, extent_1)
def overlaps(u,v):
    """Predicate: True when the intersection of `u` and `v` has positive area."""
    common = intersect(u, v)
    return area(common) > 0
def relative_overlap(u,v):
    """Return the overlap of two rectangles relative to the larger one.

    The result is area(intersect(u, v)) divided by the larger of the two
    rectangle areas, as a float. When both rectangles have zero area there
    is nothing to overlap, so 0.0 is returned instead of dividing by zero.
    """
    larger = max(area(u), area(v))
    if larger == 0:
        # both rectangles are degenerate; avoid ZeroDivisionError
        return 0.0
    return float(area(intersect(u, v))) / larger

def mostly_nonoverlapping(boxes,significant_overlap=0.2):
    """Check that no two boxes overlap by more than `significant_overlap`.

    Every unordered pair of entries in `boxes` is compared with
    relative_overlap(). Returns 0 as soon as one pair exceeds the
    threshold, and 1 otherwise (including for fewer than two boxes).
    """
    count = len(boxes)
    pairs = (
        (boxes[first], boxes[second])
        for first in range(count)
        for second in range(first + 1, count)
    )
    clash = any(relative_overlap(a, b) > significant_overlap for a, b in pairs)
    return 0 if clash else 1

################################################################
### main
################################################################

def print_usage():
    """Print a one-line usage summary, naming the script via sys.argv[0]."""
    print("usage: %s [-o] file.html" % sys.argv[0])

# An explicit help request is handled before regular option parsing.
if len(sys.argv)>1 and (sys.argv[1] == '-h' or sys.argv[1] == '--help'):
    print_usage()
    sys.exit(0)

# Parse the command line; the only recognised option is -o.
try:
    optlist,args = getopt.getopt(sys.argv[1:],"o")
except getopt.GetoptError as err:
    # an unknown option is a usage error, not a reason for a traceback
    print(err)
    print_usage()
    sys.exit(1)
# getopt reports a flag without argument as ('-o', ''), so nooverlap is
# True exactly when -o was given; it gates the geometric checks.
nooverlap = (assoc('-o',optlist)=='')

# The "too many files" case has to be tested before the single-file case,
# otherwise it can never be reached.
if len(args)>1:
    print("Can only check one file at a time")
    print_usage()
    sys.exit(1)
elif len(args)==1:
    doc = html.parse(args[0])
else:
    # no file argument: read the document from standard input
    doc = html.parse(sys.stdin)

################################################################
### XML structure checks
################################################################

# check for presence of meta information
# (the capabilities field is spelled 'ocr-capabilities'; the previous
# spelling 'ocr-capabilites' could never match a conforming document)
test_ok(doc.xpath("//meta[@name='ocr-system']")!=[], "//meta[@name='ocr-system']")
test_ok(doc.xpath("//meta[@name='ocr-capabilities']")!=[], "//meta[@name='ocr-capabilities']")

# check for presence of page
test_ok(doc.xpath("//*[@class='ocr_page']")!=[], "has a page")

# check that lines are inside pages
# (the ancestor query returns a list; an empty list fails the check)
lines = doc.xpath("//*[@class='ocr_line']")
for line_idx, line in enumerate(lines):
    test_ok(line.xpath("./ancestor::*[@class='ocr_page']"), "ocr_line %2d in an ocr_page"%(line_idx))

# check that pars are inside pages
pars = doc.xpath("//*[@class='ocr_par']")
for par_idx, par in enumerate(pars):
    test_ok(par.xpath("./ancestor::*[@class='ocr_page']"), "ocr_par %2d in an ocr_page"%(par_idx))

# check that careas are inside pages
careas = doc.xpath("//*[@class='ocr_carea']")
for carea_idx, carea in enumerate(careas):
    test_ok(carea.xpath("./ancestor::*[@class='ocr_page']"), "ocr_carea %2d in an ocr_page"%(carea_idx))

################################################################
### geometric checks
################################################################    

# Overlap checks are skipped when -o was given on the command line.
if not nooverlap:
    for page in doc.xpath("//*[@class='ocr_page']"):
        # The queries below start with ".//" so that they only select
        # descendants of the current page. A leading "//" is an absolute
        # path and would select matching elements from the whole document,
        # comparing boxes that belong to different pages.
        # check lines
        objs = page.xpath(".//*[@class='ocr_line']")
        line_bboxes = [get_bbox(obj) for obj in objs if get_prop(obj,'bbox')]
        test_ok(mostly_nonoverlapping(line_bboxes), 'mostly_nonoverlapping/line')
        # check paragraphs
        objs = page.xpath(".//*[@class='ocr_par']")
        par_bboxes = [get_bbox(obj) for obj in objs if get_prop(obj,'bbox')]
        test_ok(mostly_nonoverlapping(par_bboxes), 'mostly_nonoverlapping/par')
        # check careas
        objs = page.xpath(".//*[@class='ocr_carea']")
        carea_bboxes = [get_bbox(obj) for obj in objs if get_prop(obj,'bbox')]
        test_ok(mostly_nonoverlapping(carea_bboxes), 'mostly_nonoverlapping/carea')

################################################################
### TODO
################################################################

# FIXME add many other checks:
# - containment of paragraphs, careas, etc.
# - ocr-capabilities vs. actual tags
# - warn about text outside ocr_ elements
# - check title= attribute format
# - check that only the right attributes are present on the right elements
# - check for unrecognized ocr_ elements
# - check for significant overlaps
# - check that image files are not repeated
