#!/usr/bin/env python
from __future__ import print_function

import sys,os,re
import argparse
from lxml import etree

# Command-line interface.
parser = argparse.ArgumentParser()

# Positional: one or more inputs (nargs='+'); '-' is handled further down
# as "read from standard input".
parser.add_argument(
    'sources', nargs='+',
    help="hOCR file to check or '-' to read from STDIN")

# Optional switch: False unless --ansi is given on the command line.
parser.add_argument(
    '--ansi', action='store_true',
    help="Use ANSI highlighting (bold, italic, etc)")

# Parsed once at import time; the rest of the script reads ``args``.
args = parser.parse_args()

def bold(s, end='\n'):
    """Print *s* wrapped in the ANSI "bold on" / "reset" escape codes.

    Args:
        s: Value to print; it is formatted with ``%s``.
        end: String written after the reset code. Defaults to a newline
            (the historical behaviour); pass ``' '`` to keep printing on
            the same output line.
    """
    print("%s%s%s" % ("\033[1m", s, "\033[0m"), end=end)


for source in args.sources:
    if source == '-':
        # Hand lxml the stream itself instead of the '/dev/stdin' path,
        # which does not exist on every platform.  Prefer the binary
        # buffer (Python 3) so lxml does the decoding itself.
        source = getattr(sys.stdin, 'buffer', sys.stdin)
    doc = etree.parse(source)
    for page in doc.xpath("//*[@class='ocr_page']"):
        for line in page.xpath(".//*[@class='ocr_line']"):
            for inline in line.xpath('.//*'):
                # Wrapper elements (e.g. a word span that only holds a
                # <strong>) have no text of their own; skip them rather
                # than printing the literal string "None".
                text = (inline.text or '').strip()
                if not text:
                    continue
                # An element is emphasised when it is a <strong> or sits
                # inside one.  local-name() keeps this working for
                # namespaced XHTML, and asking XPath for a boolean avoids
                # relying on element truthiness (false for childless
                # elements).
                is_strong = inline.xpath(
                    "boolean(ancestor-or-self::*[local-name()='strong'])")
                # Only emit escape codes when the user asked for them.
                if args.ansi and is_strong:
                    bold(text, end=' ')
                else:
                    print(text, end=' ')
            # Terminate the output line for this ocr_line.
            print()
