#!/usr/bin/env python

# Extract the text of every ocr_line element in an hOCR file and print it, one output line per element.

from __future__ import print_function
import os
import re
import sys

from lxml import html


def get_text(node):
    """Return the text found under *node* with whitespace runs collapsed.

    Every text node at or below *node* is collected with the XPath query
    ``.//text()`` and concatenated in the order returned.  Each run of
    whitespace in the result is then replaced by a single space; leading
    and trailing whitespace is collapsed the same way, not stripped.
    """
    pieces = node.xpath(".//text()")
    joined = ''.join(pieces)
    collapsed = re.sub(r'\s+', ' ', joined)
    return collapsed

# Print a short usage message and exit when help is requested.
if len(sys.argv) > 1 and sys.argv[1] in ('-h', '--help'):
    print("Usage:", sys.argv[0], "file.html")
    sys.exit(0)

# Parse the hOCR file named on the command line, or standard input when no
# file name is given.
if len(sys.argv) > 1:
    doc = html.parse(sys.argv[1])
else:
    doc = html.parse(sys.stdin)

# Write the UTF-8 bytes straight to the binary output stream.  On Python 3,
# print(text.encode("utf-8")) would emit the bytes repr (b'...') instead of
# the text itself.  Python 2's sys.stdout has no ``buffer`` attribute and
# already accepts byte strings, so it is used directly there.
out = getattr(sys.stdout, "buffer", sys.stdout)

# Only elements whose class attribute is exactly 'ocr_line' are selected.
lines = doc.xpath("//*[@class='ocr_line']")
for line in lines:
    out.write(get_text(line).encode("utf-8") + b"\n")
