#!/usr/bin/env python

'''
grep for HTML; CLI for pyquery

Usage:
    pquery <selector>
    pquery <selector> -p <projector>
    pquery <selector> -f <format_string>
    pquery -h | --help

Options:
    -p: project the dict onto field `<projector>`.
    -f: equivalent of `<format_string>.format(item)`,
        where item is the dict form of one selected HTML element.
    -h | -v: shows this doc.

Dict keys:
    'tag': The HTML tag
    'html': Inner HTML of the element
    'text': Inner text of the element
    ...: [optional] Other attributes: e.g. 'href'
'''

import sys

# True when running under Python 2. Decided from the interpreter version
# instead of from a caught exception, so nothing is printed to stdout (which
# must carry only query results) and the `imp` module is not needed on
# Python 3.
PYTHON2 = sys.version_info[0] == 2

if PYTHON2:
    # Python 2 only: reload `sys` and force UTF-8 as the default encoding,
    # as the original Python 2 path did.
    from imp import reload
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Diagnostic line; written to stderr so it never mixes with the results.
sys.stderr.write('PYTHON2=%s\n' % PYTHON2)

import docopt
from pyquery import PyQuery as pq

# The format string may reference keys that are missing from an element's dict.
# format_dict() fills each missing key with None and retries; this constant is
# the maximum number of formatting attempts made per element before giving up.
MAX_FORMAT_TRIAL = 20

# The workflow is a pipeline of iterators/generators: each stage below
# consumes the iterable produced by the previous stage.

def array_output(a):
    '''
    Write every truthy item of `a` to stdout, one per line.

    Falsy items (None, empty strings, empty dicts, 0, ...) are skipped.
    Items are converted with str(), so a dict is printed in its repr-like
    form.
    '''
    for item in a:
        if not item:
            continue
        # str() is used on purpose: the item may be a dict, and
        # .encode('utf-8') does not work on those.
        line = str(item) + '\n'
        sys.stdout.write(line)

def project(a, projector):
    '''
    Lazily yield the value stored under key `projector` for each dict in `a`.

    Dicts that lack the key yield None. Only a single field is supported for
    now; multi-field projection (e.g. a comma-separated `projector`) can be
    added if a good use case shows up.
    '''
    for record in a:
        yield record.get(projector)

def format_dict(a, format_string):
    '''
    Lazily yield `format_string.format(**item)` for each dict `item` in `a`.

    A key referenced by `format_string` but missing from the dict is filled
    with None and formatting is retried. At most MAX_FORMAT_TRIAL attempts
    are made per item; if none succeeds, a message is written to stderr and
    nothing is yielded for that item. The input dicts are never modified.
    '''
    for i in a:
        # Work on a shallow copy: only top-level keys are ever added, so the
        # caller's dict stays untouched without the cost of a deep copy.
        fields = dict(i)
        for _ in range(MAX_FORMAT_TRIAL):
            try:
                rendered = format_string.format(**fields)
            except KeyError as err:
                # args[0] holds the missing key on both Python 2 and 3.
                fields[err.args[0]] = None
            else:
                yield rendered
                break
        else:
            # Every attempt raised KeyError: report the item and move on.
            sys.stderr.write('Max number of formating trials reached. Format string: "%s". Dict: %s .\n' % (format_string, i))

def html_element_to_dict(a):
    '''
    Lazily convert each selected element in `a` into a plain dict.

    Each element must provide `.html()` and indexing (`element[0]` is the
    underlying node, which exposes `.tag`, `.text` and `.attrib`).

    Keys of the yielded dict:
        'html': result of `element.html()`, or None if that call raised
        'tag':  tag of the underlying node
        'text': `.text` of the underlying node
        ...:    the node's attributes, e.g. 'href'

    The three keys above always win: an attribute that happens to be named
    'html', 'tag' or 'text' (e.g. <body text="#000">) is not allowed to
    overwrite them.
    '''
    for element in a:
        try:
            inner_html = element.html()
        except Exception:
            # Best effort: serialisation can fail (e.g. on some UTF-8
            # errors); keep the element and report its HTML as missing.
            inner_html = None
        node = element[0]
        record = {'html': inner_html, 'tag': node.tag, 'text': node.text}
        for name, value in node.attrib.items():
            # setdefault keeps the reserved keys intact on a name clash.
            record.setdefault(name, value)
        yield record

if __name__ == '__main__':
    # Parse the command line against the usage text in the module docstring.
    args = docopt.docopt(__doc__)

    # The HTML document is read from stdin and queried with the selector.
    html = sys.stdin.read()
    d = pq(html)
    matches = d(args['<selector>'])

    # Build the pipeline: elements -> dicts -> (projection | formatting).
    data = html_element_to_dict(matches.items())
    if args['<projector>']:
        data = project(data, args['<projector>'])
    if args['<format_string>']:
        format_string = args['<format_string>']
        # Interpret backslash escapes (e.g. "\t", "\n") typed on the command
        # line, so that both Python versions treat the format string alike.
        if PYTHON2:
            format_string = unicode(format_string).decode('string_escape')
        else:
            # Round-trip through latin-1 with backslashreplace so that
            # non-ASCII characters survive the 'unicode_escape' decoding.
            format_string = format_string.encode(
                'latin-1', 'backslashreplace').decode('unicode_escape')
        data = format_dict(data, format_string)

    # Consuming the pipeline here is what drives all the stages above.
    array_output(data)

