#!/usr/bin/env python
import argparse
import re
import csv
import sys
import itertools

from webgrep.utils import get_soup, select
from webgrep.grep import main_grep
from webgrep.lookup import main_follow_path


def readCL():
    """Parse the command line and return the options as a flat tuple.

    Returns:
        An 11-tuple ``(css, grep, url, html_file, path, relative, print_url,
        no_cache, phantomjs, print_css, verbose)``.

        ``path`` is ``None`` when ``-l`` is absent or given without values;
        otherwise it holds one list of step strings per ``-l`` value (each
        value is stripped of surrounding whitespace, then split on commas).

        ``no_cache`` is always ``True``: the ``--no_cache`` option is not
        registered with the parser at the moment.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("-g", "--grep", help="item to grep from the website")
    cli.add_argument(
        "-l", "--location", nargs="*",
        help="location within the page to look. given as a path through the DOM tree. The path is a csv list of steps where '-' is a wildcard. eg: 1,1,0,13,9,1,5,3,1,-,0")
    cli.add_argument("-u", "--url", help="url to look in")
    cli.add_argument("-f", "--html_file")
    cli.add_argument(
        "-r", "--relative",
        help="text field to search relative to. Example: webgrep -r \"Size + Fit\" -g 'Model' returns a relative location. Then use that as in: webgrep -r \"Size + Fit\" -l \" -1,-1,3,-,0\"")
    cli.add_argument(
        "--css",
        help="This css is used as the root for other searches. Example: a.myFirstClass")
    cli.add_argument(
        "--print_url", action="store_true",
        help="print link urls instead of link text")
    # A --no_cache option used to live here; until it is restored the value
    # handed back to the caller is hard-wired to True below.
    cli.add_argument(
        "--phantomjs", action="store_true",
        help="use phantomjs to parse the website for DOM elements that are loaded using javascript")
    cli.add_argument("-v", "--verbose", action="store_true")
    cli.add_argument(
        "--print_css", action="store_true",
        help="used with -l argument it prints the raw css of the returned element instead of the text inside")
    opts = cli.parse_args()

    # Each -l value is its own comma-separated path through the DOM tree.
    path = [raw.strip().split(",") for raw in opts.location] if opts.location else None
    no_cache = True

    return (
        opts.css,
        opts.grep,
        opts.url,
        opts.html_file,
        path,
        opts.relative,
        opts.print_url,
        no_cache,
        opts.phantomjs,
        opts.print_css,
        opts.verbose,
    )


def fix_broken_pipe():
    """Reset SIGPIPE to its default action.

    This avoids the 'Broken pipe' error when the script's output is piped
    into a command such as ``head`` that closes the pipe early.

    On platforms where the ``signal`` module does not define ``SIGPIPE``
    the function does nothing instead of failing on import.
    """
    # Imported inside the function so that importing this module alone never
    # touches signal handling.
    import signal

    sigpipe = getattr(signal, "SIGPIPE", None)
    if sigpipe is None:
        # No SIGPIPE on this platform: nothing to reset.
        return
    signal.signal(sigpipe, signal.SIG_DFL)

def soup_text(soup):
    """Return the text nodes of ``soup`` that pass the visibility filter.

    Nodes come from ``soup.findAll(text=True)`` and keep that order.  A node
    is rejected when its parent's ``name`` is one of ``style``, ``script``,
    ``[document]``, ``head`` or ``title``, or when its text, converted with
    ``unicode``, matches the pattern ``<!--.*-->`` from the start of the
    string.
    """
    hidden_parents = ['style', 'script', '[document]', 'head', 'title']

    def is_visible(node):
        # Parent check comes first so nodes that are already ruled out never
        # reach the text conversion and regex.
        if node.parent.name in hidden_parents:
            return False
        return re.match('<!--.*-->', unicode(node)) is None

    return filter(is_visible, soup.findAll(text=True))

def main(css, grep, url, html_file, all_paths, relative, print_url, no_cache, phantomjs, print_css, verbose):
    """Load a page and run exactly one of the grep / path / css-dump modes.

    The page is obtained with ``get_soup(url, phantomjs, html_file,
    no_cache)``.  When ``css`` is given, the first element returned by
    ``select(css, soup)`` replaces the page as the root for what follows.

    Modes, in priority order:

    * ``grep``: rows from ``main_grep`` are written to stdout as CSV.
    * ``all_paths``: the result of ``main_follow_path`` is printed as-is
      when ``print_css`` is set, otherwise each field goes through
      ``format_field`` and the rows are written to stdout as CSV.
    * ``css`` alone: the visible text of the selected element is written
      as a single CSV row.

    If none of the three is set, nothing is written.
    """
    soup = get_soup(url, phantomjs, html_file, no_cache)
    if css:
        soup = select(css, soup)[0]

    if grep:
        matches = main_grep(grep, soup, relative, verbose)
        csv.writer(sys.stdout, lineterminator='\n').writerows(matches)
        return

    if all_paths:
        found = main_follow_path(all_paths, soup, relative, print_url, return_soup=print_css)
        if print_css:
            # Single parenthesised expression: same output as the bare
            # print statement.
            print(found)
            return
        out = csv.writer(sys.stdout, lineterminator='\n')
        out.writerows([[format_field(field) for field in row] for row in found])
        return

    if css:
        write_soup(soup_text(soup))
        
def write_soup(s):
    """Write the items of ``s`` to stdout as one CSV row.

    Every item is passed through ``format_field`` first; the row is
    terminated with a single ``\\n``.
    """
    out = csv.writer(sys.stdout, lineterminator="\n")
    row = [format_field(item) for item in s]
    out.writerow(row)

def format_field(f):
    """Return ``f`` encoded as UTF-8 with all ``\\n`` and ``\\r`` removed."""
    cleaned = f.encode("utf-8")
    # Line breaks are dropped outright rather than replaced.
    for line_break in ("\n", "\r"):
        cleaned = cleaned.replace(line_break, "")
    return cleaned
    
        
        
if __name__ == "__main__":
    # Script entry point: parse the options, restore default SIGPIPE
    # handling, then run the selected mode.
    (css, grep, url, html_file, all_paths, relative,
     print_url, no_cache, phantomjs, print_css, verbose) = readCL()
    fix_broken_pipe()
    main(
        css=css,
        grep=grep,
        url=url,
        html_file=html_file,
        all_paths=all_paths,
        relative=relative,
        print_url=print_url,
        no_cache=no_cache,
        phantomjs=phantomjs,
        print_css=print_css,
        verbose=verbose,
    )
