#!/usr/bin/env python

import argparse
import datetime
import json
import time
import sys
import os
import shlex
import shutil
import subprocess
import nudecrawler 
from nudecrawler import Page, Unbuffered
from nudecrawler.page import  get_processed_images
from nudecrawler.version import version
from nudecrawler.cache import cache

import transliterate.discover 
from transliterate.base import TranslitLanguagePack, registry

transliterate.discover.autodiscover()

# Run-time statistics. save_stats() dumps this dict to the stats file as JSON
# and main() loads it back when resuming, so key names are part of the file
# format.
stats = {
    'cmd': None,                # command line of the run, filled in by main()
    'uptime': 0,                # seconds since `started`, refreshed on save
    'urls': 0,                  # pages handed to analyse()
    'words': 0,                 # word/day pairs handed to check_word()
    'word': None,               # word currently being processed
    # status of the most recently checked page
    'last': dict.fromkeys(('url', 'status', 'detailed')),
    # status of the most recent page reported as INTERESTING
    'last_interesting': dict.fromkeys(('url', 'status', 'detailed')),
    'now': None,                # timestamp of the last save
    'processed_images': 0,
    'found_interesting_pages': 0,
    'found_nude_images': 0,
    'resume': {},               # word / month / day / count to continue from
    'gap_max': 0,               # longest run of failed pages that was followed by a hit
    'gap_url': None,            # URL of the hit that ended that run
}

# Path of the stats file (None disables save_stats) and the number of seconds
# between two periodic writes.
stats_file = None
stats_period = 60

# Earliest time of the next periodic stats write.
stats_next_write = time.time() + stats_period

# Process start time, used to compute stats['uptime'].
started = time.time()

# Settings below are placeholders; main() overwrites them from the command line.
logfile = None          # file that receives every interesting page
stop_after = None       # processed-image count that triggers stop/refresh
stop_each = None        # distance between two consecutive stop/refresh points
refresh = None          # command (argv list) to run at each stop point
detect_image = None     # detector for image files
detect_url = None       # detector for image URLs

page_extensions = None
page_image_minsize = 10000  # presumably bytes (main() sets it to Kb * 1024) - TODO confirm
page_mintotal = 0

class TgRuLanguagePack(TranslitLanguagePack):
    """Latin <-> Russian transliteration table registered under the code "tgru".

    check_word() applies it with reversed=True to turn a Cyrillic word into
    the Latin slug used in a telegra.ph URL.
    """

    language_code = "tgru"
    language_name = "tgru"

    # Unicode blocks Cyrillic (U+0400-U+04FF) and Cyrillic Supplement (U+0500-U+052F).
    character_ranges = ((0x0400, 0x04FF), (0x0500, 0x052F))

    # Character-for-character table: position N of the Latin string pairs with
    # position N of the Cyrillic string, so both strings must stay equally long.
    mapping = (
        "abvgdezijklmnoprstufhcC'y'ABVGDEZIJKLMNOPRSTUFH'Y'",
        "абвгдезийклмнопрстуфхцЦъыьАБВГДЕЗИЙКЛМНОПРСТУФХЪЫЬ",
    )

    # Multi-letter Latin sequences and the single Cyrillic letter each stands
    # for. Insertion order is kept exactly as before because the library walks
    # the entries in order.
    # NOTE(review): "Ja" is the only entry spelled with J while its lowercase
    # twin is "ya" (and Ю uses "Yu") - confirm whether "Ya" was intended.
    pre_processor_mapping = {
        "zh": "ж",
        "yo": "ё",
        "ts": "ц",
        "ch": "ч",
        "sh": "ш",
        "sch": "щ",
        "yu": "ю",
        "ya": "я",
        "Yo": "Ё",
        "Zh": "Ж",
        "Ts": "Ц",
        "Ch": "Ч",
        "Sh": "Ш",
        "Sch": "Щ",
        "Yu": "Ю",
        "Ja": "Я",
        "EH": "Э",
        "eh": "э",
    }


# Register the custom pack so that transliterate.translit(..., 'tgru') can find it.
registry.register(TgRuLanguagePack)


# Detection defaults; main() replaces all four from the parsed command line.
nude = video = 1    # minimum nude images / videos, passed to Page by analyse()
verbose = False
all_found = True    # passed to Page as all_found; main() sets it from -a/--all

# Detectors selectable with --detect: name -> (kind, script or built-in id).
# main() stores 'builtin' and 'image' entries in detect_image and 'url'
# entries in detect_url; scripts must be found in $PATH.
filter_methods = {
    'nudepy': ('builtin', ':nude'),
    'aid': ('image', 'detect-image-aid.py'),
    'nsfwapi': ('image', 'detect-image-nsfw-api.py'),
    'nudenet': ('image', 'detect-image-nudenet.py'),
}

def get_args(argv=None):
    """Build the command-line parser and parse `argv`.

    argv -- list of argument strings; None makes argparse read sys.argv[1:].
            main() passes an explicit list when it replays a saved command.

    Returns the argparse.Namespace with all options.
    """
    default_total = 5
    default_minsize = 10
    known_methods = ', '.join(filter_methods.keys())

    parser = argparse.ArgumentParser(
        description=f'Nudecrawler: Telegra.ph Spider {version}')

    # General options
    parser.add_argument('words', nargs='*')
    parser.add_argument('-d', '--days', type=int, default=30)
    parser.add_argument('--nude', metavar='N', type=int, default=1,
                        help='Interesting if N+ nude images')
    parser.add_argument('--total', metavar='N', type=int, default=default_total,
                        help=f'Interesting if N+ total images ({default_total})')
    parser.add_argument('--video', metavar='N', type=int, default=1,
                        help='Interesting if N+ video')
    parser.add_argument('--url1', metavar="URL",
                        help='process only one url')
    parser.add_argument('-f', '--fails', type=int, default=0,
                        help='stop searching next pages with same words after N failures')
    parser.add_argument('--day', nargs=2, type=int, metavar=('MONTH', 'DAY'),
                        help='Current date (default is today) example: --day 12 31')

    filtering = parser.add_argument_group('Image filtering options')
    filtering.add_argument('-a', '--all', default=False, action='store_true',
                           help='do not detect, print all found pages')
    filtering.add_argument('--detect-image', metavar='SCRIPT',
                           help='explicitly use this script to detect nudity on image file')
    filtering.add_argument('--detect-url', metavar='SCRIPT',
                           help='explicitly use this script to detect nudity on image URL')
    filtering.add_argument('--detect', metavar='METHOD',
                           help=f'One of {known_methods}')
    filtering.add_argument('--extensions', nargs='*', default=['.jpeg', '.jpg', '.png'],
                           help='interesting extensions (with dot, like .jpg)')
    filtering.add_argument('--minsize', type=int, default=default_minsize,
                           help=f'min size of image in Kb ({default_minsize})')

    output = parser.add_argument_group('Output options')
    output.add_argument('-v', '--verbose', default=False, action='store_true',
                        help='verbose')
    output.add_argument('--unbuffered', '-b', default=False, action='store_true',
                        help='Use unbuffered stdout')
    output.add_argument('--urls', default=False, action='store_true',
                        help='Do not detect, just generate and print URLs')
    output.add_argument('--log',
                        help='print all precious treasures to this logfile')

    lists = parser.add_argument_group('list-related options')
    lists.add_argument('-w', '--wordlist',
                       help='wordlist (urllist) file')
    lists.add_argument('--stats', metavar='STATS_FILE', default='/tmp/nudecrawler-stats.txt',
                       help='periodical statistics file')
    lists.add_argument('--resume', metavar='STATS_FILE',
                       help='resume from STATS_FILE (other args are not needed)')
    lists.add_argument('--stop', type=int, metavar='NUM_IMAGES',
                       help='stop (or --refresh) after N images processed (or little after)')
    lists.add_argument('--refresh', metavar=('SCRIPT', 'ARG'), nargs='+',
                       help='run this refresh script every --stop NUM_IMAGES images')

    return parser.parse_args(argv)




def analyse(url):
    """Check one page, update the global statistics and report it if interesting.

    url -- address of the page to check.

    Returns the Page object. When the page has `error` set it is returned
    right after stats['urls'] is incremented and nothing else is updated.

    Side effects:
      * stats['urls'], stats['last'] and, for a page whose status starts with
        'INTERESTING', the found_* counters and stats['last_interesting'];
      * an interesting page is printed and appended to `logfile` if one is set;
      * a periodic stats write via save_stats();
      * once more than `stop_after` images were processed: runs the `refresh`
        command and schedules the next stop point, or, without `refresh`,
        saves the statistics and exits the process with status 2.
    """

    global stop_after

    p = Page(url, neednnudes=nude, neednvideo=video, all_found=all_found,
            detect_url=detect_url, detect_image=detect_image)

    p.image_minsize = page_image_minsize
    p.image_extensions = page_extensions

    stats['urls'] += 1

    if p.error:
        return p

    p.check_all()
    stats['last']['url'] = url
    stats['last']['status'] = p._status
    stats['last']['detailed'] = p._status_detailed

    if p.status().startswith('INTERESTING'):
        stats['found_interesting_pages'] += 1
        stats['found_nude_images'] += p.nude_images
        stats['last_interesting']['url'] = url
        stats['last_interesting']['status'] = p._status
        stats['last_interesting']['detailed'] = p._status_detailed

        if logfile:
            with open(logfile, "a") as fh:
                print(p, file=fh)
        print(p)

    save_stats(force=False)

    if stop_after is not None and get_processed_images() > stop_after:
        print("Stop/refresh after processed", get_processed_images(), "images...")
        if refresh:
            subprocess.run(refresh)

            # schedule next stop
            stop_after = get_processed_images() + stop_each
        else:
            print("No --refresh, exiting with code 2")
            # The periodic write above may be up to stats_period seconds old.
            # Flush now, as the Ctrl-C handler does, so that a later --resume
            # continues from this page instead of repeating recent work.
            save_stats(force=True)
            sys.exit(2)

    return p


def save_stats(force=False):
    """Write the statistics dict to `stats_file` as indented JSON.

    Does nothing when no stats file is configured. Otherwise the file is
    rewritten when the periodic deadline has passed or when `force` is true,
    and the next deadline is set `stats_period` seconds ahead.

    force -- write regardless of the deadline.
    """
    global stats_next_write

    if stats_file is None:
        return

    deadline_passed = time.time() > stats_next_write
    if not (deadline_passed or force):
        return

    # Refresh the derived fields just before dumping.
    stats['now'] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
    stats['uptime'] = int(time.time() - started)
    stats['processed_images'] = get_processed_images()

    stats['cache'] = cache.status()

    with open(stats_file, "w") as out:
        json.dump(stats, out, indent=4)
        stats_next_write = time.time() + stats_period


def check_word(word, day, fails, print_urls=False, resumecount=None):
    """Check the telegra.ph pages of one word for one day.

    The first page is <base>-MM-DD, the following ones <base>-MM-DD-2,
    <base>-MM-DD-3, ... The numbered pages are tried until `fails` pages in a
    row come back with an error.

    word        -- search word (spaces become '-', lowercase soft/hard signs
                   are dropped, the rest is transliterated with the 'tgru'
                   pack) or a full https:// URL used as the base as-is.
    day         -- object with `month` and `day` attributes.
    fails       -- allowed number of consecutive failed pages; 0 means only
                   the first page is checked.
    print_urls  -- only print the first URL, do not fetch anything.
    resumecount -- page number to continue from; when truthy the first page
                   is skipped.
    """
    word = word.replace(' ', '-').translate(str.maketrans('', '', 'ьъ'))

    if word.startswith("https://"):
        baseurl = word
    else:
        baseurl = 'https://telegra.ph/' + transliterate.translit(word, 'tgru', reversed=True)

    date_suffix = f'{day.month:02}-{day.day:02}'
    first_url = f'{baseurl}-{date_suffix}'

    stats['word'] = word
    stats['words'] += 1
    stats['resume'].update(month=day.month, day=day.day, count=resumecount)

    if print_urls:
        print(first_url)
        return

    if resumecount:
        counter = resumecount
        print(f"Resume from word {word} count {counter}")
    else:
        # No first page means there are no numbered pages worth trying.
        if analyse(first_url).error:
            return
        counter = 2

    misses = 0
    while misses < fails:
        numbered_url = f'{baseurl}-{date_suffix}-{counter}'

        if analyse(numbered_url).error:
            misses += 1
        else:
            # A hit ends the current gap; remember the longest one seen.
            if misses > stats['gap_max']:
                stats['gap_max'] = misses
                stats['gap_url'] = numbered_url
            misses = 0

        counter += 1
        stats['resume']['count'] = counter


def sanity_check(args):
    """Placeholder for validating parsed arguments; currently accepts anything.

    args -- the parsed argparse.Namespace (unused for now).
    """
    return None


def main():
    """Configure the crawler from the command line and process the work list.

    Steps:
      * parse the arguments; with --resume load the given stats file, replay
        the command line recorded in it and continue from the saved
        word / day / page number;
      * copy the options into the module-level settings read by analyse();
      * choose the detector (--detect, --detect-image, --detect-url, falling
        back to the built-in ':nude' unless -a/--all is given);
      * with --url1: analyse that single URL, print its status and log, return;
      * otherwise call check_word() for every word on --days consecutive
        days, going backwards from the start day. Positional words take
        precedence over -w/--wordlist; the stats file is only written when a
        wordlist is used.

    Exits with status 1 when the resume file cannot be used, the detector is
    unknown or not found, or there is nothing to process.
    """
    global nude, video, verbose, all_found, stats_file, stats, logfile, \
        stop_after, stop_each, detect_image, detect_url, page_image_minsize, page_extensions, \
        page_mintotal, refresh

    words = None
    matched_resume = False
    skipped_words = 0

    # when fastforward, we go to specific word/day/count quickly
    # It is enabled only by an explicit --resume. The --stats path has a
    # default value, so its mere existence must not turn a fresh run into a
    # resume of some earlier, unrelated run.
    fastforward = False

    args = get_args()
    if args.resume:
        print("Resume from", args.resume)
        try:
            with open(args.resume) as fh:
                stats = json.load(fh)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError (truncated/corrupt file)
            print(f"Cannot resume from {args.resume}: {e}", file=sys.stderr)
            sys.exit(1)

        cmd = stats.get('cmd') if isinstance(stats, dict) else None
        if not cmd:
            print(f"Cannot resume from {args.resume}: no recorded command line", file=sys.stderr)
            sys.exit(1)

        # Replay the original command line; stats['cmd'] is left untouched so
        # that a stats file written by this run can be resumed again.
        args = get_args(shlex.split(cmd)[1:])
        fastforward = True
    else:
        # Quote every argument: the string is read back with shlex.split(),
        # which would otherwise break arguments containing spaces.
        stats['cmd'] = ' '.join(shlex.quote(arg) for arg in sys.argv)

    sanity_check(args)

    nude = args.nude
    video = args.video
    verbose = args.verbose
    all_found = args.all
    stop_after = args.stop
    stop_each = args.stop
    refresh = args.refresh
    detect_url = args.detect_url
    detect_image = args.detect_image

    if args.detect:
        try:
            kind, basename = filter_methods[args.detect]
        except KeyError:
            print(f"Do not know detector {args.detect!r}, use one of known detectors: ({ ', '.join(filter_methods.keys()) }) or explicitly specify script with --detect-image or --detect-url")
            sys.exit(1)

        # External detectors are scripts that must be reachable through $PATH.
        if kind in ['image', 'url']:
            if shutil.which(basename) is None:
                print(f"Cannot find {basename}, maybe not in $PATH?", file=sys.stderr)
                sys.exit(1)

        if kind == 'builtin':
            detect_image = basename
        elif kind == 'image':
            detect_image = basename
            print(f"# Will use script {shutil.which(basename)} for filtering images")
        elif kind == 'url':
            detect_url = basename
            print(f"# Will use script {shutil.which(basename)} for filtering images")

    # fix arguments
    if not any([detect_image, detect_url, all_found]):
        print("# No filter, using built-in :nude by default")
        detect_image = ':nude'

    nudecrawler.verbose.verbose = verbose

    if args.unbuffered:
        sys.stdout = Unbuffered(sys.stdout)

    if args.extensions:
        page_extensions = args.extensions

    # Compare with None, not truthiness, so that an explicit "--minsize 0"
    # is honoured instead of silently keeping the built-in default.
    if args.minsize is not None:
        page_image_minsize = args.minsize * 1024

    # NOTE(review): page_mintotal is stored here but nothing visible in this
    # file hands it to Page - confirm that --total has any effect.
    if args.total:
        page_mintotal = args.total

    # processing could start here
    # --url1
    if args.url1:
        p = analyse(args.url1)
        if p.error:
            print("Page returned ERROR")
        print(p.status())
        for msg in p._log:
            print(" ", msg)
        return

    ## wordlist
    if args.wordlist:
        stats_file = args.stats
        with open(args.wordlist) as fh:
            # blank lines would become an empty word and a meaningless URL
            words = [w for w in (line.rstrip() for line in fh) if w]

    if args.words:
        words = args.words

    if not words:
        print("Need either --url1 URL or words like 'nude' or -w wordlist.txt")
        sys.exit(1)

    logfile = args.log

    for w in words:
        if fastforward and not matched_resume:
            # Skip everything before the word the saved run stopped at.
            if w != stats['resume']['word']:
                skipped_words += 1
                continue
            matched_resume = True

        stats['resume']['word'] = w

        # Only month and day are used by check_word(); 2020 is a leap year,
        # so every month/day pair, including February 29, is accepted.
        if fastforward:
            day = datetime.datetime(2020, stats['resume']['month'], stats['resume']['day'])
        elif args.day is None:
            day = datetime.datetime.now()
        else:
            day = datetime.datetime(2020, args.day[0], args.day[1])

        days_tried = 0
        while days_tried < args.days:
            # The saved page number applies to the first resumed day only.
            resumecount = stats['resume']['count'] if fastforward else None
            # stop fastforward
            fastforward = False
            check_word(w, day, args.fails, print_urls=args.urls, resumecount=resumecount)

            days_tried += 1
            day = day - datetime.timedelta(days=1)

    if fastforward and not matched_resume:
        # Every word was skipped: do not finish silently with nothing done.
        print(f"Resume word {stats['resume']['word']!r} not found in the word list, "
              f"nothing processed ({skipped_words} words skipped)", file=sys.stderr)

if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt as interrupt:
        # Ctrl-C: print the exception (normally just an empty line) and write
        # the statistics immediately instead of waiting for the next
        # periodic save.
        print(interrupt)
        save_stats(force=True)