#!/usr/bin/python3

#
# TODO: 
# forced anchors (done?) and selfhash anchors
# do not download selfhash file if it's in hashdb or in prepacking dir
# 
# heuristics?
#

import argparse
import os
import urllib.parse
import requests
import sys
import time
import logging
import shutil

# from requests.packages.urllib3.util.retry import Retry
# from requests.adapters import HTTPAdapter

import hashget
import hashget.hashdb
from hashget.submiturl import submit_url

from hashget.utils import kmgt
from hashget.debian import DebStatus, debsubmit
from hashget.restorefile import restorefile

# I/O buffer size in bytes (1 MiB).
BUF_SIZE = 1 << 20

# Repository mapping; presumably tag -> URL (TODO confirm, nothing in this file fills it).
repo = {}

# Module-wide logger and hashdb client; both are assigned in main().
log = None
hashdb = None

def download_file(url, prefix="/tmp", headers=None):
    """
    Download url with HTTP GET and store it in directory prefix.

    The local file name is the last '/'-separated component of url.

    :param url: URL to download
    :param prefix: directory where the downloaded file is written
    :param headers: optional dict of extra HTTP request headers
    :return: path of the written file, or None if the HTTP status is not 200
    """
    headers = headers or dict()

    chunk_size = 1024*1024
    basename = url.split('/')[-1]
    local_filename = os.path.join(prefix, basename)

    r = requests.get(url, stream=True, headers=headers)
    try:
        if r.status_code != 200:
            return None

        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)
    finally:
        # With stream=True the connection stays open until the body is fully
        # read or the response is closed, so close it on every exit path
        # (including the non-200 early return and write errors).
        r.close()

    return local_filename



def deb2url(root, project, basename):
    """
    Build the URL of a package file inside a project's 'name' tree.

    The file lives under <project>/name/<shard dirs>/<basename>, where the
    shard directories are 'lib'/<4th char>/<5th char> for names starting
    with 'lib', and <1st char>/<2nd char> for every other name.

    :param root: base URL
    :param project: project name, used as the first path component
    :param basename: package file name
    :return: URL produced by resolving the relative path against root
    :raises IndexError: if basename is too short to supply the shard characters
    """
    if basename.startswith('lib'):
        shard_parts = (basename[0:3], basename[3], basename[4])
    else:
        shard_parts = (basename[0], basename[1])

    shard_dir = os.path.join(project, 'name', *shard_parts)
    relative_path = os.path.join(shard_dir, basename)

    # Resolve in two steps, exactly as urljoin does for relative references.
    base_url = urllib.parse.urljoin(root, project)
    return urllib.parse.urljoin(base_url, relative_path)

#
# Prepare
#

def prepare(root, hashdb, anchors, filesz, skipdirs, excludefile):
    """
        Prepare directory root for packing.

        Collects candidate files via prepare_readfiles(), pulls every anchor
        of anchors.anchorlist into hashdb, then for each candidate that
        hashdb can resolve to a package:
          - remembers the package hash and URL,
          - writes the file's path (relative to root) to excludefile,
          - adds the file to the restore file.
        Candidates that hashdb does not know (KeyError) are left alone.
        Finally the restore file is saved as <root>/.hashget-restore.

        TODO: is the anchors argument really needed here? maybe delete?
    """

    candidates = prepare_readfiles(root, anchors, filesz, skipdirs)

    for anchor in anchors.anchorlist:
        hashdb.pull_anchor(anchor.get_hashspec())

    # package hash -> package URL
    package_urls = dict()
    restore = restorefile()

    with open(excludefile, 'w') as exclude_out:
        for candidate in candidates:
            try:
                package = hashdb.hash2hp(candidate.get_hashspec())
                package_urls[package.get_phash()] = package.url

                relative_name = os.path.relpath(candidate.filename, root)
                exclude_out.write("{}\n".format(relative_name))
                restore.add_file(candidate)
            except KeyError:
                # unique file, not found in any packages
                continue

        for package_hash, package_url in package_urls.items():
            restore.add_package(url = package_url, hashspec = package_hash)

        restore.save(os.path.join(root, '.hashget-restore'))

    log.info("saved: {}".format(restore))

    # guess_packages(root, files)

def prepare_readfiles(root, anchors, filesz, skipdirs):
    """
        Walk root and collect the regular files that are larger than filesz.

        Every regular, non-symlink file found is also passed to
        anchors.check_append(), regardless of its size.

        :param root: directory to scan
        :param anchors: anchor collection; check_append(f) is called per file
        :param filesz: size threshold, only files with size > filesz are returned
        :param skipdirs: path prefixes of directories to skip; each is compared
                         against the walked directory path with '/' appended
        :return: list of hashget.file.File objects
    """

    def skipdir(d, skipdirs):
        # '/' is appended so that a prefix like 'var/cache/' matches the
        # directory 'var/cache' itself as well as everything below it.
        return any((d + '/').startswith(sd) for sd in skipdirs)

    files = list()

    for directory, subdirs, dirfiles in os.walk(root):

        if skipdir(directory, skipdirs):
            # Every subdirectory shares this directory's prefix and would be
            # skipped too, so prune the walk here instead of descending.
            subdirs[:] = []
            continue

        for basename in dirfiles:
            path = os.path.join(directory, basename)

            if os.path.islink(path) or not os.path.isfile(path):
                continue

            f = hashget.file.File(path, root=root)
            if f.size > filesz:
                files.append(f)
            anchors.check_append(f)

    return files
    

#
# PostUnpack
#
def postunpack(root, hashdb):
    """
        Recover deduplicated files under root after the archive was untarred.

        Loads <root>/.hashget-restore and, for each package URL it lists,
        downloads and unpacks the package, then recovers every restore-file
        entry whose hashspec matches one of the package's files.
        A summary is written to the debug log at the end.

        :param root: directory that was unpacked
        :param hashdb: accepted for the caller's convenience; not used here
    """

    restore = hashget.restorefile.restorefile(os.path.join(root, '.hashget-restore'))
    restore.preiteration()

    t_start = time.time()
    n_files = 0
    bytes_recovered = 0
    bytes_downloaded = 0
    bytes_cached = 0

    for package_url in restore.packages():
        log.debug('restore from URL ' + package_url)

        package = hashget.package.Package(url = package_url)
        package.download()
        package.unpack()
        package.read_files()

        for package_file in package.files:
            spec = package_file.get_hashspec()
            try:
                wanted = restore.fbyhash(spec)
            except LookupError:
                # this package file is not needed by the restore file
                continue

            print('recovered ' + wanted.filename)
            wanted.recover(package_file.filename)
            bytes_recovered += wanted.size
            n_files += 1

        bytes_cached += package.stat_cached
        bytes_downloaded += package.stat_downloaded
        package.cleanup()

    elapsed = time.time() - t_start
    summary = 'Recovered {} files {} bytes ({} downloaded, {} cached) in {:.2f}s'.format(
        n_files,
        kmgt(bytes_recovered),
        kmgt(bytes_downloaded),
        kmgt(bytes_cached),
        elapsed
    )
    log.debug(summary)

    # delete tmpdir


def get_by_sig(sigtype, sig, outdir=None):
    """
    Download the package identified by a signature and copy it to outdir.

    Uses the module-level hashdb client and logger (both set up in main()).

    :param sigtype: signature type passed to hashdb.sig2hp (main() uses 'deb')
    :param sig: signature value to look up
    :param outdir: destination directory; None means the current directory
    :return: path of the copied package file, or None if sig is not in hashdb
    """
    if outdir is None:
        # os.path.join(None, ...) raises TypeError, so give the declared
        # default a usable meaning.
        outdir = '.'

    try:
        hp = hashdb.sig2hp(sigtype, sig, remote=True)
    except KeyError:
        log.error("sig {} not found in hashdb".format(sig))
        return

    p = hashget.package.Package(url = hp.url, log=log)
    p.download()
    dst = os.path.join(outdir, p.basename)
    shutil.copy(p.path, dst)
    log.info(dst)
    return dst

def get_by_hashspec(hashdb, hashspec, outdir):
    """
    Fetch the object identified by hashspec and copy it into outdir.

    The package that hashdb maps hashspec to is downloaded. If the package
    itself matches hashspec, the package file is copied; otherwise the file
    inside the package that has this hash is copied.

    :param hashdb: hashdb client used for the hashspec -> package lookup
    :param hashspec: hash specification (sha256:aabb...)
    :param outdir: destination directory
    :return: path of the copied file, or None if hashspec is not in hashdb
    """

    try:
        hp = hashdb.hash2hp(hashspec)
    except KeyError:
        log.error("{} not found in hashdb".format(hashspec))
        return

    p = hashget.package.Package(url = hp.url, log=log)
    p.download()
    if p.hashes.match_hashspec(hashspec):
        # hashspec names the package itself
        dst = os.path.join(outdir, p.basename)
        shutil.copy(p.path, dst)
        log.info(dst)
        return dst

    # hashspec names a file contained in the package
    src = p.hash2path(hashspec)
    dst = os.path.join(outdir, os.path.basename(src))
    shutil.copy(src, dst)
    p.cleanup()
    log.info(dst)
    # Return the destination here as well, like the package-match path above.
    return dst

def main():
    """
    Command-line entry point.

    Parses the arguments, creates the hashdb client and anchor list,
    configures the 'hashget' logger and then runs every requested action,
    in this order: --debcrawl, --prepack, --postunpack, --get, --submit,
    --debsubmit.

    Assigns the module-level log and hashdb used by the helper functions.
    Exits with status 1 if --submit is given without --project.
    """

    global log, hashdb

    def_anchsz = 100*1024
    def_filesz = 1024
    def_hashserver = list(('https://hashdb.okerr.com/hashdb/',))
    def_project = None
    def_excludefile = os.path.expanduser("~/hashget-exclude")
    def_skipdirs = ['var/cache/','var/lib/apt/']
    def_sleep = 2
    def_outdir = '.'

    parser = argparse.ArgumentParser(description='HashGet ver {} deduplication and compression tool'.format(hashget.__version__))

    g = parser.add_argument_group('Packing/unpacking')
    g.add_argument('--prepack', '-p', default=None, metavar='DIR', help='prepare DIR for hash-tarring')
    g.add_argument('--postunpack', '-u', default=None, metavar='DIR', help='post-unpack')

    g = parser.add_argument_group('Fetching packages and files')
    g.add_argument('--get', default=None, metavar='HASHSPEC', help='get file by hash')
    g.add_argument('--fetch', default=None, help='fetch .deb file by basename or hash specification (sha256:aabbcc...)')

    g = parser.add_argument_group('Local HashDB commands')
    g.add_argument('--submit', default=None, metavar='URL', help='submit URL to --project')
    g.add_argument('--debcrawl', default=None, metavar='DIR', help='snapshot crawl packages in DIR')
    g.add_argument('--debsubmit', default=None, metavar='package.deb', help='submit local .deb file')

    g = parser.add_argument_group('Crawling options')
    g.add_argument('--anchsz', type=int, default=def_anchsz, help='min size of anchors ({})'.format(def_anchsz))
    g.add_argument('--filesz', type=int, default=def_filesz, help='min size of files ({})'.format(def_filesz))
    g.add_argument('--project', default=def_project, help='project name ({})'.format(def_project))
#    g.add_argument('--repo', nargs=2, action='append', metavar=('REPO_TAG', 'REPO_URL'), default=list(), help='repositories (many)')
    g.add_argument('--sleep', type=int, default=def_sleep, help='delay ({}s)'.format(def_sleep))
    g.add_argument('--fanchor', nargs='+', metavar='FILEGLOB', default=list(), help='forced anchor glob, e.g. --fanchor ".*Makefile" etc/passwd')

    g = parser.add_argument_group('Other options')
    g.add_argument('--hashserver', nargs='*', default=def_hashserver, help='hashserver URL ({})'.format(def_hashserver))
    g.add_argument('--hashdb', default=None, help='path to local hashdb directory')
    g.add_argument('-X','--exclude-from', metavar='FILENAME', dest='excludefile', default=def_excludefile, help='Exclude file (for -X tar option) ({})'.format(def_excludefile))
    g.add_argument('--etag', default=False, action='store_true', help='verify HTTP E-Tag when reuse cached files')
    g.add_argument('--outdir', '-o', default=def_outdir, help='dir where to store --get files ({})'.format(def_outdir))
    g.add_argument('--file', '-f', default=None, help='filename (if already downloaded)')

    g = parser.add_argument_group('Target system specification')
    # The directories are joined onto the --prepack path below.
    g.add_argument('--skip', nargs='*', default=def_skipdirs, help='Do not try to dedup these dirs ({}). Relative to --prepack path'.format(def_skipdirs))

    g = parser.add_argument_group('Logging, output')
    g.add_argument('--logfile', default=None, help='log file name')
    g.add_argument('-v', dest='verbose', default=False, action='store_true', help='verbose mode')
    g.add_argument('-q', dest='quiet', default=False, action='store_true', help='quiet mode')

    args = parser.parse_args()

    hashdb = hashget.hashdb.HashDBClient(args.hashdb)
    anchors = hashget.AnchorList(args.anchsz)
    for fa in args.fanchor:
        anchors.add_fanchor(fa)

    for url in args.hashserver:
        hashdb.add_hashserver(url)

    # configure logging; -q wins over -v when both are given
    if args.verbose:
        loglevel = logging.DEBUG
    else:
        loglevel = logging.INFO

    if args.quiet:
        loglevel = logging.ERROR

    hashget.cacheget.opt_verify_etag = args.etag

    log = logging.getLogger('hashget')
    log.setLevel(loglevel)

    logstdout = logging.StreamHandler()
    logstdout.setFormatter(logging.Formatter('%(message)s', '%Y-%m-%d %H:%M:%S'))
    log.addHandler(logstdout)

    if args.logfile:
        fh = logging.FileHandler(args.logfile)
        fh.setFormatter(logging.Formatter('%(asctime)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S'))
        log.addHandler(fh)

    # NOTE: importing repositories from REPO_* environment variables and from
    # a --repo option is currently disabled (see the commented-out argument).

    if args.debcrawl:

        cnt_total = 0
        cnt_already = 0
        cnt_new = 0

        started = time.time()

        # ensure debsnap project exists
        hashdb.create_project('debsnap')

        debstatus = DebStatus(args.debcrawl)
        np = debstatus.n_installed
        print("Total: {} packages".format(np))

        for p in debstatus.packages_iter():

            cnt_total += 1

            if hashdb.sig_present('deb', p.signature, remote=False):
                # Count local hits so the summary below is meaningful.
                # Packages pulled from a hashserver or that failed to crawl
                # are only reflected in the total.
                cnt_already += 1
                log.debug('[{}/{}] local {}'.format(cnt_total, np, p.signature))
                continue

            if hashdb.pull_sig('deb', p.signature):
                log.info('[{}/{}] pulled {} from hashserver'.format(cnt_total, np, p.signature))
                continue

            url = p.url
            if url is None:
                log.warning('[{}/{}] FAILED to crawl {}'.format(cnt_total, np, p.signature))
                continue

            log.info("[{}/{}] crawl {}".format(cnt_total, np, p.url))

            anchors.clean_list()

            signatures = {
                'deb': p.signature
            }

            submit_url(
                url = p.url,
                hashdb = hashdb,
                project = 'debsnap',
                anchors = anchors,
                filesz = args.filesz,
                signatures = signatures)

            cnt_new += 1
            # be polite to the package server between downloads
            time.sleep(args.sleep)

        print("Crawling done in {:.2f}s. {} total, {} new, {} already in db.".format(time.time() - started, cnt_total, cnt_new, cnt_already))

    if args.prepack:

        skipdirs = [ os.path.join(args.prepack, d) for d in args.skip ]

        prepare(args.prepack,
            hashdb = hashdb,
            anchors = anchors,
            filesz = args.filesz,
            skipdirs = skipdirs,
            excludefile = args.excludefile,
            )

    if args.postunpack:
        postunpack(args.postunpack, hashdb = hashdb)

    if args.get:
        if args.get.startswith('sha256:'):
            get_by_hashspec(hashdb, args.get, args.outdir)
        else:
            # anything else is treated as a deb signature, with an optional
            # '.deb' suffix stripped
            if args.get.endswith('.deb'):
                sig = args.get[:-4]
            else:
                sig = args.get
            get_by_sig('deb', sig, args.outdir)

    if args.submit:

        if not args.project:
            log.error('need --project when --submit')
            sys.exit(1)

        # Pass the configured hashdb client, as the debcrawl path does.
        submit_url(
            url = args.submit,
            file = args.file,
            hashdb = hashdb,
            project = args.project,
            anchors = anchors,
            filesz = args.filesz)

    if args.debsubmit:
        debsubmit(hashdb, args.debsubmit, anchors)

# Run the CLI only when executed as a script, not when the module is imported.
if __name__ == '__main__':
    main()



