#!/usr/bin/env python
'''
This is a script which allows you to **quickly** download large
numbers of files from Amazon S3. It does this by opening a large
number of connections, and downloading in parallel.

It is not the best thing for small numbers of large files. There
are better tools for that.
'''

import Queue
import argparse
import os
import sys
import threading
import time

from boto.s3.connection import S3Connection

from s3clone import humanize

DEBUG = False

parser = argparse.ArgumentParser(description="Download a bucket from S3")
parser.add_argument('--workers', '-n', metavar='N', type=int,
                    help="Number of worker threads", default=100)
parser.add_argument('--prefix', '-p', metavar='prefix', type=str,
                    help="Prefix of files to get", default="")
parser.add_argument('bucket', metavar='<s3 bucket>', type=str,
                    help="S3 bucket to download")

args = parser.parse_args()

print "Workers", args.workers

conn = S3Connection()
bucket = conn.get_bucket(args.bucket)
items = bucket.list(prefix=args.prefix)


def download(key):
    '''
    Download a single file from Amazon S3.

    The object is written to the local path given by its key name
    (`key.name`). Missing parent directories are created first.

    Errors raised by `key.get_contents_to_filename` propagate to the
    caller.
    '''
    parent = os.path.dirname(key.name)
    # A key with no directory component has nothing to create.
    if parent:
        # Several worker threads may try to create the same directory at
        # once, so a check-then-create would race. Instead, always try to
        # create it and ignore the failure. Only OSError is ignored: if
        # the directory really could not be created, the download below
        # will fail when it tries to open the file, so errors are still
        # reported.
        try:
            os.makedirs(parent)
        except OSError:
            pass
    key.get_contents_to_filename(key.name)

task_queue = Queue.PriorityQueue()
lock = threading.Lock()
size = 0


def worker():
    while True:
        queue_item = task_queue.get()
        if DEBUG:
            print "Grabbing", queue_item
        try:
            download(queue_item)
            with lock:
                global size
                size = size - queue_item.size
        except:
            print "Download of ", queue_item, "failed"
            print sys.exc_info()
        task_queue.task_done()

print "Queuing bucket..."

directories = set()

for item in items:
    directories.add(os.path.dirname(item.name))

count = 0
for item in items:
    # S3 can't tell me if something is a bucket.
    # I used 3 heuristics. I think the first two might be sufficient.
    if item.name in directories:
        continue
    if item.name.endswith('/'):
        continue
    if item.content_type == 'application/x-directory':
        continue
    if os.path.exists(item.name):
        if os.stat(item.name).st_size != item.size:
            print item.name, ": Size mismatch",
            print item.size, os.stat(item.name).st_size
    else:
        task_queue.put(item, -item.size)
        count = count + 1
        size = size + item.size

total_size = size

if total_size == 0:
    print "Empty bucket"
    sys.exit(-1)
print "Running workers... (%s)".format(humanize.bytes(size))

for i in range(args.workers):
    t = threading.Thread(target=worker)
    t.daemon = True
    t.start()

print "Workers started..."

start_time = time.time()

while not task_queue.empty():
    time.sleep(0.1)
    print "Downloading. ", task_queue.unfinished_tasks,
    print " files remaining, (", count, ")",
    print humanize.bytes(size), "(", humanize.bytes(total_size),
    print ")", int(time.time()-start_time), "secs \r",


task_queue.join()

print "Done"
