#!python

import sys
import logging
import argparse

import otmt

def _describe_measures(measures):
    """Return one help-text bullet line per entry of *measures*.

    Each entry is read for its 'name' and 'default threshold' keys; every
    line is newline-terminated so the result can be appended to help text.
    """
    return "".join(
        "* {} - {}, default threshold {}\n".format(
            key, measures[key]['name'], measures[key]['default threshold'])
        for key in measures
    )


def process_arguments(args):
    """Parse the command-line arguments for an off-topic detection run.

    Args:
        args: argv-style sequence; args[0] is used as the program name shown
            in usage/help text and args[1:] are the options to parse.

    Returns:
        The argparse namespace with the attributes input_type,
        output_filename, working_directory, output_type, timemap_measures,
        logfile, verbose, quiet, compute_simhashes, compute_content_length
        and num_topics.

    Raises:
        SystemExit: raised by argparse when the arguments are invalid, or
            when none of -tm, --compute-lengths, or --compute-simhashes
            was supplied.
    """

    parser = argparse.ArgumentParser(prog="{}".format(args[0]),
        description='Detects off-topic webpages in a collection.',
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('-i', '--input', dest='input_type',
        required=True, type=otmt.process_input_types,
        help="The source of the input data. The following options are available:\n"
        "* warc=[warc-filenames separated by commas with no spaces] - EXPERIMENTAL\n"
        "* archiveit=[collection identifier or collection URI]\n"
        "* timemap=[URI of TimeMap]"
        )

    parser.add_argument('-o', '--output', dest='output_filename',
        required=True, help="The file name in which to store the results.")

    parser.add_argument('-d', '--directory', dest='working_directory',
        default=otmt.working_directory_default,
        help='The working directory holding the data being downloaded'
        ' and processed. If data is already here, it will be used in'
        ' lieu of any supplied input option.')

    parser.add_argument('-ot', '--output-type', dest='output_type',
        default='json', type=otmt.process_output_types,
        help="output type for off-topic analysis:\n"
        "* json - a JSON file containing the memento off-topic status(default)\n"
        "* csv - a CSV file containinig similar data to the JSON format"
        )

    tmmeasurehelp = _describe_measures(otmt.supported_timemap_measures)

    parser.add_argument('-tm', '--timemap-measures', dest='timemap_measures',
        type=otmt.process_timemap_similarity_measure_inputs,
        help="The TimeMap-based similarity measures specified will be used. \n"
        "For each of these measures, the first memento in a TimeMap\n"
        "is compared with each subsequent memento to measure topic drift.\n"
        "Specify measure with optional threshold separated by equals.\n"
        "Multiple measures can be specified.\n"
        "(e.g., jaccard=0.10,cosine=0.15,wcount);\n"
        "Leave thresholds off to use default thresholds.\n"
        "Accepted values:\n{}".format(tmmeasurehelp)
        )

    # Collection-based measures are currently disabled; the option is kept
    # here, commented out, together with the help text it would need.
    # cmmeasurehelp = _describe_measures(otmt.supported_collection_measures)
    #
    # parser.add_argument('-cm', '--collection-measures', dest='collection_measures',
    #     type=otmt.process_collection_similarity_measure_inputs,
    #     help="The Collection-based similarity measures specified will be used. \n"
    #     "For each of these measures, all content in the collection is taken into\n"
    #     "account when determining if a given memento is off topic.\n"
    #     "Specify measure with optional threshold separated by equals.\n"
    #     "Multiple measures can be specified.\n"
    #     "Leave thresholds off to use default thresholds.\n"
    #     "Accepted values:\n{}".format(cmmeasurehelp)
    #     )

    parser.add_argument('-l', '--logfile', dest='logfile',
        default=sys.stdout,
        help="The path to a logging file. The log is printed to screen by default.")

    parser.add_argument('-v', '--verbose', dest='verbose',
        action='store_true',
        help="This will raise the logging level to debug for more verbose output")

    parser.add_argument('-q', '--quiet', dest='quiet',
        action='store_true',
        help="This will lower the logging level to only show warnings or errors")

    parser.add_argument('--compute-simhashes', dest='compute_simhashes',
        action='store_true',
        help="Compute Simhashes of all raw memento content")

    parser.add_argument('--compute-lengths', dest='compute_content_length',
        action='store_true',
        help="Compute content lengths of all raw memento content")

    parser.add_argument('--number-of-topics', dest="num_topics", type=int,
        help="The number of topics to use for gensim_lda and gensim_lsi, "
        "ignored if these measures are not requested.")

    # Parse the arguments we were handed rather than implicitly re-reading
    # sys.argv; args[0] is the program name, so it is skipped.
    parsed = parser.parse_args(args[1:])

    # At least one kind of work must be requested, otherwise the run would
    # produce nothing useful.
    nothing_requested = (
        not parsed.compute_simhashes
        and not parsed.compute_content_length
        and parsed.timemap_measures is None
        # and parsed.collection_measures is None
    )

    if nothing_requested:
        parser.error("must supply one of these options: \n"
            " -tm, --compute-lengths, or --compute-simhashes")

    return parsed

if __name__ == '__main__':
    # Overall flow of this script:
    #   1. Acquire content using the input types specified
    #   2. Pass that content through the measures and thresholds specified
    #   3. Perform any additional calculations
    #   4. Save the results in the format specified

    args = process_arguments(sys.argv)

    # Set up logging for the rest of the system; the level is derived from
    # the verbose and quiet flags.
    loglevel = otmt.calculate_loglevel(verbose=args.verbose, quiet=args.quiet)
    logger = otmt.get_logger(__name__, loglevel, args.logfile)

    logger.info('Starting topic analysis run.')
    logger.debug('command-line arguments: {}'.format(args))

    # The parsed input option carries the input type first and its
    # arguments second.
    source_kind, source_arguments = args.input_type[0], args.input_type[1]

    logger.info("Acquiring memento colleciton using input type {}".format(source_kind))
    logger.info("TimeMap measures chosen: {}".format(args.timemap_measures))
    # logger.info("Collection measures chosen: {}".format(args.collection_measures))

    # 1. Acquire content using the input types specified;
    # the content is stored in a CollectionModel object.
    cm = otmt.get_collection_model(
        source_kind, source_arguments, args.working_directory)

    # 2. Pass that content through the measures and thresholds specified;
    # the results are stored in a MeasureModel object.
    mm = otmt.MeasureModel()

    # Only these measures accept a number-of-topics setting.
    topic_model_measures = ("gensim_lda", "gensim_lsi")

    for measure in args.timemap_measures or ():

        logger.info("Processing mementos using TimeMap measure {}".format(measure))

        extra_options = {}
        if measure in topic_model_measures:
            # Prefer the command-line value; otherwise use the measure's own default.
            extra_options["num_topics"] = (
                int(args.num_topics) if args.num_topics
                else otmt.supported_timemap_measures[measure]["default number of topics"]
            )

        compute_measure = otmt.supported_timemap_measures[measure]["function"]
        mm = compute_measure(cm, mm, **extra_options)

        threshold = args.timemap_measures[measure]
        direction = otmt.supported_timemap_measures[measure]["comparison direction"]

        mm.calculate_offtopic_by_measure(
            "timemap measures", measure, threshold, direction)

    # Collection-based measures are currently disabled:
    #
    # if args.collection_measures:
    #
    #     for measure in args.collection_measures:
    #
    #         logger.info("Processing mementos using Collection measure {}".format(measure))
    #
    #         mm = otmt.supported_collection_measures[measure]["function"](
    #             cm, mm)
    #
    #         threshold = args.collection_measures[measure]
    #
    #         mm.calculate_offtopic_by_measure(
    #             "collection measures", measure, threshold,
    #             otmt.supported_collection_measures[measure]["comparison direction"]
    #         )

    mm.calculate_overall_offtopic_status()

    # 3. Perform any additional calculations
    if args.compute_simhashes:
        logger.info("computing Simhashes")
        mm = otmt.save_Simhashes(cm, mm)

    if args.compute_content_length:
        logger.info("computing content lengths")
        mm = otmt.save_raw_content_lengths(cm, mm)

    # 4. Save the results in the format specified
    logger.info("saving ouput as type {}".format(args.output_type))

    write_output = otmt.supported_output_types[args.output_type]
    write_output(args.output_filename, mm, cm)

    logger.info("output written to {}".format(args.output_filename))
    logger.info("Finished analysis run")