#!python

import sys
import logging
import argparse

import requests
import requests_cache

import otmt
from otmt.version import __appversion__

def process_arguments(args):
    """Build the command-line parser and parse ``args``.

    Args:
        args: The full argument vector in ``sys.argv`` form. ``args[0]`` is
            used as the program name shown in help output; the remaining
            items are parsed as the command-line options.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.

    The parser reports an error (via ``parser.error``) when Simhash
    computation and content-length computation are both disabled and no
    TimeMap measures are set.
    """

    parser = argparse.ArgumentParser(prog="{}".format(args[0]),
        description='Detects off-topic webpages in a collection.',
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('-i', '--input', dest='input_type',
        required=True, type=otmt.process_input_types,
        help="The source of the input data. The following options are available:\n"
        "* warc=[warc-filenames separated by commas with no spaces] - EXPERIMENTAL\n"
        "* archiveit=[collection identifier or collection URI]\n"
        "* timemap=[URI of TimeMap]"
        )

    parser.add_argument('-o', '--output', dest='output_filename',
        required=True, help="The file name in which to store the results.")

    parser.add_argument('-d', '--directory', dest='working_directory',
        default=otmt.working_directory_default,
        help='The working directory holding the data being downloaded'
        ' and processed. If data is already here, it will be used in'
        ' lieu of any supplied input option.')

    parser.add_argument('-ot', '--output-type', dest='output_type',
        default='json', type=otmt.process_output_types,
        help="output type for off-topic analysis:\n"
        "* json - a JSON file containing the memento off-topic status(default)\n"
        "* csv - a CSV file containinig similar data to the JSON format"
        )

    parser.add_argument('--offtopic-file', dest='offtopic_file',
        default=None,
        help='If this is set, then only write the off-topic URI-Ms to this file.'
    )

    parser.add_argument('--ontopic-file', dest='ontopic_file',
        default=None,
        help='If this is set, then only write the on-topic URI-Ms to this file.'
    )

    # One help line per supported TimeMap measure
    tmmeasurehelp = "".join(
        "* {} - {}, default threshold {}\n".format(
            measure, details['name'], details['default threshold'])
        for measure, details in otmt.supported_timemap_measures.items())

    parser.add_argument('-tm', '--timemap-measures', dest='timemap_measures',
        type=otmt.process_timemap_similarity_measure_inputs,
        default='cosine',
        help="The TimeMap-based similarity measures specified will be used. \n"
        "For each of these measures, the first memento in a TimeMap\n"
        "is compared with each subsequent memento to measure topic drift.\n"
        "Specify measure with optional threshold separated by equals.\n"
        "Multiple measures can be specified.\n"
        "(e.g., jaccard=0.10,cosine=0.15,wcount);\n"
        "Leave thresholds off to use default thresholds.\n"
        "Accepted values:\n{}".format(tmmeasurehelp)
        )

    # Collection-based measures are currently disabled. The help text they
    # need is kept here, commented out, so nothing is computed for an option
    # that does not exist.
    #
    # cmmeasurehelp = "".join(
    #     "* {} - {}, default threshold {}\n".format(
    #         measure, details['name'], details['default threshold'])
    #     for measure, details in otmt.supported_collection_measures.items())
    #
    # parser.add_argument('-cm', '--collection-measures', dest='collection_measures',
    #     type=otmt.process_collection_similarity_measure_inputs,
    #     help="The Collection-based similarity measures specified will be used. \n"
    #     "For each of these measures, all content in the collection is taken into\n"
    #     "account when determining if a given memento is off topic.\n"
    #     "Specify measure with optional threshold separated by equals.\n"
    #     "Multiple measures can be specified.\n"
    #     "Leave thresholds off to use default thresholds.\n"
    #     "Accepted values:\n{}".format(cmmeasurehelp)
    #     )

    parser.add_argument('-l', '--logfile', dest='logfile',
        default=sys.stdout,
        help="The path to a logging file. The log is printed to screen by default.")

    parser.add_argument('-v', '--verbose', dest='verbose',
        action='store_true',
        help="This will raise the logging level to debug for more verbose output")

    parser.add_argument('-q', '--quiet', dest='quiet',
        action='store_true',
        help="This will lower the logging level to only show warnings or errors")

    parser.add_argument('-cf', '--cachefile', dest='cachefile',
        default='/tmp/otmt',
        help="The path to the SQLite cache file used to cache HTTP requests")

    parser.add_argument('--no-compute-simhashes', dest='compute_simhashes',
        action='store_false', default=True,
        help="Do not compute Simhashes of all raw memento content")

    parser.add_argument('--no-compute-lengths', dest='compute_content_length',
        action='store_false', default=True,
        help="Do not compute content lengths of all raw memento content")

    parser.add_argument('--no-detect-languages', dest='detect_languages',
        action='store_false', default=True,
        help="Do not perform language detection on raw memento content")

    parser.add_argument('--number-of-topics', dest="num_topics", type=int,
        help="The number of topics to use for gensim_lda and gensim_lsi, "
        "ignored if these measures are not requested.")

    parser.add_argument('--version', action='version',
        version=__appversion__)

    # Parse the vector that was handed in (minus the program name) instead of
    # implicitly re-reading sys.argv, so the ``args`` parameter is honoured
    parsed_args = parser.parse_args(args[1:])

    nothing_requested = (
        not parsed_args.compute_simhashes
        and not parsed_args.compute_content_length
        and parsed_args.timemap_measures is None
        # and parsed_args.collection_measures is None
    )

    if nothing_requested:
        parser.error("at least one analysis must be requested: \n"
            " supply -tm, or do not disable both Simhashes and content\n"
            " lengths (--no-compute-simhashes, --no-compute-lengths)")

    return parsed_args

def _get_mementos_with_status(measuremodel, status):
    """Collect the URI-Ms whose overall off-topic status equals ``status``.

    Every TimeMap URI reported by ``measuremodel`` is visited in order, and
    within it every memento URI, so the result keeps that iteration order.
    A URI-M whose status lookup raises ``KeyError`` is logged as an error
    and left out of the result.
    """

    # Prefer the logger configured by the script entry point when it exists;
    # otherwise fall back to a standard module logger so that a failed lookup
    # is reported instead of raising NameError on the undefined global
    log = globals().get("logger")
    if log is None:
        log = logging.getLogger(__name__)

    matching_mementos = []

    for urit in measuremodel.get_TimeMap_URIs():
        for urim in measuremodel.get_Memento_URIs_in_TimeMap(urit):

            try:
                memento_status = measuremodel.get_overall_off_topic_status(urim)
            except KeyError:
                log.error("failed to get {} status for URI-M {}".format(status, urim))
                continue

            if memento_status == status:
                matching_mementos.append(urim)

    return matching_mementos

def get_list_of_offtopic(measuremodel):
    """Return the list of URI-Ms that ``measuremodel`` marks as "off-topic"."""

    return _get_mementos_with_status(measuremodel, "off-topic")

def get_list_of_ontopic(measuremodel):
    """Return the list of URI-Ms that ``measuremodel`` marks as "on-topic"."""

    return _get_mementos_with_status(measuremodel, "on-topic")

if __name__ == '__main__':
    # Script entry point. The run proceeds in four stages:
    #   1. Acquire content using the input types specified
    #   2. Pass that content through the measures and thresholds specified
    #   3. Perform any additional calculations
    #   4. Save the results in the format specified

    args = process_arguments(sys.argv)

    # `logger` has to remain a module-level name: the URI-M listing functions
    # defined earlier in this file look it up as a global
    loglevel = otmt.calculate_loglevel(verbose=args.verbose, quiet=args.quiet)
    logger = otmt.get_logger(__name__, loglevel, args.logfile)

    logger.info("Starting topic analysis run.")
    logger.debug("command-line arguments: {}".format(args))

    # cache HTTP responses in the SQLite file chosen on the command line
    requests_cache.install_cache(args.cachefile, backend="sqlite")

    input_type, input_type_arguments = args.input_type[0], args.input_type[1]

    logger.info("Acquiring memento colleciton using input type {}".format(input_type))
    logger.info("TimeMap measures chosen: {}".format(args.timemap_measures))
    # logger.info("Collection measures chosen: {}".format(args.collection_measures))

    # Stage 1: the acquired content lives in a CollectionModel object
    cm = otmt.get_collection_model(
        input_type, input_type_arguments, args.working_directory)

    # Stage 2: each measure function receives the MeasureModel and hands
    # back the one to use from then on
    mm = otmt.MeasureModel()

    for measure in (args.timemap_measures or ()):

        logger.info("Processing mementos using TimeMap measure {}".format(measure))

        measure_details = otmt.supported_timemap_measures[measure]

        # only the gensim topic-model measures take a topic count
        measure_kwargs = {}
        if measure in ("gensim_lda", "gensim_lsi"):
            if args.num_topics:
                measure_kwargs["num_topics"] = int(args.num_topics)
            else:
                measure_kwargs["num_topics"] = \
                    measure_details["default number of topics"]

        mm = measure_details["function"](cm, mm, **measure_kwargs)

        mm.calculate_offtopic_by_measure(
            "timemap measures",
            measure,
            args.timemap_measures[measure],
            measure_details["comparison direction"],
        )

    # Collection-based measures are disabled for now:
    #
    # if args.collection_measures:
    #     for measure in args.collection_measures:
    #         logger.info("Processing mementos using Collection measure {}".format(measure))
    #         mm = otmt.supported_collection_measures[measure]["function"](
    #             cm, mm)
    #         threshold = args.collection_measures[measure]
    #         mm.calculate_offtopic_by_measure(
    #             "collection measures", measure, threshold,
    #             otmt.supported_collection_measures[measure]["comparison direction"]
    #         )

    mm.calculate_overall_offtopic_status()

    # Stage 3: optional extra calculations, each switchable from the
    # command line
    if args.compute_simhashes:
        logger.info("computing Simhashes")
        mm = otmt.compute_Simhashes(cm, mm)

    if args.compute_content_length:
        logger.info("computing content lengths")
        mm = otmt.compute_raw_content_lengths(cm, mm)

    if args.detect_languages:
        logger.info("detecting languages")
        mm = otmt.detect_languages(cm, mm)

    # memento datetimes are always extracted
    mm = otmt.extract_memento_datetimes(cm, mm)

    # Stage 4: hand the models to the writer registered for the output type
    logger.info("saving ouput as type {}".format(args.output_type))

    write_output = otmt.supported_output_types[args.output_type]
    write_output(args.output_filename, mm, cm)

    logger.info("output written to {}".format(args.output_filename))

    # optional plain-text listings, one URI-M per line
    if args.offtopic_file:

        offtopic_mementos = get_list_of_offtopic(mm)

        with open(args.offtopic_file, 'w') as offtopic_output:
            offtopic_output.writelines(
                "{}\n".format(urim) for urim in offtopic_mementos)

        logger.info("off-topic URI-Ms saved to {}".format(args.offtopic_file))

    if args.ontopic_file:

        ontopic_mementos = get_list_of_ontopic(mm)

        with open(args.ontopic_file, 'w') as ontopic_output:
            ontopic_output.writelines(
                "{}\n".format(urim) for urim in ontopic_mementos)

        logger.info("on-topic URI-Ms saved to {}".format(args.ontopic_file))

    logger.info("Finished analysis run")
