#!/usr/bin/env python
"""
Merge multiple metapeptide database files into a single metapeptide database. Optionally, filter simultaneously.
"""

import argparse
import logging
from datetime import datetime
from sixgill import metapeptides
import sys
from Bio import bgzf
from sixgill import __version__

# module metadata
__author__ = "Damon May"
__copyright__ = "Copyright (c) 2016 Damon May"
__license__ = "Apache 2.0"

# module-level logger; main() raises its level to DEBUG when --debug is given
logger = logging.getLogger(__name__)


def declare_gather_args():
    """
    Build the command-line parser for this script and parse the arguments.
    No validation is performed here beyond what argparse does on its own.
    return: the argparse namespace holding the parsed argument values
    """
    arg_parser = argparse.ArgumentParser(description=__doc__)

    # positional inputs and the mandatory output; argparse opens these as file objects
    arg_parser.add_argument('metapeptidedbfiles', nargs='+', type=argparse.FileType('r'),
                            help='input metapeptide database files')
    arg_parser.add_argument('--out', required=True, type=argparse.FileType('w'),
                            help='output file')

    # diagnostics
    arg_parser.add_argument('--debug', help='Enable debug logging', action="store_true")
    version_string = '%(prog)s {0}'.format(__version__)
    arg_parser.add_argument('--version', action='version', version=version_string)

    parsed_args = arg_parser.parse_args()
    return parsed_args


def main():
    """
    Merge the input metapeptide databases into one and write the result.

    Reads every input database named on the command line, collapsing metapeptides
    that share a sequence into a single entry, then writes all unique entries to
    the bgzf output file named by --out. Progress is reported on stdout.
    """
    args = declare_gather_args()
    # logging
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s: %(message)s")
    if args.debug:
        logger.setLevel(logging.DEBUG)
        # any module-specific debugging goes below

    script_start_time = datetime.now()
    logger.debug("Start time: %s", script_start_time)

    # argparse has already opened every file, but all real I/O below goes through bgzf
    # using the file names. Release argparse's handles so they are not leaked.
    # The standard streams (given as '-') are left alone: print() still needs stdout.
    metapeptide_filenames = []
    for argparse_file in args.metapeptidedbfiles:
        metapeptide_filenames.append(argparse_file.name)
        if argparse_file is not sys.stdin:
            argparse_file.close()
    out_filename = args.out.name
    if args.out is not sys.stdout:
        args.out.close()

    # map from metapeptide sequence to the merged metapeptide with that sequence
    metapeptideseq_metapeptide_map = {}
    # running total over all files; incremented exactly once per metapeptide read
    n_lines_read = 0
    for metapeptide_filename in metapeptide_filenames:
        metapeptide_file = bgzf.BgzfReader(metapeptide_filename)
        try:
            print("Reading file %s..." % metapeptide_filename)
            n_lines_this_file = 0
            for metapeptide in metapeptides.read_metapeptides(metapeptide_file):
                if n_lines_read % 5000000 == 0:
                    print("    Read %d metapeptides (all files). %d unique metapeptides...." %
                          (n_lines_read, len(metapeptideseq_metapeptide_map)))
                    sys.stdout.flush()
                n_lines_this_file += 1
                n_lines_read += 1

                if metapeptide.sequence in metapeptideseq_metapeptide_map:
                    # not a new one. Update our recordkeeping
                    existing_metapeptide = metapeptideseq_metapeptide_map[metapeptide.sequence]
                    existing_metapeptide.metagene_score = max(existing_metapeptide.metagene_score,
                                                              metapeptide.metagene_score)
                    # NOTE(review): only the first read id of the duplicate is carried over --
                    # confirm that entries being merged never hold more than one read id.
                    existing_metapeptide.read_ids.append(metapeptide.read_ids[0])
                    # NOTE(review): keeps the larger of the two min_qualscore values --
                    # presumably "best supporting read"; confirm max() rather than min() is intended.
                    existing_metapeptide.min_qualscore = max(existing_metapeptide.min_qualscore,
                                                             metapeptide.min_qualscore)
                else:
                    # Got a new one. Put it in the map
                    metapeptideseq_metapeptide_map[metapeptide.sequence] = metapeptide
        finally:
            metapeptide_file.close()
        print("Read %d lines this file" % n_lines_this_file)

    print("Done reading input databases. Read %d metapeptide lines, %d unique metapeptides" %
          (n_lines_read, len(metapeptideseq_metapeptide_map)))

    out_metapeptide_file = bgzf.BgzfWriter(out_filename, "w")
    try:
        print("Building output file %s with count, min quality data..." % out_filename)
        # header row, then one row per unique metapeptide
        out_metapeptide_file.write("\t".join(metapeptides.METAPEPTIDEDB_COLUMNS) + '\n')
        for metapeptide in metapeptideseq_metapeptide_map.values():
            out_metapeptide_file.write(metapeptide.make_output_line() + "\n")
    finally:
        # always close so buffered blocks are flushed even if a write fails
        out_metapeptide_file.close()

    print("Wrote %d entries to metapeptide database file %s." % (len(metapeptideseq_metapeptide_map), out_filename))

    print("Done.")


if __name__ == "__main__":
    # Run the merge only when executed as a script, so that importing this
    # module does not parse sys.argv or touch any files.
    main()
