#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""A script to import collections (and their colors)"""

import sys
from collections import Counter

import anvio
import anvio.tables as t
import anvio.dbops as dbops
import anvio.utils as utils
import anvio.terminal as terminal
import anvio.filesnpaths as filesnpaths

from anvio.errors import ConfigError, FilesNPathsError


# Module-level metadata; the version is taken from the installed anvio package.
__author__ = "A. Murat Eren"
__copyright__ = "Copyright 2015, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__email__ = "a.murat.eren@gmail.com"


# Shared terminal.Run instance; the functions below use it to print info lines and warnings.
run = terminal.Run()


def main(args):
    """Import an external binning result into an anvi'o database as a collection.

    Steps, in order:

      1. validate `args` with sanity_check(),
      2. read the two-column TAB-delimited input file (item name, bin name),
      3. optionally read the three-column bins info file (bin name, source,
         html color),
      4. read the splits and contigs info tables from the contigs database,
      5. keep only the input names that also exist in the database, and in
         contigs mode replace every contig with the splits that belong to it,
      6. append the resulting {bin name: set of split names} dictionary as a
         collection to the profile database if one was given, otherwise to the
         contigs database.

    Parameters
    ==========
    args : argparse.Namespace-like object
        The attributes read here are `data`, `bins_info`, `contigs_db`,
        `profile_db`, `collection_name`, and `contigs_mode`.

    Raises
    ======
    ConfigError
        If the bins info file cannot be read, or if the input file and the
        database have no name in common.
    """
    sanity_check(args)

    # what one row of the input file describes; only used to build messages
    item_type = 'contigs' if args.contigs_mode else 'splits'

    # read the input file with split/contig - bin ID associations
    input_data_file_content = utils.get_TAB_delimited_file_as_dictionary(args.data, no_header = True, column_names = ['split_id', 'bin_name'])

    # populate bins_info_dict if there is any information about bins
    bins_info_dict = {}
    if args.bins_info:
        try:
            bins_info_dict = utils.get_TAB_delimited_file_as_dictionary(args.bins_info, no_header = True, column_names = ['bin_name', 'source', 'html_color'])
        except Exception as e:
            raise ConfigError("Someone was not happy with the TAB-delimited bins info file you provided. Here\
                                is the complaint: %s" % e)

    contigs_db = dbops.ContigsDatabase(args.contigs_db)
    splits_basic_info = contigs_db.db.get_table_as_dict(t.splits_info_table_name)
    contigs_basic_info = contigs_db.db.get_table_as_dict(t.contigs_info_table_name, string_the_key = True)
    contig_name_to_splits_dict = utils.get_contig_name_to_splits_dict(splits_basic_info, contigs_basic_info)
    contigs_db.disconnect()

    bin_names_in_input = set(entry['bin_name'] for entry in input_data_file_content.values())

    run.info('Num contigs in the database', len(contigs_basic_info))
    run.info('Num splits in the database', len(splits_basic_info))
    run.info('Num %s in the input file' % item_type, len(input_data_file_content))
    run.info('Num bins the input file describes', len(bin_names_in_input))

    # lets make sure names in the input file are relevant to names in the database (whether it is contigs, or splits):
    input_names = set(input_data_file_content.keys())
    db_names = set(contigs_basic_info.keys() if args.contigs_mode else splits_basic_info.keys())

    entry_names_overlap_between_input_and_db = db_names & input_names

    if not entry_names_overlap_between_input_and_db:
        # show one arbitrary name from each side so the user can spot the naming mismatch
        raise ConfigError("There is no overlap between the %(item)s names found in your input file\
                            and %(item)s names found in the database. This is odd! For instance one\
                            of the names from your file looks like this: '%(from_file)s', and this is\
                            an example name from the database: '%(from_db)s'. Please issue a report if\
                            you are almost certain that this is anvi'o's fault."
                                % {'item': item_type, 'from_file': input_names.pop(), 'from_db': db_names.pop()})

    run.info('Num %s both input file and the db has' % item_type, len(entry_names_overlap_between_input_and_db))

    # bin name -> set of split names that belong to it
    data = {}

    # populate the data dictionary
    for entry_name in input_data_file_content:
        # names that the database does not know about are silently skipped
        if entry_name not in entry_names_overlap_between_input_and_db:
            continue

        bin_name = input_data_file_content[entry_name]['bin_name']
        splits_in_bin = data.setdefault(bin_name, set())

        if args.contigs_mode:
            # a contig is stored as the collection of all of its splits
            splits_in_bin.update(contig_name_to_splits_dict[entry_name])
        else:
            splits_in_bin.add(entry_name)

    if args.contigs_mode:
        num_splits_described = sum(len(split_names) for split_names in data.values())
        run.info_single('%d of %d contig names in the input file that matched to contig names in the database\
                         resolved into %d split names that described in %d bins.' % (len(entry_names_overlap_between_input_and_db),
                                                                                     len(input_data_file_content),
                                                                                     num_splits_described,
                                                                                     len(data)), nl_before = 1)

    # the collection goes into the profile database when there is one, and into the contigs database otherwise
    if args.profile_db:
        collections_table = dbops.TablesForCollections(args.profile_db, anvio.__profile__version__)
    else:
        collections_table = dbops.TablesForCollections(args.contigs_db, anvio.__contigs__version__)

    collections_table.append(args.collection_name, data, bins_info_dict)


def sanity_check(args):
    """Validate the command line arguments before any import work is done.

    Checks, in order: a collection name and a contigs database were given;
    the collection name passes utils.check_sample_id(); the profile database
    (when given) and the contigs database pass the dbops checks; the input
    file (and the bins info file, when given) are TAB-delimited with 2 (and 3)
    fields; and no name in the first column of the input file appears more
    than once.

    A warning is printed when no profile database is given, since the
    collection will then be stored in the contigs database.

    Parameters
    ==========
    args : argparse.Namespace-like object
        The attributes read here are `collection_name`, `contigs_db`,
        `profile_db`, `data`, `bins_info`, and `contigs_mode`.

    Raises
    ======
    ConfigError
        If a required argument is missing, the collection name is not
        acceptable, or a name occurs more than once in the input file.
        The dbops and filesnpaths helpers may raise their own errors.
    """
    if not args.collection_name:
        raise ConfigError("You must give a name for this collection.")

    if not args.contigs_db:
        raise ConfigError("You must at least provide an anvi'o' contigs database for this to work :(")

    if not args.profile_db:
        run.warning("Since you haven't provided an anvi'o profile database, this program will add your collection into\
                     the contigs database you provided. If you use the same collection name later in one of your profile\
                     databases that will be generated from this contigs database, things may go South, and anvi'o would\
                     not even care.")

    try:
        utils.check_sample_id(args.collection_name)
    except Exception:
        # `Exception` rather than a bare except, so that KeyboardInterrupt and
        # SystemExit are not reported as a bad collection name
        raise ConfigError('"%s" is not a proper collection name. A proper one should be a single word and not contain\
                            ANY characters but digits, ASCII letters and underscore character(s). There should not be\
                            any space characters, and the collection name should not start with a digit.' % args.collection_name)

    if args.profile_db:
        dbops.is_profile_db(args.profile_db)
    dbops.is_contigs_db(args.contigs_db)

    filesnpaths.is_file_tab_delimited(args.data, expected_number_of_fields = 2)
    if args.bins_info:
        filesnpaths.is_file_tab_delimited(args.bins_info, expected_number_of_fields = 3)

    # count how many times each name in the first column appears; the file is
    # closed as soon as the counting is done
    with open(args.data) as input_file:
        num_occurences_of_entries = Counter(line.split('\t')[0] for line in input_file)

    # any() instead of max() so that an empty file does not raise a ValueError here
    if any(count > 1 for count in num_occurences_of_entries.values()):
        raise ConfigError("Some %(item)s names occur more than once in the input file. A %(item)s cannot belong in two\
                            bins, and neither there should be the same bin assignment for a given %(item)s. Long story\
                            short, each name should appear only once in your input file, and it is not the case :/"
                                                                        % {'item': 'contig' if args.contigs_mode else 'split'})



if __name__ == '__main__':
    import argparse

    # Command line interface. Apart from the positional input file, the flags
    # and keyword arguments of every option come from anvio.A() and anvio.K()
    # (presumably anvi'o's shared argument definitions -- confirm in anvio).
    parser = argparse.ArgumentParser(description="Import an external binning result into anvi'o")

    parser.add_argument('data', metavar = "TAB DELIMITED FILE",
                        help = 'The input file that describes bin IDs for each split or contig.')

    parser.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db'))
    parser.add_argument(*anvio.A('profile-db'), **anvio.K('profile-db', {'required': False}))
    parser.add_argument(*anvio.A('collection-name'), **anvio.K('collection-name', {'required': True}))
    parser.add_argument(*anvio.A('bins-info'), **anvio.K('bins-info'))
    parser.add_argument(*anvio.A('contigs-mode'), **anvio.K('contigs-mode'))

    args = parser.parse_args()

    # Known anvi'o errors are printed without a traceback and mapped to
    # distinct exit statuses: -1 for ConfigError, -2 for FilesNPathsError.
    # `except ... as e` and `print(e)` behave the same on Python 2 and are
    # also valid Python 3.
    try:
        main(args)
    except ConfigError as e:
        print(e)
        sys.exit(-1)
    except FilesNPathsError as e:
        print(e)
        sys.exit(-2)
