#!/usr/bin/env python
# -*- coding: utf-8
"""Get sequences for all HMM hits in a given bin.

   This program takes a profile database, a collection ID, and a bin name, and an
   HMM source, and returnes sequences of HMM hits. This program is useful when you
   want to get actual sequencs for each single-copy gene hit in a particular genome
   bin.
"""

import os
import sys
import argparse

import anvio
import anvio.utils as utils
import anvio.terminal as terminal
import anvio.ccollections as ccollections

from anvio.dbops import ContigsSuperclass
from anvio.hmmops import SequencesForHMMHits
from anvio.errors import ConfigError, FilesNPathsError


__author__ = "A. Murat Eren"
__copyright__ = "Copyright 2015, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "a.murat.eren@gmail.com"


run = terminal.Run()
progress = terminal.Progress()


def main(args):
    if args.list_available_hmm_sources:
        info_table = SequencesForHMMHits(args.contigs_db).hmm_hits_info
        for source in info_table:
            t = info_table[source]
            run.info_single('%s [type: %s] [num genes: %d]' % (source, t['search_type'], len(t['genes'].split(','))))
        sys.exit(0)

    if args.list_available_gene_names:
        info_table = SequencesForHMMHits(args.contigs_db).hmm_hits_info
        for source in info_table:
            t = info_table[source]
            run.info_single('%s [type: %s]: %s' % (source, t['search_type'], ', '.join(sorted(t['genes'].split(',')))), nl_after = 2)
        sys.exit(0)

    gene_names = set([g.strip() for g in args.gene_names.split(',')]) if args.gene_names else set([])

    if args.profile_db and not args.collection_name:
        raise ConfigError, "You can't use this program with a profile database but without a collection name. Yes. Because."

    if args.profile_db:
        splits_dict = ccollections.GetSplitNamesInBins(args).get_dict()
        run.info('Init', '%d splits in %d bin(s)' % (sum([len(v) for v in splits_dict.values()]), len(splits_dict)))
    else:
        contigs_db = ContigsSuperclass(args, r = run, p = progress)
        splits_dict = {os.path.basename(args.contigs_db[:-3]): contigs_db.splits_basic_info.keys()}

    sources = set([s.strip() for s in args.hmm_sources.split(',')]) if args.hmm_sources else set([])
    s = SequencesForHMMHits(args.contigs_db, sources = sources)

    hmm_sequences_dict = s.get_hmm_sequences_dict_for_splits(splits_dict)
    run.info('Num hits', '%d found for %d source(s)' % (len(hmm_sequences_dict), len(sources)))

    if len(gene_names):
        hmm_sequences_dict = utils.get_filtered_dict(hmm_sequences_dict, 'gene_name', gene_names)
        run.info('Filtered hits', '%d hits remain after filtering for %d gene(s)' % (len(hmm_sequences_dict), len(gene_names)))

    if not hmm_sequences_dict:
        raise ConfigError, "Your selections resulted in 0 hits. There is nothing to report. Are you\
                            sure you have the right set of gene names and sources? If you\
                            select an HMM source, and then use gene names that belong to another\
                            source, the intersection of the two can be empty. Just saying."

    run.info('Result', '%d hits for %d source(s)' % (len(hmm_sequences_dict), len(s.sources)))

    s.store_hmm_sequences_into_FASTA(hmm_sequences_dict, args.output_file)
    run.info('Output', args.output_file)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Get sequences for HMM hits")

    parser.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db'))
    parser.add_argument(*anvio.A('profile-db'), **anvio.K('profile-db', {'required': False}))
    parser.add_argument(*anvio.A('collection-name'), **anvio.K('collection-name'))
    parser.add_argument(*anvio.A('bin-id'), **anvio.K('bin-id'))
    parser.add_argument(*anvio.A('bin-ids-file'), **anvio.K('bin-ids-file'))
    parser.add_argument(*anvio.A('hmm-sources'), **anvio.K('hmm-sources'))
    parser.add_argument(*anvio.A('gene-names'), **anvio.K('gene-names'))
    parser.add_argument(*anvio.A('output-file'), **anvio.K('output-file', {'required': True}))
    parser.add_argument(*anvio.A('list-available-hmm-sources'), **anvio.K('list-available-hmm-sources'))
    parser.add_argument(*anvio.A('list-available-gene-names'), **anvio.K('list-available-gene-names'))

    args = parser.parse_args() 
    
    try:
        main(args)
    except ConfigError, e:
        print e
        sys.exit(-1)
    except FilesNPathsError, e:
        print e
        sys.exit(-1)
