#!/usr/bin/env python3

import logging
import time

import click

from condiga_utils.contig_utils import get_contig_lengths
from condiga_utils.coverage_utils import get_coverage
from condiga_utils.gene_utils import (align_genes_to_refs, get_aa_gene_seqs,
                                      get_genes_mapped_to_species,
                                      write_aa_gene_seqs, write_nt_gene_seqs)
from condiga_utils.genome_utils import (download_genomes, get_ref_ids,
                                        get_ref_lengths,
                                        rename_and_copy_genomes)
from condiga_utils.species_utils import get_species_stats
from condiga_utils.taxa_utils import get_taxa_result

__author__ = "Vijini Mallawaarachchi and Yu Lin"
__copyright__ = "Copyright 2022, ConDiGA Project"
__license__ = "MIT"
__version__ = "0.2.1"
__maintainer__ = "Vijini Mallawaarachchi"
__email__ = "viji.mallawaarachchi@gmail.com"
__status__ = "Development"

# Sample command
# -------------------------------------------------------------------

# Setup arguments
# ----------------------------------------------------------------------


@click.command()
@click.option(
    "--contigs",
    "-c",
    required=True,
    help="path to the contigs file",
    type=click.Path(exists=True),
)
@click.option(
    "--taxa",
    "-ta",
    required=True,
    help="path to the taxanomic classification results file",
    type=click.Path(exists=True),
)
@click.option(
    "--genes",
    "-g",
    required=True,
    help="path to the genes file",
    type=click.Path(exists=True),
)
@click.option(
    "--coverages",
    "-cov",
    required=True,
    help="path to the contig coverages file",
    type=click.Path(exists=True),
)
@click.option(
    "--assembly-summary",
    "-as",
    required=True,
    help="path to the assembly_summary.txt file",
    type=click.Path(exists=True),
)
@click.option(
    "--rel-abundance",
    "-ra",
    default=0.0001,
    show_default=True,
    required=False,
    help="minimum relative abundance cut-off",
    type=click.FloatRange(0, 1),
)
@click.option(
    "--genome-coverage",
    "-gc",
    default=0.001,
    show_default=True,
    required=False,
    help="minimum genome coverage cut-off",
    type=click.FloatRange(0, 1),
)
@click.option(
    "--map-threshold",
    "-mt",
    default=0.5,
    show_default=True,
    required=False,
    help="minimum mapping length threshold cut-off",
    type=click.FloatRange(0, 1),
)
@click.option(
    "--nthreads",
    "-t",
    default=8,
    show_default=True,
    required=False,
    help="number of threads to use",
    type=int,
)
@click.option(
    "--output",
    "-o",
    required=True,
    help="path to the output folder",
    type=click.Path(exists=True),
)
@click.version_option(__version__, "-v", "--version", is_flag=True)
def main(contigs, taxa, genes, coverages, assembly_summary, rel_abundance, genome_coverage, map_threshold, nthreads, output):

    """ConDiGA: Contigs directed gene annotation for accurate protein sequence database construction in metaproteomics."""

    # Setup logger
    # ----------------------------------------------------------------------

    logger = logging.getLogger("condiga 0.2.1")
    logger.setLevel(logging.DEBUG)
    logging.captureWarnings(True)
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    consoleHeader = logging.StreamHandler()
    consoleHeader.setFormatter(formatter)
    consoleHeader.setLevel(logging.INFO)
    logger.addHandler(consoleHeader)

    # Setup output path for log file
    fileHandler = logging.FileHandler(f"{output}/condiga.log")
    fileHandler.setLevel(logging.DEBUG)
    fileHandler.setFormatter(formatter)
    logger.addHandler(fileHandler)

    logger.info(f"Welcome to ConDiGA: Contigs directed gene annotation for accurate protein sequence database construction in metaproteomics.")

    logger.info(f"Input arguments: ")
    logger.info(f"Contigs file: {contigs}")
    logger.info(f"Taxanomic classification results file: {taxa}")
    logger.info(f"Genes file: {genes}")
    logger.info(f"Contig coverages file: {coverages}")
    logger.info(f"Path to assembly_summary.txt file: {assembly_summary}")
    logger.info(f"Minimum relative abundance cut-off: {rel_abundance}")
    logger.info(f"Minimum genome coverage cut-off: {genome_coverage}")
    logger.info(f"Minimum mapping length threshold cut-off: {map_threshold}")
    logger.info(f"Output folder: {output}")

    start_time = time.time()

    # Get contig lengths
    # ----------------------------------------------------------------------

    logger.info(f"Obtaining contig lengths and k value")
    contig_lengths, k_val = get_contig_lengths(contigs)
    logger.info(f"k value of contigs: {k_val}")

    # Get kraken result
    # ----------------------------------------------------------------------

    logger.info(f"Processing taxids from taxanomic classification results")
    taxid_total_len, species_names_taxid, taxid_list, contig_taxid, taxid_contigs, taxid_to_species, species_names_taxid_length = get_taxa_result(taxa, contig_lengths)
    logger.info(f"{len(taxid_list)} taxid were found in the taxanomic classification result")

    # Download genomes of taxids
    # ----------------------------------------------------------------------
    
    logger.info(f"Downloading genomes of taxids")
    taxid_dates, taxid_urls, taxid_file_path, taxid_assembly_level, taxid_present = download_genomes(taxid_list, assembly_summary, output)

    # Get length of references
    # ----------------------------------------------------------------------

    logger.info(f"Obtaining lengths of downloaded reference genomes")
    taxid_file_len = get_ref_lengths(taxid_present, taxid_file_path)

    # Get contig coverage values
    # ----------------------------------------------------------------------

    logger.info(f"Getting contig coverage values")
    contig_coverages = get_coverage(coverages)
    
    # Get species genome coverage and relative abundance
    # ----------------------------------------------------------------------

    logger.info(f"Getting species genome coverage and relative abundance")
    species_genome_coverages, species_rel_abundance = get_species_stats(species_names_taxid_length, taxid_file_len, taxid_contigs, contig_lengths, contig_coverages)

    # Rename and copy selected species
    # ----------------------------------------------------------------------

    logger.info(f"Renaming and copying reference genomes of selected species")
    rename_and_copy_genomes(taxid_file_path, species_names_taxid_length, species_genome_coverages, species_rel_abundance, rel_abundance, genome_coverage, output)

    # Write nucleotide gene sequences
    # ----------------------------------------------------------------------

    logger.info(f"Writing nucleotide sequences of genes")
    write_nt_gene_seqs(genes, output)

    # Align genes to refs
    # ----------------------------------------------------------------------

    logger.info(f"Aligning genes to reference genomes")
    align_genes_to_refs(map_threshold, nthreads, output)

    # Get reference IDs
    # ----------------------------------------------------------------------

    logger.info(f"Obtaining IDs of reference genomes")
    ref_ids = get_ref_ids(output)

    # Get genes to mapped species
    # ----------------------------------------------------------------------

    logger.info(f"Obtaining genes mapped to reference genomes")
    gene_bins = get_genes_mapped_to_species(ref_ids, output)

    # Get aa gene seqs mapped
    # ----------------------------------------------------------------------

    logger.info(f"Obtaining amino acid sequences of genes mapped")
    gene_seq, genes_contigs, gene_species_mapped, unmapped_count = get_aa_gene_seqs(genes, gene_bins, taxid_present, taxid_to_species, contig_taxid, k_val)

    logger.info(f"Total number of genes: {len(gene_seq)}")
    logger.info(f"Number of genes mapped: {len(gene_seq)-unmapped_count}")
    logger.info(f"Percentage of genes mapped: {(len(gene_seq)-unmapped_count)/len(gene_seq)}")

    # Write amino acid gene sequences
    # ----------------------------------------------------------------------

    logger.info(f"Writing amino acide sequences of genes and gene annotations details")
    workbook_name = write_aa_gene_seqs(gene_species_mapped, gene_seq, output)

    logger.info(f"Gene annotation results can be found in {workbook_name}")

    # Get elapsed time
    # ----------------------------------------------------------------------

    # Determine elapsed time
    elapsed_time = time.time() - start_time

    # Print elapsed time for the process
    logger.info(f"Elapsed time: {elapsed_time} seconds")

    # Exit program
    # ----------------------------------------------------------------------

    logger.info(f"Thank you for using condiga!")


if __name__ == "__main__":
    main()
