#!/usr/bin/env python


import sys
from optparse import OptionParser
import errno


import pysam
import pysamstats
from pysamstats.config import stats_types, stats_types_withref, stepper_types
from pysamstats.io import write_csv, write_hdf5


if __name__ == '__main__':

    usage = 'usage: %prog [options] FILE'
    description = "Calculate statistics against genome positions based on " \
                  "sequence alignments from a SAM or BAM file and print them " \
                  "to stdout."
    epilog = """
Pileup-based statistics types (each row has statistics over reads in a pileup column):

    * coverage            - Number of reads aligned to each genome position
                            (total and properly paired).
    * coverage_strand     - As coverage but with forward/reverse strand counts.
    * coverage_ext        - Various additional coverage metrics, including
                            coverage for reads not properly paired (mate
                            unmapped, mate on other chromosome, ...).
    * coverage_ext_strand - As coverage_ext but with forward/reverse strand counts.
    * coverage_gc         - As coverage but also includes a column for %GC.
    * variation           - Numbers of matches, mismatches, deletions,
                            insertions, etc.
    * variation_strand    - As variation but with forward/reverse strand counts.
    * tlen                - Insert size statistics.
    * tlen_strand         - As tlen but with statistics by forward/reverse strand.
    * mapq                - Mapping quality statistics.
    * mapq_strand         - As mapq but with statistics by forward/reverse strand.
    * baseq               - Base quality statistics.
    * baseq_strand        - As baseq but with statistics by forward/reverse strand.
    * baseq_ext           - Extended base quality statistics, including qualities
                            of bases matching and mismatching reference.
    * baseq_ext_strand    - As baseq_ext but with statistics by forward/reverse strand.

Binned statistics types (each row has statistics over reads aligned starting within a genome window):

    * coverage_binned     - As coverage but binned.
    * coverage_ext_binned - As coverage_ext but binned.
    * mapq_binned         - Similar to mapq but binned.
    * alignment_binned    - Aggregated counts from cigar strings.
    * tlen_binned         - As tlen but binned.

Examples:

    pysamstats --type coverage example.bam > example.coverage.txt
    pysamstats --type coverage --chromosome Pf3D7_v3_01 --start 100000 --end 200000 example.bam > example.coverage.txt

Version: {version} (pysam {pysamversion})

""".format(version=pysamstats.__version__, pysamversion=pysam.__version__)

    OptionParser.format_epilog = lambda self, formatter: self.epilog
    parser = OptionParser(usage=usage, description=description, epilog=epilog)

    parser.add_option(
        '-t', '--type', dest='type', default='coverage',
        help='Type of statistics to print, one of: %s.' % ', '.join(stats_types))

    parser.add_option(
        '-c', '--chromosome', dest='chromosome', default=None,
        help='Chromosome name.')

    parser.add_option(
        '-s', '--start', dest='start', type='int', default=None,
        help='Start position (1-based).')

    parser.add_option(
        '-e', '--end', dest='end', type='int', default=None,
        help='End position (1-based).')

    parser.add_option(
        '-z', '--zero-based', dest='zero_based', action='store_true', default=False,
        help='Use zero-based coordinates (default is false, i.e., use one-based coords).')

    parser.add_option(
        '-u', '--truncate', dest='truncate', action='store_true', default=False,
        help='Truncate pileup-based stats so no records are emitted outside the specified range.')

    parser.add_option(
        '-S', '--stepper', dest='stepper', action='store', default='all',
        help='Stepper to provide to underlying pysam call. Options are:'
             '"all" (default): all reads are returned, except where flags BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, '
             'BAM_FDUP set; "nofilter" applies no filter to returned reads; '
             '"samtools": filter & read processing as in _csamtools_ pileup. This requires a fasta file. '
             'For complete details see the pysam documentation.')

    parser.add_option(
        '-d', '--pad', dest='pad', action='store_true', default=False,
        help='Pad pileup-based stats so a record is emitted for every position (default is only '
             'covered positions).')

    parser.add_option(
        '-D', '--max-depth', dest='max_depth', type=int, default=8000,
        help='Maximum read depth permitted in pileup-based statistics. The default limit is 8000.')

    parser.add_option(
        '-f', '--fasta', dest='fasta', default=None,
        help='Reference sequence file, only required for some statistics.')

    parser.add_option(
        '-o', '--omit-header', dest='omit_header', default=False, action='store_true',
        help='Omit header row from output.')

    parser.add_option(
        '-p', '--progress', dest='progress', type='int', metavar='N', default=None,
        help='Report progress every N rows.')

    parser.add_option(
        '--window-size', dest='window_size', type='int', metavar='N', default=300,
        help='Size of window for binned statistics (default is 300).')

    parser.add_option(
        '--window-offset', dest='window_offset', type=int, default=None, metavar='N',
        help='Window offset to use for deciding which genome position to report binned statistics '
             'against. The default is 150, i.e., the middle of 300bp window.')

    parser.add_option(
        '--format', dest='format', default='tsv',
        help='Output format, one of {tsv, csv, hdf5} (defaults to tsv). N.B., hdf5 requires '
             'PyTables to be installed.')

    parser.add_option(
        '--output', dest='output',
        help='Path to output file. If not provided, write to stdout.')

    parser.add_option(
        '--fields', dest='fields', default=None,
        help='Comma-separated list of fields to output (defaults to all fields).')

    parser.add_option(
        '--hdf5-group', dest='hdf5_group', default='/',
        help='Name of HDF5 group to write to (defaults to the root group).')

    parser.add_option(
        '--hdf5-dataset', dest='hdf5_dataset', default='data',
        help='Name of HDF5 dataset to create (defaults to "data").')

    parser.add_option(
        '--hdf5-complib', dest='hdf5_complib', default='zlib',
        help='HDF5 compression library (defaults to zlib).')

    parser.add_option(
        '--hdf5-complevel', dest='hdf5_complevel', type=int, default=1,
        help='HDF5 compression level (defaults to 5).')

    parser.add_option(
        '--hdf5-chunksize', dest='hdf5_chunksize', type=int, default=2**20,
        help='Size of chunks in number of bytes (defaults to 2**20).')

    parser.add_option(
        '--min-mapq', dest='min_mapq', type=int, default=0,
        help='Only reads with mapping quality equal to or greater than this value will be counted '
             '(0 by default).')

    parser.add_option(
        '--min-baseq', dest='min_baseq', type=int, default=0,
        help='Only reads with base quality equal to or greater than this value will be counted '
             '(0 by default). Only applies to pileup-based statistics.')

    parser.add_option(
        '--no-dup', dest='no_dup', default=False, action='store_true',
        help="Don't count reads flagged as duplicate.")

    parser.add_option(
        '--no-del', dest='no_del', default=False, action='store_true',
        help="Don't count reads aligned with a deletion at the given position. Only applies to "
             "pileup-based statistics.")

    options, args = parser.parse_args()

    if len(args) != 1:
        parser.error('missing SAM or BAM file operand\n\nTry "pysamstats --help" for more '
                     'information.')

    samfile = args[0]
    one_based = not options.zero_based
    write_header = not options.omit_header
    if options.fields:
        fields = options.fields.split(',')
    else:
        fields = None

    try:

        if options.type not in stats_types:
            parser.error('unsupported statistics type: "%s"\nTry one of %s or '
                         '"pysamstats --help" for more information.'
                         % (options.type, stats_types))

        elif options.stepper not in stepper_types:
            parser.error('unsupported stepper type: "%s"\nMust be one of %s or '
                         '"pysamstats --help" for more information.'
                         % (options.stepper, stepper_types))

        elif options.type in stats_types_withref \
                and options.fasta is None:
            parser.error('missing --fasta option\n\nTry "pysamstats --help"'
                         ' for more information.')

        else:

            # setup common parameters
            kwargs = dict(
                chrom=options.chromosome,
                start=options.start,
                end=options.end,
                one_based=one_based,
                window_size=options.window_size,
                window_offset=options.window_offset,
                min_mapq=options.min_mapq,
                no_dup=options.no_dup
            )
            # some options only make sense if not performing binned analysis
            if not options.type.endswith('_binned'):
                kwargs['truncate'] = options.truncate
                kwargs['pad'] = options.pad
                kwargs['max_depth'] = options.max_depth
                kwargs['min_baseq'] = options.min_baseq
                kwargs['no_del'] = options.no_del
                kwargs['stepper'] = options.stepper

            if options.format.lower() in ['tsv', 'csv']:

                # setup
                dialect = {'tsv': 'excel-tab', 'csv': 'excel'}[options.format]
                if options.output is None:
                    output = sys.stdout
                    needs_closing = False
                else:
                    output = open(options.output, 'w')
                    needs_closing = True

                try:
                    write_csv(
                        options.type,
                        output,
                        samfile,
                        fafile=options.fasta,
                        write_header=write_header,
                        dialect=dialect,
                        progress=options.progress,
                        fields=fields,
                        **kwargs
                    )
                finally:
                    if needs_closing:
                        output.close()

            elif options.format.lower() == 'hdf5':

                assert options.output is not None, '--output must be specified'

                write_hdf5(
                    options.type,
                    options.output,
                    samfile,
                    fafile=options.fasta,
                    progress=options.progress,
                    hdf5_group=options.hdf5_group,
                    hdf5_dataset=options.hdf5_dataset,
                    hdf5_complib=options.hdf5_complib,
                    hdf5_complevel=options.hdf5_complevel,
                    hdf5_chunksize=options.hdf5_chunksize,
                    fields=fields,
                    **kwargs
                )

    except IOError as e:
        if e.errno == errno.EPIPE:
            pass  # ignore broken pipe
        else:
            raise
