#!/usr/bin/python3
# -*- coding: utf-8 -*-

# ------------------------------------------------------------------------------
# 
# Author: Gabriele Girelli
# Email: gigi.ga90@gmail.com
# Date: 2018-11-29
# Description: prepare database compatible with FISH-ProDe query script and
#              associated web-interface.
# 
# ------------------------------------------------------------------------------



# ==============================================================================

import argparse
import configparser
import ifpd as fp
from ifpd.bioext import UCSCbed
import logging
import numpy as np
import os
import pandas as pd
import shutil
import sys
from tqdm import tqdm

parser = argparse.ArgumentParser(description = '''
Builds a database of complementary oligodeoxyribonucleotides (ODNs) in a format
compatible with FISH-ProDe.

DISCLAIMER: this script does NOT generate a new database. It only re-formats an
existing database to a format compatible with FISH-ProDe.

The script uses takes as input a BED file (BED3) format and the reference
genome. The reference genome name must be compatible with UCSC standard.

When building a database for FISH-ProDe, it is possible to retain the ODNs
sequence in the database itself. While this generates larger files, it also
speeds up the process of probe design, since the script would not need to
retreive the ODN sequences from somewhere else.

To retain the ODN sequences in the database, use the --retain-sequences option.
When this option is used, the ODN sequences must be present in the input BED3
file 'name' column. Alternatively, if such column is not found, the script
queries UCSC for the sequences (this process requires an active internet
connection to contact the UCSC DAS server).

NOTE: only non-arbitrary nucleotides in standard IUPAC format are allowed in the
ODNs sequence. https://www.bioinformatics.org/sms/iupac.html

After the database is created, we advise running the fprode_dbcheck script to
test its integrity.

Details on the BED3 format are available on the UCSC website:
https://genome.ucsc.edu/FAQ/FAQformat.html#format1
''', formatter_class = argparse.RawDescriptionHelpFormatter)

parser.add_argument('bedFile', metavar = 'bedPath', type = str,
    help = '''Path to BED3 bed file. Optionally, a fourth column can be present
    with the sequence of the corresponding region''')
parser.add_argument('refGenome', metavar = 'refGen', type = str,
    help = 'Reference genome in NCBI-compatible format (e.g., hg19).')
parser.add_argument('dbName', metavar = 'dbName', type = str,
    help = 'Database name.')

parser.add_argument('-O', metavar = 'dirPath', type = str, default = '.',
    help = '''Path to directory where to build the database. Created if missing.
        Defaults to current directory.''')
parser.add_argument('--ucsc-das-uri', metavar = 'dasURI', type = str,
    default = 'http://genome.ucsc.edu/cgi-bin/das/dsn',
    help = '''(advanced option) URI to UCSC DAS server.
        Default: http://genome.ucsc.edu/cgi-bin/das/''')
parser.add_argument('--custom-config', metavar = 'config', type = str,
    nargs = '*', help = '''Space separated "key:value" pairs to store in the
    database .config file. No quotes needed.''')

parser.add_argument('--increment-chrom-end', action = 'store_const',
    dest = 'incrementChromEnd', const = True, default = False,
    help = '''Use this option if your input bed file is not in UCSC bed format,
        and the last position (chromEnd) is actually included. This forces
        a unit increase of that position to convert to UCSC bed format.''')
parser.add_argument('--enforce-bed3', action = 'store_const',
    dest = 'enforceBED3', const = True, default = False,
    help = 'Consider only first 3 columns of the bed file.')
parser.add_argument('--retain-sequences', action = 'store_const',
    dest = 'retainSequences', const = True, default = False,
    help = 'Use this option to generate a database with sequences')
parser.add_argument('--list-refGenomes', action = 'store_const',
    dest = 'listRefGenomes', const = True, default = False,
    help = ''''List UCSC reference genomes and exit
        (requires internet connection)''')
parser.add_argument('-f', action = 'store_const',
    dest = 'forceRun', const = True, default = False,
    help = '''Force overwriting of the query if already run.
        This is potentially dangerous.''')

version = "0.0.1"
parser.add_argument('--version', action = 'version',
    version = f'{sys.argv[0]} v{version}')

args = parser.parse_args()

refGenomeList = fp.web.list_UCSC_reference_genomes(
    UCSC_DAS_URI = args.ucsc_das_uri, verbose = args.listRefGenomes)
if args.listRefGenomes:
    sys.exit()

assert os.path.isfile(args.bedFile), f'bed file not found: "{args.bedFile}"'
assert not os.path.isfile(args.O), f'expected folder, file found: "{args.O}"'
assert_msg = f'genome "{args.refGenome}" not found @UCSC'
assert args.refGenome in refGenomeList, assert_msg

if args.forceRun:
    if os.path.isdir(args.O):
        shutil.rmtree(args.O)
        pass
else:
    assert_msg = f'output folder already exists: "{args.O}"'
    assert not os.path.isdir(args.O), assert_msg
os.mkdir(args.O)

formatString = '%(asctime)-25s %(name)s %(levelname)-8s %(message)s'
formatter = logging.Formatter(formatString)
logging.basicConfig(
    filename = os.path.join(args.O, '.log'),
    level = logging.DEBUG,
    format = formatString)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

if args.forceRun and os.path.isdir(args.O):
    logging.warning('Overwritten previously run query.')

logging.info("Buffering bed-like file...")
bed = UCSCbed(args.bedFile,
    incrementChromEnd = args.incrementChromEnd, bufferize = True)

oligoGenerator = bed.buffer(False, args.enforceBED3)
if args.retainSequences:
    logging.info("Enabled sequence retrieval from UCSC...")
    oligoGenerator = UCSCbed.add_sequence_to_raw_record(oligoGenerator)

logging.info("Writing database...")
chromList = set()
for line in oligoGenerator:
    line = line.strip().split("\t")
    chrom = line.pop(0)
    line = "\t".join(line) + '\n'
    chromList.add(chrom)
    with open(os.path.join(args.O, chrom), 'a+') as OH:
        OH.write(line)

logging.info("Sorting oligos...")
has_overlaps = False
oligoLengthRange = [np.inf, -np.inf]
oligoMinDist = np.inf
for chrom in chromList:
    chromDF = pd.read_csv(os.path.join(args.O, chrom), "\t", header = None)
    chromDF.columns = UCSCbed.FIELD_NAMES[1:(chromDF.shape[1]+1)]

    chromDF = chromDF.sort_values('chromStart')
    chromDF.to_csv(os.path.join(args.O, chrom), "\t",
        header = False, index = False)

    startPositions = np.array(chromDF['chromStart'].iloc[1:].tolist())
    endPositions = np.array(chromDF['chromEnd'].iloc[:-1].tolist()) - 1
    if any(startPositions <= endPositions):
        has_overlaps = True

    oligoMinDist = min(oligoMinDist, min(startPositions - endPositions - 1))

    oligoLengthList = np.unique(chromDF['chromEnd'] - chromDF['chromStart'])
    oligoLengthRange[0] = min(oligoLengthRange[0], oligoLengthList.min())
    oligoLengthRange[1] = max(oligoLengthRange[1], oligoLengthList.max())

logging.info("Generating config file...")
config = configparser.ConfigParser()
config['DATABASE'] = {
    'name' : args.dbName,
    'refGenome' : args.refGenome
}
config['OLIGOS'] = {
    'sequence' : args.retainSequences,
    'min_dist' : oligoMinDist,
    'min_length' : oligoLengthRange[0],
    'max_length' : oligoLengthRange[1],
    'overlaps' : has_overlaps
}
config['SOURCE'] = {
    'bed' : args.bedFile,
    'enforced2bed3' : args.enforceBED3,
    'outDirectory' : args.O
}
if type(None) != type(args.custom_config):
    config['CUSTOM'] = {}
    config['CUSTOM'].update([tuple(x.split(":")) for x in args.custom_config])

with open(os.path.join(args.O, '.config'), 'w+') as OH:
    config.write(OH)

logging.info(f'Created database "{args.dbName}".')

# END ==========================================================================

################################################################################
