from os.path import join, splitext


#OPTIONS
#Command-line flags interpolated into the conda/segtools shell commands below;
#an empty string means the corresponding flag is not passed.
QUIET = ""
CLOBBER = "--clobber"
NOPLOT = ""
#Mandatory configuration entries (a missing key raises KeyError at load time)
OUTDIR = config['outdir']
SEGMENTATION = config['segmentation']

#Conda environment prefix that receives the GGD packages. It can be overridden
#with the optional 'prefix' config entry; the default keeps the original path.
PREFIX = config.get('prefix', "/users/masenjoi/anaconda2/envs/snakemake-test/")
SPECIES = "Homo_sapiens"
BUILD = "hg38"

#Sub-directory names (below BUILDPATH) used for the annotation and sequence files
recipeGTF = "rnaseq"
recipeSEQ = "sequence"

#File names derived from the genome build
fileGTF = BUILD + ".gtf"
fileSEQ = BUILD + ".fa"

BUILDPATH = join(PREFIX, "share", "ggd", SPECIES, BUILD)


#Directory names used below OUTDIR, and the file name shared by all result files
RESULTS = "results"
RESULTFILE="results"
DATA = "data"
LOGS = "logs"
PLOTS = "plots"

preprocessed_segmentation = join(OUTDIR, DATA,"segmentation","segmentation.pkl.gz")

#GMTK parameter file analysed by segtools-gmtk-parameters. It can be overridden
#with the optional 'gmtk_params' config entry; the default keeps the original path.
GMTK_PARAMS = config.get('gmtk_params', "/scratch/turnkey/data/2018-02-22/train/params/params.params")


#Main rule that sets the targets to obtain.
#It requests every result file of the analysis rules below and hands them to
#scripts/visualization.py, which is declared to produce the plot.png output.
rule all:
	input:
		#Per-biotype feature distance tables; {biotype} is resolved through dynamic()
		dynamic(join(OUTDIR, DATA, "feature_distance", "gene_biotype", "{biotype}", "feature_distance.tab" )),
		#os.path.join(OUTDIR, DATA, "feature_distance", "general", "feature_distance.tab"),
		#Named result files, one per analysis
		gmtk=join(OUTDIR, RESULTS, "gmtk_parameters", RESULTFILE),
		len_dist=join(OUTDIR, RESULTS, "length_distribution", RESULTFILE),
		nuc=join(OUTDIR, RESULTS, "nucleotide", RESULTFILE),
		agg=join(OUTDIR, RESULTS, "aggregation", "general", RESULTFILE),
		#Per-biotype aggregation results, also resolved through dynamic()
		aggs=dynamic(join(OUTDIR, RESULTS, "aggregation", "gene_biotype", "{biotype}", RESULTFILE))
	output:
		outfile=join(OUTDIR, PLOTS, "plot.png")
	script:
		"scripts/visualization.py"

#DOWNLOADING RULES
#GGD conda installations into the anaconda environment folder
	
#Install the GGD "<BUILD>-gtf" package into the conda environment at PREFIX.
#The rule has no inputs; its declared output is the GTF file below BUILDPATH.
rule download_ggd_annotation:
	output:
		join(BUILDPATH, recipeGTF, fileGTF)
	shell:
		"conda install -p {PREFIX} -y {QUIET} {CLOBBER} -c ggd-alpha {BUILD}-gtf"

#Install the GGD "<BUILD>-sequence" package into the conda environment at PREFIX.
#The rule has no inputs; its declared output is the FASTA file below BUILDPATH.
rule download_ggd_sequence:
	output:
		join(BUILDPATH, recipeSEQ, fileSEQ)
	shell:
		"conda install -p {PREFIX} -y {QUIET} {CLOBBER} -c ggd-alpha {BUILD}-sequence"

#PREPROCESSING RULES

#Preprocess the segway annotation once so that later steps can parse it faster
rule run_segtools_preprocess:
	input:
		SEGMENTATION
	output:
		preprocessed_segmentation
	params:
		#Output path without its ".pkl.gz" suffix
		#(presumably segtools-preprocess appends that suffix itself - confirm)
		outfile=preprocessed_segmentation[:-len(".pkl.gz")]
	shell:
		"segtools-preprocess {QUIET} {CLOBBER} {SEGMENTATION} {params.outfile}"

#Split the main GTF file into one file per gene biotype (the script uses pybedtools).
#Both output families are dynamic: the {biotype} values are not listed in this file.
_BIOTYPE_GTF_DIR = join(BUILDPATH, recipeGTF, "gene_biotype")

rule create_gene_biotype_gtf:
	input:
		gtf=join(BUILDPATH, recipeGTF, fileGTF)
	params:
		outdir=_BIOTYPE_GTF_DIR,
		outfile=fileGTF
	log:
		join(OUTDIR, LOGS, "create_gene_biotype_gtfs")
	output:
		dynamic(join(_BIOTYPE_GTF_DIR, "general", "{biotype}", fileGTF)),
		dynamic(join(_BIOTYPE_GTF_DIR, "gene", "{biotype}", fileGTF))
	script:
		"scripts/create_gene_biotype_gtfs.py"


#ANALYZING RULES

#Run segtools-gmtk-parameters on GMTK_PARAMS and copy its stats CSV to the results tree
#TODO make input for path
#TODO PLOT is necessary to obtain .csv and no variance is available
_GMTK_DATA_DIR = join(OUTDIR, DATA, "gmtk_parameters")

rule run_segtools_gmtk_parameters:
	input:
		GMTK_PARAMS
	params:
		outdir=_GMTK_DATA_DIR
	output:
		csv=join(_GMTK_DATA_DIR, "gmtk_parameters.stats.csv"),
		result=join(OUTDIR, RESULTS, "gmtk_parameters", RESULTFILE)
	shell:
		"segtools-gmtk-parameters {QUIET} {CLOBBER} --outdir {params.outdir} {GMTK_PARAMS};"
		"cp {output.csv} {output.result}"


#Segtools aggregation (gene mode) of the preprocessed segmentation over the complete GTF
_AGG_GENERAL_DIR = join(OUTDIR, DATA, "aggregation", "general")

rule run_segtools_aggregation_general:
	input:
		preprocessed_segmentation,
		gtf=join(BUILDPATH, recipeGTF, fileGTF)
	params:
		outdir=_AGG_GENERAL_DIR
	output:
		join(_AGG_GENERAL_DIR, "feature_aggregation.tab")
	shell:
		"segtools-aggregation --mode=gene {QUIET} {CLOBBER} {NOPLOT} --outdir {params.outdir} {preprocessed_segmentation} {input.gtf}"

#Segtools aggregation (gene mode) for a single gene biotype; the {biotype}
#wildcard selects both the per-biotype GTF and the output directory.
_AGG_BIOTYPE_DIR = join(OUTDIR, DATA, "aggregation", "gene_biotype", "{biotype}")

rule run_segtools_aggregation_gene_biotype:
	input:
		preprocessed_segmentation,
		gtf=join(BUILDPATH, recipeGTF, "gene_biotype", "general", "{biotype}", fileGTF)
	params:
		outdir=_AGG_BIOTYPE_DIR
	output:
		join(_AGG_BIOTYPE_DIR, "feature_aggregation.tab")
	shell:
		"segtools-aggregation --mode=gene {QUIET} {CLOBBER} {NOPLOT} --outdir {params.outdir} {preprocessed_segmentation} {input.gtf}"

#Turn the general aggregation table into the result file requested by rule all.
#The output is named with RESULTFILE, the same constant rule all uses, so the
#producer and the consumer of this file cannot drift apart.
rule create_aggregation_results_general:
	input:
		infile=join(OUTDIR, DATA, "aggregation", "general", "feature_aggregation.tab")
	output:
		outfile=join(OUTDIR, RESULTS, "aggregation", "general", RESULTFILE)
	script:
		"scripts/create_aggregation_results.py"

#Turn the aggregation table of one gene biotype into its result file.
#The output is named with RESULTFILE, the same constant rule all uses, so the
#producer and the consumer of this file cannot drift apart.
rule create_aggregation_results_gene_biotype:
	input:
		infile=join(OUTDIR, DATA, "aggregation", "gene_biotype", "{biotype}", "feature_aggregation.tab")
	output:
		outfile=join(OUTDIR, RESULTS, "aggregation", "gene_biotype", "{biotype}", RESULTFILE)
	script:
		"scripts/create_aggregation_results.py"


#Segtools Length Distribution execution and results creation.
#The segment size table is copied unchanged to the results tree; the result
#file is named with RESULTFILE, the same constant rule all uses.
#TODO Don't just copy, preprocess for easier obtention of results
rule run_segtools_length_distribution:
	input:
		preprocessed_segmentation
	output:
		tab=join(OUTDIR, DATA, "length_distribution", "segment_sizes.tab"),
		result=join(OUTDIR, RESULTS, "length_distribution", RESULTFILE)
	params:
		outdir=join(OUTDIR, DATA, "length_distribution")
	shell:
		"segtools-length-distribution {QUIET} {CLOBBER} {NOPLOT} --outdir {params.outdir} {preprocessed_segmentation};"
		"cp {output.tab} {output.result}"


#Segtools feature distance of the preprocessed segmentation against the complete GTF.
#The command's standard output is redirected into feature_distance_segments.tab;
#feature_distance.tab is presumably written by the tool into --outdir (confirm).
_FDIST_GENERAL_DIR = join(OUTDIR, DATA, "feature_distance", "general")

rule run_segtools_feature_distance_general:
	input:
		preprocessed_segmentation,
		gtf=join(BUILDPATH, recipeGTF, fileGTF)
	params:
		outdir=_FDIST_GENERAL_DIR
	output:
		join(_FDIST_GENERAL_DIR, "feature_distance.tab"),
		outfile=join(_FDIST_GENERAL_DIR, "feature_distance_segments.tab")
	shell:
		"segtools-feature-distance {QUIET} {CLOBBER} {NOPLOT} --outdir {params.outdir} {preprocessed_segmentation} {input.gtf} > {output.outfile}"

#Segtools feature distance for a single gene biotype; the {biotype} wildcard
#selects the per-biotype "gene" GTF and the output directory.
#The command's standard output is redirected into feature_distance_segments.tab.
_FDIST_BIOTYPE_DIR = join(OUTDIR, DATA, "feature_distance", "gene_biotype", "{biotype}")

rule run_segtools_feature_distance_gene_biotype:
	input:
		preprocessed_segmentation,
		gtf=join(BUILDPATH, recipeGTF, "gene_biotype", "gene", "{biotype}", fileGTF)
	params:
		outdir=_FDIST_BIOTYPE_DIR
	output:
		join(_FDIST_BIOTYPE_DIR, "feature_distance.tab"),
		outfile=join(_FDIST_BIOTYPE_DIR, "feature_distance_segments.tab")
	shell:
		"segtools-feature-distance {QUIET} {CLOBBER} {NOPLOT} --outdir {params.outdir} {preprocessed_segmentation} {input.gtf} > {output.outfile}"


#Run "bedtools nuc" on the raw segmentation against the genome FASTA and
#store the command's standard output as bedtools_output
_GENOME_FASTA = join(BUILDPATH, recipeSEQ, fileSEQ)

rule run_bedtools_nuc:
	input:
		SEGMENTATION,
		fasta=_GENOME_FASTA
	output:
		join(OUTDIR, DATA, "nucleotide", "bedtools_output")
	shell:
		"bedtools nuc -fi {input.fasta} -bed {SEGMENTATION} > {output}"

#Turn the bedtools nuc output into the nucleotide result file requested by rule all.
#The output is named with RESULTFILE, the same constant rule all uses, so the
#producer and the consumer of this file cannot drift apart.
rule create_nucleotide_results:
	input:
		infile=join(OUTDIR, DATA, "nucleotide", "bedtools_output")
	output:
		outfile=join(OUTDIR, RESULTS, "nucleotide", RESULTFILE)
	script:
		"scripts/create_nucleotide_results.py"


# Visualization

#Render the rule graph of target "all" with Graphviz dot, once as SVG and once as PNG.
#The shell command writes exactly the two declared outputs; previously it wrote
#only {PLOTS}/dag.svg, so neither declared file was ever produced.
#NOTE(review): the nested snakemake call receives no --config/--configfile, while
#this file reads config['outdir'] and config['segmentation'] - confirm it can run as is.
rule create_diagram:
	output:
		svg="dag.svg",
		png="dag.png"
	shell:
		"snakemake --rulegraph all | dot -Tsvg > {output.svg};"
		"snakemake --rulegraph all | dot -Tpng > {output.png}"
