# kate: syntax Python;
"""
"""
import shutil
import textwrap
import json
from sqt.dna import reverse_complement
from sqt import FastaReader, SequenceReader
import igdiscover
from igdiscover.utils import relative_symlink, Config

try:
	config = Config.from_default_path()
except FileNotFoundError as e:
	sys.exit("Pipeline configuration file {!r} not found. Please create it!".format(e.filename))

print('IgDiscover version {}. Configuration:'.format(igdiscover.__version__))
for k, v in sorted(vars(config).items()):
	# TODO the following line is only necessary for non-YAML configurations
	if k.startswith('_'):
		continue
	print('   ', k, ': ', repr(v), sep='')

# This command is run before every shell command and helps to catch errors early
shell.prefix("set -euo pipefail;")

READS1, READS2 = 'reads/1-limited.1.fastq.gz', 'reads/1-limited.2.fastq.gz'
PREPROCESSED_READS = 'reads/sequences.fasta.gz'

# Targets for each iteration
ITERATION_TARGETS = [
	'clusterplots/done',
	#'correlationVJ.pdf',
	'errorhistograms.pdf',
	'V_usage.tab',
	'V_usage.pdf',
	'V_dendrogram.pdf',
]
# Targets for non-final iterations
DISCOVERY_TARGETS = [
	'candidates.tab',
	'new_V_germline.fasta',
	'new_V_pregermline.fasta',
]
# Targets for final iteration
FINAL_TARGETS = [
	#'consensus/correlationVJ.pdf',
	#'consensus/V_usage.tab',
	#'consensus/V_usage.pdf',
]
TARGETS = expand('iteration-{nr:02d}/{path}', nr=range(1, config.iterations+1), path=ITERATION_TARGETS+DISCOVERY_TARGETS)
TARGETS += expand('final/{path}', path=ITERATION_TARGETS+FINAL_TARGETS)
TARGETS += [
	'stats/readlengths.pdf',
	'stats/merging-successful',
	'stats/trimming-successful',
	'stats/stats.json'
]

# Use pigz (parallel gzip) if available
GZIP = 'pigz' if shutil.which('pigz') is not None else 'gzip'

rule all:
	input:
		TARGETS


if config.limit:
	rule limit_reads_gz:
		output: 'reads/1-limited.{nr,([12]\\.|)}{ext,(fasta|fastq)}.gz'
		input: 'reads.{nr}{ext}.gz'
		shell:
			'sqt fastxmod -w 0 --limit {config.limit} {input} | {GZIP} > {output}'

	rule limit_reads:
		output: 'reads/1-limited.{nr,([12]\\.|)}{ext,(fasta|fastq)}.gz'
		input: 'reads.{nr}{ext}'
		shell:
			'sqt fastxmod -w 0 --limit {config.limit} {input} | {GZIP} > {output}'

else:
	rule symlink_limited:
		output: fastaq='reads/1-limited.{nr,([12]\\.|)}{ext,(fasta|fastq)}.gz'
		input: fastaq='reads.{nr}{ext}.gz'
		resources: time=1
		run:
			relative_symlink(input.fastaq, output.fastaq, force=True)

	# TODO compressing the input file is an unnecessary step
	rule gzip_limited:
		output: fastaq='reads/1-limited.{nr,([12]\\.|)}{ext,(fasta|fastq)}.gz'
		input: fastaq='reads.{nr}{ext}'
		shell:
			'{GZIP} < {input} > {output}'


# After the rules above, we either end up with
#
# 'reads/1-limited.1.fastq.gz' and 'reads/1-limited.2.fastq.gz'
# or
# 'reads/1-limited.fasta.gz'
# or
# 'reads/1-limited.fastq.gz'


if config.merge_program == 'flash':
	rule flash_merge:
		"""Use FLASH to merge paired-end reads"""
		output: fastqgz='reads/2-merged.fastq.gz', log='reads/2-flash.log'
		input: 'reads/1-limited.1.fastq.gz', 'reads/1-limited.2.fastq.gz'
		resources: time=60
		threads: 8
		shell:
			# -M: maximal overlap (2x300, 420-450bp expected fragment size)
			"time flash -t {threads} -c -M {config.flash_maximum_overlap} {input} 2> "
			">(tee {output.log} >&2) | {GZIP} > {output.fastqgz}"

	rule parse_flash_stats:
		input: log='reads/2-flash.log'
		output:
			json='stats/reads.json'
		run:
			total_ex = re.compile(r'\[FLASH\]\s*Total reads:\s*([0-9]+)')
			merged_ex = re.compile(r'\[FLASH\]\s*Combined reads:\s*([0-9]+)')
			with open(input.log) as f:
				for line in f:
					match = total_ex.search(line)
					if match:
						total = int(match.group(1))
						continue
					match = merged_ex.search(line)
					if match:
						merged = int(match.group(1))
						break
				else:
					sys.exit('Could not parse the FLASH log file')
			d = OrderedDict({'total': total})
			d['merged'] = merged
			d['merging_was_done'] = True
			with open(output.json, 'w') as f:
				json.dump(d, f)


elif config.merge_program == 'pear':
	rule pear_merge:
		"""Use pear to merge paired-end reads"""
		output:
			unmerged1='reads/2-pear.unassembled.forward.fastq.gz',
			unmerged2='reads/2-pear.unassembled.reverse.fastq.gz',
			discarded='reads/2-pear.discarded.fastq.gz',
			fastq='reads/2-merged.fastq.gz',
			log='reads/2-pear.log'
		input: 'reads/1-limited.1.fastq.gz', 'reads/1-limited.2.fastq.gz'
		resources: time=60
		threads: 16
		shell:
			r"""
			time pear -j {threads} -f {input[0]} -r {input[1]} -o reads/2-pear | tee {output.log}
			for n in unassembled.forward unassembled.reverse assembled discarded; do
				{GZIP} -f reads/2-pear.$n.fastq
			done
			mv reads/2-pear.assembled.fastq.gz {output.fastq}
			"""

	rule parse_pear_stats:
		input: log='reads/2-pear.log'
		output:
			json='stats/reads.json'
		run:
			expression = re.compile(r"Assembled reads \.*: (?P<merged>[0-9,]*) / (?P<total>[0-9,]*)")
			with open(input.log) as f:
				for line in f:
					match = expression.search(line)
					if match:
						merged = int(match.group('merged').replace(',', ''))
						total = int(match.group('total').replace(',', ''))
						break
				else:
					sys.exit('Could not parse the PEAR log file')
			d = OrderedDict({'total': total})
			d['merged'] = merged
			d['merging_was_done'] = True
			with open(output.json, 'w') as f:
				json.dump(d, f)
else:
	sys.exit("merge_program {config.merge_program!r} given in configuration file not recognized".format(config=config))


# This rule applies only when the input is single-end or already merged
rule symlink_merged:
	output:
		fastaq='reads/2-merged.{ext,(fasta|fastq)}.gz'
	input: fastaq='reads/1-limited.{ext}.gz'
	run:
		relative_symlink(input.fastaq, output.fastaq, force=True)


# After the rules above, we end up with
#
# 'reads/2-merged.fasta.gz'
# or
# 'reads/2-merged.fastq.gz'


rule read_stats_single_fasta:
	"""Compute statistics if no merging was done (FASTA input)"""
	output: json='stats/reads.json',
	input: fastagz='reads/1-limited.fasta.gz'
	run:
		total = count_sequences(input.fastagz)
		d = OrderedDict({'total': total})
		d['merged'] = total
		d['merging_was_done'] = False
		with open(output.json, 'w') as f:
			json.dump(d, f)


rule read_stats_single_fastq:
	"""Compute statistics if no merging was done (FASTQ input)"""
	output: json='stats/reads.json',
	input: fastagz='reads/1-limited.fastq.gz'
	run:
		total = count_sequences(input.fastagz)
		d = OrderedDict({'total': total})
		d['merged'] = total
		d['merging_was_done'] = False
		with open(output.json, 'w') as f:
			json.dump(d, f)


rule check_merging:
	"""Ensure the merging succeeded"""
	output: success='stats/merging-successful'
	input:
		json='stats/reads.json'
	run:
		with open(input.json) as f:
			d = json.load(f)
		total = d['total']
		merged = d['merged']
		if total == 0 or merged / total >= 0.3:
			with open(output.success, 'w') as f:
				print('This marker file exists if at least 30% of the input '
					'reads could be merged.', file=f)
		else:
			sys.exit('Less than 30% of the input reads could be merged. Please '
				'check whether there is an issue with your input data. To skip '
				'this check, create the file "stats/merging-successful" and '
				're-start "igdiscover run".')


rule merged_read_length_histogram:
	output:
		txt="stats/merged.readlengths.txt",
		pdf="stats/merged.readlengths.pdf"
	input:
		fastq='reads/2-merged.fastq.gz'
	shell:
		"""sqt readlenhisto --bins 100 --left {config.minimum_merged_read_length} --title "Lengths of merged reads" --plot {output.pdf} {input}  > {output.txt}"""


rule read_length_histogram:
	output:
		txt="stats/readlengths.txt",
		pdf="stats/readlengths.pdf"
	input:
		fastq=PREPROCESSED_READS
	shell:
		"""sqt readlenhisto --bins 100 --left {config.minimum_merged_read_length} --title "Lengths of pre-processed reads" --plot {output.pdf} {input}  > {output.txt}"""


rule reads_stats_fasta:
	"""
	TODO implement this
	"""
	output: txt="stats/reads.txt"
	input:
		merged='reads/1-limited.fasta.gz'
	shell: "touch {output}"


# Remove primer sequences

if config.forward_primers:
	# At least one forward primer is to be removed
	rule trim_forward_primers:
		output: fastaq=temp('reads/3-forward-primer-trimmed.{ext,(fasta|fastq)}.gz')
		input: fastaq='reads/2-merged.{ext}.gz', mergesuccess='stats/merging-successful'
		resources: time=120
		log: 'reads/3-forward-primer-trimmed.cutadapt.log'
		run:
			primers = config.forward_primers
			param = ' '.join('-g ^{}'.format(seq) for seq in primers)
			if not config.stranded:
				param += ' ' + ' '.join('-a {}$'.format(reverse_complement(seq)) for seq in config.forward_primers)
			shell("cutadapt --discard-untrimmed {param} -o {output.fastaq} {input.fastaq} | tee {log}")
else:
	# No trimming, just symlink the file
	rule dont_trim_forward_primers:
		output: fastaq='reads/3-forward-primer-trimmed.{ext,(fasta|fastq)}.gz'
		input: fastaq='reads/2-merged.{ext}.gz', mergesuccess='stats/merging-successful'
		resources: time=1
		run:
			relative_symlink(input.fastaq, output.fastaq, force=True)


if config.reverse_primers:
	# At least one reverse primer is to be removed
	rule trim_reverse_primers:
		output: fastaq='reads/4-trimmed.{ext,(fasta|fastq)}.gz'
		input: fastaq='reads/3-forward-primer-trimmed.{ext,(fasta|fastq)}.gz'
		resources: time=120
		log: 'reads/4-trimmed.cutadapt.log'
		run:
			primers = config.reverse_primers
			# Reverse primers should appear reverse-complemented at the 3' end
			# of the merged read.
			param = ' '.join('-a {}$'.format(reverse_complement(seq)) for seq in primers)
			if not config.stranded:
				param += ' ' + ' '.join('-g ^{}'.format(seq) for seq in config.reverse_primers)
			shell("cutadapt --discard-untrimmed {param} -o {output.fastaq} {input.fastaq} | tee {log}")

else:
	# No trimming, just symlink the file
	rule dont_trim_reverse_primers:
		output: fastaq='reads/4-trimmed.{ext,(fasta|fastq)}.gz'
		input: fastaq='reads/3-forward-primer-trimmed.{ext,(fasta|fastq)}.gz'
		resources: time=1
		run:
			relative_symlink(input.fastaq, output.fastaq, force=True)


rule trimmed_fasta_stats:
	output: json='stats/trimmed.json',
	input: fastagz='reads/4-trimmed.fasta.gz'
	run:
		with open(output.json, 'w') as f:
			json.dump({'trimmed': count_sequences(input.fastagz)}, f)


rule trimmed_fastq_stats:
	output: json='stats/trimmed.json',
	input: fastqgz='reads/4-trimmed.fastq.gz'
	run:
		with open(output.json, 'w') as f:
			json.dump({'trimmed': count_sequences(input.fastqgz)}, f)


rule check_trimming:
	"""Ensure that some reads are left after trimming"""
	output: success='stats/trimming-successful'
	input:
		reads_json='stats/reads.json',
		trimmed_json='stats/trimmed.json'
	run:
		with open(input.reads_json) as f:
			total = json.load(f)['total']
		with open(input.trimmed_json) as f:
			trimmed = json.load(f)['trimmed']
		if total == 0 or trimmed / total >= 0.1:
			with open(output.success, 'w') as f:
				print('This marker file exists if at least 10% of input '
					'reads contain the required primer sequences.', file=f)
		else:
			print(*textwrap.wrap(
				'Less than 10% of the input reads contain the required primer '
				'sequences. Please check whether you have specified the '
				'correct primer sequences in the configuration file. To skip '
				'this check, create the file "stats/trimming-successful" and '
				're-start "igdiscover run".'), sep='\n')
			sys.exit(1)


rule fastqc:
	output:
		zip='fastqc/{file}.zip',
		png='fastqc/{file}/Images/per_base_quality.png',
		html='fastqc/{file}_fastqc.html'
	input: fastq='{file}.fastq.gz'
	shell:
		r"""
		rm -rf fastqc/{wildcards.file}/ fastqc/{wildcards.file}_fastqc/ && \
		fastqc -o fastqc {input} && \
		mv fastqc/{wildcards.file}_fastqc.zip {output.zip} && \
		unzip -o -d fastqc/ {output.zip} && \
		mv fastqc/{wildcards.file}_fastqc/ fastqc/{wildcards.file}
		"""


rule filter_fasta:
	"""
	* Remove low-quality sequences
	* Discard too short sequences
	"""
	output: fasta=temp("reads/5-filtered.fasta")
	input: fasta="reads/4-trimmed.fasta.gz", success='stats/trimming-successful'
	params: max_errors=" --max-errors {config.maximum_expected_errors}" if config.maximum_expected_errors is not None else ""
	shell:
		"sqt fastxmod{params.max_errors} --minimum-length {config.minimum_merged_read_length} {input.fasta} > {output.fasta}"


rule fastq_to_fasta:
	"""
	Same as filter_fasta, but also convert from FASTQ to FASTA
	"""
	output: fasta=temp("reads/5-filtered.fasta")
	input: fastq="reads/4-trimmed.fastq.gz", success='stats/trimming-successful'
	params: max_errors=" --max-errors {config.maximum_expected_errors}" if config.maximum_expected_errors is not None else ""
	shell:
		"sqt fastxmod{params.max_errors} --minimum-length {config.minimum_merged_read_length} --fasta {input.fastq} > {output.fasta}"


if config.barcode_length != 0:
	rule igdiscover_group:
		"""Group by barcode and CDR3 (also implicitly removes duplicates)"""
		output:
			fastagz=PREPROCESSED_READS,
			pdf="stats/groupsizes.pdf"
		input:
			fasta="reads/5-filtered.fasta"
		log: PREPROCESSED_READS + ".log"
		shell:
			"igdiscover group --barcode-length {config.barcode_length} --trim-g --plot-sizes {output.pdf} {input.fasta} 2> {log} | {GZIP} > {output.fastagz}"

else:
	rule dereplicate:
		"""Collapse identical sequences with VSEARCH"""
		output: fastagz=PREPROCESSED_READS
		input: fasta="reads/5-filtered.fasta"
		shell:
			'time vsearch --derep_fulllength {input.fasta} --strand both --sizeout --output >({GZIP} > {output.fastagz})'



rule copy_dj_database:
	"""Copy D and J gene database into the iteration folder"""
	output:
		fasta="{base}/database/{species}_{gene,[DJ]}.fasta"
	input:
		fasta="database/{species}_{gene}.fasta"
	shell:
		"cp -p {input} {output}"


rule v_database_iteration_1:
	"""Copy original V gene database into the iteration 1 folder"""
	output:
		fasta="iteration-01/database/{species}_V.fasta"
	input:
		fasta="database/{species}_V.fasta"
	shell:
		"cp -p {input} {output}"


for i in range(2, config.iterations + 1):
	rule:
		output:
			fasta='iteration-{nr:02d}/database/{{species}}_V.fasta'.format(nr=i)
		input:
			fasta='iteration-{nr:02d}/new_V_pregermline.fasta'.format(nr=i-1)
		shell:
			"cp -p {input.fasta} {output.fasta}"

if config.iterations == 0:
	# Copy over the input database (would be nice to avoid this)
	rule copy_database:
		output:
			fasta='final/database/{species}_V.fasta'
		input:
			fasta='database/{species}_V.fasta'
		shell:
			"cp -p {input.fasta} {output.fasta}"
else:
	rule copy_final_v_database:
		output:
			fasta='final/database/{species}_V.fasta'.format(species=config.species)
		input:
			fasta='iteration-{nr:02d}/new_V_germline.fasta'.format(nr=config.iterations)
		shell:
			"cp -p {input.fasta} {output.fasta}"


rule makeblastdb:
	output: "{dir}/{species}_{gene,[VDJ]}.nhr"  # and nin nog nsd nsi nsq
	input: fasta="{dir}/{species}_{gene}.fasta"
	params: dbname="{dir}/{species}_{gene}"
	log: '{dir}/{species}_{gene}.log'
	threads: 100  # force to run as single job
	run:
		with FastaReader(input.fasta) as fr:
			sequences = list(fr)
		if not sequences:
			raise ValueError("The FASTA file {} is empty, cannot continue!".format(input.fasta))
		shell("makeblastdb -parse_seqids -dbtype nucl -in {input.fasta} -out {params.dbname} >& {log}")
		with open(log[0]) as f:
		    if 'Error: ' in f.read():
		        sys.exit("makeblastdb failed when creating {}".format(params.dbname))


rule igdiscover_igblast:
	output:
		txtgz="{dir}/igblast.txt.gz"
	input:
		fastagz=PREPROCESSED_READS,
		db_v="{{dir}}/database/{species}_V.nhr".format(species=config.species),
		db_d="{{dir}}/database/{species}_D.nhr".format(species=config.species),
		db_j="{{dir}}/database/{species}_J.nhr".format(species=config.species)
	params:
		penalty='--penalty {}'.format(config.mismatch_penalty) if config.mismatch_penalty is not None else '',
		database='{dir}/database'
	log: '{dir}/igblast.log'
	threads: 16
	shell:
		"time igdiscover igblast --threads {threads} {params.penalty} "
		"--species {config.species} {params.database} {input.fastagz} | "
		"{GZIP} 2>&1 > {output.txtgz} | tee {log} >&2"


rule igdiscover_parse:
	output:
		tabgz="{dir}/assigned.tab.gz",
	input:
		txt="{dir}/igblast.txt.gz",
		fasta=PREPROCESSED_READS
	params:
		extra="--rename {config.library_name!r}_".format(config=config) if config.rename else ""
	shell:
		"igdiscover parse {params.extra} {input.txt} {input.fasta} | {GZIP} > {output.tabgz}"


rule igdiscover_filter:
	output:
		filtered="{dir}/filtered.tab.gz"
	input:
		assigned="{dir}/assigned.tab.gz"
	shell:
		"igdiscover filter {input} | {GZIP} > {output}"


rule igdiscover_count:
	output:
		plot="{dir}/{gene,[VDJ]}_usage.pdf",
		counts="{dir}/{gene}_usage.tab"
	input:
		reference="{{dir}}/database/{species}_{{gene}}.fasta".format(species=config.species),
		tab="{dir}/filtered.tab.gz"
	shell:
		"igdiscover count --database {input.reference} --gene {wildcards.gene} "
		"{input.tab} {output.plot} > {output.counts}"


rule igdiscover_clusterplot:
	output:
		done="{dir}/clusterplots/done"
	input:
		tab="{dir}/filtered.tab.gz"
	params:
		clusterplots="{dir}/clusterplots/",
		ignore_j=' --ignore-J' if config.ignore_j else ''
	shell:
		"igdiscover clusterplot{params.ignore_j} {input.tab} {params.clusterplots} && touch {output.done}"


rule igdiscover_discover:
	"""Discover potential new V gene sequences"""
	output:
		tab="{dir}/candidates.tab"
	input:
		v_reference="{{dir}}/database/{species}_V.fasta".format(species=config.species),
		tab="{dir}/filtered.tab.gz"
	params:
		ignore_j=' --ignore-J' if config.ignore_j else '',
		seed=' --seed={}'.format(config.seed) if config.seed is not None else '',
		exact_copies=' --exact-copies={}'.format(config.exact_copies) if config.exact_copies is not None else ''
	threads: 6
	shell:
		"time igdiscover discover -j {threads}{params.seed}{params.ignore_j}{params.exact_copies} "
		"--subsample={config.subsample} --database={input.v_reference} "
		"{input.tab} > {output.tab}"


def db_whitelist_or_not(wildcards):
	filterconf = config.pre_germline_filter if wildcards.pre == 'pre' else config.germline_filter
	if filterconf['whitelist']:
	    # Use original (non-iteration-specific) database as whitelist
		return 'database/{species}_V.fasta'.format(
			nr=wildcards.nr, species=config.species)
	else:
		return []


rule igdiscover_compose:
	"""Construct a new database out of the discovered sequences"""
	output:
		tab='iteration-{nr}/new_V_{pre,(pre|)}germline.tab',
		fasta='iteration-{nr}/new_V_{pre,(pre|)}germline.fasta'
	input:
		tab='iteration-{nr}/candidates.tab',
		db_whitelist=db_whitelist_or_not,
		whitelist='whitelist.fasta' if os.path.exists('whitelist.fasta') else []
	run:
		nr = int(wildcards.nr, base=10)
		conf = config.pre_germline_filter if wildcards.pre == 'pre' else config.germline_filter
		criteria = []
		for path in [input.db_whitelist, input.whitelist]:
			if path:
				criteria += ['--whitelist=' + path]
		if conf['check_motifs']:
			criteria += ['--looks-like-V']
		if conf['allow_stop']:
			criteria += ['--allow-stop']
		criteria += ['--max-differences={}'.format(conf['differences'])]
		criteria += ['--unique-CDR3={}'.format(conf['unique_cdr3s'])]
		criteria += ['--cluster-size={}'.format(conf['cluster_size'])]
		criteria += ['--unique-J={}'.format(conf['unique_js'])]
		criteria = ' '.join(criteria)
		shell("igdiscover compose {criteria} --fasta={output.fasta} {input.tab} > {output.tab}")


rule stats_correlation_V_J:
	output:
		pdf="{dir}/correlationVJ.pdf"
	input:
		table="{dir}/assigned.tab.gz"
	run:
		import matplotlib
		matplotlib.use('pdf')
		# sns.heatmap will not work properly with the object-oriented interface,
		# so use pyplot
		import matplotlib.pyplot as plt
		import seaborn as sns
		import numpy as np
		import pandas as pd
		from collections import Counter
		table = igdiscover.read_table(input.table)
		fig = plt.figure(figsize=(29.7/2.54, 21/2.54))
		counts = np.zeros((21, 11), dtype=np.int64)
		counter = Counter(zip(table['V_errors'], table['J_errors']))
		for (v,j), count in counter.items():
			if v is not None and v < counts.shape[0] and j is not None and j < counts.shape[1]:
				counts[v,j] = count
		df = pd.DataFrame(counts.T)[::-1]
		df.index.name = 'J errors'
		df.columns.name = 'V errors'
		sns.heatmap(df, annot=True, fmt=',d', cbar=False)
		fig.suptitle('V errors vs. J errors in unfiltered sequences')
		fig.set_tight_layout(True)
		fig.savefig(output.pdf)


rule plot_errorhistograms:
	output:
		pdf='{dir}/errorhistograms.pdf',
	input:
		table='{dir}/filtered.tab.gz'
	params:
		ignore_j='--ignore-J' if config.ignore_j else ''
	shell:
		'igdiscover errorplot {params.ignore_j} {input.table} {output.pdf}'


rule dendrogram:
	output:
		pdf='{dir}/{gene}_dendrogram.pdf'
	input:
		fasta='{{dir}}/database/{species}_{{gene}}.fasta'.format(species=config.species)
	shell:
		'igdiscover dendrogram --mark database/{config.species}_{wildcards.gene}.fasta {input.fasta} {output.pdf}'


rule version:
	output: txt='stats/version.txt'
	run:
		with open(output.txt, 'w') as f:
			print('IgDiscover version', igdiscover.__version__, file=f)


def get_sequences(path):
	with SequenceReader(path) as fr:
		sequences = [ record.sequence.upper() for record in fr ]
	return sequences


def count_sequences(path):
	with SequenceReader(path) as fr:
		n = 0
		for _ in fr:
			n += 1
	return n


rule json_stats:
	output: json='stats/stats.json'
	input:
		original_db='database/{species}_V.fasta'.format(species=config.species),
		v_pre_germline=['iteration-{:02d}/new_V_pregermline.fasta'.format(i+1) for i in range(config.iterations)],
		v_germline=['iteration-{:02d}/new_V_germline.fasta'.format(i+1) for i in range(config.iterations)],
		reads='stats/reads.json',
		trimmed='stats/trimmed.json'
	run:
		d = OrderedDict()
		d['version'] = igdiscover.__version__

		with open(input.reads) as f:
			d['reads'] = OrderedDict(json.load(f))
		with open(input.trimmed) as f:
			d['reads']['trimmed'] = json.load(f)['trimmed']

		iterations = []
		prev_sequences = set(get_sequences(input.original_db))
		size = len(prev_sequences)
		info = OrderedDict(size=size)
		iterations.append(info)
		for i, (pre_germline_path, germline_path) in enumerate(
				zip(input.v_pre_germline, input.v_germline)):

			pre_germline_sequences = set(get_sequences(pre_germline_path))
			germline_sequences = set(get_sequences(germline_path))

			gained = len(germline_sequences - prev_sequences)
			lost = len(prev_sequences - germline_sequences)
			gained_pre = len(pre_germline_sequences - prev_sequences)
			lost_pre = len(prev_sequences - pre_germline_sequences)

			info = OrderedDict()
			info['size'] = len(germline_sequences)
			info['gained'] = gained
			info['lost'] = lost
			info['size_pre'] = len(pre_germline_sequences)
			info['gained_pre'] = gained_pre
			info['lost_pre'] = lost_pre
			iterations.append(info)
			prev_sequences = pre_germline_sequences
		d['iterations'] = iterations

		with open(output.json, 'w') as f:
			json.dump(d, f, indent=2)
			print(file=f)
