#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""%prog [options]

Reads FASTA-formatted data from stdin and writes selected sequences
to stdout. Options control which sequences are to be selected.

Example:
%prog -c 'ACGT' -m 100 -M 200 < input.fasta > output.fasta

This writes all sequences that exclusively contain the letters
A,C,G,T (in upper case) and whose length is between 100 and 200
to output.fasta. All other sequences are discarded.
"""
from __future__ import print_function, division
import sys
import os
import string
from sqt import HelpfulOptionParser
from sqt.io.fasta import FastaReader

__author__ = "Tobias Marschall"


def _passes_filters(sequence, min_length=None, max_length=None, allowed_chars=None):
	"""Return True if *sequence* satisfies every enabled filter.

	sequence      -- the sequence to test; must support len() and iteration
	min_length    -- reject sequences shorter than this (None disables the check)
	max_length    -- reject sequences longer than this (None disables the check)
	allowed_chars -- set of permitted characters (None disables the check)

	Both length bounds are inclusive. The length checks run first because
	they are cheaper than scanning the whole sequence.
	"""
	length = len(sequence)
	if min_length is not None and length < min_length:
		return False
	if max_length is not None and length > max_length:
		return False
	# issuperset() walks the sequence inside the set implementation instead
	# of running a Python-level loop with one membership test per character.
	if allowed_chars is not None and not allowed_chars.issuperset(sequence):
		return False
	return True


def main():
	"""Filter FASTA records read from stdin and print the selected ones.

	Command-line options:
	-c  characters a sequence may consist of
	-m  minimal sequence length (inclusive)
	-M  maximal sequence length (inclusive)

	An option that is not given disables the corresponding filter.
	Positional arguments, or a terminal attached to stdin, are reported
	through parser.error().
	"""
	parser = HelpfulOptionParser(usage=__doc__)
	parser.add_option("-c", action="store", dest="allowed_chars",
		help="Allowed characters.")
	parser.add_option("-m", action="store", dest="min_length", type=int,
		help="Minimal length.")
	parser.add_option("-M", action="store", dest="max_length", type=int,
		help="Maximal length.")
	(options, args) = parser.parse_args()
	if len(args) != 0:
		parser.error("Wrong number of arguments.")
	# File descriptor 0 is stdin: refuse to wait for interactive input.
	if os.isatty(0):
		parser.error("Expecting input from stdin.")
	# Build the character set once, outside the per-record loop. None means
	# "no character filter"; an empty -c value yields an empty set, which
	# only empty sequences can satisfy.
	if options.allowed_chars is None:
		allowed_chars = None
	else:
		allowed_chars = frozenset(options.allowed_chars)
	for s in FastaReader(sys.stdin):
		if not _passes_filters(s.sequence, options.min_length,
				options.max_length, allowed_chars):
			continue
		print('>', s.name, sep='')
		print(s.sequence)


# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
	main()
