#!python
"""Extract subordinate clauses from a text file or directory of text files"""
import argparse
import logging
import os
from qextract import read_in_chunks, nlp

# Constants
# Template for output file names in directory mode; the '{}' placeholder is
# filled with the name of the input file being processed.
OUTPUT_TEXT_FILE_BASE = 'subordinateClausesFrom{}'


def _extract_subordinate_clause(sentence):
    """Extract a subordinate clause from a sentence if it has one.

    The first token tagged 'IN' is treated as the subordinating conjunction.
    The text following that token is split on commas, and comma-separated
    pieces are accumulated until one of them contains a verb (tags VB, VBD,
    VBP or VBZ).  If no piece contains a verb, the whole remainder of the
    sentence is returned.  Commas between accumulated pieces are preserved.

    Args:
        sentence: the sentence text to analyse.

    Returns:
        A ``(subordinate_clause, subordinating_conj, flagged)`` tuple.  When
        no 'IN' token is found the result is ``('', None, False)``.
        ``flagged`` is currently always ``False``.
    """
    verb_tags = ('VB', 'VBD', 'VBP', 'VBZ')

    doc = nlp(sentence)
    conj_token = next((token for token in doc if token.tag_ == 'IN'), None)
    if conj_token is None:
        return '', None, False
    subordinating_conj = conj_token.text

    # Slice by the token's character offset rather than splitting on its
    # text: the same letters may occur earlier inside another word.
    rest_of_sentence = doc.text[conj_token.idx + len(subordinating_conj):]

    pieces = []
    for piece in rest_of_sentence.split(','):
        pieces.append(piece)
        # Each piece is parsed on its own to look for a verb; the first piece
        # that has one completes the clause.
        if any(token.tag_ in verb_tags for token in nlp(piece)):
            break

    subordinate_clause = subordinating_conj + ','.join(pieces)
    return subordinate_clause, subordinating_conj, False
    

def _write_clauses(output, sentences):
    """Write one record to *output* per sentence that has a subordinate clause."""
    for sent in sentences:
        sent = sent.replace('\n', ' ')
        clause, conj, flagged = _extract_subordinate_clause(sent)
        if clause:
            output.write('{}\n{}\n{}\n{}\n\n\n\n\n'.format(
                sent, clause, conj, flagged))


def _extract_from_file(input_file, output_file='qextract.out'):
    """Write the subordinate clauses found in *input_file* to *output_file*.

    The input is read lazily in chunks.  Semicolons are replaced by periods
    before sentence splitting.  The last sentence of every chunk may be cut
    off by the chunk boundary, so it is held back and prepended to the next
    chunk; whatever is still held back after the final chunk is processed at
    the end.

    Results are written to ``output_file + '.working'`` and renamed to
    *output_file* only on success, so a leftover ``.working`` file indicates
    an interrupted or failed run.

    Args:
        input_file: path of the text file to read.
        output_file: path of the file to write.

    Raises:
        Exception: any error raised while reading, parsing or writing is
            logged together with the input file name and then re-raised.
    """
    working_file = output_file + '.working'
    try:
        with open(input_file, 'r') as f, open(working_file, 'w') as output:
            leftovers = ''
            # lazy way of reading our file in case it's large
            for chunk in read_in_chunks(f):
                chunk = (leftovers + chunk).replace(';', '.')
                spans = list(nlp(chunk).sents)
                if not spans:
                    leftovers = chunk
                    continue
                # Hold back the last sentence (and anything after it): it may
                # continue in the next chunk.
                leftovers = chunk[spans[-1].start_char:]
                _write_clauses(output,
                               (span.text.strip() for span in spans[:-1]))
            # Nothing follows the final chunk, so the held-back text is
            # complete and can be processed now.
            if leftovers.strip():
                _write_clauses(output, (span.text.strip()
                                        for span in nlp(leftovers).sents))
    except Exception:
        logging.error('error on %s', input_file)
        raise
    # remove the .working extension to show the file is finished
    os.rename(working_file, output_file)


def _extract_from_directory(inputdir, outputdir='qextract_output'):
    """Run the extraction on every entry of *inputdir*.

    *outputdir* is created when missing.  Each input entry is written to
    *outputdir* under a name built from ``OUTPUT_TEXT_FILE_BASE``; entries
    whose output path was already present in *outputdir* when this function
    started are skipped.
    """
    os.makedirs(outputdir, exist_ok=True)

    # Snapshot of the output paths that exist before any work is done.
    already_written = {
        os.path.join(outputdir, os.fsdecode(present))
        for present in os.listdir(outputdir)
    }

    for entry in os.listdir(inputdir):
        entry_name = os.fsdecode(entry)
        source_path = os.path.join(inputdir, entry_name)
        target_path = os.path.join(
            outputdir, OUTPUT_TEXT_FILE_BASE.format(entry_name))
        if target_path in already_written:
            continue
        _extract_from_file(source_path, target_path)


if __name__ == '__main__':
    # Command-line interface: an input file or an input directory, each with
    # an optional output location.
    cli = argparse.ArgumentParser(
        description='Extract subordinate clauses from arbitrary text.')
    cli.add_argument(
        '-i', '--inputfile',
        help='Extract subordinate clauses from here.')
    cli.add_argument(
        '-I', '--inputdir',
        help='Extract subordinate clauses from files in this input directory.')
    cli.add_argument(
        '-o', '--outputfile',
        help='write output to this file')
    cli.add_argument(
        '-O', '--outputdir',
        help='write output to this directory.')
    args = cli.parse_args()

    # Directory mode takes precedence over single-file mode; the output
    # argument is only passed on when it was given, so the callee's default
    # applies otherwise.
    if args.inputdir:
        if args.outputdir:
            _extract_from_directory(args.inputdir, args.outputdir)
        else:
            _extract_from_directory(args.inputdir)
    elif args.inputfile:
        if args.outputfile:
            _extract_from_file(args.inputfile, args.outputfile)
        else:
            _extract_from_file(args.inputfile)

    # Generate ignore messages
    if args.inputdir:
        if args.inputfile or args.outputfile:
            logging.warning(
                'inputfile and outputfile unused when directory is specified.')
    elif args.outputdir:
        logging.warning('inputdir required with outputdir.')

    if args.outputfile and not args.inputfile:
        logging.warning('inputfile required with outputfile.')
