#!/usr/bin/env python3

import argparse

from somajo import Tokenizer


def arguments():
    """Parse the command-line arguments of the tokenizer script.

    Returns:
        The namespace produced by ``parser.parse_args()``. It carries the
        boolean flags ``split_camel_case`` and ``token_classes`` and
        ``FILE``, the input file that argparse has already opened for
        reading in text mode.
    """
    parser = argparse.ArgumentParser(description="Tokenize an input file according to the guidelines of the EmpiriST 2015 shared task on automatic linguistic annotation of computer-mediated communication / social media.")
    parser.add_argument("-c", "--split_camel_case", action="store_true", help="Split items written in camelCase (excluding several exceptions).")
    parser.add_argument("-t", "--token_classes", action="store_true", help="Output the token classes (number, XML tag, abbreviation, etc.) in addition to the tokens.")
    parser.add_argument("FILE", type=argparse.FileType("r"), help="The input file")
    return parser.parse_args()


def get_paragraphs(fh):
    """Yield the paragraphs of *fh* as lists of lines.

    A paragraph is a maximal run of consecutive lines that are not blank;
    a line counts as blank when stripping it leaves the empty string.
    Blank lines only act as separators and are never part of the yielded
    lists. The lines themselves are yielded unmodified.
    """
    current = []
    for line in fh:
        if line.strip() != "":
            current.append(line)
            continue
        # Blank line: close the running paragraph, if there is one.
        if current:
            yield current
            current = []
    # The input may end without a trailing blank line.
    if current:
        yield current


def main():
    """Tokenize the input file and print the result in vertical format.

    The file named on the command line is split into paragraphs, each
    paragraph is passed to the tokenizer as a single string, and the
    resulting tokens are printed one per line. When ``--token_classes``
    is given, every item returned by the tokenizer is joined with a tab
    before printing. Each paragraph is followed by one empty line.
    """
    args = arguments()
    tokenizer = Tokenizer(args.split_camel_case, args.token_classes)
    # argparse opened the input file; make sure it is closed again once
    # all paragraphs have been read and printed.
    with args.FILE as fh:
        # Paragraphs are processed one at a time, so the whole file never
        # has to be held in memory.
        for paragraph in get_paragraphs(fh):
            tokens = tokenizer.tokenize("".join(paragraph))
            if args.token_classes:
                tokens = ("\t".join(t) for t in tokens)
            # end="\n\n" terminates the last token line and adds the empty
            # separator line without appending a stray space to the token
            # (which print(p, "\n") did through its default separator).
            print("\n".join(tokens), end="\n\n")


# Run the tokenizer only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
