#!/usr/bin/python3
"""Convert a file or a text block from tradspell to Lytspel.

Copyright (c) 2018-2019 Christian Siefkes

Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
"""

# pylint: disable=too-many-lines

import argparse
from collections import Counter
import csv
from enum import Enum
from functools import lru_cache
from inspect import trace
from io import BytesIO, TextIOWrapper
from os import path
import re
import readline  # pylint: disable=unused-import
from sys import stderr, stdin, stdout
from typing import List, Sequence, Set, Tuple, TypeVar, Union
import warnings
from warnings import warn
from zipfile import is_zipfile, ZipFile

from lxml import etree, html
from pkg_resources import resource_stream
import spacy


# Constants and type variables
SCRIPTNAME = 'lytspel'

# Lower-case non-ASCII letters for which an ASCII replacement is known.
HANDLED_NON_ASCII_LETTERS = (
    'áàăâǎåäãąāảæćĉčċçďđéèĕêěëẽęēẻǵğĝǧġḡĥȟḧħíìĭîǐïĩįīĵǰḱǩĺľłḹḿńǹňñ'
    'óòŏôǒöõøǫōỏœṕŕřśŝšşßťẗúùŭûǔůüũųūủṽẃẁẘẅẍýỳŷẙÿỹȳỷźẑž'
)

# Character class covering every lower-case letter we can handle.
LOWERCASE_LETTER_PAT = '[a-z{}]'.format(HANDLED_NON_ASCII_LETTERS)

# All regexes below are precompiled: this is a bit less readable than inline patterns, but it
# speeds up the conversion process noticeably.

# Finds a convertible non-ASCII letter (either case).
NON_ASCII_LETTERS_RE = re.compile('[{}]'.format(HANDLED_NON_ASCII_LETTERS), re.IGNORECASE)

# Matches a word (letters with optional inner apostrophes). The capturing group makes
# 'split' return the words as well as the non-word text between them.
TOKEN_RE = re.compile(
    '(' + LOWERCASE_LETTER_PAT + "(?:['’]?" + LOWERCASE_LETTER_PAT +')*)',
    re.IGNORECASE)

# An upper-case letter that is followed, at any distance, by a lower-case one.
CAPTALIZED_RE = re.compile('[A-Z].*[a-z]')

# An upper-case letter that is preceded by some other character.
INNER_CAP_RE = re.compile('.[A-Z]')

# A word written entirely in capitals (hyphens and apostrophes allowed).
ALL_CAPS_RE = re.compile("[-'A-Z]+$")

# A single upper-case letter.
UPPER_CASE_RE = re.compile('[A-Z]')

# A contraction consisting of an apostrophe and one more character, e.g. "'d".
SINGLE_LETTER_CONTRACTION_RE = re.compile("'.$")

# The pronoun 'I', alone or followed by a contraction.
I_AND_CONTRACTIONS_RE = re.compile("I$|I['’]")

# Start of an arbitrary word (non-ASCII letters and contractions allowed): a letter,
# optionally preceded by an apostrophe.
WORD_RE = re.compile("['’]?" + LOWERCASE_LETTER_PAT, re.IGNORECASE)

# Word ending in a genitive "'s" or in a generic contraction such as "'ve".
GENERIC_CONTRACTION_RE = re.compile(".['’](s|d|ll|re|ve)$", re.IGNORECASE)

# Heuristic for text that ends a sentence.
ENDS_SENTENCE_HEURISTIC_RE = re.compile(r'([.?!](\s*["\'“‘”’])?|:\s*["\'“‘]|>(\s*["\'“‘])?)\s*$')

# A single digit.
DIGIT_RE = re.compile(r'\d')

# First non-whitespace characters typically found at the start of an XML or HTML file.
XML_START_RE = re.compile('<[!?h]', re.IGNORECASE)

# Abbreviations in capitals that are never respelled.
CAP_ABBREVS = {'BIOS', 'DE', 'GA', 'HI', 'LA', 'MA', 'ME', 'MI', 'OH', 'PA', 'US', 'WA'}

XHTML_NAMESPACE = '{http://www.w3.org/1999/xhtml}'

BLOCK_LEVEL_TAGS = {
    'address', 'article', 'aside', 'blockquote', 'canvas', 'dd', 'div', 'dl', 'dt',
    'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'header', 'hr', 'li', 'main', 'nav', 'noscript', 'ol', 'output', 'p', 'pre', 'section',
    'table', 'tfoot', 'ul', 'video',
}

# POS tag (dictionary notation) to try next when a word has no entry for the key tag.
POS_FALLBACKS = {'aj': 'n', 'av': 'aj', 'n': 'aj', 'prp': 'aj', 'v': 'n'}

T = TypeVar('T')

ConvState = Enum('ConvState', 'LOOKS_FOREIGN NLP_NEEDED')  # pylint: disable=invalid-name


# Initialization code

if __name__ == '__main__':
    def compact_warning(message, category, filename, lineno, line=None):
        """Format a warning as one compact line, leaving out the offending source line.

        Only the base name of 'filename' is shown. 'line' is accepted because
        'warnings.formatwarning' is called with it, but it is ignored.
        """
        #pylint: disable=unused-argument
        return '{}:{}: {}: {}\n'.format(
            path.basename(filename), lineno, category.__name__, message)

    # Only replace the global formatter when run as a script
    warnings.formatwarning = compact_warning


# Utility functions

def get_elem(seq: Sequence[T], idx: int) -> Union[T, None]:
    """Safely retrieve an element from a sequence.

    None is returned if 'idx' lies outside of 'seq', i.e. if 'seq' ends before the
    requested position or, for a negative 'idx', if it has fewer elements than needed
    to count back that far.
    """
    # EAFP: a plain length check would let negative out-of-range indexes slip through
    try:
        return seq[idx]
    except IndexError:
        return None


# Classes

class Dictionary:
    """A dictionary mapping tradspell to Lytspel.

    Instances of this class are threadsafe after initialization. Usually it should be
    enough to have a single instance per program.
    """

    def __init__(self):
        """Initializes and loads the dictionary."""
        self._dict = {}
        self._mixed_dict = {}

        with TextIOWrapper(resource_stream('lytspel', 'lytspel-dict.csv'),
                           encoding='utf-8') as csv_stream:
            csvreader = csv.reader(csv_stream)
            next(csvreader)  # skip header line
            redirects = {}  # will be resolved later

            for row in csvreader:
                tradspell = get_elem(row, 0)
                pos = get_elem(row, 1)
                redirect = get_elem(row, 2)
                lytspel = get_elem(row, 3)

                if tradspell and lytspel:
                    ts_lower = tradspell.lower()
                    ls_lower = lytspel.lower()

                    if pos:
                        # Treat value as a dict of POS-tagged entries
                        if not ts_lower in self._dict:
                            self._dict[ts_lower] = {}
                        self._dict[ts_lower][pos] = ls_lower
                    else:
                        self._dict[ts_lower] = ls_lower

                    if self.is_mixed_case(lytspel):
                        self._mixed_dict[tradspell] = lytspel

                elif tradspell and redirect:
                    redirects[tradspell.lower()] = redirect.lower()
                else:
                    warn('Unexpected/malformed CSV row: {}'.format(','.join(row)))

        # Resolve redirects
        for key, target in redirects.items():
            value = self._dict.get(target)
            if value:
                self._dict[key] = value
            else:
                warn("Target '{}' of redirect '{}' missing!".format(target, key))

    @staticmethod
    def is_mixed_case(word: str) -> bool:
        """Tell whether a word is written in MiXed case.

        That is assumed to be the case if the word starts with an upper-case letter and
        contains at least one further upper-case letter as well as a lower-case one. The
        word must already be ASCII-fied, otherwise the result is unreliable.
        """
        if CAPTALIZED_RE.match(word) is None:
            return False
        return INNER_CAP_RE.search(word) is not None

    @lru_cache(maxsize=1048576)
    def lookup(self, word: str, spacy_pos: str = None, at_sent_start: bool = False) -> Union[
            str, ConvState]:
        """Lookup a word in the dictionary and returned its converted form.

        'spacy_pos' is the POS tag as returned by spaCy. If it's omitted and if the respelling
        of the word depends on its POS tag, ConvState.NLP_NEEDED is returned.

        'at_sent_start' should be set to true if this token is the first of a new sentence.
        It is used to handle the case of the converted pronoun 'I' and its contractions.

        None is returned if the word is unknown.

        Case is restored (lower, Capitalized, or ALL_CAPS). MixedCase is also restored
        provided that *both* the input word and the dictionary entry use this case form
        (e.g. JavaScript -> JaavaScript).
        """
        # pylint: disable=too-many-branches
        word = self.asciify(word)
        # Strip final "'s" (genitive) or contraction and remember for later (handling case)
        word, contraction = self.strip_generic_contraction(word)

        if word in CAP_ABBREVS:
            result = word
        else:
            lower = word.lower()

            if lower in ("'ll", "'re", "'ve"):
                # Contraction tokens produced by spaCy: discard the last letter
                result = lower[:-1]
            elif SINGLE_LETTER_CONTRACTION_RE.match(lower):
                # Single-letter contraction token: return as is
                result = lower
            else:
                result = self._dict.get(lower)

                if result is None:
                    result = self.fallback_lookup(lower)
                    if result is None:
                        # Still no match
                        return None

                if isinstance(result, dict):
                    if spacy_pos:
                        result = self._find_pos_tagged_entry(result, spacy_pos)
                    else:
                        return ConvState.NLP_NEEDED

            result = self._restore_capitalization(result, word)

            if not at_sent_start and I_AND_CONTRACTIONS_RE.match(word):
                # Correct the case of 'I' and its contractions
                result = result.lower()

        if contraction:
            if len(contraction) == 3:
                # Strip the final letter of two-letter contractions such as "'ll" or "'ve"
                contraction = contraction[:-1]
            result += contraction

        result = result.replace("'", '’')  # replace normal by typographic apostrophe
        return result

    @staticmethod
    @lru_cache(maxsize=1048576)
    def asciify(word: str) -> str:
        """Return the ASCII equivalent of a word.

        Typographic apostrophes become plain ones and each diacritical letter is replaced
        by its closest ASCII equivalent (which may be two letters long, e.g. 'ae' or 'ss').
        The case of replaced letters is preserved. Non-ASCII characters without a known
        replacement are kept.
        """
        # Groups of lower-case letters and the ASCII text standing in for each of them
        substitutions = (
            # vowels and semivowels
            ('áàăâǎåäãąāả', 'a'),
            ('æ', 'ae'),
            ('éèĕêěëẽęēẻ', 'e'),
            ('íìĭîǐïĩįī', 'i'),
            ('óòŏôǒöõøǫōỏ', 'o'),
            ('œ', 'oe'),
            ('úùŭûǔůüũųūủ', 'u'),
            ('ýỳŷẙÿỹȳỷ', 'y'),
            # consonants
            ('ćĉčċç', 'c'),
            ('ďđ', 'd'),
            ('ǵğĝǧġḡ', 'g'),
            ('ĥȟḧħ', 'h'),
            ('ĵǰ', 'j'),
            ('ḱǩ', 'k'),
            ('ĺľłḹ', 'l'),
            ('ḿ', 'm'),
            ('ńǹňñ', 'n'),
            ('ṕ', 'p'),
            ('ŕř', 'r'),
            ('śŝšş', 's'),
            ('ß', 'ss'),
            ('ťẗ', 't'),
            ('ṽ', 'v'),
            ('ẃẁẘẅ', 'w'),
            ('ẍ', 'x'),
            ('źẑž', 'z'),
        )

        ascii_word = word.replace('’', "'")  # ASCII-ify apostrophes

        if not NON_ASCII_LETTERS_RE.search(ascii_word):
            return ascii_word  # nothing left to replace

        chars = list(ascii_word)

        for pos, char in enumerate(chars):
            if ord(char) < 128:
                continue  # already ASCII

            lowered = char.lower()
            plain = next((repl for group, repl in substitutions if lowered in group), '')

            if plain:
                # An upper-case original gets an upper-case replacement
                chars[pos] = plain.upper() if char != lowered else plain

        return ''.join(chars)

    @staticmethod
    def strip_generic_contraction(word: str) -> Tuple[str, str]:
        """Split a final genitive ("'s") or generic contraction (such as "'ve") off a word.

        The result is a pair: the actual word and the stripped part (including its
        apostrophe). For words without such an ending, the pair consists of the unchanged
        argument and an empty string.

        Simple as well as typographic apostrophes are recognized.
        """
        found = GENERIC_CONTRACTION_RE.search(word)

        if found is None:
            return word, ''

        # The match starts one character before the apostrophe
        cut = found.start() + 1
        return word[:cut], word[cut:]

    def fallback_lookup(self, lc_word: str) -> str:
        """Fallback lookup if the regular lookup failed.

        'lc_word' must be a lower-case word.

        Checks if the word is a contraction that starts with an apostrophe (e.g. "'cause" or
        "'em"), as sometimes produced by spaCy, or if it ends in -in and a corresponding -ing
        entry exists (e.g. 'friggin' instead of 'frigging').
        """
        if lc_word.startswith("'"):
            result = self._dict.get(lc_word[1:])
            if isinstance(result, str):
                return "'" + result
        elif lc_word.endswith('in'):
            result = self._dict.get(lc_word + 'g')
            if isinstance(result, str) and result.endswith('g'):
                return result[:-1]

        return None

    @staticmethod
    def translate_pos(spacy_pos: str) -> str:
        """Translate a POS tag as used by spaCy into the form used in the dictionary."""
        if spacy_pos in ('AUX', 'VERB'):
            return 'v'
        elif spacy_pos in ('NOUN', 'PROPN'):
            return 'n'
        elif spacy_pos == 'ADJ':
            return 'aj'
        elif spacy_pos == 'ADV':
            return 'av'
        elif spacy_pos == 'ADP':
            return 'prp'
        else:
            # Other POS tags shouldn't usually occur regarding our POS-tagged words,
            # but we use a reasonable default for cases where spaCy gets it wrong
            return 'v'

    def _find_pos_tagged_entry(self, entries: dict, pos: str,
                               tried_variants: List[str] = None) -> str:
        """Find the entry to use if several POS-tagged spellings exist for a word.

        'pos' is the POS tag as returned by spaCy (capitalized) or used in our dictionary
        (lower-case) - the former will be translated into the later if needed.

        'tried_variants' is used internally to prevent endless loops in case another POS tag
        has to be tried as fallback.
        """
        if pos and pos[0].isupper():
            pos = self.translate_pos(pos)
        if tried_variants is None:
            tried_variants = []

        tried_variants.append(pos)
        result = entries.get(pos)

        if result is None:
            fallback = POS_FALLBACKS.get(pos)

            if fallback is None or fallback in tried_variants:
                if fallback is None:
                    warn("No fallback found for POS tag '{}'; using last entry of {} as fallback"
                         .format(pos, entries))
                else:
                    warn("Trying POS tag '{}' after {} would loop; using last entry of {} as "
                         'fallback'.format(pos, tried_variants, entries))

                result = entries[sorted(entries.keys())[-1]]
            else:
                result = self._find_pos_tagged_entry(entries, fallback, tried_variants)

        return result

    def _restore_capitalization(self, converted: str, word: str) -> str:
        """Restore the capitalization of 'converted' to match that of 'word'."""
        if ALL_CAPS_RE.match(word):
            return converted.upper()  # ALL_CAPS

        if UPPER_CASE_RE.match(word):
            if word in self._mixed_dict:
                return self._mixed_dict[word]  #  MixedCase
            else:
                return converted[0].upper() + converted[1:]  # Capitalized

        return converted  # No change


class Converter:
    """A converter from tradspell to Lytspel.

    Instances of this class are stateful and hence not threadsafe.
    """

    # Attributes that are shared across all instances, but loaded in the constructor to avoid
    # loading them in cases where they aren't needed (e.g. when this program is called with the
    # --help option)
    _dict = None
    _nlp = None

    def __init__(self, use_unknown_counter: bool = False) -> None:
        """Create a new instance.

        Parameters:

        * use_unknown_counter: whether unknown words should be counted
        """
        # Switch consulted during lookup and updated during conversion and tokenization
        self._at_sent_start = True
        # Optional counter of unknown words
        self._unknown_counter: Counter = Counter() if use_unknown_counter else None

        # Lazy initialization of static attributes
        if self._dict is None:
            self._dict = Dictionary()

        if self._nlp is None:
            try:
                # Load spaCy without any unnecessary components
                self._nlp = spacy.load('en', disable=['parser', 'ner'])
            except OSError:
                print("{}: Downloading language model for the spaCy POS tagger\n"
                      "(don't worry, this will only happen once)".format(SCRIPTNAME), file=stderr)
                from spacy.cli import download
                download('en')
                self._nlp = spacy.load('en', disable=['parser', 'ner'])

    @staticmethod
    def tokenize_text(text: str) -> List[str]:
        """Split a string into a list of alternating word and non-word tokens.

        Words start and end with a letter and may contain apostrophes. Everything between
        two words (punctuation, whitespace, digits, ...) forms a single non-word token.
        An empty list is returned if 'text' is empty or None.
        """
        if not text:
            return []

        pieces = TOKEN_RE.split(text)
        # Text that starts or ends with a word yields an empty string at that end; skip it
        first = 1 if pieces[0] == '' else 0
        last = len(pieces) - 1 if pieces[-1] == '' else len(pieces)
        return pieces[first:last]

    def text_looks_foreign(self, text: str) -> bool:
        """Check whether a text fragment looks like it's written in a foreign language.

        Returns True iff a majority of the words are unknown.
        """
        in_tokens = self.tokenize_text(text)
        known_words = 0
        unknown_words = 0

        for token in in_tokens:
            if WORD_RE.match(token):
                conv = self._dict.lookup(token, None, self._at_sent_start)

                if conv:
                    known_words += 1
                else:
                    unknown_words += 1

        return unknown_words > known_words

    @staticmethod
    def ends_sentence(token: str) -> bool:
        """Guess by simple heuristics whether a token ends a sentence.

        True is returned if the token ends in one of the following (trailing whitespace
        is ignored in all cases):

        * a dot, question or exclamation mark, optionally followed by an opening or
          closing quote mark
        * a colon followed by an opening quote mark
        * '>' (quoted text marker), optionally followed by an opening quote mark

        Empty tokens never end a sentence.
        """
        return bool(token) and ENDS_SENTENCE_HEURISTIC_RE.search(token) is not None

    def _update_at_sent_start(self, token: str) -> None:
        """Update the '_at_sent_start' attribute.

        Parameter:

        * token: the last read token which must be a non-word
        """
        local_value = self._at_sent_start
        if local_value:
            if DIGIT_RE.search(token):
                # Token contains digits, hence it is considered as first token of the new sentence
                local_value = False

        # Set to True if the token seems to end a sentence OR if the attribute was already True
        # (and not set to False by us due to encountering a number)
        self._at_sent_start = local_value or self.ends_sentence(token)

    def _convert_text_if_simple(self, text: str, test_if_foreign: bool = True,
                                starts_sent: bool = True) -> Union[str, ConvState]:
        """Convert a text fragment if doing so is possible without POS tagging.

        'starts_sent' is an optional attribute that specifies whether this text fragment is
        likely to start with a new sentence. If true, the '_at_sent_start' switch is set
        accordingly, otherwise it is left alone.

        The return value is either:

        * The converted text (a string)
        * ConvState.NLP_NEEDED if POS tagging (NLP) is needed)
        * ConvState.LOOKS_FOREIGN if 'test_if_foreign' is true and a majority of the words in
          the fragment are unknown
        """
        # pylint: disable=too-many-branches
        orig_at_sent_start = self._at_sent_start  # Remember in case we have to restore it later

        if not text:
            return text  # Empty or None, nothing to do
        if starts_sent:
            self._at_sent_start = True

        in_tokens = self.tokenize_text(text)
        out_tokens: List[str] = []
        lasttok = ''
        known_words = 0
        unknown_words = 0

        if self._unknown_counter is not None:
            local_unknown_counter: Counter = Counter()

        for token in in_tokens:
            if WORD_RE.match(token):
                conv = self._dict.lookup(token, None, self._at_sent_start)

                if conv is ConvState.NLP_NEEDED:
                    self._at_sent_start = orig_at_sent_start
                    return ConvState.NLP_NEEDED
                elif isinstance(conv, str):
                    known_words += 1
                    out_tokens.append(conv)
                else:
                    unknown_words += 1
                    out_tokens.append(token)

                    if self._unknown_counter is not None:
                        actual_word = self._dict.strip_generic_contraction(token)[0]
                        if len(actual_word) > 1:
                            local_unknown_counter[actual_word] += 1

                self._at_sent_start = False
            else:
                # Not a word
                if token == '-' and lasttok and WORD_RE.match(lasttok):
                    # Check if this forms a hyphenated prefix with the preceding token, e.g. 're-'
                    conv = self._dict.lookup(lasttok + token, None, self._at_sent_start)

                    if isinstance(conv, str):
                        out_tokens[-1] = conv
                    else:
                        out_tokens.append(token)
                else:
                    # Append non-word as is and check whether it terminates a sentence
                    out_tokens.append(token)
                    self._update_at_sent_start(token)

            lasttok = token

        if not test_if_foreign or unknown_words <= known_words:
            # We shouldn't make the foreign language test OR at least half of the words are known
            if self._unknown_counter is not None:
                self._unknown_counter += local_unknown_counter

            return ''.join(out_tokens)
        else:
            return ConvState.LOOKS_FOREIGN

    def convert_para(self, text: str, test_if_foreign: bool = True,
                     starts_sent: bool = True) -> str:
        """Convert a paragraph.

        The paragraph is first converted without POS tagging. Only if that turns out to
        be impossible (because the spelling of some word depends on its POS tag) is spaCy
        invoked and the paragraph converted token by token.

        'starts_sent' is an optional attribute that specifies whether this text fragment is likely
        to start with a new sentence. If true, the 'self._at_sent_start' switch is set accordingly,
        otherwise it is left alone.

        If 'test_if_foreign' is True and a majority of the words in the paragraph are unknown,
        the paragraph is assumed to be written in a foreign language and returned unchanged.
        Note that if POS tagging is needed, the foreign-language test is made regardless of
        'test_if_foreign'.

        Unknown words are kept unchanged and, if enabled, counted in the unknown-words
        counter. Empty or None input is returned as is.

        Raises ValueError if a dictionary lookup or the simple conversion returns a value
        of an unexpected kind.
        """
        # pylint: disable=too-many-branches, too-many-locals, too-many-nested-blocks, too-many-statements
        if not text:
            if starts_sent:
                # Nothing to do, but we still update the global state (a new block-level HTML
                # element might have been opened)
                self._at_sent_start = True
            return text

        simple_result = self._convert_text_if_simple(text, test_if_foreign, starts_sent)

        if simple_result is ConvState.NLP_NEEDED:
            # We have to invoke spaCy for POS tagging (unless the text looks foreign)
            if self.text_looks_foreign(text):
                return text  # Return text unchanged

            doc = self._nlp(text)
            out_tokens: List[str] = []
            lasttok = ''  # the previously handled token (word or non-word)
            last_nonword = ''  # the previous non-word token, reset whenever a word is seen

            if starts_sent:
                self._at_sent_start = True

            for entry in doc:
                # Sometimes spaCy doesn't recognize all word boundaries, hence we run our own
                # tokenizer on each of its entries (unless it looks like an URL or abbreviation)
                if entry.text.count('.') >= 2 or entry.text.count('/') >= 2:
                    in_tokens = [entry.text]
                    looks_like_url = True
                else:
                    in_tokens = self.tokenize_text(entry.text)
                    looks_like_url = False

                # Glue contraction tokens such as "'ll" back together
                if len(in_tokens) >= 2 and in_tokens[0] in ("'", "’") and \
                        WORD_RE.match(in_tokens[1]):
                    in_tokens[0] += in_tokens[1]
                    in_tokens.pop(1)

                # Counting from 1 so that 'idx == len(in_tokens)' identifies the last token
                for idx, token in enumerate(in_tokens, start=1):
                    tail = ''

                    if idx == len(in_tokens) and entry.whitespace_:
                        tail = entry.whitespace_  # optional trailing whitespace after the entry

                    if token.lower() in ("n't", 'n’t'):
                        # SpaCy treats "n't" (as in "don't" etc.) as a separate word, but we look
                        # it up together with the preceding word because the joined pronunciation
                        # (and hence spelling) is sometimes different
                        if out_tokens:
                            out_tokens.pop()
                        token = lasttok + token

                        # The preceding word was handled on its own before; if it was counted
                        # as unknown then, take that back
                        if self._unknown_counter is not None and lasttok in self._unknown_counter:
                            self._unknown_counter[lasttok] -= 1

                    if WORD_RE.match(token):
                        conv = self._dict.lookup(token, entry.pos_, self._at_sent_start)

                        if isinstance(conv, str):
                            out_tokens.append(conv)
                        elif conv is not None:
                            # A POS tag was passed, hence anything but a string or None
                            # is unexpected
                            raise ValueError('lookup({}, {}) returned unexpected result: {}'
                                             .format(token, entry.pos_, conv))
                        else:
                            # Unknown word: keep it unchanged
                            out_tokens.append(token)

                            if self._unknown_counter is not None and not looks_like_url:
                                actual_word = self._dict.strip_generic_contraction(token)[0]
                                if len(actual_word) > 1:
                                    self._unknown_counter[actual_word] += 1

                        # After a word, only the whitespace following it is checked for
                        # signs of a sentence end
                        self._at_sent_start = self.ends_sentence(tail)
                        last_nonword = ''
                    else:
                        # Not a word
                        if token == '-' and not tail and lasttok and WORD_RE.match(lasttok):
                            # Check if this forms a hyphenated prefix with the preceding token,
                            # e.g. 're-'
                            conv = self._dict.lookup(lasttok + token, entry.pos_,
                                                     self._at_sent_start)

                            if isinstance(conv, str):
                                # Replace the already converted word by the prefix form
                                out_tokens[-1] = conv
                            elif conv is not None:
                                raise ValueError(
                                    'lookup({}, {}) returned unexpected result: {}'
                                    .format(token, entry.pos_, conv))
                            else:
                                out_tokens.append(token)
                                last_nonword = token
                        else:
                            # Append as is
                            out_tokens.append(token)

                            # We also consider the last non-word token for cases such as ‹: "›
                            # (colon followed by quote) which are considered two tokens by spaCy
                            self._update_at_sent_start(last_nonword + token + tail)
                            last_nonword = token

                    lasttok = token

                    # Append trailing whitespace to token, if any
                    if tail:
                        out_tokens[-1] += tail

            return ''.join(out_tokens)
        elif simple_result is ConvState.LOOKS_FOREIGN:
            return text  # Return text unchanged
        elif isinstance(simple_result, str):
            return simple_result
        else:
            raise ValueError('self._convert_text_if_simple returned unexpected result: {}'
                             .format(simple_result))

    @staticmethod
    def determine_file_type(filename: str) -> str:
        """Inspect the contents of a file to determine the likely file type.

        Returns either 'epub', 'html', 'txt', or None if the file is clearly not of any of
        these types. However, 'txt' is used as a fairly general fallback hence it's quite
        possible that a file labeled as 'txt' is actually something else
        """
        # Check if it's an epub file
        if is_zipfile(filename):
            with ZipFile(filename) as zin:
                bstr = b''

                try:
                    bstr = zin.read('mimetype')
                except KeyError:
                    pass  # Not an epub file

                if bstr.decode().startswith('application/epub+zip'):
                    return 'epub'
                else:
                    return None

        with open(filename) as file:
            for line in file:
                line = line.strip()

                if not line:
                    continue  # Empty line, inspect next one

                if XML_START_RE.match(line):
                    # File seems to start with a DOCTYPE or XML declaration, HTML comment or
                    # <html> tag
                    return 'html'
                else:
                    break

        return 'txt'

    # pylint: disable=protected-access

    @staticmethod
    def simple_tag(elem: etree._Element) -> str:
        """Return the tag name of an Element without the XHTML namespace, if used.

        If the element lives within the XHTML namespace, just the local name is returned,
        e.g. '{http://www.w3.org/1999/xhtml}img' becomes 'img'.

        In all other cases, the tag is returned unchanged. This includes tags that aren't
        strings at all: None is returned as is, and so are the factory functions which lxml
        uses as tag of comments, processing instructions, and entities.
        """
        tag = elem.tag

        # Only real tag names can carry a namespace prefix. Calling string methods on
        # anything else (e.g. the tag of a comment node) would raise an AttributeError.
        if not isinstance(tag, str):
            return tag

        if tag.startswith(XHTML_NAMESPACE):
            return tag[len(XHTML_NAMESPACE):]

        return tag

    def convert_html_elem(self, elem: etree._Element) -> None:
        """Convert the text of an element in an HTML document, descending into its children.

        The foreign-language test is made per block-level element (such as 'h1',
        'blockquote', 'p'), but only for those that do NOT have a block-level element as
        direct child (e.g. if an 'ol' contains 'li' elements, each 'li' is judged on its own
        instead of the 'ol' as a whole). If the text embedded in such an element looks
        foreign, the whole element is left unconverted.
        """
        is_block = self.simple_tag(elem) in BLOCK_LEVEL_TAGS

        # Check if we should make the foreign-language test at this level
        if is_block and not any(self.simple_tag(kid) in BLOCK_LEVEL_TAGS for kid in elem):
            embedded_text = str(etree.XPath('string()')(elem))

            if self.text_looks_foreign(embedded_text):
                return  # Doesn't seem to be English, skip this part of the document tree

        # Text directly at the start of the element (only block-level elements count as start
        # of a new paragraph)
        elem.text = self.convert_para(elem.text, False, starts_sent=is_block)

        non_text_nodes = (etree._Comment, etree._ProcessingInstruction)

        for kid in elem:
            # Comments, processing instructions, scripts and style sheets are not descended
            # into, since they don't contain normal text
            if (not isinstance(kid, non_text_nodes)
                    and self.simple_tag(kid) not in ('script', 'style')):
                self.convert_html_elem(kid)

            # The text following a child is always converted, if there is any
            if kid.tail:
                kid.tail = self.convert_para(kid.tail, False, starts_sent=False)

        # A few textual attributes are converted as well, if present and non-empty
        for name in ('alt', 'title'):
            value = elem.attrib.get(name)

            if value:
                elem.attrib[name] = self.convert_para(value)

    def convert_html_document(self, filename_or_bytes: Union[str, BytesIO]) -> bytes:
        """Convert an HTML or XHTML document and return the result.

        The input is parsed as XHTML first; if that fails with a syntax error, it is parsed
        as regular HTML instead. The text of the first 'title' element found and the content
        of the 'body' element are converted, the result is serialized with the serializer
        that matches the parser used.

        Returns an UTF-8 encoded bytestring.
        """
        try:
            doc = etree.parse(filename_or_bytes)
        except etree.XMLSyntaxError:
            # Not well-formed XML, hence treat it as regular HTML
            doc = html.parse(filename_or_bytes)
            serialize = html.tostring
        else:
            serialize = etree.tostring

        root = doc.getroot()
        namespaces = root.nsmap
        title_elem = root.find('.//title', namespaces=namespaces)
        body_elem = root.find('body', namespaces=namespaces)

        if title_elem is not None:
            title_elem.text = self.convert_para(title_elem.text)

        if body_elem is not None:
            self.convert_html_elem(body_elem)

        return serialize(doc, encoding='utf8')

    def convert_xml_elem(self, elem: etree._Element) -> None:
        """Convert the text of an element in an XML document, descending into its children.

        This function is only meant for elements that aren't (X)HTML. The text of the element
        and of all its descendants is passed through 'convert_para', while attribute values
        are never touched. (The HTML counterpart, in contrast, skips certain elements such as
        'script' and converts certain attributes such as 'alt'.)

        Each text fragment is handed to 'convert_para' on its own, rather than deciding on
        the level of block-level tags (such as 'p' or 'li') as done for HTML.
        """
        elem.text = self.convert_para(elem.text)
        skipped_types = (etree._Comment, etree._ProcessingInstruction)

        for node in elem:
            # Comments and PIs are not descended into, but the text following them is
            # converted just like the text following any other child
            descend = not isinstance(node, skipped_types)

            if descend:
                self.convert_xml_elem(node)

            node.tail = self.convert_para(node.tail, starts_sent=False)

    def convert_xml_document(self, filename_or_bytes: Union[str, BytesIO]) -> bytes:
        """Convert an XML document and return the result.

        This function is only meant for documents that aren't (X)HTML. The document is
        parsed, converted in place starting at its root element, and serialized again.

        Returns an UTF-8 encoded bytestring.
        """
        tree = etree.parse(filename_or_bytes)
        root = tree.getroot()
        self.convert_xml_elem(root)
        return etree.tostring(tree, encoding='utf8')

    @staticmethod
    def _make_hrefs_absolute(items: Sequence[etree._Element], dirname: str) -> Sequence[
            etree._Element]:
        """Convert a list of XML elements with filenames from relative into absolute filenames.

        Each member of the 'items' sequence must have a 'href' attribute that will be
        modified accordingly.

        If dirname is empty, the original list is returned unchanged.
        """
        if dirname:
            for item in items:
                item.attrib['href'] = path.join(dirname, item.attrib['href'])

        return items

    def _find_epub_members_to_convert(self, zin: ZipFile) -> Tuple[Set[str], Set[str]]:
        """Determine which members of an epub ZIP archive should be converted.

        'zin' must be a ZipFile open for reading.

        Returns a tuple of two sets of absolute file names:

        * Set of HTML files to convert
        * Set of other XML files to convert (OPF files and the deprecated NCX files)
        """
        # The container file names the OPF file(s): usually there is just one, but
        # multiple-rendition epubs have several ones
        container = etree.fromstring(zin.read('META-INF/container.xml'))
        opf_files = container.xpath(
            'n:rootfiles/n:rootfile/@full-path',
            namespaces={'n': 'urn:oasis:names:tc:opendocument:xmlns:container'})

        # Collect the manifest items of all OPF files, with names relative to the archive root
        manifest_items: List[etree._Element] = []

        for opf_name in opf_files:
            package = etree.fromstring(zin.read(opf_name))
            listed = package.xpath('/p:package/p:manifest/p:item',
                                   namespaces={'p': 'http://www.idpf.org/2007/opf'})
            manifest_items.extend(self._make_hrefs_absolute(listed, path.dirname(opf_name)))

        html_files: Set[str] = set()
        ncx_files: Set[str] = set()

        # Media types of the members to convert (NCX is deprecated, but might occur in older
        # epubs); items of any other type are ignored
        targets = {
            'application/xhtml+xml': html_files,
            'application/x-dtbncx+xml': ncx_files,
        }

        for item in manifest_items:
            target = targets.get(item.attrib['media-type'])

            if target is not None:
                target.add(item.attrib['href'])

        return html_files, ncx_files.union(opf_files)

    def convert_epub(self, filename: str, out_filename: str = None) -> None:
        """Convert an epub file, writing the result to another epub file.

        The 'out_filename' argument is optional; if omitted, a suitable name is generated by
        inserting '-lytspel' before the file extension (if the input file is called
        'FILE.epub', the output file will be called 'FILE-lytspel.epub').

        A message naming the output file is printed to stderr once it has been written.
        """
        if not out_filename:
            out_filename = '{}-lytspel{}'.format(*path.splitext(filename))

        with ZipFile(filename, 'r') as source:
            html_members, xml_members = self._find_epub_members_to_convert(source)

            with ZipFile(out_filename, 'w') as target:
                target.comment = source.comment  # Preserve the comment, if any

                # Every member is copied to the output archive; those selected above are
                # converted on the way
                for info in source.infolist():
                    name = info.filename
                    payload = source.read(name)

                    if name in html_members:
                        payload = self.convert_html_document(BytesIO(payload))
                    if name in xml_members:
                        payload = self.convert_xml_document(BytesIO(payload))

                    target.writestr(info, payload)

        print('{}: Output written to {}'.format(SCRIPTNAME, out_filename), file=stderr)

    def convert_stdin_interactively(self) -> None:
        """Convert plain text read line by line from stdin, printing each result to stdout.

        Returns once the end of the input has been reached.
        """
        try:
            while True:
                # The input function is used on purpose, since it allows pressing Arrow-Up
                # to repeat earlier inputs
                typed = input()
                converted = self.convert_para(typed)
                print(converted)
        except EOFError:
            pass  # End of input, we're done

    def convert_text_document(self, filename: str, out_filename: str = None) -> None:
        """Convert a plain text file.

        If 'filename' is '-', input is read from stdin. If stdin is a terminal, the input is
        converted interactively and the output is always written to stdout.

        If `out_filename` is omitted, output will be written to stdout.

        Paragraphs are considered to be separated by empty lines. However, very long lines
        (200+ chars) are considered paragraphs in their own right. Trailing whitespace is
        stripped from every line.

        Any file opened by this method is closed again, even if the conversion fails.
        """
        read_stdin = filename == '-'

        if read_stdin and stdin.isatty():
            self.convert_stdin_interactively()
            return

        infile = stdin if read_stdin else open(filename)

        try:
            outfile = open(out_filename, 'w') if out_filename else stdout

            try:
                pending = []  # Lines of the paragraph collected so far

                def flush_para() -> None:
                    """Convert and print the pending paragraph, if any."""
                    if pending:
                        print(self.convert_para('\n'.join(pending)), file=outfile)
                        pending.clear()

                for line in infile:
                    line = line.rstrip()

                    if len(line) >= 200:  # stand-alone paragraph
                        flush_para()
                        print(self.convert_para(line), file=outfile)
                    elif line:            # regular line
                        pending.append(line)
                    else:                 # empty line
                        flush_para()
                        print(file=outfile)

                # Convert final paragraph, if any
                flush_para()
            finally:
                # stdout must stay open, only a file opened above is closed
                if out_filename:
                    outfile.close()
        finally:
            if not read_stdin:
                infile.close()

    def convert_file(self, filename: str, out_filename: str = None) -> None:
        """Convert the file with the specified name.

        Recognized file types are epub, HTML and plain text.

        If 'filename' is '-', input is read from stdin and assumed to be plain text.

        The 'out_filename' argument is optional; if omitted, HTML and text output will be
        written to stdout, while a suitable file name will be generated for epub.

        Raises SystemExit with an error message if the file type is not supported.
        """
        if filename == '-':
            filetype = 'txt'
        else:
            filetype = self.determine_file_type(filename)

            # Files of 256 KiB or more are announced, since their conversion takes some time
            if path.getsize(filename) / 1024 >= 256:
                print('{}: Converting {} -- this may take a while...'.format(SCRIPTNAME, filename),
                      file=stderr)

        if filetype == 'txt':
            self.convert_text_document(filename, out_filename)
        elif filetype == 'html':
            bstr = self.convert_html_document(filename)

            if out_filename:
                with open(out_filename, 'wb') as outfile:
                    outfile.write(bstr)
                    outfile.write(b'\n')
            else:
                print(bstr.decode())

        elif filetype == 'epub':
            self.convert_epub(filename, out_filename)
        elif filetype is None:
            # SystemExit is raised directly, since the 'exit' builtin is only provided by the
            # 'site' module and hence not available in every environment
            raise SystemExit('{}: Cannot convert {} (unsupported file type)'
                             .format(SCRIPTNAME, filename))
        else:
            raise ValueError('Unexpected file type: {}'.format(filetype))

    def _unify_case_differences(self) -> None:
        """Unify entries within the internal unknown word counter that differ only by case.

        Calling this method when this instance had been initialized with 'use_unknown_counter' =
        False will trigger an error.

        Any counts of capitalized words to the count of the lower-case variant if it exists,
        deleting the alternative entries. For example, Counter(hulla=3, Hulla=1, HULLA=1)
        becomes Counter(hulla=5).

        If there is no lower-case variant, other variants that differ in case will NOT be
        unified. For example, Counter(Hulla=1, HULLA=1) remains unchanged.
        """
        entries_to_delete = []

        for key, count in self._unknown_counter.items():
            if any(x.isupper() for x in key):
                lower = key.lower()

                if lower in self._unknown_counter:
                    self._unknown_counter[lower] += count
                    entries_to_delete.append(key)

        for entry in entries_to_delete:
            del self._unknown_counter[entry]

    def print_unknown_words(self, min_count: int = 1) -> None:
        """Write the list of unknown words encountered in the converted texts to stderr.

        Entries that differ only by case are unified first. The words are then listed in
        alphabetic order (ignoring case), each followed by its count if it was encountered
        more than once. Only words encountered at least 'min_count' times are shown; if
        there are none, a message saying so is written instead.

        Calling this method when this instance had been initialized with 'use_unknown_counter' =
        False will trigger a warning.
        """
        if self._unknown_counter is None:
            warn('print_unknown_words called on a Converter without an unknown_counter')
            return

        self._unify_case_differences()
        by_word = sorted(self._unknown_counter.items(), key=lambda pair: pair[0].lower())
        frequent = [(word, count) for word, count in by_word if count >= min_count]

        if frequent:
            print('Unknown words:', file=stderr)

            for word, count in frequent:
                # The count is only mentioned for words that occurred repeatedly
                suffix = ' ({}x)'.format(count) if count > 1 else ''
                print('  ' + word + suffix, file=stderr)
        elif min_count <= 1:
            print('No unknown words', file=stderr)
        else:
            print('No unknown words (occurring {} times or more)'.format(min_count),
                  file=stderr)


def main() -> None:
    """Run this script.

    Parses the command-line arguments and converts either the text passed via -c/--convert
    or the listed files. If requested via -u/--unknown, the unknown words encountered are
    listed afterwards.

    Usage errors terminate the script by raising SystemExit with a suitable message. Any
    other exception is turned into a one-line error message of the form
    'FILE:LINE: ExceptionClass: message' and likewise terminates the script.
    """
    try:
        parser = argparse.ArgumentParser(
            epilog='Specify "-" instead of a FILE to convert text read from stdin. This only '
                   'works for plain text, not for HTML or epub. It also works interactively: '
                   'enter a line, then press return to see the converted output; press Ctrl-D '
                   '(or the local equivalent on your system) to quit.')
        parser.add_argument('files', metavar='FILE', nargs='*',
                            help='file to convert')
        parser.add_argument('-c', '--convert', metavar='TEXT',
                            help='convert TEXT that follows (any FILES will be ignored)')
        parser.add_argument('-o', '--outfile', metavar='OUTFILE',
                            help='write output to OUTFILE (exactly one input FILE required)')
        parser.add_argument('-u', '--unknown', action='count', default=0,
                            help='list unknown words (repeat option n times to list only words'
                            ' that occur at least that often)')
        args = parser.parse_args()

        use_unknown_counter = args.unknown > 0
        conv = Converter(use_unknown_counter)

        if args.convert:
            print(conv.convert_para(args.convert, test_if_foreign=False))
        else:
            # Note: SystemExit is raised directly instead of calling the 'exit' builtin, since
            # the latter is only provided by the 'site' module and hence not always available.
            # It is not derived from Exception, hence the handler below doesn't intercept it.
            if not args.files:
                raise SystemExit('{}: Specify file(s) to convert or use the -c argument.\n'
                                 "Try '{} -h' for more information."
                                 .format(SCRIPTNAME, SCRIPTNAME))

            if args.outfile and len(args.files) != 1:
                raise SystemExit(
                    '{}: -o/--outfile argument requires exactly 1 input FILE, not {}.\n'
                    "Try '{} -h' for more information.".format(
                        SCRIPTNAME, len(args.files), SCRIPTNAME))

            for file in args.files:
                conv.convert_file(file, args.outfile)

        if use_unknown_counter:
            conv.print_unknown_words(args.unknown)
    except Exception as err:  # pylint: disable=broad-except
        # Find highest stack frame in the current file (closest to source of exception)
        frames = trace()
        last_useful_frame = frames[0]
        current_file = last_useful_frame.filename

        for frame in frames[1:]:
            if frame.filename == current_file:
                last_useful_frame = frame
            else:
                break  # Stepping out of current file

        fname = path.split(current_file)[1]
        raise SystemExit('{}:{}: {}: {}'.format(
            fname, last_useful_frame.lineno, err.__class__.__name__, err))


# Run the converter only if this file is executed as a script (not if it is imported).
if __name__ == '__main__':
    main()
