#!/usr/bin/env python3

# http://inamidst.com/saxo/
# Created by Sean B. Palmer

import json
import re

import saxo

# Single letters are treated as abbreviations too, so a lone initial followed
# by a full stop is not taken as the end of a sentence.
upper = tuple("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
lower = tuple("abcdefghijklmnopqrstuvwxyz")
abbreviations = upper + lower + (
    "etc", "ca", "cf", "Co", "Ltd", "Inc", "Mt", "Mr", "Mrs", "Dr", "Ms",
    "Rev", "Fr", "St", "Sgt", "pron", "approx", "lit", "syn", "transl",
    "sess", "fl", "Op", "Dec", "Brig", "Gen", "Bros",
)

# One fixed-width negative lookbehind per abbreviation, concatenated. At the
# point where a sentence would end, none of the abbreviations may sit
# immediately before it (starting at a word boundary).
abbreviations_pattern = "".join(
    r"(?<!\b%s)" % abbreviation for abbreviation in abbreviations
)

# Matches the shortest prefix of at least five characters that ends in a full
# stop followed by a space or "[" plus a capital/digit, in a full stop at the
# end of the text, at the end of the text itself, or at a newline.
regex_sentence = re.compile(
    r"^(.{5,}?%s(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z|\n))" % abbreviations_pattern)

# Paragraphs (<p ...>...</p>) and list items (<li ...>...</li>), matched
# non-greedily and case-insensitively across newlines. The (?!n) lookahead
# keeps "<li" from matching tags that continue with "n", such as "<link".
regex_block = re.compile(r"(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>")
# Table rows, from "<tr" up to the nearest closing </tr> or </table>; wik()
# removes these from the page text before looking for blocks.
regex_tr = re.compile(r"(?ims)<tr.*?</(tr|table)>")
# Bracketed numeric markers such as "[12]", together with any "|" characters
# directly around them (scrape() in wik() turns <sup>/</sup> into "|").
regex_footnote = re.compile(r"[|]*\[[0-9]+\][|]*")
# Any single HTML tag.
regex_tag = re.compile(r"<[^>]+>")

def article_search(term, language):
    """Search the given language's Wikipedia for ``term`` via the bing command.

    Underscores in ``term`` are turned into spaces and the query is limited
    with ``site:<language>.wikipedia.org``. The value returned by
    ``saxo.call("bing", ...)`` is treated as a URL (presumably the top hit;
    confirm against the bing command).

    Returns None when the call yields nothing. Otherwise returns the part of
    the URL after the first "/wiki/" (or the whole URL when that marker is
    absent), with any query string dropped and underscores replaced by
    spaces.
    """
    site = "%s.wikipedia.org" % language
    query = term.replace("_", " ")
    url = saxo.call("bing", "site:%s %s" % (site, query))
    if not url:
        return None

    # Discard everything from the first "?" onwards.
    address = url.partition("?")[0]

    # Keep only what follows the first "/wiki/"; fall back to the full
    # address when the marker is missing.
    _, marker, title = address.partition("/wiki/")
    if not marker:
        title = address
    return title.replace("_", " ")

def optflag(arg, flag=None):
    """Split an optional leading ``:flag`` off ``arg``.

    When ``arg`` starts with ":", the first space-delimited word (minus the
    colon) is the flag and whatever follows the first space is the remaining
    argument (empty when there is no space). Otherwise ``arg`` is returned
    untouched together with the default ``flag``.

    Returns a ``(arg, flag)`` tuple.
    """
    if not arg.startswith(":"):
        return arg, flag

    # partition yields an empty remainder when there is no space at all.
    marker, _, remainder = arg.partition(" ")
    return remainder, marker[1:]

def utf8(text):
    """Return ``text`` encoded as UTF-8 bytes."""
    encoded = text.encode("utf-8")
    return encoded

@saxo.pipe
def wik(arg):
    """Return the opening sentence(s) of a Wikipedia article as a quoted line.

    ``arg`` is the article title, optionally preceded by a ``:lang`` flag
    (parsed by ``optflag``, default ``en``) selecting the Wikipedia language
    subdomain. The article is requested directly; whenever that request, the
    block extraction or the sentence extraction fails, ``article_search`` is
    asked for a replacement title and the lookup is tried once more.

    Returns a string: ``"<sentence(s)>" - <page url>`` on success, otherwise
    a short failure message. The "Article not found" message lists the steps
    that were attempted.
    """
    term, language = optflag(arg, "en")
    debug = []

    def underscored(term):
        return term.replace(" ", "_")

    def spaced(term):
        return term.replace("_", " ")

    def search(term):
        # Log the search in the debug trail, then look the title up.
        debug.append("Searched %s for %r" % (language, spaced(term)))
        return article_search(spaced(term), language)

    def skip_html_block(block):
        # Substrings whose presence disqualifies a raw HTML block.
        skip = ("technical limitations", "window.showTocToggle",
            "Deletion_policy", "Template:AfD_footer", 'disambiguation)"',
            "(images and media)", "This article contains a",
            'id="coordinates"', 'class="thumb',
            "using the Article Wizard if you wish", "or add a request for it",
            "in existing articles")

        if not block:
            return True
        # Paragraphs that are italic from start to finish are skipped.
        if block.startswith("<p><i>") and block.endswith("</i></p>"):
            return True
        return any(text in block for text in skip)

    def skip_text_block(block):
        # Too short to be useful; this also covers the empty string.
        return len(block) < 32

    def scrape(block):
        # <sup> boundaries become "|" so that regex_footnote can remove the
        # whole "|[1]|" marker once the remaining tags have been stripped.
        block = block.replace("<sup>", "|")
        block = block.replace("</sup>", "|")
        block = block.replace("\n", " / ")
        block = regex_tag.sub("", block)
        return regex_footnote.sub("", block)

    for _ in range(2):
        # See if the article actually exists first, following redirects.
        page = saxo.request("https://%s.wikipedia.org/wiki/%s" %
                (language, underscored(term)), follow=True)

        # The status is compared as an int here.
        if page["status"] != 200:
            debug.append("%s gave %s" % (page["url"], page["status"]))
            term = search(term)
            if not term:
                # article_search goes through the bing command, so the
                # message no longer names a specific search engine.
                return "Search had no results"
            continue

        # Table rows are removed before looking for paragraphs/list items.
        text = regex_tr.sub("", page["text"])
        blocks = regex_block.findall(text)
        if not blocks:
            debug.append("%s had no blocks" % page["url"])
            term = search(term)
            if not term:
                # Nothing to retry with; report what was tried.
                break
            continue

        # Filter out stuff we don't want (lazily, block by block).
        blocks = (b for b in blocks if not skip_html_block(b))
        blocks = (scrape(b) for b in blocks)
        blocks = (b for b in blocks if not skip_text_block(b))

        # Take up to three usable blocks. Collecting them one at a time keeps
        # a second block even when no third exists, and copes with there
        # being none at all.
        leading = []
        for candidate in blocks:
            leading.append(candidate)
            if len(leading) == 3:
                break
        if not leading:
            debug.append("%s had no usable blocks" % page["url"])
            term = search(term)
            if not term:
                break
            continue
        block = " ".join(leading)

        sentence_match = regex_sentence.match(block)
        if not sentence_match:
            debug.append("%s had no sentences" % page["url"])
            term = search(term)
            if not term:
                break
            continue

        first = sentence_match.group(1)
        # Work out the text after the first sentence before any prefix is
        # added, so the prefix length does not eat into the remainder.
        remainder = block[len(first):].lstrip()

        sentence = first
        if 'href="/wiki/Category:Disambiguation_pages"' in page["text"]:
            sentence = "[Disambiguation] " + sentence

        # A short opening gets a second sentence appended when one exists.
        if len(sentence) < 128:
            second_match = regex_sentence.match(remainder)
            if second_match:
                sentence += " " + second_match.group(1)

        # Drop trailing words until the UTF-8 encoding fits in maxlength
        # bytes, marking the cut with " [...]".
        maxlength = 275
        ellipsis = False
        while len(utf8(sentence)) > maxlength:
            sentence = " ".join(sentence.split(" ")[:-1])
            ellipsis = True
        if ellipsis:
            sentence += " [...]"

        # The result is wrapped in double quotes, so inner ones are softened.
        sentence = sentence.replace('"', "'")
        if sentence.endswith(":"):
            sentence = sentence[:-1] + "..."

        sentence = sentence.replace("\r", "")
        sentence = sentence.replace("\n", "")
        return '"' + sentence + '" - ' + page["url"]

    return "Article not found: " + " | ".join(debug)
