#!/usr/bin/env python3

# http://inamidst.com/saxo/
# Created by Sean B. Palmer

import json
import re

import saxo

# Single letters count as abbreviations too, so an initial followed by a
# full stop is not taken for the end of a sentence.
upper = tuple("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
lower = tuple("abcdefghijklmnopqrstuvwxyz")

# Tokens that must not sit directly in front of a sentence terminator.
abbreviations = upper + lower + (
    "etc", "ca", "cf", "Co", "Ltd", "Inc", "Mt", "Mr", "Mrs", "Dr", "Ms",
    "Rev", "Fr", "St", "Sgt", "pron", "approx", "lit", "syn", "transl",
    "sess", "fl", "Op", "Dec", "Brig", "Gen", "Bros",
)

# One negative lookbehind per abbreviation, concatenated. Each lookbehind
# has to be a separate group because a lookbehind must be fixed-width.
abbreviations_pattern = "".join(
    r"(?<!\b%s)" % abbreviation for abbreviation in abbreviations
)

# Anchored at the start of the text: the shortest run of at least five
# characters, not ending in an abbreviation, that is terminated by a full
# stop (followed by a space or "[" and then a capital or digit, or by the
# end of the text), by the end of the text, or by a newline.
regex_sentence = re.compile(
    r"^(.{5,}?%s(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z|\n))" % abbreviations_pattern
)

# <p>...</p> and <li>...</li> elements; "(?!n)" keeps "<lin..." tags such
# as <link> from being read as list items.
regex_block = re.compile(r"(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>")

# From "<tr" up to the nearest closing </tr> or </table>.
regex_tr = re.compile(r"(?ims)<tr.*?</(tr|table)>")

# Bracketed numbers such as "[12]", with any "|" characters around them.
regex_footnote = re.compile(r"[|]*\[[0-9]+\][|]*")

# Any single "<...>" tag.
regex_tag = re.compile(r"<[^>]+>")

#######################################################################
# TODO: Copied from ./g
import re
import urllib.parse

import saxo

def bing(arg):
    """Return the first usable Bing result link for the query *arg*.

    The results page is fetched through ``saxo.request`` and its "text"
    entry is scanned for ``<h2><a href="...">`` links. Links containing
    "r.msn.com/" and links starting with "/" are passed over. When no
    link qualifies, the string "No result found" is returned.
    """
    response = saxo.request(
        "http://www.bing.com/search",
        query={"mkt": "en-GB", "q": arg}, modern=True
    )
    result_link = re.compile(r"<h2><a href=\"([^\"]+)\"")

    # Lazily filter the links so that the first acceptable one wins.
    usable = (
        link for link in result_link.findall(response["text"])
        if "r.msn.com/" not in link and not link.startswith("/")
    )
    return next(usable, "No result found")

def g(arg):
    """Return the first Google result link for *arg*, falling back to Bing.

    An empty *arg* yields a short usage message. Otherwise the Google
    results page is fetched with ``saxo.lynx`` and scanned line by line
    for a redirect link of the form ``...url?q=<target>...``; the first
    target found is returned. ``bing(arg)`` is used instead when lynx is
    unavailable (``saxo.lynx`` returned "LynxNotFound") or when no line
    yields a target.
    """
    if not arg:
        return "Search using Google (or Bing)"

    base = "http://www.google.co.uk/search"
    query = "ie=UTF-8&hl=en-GB&source=hp&q=%s&btnG=Google+Search&gbv=1"
    url = base + "?" + (query % urllib.parse.quote(arg))

    text = saxo.lynx(url, source=False)
    if text == "LynxNotFound":
        return bing(arg)

    for line in text.splitlines():
        if "url?q=" not in line:
            continue
        # Presumably a numbered reference line such as " 12. http://...";
        # keep whatever follows the last ". " -- TODO confirm against the
        # actual lynx output.
        link = line.split(". ").pop().strip()
        q = urllib.parse.urlparse(link).query
        # parse_qs drops blank values, so "q" may be missing altogether;
        # skip such lines instead of failing on a None result.
        targets = urllib.parse.parse_qs(q).get("q")
        if targets:
            return targets[0]
    return bing(arg)
#
#######################################################################

def article_search(term, language):
    """Look up *term* on the *language* Wikipedia via ``g``.

    Underscores in *term* are turned into spaces and the search is
    restricted with a ``site:<language>.wikipedia.org`` prefix. Returns
    None when ``g`` gives back nothing; otherwise returns the part of the
    result after "/wiki/" (with any query string removed), underscores
    replaced by spaces.
    """
    phrase = "site:%s.wikipedia.org %s" % (language, term.replace("_", " "))
    found = g(phrase)
    if not found:
        return None

    # Drop everything from the first "?" onwards, if there is one.
    address = found.partition("?")[0]
    title = address.split("/wiki/", 1)[-1]
    return title.replace("_", " ")

def optflag(arg, flag=None):
    """Split an optional leading ``:flag`` word off *arg*.

    When *arg* starts with ":", its first space-delimited word (minus the
    colon) becomes the flag and everything after the first space becomes
    the argument. Otherwise *arg* and the default *flag* are returned
    unchanged. The result is an ``(arg, flag)`` pair.
    """
    if not arg.startswith(":"):
        return arg, flag

    # partition yields an empty tail when there is no space at all.
    head, _, rest = arg.partition(" ")
    return rest, head[1:]

@saxo.pipe
def wik(arg):
    """Summarise a Wikipedia article in a sentence or two.

    *arg* is the article title, optionally preceded by a ``:lang`` flag
    (for example ``:fr Paris``) that selects the Wikipedia language
    edition; the default edition is "en".

    The article page is requested directly. Whenever that does not give a
    usable sentence, the title is looked up with ``article_search`` and
    the request is tried once more. Returns the quoted opening
    sentence(s) followed by the page URL, "Google had no results" when a
    search comes back empty, or an "Article not found" message listing
    what was tried.
    """
    term, language = optflag(arg, "en")
    debug = []

    def underscored(term):
        return term.replace(" ", "_")

    def spaced(term):
        return term.replace("_", " ")

    def search(term):
        debug.append("Searched %s for %r" % (language, spaced(term)))
        return article_search(spaced(term), language)

    def skip_html_block(block):
        # Markers of boilerplate blocks that never hold the summary.
        skip = ("technical limitations", "window.showTocToggle",
            "Deletion_policy", "Template:AfD_footer", 'disambiguation)"',
            "(images and media)", "This article contains a",
            'id="coordinates"', 'class="thumb',
            "using the Article Wizard if you wish", "or add a request for it",
            "in existing articles")

        if not block:
            return True
        # A paragraph that is italic from start to finish is skipped.
        if block.startswith("<p><i>") and block.endswith("</i></p>"):
            return True
        return any(text in block for text in skip)

    def skip_text_block(block):
        # Also covers the empty string.
        return len(block) < 32

    def scrape(block):
        # Fence <sup> content with "|" so that regex_footnote can remove
        # the footnote markers once the tags themselves are gone.
        block = block.replace("<sup>", "|")
        block = block.replace("</sup>", "|")
        block = block.replace("\n", " / ")
        block = regex_tag.sub("", block)
        return regex_footnote.sub("", block)

    def summarise(page):
        # Return the finished reply for *page*, or None after recording
        # in *debug* why the page was unusable.
        text = regex_tr.sub("", page["text"])
        blocks = regex_block.findall(text)
        if not blocks:
            debug.append("%s had no blocks" % page["url"])
            return None

        # Filter out stuff we don't want
        blocks = (block for block in blocks if not skip_html_block(block))
        blocks = (scrape(block) for block in blocks)
        blocks = (block for block in blocks if not skip_text_block(block))

        # Take up to three usable blocks. Collecting them one at a time
        # keeps a second block even when there is no third, and copes
        # with every block having been filtered out.
        lead = []
        for block in blocks:
            lead.append(block)
            if len(lead) == 3:
                break
        if not lead:
            debug.append("%s had no usable blocks" % page["url"])
            return None
        block = " ".join(lead)

        sentence_match = regex_sentence.match(block)
        if not sentence_match:
            debug.append("%s had no sentences" % page["url"])
            return None

        sentence = sentence_match.group(1)
        # Work out what follows the first sentence before any prefix is
        # added, so the prefix length cannot eat into the remaining text.
        remainder = block[len(sentence):].lstrip()
        if 'href="/wiki/Category:Disambiguation_pages"' in page["text"]:
            sentence = "[Disambiguation] " + sentence

        # A short first sentence is extended with the next one.
        if len(sentence) < 128:
            sentence_match2 = regex_sentence.match(remainder)
            if sentence_match2:
                sentence += " " + sentence_match2.group(1)

        maxlength = 275
        if len(sentence) > maxlength:
            # Leave room for " [...]" and cut at a word boundary.
            words = sentence[:maxlength][:-6].split(" ")
            sentence = " ".join(words[:-1]) + " [...]"
        sentence = sentence.replace('"', "'")
        if sentence.endswith(":"):
            sentence = sentence[:-1] + "..."

        sentence = sentence.replace("\r", "")
        sentence = sentence.replace("\n", "")
        return '"' + sentence + '" - ' + page["url"]

    for _attempt in range(2):
        # See if the article actually exists first, with following on
        page = saxo.request("https://%s.wikipedia.org/wiki/%s" %
                (language, underscored(term)), follow=True)

        # NOTE: the status is compared as an int here.
        if page["status"] == 200:
            summary = summarise(page)
            if summary is not None:
                return summary
        else:
            debug.append("%s gave %s" % (page["url"], page["status"]))

        # The page was unusable: look the title up and try again.
        term = search(term)
        if not term:
            return "Google had no results"

    return "Article not found: " + " | ".join(debug)
