#!python
"""Stem.

Stem and leaf plot from a csv or excel spreadsheet using best defaults. Can do text (text and dot) or graphic (kde,
graphic, hist, line).

Usage:
    stem <input> [-c <column>] [-d] [-f] [-k <file>] [-o <file>] [-p <percent>] [-r <random>] [-s <server>] [-t <type>] [-u] [-w]
    stem -h | --help
    stem --version

Options:
    -h --help    Show this screen.
    -c <column>  column index
    -d           describe the data
    -f           force dask
    -k <file>    persist sample to file (.csv, .pkl)
    -o <file>    output file (.txt, .png) or stdout
    -p <percent> trim data on both ends (ex: 0.2)
    -r <random>  random_state seed (ex: 42)
    -s <server>  head node for distributed cluster
    -t <type>    alternate type of distribution plot
    -u           use all data (default: 300 on text, 900 on graphics)
    -w           wide format (horizontal)
    --version

"""
from datetime import datetime
import os
from warnings import warn

from docopt import docopt
import matplotlib
matplotlib.rcParams["backend"]="Agg"
from matplotlib import pyplot as plt
import pandas as pd
from stemgraphic.num import (
    heatmatrix,
    text_heatmap,
    text_dot,
    text_hist,
    stem_tally,
    stem_text
)
from stemgraphic import (
    __version__,
    heatmap,
    stem_graphic,
    stem_dot,
    stem_hist,
    stem_kde,
    stem_line,
    dd,
)
from stemgraphic.helpers import na_count
# Rendering a figure on standard output relies on the optional savefig helper.
# Default the flag to "unavailable" and only switch it on once the import has
# succeeded; a failed import is reported as a warning rather than an error.
stdout = None
try:
    from stemgraphic.helpers import savefig
except ImportError:
    warn('No sixel graphic support')
else:
    stdout = True


if __name__ == "__main__":
    # Command-line entry point: parse the docopt usage string of this module,
    # load the input file into a pandas (or dask) dataframe, optionally print a
    # description of the data, then render the requested plot either as a
    # figure written to -o or as text.
    arguments = docopt(__doc__, version=f"Stemgraphic {__version__}")
    filename = arguments["<input>"]

    alpha = False
    try:
        # The file could be on hdfs or s3, in which case os.stat cannot see it.
        filesize = os.stat(filename).st_size
        # Above 2GB the input is considered large; below that, -f decides.
        large = True if filesize > 2 * 1024 ** 3 else arguments["-f"]
    except FileNotFoundError:
        # doesn't exist, or is distributed.
        filesize = 0
        large = True

    # docopt delivers option values as strings (None when the option is
    # absent), so the trim fraction has to be converted before being handed
    # over as a number, the same way the random seed is converted further down.
    # NOTE(review): assumes the plot functions expect a numeric trim, as the
    # usage text ("ex: 0.2") suggests -- confirm against stemgraphic.
    trim = arguments.get("-p")
    if trim is not None:
        try:
            trim = float(trim)
        except ValueError:
            raise SystemExit(
                f"-p expects a number (ex: 0.2), got {trim!r}"
            ) from None
    wide = arguments.get("-w", False)
    server = arguments.get("-s", False)
    persistence = arguments.get("-k", None)

    if server:
        large = True  # force it when server is specified
    if dd and large:  # dask is available
        data_frame = dd
        if server:
            from distributed import Client

            client = Client(server)  # distributed, head node
            print(client.ncores())
    else:
        data_frame = pd

    # -c is either a positional index (integer) or a column name; it stays
    # None when the option was not given, so it is never left unbound.
    cols = None
    if arguments["-c"] is not None:
        try:
            cols = int(arguments["-c"])
        except ValueError:
            cols = arguments["-c"]
            print(cols)
    # Only restrict the columns when one was requested.
    read_options = {} if cols is None else {"usecols": [cols]}

    # Pick the reader from the file extension, ignoring case.
    lowered = filename.lower()
    if lowered.endswith((".xlsx", ".xls")):
        # no way an excel sheet needs dask
        df = pd.read_excel(filename, **read_options)
    elif lowered.endswith(".tsv"):
        # -c is honoured here as well, like for csv and excel input.
        df = data_frame.read_table(filename, **read_options)
    else:
        df = data_frame.read_csv(filename, **read_options)
        if cols is not None:
            print(df.dtypes)

    # .iloc makes the positional lookup explicit: dtypes is indexed by column
    # label, and integer keys on such a Series are deprecated as positions.
    if df.dtypes.iloc[0] == "object":
        print("Specified column is an object, trying stemgraphic.alpha.")
        # These imports deliberately rebind the numeric implementations, so
        # the dispatch table built below picks up the alpha versions.
        from stemgraphic.alpha import (
            stem_graphic,
            stem_text,
            text_heatmap,
            heatmap,
            heatmatrix
        )

        alpha = True

    # Maps the value of -t to the function implementing that plot.
    type_to_alias = {
        "dot": stem_dot,
        "text_heatmap": text_heatmap,
        "heatmap": heatmap,
        "heatmatrix": heatmatrix,
        "hist": stem_hist,
        "graphic": stem_graphic,
        "kde": stem_kde,
        "line": stem_line,
        "tally": stem_tally,
        "text": stem_text,
        "text_dot": text_dot,
        "text_hist": text_hist,
    }

    # docopt always sets the "-t" key (None when absent), so a dict default
    # would never apply; the per-branch fallback below handles None instead.
    plot_type = arguments["-t"]
    if plot_type is not None and plot_type not in type_to_alias:
        # The fallback to the default plot is kept, but no longer silent.
        warn(f"Unknown plot type {plot_type!r}, using the default plot")

    # None means "let the output branch choose its own sample size".
    display = None
    if arguments.get("-u"):
        display = df.shape[0]

    if arguments["-w"]:
        pd.set_option('display.width', 1000)

    if arguments["-d"]:
        print(datetime.today())
        res = df.describe(include="all", percentiles=[0.1, 0.25, 0.5, 0.75, 0.9]).T
        res["total"] = df.shape[0]
        res["data_type"] = df.dtypes
        res["missing"] = na_count(
            df
        )  # currently a straight is null, but will evolve to a data quality type metric
        pd.set_option("display.max_rows", 500)
        pd.set_option("display.max_columns", 500)
        pd.options.display.float_format = "{:,}".format
        print(res)

    if plot_type in ('text_heatmap', 'heatmap', 'heatmatrix'):
        pd.options.display.float_format = '{:,.0f}'.format
        pd.set_option('display.width', 1000)

    random_state = arguments.get("-r", None)
    if random_state:
        random_state = int(random_state)

    if arguments["-o"]:
        # Graphic output: sample 900 values unless -u asked for everything.
        if display is None:
            display = 900
        res = type_to_alias.get(plot_type, stem_graphic)(
            df, trim=trim, flip_axes=wide, random_state=random_state, display=display
        )
        fig = res[0]
        if plot_type == 'heatmap':
            # For heatmap the figure is taken from the second returned item.
            ax = res[1]
            fig = ax.get_figure()
        if arguments["-o"] == 'stdout' and stdout:
            savefig(plt)
        else:
            fig.savefig(arguments["-o"])
    else:
        # Text output: sample 300 values unless -u asked for everything.
        if display is None:
            display = 300
        if alpha:
            print("alpha")
            type_to_alias.get(plot_type, stem_text)(
                df.iloc[:, 0].tolist(), caps=False, flip_axes=wide, random_state=random_state, display=display
            )
        else:
            type_to_alias.get(plot_type, stem_text)(
                df.iloc[:, 0], outliers=True, persistence=persistence, trim=trim,
                flip_axes=wide, random_state=random_state, display=display
            )
