#!python

"""
Script peek_hdf
================
This script is used to peek into an HDF file and return some basic information about it.

It uses the installed npfc library in your favorite env manager.

Examples:

    >>> peek_hdf.py -i example.hdf

"""

###############################################################################
# Dependencies
###############################################################################

# standard
import sys
from pathlib import Path
import datetime
import logging
import argparse
# data handling
import pandas as pd
# chemoinformatics
import rdkit
# custom libraries
import npfc
from npfc import load


###############################################################################
# Script
###############################################################################


def _str2bool(value):
    """Convert a command-line token into a boolean.

    ``type=bool`` cannot be used with argparse for this purpose: ``bool('False')``
    is ``True`` because any non-empty string is truthy.

    :param value: the raw value received from the command line (or a bool)
    :return: the corresponding boolean
    :raises argparse.ArgumentTypeError: if the value is not a recognized boolean
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    # the empty string is kept as False, as bool('') was with the former type=bool
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got '{value}'")


def parse_args(argv=None):
    """Parse the command-line arguments of the script.

    :param argv: the list of arguments to parse, defaults to ``sys.argv[1:]`` when None
    :return: the namespace with attributes input_hdf, col_mol, decode and nrecords
    """
    parser = argparse.ArgumentParser(description='Peek into a HDF file and display some basic information about it',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i', '--input_hdf', type=str, default=None, help="File name of the input HDF.")
    parser.add_argument('--col_mol', type=str, default='mol', help="Molecule column.")
    # nargs='?' with const=True accepts both '--decode' and '--decode True/False'
    parser.add_argument('--decode', type=_str2bool, nargs='?', const=True, default=False,
                        help="Decode encoded data (molecules, fragment maps, colormaps, etc.)")
    parser.add_argument('-n', '--nrecords', type=int, default=10, help="Number of records to display.")
    return parser.parse_args(argv)


def main(argv=None):
    """Load a HDF file and display its column types and its first records.

    Exit codes: 1 if no input file is specified, 2 if the input file cannot be
    found, 3 if the molecule column is not a string, 4 if the decode flag is
    not a boolean.

    :param argv: the list of arguments to parse, defaults to ``sys.argv[1:]`` when None
    """
    # parameters CLI
    args = parse_args(argv)

    # logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s -- %(levelname)s -- %(message)s')
    logger = logging.getLogger(__name__)
    logger.info("STARTING PEEK_HDF.PY")
    logger.info("INITIALIZE")

    # define padding for logging
    pad = 30

    # versions
    logger.info("LIBRARY VERSIONS")
    logger.info("rdkit".ljust(pad) + f"{rdkit.__version__}")
    logger.info("pandas".ljust(pad) + f"{pd.__version__}")
    logger.info("npfc".ljust(pad) + f"{npfc.__version__}")

    # input_file
    if args.input_hdf is None:
        logger.critical("INPUT_HDF IS NOT SPECIFIED")
        sys.exit(1)
    elif not Path(args.input_hdf).is_file():
        logger.critical(f"INPUT_HDF COULD NOT BE FOUND AT '{args.input_hdf}'")
        sys.exit(2)
    # col_mol (defensive check, argparse already enforces the type)
    if args.col_mol is not None and not isinstance(args.col_mol, str):
        logger.critical(f"MOL_ID IS NOT A STRING ({type(args.col_mol)})")
        sys.exit(3)
    # decode (defensive check, argparse already enforces the type)
    if not isinstance(args.decode, bool):
        logger.critical(f"DECODE_MOLS IS NOT A BOOLEAN ({type(args.decode)})")
        sys.exit(4)

    # describe current job

    # arguments
    logger.info("INPUT_HDF".ljust(pad) + f"{args.input_hdf}")
    logger.info("MOL_COL".ljust(pad) + f"{args.col_mol}")
    logger.info("DECODE_MOLS".ljust(pad) + f"{args.decode}")
    logger.info("NRECORDS".ljust(pad) + f"{args.nrecords}")

    # run

    logger.info("BEGIN")
    # load
    logger.info("LOADING INPUT_HDF")
    # NOTE(review): the exact meaning of decode/in_mol/out_mol is defined by npfc.load.file
    df = load.file(args.input_hdf, decode=args.decode, in_mol=args.col_mol, out_mol=args.col_mol)
    # records without a value in the molecule column are reported as failures
    num_failed = df[args.col_mol].isna().sum()
    logger.info(f"LOADED {len(df)} RECORDS WITH {num_failed} FAILURE(S)")
    logger.info("DESCRIBING INPUT_HDF:")
    logger.info(f"COLUMNS\n\n{df.dtypes}\n")
    logger.info(f"DISPLAYING THE {args.nrecords} FIRST RECORD(S):\n")
    print(f"\n{df.head(args.nrecords)}\n")
    # end
    logger.info("END")


if __name__ == '__main__':
    main()
