#!python
"""Simple commandline interface for parsing PDF to HTML."""
import argparse
import logging
import os

import pdftotree

def _build_parser():
    """Create the argument parser describing the command-line interface."""
    parser = argparse.ArgumentParser(
        description="""
        Script to extract tree structure from PDF files. Takes a PDF as input
        and outputs an HTML-like representation of the document's structure. By
        default, this conversion is done using heuristics. However, a model can
        be provided as a parameter to use a machine-learning-based approach.
        """,
        usage="%(prog)s [options] pdf_file",
    )
    parser.add_argument(
        "-mt",
        "--model_type",
        type=str,
        default=None,
        choices=["vision", "ml", None],
        help="Model type to use.  None (default) for heuristics approach.",
    )
    parser.add_argument(
        "-m",
        "--model_path",
        type=str,
        default=None,
        help="Pretrained model, generated by extract_tables tool",
    )
    parser.add_argument(
        "pdf_file",
        type=str,
        help="PDF file name for which tree structure needs to be extracted",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        help="Path where tree structure should be saved. If none, HTML is printed to stdout.",
    )
    # NOTE(review): this value is forwarded to pdftotree.parse as a string
    # ("True" by default). If parse treats it as a boolean, any non-empty
    # string, including "False", is truthy -- confirm how parse interprets it.
    parser.add_argument(
        "-f",
        "--favor_figures",
        type=str,
        help="Whether figures must be favored over other parts such as tables and section headers",
        default="True",
    )
    parser.add_argument(
        "-V",
        "--visualize",
        dest="visualize",
        action="store_true",
        help="Whether to output visualization images for the tree",
    )
    parser.add_argument(
        "-d",
        "--dry-run",
        dest="dry_run",
        action="store_true",
        help="Run pdftotree, but do not save any output or print to console.",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        dest="verbose",
        action="store_true",
        help="Output INFO level logging.",
    )
    parser.add_argument(
        "-vv",
        "--veryverbose",
        dest="debug",
        action="store_true",
        help="Output DEBUG level logging.",
    )
    parser.set_defaults(visualize=False)
    return parser


def _configure_logging(log_level):
    """Attach a stream handler at ``log_level`` to the "pdftotree" logger."""
    log = logging.getLogger("pdftotree")
    log.propagate = False  # prevent propagation to the root logger
    log.setLevel(log_level)

    handler = logging.StreamHandler()
    handler.setLevel(log_level)
    handler.setFormatter(
        logging.Formatter("[%(levelname)s] %(name)s - %(message)s")
    )
    log.addHandler(handler)


def main(argv=None):
    """Run the pdftotree command-line interface.

    Parses the arguments, validates the model options, configures logging for
    the "pdftotree" logger, calls ``pdftotree.parse`` and reports the outcome
    on stdout.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Raises:
        SystemExit: Raised by argparse when the arguments are invalid, when
            only one of ``--model_type`` / ``--model_path`` is given, or when
            the model path does not exist.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # --veryverbose wins over --verbose; only errors are shown by default.
    if args.debug:
        log_level = logging.DEBUG
    elif args.verbose:
        log_level = logging.INFO
    else:
        log_level = logging.ERROR

    # A model type and a model path are only meaningful as a pair.
    if bool(args.model_type) != bool(args.model_path):
        parser.error("Both a model_type and a model_path must be provided together.")
    elif args.model_type and not os.path.exists(args.model_path):
        parser.error("A valid path to a pretrained model must be provided.")

    # Configure logging for this application
    _configure_logging(log_level)

    output_path = args.output
    if args.dry_run:
        print("This is just a dry run. No HTML will be output.")
        # Passing no output path keeps parse from being asked to save a file.
        output_path = None

    # Call the main routine
    result = pdftotree.parse(
        args.pdf_file,
        output_path,
        args.model_type,
        args.model_path,
        args.favor_figures,
        args.visualize,
    )

    if result:
        if not args.dry_run:
            print(result)
    elif output_path is not None:
        # Only announce an output file when one was actually requested;
        # otherwise this would misleadingly print "HTML output to None".
        print("HTML output to {}".format(output_path))


if __name__ == "__main__":
    main()
