#!/usr/bin/python3
#
# prepare:
# sudo apt install python3 tesseract-ocr poppler-utils
# brew install python tesseract poppler
#
# install:
# pip3 install mdrocr

import argparse
import os
import re
import shutil
import subprocess
import sys
import tempfile

def sorted_list(l):
    """Sort a list of strings in place in natural order and return it.

    Runs of ASCII digits are compared by their integer value and everything
    else is compared as text, so ``page2.jpg`` sorts before ``page10.jpg``.

    Args:
        l: list of strings; it is sorted in place.

    Returns:
        The same list object that was passed in, now sorted.
    """
    def natural_key(name):
        # Splitting on one capturing group strictly alternates text (even
        # indexes) and digit runs (odd indexes), so two keys always compare
        # str with str and int with int and never raise TypeError.
        parts = re.split(r'([0-9]+)', name)
        return [int(part) if index % 2 else part
                for index, part in enumerate(parts)]

    l.sort(key=natural_key)
    return l

# Command-line interface: two optional settings plus the folder to process.
parser = argparse.ArgumentParser(description='OCR folders of images')
parser.add_argument(
    '--lang',
    default="eng",
    help='language to use (default eng)',
)
parser.add_argument(
    '--dpi',
    default="300",
    help='dpi of scan (default 300)',
)
parser.add_argument('folder')

args = parser.parse_args()

# No type= is given, so both option values are kept as strings.
language, dpi = args.lang, args.dpi

# Absolute, normalised form of the folder named on the command line.
base = os.path.abspath(args.folder)
base = os.path.normpath(base)

# Walk the tree below `base`. Every leaf folder (one that holds files but no
# sub-folders) is turned into a single PDF written next to it as
# "<folder>.pdf": each image is OCRed to a one-page PDF with tesseract and the
# pages are then joined with pdfunite. `language` and `dpi` are the strings
# taken from the command-line options.
image_suffixes = ('.jpg', '.jp2', '.jpeg')
for root, folders, files in os.walk(base):
    if folders or not files:
        continue
    pdf = root + '.pdf'
    if os.path.exists(pdf):
        print("\n--\nskip", root, "pdf exists", pdf)
        continue
    # Compare extensions case-insensitively so scans named *.JPG are included.
    images = sorted_list([f for f in files
                          if os.path.splitext(f)[1].lower() in image_suffixes])
    if not images:
        # Without any page pdfunite would be called with only the output path
        # and fail, aborting the whole run; skip such folders instead.
        print("\n--\nskip", root, "no images found")
        continue
    print("\n---\nprocessing %s\n%s pages\n" % (root, len(images)))
    # The context manager removes the temporary folder even when tesseract or
    # pdfunite exits with an error.
    with tempfile.TemporaryDirectory() as tmp:
        pages = []
        for number, image in enumerate(images):
            print(image)
            # Prefix the page number so images sharing a stem (a.jpg, a.jpeg)
            # do not overwrite each other's temporary page.
            stem = os.path.splitext(image)[0]
            tmp_pdf = os.path.join(tmp, "%06d-%s" % (number, stem))
            cmd = ["tesseract", "-l", language, '--dpi', dpi,
                   os.path.join(root, image), tmp_pdf, "pdf"]
            subprocess.check_output(cmd)
            # tesseract appends ".pdf" to the output base name it is given.
            pages.append(tmp_pdf + ".pdf")
        cmd = ["pdfunite"] + pages + [pdf]
        subprocess.check_output(cmd)
    print("\ncreated %s" % pdf)
print("")
