#!/bin/bash
# Strict mode: exit on error (-e), on use of an unset variable (-u), and when
# any stage of a pipeline fails (-o pipefail), not only the last stage.
set -euo pipefail


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DISCLAIMER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #


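# This script concatenates every '*_synonyms.csv.gz' file found under a
# directory into a single CSV file, gzip-compressed when its name ends in '.gz'.
# It does not parse the rows: it writes one fixed header line, then appends each
# input file without its first (header) line.
# Data is streamed through zcat and sed, so it can be applied on very large files.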


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #


# Print the effective configuration and the start time on stdout.
# Globals:   input_dir (read), output_csv (read)
# Arguments: none
config() {
    # %b expands backslash escapes in each argument, like 'echo -e' does;
    # every argument is emitted on its own line.
    printf '%b\n' \
        "\nConfiguration" \
        "- input_dir: $input_dir" \
        "- output_csv: $output_csv" \
        "\nStart time: $(date)" \
        ""
}

# Print the table of exit codes used by this script on stdout.
# The text is framed by one blank line before and one after.
exitcodes() {
    cat <<'EOF'

Exit codes are:
0 - this script executed without any error
1 - no argument so only usage was displayed
2 - an error was found with provided arguments

EOF
}

# Print the command-line help, followed by the exit-code table.
# Globals:   none
# Arguments: none
# Outputs:   help text on stdout; the exit-code table is printed by exitcodes
function usage {
    printf '\nUsage is:\n'
    # ${0##*/} is the name this script was invoked with, directories stripped
    printf '\n%s -i <INPUT_DIR> -o <OUTPUT_CSV>\n\n' "${0##*/}"
    printf '%s\n' "with <INPUT_DIR> the directory searched recursively for '*_synonyms.csv.gz' files"
    printf '%s\n' "with <OUTPUT_CSV> the output CSV file (mandatory), gzip-compressed if its name ends with '.gz'"
    printf '\n\n'
    exitcodes
}

# Concatenate every '*_synonyms.csv.gz' file found under a directory into one
# CSV file.
#
# The output starts with the fixed header 'group_on|idm_kept|idm_filtered';
# each input file is then appended without its first line (its own header).
# Input files are processed in sorted path order.
#
# Arguments:
#   $1 - directory searched recursively for '*_synonyms.csv.gz' files
#   $2 - output file; when its name ends with '.gz' the result is compressed
# Outputs:   progress messages on stdout, decompression errors on stderr
# Returns:   1 if an input file cannot be decompressed
function concat_synonyms {
    local input_dir=$1
    local output_csv=$2
    local gunzip_version output_csv_base input_file

    # 0/ report the gunzip version: third word of the first line of
    #    'gunzip --version'. Purely informational, so it must never be fatal.
    gunzip_version=$(gunzip --version | awk 'NR == 1 { print $3 }') || true
    echo "gunzip_version is $gunzip_version"

    echo -e "CONCATENATING SYNONYMS FILES...\n"

    # 1/ initialize the uncompressed output with the header line
    #    (output_csv_base is output_csv without a trailing '.gz', if any)
    output_csv_base="$(dirname -- "$output_csv")/$(basename -- "$output_csv" '.gz')"
    echo 'group_on|idm_kept|idm_filtered' > "$output_csv_base"
    echo "INTIALIZED OUTPUT_CSV_BASE AT '$output_csv_base'"
    ls "$output_csv_base"

    # 2/ append all input files, skipping the first line of each.
    #    Paths are read line by line so that spaces in them are preserved.
    while IFS= read -r input_file; do
        echo "$input_file"
        # pipefail is enabled in a subshell only, so that a zcat failure is
        # detected here without changing the options of the calling shell
        if ! ( set -o pipefail; zcat "$input_file" | sed '1d' ) >> "$output_csv_base"; then
            echo "Error! could not decompress '$input_file'" >&2
            return 1
        fi
    done < <(find "$input_dir" -name '*_synonyms.csv.gz' | sort)

    # 3/ compress the result when the requested name ends with '.gz';
    #    gzip then produces exactly "$output_csv"
    if [[ "$output_csv" == *.gz ]]; then
        echo "COMPRESSING OUTPUT..."
        gzip -f "$output_csv_base"
    fi

    # 4/ print output
    echo -e "Result:"
    ls "${output_csv}"

}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #


echo -e "\nCONCAT_SYNONYMS"
d0="$(date -u +%s)"  # start time in seconds, used to report the elapsed time

# usage if not enough arguments
if [[ $# -lt 1 ]]; then
    usage
    exit 1
fi

# parse arguments
# The leading ':' in the optstring puts getopts in silent mode: a missing
# argument is then reported as ':' and an unknown option as '?', both with the
# offending letter in OPTARG. Without it the ':' branch is unreachable and
# OPTARG is unset for '?', which aborts with "unbound variable" under 'set -u'.
# Paths are stored as given here and resolved during the checks below, so that
# a bad path is reported with exit code 2 instead of aborting under 'set -e'.
echo ""
while getopts ":i:o:h" opt; do
    case "$opt" in
        i) input_dir=$OPTARG ;;
        o) output_csv=$OPTARG ;;
        h) usage; exit 1 ;;
        :)  echo "Option -$OPTARG requires an argument." >&2 ; exit 2 ;;
        \?) echo "Invalid option: -$OPTARG" >&2 ; exit 2 ;;
    esac
done

# check for errors in arguments
errors=0
# input_dir
if [[ -z "${input_dir+x}" ]]; then
    echo "Error! Argument input_dir is unset!"
    errors=$((errors + 1))
elif [[ ! -d "$input_dir" ]]; then
    echo "Error! input_dir could not be found! ($input_dir)"
    errors=$((errors + 1))
else
    input_dir=$(realpath -- "$input_dir")
fi

# output_csv
if [[ -z "${output_csv+x}" ]]; then
    echo "Error! Argument output_csv is unset."
    errors=$((errors + 1))
else
    # Create the parent directory BEFORE resolving the path: realpath fails
    # when an intermediate directory is missing, which would make this
    # 'mkdir -p' pointless. mkdir is tested explicitly so that its failure
    # is reported below instead of aborting under 'set -e'.
    output_dir=$(dirname -- "$output_csv")
    if mkdir -p -- "$output_dir" && [[ -d "$output_dir" ]]; then
        output_csv=$(realpath -- "$output_csv")
        output_dir=$(dirname -- "$output_csv")
    else
        echo "Error! output_dir could not be created! ($output_dir)"
        errors=$((errors + 1))
    fi
fi

# result of checking
if [[ $errors -gt 0 ]]; then
    echo -e "\nOne or several errors were found with arguments. Aborting script execution."
    exit 2
fi


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MAIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

# Show the settings in use, run the concatenation, then report the duration.
config
concat_synonyms "${input_dir}" "${output_csv}"
d1=$(date -u +%s)  # end time in seconds
elapsed=$(( d1 - d0 ))
printf '\nTotal elapsed time (s): %s\n' "${elapsed}"
exit 0
