#!/bin/bash

# Copyright 2020 by Leipzig University Library, http://ub.uni-leipzig.de
#                   The Finc Authors, http://finc.info
#                   Martin Czygan, <martin.czygan@uni-leipzig.de>
#
# This file is part of some open source application.
#
# Some open source application is free software: you can redistribute
# it and/or modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation, either
# version 3 of the License, or (at your option) any later version.
#
# Some open source application is distributed in the hope that it will
# be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Foobar.  If not, see <http://www.gnu.org/licenses/>.
#
# @license GPL-3.0+ <http://spdx.org/licenses/GPL-3.0+>

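# Clean up obsolete task artifacts.
#
# Deletes all artifacts of the configured tasks except those whose filename
# starts with the current month's "date-YYYY-MM-" prefix. Candidates are listed
# and deletion must be confirmed, unless --force is given as first argument.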

# Strict mode: abort on errors, on unset variables and on failing pipeline stages.
set -euo pipefail

# Escape sequences for colored terminal output: NC resets, CO switches color.
NC='\033[0m'
CO='\033[0;33m'

# Without the taskhome command the output directory cannot be determined.
if ! command -v taskhome >/dev/null 2>&1; then
	echo >&2 "taskhome not found, assuming siskin not installed, exiting"
	exit 1
fi

# Root directory of outputs, as printed by the taskhome command.
taskhome=$(taskhome)

# Maps a directory name below $taskhome to a space separated list of task
# names. Each "$taskhome/<key>/<task>" directory is passed to find, which wants
# these directories to exist, so the task names must match.
declare -A tasks=(
	[ai]='AIApplyOpenAccessFlag AIExport AIIntermediateSchema AIIntermediateSchemaDeduplicated AILicensing AILocalData AIRedact AIInstitutionChanges'
	[28]='DOAJIntermediateSchemaDirty DOAJIntermediateSchema DOAJHarvest DOAJDownloadDump DOAJTable'
	[48]='GeniosIntermediateSchema GeniosCombinedIntermediateSchema'
	[55]='JstorXML JstorIntermediateSchemaGenericCollection JstorIntermediateSchema JstorMembers JstorLatestMembers'
	[85]='ElsevierJournalsIntermediateSchema ElsevierJournalsUpdatesIntermediateSchema'
	[89]='IEEEUpdatesIntermediateSchema IEEEIntermediateSchema'
	[126]='BaseSingleFile'
)

# Collect deletion candidates.
matching_files=()

# Append every regular file below directory $1 whose basename does not match
# the glob pattern $2 to the global array matching_files.
#
# Paths are transferred NUL-delimited, so names containing spaces, newlines or
# glob characters end up as exactly one array element each. A failing find
# (e.g. missing directory) only prints its own error message; it does not
# abort the script.
collect_obsolete_files() {
	local dir=$1 keep=$2 path
	while IFS= read -r -d '' path; do
		matching_files+=("$path")
	done < <(find "$dir" -type f -not -name "$keep" -print0)
}

# Limitation: the filename must start with "date-..." (check output filename via "taskoutput TASKNAME")
# Files carrying the current year and month in that prefix are kept.
keep_pattern=$(date +'date-%Y-%m-*')

for src in "${!tasks[@]}"; do
	# Intentionally unquoted: the value is a whitespace separated list of task names.
	for t in ${tasks[$src]}; do
		collect_obsolete_files "$taskhome/$src/$t" "$keep_pattern"
	done
done

# An empty candidate list means there is nothing to do.
if ((${#matching_files[@]} == 0)); then
	echo "Nothing to delete. Bye."
	exit 0
fi

# Show every deletion candidate, one path per line.
for candidate in "${matching_files[@]}"; do
	echo "$candidate"
done

# Get out early: with --force as first argument, delete right away and exit
# without asking for confirmation.
case "${1-}" in
--force)
	for candidate in "${matching_files[@]}"; do
		rm -vf "$candidate"
	done
	exit 0
	;;
esac

# How much space can we reclaim?
#
# Print the summed size of the files given as arguments in GB (2^30 bytes),
# formatted with two decimals. The arithmetic is done in awk, so neither
# paste(1) nor bc(1) is needed; stat(1) must support --printf.
reclaimable_gb() {
	local f
	for f in "$@"; do
		stat --printf="%s\n" -- "$f"
	done | awk '{ total += $1 } END { printf "%.2f", total / 1073741824 }'
}

freed=$(reclaimable_gb "${matching_files[@]}")

echo
# %b expands the backslash escapes stored in CO and NC.
printf 'Delete the above %d file(s) [%b%sGB%b would be freed] [y/N]? ' \
	"${#matching_files[@]}" "$CO" "$freed" "$NC"

# read fails when stdin is closed or ends without a newline; treat that like
# the default answer "no" instead of letting "set -e" abort the script.
read -r answer || answer=""

# Any answer starting with "y" or "Y" confirms the deletion.
if [[ "$answer" == [Yy]* ]]; then
	for f in "${matching_files[@]}"; do
		rm -vf -- "$f"
	done
else
	echo "Not deleting anything. Bye."
fi
