#!/usr/bin/env python
# Copyright European Organization for Nuclear Research (CERN)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# You may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#                       http://www.apache.org/licenses/LICENSE-2.0
#
# Authors:
# - Ralph Vigne, <ralph.vigne@cern.ch> 2015
# - Cedric Serfon, <cedric.serfon@cern.ch> 2017
#
# If the connection to Elasticsearch (ES) is broken, Redis starts persisting data on disk until ES is back.
# If ES is down for a longer period, the disk might run full and take the node down. Currently the disk space
# lasts for about 30 hours (2 GB/h). In order to protect the node from running out of disk space, this probe
# checks the free disk space and, if it is below a certain threshold, drops entries from the rucio.debug queue
# to free up disk space again.
#
# If the disk space is still low after dropping that data, a critical error is raised and an email alert is sent.
#
# This probe should be executed every 15 minutes on the host running Redis.

import os
import socket
import sys
import time

import redis

from pystatsd import Client

# Probe status values, used as the process exit code via sys.exit()
# (the numbering matches the usual Nagios plugin convention).
UNKNOWN = 3
CRITICAL = 2
WARNING = 1
OK = 0

# Host name of the Redis server the probe connects to
REDIS_URL = "rucio-logger-prod-01"
# Redis lists whose length is reported to Graphite and checked against THRESHOLD_QUEUE_SIZE
REDIS_QUEUES = ["rucio.haproxy", "rucio.apache", "rucio.daemons", "rucio.debug"]

# Queue length above which the probe status becomes WARNING (reported, but not alerted!)
THRESHOLD_QUEUE_SIZE = 50000

# Free disk space (in GB) below which the rucio.debug queue is trimmed and the
# status becomes at least WARNING (reported, but not alerted!)
THRESHOLD_GB = 8
# Free disk space (in GB) below which, after trimming, alerts are sent via email
THRESHOLD_CRITICAL_GB = 4

# Number of events removed from the rucio.daemons queue when critical,
# e.g. 2.5M events per hour as a rule of thumb
TRIM_CRITICAL_EVENTS = 5000000

# Fully qualified name of the local host; used in the Graphite prefix and in the probe messages
HOST = socket.getfqdn()


def get_free_disk_space():
    """Return the free space of the root filesystem ('/') in MiB.

    Only the blocks available to unprivileged users (``f_bavail``) are counted.
    """
    fs_stats = os.statvfs('/')
    available_bytes = fs_stats.f_bavail * fs_stats.f_frsize
    return available_bytes / 1024 / 1024


def report_to_graphite(name, value):
    """Publish ``value`` as a gauge called ``name`` through the pystatsd client.

    The metric is sent under the prefix ``rucio.monitoring.redis.<HOST>``.
    Reporting is best effort: any exception raised while creating the client
    or sending the gauge is swallowed, so a monitoring outage never affects
    the caller.
    """
    try:
        statsd = Client(host='rucio-graphite-prod-02.cern.ch',
                        port=8125,
                        prefix='%s.%s' % ('rucio.monitoring.redis', HOST))
        statsd.gauge(name, value)
    except Exception:
        # Deliberately ignored, see docstring.
        pass


if __name__ == '__main__':
    # Probe entry point. The process exit code carries the probe status
    # (OK / WARNING / CRITICAL); messages for the operator go to stdout.
    # Any lost connection to Redis is reported as CRITICAL, no matter at which
    # step it happens (an uncaught exception would exit with status 1, which
    # is the value of WARNING).
    server = redis.Redis(REDIS_URL)
    STATUS = OK

    # Report the length of every monitored queue; a single oversized queue
    # is enough to raise the status to WARNING.
    try:
        for queue in REDIS_QUEUES:
            queue_len = server.llen(queue)
            report_to_graphite('queue.%s' % queue, queue_len)
            if queue_len > THRESHOLD_QUEUE_SIZE:
                STATUS = WARNING
    except redis.exceptions.ConnectionError as error:
        print(error)
        sys.exit(CRITICAL)

    free_disk_space = get_free_disk_space()
    report_to_graphite('avail_diskspace', free_disk_space)

    # get_free_disk_space() returns MiB, hence the thresholds are converted from GB to MiB.
    if free_disk_space < THRESHOLD_GB * 1024:
        try:
            # LTRIM keeps the range [debug_cutoff, -1], i.e. it drops the first half of the list.
            # Floor division keeps the index an integer on Python 2 and Python 3 alike.
            debug_cutoff = server.llen('rucio.debug') // 2
            server.ltrim('rucio.debug', debug_cutoff, -1)
        except redis.exceptions.ConnectionError as error:
            print(error)
            sys.exit(CRITICAL)
        time.sleep(120)  # Sleeping two minutes to wait for the effects of the queue trimming on disk
        free_disk_space_after = get_free_disk_space()
        report_to_graphite('avail_diskspace', free_disk_space_after)
        # Low disk space is at least a WARNING; never downgrade a status set earlier.
        if STATUS == OK:
            STATUS = WARNING
        if free_disk_space_after < THRESHOLD_CRITICAL_GB * 1024:
            print('[%s] Cleaning debug queue wasn\'t sufficient to free enough disk space. Last gain: %d mb (%d entries)' % (HOST, (free_disk_space_after - free_disk_space), debug_cutoff))
            try:
                # Drops the first TRIM_CRITICAL_EVENTS entries of the daemons queue
                # (the whole queue if it holds fewer entries than that).
                server.ltrim('rucio.daemons', TRIM_CRITICAL_EVENTS, -1)
            except redis.exceptions.ConnectionError as error:
                print(error)
                sys.exit(CRITICAL)
            time.sleep(120)  # Sleeping two minutes to wait for the effects of the queue trimming on disk
            crit_free_disk_space = get_free_disk_space()
            report_to_graphite('avail_diskspace', crit_free_disk_space)
            print('[%s] Remvoing %s events from daemon queue to gain %d mb of disk space.' % (HOST, TRIM_CRITICAL_EVENTS, (crit_free_disk_space - free_disk_space_after)))
            # Still CRITICAL even if the second trim helped: the debug trim alone was not enough.
            STATUS = CRITICAL

    sys.exit(STATUS)
