#!/usr/bin/env python
# Copyright European Organization for Nuclear Research (CERN) 2013
#
# Licensed under the Apache License, Version 2.0 (the "License");
# You may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
#
# Authors:
# - Wen Guan, <wen.guan@cern.ch>, 2015 - 2016

'''
Probe to check stalled fts connections.
'''

import commands
import datetime
import os
import socket
import sys
import time
import traceback

from rucio.core import monitor

# Exit statuses
OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3


def get_conveyor_services():
    """
    Collect the running conveyor processes known to supervisor.

    Runs ``/usr/bin/supervisorctl status`` and parses every line that looks
    like ``<name> RUNNING pid <pid>, uptime <...>``. Blank lines and entries
    without a ``pid <n>,`` field (stopped, starting, fatal, ...) are skipped,
    because they have no process whose file descriptors could be inspected.

    :returns: dict mapping pid (as a string) to
              ``{'name': <program name>, 'stalled': 0, 'cxns': []}``.
              Empty when supervisorctl exits with a non-zero status.
    """
    services = {}
    # NOTE(review): some supervisor releases reportedly exit non-zero from
    # "status" when any program is not RUNNING; in that case nothing is
    # collected here - confirm against the deployed supervisor version.
    status, output = commands.getstatusoutput('/usr/bin/supervisorctl status')
    if status != 0:
        return services

    for line in output.split("\n"):
        items = line.split()
        # Need at least: name, state, the literal "pid", and "<pid>,".
        if len(items) < 4 or items[2] != 'pid':
            continue
        name = items[0]
        if not name.startswith("conveyor"):
            continue
        pid = items[3].split(',')[0]
        services[pid] = {'name': name, 'stalled': 0, 'cxns': []}
    return services


def check_connections(services):
    """
    Count stalled FTS connections per conveyor service.

    Runs ``/usr/sbin/lsof|/bin/grep fts`` and, for every line whose 9th
    column has the form ``local->remote`` and whose pid is a key of
    ``services``, looks at the mtime of ``/proc/<pid>/fd/<fd>``. A descriptor
    whose mtime is more than 3600 seconds old is counted as stalled.

    :param services: dict as returned by get_conveyor_services(), keyed by
                     pid string; the 'stalled' counter and 'cxns' list of
                     each entry are updated in place.
    :returns: dict keyed by service name with the summed 'stalled' count and
              the de-duplicated list of remote endpoints ('cxns') over all
              pids sharing that name.
    """
    status, output = commands.getstatusoutput("/usr/sbin/lsof|/bin/grep fts")
    if status == 0:
        now = time.time()
        for line in output.split("\n"):
            try:
                items = line.split()
                # grep also matches files/paths containing "fts"; only lines
                # with a "local->remote" name column describe a connection.
                if len(items) < 9 or '->' not in items[8]:
                    continue
                pid, cxn = items[1], items[8].split('->')[1]
                if pid not in services:
                    continue

                # The FD column is the descriptor number followed by mode/lock
                # characters (e.g. "12u"); keep only the leading digits.
                fd = ''
                for char in items[3]:
                    if not char.isdigit():
                        break
                    fd += char
                if not fd:
                    continue

                fd_path = "/proc/%s/fd/%s" % (pid, fd)
                try:
                    mtime = os.lstat(fd_path).st_mtime
                except OSError:
                    # Descriptor was closed (or process exited) since lsof ran.
                    continue

                if (now - mtime) > 3600:
                    services[pid]['stalled'] += 1
                    if cxn not in services[pid]['cxns']:
                        services[pid]['cxns'].append(cxn)
            except Exception:
                # Best effort: one unparsable line must not abort the probe.
                print(traceback.format_exc())

    # Several pids can belong to the same service name: merge them.
    ret = {}
    for pid in services:
        entry = services[pid]
        # Fresh list per name so merging never mutates the per-pid lists.
        summary = ret.setdefault(entry['name'], {'stalled': 0, 'cxns': []})
        summary['stalled'] += entry['stalled']
        for cxn in entry['cxns']:
            if cxn not in summary['cxns']:
                summary['cxns'].append(cxn)
    return ret


if __name__ == "__main__":
    # Probe entry point:
    #   1. collect the conveyor services and their stalled FTS connections,
    #   2. record a counter per service through rucio's monitor,
    #   3. push a passive check result per service via send_nsca,
    #   4. restart services with stalled connections outside working hours.
    # Exits with UNKNOWN if anything unexpected fails, OK otherwise.
    try:
        hostname = socket.getfqdn()
        short_hostname = hostname.split('.')[0]

        services = get_conveyor_services()
        print(services)
        services = check_connections(services)
        print(services)

        for name in services:
            stalled, cxns = services[name]['stalled'], services[name]['cxns']
            monitor.record_counter('daemons.conveyor.stalled_fts_connections.%s.%s' % (short_hostname, name), stalled)

            fileoutput = '/tmp/connections_probes.txt.%i' % (time.time())
            try:
                with open(fileoutput, 'w') as g:
                    if stalled == 0:
                        g.write('%s\t%s : Connections stalled for more than 60 minutes\t%i\tNo stalled connections\n' % (hostname, name, OK))
                    else:
                        g.write('%s\t%s : Connections stalled for more than 60 minutes\t%i\t%i stalled connections [%s]\n' % (hostname, name, CRITICAL, stalled, ','.join(cxns)))

                # send_nsca can fail transiently: retry up to 10 times.
                for attempt in range(10):
                    s, o = commands.getstatusoutput('/usr/sbin/send_nsca rucio-nagios-prod.cern.ch -c /etc/nagios/send_nsca.cfg < %s' % (fileoutput))
                    if not s:
                        break
                    print(o)
                    time.sleep(2)
            finally:
                # Never leave the report file behind, even on failure.
                if os.path.exists(fileoutput):
                    os.remove(fileoutput)

            if stalled > 0:
                dt = datetime.datetime.now()
                # isoweekday() is 1 (Monday) .. 7 (Sunday) and therefore always
                # truthy; compare explicitly so weekends are not treated as
                # working days.
                working_hours = 8 < dt.hour < 18 and dt.isoweekday() <= 5
                # During working hours (Mon-Fri, 09:00-17:59 local time) the
                # service is left alone - presumably for an operator to
                # inspect; otherwise it is restarted automatically.
                if not working_hours:
                    status, output = commands.getstatusoutput("/usr/bin/supervisorctl restart %s" % name)
                    if status != 0:
                        print(output)
    except Exception:
        print(traceback.format_exc())
        sys.exit(UNKNOWN)
    sys.exit(OK)
