#!/usr/bin/env python
#-*- coding: utf-8 -*-
# o2locktop: a top-like tool to monitor OCFS2 DLM lock usage in the cluster.

from __future__ import print_function
import threading
import sys
import signal
import argparse
import multiprocessing
import os
import time
import signal
from tempfile import TemporaryFile
import fcntl

# PID of the process that runs this script. It is handed to the worker
# processes started by the check functions below, which send SIGUSR1 to it
# when a check fails.
pid = os.getpid()

# Import the o2locktoplib package. If the first attempt fails, the usual
# system-wide site-packages directories of the running interpreter are
# appended to sys.path and the import is retried exactly once.
attempt = 0
while True:
    try:
        from o2locktoplib import util
        from o2locktoplib import dlm
        from o2locktoplib import printer
        from o2locktoplib import keyboard
        from o2locktoplib import config
        from o2locktoplib.retry import retry
        break
    except ImportError:
        if attempt:
            # Second failure: the library really is not available. This is a
            # fatal error, so report it with a non-zero exit status.
            print("\no2locktop: error: can't find the o2locktoplib, installed error!\n", file=sys.stderr)
            sys.exit(1)
        attempt = 1

        # "<major>.<minor>" of the running interpreter, e.g. "3.6".
        python_version = "{0}.{1}".format(*sys.version_info[:2])
        package_path_64 = "/usr/lib64/python{0}/site-packages".format(python_version)
        package_path = "/usr/lib/python{0}/site-packages".format(python_version)
        local_package_path_64 = "/usr/local/lib64/python{0}/site-packages".format(python_version)
        local_package_path = "/usr/local/lib/python{0}/site-packages".format(python_version)

        for pk_path in (package_path_64, package_path,
                        local_package_path_64, local_package_path):
            if pk_path not in sys.path:
                sys.path.append(pk_path)
   

def parse_args():
    """Parse the o2locktop command line into a plain options dictionary.

    Reads the arguments from sys.argv.

    Returns:
        dict: always contains "mode", "mount_point", "log", "display_len"
        and "debug". When at least one ``-n`` option is given, "mode" is
        "remote" and the dictionary additionally contains "node_list" (all
        ``-n`` values, in command-line order) and "mount_node" (the first of
        them). Otherwise "mode" is "local".

    Raises:
        SystemExit: with status 0 after printing the version (``-V``);
            with status 1 when no argument is given, when the display
            length is not greater than 0, or when the mount point is
            missing. argparse itself also exits on malformed options and
            on ``-h``.
    """

    DESCRIPTION = """\
It is a top-like tool to monitor OCFS2 DLM lock usage in the cluster, and can
be used to detect hot files/directories, which intensively acquire DLM locks.
"""

    # Shown verbatim as the help epilog (RawDescriptionHelpFormatter keeps the
    # layout). Backticks need no escaping inside a Python string literal.
    NOTES = """\
The average/maximal wait time for DLM lock acquisitions likely gives hints to
the administrator when concern about OCFS2 performance, for example,
- if the workload is unbalanced among nodes.
- if a file is too hot, then maybe need check the related applications above.
- if a directory is too hot, then maybe split it to smaller with less number
  of files underneath.

OUTPUT ANNOTATION:
  - The output is refreshed every 5 seconds, and sorted by the sum of 
    DLM EX(exclusive) and PR(protected read) lock average wait time
  - One row, one inode (including the system meta files if with '-d' argument)
  - Columns:
    "TYPE" is DLM lock types,
      'M' -> Meta data lock for the inode
      'W' -> Write lock for the inode
      'O' -> Open lock for the inode

    "INO" is the inode number of the file

    "EX NUM" is the number of EX lock acquisitions
    "EX TIME" is the maximal wait time to get EX lock
    "EX AVG" is the average wait time to get EX lock

    "PR NUM" is the number of PR(read) lock acquisitions
    "PR TIME" is the maximal wait time to get PR lock
    "PR AVG" is the average wait time to get PR lock

SHORTCUTS:
  - Type "d" to display DLM lock statistics for each node
  - Type "Ctrl+C" or "q" to exit o2locktop process

PREREQUISITES:
  o2locktop reads OCFS2_FS_STATS statistics from /sys/kernel/debug/. That says,
  for all cluster nodes, the kernel option must be set(enabled). Check it out:
      grep OCFS2_FS_STATS < /boot/config-`uname -r`

  o2locktop uses the passwordless SSH to OCFS2 nodes as root. Set it up if not:
      ssh-keygen; ssh-copy-id root@node1

EXAMPLES:
  - At any machine within or outside of the cluster:

    o2locktop -n node1 -n node2 -n node3 /mnt/shared

    To find the absolute path of the inode file:
    find <MOUNT_POINT> -inum <INO>
 
"""

    parser = argparse.ArgumentParser(
        description=DESCRIPTION,
        prog='o2locktop',
        epilog=NOTES,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument('-n', metavar='NODE_IP',
                        dest='host_list', action='append',
                        help='OCFS2 node IP address for ssh')

    parser.add_argument('-o', metavar='LOG_FILE', dest='log',
                        action='store',
                        help='log path')

    parser.add_argument('-l', metavar='DISPLAY_LENGTH',
                        dest='display_len', type=int,
                        help='number of lock records to display')

    parser.add_argument('-V', '--version', action="store_true",
                        help='the current version of o2locktop')

    parser.add_argument('-d', '--debug', action="store_true",
                        help='show all the inode including the system inode number')

    parser.add_argument('mount_point', metavar='MOUNT_POINT', nargs='?',
                        help='the OCFS2 mount point, eg. /mnt/shared')

    args = parser.parse_args()

    # FIXME: to delete ZERO argument detection after implementing the new feat to
    # allow users run this tool without argument inside the ocfs2 cluster.
    if len(sys.argv) < 2:
        parser.print_usage()
        sys.exit(1)

    if args.version:
        print(config.VERSION)
        sys.exit(0)

    # Everything below is a usage error and therefore exits with a non-zero
    # status, like the zero-argument case above.
    if args.display_len is not None and args.display_len <= 0:
        util.eprint("\no2locktop: error: The length of the line to show must be greater than 0\n")
        sys.exit(1)

    # The mount point is mandatory in both remote and local mode.
    if not args.mount_point:
        util.eprint("\no2locktop: error: ocfs2 mount point is needed\n")
        parser.print_usage()
        sys.exit(1)

    options = {
        "mode": "local",
        "mount_point": args.mount_point,
        "log": args.log,
        "display_len": args.display_len,
        "debug": args.debug,
    }
    if args.host_list:
        # At least one -n was given: remote mode. The first node is the one
        # used to look up the mount point.
        node_list = list(args.host_list)
        options.update(mode="remote",
                       mount_node=node_list[0],
                       node_list=node_list)
    return options

def _connection_test_worker(node, mount_point, queue, pid):
    """Look up the DLM lock space of *mount_point* on *node* (worker process).

    On success the value returned by util.get_dlm_lockspace_mp() is put on
    *queue*. On failure an error message is printed with util.eprint(),
    SIGUSR1 is sent to *pid*, and nothing is put on the queue.

    The failure message is chosen as follows:
      - the lookup took more than 5 seconds: network connection failure;
      - otherwise, util.is_passwdless_ssh_set(node) is false: passwordless
        SSH is not set up;
      - otherwise: the mount point cannot be found.

    Args:
        node: node passed to the util helpers (the -n command line value).
        mount_point: OCFS2 mount point to look up.
        queue: queue receiving the lock space value on success.
        pid: PID of the process to signal on failure.
    """
    started = time.time()
    uuid = util.get_dlm_lockspace_mp(node, mount_point)
    if uuid:
        queue.put(uuid)
        return

    if (time.time() - started) > 5:
        message = "\no2locktop: error: network connection to {0} failed\n".format(node)
    elif not util.is_passwdless_ssh_set(node):
        message = '\no2locktop: error: o2locktop uses the passwordless SSH to OCFS2 nodes as root. Set it up if not: \nssh-keygen; ssh-copy-id root@{node}\n'.format(node=node)
    else:
        message = "\no2locktop: error: can't find the mount point: {0}, please check and retry\n".format(mount_point)
    util.eprint(message)

    try:
        os.kill(pid, signal.SIGUSR1)
    except OSError:
        # Best effort: the target process may already be gone or may not be
        # signalable; there is nothing more this worker can do about it.
        pass


def connection_test(nodes, mount_point):
    """Check that every node reports the same lock space for *mount_point*.

    One _connection_test_worker process is started per node. After all of
    them have finished, one result per worker is read from the shared queue
    and compared with the first one.

    Args:
        nodes: non-empty sequence of nodes to check.
        mount_point: OCFS2 mount point to look up on each node.

    Raises:
        AssertionError: if *nodes* is None or empty.
        SystemExit: with status 1 if two nodes report different values.
    """
    assert nodes is not None and len(nodes) > 0
    queue = multiprocessing.Queue()
    workers = []
    for node in nodes:
        worker = multiprocessing.Process(target=_connection_test_worker,
                                         args=(node, mount_point, queue, pid))
        worker.daemon = True
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    # One queue entry is expected per worker. A worker that fails puts
    # nothing on the queue and signals the main process instead, so get()
    # blocks until that signal takes effect.
    expected = queue.get()
    for _ in workers[1:]:
        if queue.get() != expected:
            util.eprint("\no2locktop: error: can't find the shared storage in the cluster, "
                        "check if the node in the command line has input errors\n")
            # The nodes do not share one storage: this is a failure.
            sys.exit(1)

def _connection_ocfs2_debug_test_worker(node, pid):
    """Check that *node* passes util.is_kernel_ocfs2_fs_stats_enabled().

    Runs in a worker process. When the check returns False, an error is
    printed, SIGUSR1 is sent to *pid* and the worker exits with status 1.

    Args:
        node: node to check.
        pid: PID of the process to signal on failure.
    """
    # Only a result equal to False is treated as "not enabled"; any other
    # result (including None) lets the check pass, as before.
    if util.is_kernel_ocfs2_fs_stats_enabled(node) == False:
        util.eprint("\no2locktop: error: the node({0}) does not support ocfs2 debug, please check and retry\n".format(node))
        try:
            os.kill(pid, signal.SIGUSR1)
        except OSError:
            # Best effort, same as the connection test worker: the target
            # process may already be gone.
            pass
        sys.exit(1)

# Background check processes that have been started but not yet joined.
# main() joins every entry before starting the display.
process_list = []


def connection_ocfs2_debug_test(nodes):
    """Start one _connection_ocfs2_debug_test_worker process per node.

    The processes are started as daemons and appended to the module-level
    process_list. They are NOT joined here: the caller must wait for the
    child processes to finish.

    Args:
        nodes: non-empty sequence of nodes to check.

    Raises:
        AssertionError: if *nodes* is None or empty.
    """
    assert nodes is not None and len(nodes) > 0
    for node in nodes:
        checker = multiprocessing.Process(
            target=_connection_ocfs2_debug_test_worker,
            args=(node, pid),
        )
        checker.daemon = True
        checker.start()
        process_list.append(checker)

def _remote_cmd_test_worker(node, pid, uuid):
    """Check the required commands on *node* and set up the v4 debug filter.

    Runs in a worker process. util.cmd_is_exist(config.CMDS, node) returns an
    indexable result: a false first element means a command is missing, and
    the second element is reported as that command. In that case an error is
    printed, SIGUSR1 is sent to *pid* and the worker exits with status 1.

    Args:
        node: node to check.
        pid: PID of the process to signal on failure.
        uuid: lock space value passed through to the util debug-v4 helpers.
    """
    result = util.cmd_is_exist(config.CMDS, node)
    if not result[0]:
        util.eprint("\no2locktop: error: the node({0}) does not have the command {1}, please install and retry\n".format(node, result[1]))
        try:
            os.kill(pid, signal.SIGUSR1)
        except OSError:
            # Best effort, same as the connection test worker: the target
            # process may already be gone.
            pass
        sys.exit(1)

    # Test whether the remote node supports ocfs2 debug v4; if it does
    # (non-empty result), set the v4 filter interval.
    v4_support = util.check_support_debug_v4_and_get_interval(uuid, node)
    if len(v4_support) != 0:
        util.set_debug_v4_interval(uuid, node, config.interval*2+1)

def remote_cmd_test(nodes, mount_point):
    """Start one _remote_cmd_test_worker process per node.

    The lock space is looked up once, on the first node, and handed to every
    worker. The daemon processes are appended to the module-level
    process_list and are not joined here.

    Args:
        nodes: non-empty sequence of nodes to check.
        mount_point: OCFS2 mount point used for the lock space lookup.

    Raises:
        AssertionError: if *nodes* is None or empty.
    """
    assert nodes is not None and len(nodes) > 0
    lockspace = util.get_dlm_lockspace_mp(nodes[0], mount_point)
    for node in nodes:
        checker = multiprocessing.Process(
            target=_remote_cmd_test_worker,
            args=(node, pid, lockspace),
        )
        checker.daemon = True
        checker.start()
        process_list.append(checker)


def local_test(mount_point):
    """Check that a lock space can be found for *mount_point* locally.

    Calls util.get_dlm_lockspace_mp() with None as the node.

    Raises:
        SystemExit: with status 1, after printing an error, when the lookup
            returns a false value.
    """
    if not util.get_dlm_lockspace_mp(None, mount_point):
        util.eprint("\no2locktop: error: can't find the mount point: {0}, please check and retry\n".format(mount_point))
        # Runs in the main process: a failed check must not report success.
        sys.exit(1)

def local_ocfs2_debug_test():
    """Check that util.is_kernel_ocfs2_fs_stats_enabled() passes locally.

    Raises:
        SystemExit: with status 1, after printing an error, when the check
            returns False.
    """
    # Only a result equal to False is treated as "not enabled"; any other
    # result (including None) lets the check pass, as before.
    if util.is_kernel_ocfs2_fs_stats_enabled() == False:
        util.eprint("\no2locktop: error: the node({0}) does not support ocfs2 debug, please check and retry\n".format("localhost"))
        # Runs in the main process: a failed check must not report success.
        sys.exit(1)

def local_cmd_test(mount_point):
    """Check the required commands locally and set up the v4 debug filter.

    util.cmd_is_exist(config.CMDS) returns an indexable result: a false first
    element means a command is missing, and the second element is reported
    as that command.

    Args:
        mount_point: OCFS2 mount point used for the lock space lookup.

    Raises:
        SystemExit: with status 1, after printing an error, when a required
            command is missing.
    """
    result = util.cmd_is_exist(config.CMDS)
    if not result[0]:
        util.eprint("\no2locktop: error: the local node does not have the command {0}, please install and retry\n".format(result[1]))
        # Runs in the main process: a failed check must not report success.
        sys.exit(1)

    # Test whether the local node supports ocfs2 debug v4; if it does
    # (non-empty result), set the v4 filter interval.
    uuid = util.get_dlm_lockspace_mp(None, mount_point)
    v4_support = util.check_support_debug_v4_and_get_interval(uuid, None)
    if len(v4_support) != 0:
        util.set_debug_v4_interval(uuid, None, config.interval*2+1)

def main():
    """Run o2locktop: validate the environment, then start the display.

    Steps, in order:
      1. Parse the command line (parse_args() may exit the process).
      2. Unless config.debug is set, point sys.stderr at a temporary file.
      3. Install the SIGUSR1 handler that the check workers use to abort.
      4. Run the remote or local checks and look up the lock space and the
         maximal system inode number.
      5. Wait for the background check processes in process_list.
      6. Start the printer and lock space worker processes (daemons),
         install the SIGCONT handler and run keyboard.worker() in this
         process until it returns.
      7. Terminate the lock space worker, wait for both processes and exit
         with status 0.

    Raises:
        SystemExit: status 0 on normal termination; status 1 when no lock
            space could be determined. The check functions may exit earlier.
    """
    args = parse_args()

    # Unless debugging, replace sys.stderr with a temporary file so that text
    # written to sys.stderr from here on does not reach the terminal.
    # (can also use dup2)
    if not config.debug:
        tmp_stderr = TemporaryFile('w+t')
        sys.stderr = tmp_stderr

    @retry(10)
    def sigcont_handler(signum, frame):
        # Process resumed (SIGCONT): set up the terminal again.
        keyboard.set_terminal()

    @retry(10)
    def sigusr1_handler(signum, frame):
        # A check worker reported a failure: reset the terminal and leave
        # immediately, without running any further cleanup.
        keyboard.reset_terminal()
        # NOTE(review): in this file SIGUSR1 is only sent after a failed
        # check, so a non-zero status would be more accurate; kept at 0
        # until it is confirmed that no other module sends SIGUSR1 on a
        # normal shutdown.
        os._exit(0)

    signal.signal(signal.SIGUSR1, sigusr1_handler)

    log = args["log"]
    display_len = args["display_len"]
    debug = args["debug"]

    if args['mode'] == "remote":
        mount_host, mount_point = args["mount_node"], args["mount_point"]
        nodes = args["node_list"]
        connection_test(nodes, mount_point)
        connection_ocfs2_debug_test(nodes)
        remote_cmd_test(nodes, mount_point)
        lock_space_str = util.get_dlm_lockspace_mp(mount_host, mount_point)
        max_sys_inode_num = util.get_dlm_lockspace_max_sys_inode_number(mount_host, mount_point)
        mount_info = ':'.join([mount_host, mount_point])
    elif args['mode'] == "local":
        mount_point = args["mount_point"]
        # Local mode has no node list.
        nodes = None
        local_test(mount_point)
        local_ocfs2_debug_test()
        local_cmd_test(mount_point)
        lock_space_str = util.get_dlm_lockspace_mp(None, mount_point)
        max_sys_inode_num = util.get_dlm_lockspace_max_sys_inode_number(None, mount_point)
        mount_info = mount_point

    # Same falsiness test as the pre-flight checks: without a lock space
    # there is nothing to monitor, which is a failure.
    if not lock_space_str:
        sys.exit(1)

    # Wait for the background checks started above before drawing anything.
    for p in process_list:
        p.join()

    printer_queue = multiprocessing.Queue()
    printer_process = multiprocessing.Process(
        target=printer.worker,
        args=(printer_queue, log),
        kwargs={"mount_info": mount_info},
    )
    printer_process.daemon = True
    lock_space_process = multiprocessing.Process(
        target=dlm.worker,
        args=(lock_space_str, max_sys_inode_num, debug, display_len, nodes, printer_queue),
    )
    lock_space_process.daemon = True

    printer_process.start()
    lock_space_process.start()

    # Installed only after the workers are started; keyboard.worker() then
    # runs in this process until it returns.
    signal.signal(signal.SIGCONT, sigcont_handler)
    keyboard.worker(printer_queue)

    lock_space_process.terminate()
    lock_space_process.join()

    # printer_process is not terminated here: it exits by itself on the quit
    # message, so it is only waited for.
    printer_process.join()

    sys.exit(0)

# Script entry point: run main() only when this file is executed directly.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C: call keyboard.reset_terminal() (presumably restores the
        # terminal settings changed for the display) and let the script end
        # without a traceback.
        keyboard.reset_terminal()
