#!/usr/bin/env python
# vim: set expandtab ts=4 sw=4:

# Copyright (C) 2009 University of Victoria
# You may distribute under the terms of either the GNU General Public
# License or the Apache v2 License, as specified in the README file.

## Auth: Duncan Penfold-Brown. 6/15/2009

## CLOUD SCHEDULER
##
## The main body for the cloud scheduler, that encapsulates and organizes
## all cloud scheduler functionality.
##
## Using optparse for command line options (http://docs.python.org/library/optparse.html)
##

## Imports
from __future__ import with_statement
import sys

if sys.version_info[:2] < (2,5):
    print "You need at least Python 2.5 to run Cloud Scheduler"
    sys.exit(1)

import os
import re
import time
import string
import getopt
import signal
import logging
import urllib2
import threading
import traceback
import ConfigParser
import logging.handlers
import multiprocessing
from itertools import islice
from optparse import OptionParser
from decimal import *
from collections import defaultdict

import cloudscheduler.config as config
import cloudscheduler.utilities as utilities
import cloudscheduler.__version__ as version
import cloudscheduler.info_server as info_server
import cloudscheduler.admin_server as admin_server
import cloudscheduler.cloud_management as cloud_management
import cloudscheduler.job_management as job_management
import cloudscheduler.proxy_refreshers as proxy_refreshers

from cloudscheduler.cloud_management import VMDestroyCmd
from cloudscheduler.cloud_management import VMMachine

#from cloudscheduler.monitoring.get_clouds import getCloudsClient

log = utilities.get_cloudscheduler_logger()

# Threaded Classes

class VMPoller(threading.Thread):
    """
    VMPoller - Polls all the requested VMs, and checks on their status

    Every cycle the thread polls each VM of each cluster in the resource pool,
    counts error states, starts a VMDestroyCmd thread for any VM that reaches
    config.polling_error_threshold and reaps destroy threads that have
    finished.
    """
    def __init__(self, resource_pool, job_pool):
        """
        resource_pool - pool whose clusters and VMs are polled
        job_pool      - job pool used to hold jobs that reference a bad image
        """
        threading.Thread.__init__(self, name=self.__class__.__name__)
        self.resource_pool = resource_pool
        self.job_pool = job_pool
        self.quit = False
        # Minimum number of seconds between two polls of a VM, by VM status
        self.starting_poll_interval = 120 # 2 minutes
        self.running_poll_interval = 900 # 15 minutes
        self.run_interval = config.vm_poller_interval
        # Destroy threads that were started, keyed by cluster name + VM id
        self.destroy_threads = {}
        self.heart_beat = time.time()

    @staticmethod
    def _destroy_key(cluster, vm):
        """Key under which the destroy thread of a VM is stored."""
        return "".join([cluster.name, vm.id])

    def stop(self):
        """Ask the polling loop in run() to exit."""
        log.debug("Waiting for VM polling loop to end")
        self.quit = True

    def run(self):
        """Poll all VMs and reap destroy threads until stop() is called."""
        log.info("Starting VM polling...")

        while not self.quit:
            start_loop_time = time.time()
            self.poll_all_machines()
            self.check_destroy_threads()
            sleep_tics = self.run_interval
            elapsed_loop_time = time.time() - start_loop_time
            log.verbose("VMPoller thread loop time: %s" % str(elapsed_loop_time))
            self.heart_beat = time.time()
            # Sleep in one second steps so a stop() request is noticed quickly
            while (not self.quit) and sleep_tics > 0:
                time.sleep(1)
                sleep_tics -= 1

    def poll_all_machines(self):
        """
        poll_all_machines - internal function to poll all running VMs,
                            and then update their status
        """
        log.verbose("Polling all running VMs...")
        for cluster in self.resource_pool.resources:
            for vm in cluster.vms:
                now = int(time.time())

                # VMs that were polled recently are skipped; how recently
                # depends on the status they had at the last poll.
                if vm.lastpoll:
                    if vm.status in ("Starting", "Unpropagated"):
                        min_spacing = self.starting_poll_interval
                    elif vm.status == "Running":
                        min_spacing = self.running_poll_interval
                    else:
                        min_spacing = None
                    if min_spacing is not None and now - vm.lastpoll < min_spacing:
                        log.verbose("Skipped polling %s, which has status %s" % (vm.id, vm.status))
                        continue

                ret_state = cluster.vm_poll(vm)

                # Print polled VM's state and details
                log.verbose("Polled VM %s, which has status %s" % (vm.id, ret_state))

                # If the VM is in an error state, keep track of error and
                # after passing some threshold destroy the machine.
                if ret_state == "Error" or ret_state == "Shutdown":
                    vm.errorcount += 1
                    log.verbose("Error in VM %s, increased counter to %s" % (str(vm.id), str(vm.errorcount)))
                elif vm.errorcount > 0:
                    vm.errorcount = 0
                if ret_state == "HttpError":
                    # A bad image location cannot recover: jump to the threshold
                    vm.errorcount = config.polling_error_threshold
                    self.handle_bad_image(vm.user, vm.image)
                if ret_state == "Running":
                    if vm.startup_time is None:
                        vm.startup_time = vm.last_state_change - vm.initialize_time
                if ret_state == "ConnectionRefused":
                    if not vm.errorconnect:
                        vm.errorconnect = time.time()
                    elif time.time() - vm.errorconnect > config.vm_connection_fail_threshold:
                        #Have been unable to connect to service for extended period
                        #Assume that service is down - disable cloud for some period of time
                        # A separate name is used so the loop variable 'cluster'
                        # is not rebound while its VMs are still being iterated.
                        # NOTE(review): get_cluster_with_vm presumably returns
                        # None when the VM is not found; fall back to the
                        # cluster currently being polled in that case.
                        owner = self.resource_pool.get_cluster_with_vm(vm)
                        if owner is None:
                            owner = cluster
                        owner.errorconnect = time.time()
                        owner.enabled = False
                        owner.connection_problem = True

                if vm.errorcount >= config.polling_error_threshold:
                    log.verbose("VM %s reached threshold in errors, %s" % (str(vm.id), str(vm.errorcount)))
                    # Destroy the VM
                    if not self.check_destroy(cluster, vm) and not cluster.connection_problem:
                        dt = VMDestroyCmd(cluster, vm, reason="VM is in an Error state.")
                        self.destroy_threads[self._destroy_key(cluster, vm)] = dt
                        dt.start()

    def handle_bad_image(self, user, image):
        """Respond to image url with a failed Http response, will attempt to
        condor_hold those jobs so they will not be considered for scheduling."""
        log.info("User %s has job(s) with bad image location: %s - temporarily banning/holding those jobs" % (user, image))
        user_jobs = self.job_pool.job_container.get_jobs_for_user(user)
        jobs_to_hold = []
        HELD = 5
        for job in user_jobs:
            if job.req_imageloc == image and not job.banned:
                job.banned = True
                job.ban_time = time.time()
                job.override_status = "HTTPFail"
                # Jobs that are already held do not need to be held again
                if job.job_status != HELD:
                    jobs_to_hold.append(job)
        self.job_pool.job_hold_local(jobs_to_hold)

    def check_destroy_threads(self):
        """Checks the  VM destroy thread list for threads that have finished
        execution and cleans them up."""
        # Iterate over a snapshot because entries are deleted inside the loop
        for key, thread in list(self.destroy_threads.items()):
            if thread.is_alive():
                continue
            thread.join()
            if thread.get_result() != 0:
                log.error("Destroying VM %s failed. Leaving in error state." % thread.get_vm().id)
            del self.destroy_threads[key]

    def check_destroy(self, cluster, vm):
        """Checks the destroy thread list to make sure the VM to be shutdown is not
        already in the process of being destroyed.

        Returns True when a destroy thread is registered for the VM."""
        return self._destroy_key(cluster, vm) in self.destroy_threads

class JobPoller(threading.Thread):
    """
    JobPoller - Polls the Condor schedd for job status, and new jobs
    """

    def __init__(self, job_pool):
        """job_pool - job pool that is queried and updated on every cycle"""
        threading.Thread.__init__(self, name=self.__class__.__name__)
        self.job_pool = job_pool
        self.quit = False
        self.heart_beat = time.time()
        self.polling_interval = config.job_poller_interval

    def stop(self):
        """Ask the polling loop in run() to exit."""
        log.debug("Waiting for job polling loop to end")
        self.quit = True

    @staticmethod
    def _vmtype_changes(previous, current):
        """Return (removed, added): the vmtypes only in 'previous' and the
        vmtypes only in 'current', each as a set."""
        before = set(previous)
        after = set(current)
        return (before - after, after - before)

    def run(self):
        """Query the job scheduler and update the job pool until stop() is
        called. An unexpected exception is logged with its traceback and
        ends the thread."""
        try:
            log.info("Starting job polling...")
            prev_req_vmtypes = []
            while not self.quit:
                start_loop_time = time.time()
                log.verbose("Polling job scheduler")

                ## Query the job pool to get new unscheduled jobs
                # Populates the 'jobs' and 'scheduled_jobs' lists appropriately
                condor_jobs = self.job_pool.job_query()
                if condor_jobs is not None:
                    self.job_pool.update_jobs(condor_jobs)
                else:
                    log.error("Failed to contact Condor job scheduler. Continuing with VM management.")
                # Drop the query result before sleeping
                del condor_jobs

                # Report which vmtypes appeared or disappeared since last cycle
                new_req_vmtypes = self.job_pool.get_required_uservmtypes()
                taken_out, added_in = self._vmtype_changes(prev_req_vmtypes, new_req_vmtypes)
                for vmtype in taken_out:
                    log.debug("%s vmtype removed from required types" % vmtype)
                for vmtype in added_in:
                    log.debug("%s vmtype added to required types" % vmtype)

                log.verbose("Job Poller waiting %ds..." % self.polling_interval)
                prev_req_vmtypes = new_req_vmtypes
                sleep_tics = self.polling_interval
                elapsed_loop_time = time.time() - start_loop_time
                log.verbose("JobPoller loop time: %s" % elapsed_loop_time)
                self.heart_beat = time.time()
                # Sleep in one second steps so a stop() request is noticed quickly
                while (not self.quit) and sleep_tics > 0:
                    time.sleep(1)
                    sleep_tics -= 1

            log.info("Exiting job polling thread")
        except Exception:
            # Top-level boundary of the thread: record why it died
            log.error(traceback.format_exc())

class MachinePoller(threading.Thread):
    """
    MachinePoller - Polls the Condor collector for VM status, and new VMs

    Each cycle refreshes machine_list, master_list and vm_machine_list on the
    resource pool, keeping the previous lists when a query unexpectedly comes
    back empty (for at most three cycles in a row).
    """

    def __init__(self, resource_pool):
        """resource_pool - pool whose machine lists are refreshed"""
        threading.Thread.__init__(self, name=self.__class__.__name__)
        self.resource_pool = resource_pool
        self.quit = False
        self.heart_beat = time.time()
        self.polling_interval = config.machine_poller_interval

    def stop(self):
        """Request that the polling loop in run() terminates."""
        log.debug("Waiting for machine polling loop to end")
        self.quit = True

    def run(self):
        """Refresh the pool's machine lists until stop() is called."""
        log.info("Starting machine polling...")
        empty_streak = 0
        while not self.quit:
            cycle_began = time.time()
            log.verbose("Polling machine scheduler")

            pool = self.resource_pool
            # Remember the lists of the previous cycle before overwriting them
            pool.prev_machine_list = pool.machine_list
            pool.prev_vm_machine_list = pool.vm_machine_list
            pool.machine_list = pool.resource_query()
            pool.master_list = pool.master_resource_query_local()
            pool.vm_machine_list = pool.machinelist_to_vmmachinelist(pool.machine_list, pool.master_list)

            if len(pool.machine_list) != 0 or len(pool.prev_machine_list) == 0 or empty_streak >= 3:
                # Result accepted as is
                empty_streak = 0
            else:
                # Empty result although machines were known: keep the old lists
                empty_streak += 1
                pool.machine_list = pool.prev_machine_list
                pool.vm_machine_list = pool.prev_vm_machine_list

            log.verbose("Machine Poller waiting %ds..." % self.polling_interval)
            seconds_left = self.polling_interval
            cycle_duration = time.time() - cycle_began
            log.verbose("MachinePoller loop time: %s" % str(cycle_duration))
            self.heart_beat = time.time()
            # One second naps keep the thread responsive to stop()
            while not self.quit and seconds_left > 0:
                time.sleep(1)
                seconds_left -= 1

        log.info("Exiting machine polling thread")

class Scheduler(threading.Thread):
    """
    Scheduler thread matches jobs to available resources, and starts
    VMs for them
    """
    ## Condor Job Status mapping
    # Integer codes that job.job_status is compared against in the scheduling
    # methods; each code is also the index of its display name in
    # CONDOR_STATUS.
    NEW      = 0
    IDLE     = 1
    RUNNING  = 2
    REMOVED  = 3
    COMPLETE = 4
    HELD     = 5
    ERROR    = 6
    CONDOR_STATUS = ("New", "Idle", "Running", "Removed", "Complete", "Held", "Error")

    def __init__(self, resource_pool, job_pool):
        """Set up the scheduler thread and pick the scheduling method named by
        config.scheduling_algorithm ("fairshare" or "fifo", case-insensitive).
        Any other value falls back to fair share scheduling."""
        threading.Thread.__init__(self, name=self.__class__.__name__)
        self.resource_pool = resource_pool
        self.job_pool = job_pool
        self.quit = False
        self.quick_exit = False
        self.heart_beat = time.time()
        self.scheduling_interval = config.scheduler_interval

        algorithm = config.scheduling_algorithm.lower()
        if algorithm == "fifo":
            log.debug("Using fifo scheduling algorithm.")
            chosen_method = self.scheduler_fifo
        else:
            # Fair share is both an explicit choice and the fallback
            if algorithm == "fairshare":
                log.debug("Using fairshare scheduling algorithm.")
            else:
                log.debug("Cannot use %s scheduling, switching to fairshare" % config.scheduling_algorithm)
            chosen_method = self.scheduler_fair_share
        self.scheduling_method = chosen_method

    def stop(self):
        """Set the quit flag that the scheduling loop in run() checks; the
        call itself does not wait for the loop to finish."""
        log.debug("Waiting for scheduling loop to end")
        self.quit = True
    
    def toggle_quick_exit(self):
        """Flip the quick_exit flag. When the flag is True, run() skips
        destroying the remaining VMs after the scheduling loop ends."""
        log.debug("Toggle quick exit flag to not skip VM Shutdown.")
        self.quick_exit = not self.quick_exit

    def run(self):
        log.info("Starting job scheduling...")

        ########################################################################
        ## Full scheduler loop
        ########################################################################
        while not self.quit:
            start_loop_time = time.time()
            log.verbose("### Scheduler Cycle:")

            self.scheduling_method()

            self.resource_pool.save_persistence()

            ## Wait for a number of seconds
            log.verbose("Scheduler - Waiting %ss" % self.scheduling_interval)
            sleep_tics = self.scheduling_interval
            elapsed_loop_time = time.time() - start_loop_time
            log.verbose("Scheduler loop time: %s" % str(elapsed_loop_time))
            self.heart_beat = time.time()
            while (not self.quit) and sleep_tics > 0:
                time.sleep(1)
                sleep_tics -= 1

        # Exit the scheduling thread - clean up VMs and exit
        log.debug("Exiting scheduler thread")
        if not self.quick_exit:
            # Destroy all VMs and finish
            log.info("### Destroying all remaining VMs and exiting :-(")
            remaining_vms = []
            failed_vms = []
            threadfail = True
            while threadfail:
                (remains, threadfail, failed) = self.scheduler_full_shutdown()
                remaining_vms.extend(remains)
                failed_vms.extend(failed)
            # Print list of VMs cloud scheduler failed to destroy before exit.
            if (remaining_vms != [] or failed_vms != []):
                log.error("The following VMs could not be destroyed properly:")
                for vm in remaining_vms + failed_vms:
                    log.error("VM: %s, ID: %s" % (vm.name, vm.id))

        self.resource_pool.save_persistence()



    def scheduler_full_shutdown(self):
        """Shutdown all VMs in the system and exit gracefully."""
        remaining_vms = []
        failed_vms = []
        threadfail = False
        for cluster in self.resource_pool.resources:
            destroy_threads = []
            for vm in reversed(cluster.vms):
                log.info("Destroying VM: %s" % vm.id)
                vm.log()
                if vm.override_status in ("ExpiredProxy", "NoProxy", "ConnectionRefused", "BrokenPipe"):
                    failed_vms.append(vm)
                    continue
                if len(destroy_threads) < config.max_destroy_threads:
                    destroy_threads.append(VMDestroyCmd(cluster, vm, reason="Full shutdown in progress."))
                else:
                    threadfail = True
                    break
                try:
                    destroy_threads[-1].start()
                except:
                    log.error("Error starting thread, backing out of this one and trying to let rest finish.")
                    destroy_threads.pop()
                    threadfail = True
                    break
            for thread in destroy_threads:
                thread.join()
            for thread in destroy_threads:
                destroy_ret = thread.get_result()
                if destroy_ret != 0:
                    log.error("Destroying VM failed. Continuing anyway... check VM logs")
                    failed_vms.append(thread.get_vm())
        return (remaining_vms, threadfail, failed_vms)

    def scheduler_fifo(self):
        """Approximate First In First Out scheduling of jobs based on Condor Job ID."""
        existing_jobs = self.job_pool.job_container.get_scheduled_jobs_sorted_by_id()
        new_jobs = self.job_pool.job_container.get_unscheduled_jobs_sorted_by_id()
        machine_list = self.resource_pool.mv_machine_list
        vm_slots = self.resource_pool.vm_slots_total()
        vm_list = self.resource_pool.get_all_vms()
        #look ahead n jobs, where n is the number of vm_slots or length of new_jobs, whichever is smaller
        lookahead_jobs = [new_jobs[x] for x in xrange(vm_slots if vm_slots <= len(new_jobs) else len(new_jobs))]
               
        for job in new_jobs:
            if(vm_slots > len(existing_jobs)):
                if self.sched_resource_create_track(job.user, job):
                    log.verbose("VM Created.")
                else:
                    log.verbose("VM could not be created, trying next job...")
            else:
                log.verbose("All VM Slots filled, not trying to schedule more jobs right now.")
                break
        #If there are no lookahead jobs, no reason to kill machines; let them die of natural causes
        if len(lookahead_jobs):
            for machine in machine_list:
                #find the VM object that corresponds with the machine (actual Nimbus VM)
                log.debug("cloud_scheduler.py::435::do_condor_off::Name %s, addr %s"%(machine.machine_name,machine.address_startd))
                matching_vm = None
                for vm in vm_list:
                    if utilities.match_host_with_condor_host(vm.hostname, machine.name) \
                    or utilities.match_host_with_condor_host(vm.alt_hostname, machine.name):
                        matching_vm = vm
                        break
                #If no matching VM or a machine hasn't been assigned a job, we don't want to retire it yet. Same if already retired.
                if not matching_vm or "JobId" not in machine or matching_vm.force_retire:
                    continue
                retire_machine = True
                for job in lookahead_jobs:
                    if matching_vm.uservmtype == job.uservmtype:
                        if job.machine_reserved == "" or job.machine_reserved == machine.name:
                            retire_machine = False
                            job.machine_reserved = machine.name
                            break
                if not retire_machine:
                    log.verbose("No need to retire machine with job:  %s" % machine.job_id)
                    continue
                (_, ret2, _, ret22) = self.resource_pool.do_condor_off(machine.machine_name, machine.address_startd, matching_vm.condormasteraddr)
                if ret2 == 0 and ret22 == 0:
                    log.debug("Set %s to die after completing current job: %s" % (machine.name,machine.job_id))
                    matching_vm.force_retire = True
                    matching_vm.override_status = 'Retiring'
                else:
                    log.debug("Failed to retire VM %s" % machine.name)
        
    def scheduler_fair_share(self):
        """Fair User Sharing algorithm.
        Fairness based on configured resource distribution.

        Steps of one scheduling pass:
          1. Build diff_types: per user vmtype, the current share minus the
             desired share (negative means VMs of that type are needed).
          2. Spread the unmet share of limited user vmtypes over the
             unlimited ones.
          3. Let the resource pool check failures.
          4. Unless the number of starting VMs is at config.max_starting_vm,
             try to boot one VM per user for high priority jobs; only when
             there are no high priority jobs, try one job per user vmtype.
        """
        ## Figure out distribution of VMs requested and available
        current_types = self.resource_pool.vmtype_distribution()
        desired_types = self.job_pool.job_type_distribution()
        # Negative difference means will need to create that type
        diff_types = {}
        for vmtype in current_types.keys():
            if vmtype in desired_types:
                diff_types[vmtype] = current_types[vmtype] - desired_types[vmtype]
            else:
                # Type has VMs but is no longer requested: positive, so it is
                # never treated as needing more VMs.
                diff_types[vmtype] = 1
        for vmtype in desired_types.keys():
            if vmtype not in current_types:
                diff_types[vmtype] = -desired_types[vmtype]

        # With user limiting will need to reset any users that are at their limits
        # so they will not interfere with scheduling
        # will need to redistribute negatives to the non-limited users
        limited_users = []
        userjoblimits = self.job_pool.get_usertype_limits()
        for vmusertype in diff_types.keys():
            user = vmusertype.split(':')[0]
            limited = self.resource_pool.user_at_limit(user)
            if vmusertype in userjoblimits and \
               self.resource_pool.uservmtype_at_limit(vmusertype, userjoblimits[vmusertype]):
                limited = True
            if limited:
                limited_users.append(vmusertype)

        # Total unmet share of the limited types (zero or negative)
        neg_total = 0
        for usertype in limited_users:
            if diff_types[usertype] < 0:
                neg_total += diff_types[usertype]
        splitby = len(diff_types) - len(limited_users)
        adjustby = 0
        if splitby > 0:
            # NOTE(review): with integer operands this is floor division under
            # Python 2 - confirm whether the distributions hold ints.
            adjustby = neg_total / splitby
        elif splitby == 0:
            log.verbose("All users are limited.")
        else:
            log.error("More user vmtypes limited than what's in diff types, something weird here.")

        for usertype in list(diff_types):
            if usertype not in limited_users:
                diff_types[usertype] += adjustby # the 'extra' will be negative so add it

        if len(diff_types) == 0 and len(self.job_pool.get_required_vmtypes()) != 0:
            log.error("Possible discrepancy in diff_types detected.")

        ## Check failures to ban jobs on affected resources
        self.resource_pool.check_failures()

        # A negative max_starting_vm disables the starting VM limit
        if config.max_starting_vm >= 0 and self.resource_pool.get_num_starting_vms() >= config.max_starting_vm:
            log.debug("At Max Starting VMs CloudScheduler not booting any new VMs.")
            return

        ## Schedule user jobs
        log.verbose("Schedule any high priority jobs")
        high_priority_jobs_by_users = self.job_pool.job_container.get_unscheduled_high_priority_jobs_by_users(prioritized = True)
        for user in high_priority_jobs_by_users.keys():
            for job in high_priority_jobs_by_users[user]:
                if job.job_status > self.RUNNING or job.banned or job.status == 'Scheduled':
                    continue
                # One successfully booted VM per user and pass
                if self.sched_resource_create_track(user, job):
                    break

        ## For starters we'll only schedule user jobs when there's no
        ## High Priority jobs waiting to start
        if len(high_priority_jobs_by_users) != 0:
            return

        users = self.job_pool.job_container.get_users()
        for user in users:
            if self.resource_pool.user_at_limit(user):
                log.debug("User: %s is at their VM limit - skipping." % user)
                continue
            # Attempt to schedule jobs in order of their appearance in user's job list
            # (currently sorted by Job priority)
            user_jobs = self.job_pool.job_container.get_unscheduled_user_jobs_by_type(user, prioritized=True)
            for vmtype in user_jobs.keys():
                vmusertype = ''.join([user,':',vmtype])
                if vmusertype in userjoblimits and self.resource_pool.uservmtype_at_limit(vmusertype, userjoblimits[vmusertype]):
                    log.debug("User: %s 's vmtype: %s is at their Limit - skipping." % (user, vmtype))
                    continue
                for job in user_jobs[vmtype]:
                    if job.job_status >= self.RUNNING:
                        log.verbose("Skipping %s job '%s' with status '%s'" % (job.uservmtype, job.id, self.CONDOR_STATUS[job.job_status]))
                        continue
                    elif job.status == job.SCHEDULED:
                        log.verbose("Skipping %s previously scheduled job '%s'" % (job.uservmtype, job.id))
                        continue
                    elif job.banned:
                        log.verbose("Skipping %s.banned job '%s'" % (job.uservmtype, job.id))
                        continue

                    log.verbose("Job '%s' Type: %s not running or scheduled, trying to schedule it" % (job.id, job.uservmtype))

                    # Check that type of VM for job is needed
                    type_known = job.uservmtype in diff_types
                    if (type_known and diff_types[job.uservmtype] <= 0) or self.sched_allow_over_allocation(diff_types, job):
                        if self.sched_resource_create_track(user, job):
                            if job.job_per_core and job.req_cpucores > 1:
                                # The VM has more than one core: also mark the
                                # matching jobs that can use the other cores.
                                # NOTE(review): statuses[0] is presumably the
                                # scheduled state - confirm in the job class.
                                for sibling in self.job_pool.job_container.find_unscheduled_jobs_with_matching_reqs(user, \
                                job, (job.req_cpucores - 1)):
                                    sibling.status = sibling.statuses[0]
                        else:
                            log.verbose("Failed to schedule %s job '%s' for user %s" % (job.uservmtype, job.id, user))
                    elif type_known:
                        log.verbose("User %s vmtype %s already has share" % (user, job.uservmtype))
                    else:
                        log.verbose("User %s vmtype %s not being considered for scheduling" % (user, job.uservmtype))
                    # only try one per user's job types
                    break

    def sched_allow_over_allocation(self, diff_types, job):
        """Determine if a VM request is allowed to have more than that users fairshare.
        Handles cases where a user does not have their fairshare but there are no possible
        resources to give them to increase that share, this will allow other users to
        continue to use other remaining resources that the underallocated user is unable to use."""
        allow = False
        #retiring = self.resource_pool.retiring_vms_of_usertype(job.uservmtype)
        userjoblimits = self.job_pool.get_usertype_limits()
        if job.uservmtype in diff_types.keys() and diff_types[job.uservmtype] > 0:
            over_allocate = True
            # Job may be candidate to over allocate if all underallocated jobs have no available resources
            unscheduled_jobs_by_users = self.job_pool.job_container.get_unscheduled_jobs_by_users(prioritized = True)
            for user in unscheduled_jobs_by_users.keys():
                if self.resource_pool.user_at_limit(user):
                    continue
                userjob = unscheduled_jobs_by_users[user][0]
                if userjob.uservmtype in userjoblimits.keys() and self.resource_pool.uservmtype_at_limit(userjob.uservmtype, userjoblimits[userjob.uservmtype]):
                    continue
                # Check for an underallocated job that has resources
                if userjob and userjob.uservmtype in diff_types.keys() and diff_types[userjob.uservmtype] <= 0:
                    good_resources = self.resource_pool.get_resourceBF(userjob.req_network, \
                        userjob.req_cpuarch, userjob.req_memory, userjob.req_cpucores, userjob.req_storage, \
                        userjob.req_ami, userjob.req_imageloc, userjob.target_clouds, userjob.req_hypervisor, \
                        userjob.blocked_clouds)
                    # See if there's a valid resource for job to boot on
                    if len(good_resources) > 0:
                        over_allocate = False
                        break
            # Checked all the users with under allocated jobs
            if over_allocate:
                log.verbose("Possible Allow - check for resources: %s" % job.req_vmtype)
                good_resources = self.resource_pool.get_resourceBF(job.req_network, \
                        job.req_cpuarch, job.req_memory, job.req_cpucores, job.req_storage, \
                        job.req_ami, job.req_imageloc, job.target_clouds, job.req_hypervisor, \
                        job.blocked_clouds)
                # See if there's a valid resource for job to boot on
                if len(good_resources) > 0:
                    allow = True
                    log.debug("Allowing over-allocation of %s" % job.req_vmtype)
        return allow

    def sched_resource_create_track(self, user, job):
        """Select the clouds able to host a job and try to boot a VM on one.

        Optional failure/error tracking is performed when config.ban_tracking
        is set.

        Arguments:
        user -- not used by this method; kept so existing callers keep working
        job  -- the job to boot a VM for

        Returns True when a VM was created and the job was marked scheduled,
        False in every other case.
        """
        # Find resources that match the job's requirements
        good_resources = self.resource_pool.get_resourceBF(
            job.req_network, job.req_cpuarch, job.req_memory,
            job.req_cpucores, job.req_storage, job.req_ami, job.req_imageloc,
            job.target_clouds, job.req_hypervisor, job.blocked_clouds)

        # Drop None placeholders wherever they sit in the list. (Popping the
        # tail once per None found could discard a valid resource and keep
        # the None.)
        good_resources = [res for res in good_resources if res is not None]

        # If no resource fits, leave the job for a later scheduling pass
        if not good_resources:
            log.verbose("No resource to match job: %s Leaving job unscheduled." % job.id)
            return False

        create_ret = self.vm_creation(job, good_resources)

        if create_ret == 0:
            # Mark job as scheduled
            self.job_pool.schedule(job)
            job.failed_boot = 0
            job.failed_boot_reason.clear()
            if config.ban_tracking:
                self.resource_pool.track_failures(job, good_resources, True)
            return True

        if create_ret == -1:
            # proxy problem
            job.banned = True
            job.ban_time = time.time()
            job.override_status = "TempBanned"
            log.verbose("VM Creation failed - temporarily banning job %s" % job.id)
        elif create_ret == -2:
            # -2 on Nimbus resource failures previously banned, but the
            # resource misconfiguration needs resolving - an admin will need
            # to reconfigure manually. The adjustment itself happens in the
            # nimbus cloud vm_creation().
            if config.adjust_insufficient_resources:
                log.info("Resources on cloud adjusted due to insufficient availability.")
            else:
                log.debug("Insufficient resources to boot VM, will keep trying.")
                job.failed_boot += 1
                # failed_boot_reason is a set (see the clear() above); add to
                # it rather than replacing it with a string.
                job.failed_boot_reason.add("Insufficient Resources on cloud")
                job.last_boot_attempt = time.time()
            if job.failed_boot > 5:
                log.debug("Repeatedly failed to boot VM for job %s blocking temporarily." % job.id)
                job.block_time = int(time.time())
        elif create_ret == -3:
            # exceeded maximum or not authorized
            for cloud in good_resources:
                if cloud.name not in job.blocked_clouds:
                    job.blocked_clouds.append(cloud.name)
                    job.block_time = int(time.time())
        elif config.ban_tracking:
            self.resource_pool.track_failures(job, good_resources, False)
        return False

    def vm_creation(self, job, good_resources):
        """Try to boot a VM for a job on each candidate cloud until one works.

        Arguments:
        job            -- the job that needs a VM
        good_resources -- candidate cluster objects, tried in order; None
                          entries are skipped

        Returns the status of the last vm_create() attempt: 0 on success,
        non-zero on failure (a missing status is reported as -3). None is
        returned only when no resource produced a status at all.
        """
        log.verbose("Preparing to create vm for job '%s'." % job.id)
        # Create an optional customization metadata file
        customizations = self.build_customizations_list(job)
        create_ret = None

        pre_customizations = []
        extra_userdata = []

        if job.ami_config:
            job_ami_config = self._select_ami_config(job.ami_config, extra_userdata)
            file_content = self._read_ami_config(job_ami_config)
            if file_content is not None:
                pre_customizations.append(file_content)

        log.verbose("Finished customizations for job '%s'" % job.id)
        vmimage_expanded = self.resource_pool.resolve_vmami_cloud_alias(job.req_ami)
        vminstancetype_expanded = self.resource_pool.resolve_vminstancetype_cloud_alias(job.instance_type)
        failure_reasons = {"-1": "Proxy/Auth related issue",
                           "-2": "Resource Availability / Quota Problem",
                           "-3": "Other VM Create Error",
                           "2": "Resource Request Denied",
                           "1": "Error when making VM request: check log"}

        for resource in good_resources:
            if resource is None:
                log.debug("None resource in good_resources ??")
                continue
            # Print details of the resource selected
            log.debug("Booting VM for job %s on: %s" % (job.id, resource.name))
            resource.log()

            args = self._vm_create_args(job, resource, customizations,
                                        pre_customizations, extra_userdata,
                                        vmimage_expanded, vminstancetype_expanded)
            if args is not None:
                create_ret = resource.vm_create(**args)

            # If the VM create fails, try again on another resource
            if create_ret != 0:
                log.debug("Creating VM for job %s failed on %s. " % (job.id, resource.name))
                job.last_boot_attempt = time.time()
                if create_ret is None:
                    create_ret = -3
                try:
                    job.failed_boot_reason.add(failure_reasons[str(create_ret)])
                except Exception as e:
                    log.exception("Unable to set failure reason: %s" % e)
                continue

            # The VM was created. For job-per-core jobs still at the default
            # core count, take the core count from the flavor that was used.
            if job.job_per_core and job.req_cpucores == config.default_VMJobPerCore:
                flavor_names = [flavor.name for flavor in resource.flavor_set]
                chosen_flavor = None
                for candidate in job.instance_type.values():
                    if candidate in flavor_names:
                        chosen_flavor = candidate
                for flavor in resource.flavor_set:
                    if flavor.name == chosen_flavor:
                        job.req_cpucores = flavor.cores
            break

        # If VM creation fails for user-job on all resources move to next user
        if create_ret != 0:
            log.debug("None of the resources could boot a vm for job %s. " % job.id + \
                      "Leaving %s's job unscheduled." % job.user)
        return create_ret

    def _split_amiconfig_entry(self, entry):
        """Split one "location[:type]" amiconfig entry into (location, type).

        For local paths the type is whatever follows the first ':'. An
        http(s) URL already contains ':' in its scheme (and possibly a port),
        so there the type is taken from after the last ':' and only when it
        does not look like a port or a path. type is None when absent.
        """
        entry = entry.strip()
        if entry.startswith('http://') or entry.startswith('https://'):
            scheme, _, remainder = entry.partition('://')
            location, colon, kind = remainder.rpartition(':')
            kind = kind.strip()
            if colon and location and kind and '/' not in kind and not kind.isdigit():
                return scheme + '://' + location.strip(), kind
            return entry, None
        parts = entry.split(':')
        if len(parts) > 1:
            return parts[0].strip(), parts[1].strip()
        return entry, None

    def _select_ami_config(self, ami_config, extra_userdata):
        """Return the location of the cloud-config file named in ami_config.

        ami_config is a comma separated list of "location[:type]" entries.
        With several entries, the one typed "cloud-config" is selected and
        the other typed entries are appended, unparsed, to extra_userdata.
        Returns "" when nothing could be selected.
        """
        selected = ""
        try:
            entries = ami_config.split(',')
            if len(entries) == 1:
                selected = self._split_amiconfig_entry(entries[0])[0]
            else:
                for entry in entries:
                    location, kind = self._split_amiconfig_entry(entry)
                    if kind == "cloud-config":
                        # find the cloud-config one that CS shares
                        selected = location
                    elif ':' in entry:
                        extra_userdata.append(entry)
                    else:
                        log.error("Problem parsing amiconfig pair, should be path:type - %s" % entry)
        except Exception:
            log.error("Could not parse amiconfig: %s" % ami_config)
        return selected

    def _read_ami_config(self, location):
        """Return the content found at a URL or local path, or None.

        Any failure (unreachable URL, missing file, read error) is logged and
        reported as None so that VM creation can carry on without it.
        """
        content = None
        try:
            if location.startswith('http'):
                response = urllib2.urlopen(location)
                try:
                    content = response.read()
                finally:
                    response.close()
            elif os.path.isfile(location):
                with open(location) as ami_file:
                    content = ami_file.read()
        except Exception:
            content = None
        if content is None:
            log.error("Unable to read ami_config - check path or url: %s" % location)
        return content

    def _vm_create_args(self, job, resource, customizations, pre_customizations,
                        extra_userdata, vmimage_expanded, vminstancetype_expanded):
        """Build the vm_create() keyword arguments for one resource.

        The cluster type is identified by the resource's class name. Returns
        None when that class is not one of the types handled here.
        """
        #TODO: unify this
        cluster_type = resource.__class__.__name__
        fixed_cloud_types = {"NimbusCluster": "nimbus",
                             "IBMCluster": "ibm",
                             "StratusLabCluster": "stratuslab",
                             "GoogleComputeEngineCluster": "gce"}
        if cluster_type in ("EC2Cluster", "OpenStackCluster"):
            cloud_type = resource.cloud_type
        elif cluster_type in fixed_cloud_types:
            cloud_type = fixed_cloud_types[cluster_type]
        else:
            return None

        # Arguments passed to every cluster type. The cloud type/name files
        # go into a fresh list so that a retry on the next resource does not
        # inherit the entries written for this one.
        args = {'vm_name': job.req_image,
                'vm_type': job.req_vmtype,
                'vm_user': job.user,
                'vm_networkassoc': job.req_network,
                'vm_mem': job.req_memory,
                'vm_cores': job.req_cpucores,
                'vm_storage': job.req_storage,
                'customization': customizations + [
                    (cloud_type, "/var/lib/cloud_type"),
                    (resource.name, "/var/lib/cloud_name")],
                'vm_keepalive': job.keep_alive,
                'job_per_core': job.job_per_core}

        if cluster_type == "NimbusCluster":
            # Substitute the resource's hypervisor into the image location
            imageloc = job.req_imageloc
            imagesplit = imageloc.split('/')
            if '__hypervisor__' in imagesplit:
                imagesplit[imagesplit.index('__hypervisor__')] = resource.hypervisor
                imageloc = '/'.join(imagesplit)
            args.update({'vm_cpuarch': job.req_cpuarch,
                         'vm_image': imageloc,
                         'job_proxy_file_path': job.get_x509userproxy(),
                         'myproxy_creds_name': job.get_myproxy_creds_name(),
                         'myproxy_server': job.get_myproxy_server(),
                         'myproxy_server_port': job.get_myproxy_server_port(),
                         'proxy_non_boot': job.proxy_non_boot,
                         'vmimage_proxy_file': job.vmimage_proxy_file,
                         'vmimage_proxy_file_path': job.get_vmimage_proxy_file_path()})
        elif cluster_type == "EC2Cluster":
            args.update({'vm_image': vmimage_expanded,
                         'pre_customization': pre_customizations,
                         'extra_userdata': extra_userdata,
                         'instance_type': vminstancetype_expanded,
                         'maximum_price': job.maximum_price,
                         'securitygroup': job.req_security_group,
                         'key_name': job.key_name,
                         'use_cloud_init': job.use_cloud_init})
        elif cluster_type == "IBMCluster":
            args.update({'vm_image': job.req_image_id,
                         'instance_type': job.req_instance_type_ibm,
                         'location': job.location,
                         'vm_keyname': job.key_name})
        elif cluster_type == "StratusLabCluster":
            args.update({'vm_image': job.req_image_id,
                         'vm_loc': job.req_imageloc})
        elif cluster_type == "GoogleComputeEngineCluster":
            args.update({'vm_image': vmimage_expanded,
                         'instance_type': vminstancetype_expanded,
                         'maximum_price': job.maximum_price,
                         'securitygroup': job.req_security_group,
                         'pre_customization': pre_customizations,
                         'extra_userdata': extra_userdata,
                         'use_cloud_init': True})
        elif cluster_type == "OpenStackCluster":
            args.update({'vm_image': vmimage_expanded,
                         'pre_customization': pre_customizations,
                         'extra_userdata': extra_userdata,
                         'instance_type': vminstancetype_expanded,
                         'securitygroup': job.req_security_group,
                         'key_name': job.key_name,
                         'use_cloud_init': job.use_cloud_init})
        return args

    def build_customizations_list(self, job):
        """Assemble the files to be written into a VM booted for this job.

        Returns a list of tuples. Each tuple is (contents, destination) or
        (contents, destination, extra), where extra is the string "'0600'"
        for the key file and an executable flag (bool) for user data scripts.

        Raises whatever open()/read() raise if config.cert_file or
        config.key_file is set but cannot be read. Unreadable CA files and
        user data scripts are logged and skipped.
        """
        customizations = []
        if config.condor_host != "localhost" and config.condor_context_file:
            customizations.append((config.condor_host, config.condor_context_file))

        # local modifications are changes made to the VM's condor_config.local modifications file
        local_modifications = ""
        if config.override_vmtype:
            local_modifications += 'VMType = "%s"\n' % job.req_vmtype

        if config.cert_file:
            file_contents = self._read_local_file(config.cert_file)
            file_location = config.cert_file_on_vm or config.cert_file
            customizations.append((file_contents, file_location))

        if config.key_file:
            file_contents = self._read_local_file(config.key_file)
            file_location = config.key_file_on_vm or config.key_file
            customizations.append((file_contents, file_location, "'0600'"))

        # Copy CA root certs and signing policies if needed.
        if config.ca_root_certs and job.inject_ca:
            self._append_ca_files(customizations, config.ca_root_certs)
        if config.ca_signing_policies and job.inject_ca:
            self._append_ca_files(customizations, config.ca_signing_policies)

        # Add the owner restriction requirement.
        # This is to prevent a VM started by a user being recycled to run jobs from another user.
        if job.user is not None:
            start_requirement = 'START=(Owner == "%s")\n' % (job.user)
            log.verbose("Adding START requirement to match resource's original owner:\n%s" % start_requirement)
            local_modifications += start_requirement

        customizations.append((local_modifications, '/etc/condor/condor_config.local.modifications'))

        # Read and add admin's user data scripts which must be local files
        if config.default_VMUserData:
            for userdata in config.default_VMUserData:
                try:
                    file_content = self._read_local_file(userdata)
                    customizations.append(
                        self._userdata_entry(file_content, userdata, "admin"))
                except Exception:
                    log.error('Error reading default %s' % (userdata))

        # Read and add user's user data scripts which must be accessible via http
        if job.user_data:
            for userdata in job.user_data:
                if not userdata:
                    continue
                try:
                    response = urllib2.urlopen(userdata)
                    try:
                        file_content = response.read()
                    finally:
                        response.close()
                    customizations.append(
                        self._userdata_entry(file_content, userdata, "user"))
                except Exception:
                    log.error('Error reading userdata %s' % (userdata))
        return customizations

    def _read_local_file(self, path):
        """Return the full content of a local file, closing it afterwards."""
        with open(path) as local_file:
            return local_file.read()

    def _append_ca_files(self, customizations, entries):
        """Append a (contents, destination) tuple for each "source[:destination]" entry.

        The destination defaults to the source path. Entries whose source
        cannot be read are logged and skipped.
        """
        for entry in entries:
            parts = entry.split(':')
            source = parts[0]
            if len(parts) > 1:
                destination = parts[1]
            else:
                destination = source
            try:
                customizations.append((self._read_local_file(source), destination))
            except Exception:
                log.error('Error reading %s' % (source))

    def _userdata_entry(self, content, source, owner):
        """Build the customization tuple for one user data script.

        The script is placed under /etc/condor as "<owner>_userdata_<name>"
        and flagged executable when its content starts with a "#!/" line.
        """
        destination = "/etc/condor/%s_userdata_%s" % (owner, os.path.basename(source))
        return (content, destination, content.startswith('#!/'))

class Cleanup(threading.Thread):
    """
    Cleanup - Periodically syncs the new and scheduled job queues to better
              reflect the state of the jobs
    """

    IDLE = 1
    RUNNING = 2

    def __init__(self, resource_pool, job_pool):
        """Set up the cleanup thread and pick the balancing routine.

        The balancing routine is chosen from config.scheduling_algorithm
        ("fifo" or "fairshare", case-insensitive). An unrecognized value is
        logged and the thread is flagged to quit before doing any work.
        """
        threading.Thread.__init__(self, name=self.__class__.__name__)
        self.resource_pool = resource_pool
        self.job_pool = job_pool
        self.destroy_threads = {}
        self.quit = False
        self.polling_interval = config.cleanup_interval
        self.heart_beat = time.time()

        # Different scheduling algorithms require different balancing
        balancers = {"fifo": "clean_balance_vms_fifo",
                     "fairshare": "clean_balance_vms_fairshare"}
        balancer_name = balancers.get(config.scheduling_algorithm.lower())
        if balancer_name is None:
            log.error("Scheduling algorithm not recognized...fatal error")
            self.quit = True
        else:
            self.clean_balance_vms = getattr(self, balancer_name)
            
    def stop(self):
        """Flag the cleanup loop to exit; run() checks the flag every second."""
        log.debug("Waiting for cleanup loop to end")
        # run() polls this flag both between passes and while sleeping.
        self.quit = True

    def run(self):
        """Thread body: run cleanup passes until the quit flag is set.

        Each pass performs the VM and job-queue housekeeping steps below,
        records a heart beat, then sleeps for polling_interval seconds in
        one-second steps so that a stop request is noticed promptly.
        """
        log.info("Starting Cleanup Thread...")

        while not self.quit:
            pass_started = time.time()
            self.check_destroy_threads()
            # Optionally look for VMs near their max lifetime
            if config.retire_before_lifetime:
                self.clean_retire_near_lifetime()
            # Keep VMs with proxies from getting stuck in an expired proxy state
            self.check_vm_proxy_shutdown_threshold()
            # VMs stuck in a Starting state for too long (if timeouts are set)
            self.clean_kill_start_timeout_vms()

            # Only remove unneeded VMs once a job list has been received from
            # Condor at least once. Otherwise, after persisting from a previous
            # run, all the VMs for those jobs would be shut down. Querying a
            # slow schedd can take quite a few minutes.
            if self.job_pool.last_query:
                # Jobs must be valid for the clusters available
                self.clean_invalid_jobs()
                log.verbose("Clearing all un-needed VMs from the system")
                self.clean_unneeded_vms()
                # Stray entries in condor_status - incomplete feature
                self.clean_check_vms_extra_machines(self.resource_pool.vm_machine_list)
                # Find VMs that have not registered with Condor and retiring
                # VMs that have finished retiring
                unregistered, retired = self.clean_check_diff_vms_machines(self.resource_pool.vm_machine_list)
                self.clean_map_master_machines(self.resource_pool.vm_machine_list)
                # Shut down the unregistered VMs over the limit, then the retired ones
                self.clean_kill_unregistered_vms(unregistered)
                self.clean_retired_vms(retired)
                # Same again for retired resources left over from a reconfigure
                unregistered, retired = self.clean_check_diff_vms_machines(self.resource_pool.vm_machine_list, True)
                self.clean_kill_unregistered_vms(unregistered, True)
                self.clean_retired_vms(retired, True)
                # Optionally check for Idle machines that cannot run any jobs
                if config.clean_shutdown_idle:
                    self.clean_verify_vm_job_reqs()
                log.verbose("Attempting to balance VMs")
                self.clean_balance_vms()

            # Move running jobs found among the new jobs over to scheduled
            log.verbose("Syncing job queues")
            self.clean_scheduled_unscheduled()
            # Work out which running scheduled jobs are on what cloud
            self.clean_match_jobs_clouds()
            # Retry clouds that had connection problems where appropriate
            self.check_connection_problems()

            log.verbose("Cleanup waiting %ds..." % self.polling_interval)
            seconds_left = self.polling_interval
            pass_duration = time.time() - pass_started
            log.verbose("Cleanup thread loop time: %s" % str(pass_duration))
            self.heart_beat = time.time()
            while not self.quit and seconds_left > 0:
                time.sleep(1)
                seconds_left -= 1

        log.info("Exiting cleanup thread")

    def clean_invalid_jobs(self):
        """Hold unscheduled jobs that no configured cloud can satisfy.

        A job is rejected when no cluster matches its network and cpu
        architecture, its memory or storage request (when greater than zero),
        or when none of its target clouds is a known cloud. Rejected jobs are
        put on hold; those that could not be held are removed from the job
        container instead.
        """
        known_clouds = set([cloud.name for cloud in self.resource_pool.resources])
        rejected = []
        for job in self.job_pool.job_container.get_unscheduled_jobs():
            network, cpuarch = job.req_network, job.req_cpuarch
            if not self.resource_pool.resourcePF(network, cpuarch):
                rejected.append(job)
                log.debug("#1 network and cpuarch - No cluster fits job %s ignoring" % job.id)
            elif job.req_memory > 0 and \
                    not self.resource_pool.resourcePF(network, cpuarch, memory=job.req_memory):
                rejected.append(job)
                log.debug("#2 memory - No cluster fits job %s ignoring" % job.id)
            elif job.req_storage > 0 and \
                    not self.resource_pool.resourcePF(network, cpuarch, disk=job.req_storage):
                rejected.append(job)
                log.debug("#3 storage - No cluster fits job %s ignoring" % job.id)
            elif job.target_clouds:
                wanted_clouds = self.resource_pool.resolve_target_cloud_alias(job.target_clouds)
                if not set(wanted_clouds).intersection(known_clouds):
                    rejected.append(job)
                    log.debug("No matching target cloud for job %s" % job.id)

        if len(rejected) == 0:
            return
        not_held = self.job_pool.job_hold_local(rejected)
        if not_held and len(not_held) > 0:
            # failed to hold some of these jobs remove from container instead
            self.job_pool.job_container.remove_jobs(not_held)

    def clean_balance_vms_fifo(self):
        """Balancing step used in place of the fairshare one under FIFO.

        Deliberately does nothing: VM shutdowns are handled in the main
        scheduling function, scheduler_fifo().
        """
        return None

    def clean_balance_vms_fairshare(self):
        """Primary balancing function for the fairshare scheduling algorithm.

        Working from the machine list reported by the Condor collector:
          1. removes machines of VM types that exceed what the jobs require,
          2. compares the current VM type distribution with the one the job
             queue asks for and turns the difference into a number of VMs to
             change per type,
          3. compensates for users that are at their limits and damps changes
             that would reverse on the following pass,
          4. shuts down the resulting number of VMs, gracefully or hard
             depending on config.graceful_shutdown.

        Does nothing but log when the collector returned no machines.
        """
        # Count the number of jobs that require a certain VM type,
        # and then destroy the EXCESS VMs in that type TODO: leave a few for spare?
        log.verbose("Gathering required VM types.")
        machineList = self.resource_pool.vm_machine_list
        if not machineList:
            log.verbose("No Machines returned by Condor Collector Query")
            return

        # Remove excess VMs when available VM type exceeds required by jobs.
        required_vmtypes_dict = self.job_pool.get_required_uservmtypes_dict()
        if self.job_pool.last_query:
            available_vmtypes_dict = self.resource_pool.get_uservmtypes_count(machineList)
            log.verbose("Removing excess VMs from the system.")
            to_remove = {}
            for vmtype, count in available_vmtypes_dict.items():
                if vmtype in required_vmtypes_dict:
                    if count > required_vmtypes_dict[vmtype]:
                        to_remove[vmtype] = count - required_vmtypes_dict[vmtype]
                else:
                    # This type no longer needed, remove all remaining
                    to_remove[vmtype] = count
            log.verbose("Will try to remove: %s" % str(to_remove))
            # Go over the types and find idle machines to remove
            self.remove_idle_machines(machineList, to_remove)

        # Balancing Resources
        # Figure how many VMs to add or remove of each type
        current_types = self.resource_pool.vmtype_distribution()
        desired_types = self.job_pool.job_type_distribution()
        # Negative difference means will need to create that type
        diff_types = {}
        for vmtype in current_types:
            if vmtype in desired_types:
                diff_types[vmtype] = current_types[vmtype] - desired_types[vmtype]
            else:
                # This value has gone back and forth between 0 and 1 (last
                # set to 1 on 12/04/30) to handle users with multiple job
                # types; the split vs normal job distribution logic still
                # needs review.
                diff_types[vmtype] = 1
        for vmtype in desired_types:
            if vmtype not in current_types:
                diff_types[vmtype] = -desired_types[vmtype]
        log.verbose("Diff Types Before Limits: %s" % str(diff_types))

        # With user limiting will need to reset any users that are at their limits
        # so they will not interfere with scheduling
        # will need to redistribute negatives to the non-limited users
        userjoblimits = self.job_pool.get_usertype_limits()
        limited_users = []
        for vmusertype in diff_types:
            user = vmusertype.split(':')[0]
            if self.resource_pool.user_at_limit(user):
                # This user at their limit alter their diff_types
                if vmusertype not in limited_users:
                    limited_users.append(vmusertype)
            if vmusertype in userjoblimits:
                if self.resource_pool.uservmtype_at_limit(vmusertype, userjoblimits[vmusertype]):
                    if vmusertype not in limited_users:
                        limited_users.append(vmusertype)
        neg_total = 0
        for usertype in limited_users:
            if diff_types[usertype] < 0:
                neg_total += diff_types[usertype]
        splitby = len(diff_types) - len(limited_users)
        adjustby = 0
        if splitby > 0:
            adjustby = neg_total / splitby
        elif splitby == 0:
            log.verbose("All users are limited.")
        else:
            log.error("More user vmtypes limited than what's in diff types, something weird here.")

        for usertype in diff_types:
            if usertype not in limited_users:
                diff_types[usertype] += adjustby # the 'extra' will be negative so add it
        log.verbose("Diff Types After Limits: %s" % str(diff_types))
        num_to_change = self.clean_determine_num_to_change(diff_types, required_vmtypes_dict)

        # Project the VM counts one step ahead, as if num_to_change had been
        # applied, and work out what the following pass would then ask for.
        next_diff_types = {}
        vmcount = self.resource_pool.get_vmtypes_count_internal()
        for vmtype in vmcount:
            if vmtype in num_to_change:
                vmcount[vmtype] -= num_to_change[vmtype]
        next_types = self.resource_pool.vmtype_distribution(vmcount)
        for vmtype in next_types:
            if vmtype in desired_types:
                next_diff_types[vmtype] = next_types[vmtype] - desired_types[vmtype]
            else:
                next_diff_types[vmtype] = 1  # set to 1 to match the first pass above
        # NOTE(review): this tests against current_types rather than
        # next_types - confirm that is intended.
        for vmtype in desired_types:
            if vmtype not in current_types:
                next_diff_types[vmtype] = -desired_types[vmtype]
        log.verbose("Next Diff Types: %s" % str(next_diff_types))

        # determine new num_to_change based on updated diff_types
        next_num_to_change = self.clean_determine_num_to_change(next_diff_types, required_vmtypes_dict)
        # compare the 2 num_to_change results and try to detect the flipflopping
        for vmtype in next_num_to_change:
            if vmtype in num_to_change:
                planned = num_to_change[vmtype]
                following = next_num_to_change[vmtype]
                if (following < 0 and planned > 0) or (following > 0 and planned < 0):
                    # Has flipped, adjust num_to_change to prevent flipflop
                    num_to_change[vmtype] = planned + following

        log.verbose("Pre-User-Throttling values: %s" % str(num_to_change))
        # Make sure not going to retire too many machines if some are throttled
        adjust_num_total = 0
        total_pos_num = 0
        # NOTE(review): positive entries are only counted for users that have
        # a VM limit configured, while the redistribution below touches every
        # positive entry - confirm that is intended.
        for uservmtype in num_to_change:
            user = uservmtype.split(':', 1)[0]
            if user in self.resource_pool.user_vm_limits:
                if num_to_change[uservmtype] < 0:
                    user_limit = self.resource_pool.user_vm_limits[user]
                    user_vms = self.resource_pool.get_vm_count_user(user)
                    wanted = abs(num_to_change[uservmtype])
                    if user_vms + wanted > user_limit:
                        # The part of the request above the user's limit
                        # cannot be started, so hand it back below.
                        adjust_num_total += wanted - (user_limit - user_vms)
                elif num_to_change[uservmtype] > 0:
                    total_pos_num += 1

        # Take the amount that cannot be started off the positive entries.
        # Every pass removes at least one unit from each entry it touches and
        # never more than is left to hand back, so the loop always terminates
        # (a per-entry share of 0 used to make it spin forever).
        while adjust_num_total > 0 and total_pos_num > 0:
            max_adjust_per_pos = max(1, adjust_num_total // total_pos_num)
            progressed = False
            for uservmtype in num_to_change:
                pending = num_to_change[uservmtype]
                if pending <= 0:
                    continue
                cut = min(pending, max_adjust_per_pos, adjust_num_total)
                num_to_change[uservmtype] = pending - cut
                adjust_num_total -= cut
                progressed = True
                if cut == pending:
                    total_pos_num -= 1
                if adjust_num_total <= 0:
                    break
            if not progressed:
                # No positive entries left to take from.
                break

        log.info("Final(for real) num to change: %s" % str(num_to_change))

        log.verbose("Ready to balance via configured method")
        if config.graceful_shutdown:
            # shutdown using condor_off
            self.graceful_shutdown_condor_off(machineList, num_to_change)
        else:
            # shutdown after the previous job has finished executing
            # interrupts the new running job and will be rescheduled by condor
            self.balance_hard_shutdown(self.resource_pool.vm_machine_list, self.resource_pool.prev_vm_machine_list, num_to_change)

    def clean_determine_num_to_change(self, diff_types, required_vmtypes_dict):
        """Attempts to calculate the number of VMs of each type that CS wants to
        start or shutdown in order to achieve a balanced resource distribution.

        Sign convention of the result: a positive count is how many VMs of that
        type CS is going to try and shut down, a negative count is how many it
        wants to start.

        diff_types            -- dict of vmtype -> fractional difference; each
                                 value is scaled by the current VM count and
                                 rounded to an int.
        required_vmtypes_dict -- dict of vmtype -> upper bound on how many VMs
                                 of that type may be requested (presumably the
                                 number of queued jobs - verify against caller).

        Returns a dict of vmtype -> signed VM count. Every entry is reset to 0
        unless at least one examined type turned out to have no free space.
        """
        num_to_change = {}
        free_space_for_vmtype = {}
        vm_count = self.resource_pool.vm_count()
        for vmtype, val in diff_types.iteritems():
            num_to_change[vmtype] = int(round(val * vm_count))
        log.verbose("Initial num to change: %s adjust for queued jobs." % str(num_to_change))
        excess_diff = 0
        positive_types = []
        # check if there are fewer jobs in queue than trying to adjust VM count by
        # this makes up the excess amount
        for vmtype, val in num_to_change.iteritems():
            if vmtype in required_vmtypes_dict.keys() and val < 0 and abs(val) > required_vmtypes_dict[vmtype]:
                excess_diff += abs(val) - required_vmtypes_dict[vmtype] #-1 # off by 1 ?
                num_to_change[vmtype] = -required_vmtypes_dict[vmtype]
            elif val > 0:
                positive_types.append(vmtype)
        del required_vmtypes_dict
        # redistribute the excess back to the remaining types
        if excess_diff > 0 and len(positive_types) > 0:
            reduce_per_type = int(excess_diff / len(positive_types))
            if reduce_per_type == 0 and len(positive_types) > 1:
                # Excess is smaller than the number of positive types: take one
                # VM off every positive type except the last one.
                reduce_per_type = 1
                positive_types.pop()
            for vmtype in positive_types:
                num_to_change[vmtype] -= reduce_per_type
                if num_to_change[vmtype] < 0:
                    num_to_change[vmtype] = 0

        # Positive num_to_change is how many CS going to try and shutdown
        # Negative is how many it wants to start
        log.verbose("Midway num to change: %s adjusting for free resources." % str(num_to_change))
        pos_change_types = []
        for vmtype, val in num_to_change.iteritems():
            if val > 0:
                pos_change_types.append(vmtype)
        unsched_jobs = self.job_pool.job_container.get_unscheduled_jobs_by_usertype()
        # For every type with unscheduled jobs, the set of clouds that can host
        # the first unscheduled job of that type.
        fits_by_type = {}
        for vmtype in num_to_change.keys():
            if vmtype in unsched_jobs.keys():
                first_job = unsched_jobs[vmtype][0]
                fitting = self.resource_pool.get_fitting_resources(first_job.req_network,  \
                        first_job.req_cpuarch, first_job.req_memory, first_job.req_cpucores, \
                        first_job.req_storage, first_job.req_ami, first_job.req_imageloc, \
                        first_job.target_clouds, first_job.req_hypervisor, first_job.blocked_clouds)
                fits_by_type[vmtype] = set(fitting)
        for vmtype in num_to_change.keys():
            if num_to_change[vmtype] < 0:
                num_fit = 0
                if vmtype in fits_by_type.keys():
                    # Size against THIS type's own job. Every key of
                    # fits_by_type is also a key of unsched_jobs, so the lookup
                    # is safe. Previously the loop variable left over from the
                    # loop above was used, i.e. the job of whichever type
                    # happened to be visited last.
                    job = unsched_jobs[vmtype][0]
                    for cloud in fits_by_type[vmtype]:
                        num_fit_cloud = 0
                        if cloud.__class__.__name__ == "NimbusCluster":
                            if job.req_network in cloud.network_pools:
                                num_fit_cloud = cloud.net_slots[job.req_network]
                            # IF no specified network do something
                            elif not job.req_network:
                                # Need to get total slots on all network pools
                                total_netpool_slots = 0
                                for netpool in cloud.net_slots.keys():
                                    total_netpool_slots += cloud.net_slots[netpool]
                                num_fit_cloud = total_netpool_slots
                        else:
                            num_fit_cloud = cloud.vm_slots
                        # -1 is used below as "no limit from this dimension"
                        if job.req_storage == 0:
                            numstore = -1
                        else:
                            numstore = int(cloud.storageGB / job.req_storage)
                        num_fit_cloud = numstore if numstore < num_fit_cloud and numstore != -1 else num_fit_cloud
                        nummem = 0
                        for mement in cloud.memory:
                            nummem += (mement / job.req_memory)
                        numcpu = 0
                        if cloud.total_cpu_cores and cloud.total_cpu_cores == -1:
                            numcpu = -1
                        else:
                            if job.req_cpucores != 0:
                                numcpu = int(cloud.total_cpu_cores / job.req_cpucores)
                        # Unlimited dimensions must not win the min() below, so
                        # lift them to the largest of the other limits.
                        if numstore < 0:
                            numstore = max(nummem, num_fit_cloud, numcpu)
                        if numcpu < 0:
                            numcpu = max(nummem, num_fit_cloud, numstore)
                        num_fit_cloud = min(nummem, num_fit_cloud, numstore, numcpu)
                        num_fit += num_fit_cloud
                        #log.verbose("Can fit %i VMs on the %s cloud." % (num_fit_cloud, cloud.name))
                    #log.verbose("Can fit %i VMs total." % num_fit)
                    free_space_for_vmtype[vmtype] = True if num_fit > 0 else False
                # Add the number that can fit to what it wants to change
                can_boot = num_to_change[vmtype] + num_fit
                excess = 0
                if can_boot >= 0:
                    # Room for all desired VMs with current resources
                    # Don't need to shutdown others
                    # NOTE(review): num_to_change[vmtype] is negative here, so
                    # excess is <= 0 and the redistribution below is skipped -
                    # confirm that is the intent.
                    excess = num_to_change[vmtype]
                    num_to_change[vmtype] = 0
                else:
                    num_to_change[vmtype] = can_boot
                    excess = num_fit
                if len(pos_change_types) > 0 and excess > 0:
                    # Reduce the shutdown counts of the positive types by the
                    # VMs that can already be booted without shutting anything
                    # down; types fitting on clouds this type cannot use are
                    # preferred.
                    cantuse = set()
                    preftypes = []
                    for postype in pos_change_types:
                        if postype in fits_by_type.keys() and vmtype in fits_by_type.keys():
                            cu = fits_by_type[postype] - fits_by_type[vmtype]
                            if len(cu) > 0:
                                preftypes.append(postype)
                                cantuse.update(cu)
                    if len(cantuse) > 0:
                        adjustby = excess / len(cantuse)
                        if len(preftypes) > 0:
                            for preftype in preftypes:
                                num_to_change[preftype] -= adjustby
                                if num_to_change[preftype] < 0:
                                    num_to_change[preftype] = 0
                    else:
                        adjustby = excess / len(pos_change_types)
                        for postype in pos_change_types:
                            num_to_change[postype] -= adjustby
                            if num_to_change[postype] < 0:
                                    num_to_change[postype] = 0
        log.verbose("End num to change: %s" % str(num_to_change))
        # Only keep the computed changes when some examined type has no free
        # space at all; otherwise leave everything alone for this cycle.
        free_space = True
        for vmtype in free_space_for_vmtype.keys():
            if not free_space_for_vmtype[vmtype]:
                free_space = False
                break
        if free_space:
            log.verbose("There is some free space in the cloud for requested VMs, not changing anything yet.")
            for vmtype in num_to_change.keys():
                num_to_change[vmtype] = 0
        return num_to_change

    def clean_unneeded_vms(self):
        """Retire and destroy VMs whose user:vmtype no remaining job requires.

        A VM is left alone while its type is still required, or while it has an
        idle_start stamp that is not yet older than its keep_alive.
        """
        still_required = self.job_pool.get_required_uservmtypes()
        for cluster in self.resource_pool.resources:
            for vm in reversed(cluster.vms):
                if vm.uservmtype in still_required:
                    continue
                # VMs with an idle stamp must first outlive their keep_alive
                if vm.idle_start and not (int(time.time()) - vm.idle_start > vm.keep_alive):
                    continue
                #if vm.override_status != "Retiring":
                self.resource_pool.force_retire_vm(vm)
                if self.check_destroy(cluster, vm) or cluster.connection_problem:
                    log.verbose("Already a thread for vm: %s" % vm.id)
                    continue
                destroyer = VMDestroyCmd(cluster, vm, reason="VMType %s is no longer required." % vm.vmtype)
                destroyer.start()
                self.destroy_threads["".join([cluster.name, vm.id])] = destroyer

    def clean_scheduled_unscheduled(self):
        """Mark running jobs as scheduled, then unschedule surplus idle jobs.

        Any idle scheduled job whose user:vmtype has no VM slots at all, or
        fewer slots than scheduled jobs counted for it, is unscheduled again so
        that more VMs can be booted for it.
        """
        # The setup lock is held so jobs are not unscheduled during a reconfig.
        with self.resource_pool.setup_lock:
            with self.job_pool.job_container.lock:
                container = self.job_pool.job_container
                for job in container.get_all_jobs():
                    if job.job_status != self.RUNNING:
                        continue
                    self.job_pool.schedule(job)

                slots_by_type = self.resource_pool.get_vmtypes_count_cpu_slots()

                # Count scheduled jobs per type; held, complete, etc are ignored
                demand = defaultdict(int)
                for job in container.get_scheduled_jobs():
                    if job.job_status <= self.RUNNING:
                        demand[job.uservmtype] += 1

                for job in container.get_scheduled_jobs():
                    if job.job_status != self.IDLE:
                        continue
                    usertype = job.uservmtype
                    if usertype in slots_by_type.keys() and not (slots_by_type[usertype] < demand[usertype]):
                        continue
                    self.job_pool.unschedule(job)
                    demand[usertype] -= 1

    def clean_check_diff_vms_machines(self, machineList, retired=False):
        """Compare CS' Running VMs against the condor_status machine list.

        machineList -- condor machine entries, or None to skip the comparison.
        retired     -- when True look at resource_pool.retired_resources
                       instead of resource_pool.resources.

        VMs that match a condor entry get their condor name/address and idle
        stamp refreshed. Returns a tuple (unregisteredvms, retiredvms) holding
        the Running VMs that matched no entry: the ones flagged 'Retiring' go
        into the second list, never-registered ones into the first.
        """
        if retired:
            clusters = self.resource_pool.retired_resources
        else:
            clusters = self.resource_pool.resources
        never_registered = []
        finished_retiring = []
        for cluster in clusters:
            for vm in cluster.vms:
                if vm.status != 'Running' or machineList is None:
                    continue
                for machine in machineList:
                    condor_host = machine.machine_name
                    # startd style names are tried first, then master style names
                    matched = utilities.match_host_with_condor_host(vm.hostname, condor_host) \
                        or utilities.match_host_with_condor_host(vm.alt_hostname, condor_host) \
                        or utilities.match_host_with_condor_host_master(vm.hostname, condor_host) \
                        or utilities.match_host_with_condor_host_master(vm.alt_hostname, condor_host)
                    if not matched:
                        continue
                    if vm.condorname is None:
                        vm.condorname = condor_host
                    if machine.address_startd != "":
                        vm.condoraddr = machine.address_startd
                    if machine.state != "":
                        if vm.idle_start and machine.state == "Claimed":
                            # picked up work again, forget the idle stamp
                            vm.idle_start = None
                        elif not vm.idle_start and machine.activity == 'Idle':
                            vm.idle_start = int(time.time())
                    break
                else:
                    # Running VM with no matching condor entry
                    if vm.override_status == 'Retiring':
                        finished_retiring.append(vm)
                    elif vm.condorname is not None:
                        log.debug("Set CondorName to None - VM %s on Name %s has previously registered with condor but is now missing - caught mid stale refresh?" % (vm.id, vm.condorname))
                        # Temp hack: restart the register time limit and drop the
                        # condor name, so the VM only becomes eligible for shutdown
                        # if it is still missing on a later pass after
                        # condor_register_time_limit.
                        vm.last_state_change = int(time.time())
                        vm.condorname = None
                    else:
                        never_registered.append(vm)
        return never_registered, finished_retiring

    def clean_map_master_machines(self, masterList):
        """Record the condor master address on every Running VM found in masterList.

        Nothing is done when masterList is empty or None. A Running VM without
        a matching master entry is only logged.
        """
        for cluster in self.resource_pool.resources:
            for vm in cluster.vms:
                if vm.status != 'Running' or not masterList:
                    continue
                for master in masterList:
                    if utilities.match_host_with_condor_host_master(vm.hostname, master.machine_name) \
                    or utilities.match_host_with_condor_host_master(vm.alt_hostname, master.machine_name):
                        vm.condormasteraddr = master.address_master
                        break
                else:
                    log.verbose("Could not find Running VM %s in master list, may be Retiring" % vm.id)

    def clean_check_vms_extra_machines(self, machineList):
        """Work out which condor_status entries have no VM inside CS.

        Each machine is looked up by name first and by startd address second.
        The resource pool's missing_vm_condor_machines set is then rebuilt from
        the machines that could not be found either way.
        """
        pool = self.resource_pool
        unknown_machines = []
        for machine in machineList:
            known_vm = pool.find_vm_with_name(machine.machine_name) \
                or pool.find_vm_with_addr(machine.address_startd)
            if not known_vm:
                unknown_machines.append(machine)
        # Update the resource_pools sets of missing / non cs VMs - this is ignoring slot@ currently
        self.resource_pool.missing_vm_condor_machines.clear()
        for machine in unknown_machines:
            log.verbose("VM %s Could not be located within CS, may be lost." % machine.machine_name)
            self.resource_pool.missing_vm_condor_machines.add(machine)
            #if config.retire_missing_vms:
                #if machine.activity != 'Retiring':
                    #for master in masterList:
                        #if master.machine_name == machine.machine_name:
                            #self.resource_pool.do_condor_off(machine.machine_name, machine.address_startd, machine.address_master)

    def clean_kill_unregistered_vms(self, unregisteredvms, retired = False):
        """Destroy unregistered VMs that have exceeded the condor register time limit.

        unregisteredvms -- VMs that did not show up in condor.
        retired         -- when True search resource_pool.retired_resources
                           instead of resource_pool.resources (and do not
                           return resources on destroy).

        A VM whose last_state_change is still -1 only gets the current time
        stamped on it and is looked at again on a later call.
        """
        overdue = []
        for vm in unregisteredvms:
            if vm.last_state_change == -1:
                vm.last_state_change = int(time.time())
                continue
            # now - state change is how long the VM has been 'running'
            if int(time.time()) - vm.last_state_change > config.condor_register_time_limit:
                overdue.append(vm)

        if retired:
            clusters = self.resource_pool.retired_resources
        else:
            clusters = self.resource_pool.resources

        for doomed in overdue:
            for cluster in clusters:
                if cluster.connection_problem:
                    continue
                for vm in cluster.vms:
                    if vm == doomed:
                        cluster.failed_image_set.add(str(vm.image))
                        cluster.vm_destroy(doomed, return_resources=(not retired), reason="Has not registered with Condor after %i seconds." % config.condor_register_time_limit)
                        break
                else:
                    # not hosted on this cluster, try the next one
                    continue
                # destroyed on this cluster, done with this VM
                break

    def clean_kill_start_timeout_vms(self):
        """Checks to see if any VMs have passed the boot_timeout value.
        Tries to stop machines from being stuck in the Starting state but failing to boot.

        Only clusters with a boot_timeout greater than 0 are checked. A VM in
        the "Starting" or "Unpropagated" state whose initialize_time is more
        than boot_timeout seconds ago is destroyed and its image is added to
        the cluster's failed_image_set, unless the cluster currently reports a
        connection problem.
        """
        for cluster in self.resource_pool.resources:
            if not (cluster.boot_timeout > 0):
                continue
            # Walk a snapshot: vm_destroy may remove the VM from cluster.vms
            # (not visible from here), which would make a live iteration skip
            # the entry following each destroyed VM.
            for vm in list(cluster.vms):
                if vm.status != "Starting" and vm.status != "Unpropagated":
                    continue
                if int(time.time() - vm.initialize_time) > cluster.boot_timeout and not cluster.connection_problem:
                    cluster.failed_image_set.add(vm.image)
                    # Report the limit that was actually enforced (the
                    # cluster's own boot_timeout), not a global config value.
                    cluster.vm_destroy(vm, reason="Has not reached Running state after %i seconds." % cluster.boot_timeout)

    def clean_retired_vms(self, retiredvms, retired = False):
        """Destroy every VM in retiredvms that has finished Retiring.

        retired -- when True the VMs are looked up in
                   resource_pool.retired_resources and their resources are not
                   returned on destroy; otherwise resource_pool.resources is
                   searched.

        Clusters reporting a connection problem are skipped.
        """
        for finished in retiredvms:
            pool = self.resource_pool
            clusters = pool.retired_resources if retired else pool.resources
            for cluster in clusters:
                if cluster.connection_problem:
                    continue
                for vm in cluster.vms:
                    if vm == finished:
                        cluster.vm_destroy(finished, return_resources=(not retired), reason="VM Finished Retiring.")
                        break
                else:
                    # VM is not on this cluster, keep searching
                    continue
                # VM was destroyed, move on to the next retired VM
                break

    def clean_match_jobs_clouds(self):
        """Fill in running_cloud / running_vm for running jobs that lack them.

        Only scheduled jobs in the RUNNING state with a remote host and an
        empty running_cloud are looked up in the resource pool.
        """
        for job in self.job_pool.job_container.get_scheduled_jobs():
            if job.job_status != self.RUNNING:
                continue
            if job.running_cloud != "":
                continue
            if not job.remote_host:
                continue
            cluster_match, vm_match = self.resource_pool.find_cluster_with_vm(job.remote_host)
            if cluster_match:
                job.running_cloud = cluster_match.name
                job.running_vm = vm_match

    def remove_idle_machines(self, machineList, to_remove):
        """Checks for idle machines to shutdown that are no longer required.

        machineList -- condor machine (slot) entries; queried through
                       resource_pool.find_in_where.
        to_remove   -- dict of 'user:vmtype' -> number of VMs of that type to
                       try to remove.

        For each type, up to the requested number of Unclaimed/Idle condor
        machines are mapped back to their VM by hostname. Only the VM whose
        hostname matches the condor machine is ever considered for shutdown,
        and only when it belongs to the requested user:vmtype.
        """
        for vmtype, count in to_remove.iteritems():
            log.debug("Attempting to remove %i VMs of type %s" % (count, vmtype))
            criteria = {'vmtype': vmtype.split(':', 1)[1], 'state': 'Unclaimed', 'activity': 'Idle'}
            unused_vms_of_type = self.resource_pool.find_in_where(machineList, criteria)
            num_to_shutdown = 0
            len_unused = len(unused_vms_of_type)
            # Make sure we don't try to shutdown more than is possible
            if len_unused == 0:
                log.debug("Could not find any idle VMs to shutdown")
                criteria = {'vmtype': vmtype.split(':', 1)[1], 'state': 'Claimed', 'activity': 'Busy'}
                busy_vms_of_type = self.resource_pool.find_in_where(machineList, criteria)
                if len(busy_vms_of_type) != 0:
                    log.debug("Looks like some registered VMs are still running jobs")
                else:
                    # NOTE(review): this key is spelled 'VMType' while every
                    # other query uses 'vmtype' - confirm find_in_where
                    # treats them the same.
                    criteria = {'VMType': vmtype.split(':', 1)[1]}
                    any_vms_of_type = self.resource_pool.find_in_where(machineList, criteria)
                    if len(any_vms_of_type) != 0:
                        log.debug("Registered VM in mystery state - maybe Retiring?")
                    else:
                        log.debug("No %s type VMs registered with Condor" % vmtype)
            elif len_unused >= count:
                num_to_shutdown = count
            else:
                num_to_shutdown = len_unused

            for x in range(0, num_to_shutdown):
                condor_name = unused_vms_of_type[x].machine_name
                log.verbose("Name of Condor Machine to shutdown: %s" % condor_name)
                # Track if machine found or not so can break from loop early
                found_vm = False
                for cluster in self.resource_pool.resources:
                    for vm in cluster.vms:
                        if not (utilities.match_host_with_condor_host(vm.hostname, condor_name) \
                                or utilities.match_host_with_condor_host(vm.alt_hostname, condor_name)):
                            continue
                        found_vm = True
                        if vm.uservmtype == vmtype:
                            self._remove_idle_machine_vm(machineList, cluster, vm, condor_name)
                        # The condor machine has been located. Stop here so that
                        # no other VM of this cluster is mistaken for it (the
                        # found flag used to stay set for all following VMs).
                        break
                    if found_vm:
                        log.verbose('vm: %s found matching: %s stopping search. Types %s and %s' % (vm.hostname, condor_name, vm.uservmtype, vmtype))
                        break
                if not found_vm:
                    log.debug("Unable to find Condor Machine %s in VM list" % condor_name)

    def _remove_idle_machine_vm(self, machineList, cluster, vm, condor_name):
        """Retire and destroy one hostname-matched VM if it is really idle.

        The VM is left alone when it is not Running, when any condor slot of
        the machine is not Unclaimed/Idle, or when its keep_alive has not
        elapsed yet. A VM with a keep_alive but no idle stamp only gets the
        stamp set.
        """
        log.verbose("Located %s in VM list." % condor_name)
        # Check that machine isn't still starting and CS has picked up a zombie condor entry
        if vm.status != 'Running' and vm.status != 'RUNNING':
            log.verbose("VM %s still Starting, not going to shutdown" % condor_name)
            return
        # Verify that all slots of this VM are idle
        slots_of_machine = self.resource_pool.find_in_where(machineList, {'machine_name': condor_name})
        if slots_of_machine:
            log.verbose("Machine has %i slots, checking if all idle" % len(slots_of_machine))
        for slot in slots_of_machine:
            if slot.state != 'Unclaimed' or slot.activity != 'Idle':
                log.verbose("VM %s Still has non-idle slots." % condor_name)
                return
        if vm.keep_alive != 0 and not vm.idle_start:
            # First time this VM is seen idle: start the keep_alive clock
            vm.idle_start = int(time.time())
            return
        # Check that enough time has passed to shutdown
        now = int(time.time())
        if vm.keep_alive != 0 and not (now - vm.idle_start > vm.keep_alive):
            log.verbose('waiting on keep_alive: %s current: %s' % (vm.keep_alive, now-vm.idle_start))
            return
        #if vm.override_status != "Retiring":
        self.resource_pool.force_retire_vm(vm)
        if not self.check_destroy(cluster, vm) and not cluster.connection_problem:
            log.verbose("Starting Destroy of VM: %s" % (vm.id))
            th = VMDestroyCmd(cluster, vm, reason="VM %s of type %s no longer required for remaining jobs" % (vm.id, vm.vmtype))
            th.start()
            self.destroy_threads["".join([cluster.name, vm.id])] = th

    def balance_hard_shutdown(self, machineList, prevMachineList, num_to_change):
        """Balance for fairshare scheduling by destroying VMs right away.

        Candidates are VMs whose condor machine changed jobs between
        prevMachineList and machineList, plus Starting VMs of an over-supplied
        type. A candidate is destroyed without waiting for its job when its
        type still has a positive entry in num_to_change and its cluster is one
        that could host the types waiting to start. num_to_change is
        decremented in place for every VM picked.
        """
        log.verbose("Balancing via hard shutdown of VMs - any running jobs will be rescheduled by condor.")
        pool = self.resource_pool
        changed = pool.machine_jobs_changed(machineList, prevMachineList)
        internal_types = pool.get_vmtypes_count_internal()
        changed_job_vms = []
        starting_vms = []
        for cluster in pool.resources:
            for vm in cluster.vms:
                if vm.hostname in changed:
                    changed_job_vms.append(vm)
                if vm.status != "Starting":
                    continue
                if vm.uservmtype in internal_types.keys() and vm.uservmtype in num_to_change.keys() \
                   and internal_types[vm.uservmtype] > num_to_change[vm.uservmtype]:
                    starting_vms.append(vm)
        fitting_set = self.filter_fitting_resources(num_to_change)

        # Starting VMs are considered before VMs that just switched jobs
        to_shutdown = []
        for vm in starting_vms + changed_job_vms:
            if vm.uservmtype not in num_to_change.keys():
                continue
            if not num_to_change[vm.uservmtype] > 0:
                continue
            host_cluster = pool.get_cluster_with_vm(vm)
            if host_cluster and host_cluster in fitting_set:
                to_shutdown.append(vm)
                num_to_change[vm.uservmtype] -= 1

        for vm in to_shutdown:
            cluster = pool.get_cluster_with_vm(vm)
            if not cluster:
                continue
            if self.check_destroy(cluster, vm) or cluster.connection_problem:
                continue
            destroyer = VMDestroyCmd(cluster, vm, reason="Rebalancing VMType %s." % vm.vmtype)
            destroyer.start()
            self.destroy_threads["".join([cluster.name, vm.id])] = destroyer

    def filter_fitting_resources(self, num_to_change):
        """Collect the clusters able to boot VMs for the types that want more.

        For every type with a negative entry in num_to_change that also has
        unscheduled jobs, the requirements of its first unscheduled job are
        handed to resource_pool.get_potential_fitting_resources. Returns the
        set of all clusters found that way.
        """
        waiting_jobs = self.job_pool.job_container.get_unscheduled_jobs_by_usertype()

        candidates = []
        for vmtype, val in num_to_change.iteritems():
            if not val < 0:
                continue
            if vmtype not in waiting_jobs.keys():
                continue
            job = waiting_jobs[vmtype][0]
            candidates.extend(self.resource_pool.get_potential_fitting_resources(
                job.req_network,
                job.req_cpuarch,
                job.req_memory,
                job.req_storage,
                job.target_clouds,
                job.req_hypervisor,
                job.req_cpucores,
                job.blocked_clouds))
        return set(candidates)

    def want_to_balance(self, num_to_change):
        """Return True when balancing is necessary, i.e. when at least one
        entry of num_to_change is negative; False otherwise."""
        for vmtype in num_to_change.keys():
            if num_to_change[vmtype] < 0:
                return True
        return False

    def graceful_shutdown_condor_off(self, machineList, num_to_change):
        """Suggested method for balancing in the fairshare scheduling method.
        makes use of condor_off to put machines into a Retiring state so they will not
        accept new jobs once their current job finishes execution."""
        log.verbose("Balancing via condor_off, machines will be shutdown once finished current job.")
        log.verbose("Num to change: %s" % str(num_to_change))
        fitting_set = self.filter_fitting_resources(num_to_change)
        internal_vms = self.resource_pool.get_vmtypes_count_internal()
        sched_jobs = self.job_pool.job_container.get_scheduled_jobs_by_usertype()
        unsched_jobs = self.job_pool.job_container.get_unscheduled_jobs_by_usertype()
        for vmtype, val in num_to_change.iteritems():
            if val == 0:
                continue # nothing to do here
            retiring_vms_of_type = self.resource_pool.retiring_vms_of_usertype(vmtype)
            num_force = 0
            for vm in reversed(retiring_vms_of_type):
                if vm.force_retire:
                    num_force += 1
                    retiring_vms_of_type.remove(vm)
            adjusted_val = val - len(retiring_vms_of_type)
            if adjusted_val > 0:
                criteria = {'vmtype': vmtype.split(':', 1)[1]}
                tot_vms_of_type = self.resource_pool.find_in_where(machineList, criteria)
                if vmtype in internal_vms.keys() and internal_vms[vmtype] > len(tot_vms_of_type):
                    # There are Starting or Unregistered VMs of this type - may not want to Retire yet
                    registered_diff = internal_vms[vmtype] - len(tot_vms_of_type) - num_force
                    # have registered_diff starting/error machines - shutdown those instead of running ones
                    if registered_diff > 0:
                        starting_vms_of_type = self.resource_pool.get_starting_of_usertype(vmtype)
                        error_vms_of_type = self.resource_pool.get_error_of_usertype(vmtype)
                        num_error_to_destroy = 0
                        num_start_to_destroy = 0
                        # Shutdown VMs in the error state first
                        if registered_diff > adjusted_val:
                            num_error_to_destroy = adjusted_val
                        else:
                            num_error_to_destroy = registered_diff
                        successful_destroy = 0
                        for start_vm in islice(reversed(error_vms_of_type), num_error_to_destroy):
                            vm_in_cluster = self.resource_pool.get_cluster_with_vm(start_vm)
                            if vm_in_cluster != None and vm_in_cluster in fitting_set and not vm_in_cluster.connection_problem:
                                destroy_ret = vm_in_cluster.vm_destroy(start_vm, reason="Trying to speed up balancing by shutting down Error state VM.")
                                if destroy_ret == 0:
                                    successful_destroy += 1
                                elif destroy_ret != 0:
                                    log.error("Failed to destroy vm %s" % start_vm.id)
                        adjusted_val -= successful_destroy
                        if adjusted_val <= 0:
                            continue
                        # See if still need to shutdown any and do the starting ones next
                        if registered_diff > adjusted_val:
                            num_start_to_destroy = adjusted_val
                        else:
                            num_start_to_destroy = registered_diff
                        successful_destroy = 0
                        for start_vm in islice(reversed(starting_vms_of_type), num_start_to_destroy):
                            # Since the machine could not retire make sure not to destroy the last VM of type
                            if vmtype in internal_vms.keys() and vmtype in sched_jobs.keys() and (internal_vms[vmtype] <= len(sched_jobs[vmtype])+1) and vmtype not in unsched_jobs.keys():
                                # fewer or equal vms left that trying to shutdown
                                continue
                            vm_in_cluster = self.resource_pool.get_cluster_with_vm(start_vm)
                            if vm_in_cluster != None and vm_in_cluster in fitting_set and not vm_in_cluster.connection_problem:
                                destroy_ret = vm_in_cluster.vm_destroy(start_vm, reason="Trying to speed up balancing by shutting down Starting VM.")
                                if destroy_ret == 0:
                                    successful_destroy += 1
                                elif destroy_ret != 0:
                                    log.error("Failed to destroy vm %s" % start_vm.id)
                        adjusted_val -= successful_destroy
                        if adjusted_val <= 0:
                            continue
                criteria = {'vmtype': vmtype.split(':', 1)[1], 'activity': 'Busy'}
                busy_vms = self.resource_pool.find_in_where(machineList, criteria)
                for busy_vm in reversed(busy_vms):
                    vm_in_cluster = None
                    vm = self.resource_pool.find_vm_with_name(busy_vm.machine_name)
                    vm_in_cluster = self.resource_pool.get_cluster_with_vm(vm)
                    if vm_in_cluster != None and vm_in_cluster not in fitting_set:
                        busy_vms.remove(busy_vm)
                if len(busy_vms) < adjusted_val:
                    adjusted_val = len(busy_vms)
                for x in range(0, adjusted_val):
                    retired_vm = self.resource_pool.find_vm_with_name(busy_vms[x].machine_name)
                    if retired_vm != None and retired_vm.override_status != 'Retiring':
                        (_, ret2, _, ret22) = self.resource_pool.do_condor_off(busy_vms[x].machine_name, busy_vms[x].address_startd, busy_vms[x].address_master)
                        if ret2 == 0 and ret22 == 0:
                            retired_vm.override_status = 'Retiring'
                        else:
                            # Since the machine could not retire make sure not to destroy the last VM of type
                            if internal_vms[vmtype] <= adjusted_val:
                                # fewer or equal vms left that trying to shutdown
                                if internal_vms[vmtype] - x-1 <= 0:
                                    continue
                            # Unable to use condor_off on this machine for some reason - address(es) are bad?
                            bad_name_vm = self.resource_pool.find_vm_with_addr(busy_vms[x].address_startd)
                            if bad_name_vm != None:
                                log.debug("Bad Addresses for VM: %s, Startd: %s, Master: %s" % (bad_name_vm.condorname, bad_name_vm.condoraddr, bad_name_vm.condormasteraddr))
                                cluster = self.resource_pool.get_cluster_with_vm(bad_name_vm)
                                if cluster and not cluster.connection_problem:
                                    destroy_ret = cluster.vm_destroy(bad_name_vm, reason="Unable to Retire VM %s due to invalid Condor Name - Forcing Shutdown - any running jobs will be evicted and rescheduled" % bad_name_vm.id)
                                    if destroy_ret != 0:
                                        log.error("Failed to destroy vm %s" % bad_name_vm.id)
                                else:
                                    log.warning("cluster lookup failed for vm %s" % bad_name_vm.id)
                            else:
                                log.error("Lookup of %s failed, does this vm still exist within CS?" % busy_vms[x].name)
                                bad_addr_vm = self.resource_pool.find_vm_with_name(busy_vms[x].machine_name)
                                if bad_addr_vm:
                                    log.verbose("Found it via name: it think it's address is %s" % bad_addr_vm.condoraddr)

    def clean_retire_near_lifetime(self):
        """Retire VMs that are getting close to their cluster's maximum lifetime.

        For every cluster with a non-zero ``vm_lifetime``, each VM that has a
        startup time is examined. When the VM's average job run time, scaled by
        ``config.retire_before_lifetime_factor``, exceeds the lifetime the VM
        has left, the next job may not finish before the cloud shuts the VM
        down, so the VM is retired with condor_off and flagged as 'Retiring'.

        An AttributeError raised while handling a cluster (for example a
        non-Nimbus cluster that has no ``vm_lifetime``) ends the handling of
        that cluster only; the remaining clusters are still processed.
        """
        for cluster in self.resource_pool.resources:
            try:
                if not cluster.vm_lifetime:
                    continue
                for vm in cluster.vms:
                    if not vm.startup_time:
                        continue
                    # Cost of one more job, padded by the configured safety factor.
                    projected_job_time = vm.job_run_times.average() * config.retire_before_lifetime_factor
                    # vm_lifetime*60 is compared with seconds, so vm_lifetime is presumably minutes.
                    lifetime_left = cluster.vm_lifetime*60 - (time.time() - vm.initialize_time)
                    if projected_job_time > lifetime_left and not vm.force_retire:
                        # Next job submitted to this VM may not finish before the VM is shut down.
                        (_, first_ret, _, second_ret) = self.resource_pool.do_condor_off(vm.condorname, vm.condoraddr, vm.condormasteraddr)
                        if first_ret == 0 and second_ret == 0:
                            vm.force_retire = True
                            vm.override_status = 'Retiring'
                        else:
                            log.warning("Unable to retire VM, possibly due to condor name %s" % vm.condorname)
            except AttributeError:
                # No vm lifetime for non-Nimbus clouds; move on to the next cluster.
                continue

    def check_destroy(self, cluster, vm):
        """Tell whether starting a new destroy thread for this VM must be skipped.

        Returns True when the number of tracked destroy threads has reached
        config.max_destroy_threads, or when a destroy thread keyed by the
        cluster name followed by the VM id is already tracked. Returns False
        when a new destroy thread may be launched. Both conditions are always
        evaluated, so both log messages can be emitted in one call.
        """
        at_thread_limit = len(self.destroy_threads) >= config.max_destroy_threads
        if at_thread_limit:
            log.debug("At Maximum destroy threads. Unable to shutdown now.")

        # Destroy threads are keyed by the cluster name immediately followed by the VM id.
        thread_key = "".join([cluster.name, vm.id])
        already_tracked = thread_key in self.destroy_threads
        if already_tracked:
            log.verbose("Already found a destroy thread for %s." % vm.hostname)

        return already_tracked or at_thread_limit

    def check_destroy_threads(self):
        """Reap the destroy request threads that have finished.

        Every tracked thread that is no longer alive is joined and dropped
        from ``self.destroy_threads``; a non-zero result is logged as an error
        and the VM is left alone.
        """
        finished_keys = []
        for key, worker in self.destroy_threads.iteritems():
            if worker.is_alive():
                continue
            if worker.get_result() != 0:
                log.error("Destroying VM %s failed. Leaving it for now." % worker.get_vm().id)
            worker.join(timeout=1)
            finished_keys.append(key)

        # Entries are removed only after the iteration so the dict is not
        # modified while it is being walked.
        for key in finished_keys:
            del self.destroy_threads[key]

    def check_vm_proxy_shutdown_threshold(self):
        """Shut down VMs whose proxy could not be renewed before the threshold.

        Any VM that reports ``needs_proxy_shutdown()`` gets a VMDestroyCmd
        thread, so that it does not end up in an ExpiredProxy state. No thread
        is started when check_destroy() refuses one or when the cluster has a
        connection problem.
        """
        for cluster in self.resource_pool.resources:
            for vm in cluster.vms:
                if not vm.needs_proxy_shutdown():
                    continue
                # check_destroy() is consulted first, exactly as before, so its
                # log output is unchanged.
                if self.check_destroy(cluster, vm) or cluster.connection_problem:
                    continue
                destroyer = VMDestroyCmd(cluster, vm, reason="Passed proxy expiry threshold.")
                destroyer.start()
                self.destroy_threads["".join([cluster.name, vm.id])] = destroyer

    def clean_verify_vm_job_reqs(self):
        """Handle idle VMs and idle jobs that never get matched to each other.

        This targets the case where a user entered inconsistent VM and job
        requirements, leaving machines and jobs idle although a VM looks able
        to run the job.

        For every Unclaimed/Idle machine entry that has been idle longer than
        config.vm_idle_threshold:
          * the matching internal VM is looked up by startd address, then by
            machine name;
          * all slots of that VM must themselves be idle past the threshold;
          * the owning user is parsed from the machine's Start requirement;
          * if none of that user's idle jobs of the VM's type fits the VM,
            the VM is queued for shutdown; otherwise the fitting job(s) are
            queued to be held with override status 'HeldBadReqs'.

        Queued VMs are then retired (or destroyed when retiring fails), unless
        their keep_alive period has not expired yet.
        """
        def first_runnable_idle_job(jobs_by_type, target_vm):
            """Return the first idle job of target_vm's type that fits it, else None."""
            for job in jobs_by_type.get(target_vm.vmtype, []):
                if job.job_status == self.IDLE and self.check_vm_job_reqs(target_vm, job):
                    return job
            return None

        criteria = {'state': 'Unclaimed', 'activity': 'Idle'}
        idle_vms = self.resource_pool.find_in_where(self.resource_pool.vm_machine_list, criteria)
        to_shutdown = []
        to_hold = set()
        for vm in idle_vms:
            try:
                idletime = int(vm.current_time) - int(vm.entered_state_time)
                if idletime <= config.vm_idle_threshold:
                    continue

                internal_vm = self.resource_pool.find_vm_with_addr(vm.address_startd)
                if not internal_vm:
                    internal_vm = self.resource_pool.find_vm_with_name(vm.machine_name)
                if not internal_vm:
                    log.warning("Could not locate %s with address %s in CS, maybe machinelist out-of-date of has not been matched yet?" % (vm.name,vm.address_startd ))
                    continue

                # Verify that all slots of this VM are idle (looked up under
                # both the hostname and the alternate hostname).
                slots_of_machine = self.resource_pool.find_in_where(self.resource_pool.machine_list, {'machine_name': internal_vm.hostname})
                slots_of_machine.extend(self.resource_pool.find_in_where(self.resource_pool.machine_list, {'machine_name': internal_vm.alt_hostname}))
                all_slots_idle = True
                for slot in slots_of_machine:
                    # A slot counts as idle only when it is Unclaimed/Idle AND
                    # has itself been idle longer than the threshold. The
                    # previous code re-tested the outer machine entry's idle
                    # time with '>', which is always true at this point and
                    # therefore rejected every VM that had at least one slot.
                    if slot.state != 'Unclaimed' or slot.activity != 'Idle' or (int(slot.current_time) - int(slot.entered_state_time) <= config.vm_idle_threshold):
                        all_slots_idle = False
                        break
                if not all_slots_idle:
                    log.debug("VM %s Still has non-idle slots." % vm.name)
                    continue

                # The owning user is taken from the Start expression: Owner == "user".
                try:
                    vmuser = re.search(r'(?<=Owner == ")\w+', vm.start_req).group(0)
                except (AttributeError, TypeError):
                    # AttributeError: no match (None.group); TypeError: start_req is not a string.
                    if vm.start_req:
                        log.warning("Failed to parse Start: (Owner=user) Start = %s, For %s, VMID: %s." % (vm.start_req, internal_vm.hostname, internal_vm.id))
                    else:
                        log.warning("Failed to parse Start: (Owner=user) from vm. No Start attrib. For %s, VMID: %s." % (internal_vm.hostname, internal_vm.id))
                    continue

                unsched_jobs_by_type = self.job_pool.job_container.get_unscheduled_user_jobs_by_type(vmuser)
                sched_jobs_by_type = self.job_pool.job_container.get_scheduled_user_jobs_by_type(vmuser)
                unschedjob_to_hold = first_runnable_idle_job(unsched_jobs_by_type, internal_vm)
                schedjob_to_hold = first_runnable_idle_job(sched_jobs_by_type, internal_vm)

                if unschedjob_to_hold is None and schedjob_to_hold is None:
                    # Nothing this user has queued can run here: shut the VM down.
                    log.verbose("Going to shutdown vm %s, no jobs appear to be able to run there" % internal_vm.id)
                    to_shutdown.append(internal_vm)
                else:
                    # Even weirder state: an idle VM plus a job that should run
                    # on it but does not. Hold the job(s).
                    for stuck_job in (unschedjob_to_hold, schedjob_to_hold):
                        if stuck_job:
                            to_hold.add(stuck_job)
                            log.debug("job %s looks like it should be able to run on an idle machine but is not, will hold it." % stuck_job.id)
            except Exception as e:
                log.warning("Exception: %s" % str(e))

        if to_hold:
            for job in to_hold:
                job.override_status = 'HeldBadReqs'
            failedhold = self.job_pool.job_hold_local(list(to_hold))
            if failedhold and len(failedhold) > 0:
                log.debug("Failed to hold %i jobs" % len(failedhold))

        for vm in to_shutdown:
            # Respect vm.keep_alive: leave the VM alone until it has been idle
            # for longer than its keep_alive period.
            if vm.keep_alive > 0 and vm.idle_start and not (int(time.time()) - vm.idle_start > vm.keep_alive):
                continue
            cluster = self.resource_pool.get_cluster_with_vm(vm)
            if not cluster or vm.override_status == "Retiring":
                continue
            if self.resource_pool.force_retire_vm(vm):
                continue
            # Retiring failed: fall back to destroying the VM outright.
            if not self.check_destroy(cluster, vm) and not cluster.connection_problem:
                th = VMDestroyCmd(cluster, vm, reason="Unable to run any idle jobs due to resource config.")
                th.start()
                self.destroy_threads["".join([cluster.name, vm.id])] = th

    def check_vm_job_reqs(self, vm, job):
        """Return True when the VM's attributes satisfy everything the job asks for.

        The VM must offer at least the requested memory, cpu cores and storage,
        its network must be contained in the job's requested network, its cpu
        architecture and user VM type must equal the job's, and its image must
        equal either the job's image location or the job's AMI. The checks run
        in that fixed order and stop at the first failure.
        """
        if not vm.memory >= job.req_memory:
            return False
        if vm.network not in job.req_network:
            return False
        if not vm.cpuarch == job.req_cpuarch:
            return False
        if not vm.cpucores >= job.req_cpucores:
            return False
        if not vm.storage >= job.req_storage:
            return False
        if not (vm.image == job.req_imageloc or vm.image == job.req_ami):
            return False
        if not vm.uservmtype == job.uservmtype:
            return False
        return True

    def check_connection_problems(self):
        """Re-enable clusters whose connection problem is old enough.

        A cluster flagged with ``connection_problem`` is cleared and enabled
        again once the current time, minus config.connection_fail_disable_time
        and config.vm_connection_fail_threshold, is later than the cluster's
        ``errorconnect`` timestamp.
        """
        for cluster in self.resource_pool.resources:
            if not cluster.connection_problem:
                continue
            earliest_retry_mark = time.time() - config.connection_fail_disable_time - config.vm_connection_fail_threshold
            if earliest_retry_mark > cluster.errorconnect:
                cluster.connection_problem = False
                cluster.enabled = True

class GetClouds(threading.Thread):
    """
    GetClouds - Service thread meant to periodically sync the cluster
                resources with a redis store that is updated by the
                cloud-aggregator monitoring package.

    The sync code itself is currently commented out (the getCloudsClient
    import is disabled at the top of the file), so each cycle only logs,
    refreshes ``heart_beat`` and sleeps for ``polling_interval`` seconds.
    """

    def __init__(self, resource_pool):
        super(GetClouds, self).__init__(name=type(self).__name__)
        self.resource_pool = resource_pool
        # Set by stop(); polled by run() once per second.
        self.quit = False
        # Seconds to wait between two sync cycles.
        self.polling_interval = 10
        # Time of the last completed cycle, read by the supervision loop in main().
        self.heart_beat = time.time()
        #self.getclouds = getCloudsClient()

    def stop(self):
        """Ask the polling loop to finish; run() notices within about a second."""
        log.debug("Waiting for getclouds loop to end")
        self.quit = True

    def run(self):
        """Thread body: repeat the (currently disabled) sync until stop() is called."""
        log.info("Starting getclouds Thread...")

        while not self.quit:
            log.verbose("fetching information from cloud-aggregator")
            #cloud_info = self.getclouds.getCloudsView()
            #for cloud in cloud_info:
                ## Find cluster that matches this cloud
                #for cluster in self.resource_pool.resources:
                    #if cloud['Service']['HostName'] == cluster.network_address.split('.')[0]:
                        ## Found it
                        #slots = 0
                        #for pool in cloud['NetworkPools']:
                            #slots += int(pool['AvailableIPs'])
                        #cluster.vm_slots = slots
                        #break

            log.verbose("getclouds waiting %ds..." % self.polling_interval)
            seconds_left = self.polling_interval
            self.heart_beat = time.time()
            # Sleep in one-second slices so that a stop() request is honoured quickly.
            while seconds_left > 0 and not self.quit:
                time.sleep(1)
                seconds_left -= 1

        log.debug("Exiting getclouds thread")
##
## Functions
##
def main():
    """Main function of Cloud Scheduler: set everything up, then supervise.

    In order, this
      * parses the command line and loads the configuration,
      * installs the logging handlers selected by the configuration,
      * creates the job pool and the cloud resource pool,
      * creates the service threads (pollers, scheduler, cleanup and the
        optional proxy refreshers / GetClouds) and the info and admin RPC
        server threads,
      * installs the SIGTERM, SIGUSR1 and SIGUSR2 handlers,
      * starts all threads and watches the service threads until one of them
        dies or an exit is requested (SystemExit or KeyboardInterrupt),
      * stops and joins every thread and finally calls sys.exit().

    Exits with status 1 when neither a cloud resource config nor an MDS
    server was supplied.
    """
    # Create a parser and process commandline arguments
    version_str = "Cloud Scheduler " + version.version
    parser = OptionParser(version=version_str)
    set_options(parser)
    (cli_options, args) = parser.parse_args()

    # Look for global configuration file, and initialize config
    if cli_options.config_file:
        config.setup(path=cli_options.config_file)
    else:
        config.setup()

    # Set up logging
    logging._srcfile = None
    logging.logProcesses = 0
    log.setLevel(utilities.LEVELS[config.log_level])
    log_formatter = logging.Formatter(config.log_format)
    if config.log_stdout:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(log_formatter)
        log.addHandler(stream_handler)

    if config.log_location:
        if config.log_max_size:
            # NOTE(review): no backupCount is given, so it stays at its default
            # of 0; the logging documentation states that rollover never occurs
            # when backupCount is zero. Confirm whether a backup count should
            # be configured here.
            file_handler = logging.handlers.RotatingFileHandler(
                                            config.log_location,
                                            maxBytes=config.log_max_size)
        else:
            try:
                file_handler = logging.handlers.WatchedFileHandler(
                                            config.log_location,)
            except AttributeError:
                # Python 2.5 doesn't support WatchedFileHandler
                file_handler = logging.handlers.RotatingFileHandler(
                                            config.log_location,)

        file_handler.setFormatter(log_formatter)
        log.addHandler(file_handler)

    if not config.log_location and not config.log_stdout:
        null_handler = utilities.NullHandler()
        log.addHandler(null_handler)

    # Log entry message (for timestamp in log)
    log.info("Cloud Scheduler starting...")
    if config.log_level == 'VERBOSE':
        log.warning("WARNING - using VERBOSE logging will result is poor performance with more than a few hundred jobs in the condor queue")
    if config.log_level == 'DEBUG':
        log.warning("WARNING - using DEBUG logging can result in poor performance with more than a few thousand jobs in the condor queue")

    # Log the config
    log.info(get_cloud_scheduler_config_output())

    # Command line options take precedence, so replace config file
    # option with command line option
    if cli_options.cloud_conffile:
        config.cloud_resource_config = cli_options.cloud_conffile

    # If neither the cloud conffile nor the MDS server is passed to obtain
    # initial cluster information, print usage and exit the system.
    if (not config.cloud_resource_config) and (not cli_options.mds_server):
        # Written through sys.stdout so the line is valid on any Python version.
        sys.stdout.write("ERROR - main - No cloud or cluster information sources provided\n")
        parser.print_help()
        sys.exit(1)

    # Create a job pool
    job_pool = job_management.JobPool("Job Pool")

    # Create a resource pool
    cloud_resources = cloud_management.ResourcePool(config.cloud_resource_config)

    # Log the resource pool
    cloud_resources.log_pool()

    # We maintain two lists of threads, service and info. Service threads
    # are necessary for cloud scheduler to actually do anything, and the
    # info threads are to give the user information about what's going on.
    service_threads = []
    info_threads = []

    # Create the Job Polling thread
    job_poller = JobPoller(job_pool)
    service_threads.append(job_poller)

    # Create the Machine Polling thread
    machine_poller = MachinePoller(cloud_resources)
    service_threads.append(machine_poller)

    # Create the VM Polling thread
    vm_poller = VMPoller(cloud_resources, job_pool)
    service_threads.append(vm_poller)

    # Create the Scheduling thread
    scheduler = Scheduler(cloud_resources, job_pool)
    service_threads.append(scheduler)

    # Create the Cleanup Thread
    cleaner = Cleanup(cloud_resources, job_pool)
    service_threads.append(cleaner)

    # Create the JobProxyRefresher thread, if needed
    if config.job_proxy_refresher_interval != -1:
        job_proxy_refresher_thread = proxy_refreshers.JobProxyRefresher(job_pool)
        service_threads.append(job_proxy_refresher_thread)
    else:
        log.debug('Job proxy refresher thread not enabled.')

    # Create the VMProxyRefresher thread, if needed
    if config.vm_proxy_refresher_interval != -1:
        vm_proxy_refresher_thread = proxy_refreshers.VMProxyRefresher(cloud_resources)
        service_threads.append(vm_proxy_refresher_thread)
    else:
        log.debug('VM proxy refresher thread not enabled.')

    # Create the GetClouds Thread if wanted
    if config.getclouds:
        getclouds = GetClouds(cloud_resources)
        service_threads.append(getclouds)

    # Start the cloud scheduler info server for RPCs
    info_serv = info_server.InfoServer(cloud_resources, job_pool, job_poller, machine_poller, vm_poller, scheduler, cleaner)
    info_serv.daemon = True
    info_threads.append(info_serv)

    # Start the cloud scheduler admin server for RPCs
    admin_serv = admin_server.AdminServer(cloud_resources, job_pool, job_poller, machine_poller, vm_poller, scheduler, cleaner)
    admin_serv.daemon = True
    info_threads.append(admin_serv)

    # Set SIGTERM (kill) handler
    signal.signal(signal.SIGTERM, term_handler)

    # Set SIGUSR1 (reconfig) handler
    reconfig_handler = make_reconfig_handler(cloud_resources)
    signal.signal(signal.SIGUSR1, reconfig_handler)

    # Set SIGUSR2 (reload_ban) handler
    #reload_ban_handler = make_banned_job_fileload_handler(cloud_resources)
    #signal.signal(signal.SIGUSR2, reload_ban_handler)

    # Set SIGUSR2 (quick_exit) handler
    quick_exit_handler = make_quick_exit_handler(scheduler)
    signal.signal(signal.SIGUSR2, quick_exit_handler)

    # Start all the threads
    for thread in info_threads:
        thread.start()

    for thread in service_threads:
        thread.start()

    should_be_running = True

    # Supervise the service threads until one dies or we are asked to exit.
    # term_handler raises SystemExit in this (main) thread, which lands in
    # the except clause below just like a keyboard interrupt.
    try:
        die = False
        while not die:
            now = time.time()
            for thread in service_threads:
                if not thread.is_alive():
                    log.error("%s thread died!" % thread.name)
                    die = True
                if now - thread.heart_beat > 1800: # 30 minutes no update - make configureable
                    # Thread has become unresponsive
                    pass
            # Pause once per pass over all threads. Sleeping once per thread,
            # as before, delayed the detection of a dead thread by one second
            # for every service thread.
            if not die:
                time.sleep(1)
    except (SystemExit, KeyboardInterrupt):
        log.info("Caught a signal that someone wants me to quit!")
        should_be_running = False

    if should_be_running:
        log.error("Whoops. Wasn't expecting to exit. Did a thread crash?")
        scheduler.quick_exit = True

    log.info("Cloud Scheduler quitting normally. (It might take a while, don't panic!)")

    # Kill all the service threads, then the info_server
    for thread in service_threads:
        thread.stop()

    for thread in service_threads:
        thread.join()

    for thread in info_threads:
        thread.stop()

    for thread in info_threads:
        thread.join()

    log.info("Cloud Scheduler stopped. Bye!")

    sys.exit()


def term_handler(signal, handler):
    """Custom SIGTERM handler: log the signal, then request interpreter exit.

    Both arguments are supplied by the signal machinery and are not used.
    sys.exit() raises SystemExit in the thread running the handler.
    """
    log.info("Recieved SIGTERM signal")
    # No exit status is passed, so the default (successful) status is used.
    sys.exit()

def make_reconfig_handler(resource_pool):
    """
    make_reconfig_handler - build the SIGUSR1 signal handler bound to the
                            passed ResourcePool object

    Reconfiguration is currently disabled: the returned handler only logs
    that fact and does not touch resource_pool.
    """
    def reconfig_handler(signum, frame):
        # Both arguments come from the signal machinery and are unused.
        log.info("Recieved SIGUSR1 (reconfig) signal. Reloading resources file...")
        log.warning("Reconfig disabled, use quickrestart.")
        #resource_pool.setup()

    return reconfig_handler

def make_banned_job_fileload_handler(resource_pool):
    """
    make_banned_job_fileload_handler - build a signal handler that makes the
                                       passed ResourcePool reload its banned
                                       job file
    """
    def reload_ban_handler(signum, frame):
        # Both arguments come from the signal machinery and are unused.
        log.info("Recieved SIGUSR2 (reload bans) signal, Reloading banned jobs file...")
        resource_pool.load_banned_job_resource()

    return reload_ban_handler


def make_quick_exit_handler(scheduler):
    """
    make_quick_exit_handler - build a signal handler that toggles the quick
                              exit flag of the passed scheduler
    """
    def quick_exit_handler(signum, frame):
        # Both arguments come from the signal machinery and are unused.
        log.info("Recieved SIGUSR2 (quick_exit) signal, Setting quick exit flag...")
        scheduler.toggle_quick_exit()

    return quick_exit_handler

def set_options(parser):
    """Register Cloud Scheduler's command-line options on an optparse OptionParser.

    Adds -f/--config-file, -c/--cloud-config and -m/--MDS. All three use the
    optparse defaults (action=store, type=string) and have no default value.
    """
    option_table = (
        (("-f", "--config-file"),
         {"dest": "config_file",
          "metavar": "FILE",
          "help": "Designate a config file for Cloud Scheduler"}),
        (("-c", "--cloud-config"),
         {"dest": "cloud_conffile",
          "metavar": "FILE",
          "help": "Designate a config file from which cloud cluster "
                  "information is obtained"}),
        (("-m", "--MDS"),
         {"dest": "mds_server",
          "metavar": "SERVER",
          "help": "Designate an MDS server from which cloud cluster "
                  "information is obtained"}),
    )
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

def get_cloud_scheduler_config_output():
    """Return every public attribute of the config module as one string.

    Each attribute whose name does not start with an underscore is rendered
    as "name: value, " (the separator also follows the last entry), in the
    order produced by dir(config).
    """
    public_names = [name for name in dir(config) if not name.startswith('_')]
    rendered = [name + ': ' + str(getattr(config, name)) + ', '
                for name in public_names]
    return "".join(rendered)
##
## Main Functionality
##

# Only start the scheduler when this file is executed as a script, so that
# merely importing it does not launch all the threads.
if __name__ == "__main__":
    main()
