X-Git-Url: http://git.onelab.eu/?p=nodemanager.git;a=blobdiff_plain;f=nodemanager.py;h=0fefeae566308eb3a0c81d86cf43a45c5f9dcc2c;hp=0cf1dc6e18d865d9ea7c0faaf193e5e86837da60;hb=HEAD;hpb=2306328a4623d578e93bd59982623908eb8742dd

diff --git a/nodemanager.py b/nodemanager.py
index 0cf1dc6..0fefeae 100755
--- a/nodemanager.py
+++ b/nodemanager.py
@@ -1,7 +1,4 @@
-#!/usr/bin/python
-#
-# $Id$
-# $URL$
+#!/usr/bin/python3
 #
 # Useful information can be found at https://svn.planet-lab.org/wiki/NodeManager
 #
@@ -10,207 +7,309 @@
 # Copyright (C) 2008 The Trustees of Princeton University
 
-"""Node Manager"""
+"""
+Node Manager
+"""
 
-import optparse
 import time
-import xmlrpclib
-import socket
 import os
 import sys
-import resource
 import glob
+import pickle
+import random
+from argparse import ArgumentParser
 
 import logger
 import tools
 from config import Config
-from plcapi import PLCAPI
-import random
-
-id="$Id$"
-savedargv = sys.argv[:]
-
-# NOTE: modules listed here should also be loaded in this order
-# see the priority set in each module - lower comes first
-known_modules=['net','conf_files', 'sm', 'bwmon']
-
-plugin_path = "/usr/share/NodeManager/plugins"
-
-default_period=600
-default_random=301
-
-parser = optparse.OptionParser()
-parser.add_option('-d', '--daemon', action='store_true', dest='daemon', default=False, help='run daemonized')
-parser.add_option('-s', '--startup', action='store_true', dest='startup', default=False, help='run all sliver startup scripts')
-parser.add_option('-f', '--config', action='store', dest='config', default='/etc/planetlab/plc_config', help='PLC configuration file')
-parser.add_option('-k', '--session', action='store', dest='session', default='/etc/planetlab/session', help='API session key (or file)')
-parser.add_option('-p', '--period', action='store', dest='period', default=default_period,
-                  help='Polling interval (sec) - default %d'%default_period)
-parser.add_option('-r', '--random', action='store', dest='random', default=default_random,
-                  help='Range for additional random polling interval (sec) -- default %d'%default_random)
-parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, help='more verbose log')
-parser.add_option('-P', '--path', action='store', dest='path', default=plugin_path, help='Path to plugins directory')
-
-# NOTE: BUG the 'help' for this parser.add_option() wont list plugins from the --path argument
-parser.add_option('-m', '--module', action='store', dest='module', default='', help='run a single module among '+' '.join(known_modules))
-(options, args) = parser.parse_args()
-
-# Deal with plugins directory
-if os.path.exists(options.path):
-    sys.path.append(options.path)
-    plugins = [ os.path.split(os.path.splitext(x)[0])[1] for x in glob.glob( os.path.join(options.path,'*.py') ) ]
-    known_modules += plugins
-
-modules = []
-
-def GetSlivers(config, plc):
-    '''Run call backs defined in modules'''
-    try:
-        logger.log("nodemanager: Syncing w/ PLC")
-        # retrieve GetSlivers from PLC
-        data = plc.GetSlivers()
-        # use the magic 'default' slice to retrieve system-wide defaults
-        getPLCDefaults(data, config)
-        # tweak the 'vref' attribute from GetSliceFamily
-        setSliversVref (data)
-        # always dump it for debug purposes
-        # used to be done only in verbose; very helpful though, and tedious to obtain,
-        # so let's dump this unconditionnally
-        logger.log_slivers(data)
-        logger.verbose("nodemanager: Sync w/ PLC done")
-    except:
-        logger.log_exc("nodemanager: failed in GetSlivers")
-        # XXX So some modules can at least boostrap.
-        logger.log("nodemanager: Can't contact PLC to GetSlivers(). Continuing.")
-        data = {}
-    # Invoke GetSlivers() functions from the callback modules
-    for module in modules:
-        logger.verbose('trigerring GetSlivers callback for module %s'%module.__name__)
-        try:
-            callback = getattr(module, 'GetSlivers')
-            callback(data, config, plc)
-        except:
-            logger.log_exc("nodemanager: GetSlivers failed to run callback for module %r"%module)
-
-
-def getPLCDefaults(data, config):
-    '''
-    Get PLC wide defaults from _default system slice. Adds them to config class.
-    '''
-    for slice in data.get('slivers'):
-        if slice['name'] == config.PLC_SLICE_PREFIX+"_default":
-            attr_dict = {}
-            for attr in slice.get('attributes'): attr_dict[attr['tagname']] = attr['value']
-            if len(attr_dict):
-                logger.verbose("nodemanager: Found default slice overrides.\n %s" % attr_dict)
-                config.OVERRIDES = attr_dict
-                return
-    # NOTE: if an _default slice existed, it would have been found above and
-    # the routine would return. Thus, if we've gotten here, then no default
-    # slice is bound to this node.
-    if 'OVERRIDES' in dir(config): del config.OVERRIDES
-
-
-def setSliversVref (data):
-    '''
-    Tweak the 'vref' attribute in all slivers based on the 'GetSliceFamily' key
-    '''
-    # GetSlivers exposes the result of GetSliceFamily() as an separate key in data
-    # It is safe to override the attributes with this, as this method has the right logic
-    for sliver in data.get('slivers'):
-        try:
-            slicefamily=sliver.get('GetSliceFamily')
-            for att in sliver['attributes']:
-                if att['tagname']=='vref':
-                    att['value']=slicefamily
-                    continue
-            sliver['attributes'].append({ 'tagname':'vref','value':slicefamily})
-        except:
-            logger.log_exc("nodemanager: Could not overwrite 'vref' attribute from 'GetSliceFamily'",name=sliver['name'])
-
-
-def run():
-    try:
-        if options.daemon: tools.daemon()
-
-        # set log level
-        if (options.verbose):
-            logger.set_level(logger.LOG_VERBOSE)
-
-        # Load /etc/planetlab/plc_config
-        config = Config(options.config)
-
-        try:
-            other_pid = tools.pid_file()
-            if other_pid != None:
-                print """There might be another instance of the node manager running as pid %d. If this is not the case, please remove the pid file %s""" % (other_pid, tools.PID_FILE)
-                return
-        except OSError, err:
-            print "Warning while writing PID file:", err
-
-        # Load and start modules
-        if options.module:
-            assert options.module in known_modules
-            running_modules=[options.module]
-            logger.verbose('nodemanager: Running single module %s'%options.module)
-        else:
-            running_modules=known_modules
-        for module in running_modules:
-            try:
-                m = __import__(module)
-                m.start(options, config)
-                modules.append(m)
-            except ImportError, err:
-                print "Warning while loading module %s:" % module, err
-
-        default_priority=100
-        # sort on priority (lower first)
-        def sort_module_priority (m1,m2):
-            return getattr(m1,'priority',default_priority) - getattr(m2,'priority',default_priority)
-        modules.sort(sort_module_priority)
-
-        logger.log('ordered modules:')
-        for module in modules: logger.log ('%s: %s'%(getattr(module,'priority',default_priority),module.__name__))
-
-        # Load /etc/planetlab/session
-        if os.path.exists(options.session):
-            session = file(options.session).read().strip()
-        else:
-            session = None
-
-        # Initialize XML-RPC client
-        iperiod=int(options.period)
-        irandom=int(options.random)
-        plc = PLCAPI(config.plc_api_uri, config.cacert, session, timeout=iperiod/2)
-
-        #check auth
-        logger.log("nodemanager: Checking Auth.")
-        while plc.check_authentication() != True:
-            try:
-#                import pdb
-#                pdb.set_trace()
-                plc.update_session()
-                logger.log("nodemanager: Authentication Failure. Retrying")
-            except Exception,e:
-                logger.log("nodemanager: Retry Failed. (%r); Waiting.."%e)
-            time.sleep(iperiod)
-        logger.log("nodemanager: Authentication Succeeded!")
-
-
-        while True:
-            # Main nodemanager Loop
-            logger.log('nodemanager: mainloop - calling GetSlivers - period=%d random=%d'%(iperiod,irandom))
-            GetSlivers(config, plc)
-            delay=iperiod + random.randrange(0,irandom)
-            logger.log('nodemanager: mainloop - sleeping for %d s'%delay)
-            time.sleep(delay)
-    except: logger.log_exc("nodemanager: failed in run")
+from plcapi import PLCAPI
+
+
+class NodeManager:
+
+    PLUGIN_PATH = "/usr/share/NodeManager/plugins"
+
+    DB_FILE = "/var/lib/nodemanager/getslivers.pickle"
+
+    # the modules in this directory that need to be run
+    # NOTE: modules listed here will also be loaded in this order
+    # once loaded, they get re-ordered according to their priority (lower comes first)
+    # for determining the runtime order
+    core_modules = ['net', 'conf_files', 'slivermanager', 'bwmon']
+
+    default_period = 600
+    default_random = 301
+    default_priority = 100
+
+    def __init__ (self):
+
+        parser = ArgumentParser()
+        parser.add_argument(
+            '-d', '--daemon', action='store_true', dest='daemon',
+            default=False,
+            help='run daemonized')
+        parser.add_argument(
+            '-f', '--config', action='store', dest='config',
+            default='/etc/planetlab/plc_config',
+            help='PLC configuration file')
+        parser.add_argument(
+            '-k', '--session', action='store', dest='session',
+            default='/etc/planetlab/session',
+            help='API session key (or file)')
+        parser.add_argument(
+            '-p', '--period', action='store', dest='period',
+            default=NodeManager.default_period,
+            help='Polling interval (sec) - default {}'
+            .format(NodeManager.default_period))
+        parser.add_argument(
+            '-r', '--random', action='store', dest='random',
+            default=NodeManager.default_random,
+            help='Range for additional random polling interval (sec) -- default {}'
+            .format(NodeManager.default_random))
+        parser.add_argument(
+            '-v', '--verbose', action='store_true', dest='verbose',
+            default=False,
+            help='more verbose log')
+        parser.add_argument(
+            '-P', '--path', action='store', dest='path',
+            default=NodeManager.PLUGIN_PATH,
+            help='Path to plugins directory')
+
+        parser.add_argument(
+            '-m', '--module', action='store', dest='user_module',
+            default='',
+            help='run a single module')
+        self.options = parser.parse_args()
+
+        # determine the modules to be run
+        self.modules = NodeManager.core_modules
+        # Deal with plugins directory
+        if os.path.exists(self.options.path):
+            sys.path.append(self.options.path)
+            plugins = [
+                os.path.split(os.path.splitext(x)[0])[1]
+                for x in glob.glob( os.path.join(self.options.path, '*.py') )
+                if not x.endswith("/__init__.py")
+            ]
+            self.modules += plugins
+        if self.options.user_module:
+            assert self.options.user_module in self.modules
+            self.modules = [self.options.user_module]
+            logger.verbose('nodemanager: Running single module {}'.format(self.options.user_module))
+
+
+    def GetSlivers(self, config, plc):
+        """
+        Retrieve GetSlivers from PLC and trigger callbacks defined in modules/plugins
+        """
+        try:
+            logger.log("nodemanager: Syncing w/ PLC")
+            # retrieve GetSlivers from PLC
+            data = plc.GetSlivers()
+            # use the magic 'default' slice to retrieve system-wide defaults
+            self.getPLCDefaults(data, config)
+            # tweak the 'vref' attribute from GetSliceFamily
+            self.setSliversVref(data)
+            # dump it too, so it can be retrieved later in case of comm. failure
+            self.dumpSlivers(data)
+            # log it for debug purposes, no matter what verbose is
+            logger.log_slivers(data)
+            logger.verbose("nodemanager: Sync w/ PLC done")
+            last_data = data
+        except:
+            logger.log_exc("nodemanager: failed in GetSlivers")
+            # XXX So some modules can at least bootstrap.
+            logger.log("nodemanager: Can't contact PLC to GetSlivers(). Continuing.")
+            data = {}
+            # for modules that request it through the 'persistent_data' property
+            last_data = self.loadSlivers()
+
+        # Invoke GetSlivers() functions from the callback modules
+        for module in self.loaded_modules:
+            logger.verbose('nodemanager: triggering {}.GetSlivers'.format(module.__name__))
+            try:
+                callback = getattr(module, 'GetSlivers')
+                module_data = data
+                if getattr(module, 'persistent_data', False):
+                    module_data = last_data
+                callback(module_data, config, plc)
+            except SystemExit as e:
+                sys.exit(e)
+            except:
+                logger.log_exc("nodemanager: GetSlivers failed to run callback for module {}"
+                               .format(module))
+
+
+    def getPLCDefaults(self, data, config):
+        """
+        Get PLC wide defaults from _default system slice. Adds them to config class.
+        """
+        for slice in data.get('slivers'):
+            if slice['name'] == config.PLC_SLICE_PREFIX + "_default":
+                attr_dict = {}
+                for attr in slice.get('attributes'):
+                    attr_dict[attr['tagname']] = attr['value']
+                if attr_dict:
+                    logger.verbose("nodemanager: Found default slice overrides.\n {}"
+                                   .format(attr_dict))
+                    config.OVERRIDES = attr_dict
+                    return
+        # NOTE: if an _default slice existed, it would have been found above and
+        # the routine would return. Thus, if we've gotten here, then no default
+        # slice is bound to this node.
+        if 'OVERRIDES' in dir(config):
+            del config.OVERRIDES
+
+
+    def setSliversVref (self, data):
+        """
+        Tweak the 'vref' attribute in all slivers based on the 'GetSliceFamily' key
+        """
+        # GetSlivers exposes the result of GetSliceFamily() as a separate key in data
+        # It is safe to override the attributes with this, as this method has the right logic
+        for sliver in data.get('slivers'):
+            try:
+                slicefamily = sliver.get('GetSliceFamily')
+                for att in sliver['attributes']:
+                    if att['tagname'] == 'vref':
+                        att['value'] = slicefamily
+                        continue
+                sliver['attributes'].append(
+                    {'tagname': 'vref', 'value': slicefamily})
+            except Exception:
+                logger.log_exc(
+                    "nodemanager: Could not overwrite 'vref' attribute from 'GetSliceFamily'",
+                    name=sliver['name'])
+
+
+    def dumpSlivers (self, slivers):
+        with open(NodeManager.DB_FILE, "wb") as feed:
+            logger.log ("nodemanager: saving successfully fetched GetSlivers in {}"
+                        .format(NodeManager.DB_FILE))
+            pickle.dump(slivers, feed)
+
+
+    def loadSlivers (self):
+        try:
+            with open(NodeManager.DB_FILE, "rb+") as feed:
+                logger.log("nodemanager: restoring latest known GetSlivers from {}"
+                           .format(NodeManager.DB_FILE))
+                slivers = pickle.load(feed)
+            return slivers
+        except:
+            logger.log("Could not restore GetSlivers from {}"
+                       .format(NodeManager.DB_FILE))
+            return {}
+
+    def run(self):
+        # make sure to create /etc/planetlab/virt so others can read that
+        # used e.g. in vsys-scripts's sliceip
+        tools.get_node_virt()
+        try:
+            if self.options.daemon:
+                tools.daemon()
+
+            # set log level
+            if self.options.verbose:
+                logger.set_level(logger.LOG_VERBOSE)
+            tools.init_signals()
+
+            # Load /etc/planetlab/plc_config
+            config = Config(self.options.config)
+
+            try:
+                other_pid = tools.pid_file()
+                if other_pid is not None:
+                    print("""There might be another instance of the node manager running as pid {}.
+If this is not the case, please remove the pid file {}. -- exiting"""
+                          .format(other_pid, tools.PID_FILE))
+                    return
+            except OSError as err:
+                print("Warning while writing PID file:", err)
+
+            # load modules
+            self.loaded_modules = []
+            for module in self.modules:
+                try:
+                    m = __import__(module)
+                    logger.verbose("nodemanager: triggering {}.start".format(m.__name__))
+                    try:
+                        m.start()
+                    except Exception:
+                        logger.log("WARNING: module {} did not start".format(m.__name__))
+                    self.loaded_modules.append(m)
+                except Exception:
+                    if module not in NodeManager.core_modules:
+                        logger.log_exc("ERROR while loading module {} - skipped"
+                                       .format(module))
+                    else:
+                        logger.log("FATAL : failed to start core module {}".format(module))
+                        sys.exit(1)
+
+            # sort on priority (lower first)
+            def module_priority(module):
+                return getattr(module, 'priority', NodeManager.default_priority)
+            self.loaded_modules.sort(key=module_priority)
+
+            logger.log('ordered modules:')
+            for module in self.loaded_modules:
+                logger.log('{}: {}'
+                           .format(getattr(module, 'priority',
+                                           NodeManager.default_priority),
+                                   module.__name__))
+
+            # Load /etc/planetlab/session
+            if os.path.exists(self.options.session):
+                with open(self.options.session) as feed:
+                    session = feed.read().strip()
+            else:
+                session = None
+
+
+            # get random periods
+            iperiod = int(self.options.period)
+            irandom = int(self.options.random)
+
+            # Initialize XML-RPC client
+            plc = PLCAPI(config.plc_api_uri, config.cacert, session, timeout=iperiod/2)
+
+            #check auth
+            logger.log("nodemanager: Checking Auth.")
+            while not plc.check_authentication():
+                try:
+                    plc.update_session()
+                    logger.log("nodemanager: Authentication Failure. Retrying")
+                except Exception as exc:
+                    logger.log("nodemanager: Retry Failed. ({}); Waiting.."
+                               .format(exc))
+                time.sleep(iperiod)
+            logger.log("nodemanager: Authentication Succeeded!")
+
+
+            while True:
+                # Main nodemanager Loop
+                work_beg = time.time()
+                logger.log('nodemanager: mainloop - calling GetSlivers - period={} random={}'
+                           .format(iperiod, irandom))
+                self.GetSlivers(config, plc)
+                delay = iperiod + random.randrange(0, irandom)
+                work_end = time.time()
+                work_duration = int(work_end-work_beg)
+                logger.log('nodemanager: mainloop has worked for {} s - sleeping for {} s'
+                           .format(work_duration, delay))
+                time.sleep(delay)
+        except SystemExit:
+            pass
+        except:
+            logger.log_exc("nodemanager: failed in run")
+
+
+def run():
+    logger.log("======================================== Entering nodemanager.py")
+    NodeManager().run()
 
 if __name__ == '__main__':
-    logger.log("======================================== Entering nodemanager.py "+id)
     run()
 else:
-    # This is for debugging purposes. Open a copy of Python and import nm
+    # This is for debugging purposes. Open a copy of Python and import nodemanager
     tools.as_daemon_thread(run)