From cdf5b60300c018790d27e6ec8b3e781f996ea687 Mon Sep 17 00:00:00 2001 From: Thierry Parmentelat Date: Thu, 28 Jan 2010 14:12:32 +0000 Subject: [PATCH] fixed broken slice-creation --- Makefile | 5 ++++- bwmon.py | 3 ++- conf_files.py | 4 ++-- database.py | 40 ++++++++++++++++++++++++++-------------- initscripts/nm | 9 +++++---- logger.py | 14 +++++++------- nm.py | 44 +++++++++++++++++++++++++++++++------------- plcapi.py | 2 +- plugins/codemux.py | 4 ++-- plugins/vsys.py | 2 +- sliver_vs.py | 30 +++++++++++++++++------------- sm.py | 4 ++-- tools.py | 2 +- 13 files changed, 101 insertions(+), 62 deletions(-) diff --git a/Makefile b/Makefile index 1a55599..4b130d5 100644 --- a/Makefile +++ b/Makefile @@ -70,6 +70,8 @@ ifdef BUILD KEYURL:=root@$(TESTMASTER):$(BUILD)/keys/key1.rsa endif +key: $(NODE).key.rsa + $(NODE).key.rsa: ifeq (,$(KEYURL)) @echo "sync: fetching $@ - You must define TESTMASTER, BUILD and NODE on the command line" @@ -77,7 +79,8 @@ ifeq (,$(KEYURL)) @echo " note that for now all test builds use the same key, so any BUILD would do" @exit 1 else - scp $(KEYURL) $@ + @echo "FETCHING key" + +scp $(KEYURL) $@ endif ### utility - find out the node name for a given BUILD diff --git a/bwmon.py b/bwmon.py index 4a3d4e8..85e7d63 100644 --- a/bwmon.py +++ b/bwmon.py @@ -712,11 +712,12 @@ def run(): # class show to check if net:InitNodeLimit:bwlimit.init has run. sync(nmdbcopy) else: logger.log("bwmon: BW limits DISABLED.") - except: logger.log_exc() + except: logger.log_exc("bwmon failed") lock.clear() def start(*args): tools.as_daemon_thread(run) def GetSlivers(*args): + logger.verbose ("triggering dummy bwmon.GetSlivers") pass diff --git a/conf_files.py b/conf_files.py index 1e5d960..bccf20a 100644 --- a/conf_files.py +++ b/conf_files.py @@ -78,8 +78,8 @@ class conf_files: if data.has_key("conf_files"): for f in data['conf_files']: try: self.update_conf_file(f) - except: logger.log_exc() - else: logger.log("conf_files: No conf_files found or API failure. Skipping") + except: logger.log_exc("failed to update conf_file") + else: logger.log("conf_files: No conf_files found or API failure. Skipping") def start(options, config): pass diff --git a/database.py b/database.py index ead97ea..171739e 100644 --- a/database.py +++ b/database.py @@ -103,19 +103,31 @@ class Database(dict): logger.verbose("database:sync : fetching accounts") existing_acct_names = accounts.all() for name in existing_acct_names: - logger.verbose("database:sync : loop on %s"%name) - if name not in self: accounts.get(name).ensure_destroyed() + if name not in self: + logger.verbose("database:sync : ensure_destroy'ing %s"%name) + accounts.get(name).ensure_destroyed() for name, rec in self.iteritems(): - slivr = accounts.get(name) - logger.verbose("database:sync : %s is %s" %(name,slivr._get_class())) - # Make sure we refresh accounts that are running - if rec['instantiation'] == 'plc-instantiated': slivr.ensure_created(rec) - elif rec['instantiation'] == 'nm-controller': slivr.ensure_created(rec) - # Back door to ensure PLC overrides Ticket in delegation. - elif rec['instantiation'] == 'delegated' and slivr._get_class() != None: - # if the ticket has been delivered and the nm-contoroller started the slice - # update rspecs and keep them up to date. - if slivr.is_running(): slivr.ensure_created(rec) + # protect this; if anything fails for a given sliver + # we still need the other ones to be handled + try: + sliver = accounts.get(name) + logger.verbose("database:sync : looping on %s (shell account class from pwd %s)" %(name,sliver._get_class())) + # Make sure we refresh accounts that are running + if rec['instantiation'] == 'plc-instantiated': + logger.verbose ("database.sync : ensure_create'ing 'instantiation' sliver %s"%name) + sliver.ensure_created(rec) + elif rec['instantiation'] == 'nm-controller': + logger.verbose ("database.sync : ensure_create'ing 'nm-controller' sliver %s"%name) + sliver.ensure_created(rec) + # Back door to ensure PLC overrides Ticket in delegation. + elif rec['instantiation'] == 'delegated' and sliver._get_class() != None: + # if the ticket has been delivered and the nm-contoroller started the slice + # update rspecs and keep them up to date. + if sliver.is_running(): + logger.verbose ("database.sync : ensure_create'ing 'delegated' sliver %s"%name) + sliver.ensure_created(rec) + except: + logger.log_exc("database.sync failed to handle sliver",name=name) # Wake up bwmom to update limits. bwmon.lock.set() @@ -135,13 +147,13 @@ def start(): dump_requested = False db_lock.release() try: tools.write_file(DB_FILE, lambda f: f.write(db_pickle)) - except: logger.log_exc() + except: logger.log_exc("failed in database.start.run") global db try: f = open(DB_FILE) try: db = cPickle.load(f) finally: f.close() except: - logger.log_exc() + logger.log_exc("failed in database.start") db = Database() tools.as_daemon_thread(run) diff --git a/initscripts/nm b/initscripts/nm index 375e157..25b3c26 100755 --- a/initscripts/nm +++ b/initscripts/nm @@ -21,7 +21,7 @@ prog="Node Manager" options=${OPTIONS-"-d -s"} restartoptions=${RESTARTOPTIONS-"-d"} # debug mode is interactive, and has faster period -debugoptions=${DEBUGOPTIONS-"-p 60 -r 31"} +debugoptions=${DEBUGOPTIONS-"-v -p 60 -r 31"} pidfile=${PIDFILE-/var/run/nm.pid} lockfile=${LOCKFILE-/var/lock/subsys/nm} RETVAL=0 @@ -66,9 +66,10 @@ case "$1" in do_start $restartoptions ;; restartdebug) + shift stop - echo "Running interactively .." - $nm $debugoptions + [[ -z "$@" ]] && echo "Running interactively .." + $nm $debugoptions "$@" ;; condrestart) if [ -f ${pidfile} ] ; then @@ -76,7 +77,7 @@ case "$1" in fi ;; *) - echo $"Usage: $0 {start|stop|restart|condrestart|status|restartdebug}" + echo $"Usage: $0 {start|stop|restart|condrestart|status|restartdebug [-d]}" exit 1 esac diff --git a/logger.py b/logger.py index 90ee755..7dff05e 100644 --- a/logger.py +++ b/logger.py @@ -26,7 +26,7 @@ def set_level(level): LOG_LEVEL=level def verbose(msg): - log(msg,LOG_VERBOSE) + log('(v) '+msg,LOG_VERBOSE) def log(msg,level=LOG_NODE): """Write to the log file if level >= current log level (default LOG_NODE).""" @@ -49,20 +49,20 @@ def log_call(*args): if child.returncode: raise Exception("command failed:\n stdout - %s\n stderr - %s" % \ (child.stdout.readlines(), child.stderr.readlines())) - except: log_exc() + except: log_exc('failed to run command %s' % ' '.join(args)) -def log_exc(name = None): +def log_exc(msg="",name=None): """Log the traceback resulting from an exception.""" - if name: - log("operation on %s failed. \n %s" %(name, traceback.format_exc())) + if name: + log("%s: EXCEPTION caught <%s> \n %s" %(name, msg, traceback.format_exc())) else: - log(traceback.format_exc()) + log("EXCEPTION caught <%s> \n %s" %(msg, traceback.format_exc())) def log_data_in_file (data, file, message=""): import pprint, time try: f=open(file,'w') - now=time.strftime("Last update: %Y.%m.%d at %H:%M:%S", time.localtime()) + now=time.strftime("Last update: %Y.%m.%d at %H:%M:%S %Z", time.localtime()) f.write(now+'\n') if message: f.write('Message:'+message+'\n') pp=pprint.PrettyPrinter(stream=f,indent=2) diff --git a/nm.py b/nm.py index 72fbd80..4a561da 100755 --- a/nm.py +++ b/nm.py @@ -65,13 +65,19 @@ def GetSlivers(config, plc): '''Run call backs defined in modules''' try: logger.log("Syncing w/ PLC") + # retrieve GetSlivers from PLC data = plc.GetSlivers() + # use the magic 'default' slice to retrieve system-wide defaults + getPLCDefaults(data, config) + # tweak the 'vref' attribute from GetSliceFamily + setSliversVref (data) + # always dump it for debug purposes # used to be done only in verbose; very helpful though, and tedious to obtain, # so let's dump this unconditionnally logger.log_slivers(data) - getPLCDefaults(data, config) + logger.verbose("Sync w/ PLC done (3)") except: - logger.log_exc() + logger.log_exc("failed in nm.GetSlivers") # XXX So some modules can at least boostrap. logger.log("nm: Can't contact PLC to GetSlivers(). Continuing.") data = {} @@ -80,7 +86,7 @@ def GetSlivers(config, plc): try: callback = getattr(module, 'GetSlivers') callback(data, config, plc) - except: logger.log_exc() + except: logger.log_exc("nm.GetSlivers failed to run callback for module %r"%module) def getPLCDefaults(data, config): @@ -91,12 +97,6 @@ def getPLCDefaults(data, config): if slice['name'] == config.PLC_SLICE_PREFIX+"_default": attr_dict = {} for attr in slice.get('attributes'): attr_dict[attr['tagname']] = attr['value'] - # GetSlivers exposes the result of GetSliceFamily() as an separate key in data - # It is safe to override the attributes with this, as this method has the right logic - try: - attr_dict['vref']=slice.get('GetSliceFamily') - except: - pass if len(attr_dict): logger.verbose("Found default slice overrides.\n %s" % attr_dict) config.OVERRIDES = attr_dict @@ -107,6 +107,24 @@ def getPLCDefaults(data, config): if 'OVERRIDES' in dir(config): del config.OVERRIDES +def setSliversVref (data): + ''' + Tweak the 'vref' attribute in all slivers based on the 'GetSliceFamily' key + ''' + # GetSlivers exposes the result of GetSliceFamily() as an separate key in data + # It is safe to override the attributes with this, as this method has the right logic + for sliver in data.get('slivers'): + try: + slicefamily=sliver.get('GetSliceFamily') + for att in sliver['attributes']: + if att['tagname']=='vref': + att['value']=slicefamily + continue + sliver['attributes'].append({ 'tagname':'vref','value':slicefamily}) + except: + logger.log_exc("Could not overwrite 'vref' attribute from 'GetSliceFamily'",name=sliver['name']) + + def run(): try: if options.daemon: tools.daemon() @@ -157,9 +175,9 @@ def run(): while plc.check_authentication() != True: try: plc.update_session() - logger.log("Authentication Failure. Retrying") + logger.log("Authentication Failure. Retrying") except: - logger.log("Retry Failed. Waiting") + logger.log("Retry Failed. Waiting") time.sleep(iperiod) logger.log("Authentication Succeeded!") @@ -171,11 +189,11 @@ def run(): delay=iperiod + random.randrange(0,irandom) logger.verbose('mainloop - sleeping for %d s'%delay) time.sleep(delay) - except: logger.log_exc() + except: logger.log_exc("failed in nm.run") if __name__ == '__main__': - logger.log("Entering nm.py "+id) + logger.log("======================================== Entering nm.py "+id) run() else: # This is for debugging purposes. Open a copy of Python and import nm diff --git a/plcapi.py b/plcapi.py index 0da084e..4e6daaa 100644 --- a/plcapi.py +++ b/plcapi.py @@ -56,7 +56,7 @@ class PLCAPI: try: authstatus = self.AuthCheck() except: - logger.log_exc() + logger.log_exc("failed in plcapi.check_authentication") return authstatus diff --git a/plugins/codemux.py b/plugins/codemux.py index fa5311f..84b0f4e 100644 --- a/plugins/codemux.py +++ b/plugins/codemux.py @@ -93,7 +93,7 @@ def writeConf(slivers, conf = CODEMUXCONF): f.truncate() f.close() try: restartService() - except: logger.log_exc() + except: logger.log_exc("codemux.writeConf failed to restart service") def sortDomains(slivers): @@ -129,7 +129,7 @@ def parseConf(conf = CODEMUXCONF): slicesinconf.setdefault(slice, []) slicesinconf[slice].append({"host": host, "port": port}) f.close() - except IOError: logger.log_exc() + except IOError: logger.log_exc("codemux.parseConf got IOError") return slicesinconf diff --git a/plugins/vsys.py b/plugins/vsys.py index 927c7c1..f33d11b 100644 --- a/plugins/vsys.py +++ b/plugins/vsys.py @@ -137,5 +137,5 @@ def parseConf(): (path, slice) = line.split() slicesinconf.append(slice) f.close() - except: logger.log_exc() + except: logger.log_exc("failed vsys.parseConf") return slicesinconf diff --git a/sliver_vs.py b/sliver_vs.py index 8723e8c..f86dad9 100644 --- a/sliver_vs.py +++ b/sliver_vs.py @@ -53,17 +53,20 @@ class Sliver_VS(accounts.Account, vserver.VServer): _init_disk_info_sem = globalsem def __init__(self, rec): - logger.verbose ('initing Sliver_VS with name=%s'%rec['name']) + name=rec['name'] + logger.verbose ('%s: initing Sliver_VS'%name) try: - vserver.VServer.__init__(self, rec['name'],logfile='/var/log/nm') + logger.log("%s: first chance..."%name) + vserver.VServer.__init__(self, name,logfile='/var/log/nm') except Exception, err: if not isinstance(err, vserver.NoSuchVServer): # Probably a bad vserver or vserver configuration file - logger.log_exc(self.name) - logger.log('%s: recreating bad vserver' % rec['name']) - self.destroy(rec['name']) - self.create(rec['name'], rec['vref']) - vserver.VServer.__init__(self, rec['name'],logfile='/var/log/nm') + logger.log_exc("sliver_vs.__init__ (1) %s",name=name) + logger.log('%s: recreating bad vserver' % name) + self.destroy(name) + self.create(name, rec['vref']) + logger.log("%s: second chance..."%name) + vserver.VServer.__init__(self, name,logfile='/var/log/nm') self.keys = '' self.rspec = {} @@ -78,8 +81,8 @@ class Sliver_VS(accounts.Account, vserver.VServer): def create(name, vref = None): logger.verbose('Sliver_VS:create - name=%s'%name) if vref is None: - logger.log("creating %s : no vref attached, this is unexpected"%name) - vref='default' + logger.log("%s: ERROR - no vref attached, this is unexpected"%name) + return # used to look in /etc/planetlab/family, # now relies on the 'GetSliceFamily' extra attribute in GetSlivers() # which for legacy is still exposed here as the 'vref' key @@ -92,6 +95,8 @@ class Sliver_VS(accounts.Account, vserver.VServer): # guess arch try: (x,y,arch)=vref.split('-') + # mh, this of course applies when 'vref' is e.g. 'netflow' + # and that's not quite right except: arch='i386' @@ -143,14 +148,14 @@ class Sliver_VS(accounts.Account, vserver.VServer): os.close(fd) try: self.chroot_call(install_initscript) - except: logger.log_exc(self.name) + except: logger.log_exc("sliver_vs.start",name=self.name) tools.close_nonstandard_fds() vserver.VServer.start(self) os._exit(0) else: os.waitpid(child_pid, 0) self.initscriptchanged = False - else: logger.log('%s: not starting, is not enabled' % self.name) + else: logger.log('not starting, is not enabled', name=self.name) def stop(self): logger.log('%s: stopping' % self.name) @@ -173,8 +178,7 @@ class Sliver_VS(accounts.Account, vserver.VServer): self.disk_usage_initialized = True vserver.VServer.set_disklimit(self, max(disk_max, self.disk_blocks)) except: - logger.log('%s: failed to set max disk usage' % self.name) - logger.log_exc(self.name) + logger.log_exc('failed to set max disk usage',name=self.name) # get/set the min/soft/hard values for all of the vserver # related RLIMITS. Note that vserver currently only diff --git a/sm.py b/sm.py index 627a649..25b5e87 100644 --- a/sm.py +++ b/sm.py @@ -72,7 +72,7 @@ def GetSlivers(data, config = None, plc=None, fullupdate=True): f = open('/etc/planetlab/node_id') try: node_id = int(f.read()) finally: f.close() - except: logger.log_exc() + except: logger.log_exc("sm.GetSlivers failed to read /etc/planetlab/node_id") if data.has_key('node_id') and data['node_id'] != node_id: return @@ -88,7 +88,7 @@ def GetSlivers(data, config = None, plc=None, fullupdate=True): initscripts[str(is_rec['name'])] = is_rec['script'] for sliver in data['slivers']: - logger.verbose("sm:GetSlivers in slivers loop") + logger.verbose("%s: sm:GetSlivers in slivers loop"%sliver['name']) rec = sliver.copy() rec.setdefault('timestamp', data['timestamp']) diff --git a/tools.py b/tools.py index 7505bd1..0acf035 100644 --- a/tools.py +++ b/tools.py @@ -81,7 +81,7 @@ def fork_as(su, function, *args): except: os.seteuid(os.getuid()) # undo su so we can write the log file os.setegid(os.getgid()) - logger.log_exc() + logger.log_exc("tools.fork_as") os._exit(0) else: os.waitpid(child_pid, 0) -- 2.43.0