X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=bootman.py;h=5e8b908359f025fe9f0cb1bd4e1fbbee5df5b365;hb=refs%2Fheads%2F1.0;hp=fb5cf5df67ac626de4e8b79515c467c66c5ad253;hpb=d9a55220c6a3ccbbe1fdd5ea38b2593fd09acfcc;p=monitor.git diff --git a/bootman.py b/bootman.py index fb5cf5d..5e8b908 100755 --- a/bootman.py +++ b/bootman.py @@ -26,6 +26,8 @@ from nodeconfig import network_config_to_str import traceback import config +class ExceptionDoubleSSHError(Exception): pass + import signal class Sopen(subprocess.Popen): def kill(self, signal = signal.SIGTERM): @@ -58,14 +60,18 @@ class NodeConnection: return "unknown" def get_dmesg(self): + t_stamp = time.strftime("%Y-%m-%d-%H:%M") self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log") - download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node) + download(self.c, "/var/log/dmesg.bm.log", "log/history/%s-dmesg.%s.log" % (t_stamp, self.node)) + os.system("cp log/history/%s-dmesg.%s.log log/dmesg.%s.log" % (t_stamp, self.node, self.node)) log = open("log/dmesg.%s.log" % self.node, 'r') return log def get_bootmanager_log(self): - download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node) - os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node)) + t_stamp = time.strftime("%Y-%m-%d-%H:%M") + download(self.c, "/tmp/bm.log", "log/history/%s-bm.%s.log" % (t_stamp, self.node)) + #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node)) + os.system("cp log/history/%s-bm.%s.log log/bm.%s.log" % (t_stamp, self.node, self.node)) log = open("log/bm.%s.log" % self.node, 'r') return log @@ -232,7 +238,7 @@ class PlanetLabSession: if ret != 0: print "\tFAILED TWICE" #sys.exit(1) - raise Exception("Failed twice trying to login with updated ssh host key") + raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key") t1 = time.time() # KILL any already running servers. @@ -338,6 +344,8 @@ def reboot(hostname, config=None, forced_action=None): try: k = SSHKnownHosts(); k.update(node); k.write(); del k except: + from nodecommon import email_exception + email_exception() print traceback.print_exc() return False @@ -346,9 +354,16 @@ def reboot(hostname, config=None, forced_action=None): session = PlanetLabSession(node, False, True) else: session = PlanetLabSession(node, config.nosetup, config.verbose) + except ExceptionDoubleSSHError, e: + msg = "ERROR setting up session for %s" % hostname + print msg + return False except Exception, e: - print "ERROR setting up session for %s" % hostname + msg = "ERROR setting up session for %s" % hostname + print msg print traceback.print_exc() + from nodecommon import email_exception + email_exception(msg) print e return False @@ -360,12 +375,18 @@ def reboot(hostname, config=None, forced_action=None): try: time.sleep(session.timeout*4) conn = session.get_connection(config) + except EOFError: + # failed twice... no need to report this really, it's just in a + # weird state... + return False except: print traceback.print_exc() + from nodecommon import email_exception + email_exception(node) return False if forced_action == "reboot": - conn.restart_node('rins') + conn.restart_node('reinstall') return True boot_state = conn.get_boot_state() @@ -572,7 +593,7 @@ def reboot(hostname, config=None, forced_action=None): ]: sequences.update({n : "restart_bootmanager_boot"}) - # conn.restart_bootmanager('rins') + # conn.restart_bootmanager('reinstall') for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", @@ -592,13 +613,17 @@ def reboot(hostname, config=None, forced_action=None): # actual solution appears to involve removing the bad files, and # continually trying to boot the node. "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done", ]: sequences.update({n : "restart_bootmanager_rins"}) # repair_node_keys sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) - # conn.restart_node('rins') + # conn.restart_node('reinstall') for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", @@ -692,16 +717,16 @@ def reboot(hostname, config=None, forced_action=None): conn.restart_bootmanager('boot') elif sequences[s] == "restart_bootmanager_rins": if config and not config.quiet: print "...Restarting BootManager.py on %s "% node - conn.restart_bootmanager('rins') + conn.restart_bootmanager('reinstall') elif sequences[s] == "restart_node_rins": - conn.restart_node('rins') + conn.restart_node('reinstall') elif sequences[s] == "restart_node_boot": conn.restart_node('boot') elif sequences[s] == "repair_node_keys": if conn.compare_and_repair_nodekeys(): # the keys either are in sync or were forced in sync. # so try to reboot the node again. - conn.restart_bootmanager('rins') + conn.restart_bootmanager('reinstall') pass else: # there was some failure to synchronize the keys. @@ -736,7 +761,7 @@ def reboot(hostname, config=None, forced_action=None): args = {} args['hostname'] = hostname args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args, + m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, True, db='nodenet_persistmessages') loginbase = plc.siteId(hostname) emails = plc.getTechEmails(loginbase) @@ -796,8 +821,10 @@ def reboot(hostname, config=None, forced_action=None): args = {} try: node = api.GetNodes(hostname)[0] - net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] + net = api.GetInterfaces(node['interface_ids'])[0] except: + from nodecommon import email_exception + email_exception() print traceback.print_exc() # TODO: api error. skip email, b/c all info is not available, # flag_set will not be recorded. @@ -806,7 +833,7 @@ def reboot(hostname, config=None, forced_action=None): args['hostname'] = hostname args['network_config'] = nodenet_str - args['nodenetwork_id'] = net['nodenetwork_id'] + args['interface_id'] = net['interface_id'] m = PersistMessage(hostname, mailtxt.baddns[0] % args, mailtxt.baddns[1] % args, True, db='baddns_persistmessages')