X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=monitor%2Fbootman.py;h=36d8b353279e873c4f49da0182849f9931563d83;hb=40588e1f900ba82db3ca69c5cc375805028f2430;hp=effd7501a8f6f3be775659e8c23b225d455307cb;hpb=334378a14103c3fd02332b6ce3767553f1fe11d2;p=monitor.git diff --git a/monitor/bootman.py b/monitor/bootman.py index effd750..36d8b35 100755 --- a/monitor/bootman.py +++ b/monitor/bootman.py @@ -42,6 +42,8 @@ api = plc.getAuthAPI() fb = None +class ExceptionDoubleSSHError(Exception): pass + class NodeConnection: def __init__(self, connection, node, config): self.node = node @@ -248,7 +250,7 @@ class PlanetLabSession: if ret != 0: print "\tFAILED TWICE" #sys.exit(1) - raise Exception("Failed twice trying to login with updated ssh host key") + raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key") t1 = time.time() # KILL any already running servers. @@ -346,9 +348,11 @@ class DebugInterface: self.session = PlanetLabSession(self.hostname, False, True) else: self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose) - except Exception, e: + except ExceptionDoubleSSHError, e: msg = "ERROR setting up session for %s" % self.hostname print msg + return False + except Exception, e: traceback.print_exc() email_exception(msg) return False @@ -361,6 +365,10 @@ class DebugInterface: try: time.sleep(self.session.timeout*5) conn = self.session.get_connection(config) + except EOFError: + # failed twice... no need to report this really, it's just in a + # weird state... + return False except: traceback.print_exc() email_exception(self.hostname) @@ -399,7 +407,7 @@ class DebugInterface: ]: sequences.update({n : "restart_bootmanager_boot"}) - # conn.restart_bootmanager('rins') + # conn.restart_bootmanager('reinstall') for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", @@ -422,13 +430,18 @@ class DebugInterface: "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done", ]: sequences.update({n : "restart_bootmanager_rins"}) # repair_node_keys - sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) + for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done", + "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done", + ]: + sequences.update({n: "repair_node_keys"}) - # conn.restart_node('rins') + # conn.restart_node('reinstall') for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", @@ -451,12 +464,14 @@ class DebugInterface: "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done", ]: sequences.update({n: "restart_node_boot"}) # update_node_config_email for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done", "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", ]: sequences.update({n : "update_node_config_email"}) @@ -642,7 +657,7 @@ def restore(sitehist, hostname, config=None, forced_action=None): if type(conn) == type(False): return False #if forced_action == "reboot": - # conn.restart_node('rins') + # conn.restart_node('reinstall') # return True boot_state = conn.get_boot_state() @@ -731,16 +746,17 @@ def restore(sitehist, hostname, config=None, forced_action=None): conn.restart_bootmanager('boot') elif sequences[s] == "restart_bootmanager_rins": print "...Restarting BootManager.py on %s "%hostname - conn.restart_bootmanager('rins') + conn.restart_bootmanager('reinstall') elif sequences[s] == "restart_node_rins": - conn.restart_node('rins') + conn.restart_node('reinstall') elif sequences[s] == "restart_node_boot": conn.restart_node('boot') elif sequences[s] == "repair_node_keys": if conn.compare_and_repair_nodekeys(): # the keys either are in sync or were forced in sync. # so try to reboot the node again. - conn.restart_bootmanager('rins') + # TODO: why was this originally 'reinstall' instead of 'boot'?? + conn.restart_bootmanager('boot') pass else: # there was some failure to synchronize the keys. @@ -813,7 +829,7 @@ def restore(sitehist, hostname, config=None, forced_action=None): args = {} try: node = plccache.GetNodeByName(hostname) - net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] + net = api.GetInterfaces(node['interface_ids'])[0] except: email_exception() print traceback.print_exc() @@ -824,7 +840,7 @@ def restore(sitehist, hostname, config=None, forced_action=None): args['hostname'] = hostname args['network_config'] = nodenet_str - args['nodenetwork_id'] = net['nodenetwork_id'] + args['interface_id'] = net['interface_id'] sitehist.sendMessage('baddns_notice', **args)