remove unnecessary information on pcuview page
[monitor.git] / monitor / bootman.py
index effd750..2afbbf6 100755 (executable)
@@ -42,6 +42,8 @@ api = plc.getAuthAPI()
 fb = None
 
 
+class ExceptionDoubleSSHError(Exception): pass
+
 class NodeConnection:
        def __init__(self, connection, node, config):
                self.node = node
@@ -248,7 +250,7 @@ class PlanetLabSession:
                        if ret != 0:
                                print "\tFAILED TWICE"
                                #sys.exit(1)
-                               raise Exception("Failed twice trying to login with updated ssh host key")
+                               raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
 
                t1 = time.time()
                # KILL any already running servers.
@@ -346,9 +348,11 @@ class DebugInterface:
                                self.session = PlanetLabSession(self.hostname, False, True)
                        else:
                                self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
-               except Exception, e:
+               except ExceptionDoubleSSHError, e:
                        msg = "ERROR setting up session for %s" % self.hostname
                        print msg
+                       return False
+               except Exception, e:
                        traceback.print_exc()
                        email_exception(msg)
                        return False
@@ -361,6 +365,10 @@ class DebugInterface:
                        try:
                                time.sleep(self.session.timeout*5)
                                conn = self.session.get_connection(config)
+                       except EOFError:
+                               # failed twice... no need to report this really, it's just in a
+                               # weird state...
+                               return False
                        except:
                                traceback.print_exc()
                                email_exception(self.hostname)
@@ -399,7 +407,7 @@ class DebugInterface:
                                ]:
                        sequences.update({n : "restart_bootmanager_boot"})
 
-               #       conn.restart_bootmanager('rins')
+               #       conn.restart_bootmanager('reinstall')
                for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
@@ -422,13 +430,18 @@ class DebugInterface:
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_rins"})
 
                # repair_node_keys
-               sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
+               for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
+                                       "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
+                               ]:
+                       sequences.update({n: "repair_node_keys"})
 
-               #   conn.restart_node('rins')
+               #   conn.restart_node('reinstall')
                for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
@@ -451,12 +464,14 @@ class DebugInterface:
                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                                 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
+                                "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
                                 ]:
                        sequences.update({n: "restart_node_boot"})
 
                # update_node_config_email
                for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
                                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                                 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
                                  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
                                ]:
                        sequences.update({n : "update_node_config_email"})
@@ -628,7 +643,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                        sitehist.sendMessage('newbootcd_notice', hostname=hostname)
 
                        print "\tDisabling %s due to out-of-date BootImage" % hostname
-                       api.UpdateNode(hostname, {'boot_state' : 'disable'})
+                       api.UpdateNode(hostname, {'boot_state' : 'disabled'})
 
                # NOTE: nothing else is possible.
                return True
@@ -642,7 +657,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
        if type(conn) == type(False): return False
 
        #if forced_action == "reboot":
-       #       conn.restart_node('rins')
+       #       conn.restart_node('reinstall')
        #       return True
 
        boot_state = conn.get_boot_state()
@@ -678,7 +693,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
 
                                log=conn.get_dmesg().read()
                                sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
-                               conn.set_nodestate('disable')
+                               conn.set_nodestate('disabled')
 
                        return False
 
@@ -731,16 +746,17 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                        conn.restart_bootmanager('boot')
                elif sequences[s] == "restart_bootmanager_rins":
                        print "...Restarting BootManager.py on %s "%hostname 
-                       conn.restart_bootmanager('rins')
+                       conn.restart_bootmanager('reinstall')
                elif sequences[s] == "restart_node_rins":
-                       conn.restart_node('rins')
+                       conn.restart_node('reinstall')
                elif sequences[s] == "restart_node_boot":
                        conn.restart_node('boot')
                elif sequences[s] == "repair_node_keys":
                        if conn.compare_and_repair_nodekeys():
                                # the keys either are in sync or were forced in sync.
                                # so try to reboot the node again.
-                               conn.restart_bootmanager('rins')
+                               # TODO: why was this originally 'reinstall' instead of 'boot'??
+                               conn.restart_bootmanager('boot')
                                pass
                        else:
                                # there was some failure to synchronize the keys.
@@ -797,7 +813,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                                args['log'] = conn.get_dmesg().read()
 
                                sitehist.sendMessage('baddisk_notice', **args)
-                               conn.set_nodestate('disable')
+                               conn.set_nodestate('disabled')
 
                elif sequences[s] == "update_hardware_email":
                        if not found_within(recent_actions, 'minimalhardware_notice', 1):
@@ -813,7 +829,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                                args = {}
                                try:
                                        node = plccache.GetNodeByName(hostname)
-                                       net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+                                       net = api.GetInterfaces(node['interface_ids'])[0]
                                except:
                                        email_exception()
                                        print traceback.print_exc()
@@ -824,7 +840,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
 
                                args['hostname'] = hostname
                                args['network_config'] = nodenet_str
-                               args['nodenetwork_id'] = net['nodenetwork_id']
+                               args['interface_id'] = net['interface_id']
 
                                sitehist.sendMessage('baddns_notice', **args)