bug fix in summary template
[monitor.git] / monitor / bootman.py
index bfb295c..fdfadb2 100755 (executable)
@@ -42,6 +42,8 @@ api = plc.getAuthAPI()
 fb = None
 
 
+class ExceptionDoubleSSHError(Exception): pass # raised when login still fails after retrying with an updated ssh host key
+
 class NodeConnection:
        def __init__(self, connection, node, config):
                self.node = node
@@ -66,18 +68,34 @@ class NodeConnection:
                return "unknown"
 
        def get_dmesg(self):
+               t_stamp = time.strftime("%Y-%m-%d-%H:%M") # minute-resolution stamp used to name the copy kept under history/
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
-               download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
-               log = open("log/dmesg.%s.log" % self.node, 'r')
+               download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
+               os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node)) # overwrite the un-stamped per-node copy; cp exit status is not checked
+               log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r') # returned open; nothing here closes it
                return log
 
        def get_bootmanager_log(self):
-               download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
-               #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
-               os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
-               log = open("log/bm.%s.log" % self.node, 'r')
+               t_stamp = time.strftime("%Y-%m-%d-%H:%M") # minute-resolution stamp used to name the copy kept under history/
+               download(self.c, "/tmp/bm.log", "%s/history/%s-bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
+               os.system("cp %s/history/%s-bm.%s.log %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node)) # overwrite the un-stamped per-node copy; cp exit status is not checked
+               log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r') # returned open; nothing here closes it
                return log
 
+
+#      def get_dmesg(self):
+#              self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
+#              download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
+#              log = open("log/dmesg.%s.log" % self.node, 'r')
+#              return log
+#
+#      def get_bootmanager_log(self):
+#              download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
+#              #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+#              os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
+#              log = open("log/bm.%s.log" % self.node, 'r')
+#              return log
+
        def dump_plconf_file(self):
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
@@ -248,7 +266,7 @@ class PlanetLabSession:
                        if ret != 0:
                                print "\tFAILED TWICE"
                                #sys.exit(1)
-                               raise Exception("Failed twice trying to login with updated ssh host key")
+                               raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
 
                t1 = time.time()
                # KILL any already running servers.
@@ -346,9 +364,11 @@ class DebugInterface:
                                self.session = PlanetLabSession(self.hostname, False, True)
                        else:
                                self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
-               except Exception, e:
+               except ExceptionDoubleSSHError, e:
                        msg = "ERROR setting up session for %s" % self.hostname
                        print msg
+                       return False
+               except Exception, e:
                        traceback.print_exc()
                        email_exception(msg)
                        return False
@@ -361,6 +381,10 @@ class DebugInterface:
                        try:
                                time.sleep(self.session.timeout*5)
                                conn = self.session.get_connection(config)
+                       except EOFError:
+                               # failed twice... no need to report this really, it's just in a
+                               # weird state...
+                               return False
                        except:
                                traceback.print_exc()
                                email_exception(self.hostname)
@@ -399,7 +423,7 @@ class DebugInterface:
                                ]:
                        sequences.update({n : "restart_bootmanager_boot"})
 
-               #       conn.restart_bootmanager('rins')
+               #       conn.restart_bootmanager('reinstall')
                for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
@@ -422,13 +446,18 @@ class DebugInterface:
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_rins"})
 
                # repair_node_keys
-               sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
+               for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
+                                       "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
+                               ]:
+                       sequences.update({n: "repair_node_keys"})
 
-               #   conn.restart_node('rins')
+               #   conn.restart_node('reinstall')
                for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
@@ -451,12 +480,15 @@ class DebugInterface:
                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                                 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
+                                "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
                                 ]:
                        sequences.update({n: "restart_node_boot"})
 
                # update_node_config_email
                for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
                                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                                 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
+                                 "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
                                  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
                                ]:
                        sequences.update({n : "update_node_config_email"})
@@ -612,8 +644,12 @@ class DebugInterface:
 
                return sequence
                
-
 def restore(sitehist, hostname, config=None, forced_action=None):
+       ret = restore_basic(sitehist, hostname, config, forced_action)
+       session.flush() # reached only when restore_basic returns normally; an exception skips the flush
+       return ret
+
+def restore_basic(sitehist, hostname, config=None, forced_action=None):
 
        # NOTE: Nothing works if the bootcd is REALLY old.
        #       So, this is the first step.
@@ -624,27 +660,19 @@ def restore(sitehist, hostname, config=None, forced_action=None):
        if fbnode['observed_category'] == "OLDBOOTCD":
                print "\t...Notify owner to update BootImage!!!"
 
-               if not found_within(recent_actions, 'newbootcd_notice', 3):
+               if not found_within(recent_actions, 'newbootcd_notice', 3.5):
                        sitehist.sendMessage('newbootcd_notice', hostname=hostname)
 
                        print "\tDisabling %s due to out-of-date BootImage" % hostname
-                       api.UpdateNode(hostname, {'boot_state' : 'disable'})
+                       api.UpdateNode(hostname, {'boot_state' : 'disabled'})
 
                # NOTE: nothing else is possible.
                return True
 
        debugnode = DebugInterface(hostname)
        conn = debugnode.getConnection()
-       #print "conn: %s" % conn
-       #print "trying to use conn after returning it."
-       #print conn.c.modules.sys.path
-       #print conn.c.modules.os.path.exists('/tmp/source')
        if type(conn) == type(False): return False
 
-       #if forced_action == "reboot":
-       #       conn.restart_node('rins')
-       #       return True
-
        boot_state = conn.get_boot_state()
        if boot_state != "debug":
                print "... %s in %s state: skipping..." % (hostname , boot_state)
@@ -674,11 +702,12 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                        print "...Should investigate.  Skipping node."
                        # TODO: send message related to these errors.
 
-                       if not found_within(recent_actions, 'newbootcd_notice', 3):
+                       if not found_within(recent_actions, 'baddisk_notice', 7):
+                               print "baddisk_notice not found recently"
 
                                log=conn.get_dmesg().read()
                                sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
-                               conn.set_nodestate('disable')
+                               conn.set_nodestate('disabled')
 
                        return False
 
@@ -715,6 +744,8 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                args['sequence'] = s
                args['bmlog'] = conn.get_bootmanager_log().read()
                args['viart'] = False
+               args['saveact'] = True
+               args['ccemail'] = True
 
                sitehist.sendMessage('unknownsequence_notice', **args)
 
@@ -731,16 +762,17 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                        conn.restart_bootmanager('boot')
                elif sequences[s] == "restart_bootmanager_rins":
                        print "...Restarting BootManager.py on %s "%hostname 
-                       conn.restart_bootmanager('rins')
+                       conn.restart_bootmanager('reinstall')
                elif sequences[s] == "restart_node_rins":
-                       conn.restart_node('rins')
+                       conn.restart_node('reinstall')
                elif sequences[s] == "restart_node_boot":
                        conn.restart_node('boot')
                elif sequences[s] == "repair_node_keys":
                        if conn.compare_and_repair_nodekeys():
                                # the keys either are in sync or were forced in sync.
                                # so try to reboot the node again.
-                               conn.restart_bootmanager('rins')
+                               # TODO: why was this originally 'reinstall' instead of 'boot'??
+                               conn.restart_bootmanager('boot')
                                pass
                        else:
                                # there was some failure to synchronize the keys.
@@ -752,6 +784,8 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                        args['sequence'] = s
                        args['bmlog'] = conn.get_bootmanager_log().read()
                        args['viart'] = False
+                       args['saveact'] = True
+                       args['ccemail'] = True
 
                        sitehist.sendMessage('unknownsequence_notice', **args)
                        conn.restart_bootmanager('boot')
@@ -759,7 +793,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                # TODO: differentiate this and the 'nodenetwork_email' actions.
                elif sequences[s] == "update_node_config_email":
 
-                       if not found_within(recent_actions, 'nodeconfig_notice', 3):
+                       if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
                                args = {}
                                args['hostname'] = hostname
                                sitehist.sendMessage('nodeconfig_notice', **args)
@@ -767,7 +801,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
 
                elif sequences[s] == "nodenetwork_email":
 
-                       if not found_within(recent_actions, 'nodeconfig_notice', 3):
+                       if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
                                args = {}
                                args['hostname'] = hostname
                                args['bmlog'] = conn.get_bootmanager_log().read()
@@ -776,7 +810,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
 
                elif sequences[s] == "update_bootcd_email":
 
-                       if not found_within(recent_actions, 'newalphacd_notice', 3):
+                       if not found_within(recent_actions, 'newalphacd_notice', 3.5):
                                args = {}
                                args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
                                args['hostname'] = hostname
@@ -790,17 +824,17 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                        # require either an exception "/minhw" or other manual intervention.
                        # Definitely need to send out some more EMAIL.
                        # TODO: email notice of broken hardware
-                       if not found_within(recent_actions, 'baddisk_notice', 1):
+                       if not found_within(recent_actions, 'baddisk_notice', 7):
                                print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
                                args = {}
                                args['hostname'] = hostname
                                args['log'] = conn.get_dmesg().read()
 
                                sitehist.sendMessage('baddisk_notice', **args)
-                               conn.set_nodestate('disable')
+                               conn.set_nodestate('disabled')
 
                elif sequences[s] == "update_hardware_email":
-                       if not found_within(recent_actions, 'minimalhardware_notice', 1):
+                       if not found_within(recent_actions, 'minimalhardware_notice', 7):
                                print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
                                args = {}
                                args['hostname'] = hostname
@@ -824,7 +858,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
 
                                args['hostname'] = hostname
                                args['network_config'] = nodenet_str
-                               args['nodenetwork_id'] = net['nodenetwork_id']
+                               args['interface_id'] = net['interface_id']
 
                                sitehist.sendMessage('baddns_notice', **args)