bug fix in summary template
[monitor.git] / monitor / bootman.py
index 531f883..fdfadb2 100755 (executable)
@@ -68,18 +68,34 @@ class NodeConnection:
                return "unknown"
 
        def get_dmesg(self):
+               t_stamp = time.strftime("%Y-%m-%d-%H:%M")  # current local time, minute resolution; becomes the history-file prefix
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
-               download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
-               log = open("log/dmesg.%s.log" % self.node, 'r')
+               download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))  # timestamped copy under <MONITOR_BOOTMANAGER_LOG>/history/; presumably download() fetches the node-side file to this local path -- confirm
+               os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))  # refresh the un-timestamped per-node copy; NOTE(review): paths go into the shell unquoted and cp's exit status is ignored
+               log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')  # handle is returned open; closing it is left to the caller
                return log
 
        def get_bootmanager_log(self):
-               download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
-               #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
-               os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
-               log = open("log/bm.%s.log" % self.node, 'r')
+               t_stamp = time.strftime("%Y-%m-%d-%H:%M")  # current local time, minute resolution; becomes the history-file prefix
+               download(self.c, "/tmp/bm.log", "%s/history/%s-bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))  # timestamped copy under <MONITOR_BOOTMANAGER_LOG>/history/; presumably download() fetches the node-side file to this local path -- confirm
+               os.system("cp %s/history/%s-bm.%s.log %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))  # refresh the un-timestamped per-node copy; NOTE(review): paths go into the shell unquoted and cp's exit status is ignored
+               log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')  # handle is returned open; closing it is left to the caller
                return log
 
+
+#      def get_dmesg(self):
+#              self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
+#              download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
+#              log = open("log/dmesg.%s.log" % self.node, 'r')
+#              return log
+#
+#      def get_bootmanager_log(self):
+#              download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
+#              #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+#              os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
+#              log = open("log/bm.%s.log" % self.node, 'r')
+#              return log
+
        def dump_plconf_file(self):
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
@@ -430,11 +446,16 @@ class DebugInterface:
                                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_rins"})
 
                # repair_node_keys
-               sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
+               for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
+                                       "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
+                               ]:
+                       sequences.update({n: "repair_node_keys"})
 
                #   conn.restart_node('reinstall')
                for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
@@ -459,12 +480,15 @@ class DebugInterface:
                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                                 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                                 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
+                                "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
                                 ]:
                        sequences.update({n: "restart_node_boot"})
 
                # update_node_config_email
                for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
                                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                                 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
+                                 "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
                                  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
                                ]:
                        sequences.update({n : "update_node_config_email"})
@@ -620,8 +644,12 @@ class DebugInterface:
 
                return sequence
                
-
 def restore(sitehist, hostname, config=None, forced_action=None):
+       ret = restore_basic(sitehist, hostname, config, forced_action)  # thin wrapper: arguments are forwarded unchanged to restore_basic()
+       session.flush()  # reached only when restore_basic() returns normally; `session` is defined outside this view -- presumably the DB session, confirm
+       return ret  # restore_basic()'s result is passed through untouched
+
+def restore_basic(sitehist, hostname, config=None, forced_action=None):
 
        # NOTE: Nothing works if the bootcd is REALLY old.
        #       So, this is the first step.
@@ -632,27 +660,19 @@ def restore(sitehist, hostname, config=None, forced_action=None):
        if fbnode['observed_category'] == "OLDBOOTCD":
                print "\t...Notify owner to update BootImage!!!"
 
-               if not found_within(recent_actions, 'newbootcd_notice', 3):
+               if not found_within(recent_actions, 'newbootcd_notice', 3.5):
                        sitehist.sendMessage('newbootcd_notice', hostname=hostname)
 
                        print "\tDisabling %s due to out-of-date BootImage" % hostname
-                       api.UpdateNode(hostname, {'boot_state' : 'disable'})
+                       api.UpdateNode(hostname, {'boot_state' : 'disabled'})
 
                # NOTE: nothing else is possible.
                return True
 
        debugnode = DebugInterface(hostname)
        conn = debugnode.getConnection()
-       #print "conn: %s" % conn
-       #print "trying to use conn after returning it."
-       #print conn.c.modules.sys.path
-       #print conn.c.modules.os.path.exists('/tmp/source')
        if type(conn) == type(False): return False
 
-       #if forced_action == "reboot":
-       #       conn.restart_node('reinstall')
-       #       return True
-
        boot_state = conn.get_boot_state()
        if boot_state != "debug":
                print "... %s in %s state: skipping..." % (hostname , boot_state)
@@ -682,11 +702,12 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                        print "...Should investigate.  Skipping node."
                        # TODO: send message related to these errors.
 
-                       if not found_within(recent_actions, 'newbootcd_notice', 3):
+                       if not found_within(recent_actions, 'baddisk_notice', 7):
+                               print "baddisk_notice not found recently"
 
                                log=conn.get_dmesg().read()
                                sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
-                               conn.set_nodestate('disable')
+                               conn.set_nodestate('disabled')
 
                        return False
 
@@ -723,6 +744,8 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                args['sequence'] = s
                args['bmlog'] = conn.get_bootmanager_log().read()
                args['viart'] = False
+               args['saveact'] = True
+               args['ccemail'] = True
 
                sitehist.sendMessage('unknownsequence_notice', **args)
 
@@ -748,7 +771,8 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                        if conn.compare_and_repair_nodekeys():
                                # the keys either are in sync or were forced in sync.
                                # so try to reboot the node again.
-                               conn.restart_bootmanager('reinstall')
+                               # TODO: why was this originally 'reinstall' instead of 'boot'??
+                               conn.restart_bootmanager('boot')
                                pass
                        else:
                                # there was some failure to synchronize the keys.
@@ -760,6 +784,8 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                        args['sequence'] = s
                        args['bmlog'] = conn.get_bootmanager_log().read()
                        args['viart'] = False
+                       args['saveact'] = True
+                       args['ccemail'] = True
 
                        sitehist.sendMessage('unknownsequence_notice', **args)
                        conn.restart_bootmanager('boot')
@@ -767,7 +793,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                # TODO: differentiate this and the 'nodenetwork_email' actions.
                elif sequences[s] == "update_node_config_email":
 
-                       if not found_within(recent_actions, 'nodeconfig_notice', 3):
+                       if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
                                args = {}
                                args['hostname'] = hostname
                                sitehist.sendMessage('nodeconfig_notice', **args)
@@ -775,7 +801,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
 
                elif sequences[s] == "nodenetwork_email":
 
-                       if not found_within(recent_actions, 'nodeconfig_notice', 3):
+                       if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
                                args = {}
                                args['hostname'] = hostname
                                args['bmlog'] = conn.get_bootmanager_log().read()
@@ -784,7 +810,7 @@ def restore(sitehist, hostname, config=None, forced_action=None):
 
                elif sequences[s] == "update_bootcd_email":
 
-                       if not found_within(recent_actions, 'newalphacd_notice', 3):
+                       if not found_within(recent_actions, 'newalphacd_notice', 3.5):
                                args = {}
                                args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
                                args['hostname'] = hostname
@@ -798,17 +824,17 @@ def restore(sitehist, hostname, config=None, forced_action=None):
                        # require either an exception "/minhw" or other manual intervention.
                        # Definitely need to send out some more EMAIL.
                        # TODO: email notice of broken hardware
-                       if not found_within(recent_actions, 'baddisk_notice', 1):
+                       if not found_within(recent_actions, 'baddisk_notice', 7):
                                print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
                                args = {}
                                args['hostname'] = hostname
                                args['log'] = conn.get_dmesg().read()
 
                                sitehist.sendMessage('baddisk_notice', **args)
-                               conn.set_nodestate('disable')
+                               conn.set_nodestate('disabled')
 
                elif sequences[s] == "update_hardware_email":
-                       if not found_within(recent_actions, 'minimalhardware_notice', 1):
+                       if not found_within(recent_actions, 'minimalhardware_notice', 7):
                                print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
                                args = {}
                                args['hostname'] = hostname