reduce false exceptions that should be ignored or handled better in the code.
[monitor.git] / bootman.py
index 8a1baf5..f8f6d48 100755 (executable)
@@ -26,6 +26,8 @@ from nodeconfig import network_config_to_str
 import traceback
 import config
 
+class ExceptionDoubleSSHError(Exception): pass
+
 import signal
 class Sopen(subprocess.Popen):
        def kill(self, signal = signal.SIGTERM):
@@ -65,7 +67,8 @@ class NodeConnection:
 
        def get_bootmanager_log(self):
                download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
-               os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+               #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+               os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
                log = open("log/bm.%s.log" % self.node, 'r')
                return log
 
@@ -232,7 +235,7 @@ class PlanetLabSession:
                        if ret != 0:
                                print "\tFAILED TWICE"
                                #sys.exit(1)
-                               raise Exception("Failed twice trying to login with updated ssh host key")
+                               raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
 
                t1 = time.time()
                # KILL any already running servers.
@@ -324,7 +327,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                        mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
 
                loginbase = plc.siteId(hostname)
-               m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+               emails = plc.getTechEmails(loginbase)
+               m.send(emails) 
 
                print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                api.UpdateNode(hostname, {'boot_state' : 'disable'})
@@ -337,6 +341,8 @@ def reboot(hostname, config=None, forced_action=None):
        try:
                k = SSHKnownHosts(); k.update(node); k.write(); del k
        except:
+               from nodecommon import email_exception
+               email_exception()
                print traceback.print_exc()
                return False
 
@@ -345,9 +351,16 @@ def reboot(hostname, config=None, forced_action=None):
                        session = PlanetLabSession(node, False, True)
                else:
                        session = PlanetLabSession(node, config.nosetup, config.verbose)
+       except ExceptionDoubleSSHError, e:
+               msg = "ERROR setting up session for %s" % hostname
+               print msg
+               return False
        except Exception, e:
-               print "ERROR setting up session for %s" % hostname
+               msg = "ERROR setting up session for %s" % hostname
+               print msg
                print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception(msg)
                print e
                return False
 
@@ -359,8 +372,14 @@ def reboot(hostname, config=None, forced_action=None):
                try:
                        time.sleep(session.timeout*4)
                        conn = session.get_connection(config)
+               except EOFError:
+                       # failed twice... no need to report this really, it's just in a
+                       # weird state...
+                       return False
                except:
                        print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception(node)
                        return False
 
        if forced_action == "reboot":
@@ -455,7 +474,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.set_nodestate('disable')
                        return False
 
@@ -590,6 +610,8 @@ def reboot(hostname, config=None, forced_action=None):
                        # actual solution appears to involve removing the bad files, and
                        # continually trying to boot the node.
                        "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                        ]:
                sequences.update({n : "restart_bootmanager_rins"})
 
@@ -618,6 +640,7 @@ def reboot(hostname, config=None, forced_action=None):
                         "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+                        "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
                         ]:
                sequences.update({n: "restart_node_boot"})
 
@@ -674,7 +697,7 @@ def reboot(hostname, config=None, forced_action=None):
                m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
                                                                         mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
                m.reset()
-               m.send(['monitor-list@lists.planet-lab.org'])
+               m.send([config.cc_email]) 
 
                conn.restart_bootmanager('boot')
 
@@ -712,7 +735,7 @@ def reboot(hostname, config=None, forced_action=None):
                        m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
                                                                                 mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
                        m.reset()
-                       m.send(['monitor-list@lists.planet-lab.org'])
+                       m.send([config.cc_email]) 
 
                        conn.restart_bootmanager('boot')
 
@@ -723,7 +746,8 @@ def reboot(hostname, config=None, forced_action=None):
                        m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                True, db='nodeid_persistmessages')
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.dump_plconf_file()
                        conn.set_nodestate('disable')
 
@@ -732,10 +756,11 @@ def reboot(hostname, config=None, forced_action=None):
                        args = {}
                        args['hostname'] = hostname
                        args['bmlog'] = conn.get_bootmanager_log().read()
-                       m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
+                       m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                True, db='nodenet_persistmessages')
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.dump_plconf_file()
                        conn.set_nodestate('disable')
 
@@ -750,7 +775,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
 
                        print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                        conn.set_nodestate('disable')
@@ -768,7 +794,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.set_nodestate('disable')
 
                elif sequences[s] == "update_hardware_email":
@@ -780,7 +807,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.set_nodestate('disable')
 
                elif sequences[s] == "bad_dns_email":
@@ -790,6 +818,8 @@ def reboot(hostname, config=None, forced_action=None):
                                node = api.GetNodes(hostname)[0]
                                net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
                        except:
+                               from nodecommon import email_exception
+                               email_exception()
                                print traceback.print_exc()
                                # TODO: api error. skip email, b/c all info is not available,
                                # flag_set will not be recorded.
@@ -803,7 +833,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.set_nodestate('disable')
 
        if flag_set: