Reduce false exception reports for errors that should be ignored or handled better in the code.

[monitor.git] / bootman.py
diff --git a/bootman.py b/bootman.py

index d34e6ef..f8f6d48 100755 (executable)
--- a/bootman.py
+++ b/bootman.py
@@ -7,7 +7,7 @@ api = plc.getAuthAPI()
  
  import sys
  import os
-import policy
+import const
  
  from getsshkeys import SSHKnownHosts
  
@@ -24,7 +24,9 @@ from unified_model import *
  from emailTxt import mailtxt
  from nodeconfig import network_config_to_str
  import traceback
-import monitorconfig
+import config
+
+class ExceptionDoubleSSHError(Exception): pass
  
  import signal
  class Sopen(subprocess.Popen):
@@ -34,9 +36,12 @@ class Sopen(subprocess.Popen):
  #from Rpyc import SocketConnection, Async
  from Rpyc import SocketConnection, Async
  from Rpyc.Utils import *
+fb = None
  
  def get_fbnode(node):
-       fb = database.dbLoad("findbad")
+       global fb
+       if fb is None:
+               fb = database.dbLoad("findbad")
         fbnode = fb['nodes'][node]['values']
         return fbnode
  
@@ -62,7 +67,8 @@ class NodeConnection:
  
         def get_bootmanager_log(self):
                 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
-               os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+               #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+               os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
                 log = open("log/bm.%s.log" % self.node, 'r')
                 return log
  
@@ -204,7 +210,7 @@ class PlanetLabSession:
                 args['port'] = self.port
                 args['user'] = 'root'
                 args['hostname'] = self.node
-               args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT
+               args['monitordir'] = config.MONITOR_SCRIPT_ROOT
                 ssh_port = 22
  
                 if self.nosetup:
@@ -229,7 +235,7 @@ class PlanetLabSession:
                         if ret != 0:
                                 print "\tFAILED TWICE"
                                 #sys.exit(1)
-                               raise Exception("Failed twice trying to login with updated ssh host key")
+                               raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
  
                 t1 = time.time()
                 # KILL any already running servers.
@@ -321,7 +327,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                         mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
  
                 loginbase = plc.siteId(hostname)
-               m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+               emails = plc.getTechEmails(loginbase)
+               m.send(emails) 
  
                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                 api.UpdateNode(hostname, {'boot_state' : 'disable'})
@@ -334,6 +341,8 @@ def reboot(hostname, config=None, forced_action=None):
         try:
                 k = SSHKnownHosts(); k.update(node); k.write(); del k
         except:
+               from nodecommon import email_exception
+               email_exception()
                 print traceback.print_exc()
                 return False
  
@@ -342,9 +351,16 @@ def reboot(hostname, config=None, forced_action=None):
                         session = PlanetLabSession(node, False, True)
                 else:
                         session = PlanetLabSession(node, config.nosetup, config.verbose)
+       except ExceptionDoubleSSHError, e:
+               msg = "ERROR setting up session for %s" % hostname
+               print msg
+               return False
         except Exception, e:
-               print "ERROR setting up session for %s" % hostname
+               msg = "ERROR setting up session for %s" % hostname
+               print msg
                 print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception(msg)
                 print e
                 return False
  
@@ -356,10 +372,15 @@ def reboot(hostname, config=None, forced_action=None):
                 try:
                         time.sleep(session.timeout*4)
                         conn = session.get_connection(config)
+               except EOFError:
+                       # failed twice... no need to report this really, it's just in a
+                       # weird state...
+                       return False
                 except:
                         print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception(node)
                         return False
-                       
  
         if forced_action == "reboot":
                 conn.restart_node('rins')
@@ -453,7 +474,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
  
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                         conn.set_nodestate('disable')
                         return False
  
@@ -512,6 +534,7 @@ def reboot(hostname, config=None, forced_action=None):
                         ('hardwarerequirefail' , 'Hardware requirements not met'),
                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
+                       ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
                         ('modulefail'   , 'Unable to get list of system modules'),
                         ('writeerror'   , 'write error: No space left on device'),
@@ -539,11 +562,11 @@ def reboot(hostname, config=None, forced_action=None):
         #  By using the sequence identifier, we guarantee that there will be no
         #  frequent loops.  I'm guessing there is a better way to track loops,
         #  though.
-       if not config.force and pflags.getRecentFlag(s):
-               pflags.setRecentFlag(s)
-               pflags.save() 
-               print "... flag is set or it has already run recently. Skipping %s" % node
-               return True
+       #if not config.force and pflags.getRecentFlag(s):
+       #       pflags.setRecentFlag(s)
+       #       pflags.save() 
+       #       print "... flag is set or it has already run recently. Skipping %s" % node
+       #       return True
  
         sequences = {}
  
@@ -581,7 +604,14 @@ def reboot(hostname, config=None, forced_action=None):
                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                         "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+                       # actual solution appears to involve removing the bad files, and
+                       # continually trying to boot the node.
+                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                         ]:
                 sequences.update({n : "restart_bootmanager_rins"})
  
@@ -610,16 +640,20 @@ def reboot(hostname, config=None, forced_action=None):
                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                          "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+                        "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
                          ]:
                 sequences.update({n: "restart_node_boot"})
  
         # update_node_config_email
         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
-                       "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                         "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                         "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
                         ]:
                 sequences.update({n : "update_node_config_email"})
  
-       for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
+       for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
+                          "bminit-cfg-update-exception-nodehostname-update-debug-done", 
+                       ]:
                 sequences.update({n : "nodenetwork_email"})
  
         # update_bootcd_email
@@ -643,7 +677,11 @@ def reboot(hostname, config=None, forced_action=None):
         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
  
         # bad_dns_email
-       sequences.update({"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done" : "bad_dns_email"})
+       for n in [ 
+        "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+               "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+               ]:
+               sequences.update( { n : "bad_dns_email"})
  
         flag_set = True
  
@@ -659,7 +697,7 @@ def reboot(hostname, config=None, forced_action=None):
                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
                 m.reset()
-               m.send(['monitor-list@lists.planet-lab.org'])
+               m.send([config.cc_email]) 
  
                 conn.restart_bootmanager('boot')
  
@@ -697,7 +735,7 @@ def reboot(hostname, config=None, forced_action=None):
                         m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
                                                                                  mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
                         m.reset()
-                       m.send(['monitor-list@lists.planet-lab.org'])
+                       m.send([config.cc_email]) 
  
                         conn.restart_bootmanager('boot')
  
@@ -708,7 +746,8 @@ def reboot(hostname, config=None, forced_action=None):
                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                 True, db='nodeid_persistmessages')
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                         conn.dump_plconf_file()
                         conn.set_nodestate('disable')
  
@@ -717,10 +756,11 @@ def reboot(hostname, config=None, forced_action=None):
                         args = {}
                         args['hostname'] = hostname
                         args['bmlog'] = conn.get_bootmanager_log().read()
-                       m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
+                       m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                 True, db='nodenet_persistmessages')
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                         conn.dump_plconf_file()
                         conn.set_nodestate('disable')
  
@@ -735,7 +775,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
  
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
  
                         print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                         conn.set_nodestate('disable')
@@ -753,7 +794,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
  
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                         conn.set_nodestate('disable')
  
                 elif sequences[s] == "update_hardware_email":
@@ -765,7 +807,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
  
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                         conn.set_nodestate('disable')
  
                 elif sequences[s] == "bad_dns_email":
@@ -775,6 +818,8 @@ def reboot(hostname, config=None, forced_action=None):
                                 node = api.GetNodes(hostname)[0]
                                 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
                         except:
+                               from nodecommon import email_exception
+                               email_exception()
                                 print traceback.print_exc()
                                 # TODO: api error. skip email, b/c all info is not available,
                                 # flag_set will not be recorded.
@@ -788,7 +833,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                  mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
  
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                         conn.set_nodestate('disable')
  
         if flag_set: