changes for 3.0

[monitor.git] / bootman.py
diff --git a/bootman.py b/bootman.py

index 4bd503b..5e8b908 100755 (executable)
--- a/bootman.py
+++ b/bootman.py
@@ -26,6 +26,8 @@ from nodeconfig import network_config_to_str
  import traceback
  import config
  
+class ExceptionDoubleSSHError(Exception): pass
+
  import signal
  class Sopen(subprocess.Popen):
         def kill(self, signal = signal.SIGTERM):
@@ -58,14 +60,18 @@ class NodeConnection:
                         return "unknown"
  
         def get_dmesg(self):
+               t_stamp = time.strftime("%Y-%m-%d-%H:%M")
                 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
-               download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
+               download(self.c, "/var/log/dmesg.bm.log", "log/history/%s-dmesg.%s.log" % (t_stamp, self.node))
+               os.system("cp log/history/%s-dmesg.%s.log log/dmesg.%s.log" % (t_stamp, self.node, self.node))
                 log = open("log/dmesg.%s.log" % self.node, 'r')
                 return log
  
         def get_bootmanager_log(self):
-               download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
-               os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+               t_stamp = time.strftime("%Y-%m-%d-%H:%M")
+               download(self.c, "/tmp/bm.log", "log/history/%s-bm.%s.log" % (t_stamp, self.node))
+               #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+               os.system("cp log/history/%s-bm.%s.log log/bm.%s.log" % (t_stamp, self.node, self.node))
                 log = open("log/bm.%s.log" % self.node, 'r')
                 return log
  
@@ -232,7 +238,7 @@ class PlanetLabSession:
                         if ret != 0:
                                 print "\tFAILED TWICE"
                                 #sys.exit(1)
-                               raise Exception("Failed twice trying to login with updated ssh host key")
+                               raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
  
                 t1 = time.time()
                 # KILL any already running servers.
@@ -338,6 +344,8 @@ def reboot(hostname, config=None, forced_action=None):
         try:
                 k = SSHKnownHosts(); k.update(node); k.write(); del k
         except:
+               from nodecommon import email_exception
+               email_exception()
                 print traceback.print_exc()
                 return False
  
@@ -346,9 +354,16 @@ def reboot(hostname, config=None, forced_action=None):
                         session = PlanetLabSession(node, False, True)
                 else:
                         session = PlanetLabSession(node, config.nosetup, config.verbose)
+       except ExceptionDoubleSSHError, e:
+               msg = "ERROR setting up session for %s" % hostname
+               print msg
+               return False
         except Exception, e:
-               print "ERROR setting up session for %s" % hostname
+               msg = "ERROR setting up session for %s" % hostname
+               print msg
                 print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception(msg)
                 print e
                 return False
  
@@ -360,12 +375,18 @@ def reboot(hostname, config=None, forced_action=None):
                 try:
                         time.sleep(session.timeout*4)
                         conn = session.get_connection(config)
+               except EOFError:
+                       # failed twice... no need to report this really, it's just in a
+                       # weird state...
+                       return False
                 except:
                         print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception(node)
                         return False
  
         if forced_action == "reboot":
-               conn.restart_node('rins')
+               conn.restart_node('reinstall')
                 return True
  
         boot_state = conn.get_boot_state()
@@ -572,7 +593,7 @@ def reboot(hostname, config=None, forced_action=None):
                         ]:
                 sequences.update({n : "restart_bootmanager_boot"})
  
-       #       conn.restart_bootmanager('rins')
+       #       conn.restart_bootmanager('reinstall')
         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
@@ -592,13 +613,17 @@ def reboot(hostname, config=None, forced_action=None):
                         # actual solution appears to involve removing the bad files, and
                         # continually trying to boot the node.
                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
                         ]:
                 sequences.update({n : "restart_bootmanager_rins"})
  
         # repair_node_keys
         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
  
-       #   conn.restart_node('rins')
+       #   conn.restart_node('reinstall')
         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
@@ -620,6 +645,7 @@ def reboot(hostname, config=None, forced_action=None):
                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                          "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+                        "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
                          ]:
                 sequences.update({n: "restart_node_boot"})
  
@@ -691,16 +717,16 @@ def reboot(hostname, config=None, forced_action=None):
                         conn.restart_bootmanager('boot')
                 elif sequences[s] == "restart_bootmanager_rins":
                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
-                       conn.restart_bootmanager('rins')
+                       conn.restart_bootmanager('reinstall')
                 elif sequences[s] == "restart_node_rins":
-                       conn.restart_node('rins')
+                       conn.restart_node('reinstall')
                 elif sequences[s] == "restart_node_boot":
                         conn.restart_node('boot')
                 elif sequences[s] == "repair_node_keys":
                         if conn.compare_and_repair_nodekeys():
                                 # the keys either are in sync or were forced in sync.
                                 # so try to reboot the node again.
-                               conn.restart_bootmanager('rins')
+                               conn.restart_bootmanager('reinstall')
                                 pass
                         else:
                                 # there was some failure to synchronize the keys.
@@ -735,7 +761,7 @@ def reboot(hostname, config=None, forced_action=None):
                         args = {}
                         args['hostname'] = hostname
                         args['bmlog'] = conn.get_bootmanager_log().read()
-                       m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
+                       m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                 True, db='nodenet_persistmessages')
                         loginbase = plc.siteId(hostname)
                         emails = plc.getTechEmails(loginbase)
@@ -795,8 +821,10 @@ def reboot(hostname, config=None, forced_action=None):
                         args = {}
                         try:
                                 node = api.GetNodes(hostname)[0]
-                               net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+                               net = api.GetInterfaces(node['interface_ids'])[0]
                         except:
+                               from nodecommon import email_exception
+                               email_exception()
                                 print traceback.print_exc()
                                 # TODO: api error. skip email, b/c all info is not available,
                                 # flag_set will not be recorded.
@@ -805,7 +833,7 @@ def reboot(hostname, config=None, forced_action=None):
  
                         args['hostname'] = hostname
                         args['network_config'] = nodenet_str
-                       args['nodenetwork_id'] = net['nodenetwork_id']
+                       args['interface_id'] = net['interface_id']
                         m = PersistMessage(hostname, mailtxt.baddns[0] % args,
                                                                                  mailtxt.baddns[1] % args, True, db='baddns_persistmessages')