moved nodequery common code to monitor/query.py
[monitor.git] / monitor / bootman.py
index 96bc740..3ebeafe 100755 (executable)
@@ -2,8 +2,6 @@
 
 # Attempt to reboot a node in debug state.
 
-
-
 import os
 import sys
 import time
@@ -14,7 +12,6 @@ import subprocess
 from sets import Set
 
 from monitor.getsshkeys import SSHKnownHosts
-
 from monitor.Rpyc import SocketConnection, Async
 from monitor.Rpyc.Utils import *
 
@@ -36,11 +33,32 @@ from pcucontrol.transports.ssh import pxssh as pxssh
 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
 from pcucontrol.transports.ssh import pexpect as pexpect
 
-
-
 api = plc.getAuthAPI()
 fb = None
 
+def bootmanager_log_name(hostname):  # Build the relative name 'history/<timestamp>-bm.<hostname>.log' for a stored BootManager log.
+       t_stamp = time.strftime("%Y-%m-%d-%H:%M")  # current local time, minute resolution
+       base_filename = "%s-bm.%s.log" % (t_stamp, hostname)
+       short_target_filename = os.path.join('history', base_filename)  # relative path; get_bootmanager_log() prefixes config.MONITOR_BOOTMANAGER_LOG
+       return short_target_filename
+
+def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):  # Build an ActionRecord (action='log', action_type=logtype) pointing at short_log_path.
+       try:
+               node = FindbadNodeRecord.get_latest_by(hostname=hostname)  # latest findbad record for this host
+               loginbase = PlcSite.query.get(node.plc_node_stats['site_id']).plc_site_stats['login_base']  # site login_base, looked up through the node's site_id
+               err = ""
+       except:  # NOTE(review): bare except -- any lookup failure falls back to "unknown" and the traceback is kept in error_string
+               loginbase = "unknown"
+               err = traceback.format_exc()
+
+       act = ActionRecord(loginbase=loginbase,  # NOTE(review): 'act' is never used afterwards; presumably constructing the record is what persists it -- confirm
+                                               hostname=hostname,
+                                               action='log',
+                                               action_type=logtype,
+                                               log_path=short_log_path,
+                                               error_string=err)
+       return  # no value is returned (None)
+       
 
 class ExceptionDoubleSSHError(Exception): pass
 
@@ -76,26 +94,14 @@ class NodeConnection:
                return log
 
        def get_bootmanager_log(self):
-               t_stamp = time.strftime("%Y-%m-%d-%H:%M")
-               download(self.c, "/tmp/bm.log", "%s/history/%s-bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
-               os.system("cp %s/history/%s-bm.%s.log %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
+               bm_name = bootmanager_log_name(self.node)  # timestamped relative name under 'history/'
+               download(self.c, "/tmp/bm.log", "%s/%s" % (config.MONITOR_BOOTMANAGER_LOG, bm_name))  # presumably (connection, remote path, local path) -- confirm against download()
+               #email_exception(self.node, "collected BM log for %s" % self.node)
+               bootmanager_log_action(self.node, bm_name, "collected_bm.log")  # note the collection as a 'log' action of type "collected_bm.log"
+               os.system("cp %s/%s %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, bm_name, config.MONITOR_BOOTMANAGER_LOG, self.node))  # copy to bm.<node>.log, the path opened below; exit status of cp is ignored
                log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
                return log
 
-
-#      def get_dmesg(self):
-#              self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
-#              download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
-#              log = open("log/dmesg.%s.log" % self.node, 'r')
-#              return log
-#
-#      def get_bootmanager_log(self):
-#              download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
-#              #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
-#              os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
-#              log = open("log/bm.%s.log" % self.node, 'r')
-#              return log
-
        def dump_plconf_file(self):
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
@@ -136,7 +142,7 @@ class NodeConnection:
                        print "Running MANUAL fsck on %s" % self.node
                        cmd = "( touch /tmp/BM_RUNNING ;  " + \
                                  "  fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
-                                 "  fsck -v -f -y /dev/planetlab/vserver >> out.fsck 2>&1 ; " + \
+                                 "  fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
                                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                                  "  rm -f /tmp/BM_RUNNING " + \
                                  ") &" 
@@ -255,13 +261,12 @@ class PlanetLabSession:
                self.setup_host()
 
        def get_connection(self, config):
-               conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
-               #i = 0
-               #while i < 3: 
-               #       print i, conn.c.modules.sys.path
-               #       print conn.c.modules.os.path.exists('/tmp/source')
-               #       i+=1
-               #       time.sleep(1)
+               try:  # first attempt to build a NodeConnection over a socket to localhost:self.port
+                       conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
+               except:  # NOTE(review): bare except -- the retry is triggered by any exception, not only connection errors
+                       # NOTE: try twice since this can sometimes fail the first time. If
+                       #               it fails again, let it go.
+                       conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)  # second attempt is unguarded, so its exception propagates to the caller
                return conn
        
        def setup_host(self):
@@ -300,7 +305,7 @@ class PlanetLabSession:
                        print ret
                        if ret != 0:
                                print "\tFAILED TWICE"
-                               #sys.exit(1)
+                               #email_exception("%s rsync failed twice" % self.node)
                                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
 
                t1 = time.time()
@@ -316,19 +321,6 @@ class PlanetLabSession:
             python Rpyc/Servers/forking_server.py &> server.log &
             echo "done" >> out.log
 EOF""")
-               #cmd = """ssh %(user)s@%(hostname)s """ + \
-               #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
-               #cmd = cmd % args
-               #if self.verbose: print cmd
-               ## TODO: Add timeout
-               #print localos.system(cmd,timeout)
-
-               ## START a new rpyc server.
-               #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
-               #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
-               #cmd = cmd % args
-               #if self.verbose: print cmd
-               #print localos.system(cmd,timeout)
                print "setup rpyc server over ssh"
                print ssh.ret
 
@@ -448,13 +440,17 @@ class DebugInterface:
 
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done",
+                               "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
                                "bminit-cfg-auth-getplc-update-debug-done",
+                               "bminit-cfg-auth-protoerror2-debug-done",
                                "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
                                "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
                                "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
                                "bminit-cfg-auth-protoerror-exception-update-debug-done",
                                "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
                                "bminit-cfg-auth-getplc-implementerror-update-debug-done",
+                               "bminit-cfg-auth-authfail2-protoerror2-debug-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_boot"})
 
@@ -484,6 +480,8 @@ class DebugInterface:
                                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
                                "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
+                               "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_rins"})
 
@@ -491,6 +489,9 @@ class DebugInterface:
                for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
                                        "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
                                        "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
+                                       "bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done",
+                                       "bminit-cfg-auth-authfail-debug-done",
+                                       "bminit-cfg-auth-authfail2-authfail-debug-done",
                                ]:
                        sequences.update({n: "repair_node_keys"})
 
@@ -524,52 +525,72 @@ class DebugInterface:
                # fsck_repair
                for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
                                  "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
-                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done"
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done",
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done",
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
+                                 "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done",
                                ]:
                        sequences.update({n : "fsck_repair"})
 
-               # update_node_config_email
+               # nodeconfig_notice
                for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
                                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
                                  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
                                  "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
                                  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
+                                 "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done",
                                  "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
                                ]:
-                       sequences.update({n : "update_node_config_email"})
+                       sequences.update({n : "nodeconfig_notice"})
 
                for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
                                   "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
                                   "bminit-cfg-update-exception-nodehostname-update-debug-done", 
+                                  "bminit-cfg-exception-nodehostname-debug-validate-exception-done",
                                ]:
                        sequences.update({n : "nodenetwork_email"})
 
-               # update_bootcd_email
+               # noblockdevice_notice
                for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done",
                                "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                                "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                                "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
-                               "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
+                               "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
+                               "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
+                               ]:
+                       sequences.update({n : "noblockdevice_notice"})
+
+               # update_bootcd_email
+               for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
                                ]:
                        sequences.update({n : "update_bootcd_email"})
 
                for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                                ]:
-                       sequences.update({n: "suspect_error_email"})
+                       sequences.update({n: "unknownsequence_notice"})
 
-               # update_hardware_email
-               sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
-               sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+               # minimalhardware_notice
+               sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
+               sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
 
-               # broken_hardware_email
-               sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+               # baddisk_notice
+               sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"})
 
-               # bad_dns_email
+               # baddns_notice
                for n in [ 
                 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
                        "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
                        ]:
-                       sequences.update( { n : "bad_dns_email"})
+                       sequences.update( { n : "baddns_notice"})
 
                return sequences
 
@@ -577,7 +598,7 @@ class DebugInterface:
                steps = [
                        ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
                        ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
-                       ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
+                       ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
 
                        ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
 
@@ -648,11 +669,13 @@ class DebugInterface:
                        ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
                        ('exception'    , 'Exception'),
                        ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
+                       ('protoerror2'  , '500 Internal Server Error'),
                        ('protoerror'   , 'XML RPC protocol error'),
                        ('nodehostname' , 'Configured node hostname does not resolve'),
                        ('implementerror', 'Implementation Error'),
                        ('fsckabort'    , 'is mounted.  e2fsck: Cannot continue, aborting'),
                        ('fsckfail'             , 'Running e2fsck -v -p /dev/planetlab/root failed'),
+                       ('fsckfail2'    , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
                        ('readonlyfs'   , '\[Errno 30\] Read-only file system'),
                        ('baddisk'      , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
                        ('noinstall'    , 'notinstalled'),
@@ -672,8 +695,9 @@ class DebugInterface:
                        ('nospace'      , "No space left on device"),
                        ('nonode'       , 'Failed to authenticate call: No such node'),
                        ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
-                       ('bootcheckfail'     , 'BootCheckAuthentication'),
-                       ('bootupdatefail'   , 'BootUpdateNode'),
+                       ('authfail2'    , 'Authentication Failed'),
+                       ('bootcheckfail'  , 'BootCheckAuthentication'),
+                       ('bootupdatefail' , 'BootUpdateNode'),
                ]
                return steps
 
@@ -703,6 +727,8 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
        # NOTE: Nothing works if the bootcd is REALLY old.
        #       So, this is the first step.
 
+       bootman_action = "unknown"
+
        fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
        recent_actions = sitehist.getRecentActions(hostname=hostname)
 
@@ -716,20 +742,20 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                        api.UpdateNode(hostname, {'boot_state' : 'disabled'})
 
                # NOTE: nothing else is possible.
-               return True
+               return "disabled"
 
        debugnode = DebugInterface(hostname)
        conn = debugnode.getConnection()
-       if type(conn) == type(False): return False
+       if type(conn) == type(False): return "connect_failed"
 
        boot_state = conn.get_boot_state()
        if boot_state != "debug":
                print "... %s in %s state: skipping..." % (hostname , boot_state)
-               return boot_state == "boot"
+               return "skipped" #boot_state == "boot"
 
        if conn.bootmanager_running():
                print "...BootManager is currently running.  Skipping host %s" %hostname 
-               return True
+               return "skipped" # True
 
        # Read persistent flags, tagged on one week intervals.
 
@@ -756,15 +782,17 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
 
                                log=conn.get_dmesg().read()
                                sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
-                               conn.set_nodestate('disabled')
+                               #conn.set_nodestate('disabled')
 
-                       return False
+                       return "skipping_baddisk"
 
        print "...Downloading bm.log from %s" %hostname 
        log = conn.get_bootmanager_log()
+       bm_log_data = log.read() # get data
+       log.seek(0)     # reset fd pointer for fdspawn
        child = fdpexpect.fdspawn(log)
 
-       if hasattr(config, 'collect') and config.collect: return True
+       if hasattr(config, 'collect') and config.collect: return "collect"
 
        if config and not config.quiet: print "...Scanning bm.log for errors"
 
@@ -791,7 +819,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                args = {}
                args['hostname'] = hostname
                args['sequence'] = s
-               args['bmlog'] = conn.get_bootmanager_log().read()
+               args['bmlog'] = bm_log_data
                args['viart'] = False
                args['saveact'] = True
                args['ccemail'] = True
@@ -800,11 +828,14 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
 
                conn.restart_bootmanager('boot')
 
+               bootman_action = "restart_bootmanager"
+
                # NOTE: Do not set the pflags value for this sequence if it's unknown.
                # This way, we can check it again after we've fixed it.
                flag_set = False
 
        else:
+               bootman_action = sequences[s]
 
                if   sequences[s] == "restart_bootmanager_boot":
                        print "...Restarting BootManager.py on %s "%hostname 
@@ -823,16 +854,23 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                                # the keys either are in sync or were forced in sync.
                                # so try to start BM again.
                                conn.restart_bootmanager(conn.get_nodestate())
-                               pass
                        else:
                                # there was some failure to synchronize the keys.
                                print "...Unable to repair node keys on %s" %hostname 
+                               if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
+                                       args = {}
+                                       args['hostname'] = hostname
+                                       sitehist.sendMessage('nodeconfig_notice', **args)
+                                       conn.dump_plconf_file()
+                               else:
+                                       # NOTE: do not add a new action record
+                                       return ""
 
-               elif sequences[s] == "suspect_error_email":
+               elif sequences[s] == "unknownsequence_notice":
                        args = {}
                        args['hostname'] = hostname
                        args['sequence'] = s
-                       args['bmlog'] = conn.get_bootmanager_log().read()
+                       args['bmlog'] = bm_log_data
                        args['viart'] = False
                        args['saveact'] = True
                        args['ccemail'] = True
@@ -840,36 +878,42 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                        sitehist.sendMessage('unknownsequence_notice', **args)
                        conn.restart_bootmanager('boot')
 
-               # TODO: differentiate this and the 'nodenetwork_email' actions.
-               elif sequences[s] == "update_node_config_email":
+               elif sequences[s] == "nodeconfig_notice":
 
                        if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
                                args = {}
                                args['hostname'] = hostname
                                sitehist.sendMessage('nodeconfig_notice', **args)
                                conn.dump_plconf_file()
+                       else:
+                               # NOTE: do not add a new action record
+                               return ""
 
                elif sequences[s] == "nodenetwork_email":
 
                        if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
                                args = {}
                                args['hostname'] = hostname
-                               args['bmlog'] = conn.get_bootmanager_log().read()
+                               args['bmlog'] = bm_log_data
                                sitehist.sendMessage('nodeconfig_notice', **args)
                                conn.dump_plconf_file()
+                       else:
+                               # NOTE: do not add a new action record
+                               return ""
 
-               elif sequences[s] == "update_bootcd_email":
+               elif sequences[s] == "noblockdevice_notice":
 
-                       if not found_within(recent_actions, 'newalphacd_notice', 3.5):
+                       if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
                                args = {}
-                               args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
+                               #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
                                args['hostname'] = hostname
                        
-                               sitehist.sendMessage('newalphacd_notice', **args)
-
-                               print "\tDisabling %s due to out-of-date BOOTCD" % hostname
+                               sitehist.sendMessage('noblockdevice_notice', **args)
+                       else:
+                               # NOTE: do not add a new action record
+                               return ""
 
-               elif sequences[s] == "broken_hardware_email":
+               elif sequences[s] == "baddisk_notice":
                        # MAKE An ACTION record that this host has failed hardware.  May
                        # require either an exception "/minhw" or other manual intervention.
                        # Definitely need to send out some more EMAIL.
@@ -881,17 +925,23 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                                args['log'] = conn.get_dmesg().read()
 
                                sitehist.sendMessage('baddisk_notice', **args)
-                               conn.set_nodestate('disabled')
+                               #conn.set_nodestate('disabled')
+                       else:
+                               # NOTE: do not add a new action record
+                               return ""
 
-               elif sequences[s] == "update_hardware_email":
+               elif sequences[s] == "minimalhardware_notice":
                        if not found_within(recent_actions, 'minimalhardware_notice', 7):
                                print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
                                args = {}
                                args['hostname'] = hostname
-                               args['bmlog'] = conn.get_bootmanager_log().read()
+                               args['bmlog'] = bm_log_data
                                sitehist.sendMessage('minimalhardware_notice', **args)
+                       else:
+                               # NOTE: do not add a new action record
+                               return ""
 
-               elif sequences[s] == "bad_dns_email":
+               elif sequences[s] == "baddns_notice":
                        if not found_within(recent_actions, 'baddns_notice', 1):
                                print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
                                args = {}
@@ -903,7 +953,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                                        print traceback.print_exc()
                                        # TODO: api error. skip email, b/c all info is not available,
                                        # flag_set will not be recorded.
-                                       return False
+                                       return "exception"
                                nodenet_str = network_config_to_str(net)
 
                                args['hostname'] = hostname
@@ -911,8 +961,11 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                                args['interface_id'] = net['interface_id']
 
                                sitehist.sendMessage('baddns_notice', **args)
+                       else:
+                               # NOTE: do not add a new action record
+                               return ""
 
-       return True
+       return bootman_action
        
 
 # MAIN -------------------------------------------------------------------