many improvements.
[monitor.git] / bootman.py
index ce9bb6e..981a911 100755 (executable)
@@ -3,25 +3,28 @@
 # Attempt to reboot a node in debug state.
 
 import plc
 # Attempt to reboot a node in debug state.
 
 import plc
-import auth
-api = plc.PLC(auth.auth, auth.plc)
+api = plc.getAuthAPI()
 
 import sys
 import os
 
 import sys
 import os
-import policy
+import const
 
 from getsshkeys import SSHKnownHosts
 
 import subprocess
 import time
 
 from getsshkeys import SSHKnownHosts
 
 import subprocess
 import time
-import soltesz
+import database
+import moncommands
 from sets import Set
 
 import ssh.pxssh as pxssh
 import ssh.fdpexpect as fdpexpect
 import ssh.pexpect as pexpect
 from sets import Set
 
 import ssh.pxssh as pxssh
 import ssh.fdpexpect as fdpexpect
 import ssh.pexpect as pexpect
-from unified_model import *
+from monitor.model import *
 from emailTxt import mailtxt
 from emailTxt import mailtxt
+from nodeconfig import network_config_to_str
+import traceback
+import config
 
 import signal
 class Sopen(subprocess.Popen):
 
 import signal
 class Sopen(subprocess.Popen):
@@ -31,11 +34,7 @@ class Sopen(subprocess.Popen):
 #from Rpyc import SocketConnection, Async
 from Rpyc import SocketConnection, Async
 from Rpyc.Utils import *
 #from Rpyc import SocketConnection, Async
 from Rpyc import SocketConnection, Async
 from Rpyc.Utils import *
-
-def get_fbnode(node):
-       fb = soltesz.dbLoad("findbad")
-       fbnode = fb['nodes'][node]['values']
-       return fbnode
+fb = None
 
 class NodeConnection:
        def __init__(self, connection, node, config):
 
 class NodeConnection:
        def __init__(self, connection, node, config):
@@ -65,8 +64,8 @@ class NodeConnection:
 
        def dump_plconf_file(self):
                c = self.c
 
        def dump_plconf_file(self):
                c = self.c
-               c.modules.sys.path.append("/tmp/source/")
-               c.modules.os.chdir('/tmp/source')
+               self.c.modules.sys.path.append("/tmp/source/")
+               self.c.modules.os.chdir('/tmp/source')
 
                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')
 
                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')
@@ -92,8 +91,8 @@ class NodeConnection:
 
        def compare_and_repair_nodekeys(self):
                c = self.c
 
        def compare_and_repair_nodekeys(self):
                c = self.c
-               c.modules.sys.path.append("/tmp/source/")
-               c.modules.os.chdir('/tmp/source')
+               self.c.modules.sys.path.append("/tmp/source/")
+               self.c.modules.os.chdir('/tmp/source')
 
                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')
 
                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')
@@ -201,7 +200,7 @@ class PlanetLabSession:
                args['port'] = self.port
                args['user'] = 'root'
                args['hostname'] = self.node
                args['port'] = self.port
                args['user'] = 'root'
                args['hostname'] = self.node
-               args['monitordir'] = "/home/soltesz/monitor"
+               args['monitordir'] = config.MONITOR_SCRIPT_ROOT
                ssh_port = 22
 
                if self.nosetup:
                ssh_port = 22
 
                if self.nosetup:
@@ -209,11 +208,11 @@ class PlanetLabSession:
                        return 
 
                # COPY Rpyc files to host
                        return 
 
                # COPY Rpyc files to host
-               cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
+               cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
                if self.verbose: print cmd
                # TODO: Add timeout
                timeout = 120
                if self.verbose: print cmd
                # TODO: Add timeout
                timeout = 120
-               localos = soltesz.CMD()
+               localos = moncommands.CMD()
 
                ret = localos.system(cmd, timeout)
                print ret
 
                ret = localos.system(cmd, timeout)
                print ret
@@ -230,7 +229,7 @@ class PlanetLabSession:
 
                t1 = time.time()
                # KILL any already running servers.
 
                t1 = time.time()
                # KILL any already running servers.
-               ssh = soltesz.SSH(args['user'], args['hostname'], ssh_port)
+               ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
                (ov,ev) = ssh.run_noexcept2("""<<\EOF
             rm -f out.log
             echo "kill server" >> out.log
                (ov,ev) = ssh.run_noexcept2("""<<\EOF
             rm -f out.log
             echo "kill server" >> out.log
@@ -270,7 +269,7 @@ EOF""")
                # TODO: the read() here may block indefinitely.  A better
                # approach that includes a timeout is needed.
                #ret = self.command.stdout.read(5)
                # TODO: the read() here may block indefinitely.  A better
                # approach that includes a timeout is needed.
                #ret = self.command.stdout.read(5)
-               ret = soltesz.read_t(self.command.stdout, 5)
+               ret = moncommands.read_t(self.command.stdout, 5)
 
                t2 = time.time()
                if 'READY' in ret:
 
                t2 = time.time()
                if 'READY' in ret:
@@ -308,7 +307,7 @@ def reboot(hostname, config=None, forced_action=None):
 
        # NOTE: Nothing works if the bootcd is REALLY old.
        #       So, this is the first step.
 
        # NOTE: Nothing works if the bootcd is REALLY old.
        #       So, this is the first step.
-       fbnode = get_fbnode(hostname)
+       fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
        if fbnode['category'] == "OLDBOOTCD":
                print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
                args = {}
        if fbnode['category'] == "OLDBOOTCD":
                print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
                args = {}
@@ -318,7 +317,7 @@ def reboot(hostname, config=None, forced_action=None):
                                                        mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
 
                loginbase = plc.siteId(hostname)
                                                        mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
 
                loginbase = plc.siteId(hostname)
-               m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+               m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
 
                print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                api.UpdateNode(hostname, {'boot_state' : 'disable'})
 
                print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                api.UpdateNode(hostname, {'boot_state' : 'disable'})
@@ -331,7 +330,7 @@ def reboot(hostname, config=None, forced_action=None):
        try:
                k = SSHKnownHosts(); k.update(node); k.write(); del k
        except:
        try:
                k = SSHKnownHosts(); k.update(node); k.write(); del k
        except:
-               import traceback; print traceback.print_exc()
+               print traceback.print_exc()
                return False
 
        try:
                return False
 
        try:
@@ -341,7 +340,7 @@ def reboot(hostname, config=None, forced_action=None):
                        session = PlanetLabSession(node, config.nosetup, config.verbose)
        except Exception, e:
                print "ERROR setting up session for %s" % hostname
                        session = PlanetLabSession(node, config.nosetup, config.verbose)
        except Exception, e:
                print "ERROR setting up session for %s" % hostname
-               import traceback; print traceback.print_exc()
+               print traceback.print_exc()
                print e
                return False
 
                print e
                return False
 
@@ -354,9 +353,8 @@ def reboot(hostname, config=None, forced_action=None):
                        time.sleep(session.timeout*4)
                        conn = session.get_connection(config)
                except:
                        time.sleep(session.timeout*4)
                        conn = session.get_connection(config)
                except:
-                       import traceback; print traceback.print_exc()
+                       print traceback.print_exc()
                        return False
                        return False
-                       
 
        if forced_action == "reboot":
                conn.restart_node('rins')
 
        if forced_action == "reboot":
                conn.restart_node('rins')
@@ -397,25 +395,34 @@ def reboot(hostname, config=None, forced_action=None):
                        ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
 
                        ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
                        ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
 
                        ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
+
+                       ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
+                       ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
+
                        ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
                        ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
                        ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
                        ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
+
                        ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
                        ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
                        ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
                        ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
+
                        ('floppytimeout','floppy0: floppy timeout called'),
                        ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
 
                        ('floppytimeout','floppy0: floppy timeout called'),
                        ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
 
+                       # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
+                       # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
+
                        # floppy0: floppy timeout called
                        # end_request: I/O error, dev fd0, sector 0
 
                        # floppy0: floppy timeout called
                        # end_request: I/O error, dev fd0, sector 0
 
-                       #Buffer I/O error on device dm-2, logical block 8888896
-                       #ata1: status=0x51 { DriveReady SeekComplete Error }
-                       #ata1: error=0x40 { UncorrectableError }
-                       #SCSI error : <0 0 0 0> return code = 0x8000002
-                       #sda: Current: sense key: Medium Error
+                       # Buffer I/O error on device dm-2, logical block 8888896
+                       # ata1: status=0x51 { DriveReady SeekComplete Error }
+                       # ata1: error=0x40 { UncorrectableError }
+                       # SCSI error : <0 0 0 0> return code = 0x8000002
+                       # sda: Current: sense key: Medium Error
                        #       Additional sense: Unrecovered read error - auto reallocate failed
 
                        #       Additional sense: Unrecovered read error - auto reallocate failed
 
-                       #SCSI error : <0 2 0 0> return code = 0x40001
-                       #end_request: I/O error, dev sda, sector 572489600
+                       # SCSI error : <0 2 0 0> return code = 0x40001
+                       # end_request: I/O error, dev sda, sector 572489600
                ]
                id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
                sequence.append(id)
                ]
                id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
                sequence.append(id)
@@ -441,8 +448,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
                                                                                 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
-                       conn.set_nodestate('diag')
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       conn.set_nodestate('disable')
                        return False
 
        print "...Downloading bm.log from %s" % node
                        return False
 
        print "...Downloading bm.log from %s" % node
@@ -491,14 +498,17 @@ def reboot(hostname, config=None, forced_action=None):
                        ('nodehostname' , 'Configured node hostname does not resolve'),
                        ('implementerror', 'Implementation Error'),
                        ('readonlyfs'   , '[Errno 30] Read-only file system'),
                        ('nodehostname' , 'Configured node hostname does not resolve'),
                        ('implementerror', 'Implementation Error'),
                        ('readonlyfs'   , '[Errno 30] Read-only file system'),
+                       ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
                        ('noinstall'    , 'notinstalled'),
                        ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
                        ('noblockdev'   , "No block devices detected."),
                        ('noinstall'    , 'notinstalled'),
                        ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
                        ('noblockdev'   , "No block devices detected."),
+                       ('dnserror'     , 'Name or service not known'),
                        ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
                        ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
                        ('hardwarerequirefail' , 'Hardware requirements not met'),
                        ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
                        ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
                        ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
                        ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
                        ('hardwarerequirefail' , 'Hardware requirements not met'),
                        ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
                        ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
+                       ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
                        ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
                        ('modulefail'   , 'Unable to get list of system modules'),
                        ('writeerror'   , 'write error: No space left on device'),
                        ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
                        ('modulefail'   , 'Unable to get list of system modules'),
                        ('writeerror'   , 'write error: No space left on device'),
@@ -526,11 +536,11 @@ def reboot(hostname, config=None, forced_action=None):
        #  By using the sequence identifier, we guarantee that there will be no
        #  frequent loops.  I'm guessing there is a better way to track loops,
        #  though.
        #  By using the sequence identifier, we guarantee that there will be no
        #  frequent loops.  I'm guessing there is a better way to track loops,
        #  though.
-       if not config.force and pflags.getRecentFlag(s):
-               pflags.setRecentFlag(s)
-               pflags.save() 
-               print "... flag is set or it has already run recently. Skipping %s" % node
-               return True
+       #if not config.force and pflags.getRecentFlag(s):
+       #       pflags.setRecentFlag(s)
+       #       pflags.save() 
+       #       print "... flag is set or it has already run recently. Skipping %s" % node
+       #       return True
 
        sequences = {}
 
 
        sequences = {}
 
@@ -539,6 +549,9 @@ def reboot(hostname, config=None, forced_action=None):
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
+
+                       "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
+
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
                        "bminit-cfg-auth-getplc-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
                        "bminit-cfg-auth-getplc-update-debug-done",
@@ -546,6 +559,7 @@ def reboot(hostname, config=None, forced_action=None):
                        "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
                        "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
                        "bminit-cfg-auth-protoerror-exception-update-debug-done",
                        "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
                        "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
                        "bminit-cfg-auth-protoerror-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
                        "bminit-cfg-auth-getplc-implementerror-update-debug-done",
                        ]:
                sequences.update({n : "restart_bootmanager_boot"})
                        "bminit-cfg-auth-getplc-implementerror-update-debug-done",
                        ]:
                sequences.update({n : "restart_bootmanager_boot"})
@@ -564,7 +578,13 @@ def reboot(hostname, config=None, forced_action=None):
                        "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                        "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                        "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                        "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+                       # actual solution appears to involve removing the bad files, and
+                       # continually trying to boot the node.
+                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
                        ]:
                sequences.update({n : "restart_bootmanager_rins"})
 
                        ]:
                sequences.update({n : "restart_bootmanager_rins"})
 
@@ -598,11 +618,14 @@ def reboot(hostname, config=None, forced_action=None):
 
        # update_node_config_email
        for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
 
        # update_node_config_email
        for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
-                       "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                         "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                         "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
                        ]:
                sequences.update({n : "update_node_config_email"})
 
                        ]:
                sequences.update({n : "update_node_config_email"})
 
-       for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
+       for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
+                          "bminit-cfg-update-exception-nodehostname-update-debug-done", 
+                       ]:
                sequences.update({n : "nodenetwork_email"})
 
        # update_bootcd_email
                sequences.update({n : "nodenetwork_email"})
 
        # update_bootcd_email
@@ -625,6 +648,13 @@ def reboot(hostname, config=None, forced_action=None):
        # broken_hardware_email
        sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
 
        # broken_hardware_email
        sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
 
+       # bad_dns_email
+       for n in [ 
+        "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+               "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+               ]:
+               sequences.update( { n : "bad_dns_email"})
+
        flag_set = True
 
        
        flag_set = True
 
        
@@ -688,9 +718,9 @@ def reboot(hostname, config=None, forced_action=None):
                        m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                True, db='nodeid_persistmessages')
                        loginbase = plc.siteId(hostname)
                        m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                True, db='nodeid_persistmessages')
                        loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                        conn.dump_plconf_file()
                        conn.dump_plconf_file()
-                       conn.set_nodestate('diag')
+                       conn.set_nodestate('disable')
 
                elif sequences[s] == "nodenetwork_email":
                        print "...Sending message to LOOK AT NODE NETWORK"
 
                elif sequences[s] == "nodenetwork_email":
                        print "...Sending message to LOOK AT NODE NETWORK"
@@ -700,9 +730,9 @@ def reboot(hostname, config=None, forced_action=None):
                        m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                True, db='nodenet_persistmessages')
                        loginbase = plc.siteId(hostname)
                        m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                True, db='nodenet_persistmessages')
                        loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                        conn.dump_plconf_file()
                        conn.dump_plconf_file()
-                       conn.set_nodestate('diag')
+                       conn.set_nodestate('disable')
 
                elif sequences[s] == "update_bootcd_email":
                        print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
 
                elif sequences[s] == "update_bootcd_email":
                        print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
@@ -715,7 +745,7 @@ def reboot(hostname, config=None, forced_action=None):
                                                                mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
 
                        loginbase = plc.siteId(hostname)
                                                                mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
 
                        print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                        conn.set_nodestate('disable')
 
                        print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                        conn.set_nodestate('disable')
@@ -733,7 +763,7 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
                                                                                 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                        conn.set_nodestate('disable')
 
                elif sequences[s] == "update_hardware_email":
                        conn.set_nodestate('disable')
 
                elif sequences[s] == "update_hardware_email":
@@ -745,7 +775,30 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
                                                                                 mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+                       conn.set_nodestate('disable')
+
+               elif sequences[s] == "bad_dns_email":
+                       print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
+                       args = {}
+                       try:
+                               node = api.GetNodes(hostname)[0]
+                               net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+                       except:
+                               print traceback.print_exc()
+                               # TODO: API error. Skip the email because not all of the
+                               # required info is available; flag_set will not be recorded.
+                               return False
+                       nodenet_str = network_config_to_str(net)
+
+                       args['hostname'] = hostname
+                       args['network_config'] = nodenet_str
+                       args['nodenetwork_id'] = net['nodenetwork_id']
+                       m = PersistMessage(hostname, mailtxt.baddns[0] % args,
+                                                                                mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
+
+                       loginbase = plc.siteId(hostname)
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                        conn.set_nodestate('disable')
 
        if flag_set:
                        conn.set_nodestate('disable')
 
        if flag_set:
@@ -758,10 +811,11 @@ def reboot(hostname, config=None, forced_action=None):
 # MAIN -------------------------------------------------------------------
 
 def main():
 # MAIN -------------------------------------------------------------------
 
 def main():
-       from config import config
-       from optparse import OptionParser
-       parser = OptionParser()
-       parser.set_defaults(node=None, nodelist=None, child=False, collect=False, nosetup=False, verbose=False, force=None, quiet=False)
+       import parser as parsermodule
+       parser = parsermodule.getParser()
+
+       parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
+                                               force=None, quiet=False)
        parser.add_option("", "--child", dest="child", action="store_true", 
                                                help="This is the child mode of this process.")
        parser.add_option("", "--force", dest="force", metavar="boot_state",
        parser.add_option("", "--child", dest="child", action="store_true", 
                                                help="This is the child mode of this process.")
        parser.add_option("", "--force", dest="force", metavar="boot_state",
@@ -770,16 +824,15 @@ def main():
                                                help="Extra quiet output messages.")
        parser.add_option("", "--verbose", dest="verbose", action="store_true", 
                                                help="Extra debug output messages.")
                                                help="Extra quiet output messages.")
        parser.add_option("", "--verbose", dest="verbose", action="store_true", 
                                                help="Extra debug output messages.")
+       parser.add_option("", "--nonet", dest="nonet", action="store_true", 
+                                               help="Do not setup the network, use existing log files to re-run a test pass.")
        parser.add_option("", "--collect", dest="collect", action="store_true", 
                                                help="No action, just collect dmesg, and bm.log")
        parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
                                                help="Do not perform the orginary setup phase.")
        parser.add_option("", "--collect", dest="collect", action="store_true", 
                                                help="No action, just collect dmesg, and bm.log")
        parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
                                                help="Do not perform the orginary setup phase.")
-       parser.add_option("", "--node", dest="node", metavar="nodename.edu", 
-                                               help="A single node name to try to bring out of debug mode.")
-       parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", 
-                                               help="A list of nodes to bring out of debug mode.")
-       config = config(parser)
-       config.parse_args()
+
+       parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
+       config = parsermodule.parse_args(parser)
 
        if config.nodelist:
                nodes = config.getListFromFile(config.nodelist)
 
        if config.nodelist:
                nodes = config.getListFromFile(config.nodelist)