move clean_policy.py into monitor package

[monitor.git] / bootman.py
diff --git a/bootman.py b/bootman.py

index b9a161f..981a911 100755 (executable)
--- a/bootman.py
+++ b/bootman.py
@@ -7,7 +7,7 @@ api = plc.getAuthAPI()
  
  import sys
  import os
-import policy
+import const
  
  from getsshkeys import SSHKnownHosts
  
@@ -20,11 +20,11 @@ from sets import Set
  import ssh.pxssh as pxssh
  import ssh.fdpexpect as fdpexpect
  import ssh.pexpect as pexpect
-from unified_model import *
+from monitor.model import *
  from emailTxt import mailtxt
  from nodeconfig import network_config_to_str
  import traceback
-import monitorconfig
+import config
  
  import signal
  class Sopen(subprocess.Popen):
@@ -34,11 +34,7 @@ class Sopen(subprocess.Popen):
  #from Rpyc import SocketConnection, Async
  from Rpyc import SocketConnection, Async
  from Rpyc.Utils import *
-
-def get_fbnode(node):
-       fb = database.dbLoad("findbad")
-       fbnode = fb['nodes'][node]['values']
-       return fbnode
+fb = None
  
  class NodeConnection:
         def __init__(self, connection, node, config):
@@ -204,7 +200,7 @@ class PlanetLabSession:
                 args['port'] = self.port
                 args['user'] = 'root'
                 args['hostname'] = self.node
-               args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT
+               args['monitordir'] = config.MONITOR_SCRIPT_ROOT
                 ssh_port = 22
  
                 if self.nosetup:
@@ -311,7 +307,7 @@ def reboot(hostname, config=None, forced_action=None):
  
         # NOTE: Nothing works if the bootcd is REALLY old.
         #       So, this is the first step.
-       fbnode = get_fbnode(hostname)
+       fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
         if fbnode['category'] == "OLDBOOTCD":
                 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
                 args = {}
@@ -321,7 +317,7 @@ def reboot(hostname, config=None, forced_action=None):
                                                         mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
  
                 loginbase = plc.siteId(hostname)
-               m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+               m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
  
                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                 api.UpdateNode(hostname, {'boot_state' : 'disable'})
@@ -359,7 +355,6 @@ def reboot(hostname, config=None, forced_action=None):
                 except:
                         print traceback.print_exc()
                         return False
-                       
  
         if forced_action == "reboot":
                 conn.restart_node('rins')
@@ -400,25 +395,34 @@ def reboot(hostname, config=None, forced_action=None):
                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
  
                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
+
+                       ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
+                       ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
+
                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
+
                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
+
                         ('floppytimeout','floppy0: floppy timeout called'),
                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
  
+                       # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
+                       # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
+
                         # floppy0: floppy timeout called
                         # end_request: I/O error, dev fd0, sector 0
  
-                       #Buffer I/O error on device dm-2, logical block 8888896
-                       #ata1: status=0x51 { DriveReady SeekComplete Error }
-                       #ata1: error=0x40 { UncorrectableError }
-                       #SCSI error : <0 0 0 0> return code = 0x8000002
-                       #sda: Current: sense key: Medium Error
+                       # Buffer I/O error on device dm-2, logical block 8888896
+                       # ata1: status=0x51 { DriveReady SeekComplete Error }
+                       # ata1: error=0x40 { UncorrectableError }
+                       # SCSI error : <0 0 0 0> return code = 0x8000002
+                       # sda: Current: sense key: Medium Error
                         #       Additional sense: Unrecovered read error - auto reallocate failed
  
-                       #SCSI error : <0 2 0 0> return code = 0x40001
-                       #end_request: I/O error, dev sda, sector 572489600
+                       # SCSI error : <0 2 0 0> return code = 0x40001
+                       # end_request: I/O error, dev sda, sector 572489600
                 ]
                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
                 sequence.append(id)
@@ -444,7 +448,7 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
  
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                         conn.set_nodestate('disable')
                         return False
  
@@ -494,6 +498,7 @@ def reboot(hostname, config=None, forced_action=None):
                         ('nodehostname' , 'Configured node hostname does not resolve'),
                         ('implementerror', 'Implementation Error'),
                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
+                       ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
                         ('noinstall'    , 'notinstalled'),
                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
                         ('noblockdev'   , "No block devices detected."),
@@ -503,6 +508,7 @@ def reboot(hostname, config=None, forced_action=None):
                         ('hardwarerequirefail' , 'Hardware requirements not met'),
                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
+                       ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
                         ('modulefail'   , 'Unable to get list of system modules'),
                         ('writeerror'   , 'write error: No space left on device'),
@@ -530,11 +536,11 @@ def reboot(hostname, config=None, forced_action=None):
         #  By using the sequence identifier, we guarantee that there will be no
         #  frequent loops.  I'm guessing there is a better way to track loops,
         #  though.
-       if not config.force and pflags.getRecentFlag(s):
-               pflags.setRecentFlag(s)
-               pflags.save() 
-               print "... flag is set or it has already run recently. Skipping %s" % node
-               return True
+       #if not config.force and pflags.getRecentFlag(s):
+       #       pflags.setRecentFlag(s)
+       #       pflags.save() 
+       #       print "... flag is set or it has already run recently. Skipping %s" % node
+       #       return True
  
         sequences = {}
  
@@ -572,7 +578,13 @@ def reboot(hostname, config=None, forced_action=None):
                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                         "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+                       # actual solution appears to involve removing the bad files, and
+                       # continually trying to boot the node.
+                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
                         ]:
                 sequences.update({n : "restart_bootmanager_rins"})
  
@@ -606,11 +618,14 @@ def reboot(hostname, config=None, forced_action=None):
  
         # update_node_config_email
         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
-                       "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                         "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                         "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
                         ]:
                 sequences.update({n : "update_node_config_email"})
  
-       for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
+       for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
+                          "bminit-cfg-update-exception-nodehostname-update-debug-done", 
+                       ]:
                 sequences.update({n : "nodenetwork_email"})
  
         # update_bootcd_email
@@ -634,7 +649,11 @@ def reboot(hostname, config=None, forced_action=None):
         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
  
         # bad_dns_email
-       sequences.update({"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done" : "bad_dns_email"})
+       for n in [ 
+        "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+               "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+               ]:
+               sequences.update( { n : "bad_dns_email"})
  
         flag_set = True
  
@@ -699,7 +718,7 @@ def reboot(hostname, config=None, forced_action=None):
                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                 True, db='nodeid_persistmessages')
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                         conn.dump_plconf_file()
                         conn.set_nodestate('disable')
  
@@ -711,7 +730,7 @@ def reboot(hostname, config=None, forced_action=None):
                         m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                 True, db='nodenet_persistmessages')
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                         conn.dump_plconf_file()
                         conn.set_nodestate('disable')
  
@@ -726,7 +745,7 @@ def reboot(hostname, config=None, forced_action=None):
                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
  
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
  
                         print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                         conn.set_nodestate('disable')
@@ -744,7 +763,7 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
  
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                         conn.set_nodestate('disable')
  
                 elif sequences[s] == "update_hardware_email":
@@ -756,7 +775,7 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
  
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                         conn.set_nodestate('disable')
  
                 elif sequences[s] == "bad_dns_email":
@@ -779,7 +798,7 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                  mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
  
                         loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
                         conn.set_nodestate('disable')
  
         if flag_set: