# Attempt to reboot a node in debug state.
import plc
-import auth
-api = plc.PLC(auth.auth, auth.plc)
+api = plc.getAuthAPI()
import sys
import os
-import policy
+import const
from getsshkeys import SSHKnownHosts
import subprocess
import time
-import soltesz
+import database
+import moncommands
from sets import Set
import ssh.pxssh as pxssh
import ssh.pexpect as pexpect
from unified_model import *
from emailTxt import mailtxt
+from nodeconfig import network_config_to_str
+import traceback
+import monitorconfig
import signal
class Sopen(subprocess.Popen):
#from Rpyc import SocketConnection, Async
from Rpyc import SocketConnection, Async
from Rpyc.Utils import *
+fb = None
def get_fbnode(node):
- fb = soltesz.dbLoad("findbad")
+ global fb
+ if fb is None:
+ fb = database.dbLoad("findbad")
fbnode = fb['nodes'][node]['values']
return fbnode
def dump_plconf_file(self):
c = self.c
- c.modules.sys.path.append("/tmp/source/")
- c.modules.os.chdir('/tmp/source')
+ self.c.modules.sys.path.append("/tmp/source/")
+ self.c.modules.os.chdir('/tmp/source')
log = c.modules.BootManager.log('/tmp/new.log')
bm = c.modules.BootManager.BootManager(log,'boot')
def compare_and_repair_nodekeys(self):
c = self.c
- c.modules.sys.path.append("/tmp/source/")
- c.modules.os.chdir('/tmp/source')
+ self.c.modules.sys.path.append("/tmp/source/")
+ self.c.modules.os.chdir('/tmp/source')
log = c.modules.BootManager.log('/tmp/new.log')
bm = c.modules.BootManager.BootManager(log,'boot')
args['port'] = self.port
args['user'] = 'root'
args['hostname'] = self.node
- args['monitordir'] = "/home/soltesz/monitor"
+ args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT
ssh_port = 22
if self.nosetup:
return
# COPY Rpyc files to host
- cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
+ cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
if self.verbose: print cmd
# TODO: Add timeout
timeout = 120
- localos = soltesz.CMD()
+ localos = moncommands.CMD()
ret = localos.system(cmd, timeout)
print ret
t1 = time.time()
# KILL any already running servers.
- ssh = soltesz.SSH(args['user'], args['hostname'], ssh_port)
+ ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
(ov,ev) = ssh.run_noexcept2("""<<\EOF
rm -f out.log
echo "kill server" >> out.log
# TODO: the read() here may block indefinitely. Need a better
# approach therefore, that includes a timeout.
#ret = self.command.stdout.read(5)
- ret = soltesz.read_t(self.command.stdout, 5)
+ ret = moncommands.read_t(self.command.stdout, 5)
t2 = time.time()
if 'READY' in ret:
mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
print "\tDisabling %s due to out-of-date BOOTCD" % hostname
api.UpdateNode(hostname, {'boot_state' : 'disable'})
try:
k = SSHKnownHosts(); k.update(node); k.write(); del k
except:
- import traceback; print traceback.print_exc()
+		traceback.print_exc()
return False
try:
session = PlanetLabSession(node, config.nosetup, config.verbose)
except Exception, e:
print "ERROR setting up session for %s" % hostname
- import traceback; print traceback.print_exc()
+		traceback.print_exc()
print e
return False
time.sleep(session.timeout*4)
conn = session.get_connection(config)
except:
- import traceback; print traceback.print_exc()
+		traceback.print_exc()
return False
-
if forced_action == "reboot":
conn.restart_node('rins')
('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
+
+ ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
+ ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
+
('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
+
('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
+
('floppytimeout','floppy0: floppy timeout called'),
('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
+ # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
+ # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
+
# floppy0: floppy timeout called
# end_request: I/O error, dev fd0, sector 0
- #Buffer I/O error on device dm-2, logical block 8888896
- #ata1: status=0x51 { DriveReady SeekComplete Error }
- #ata1: error=0x40 { UncorrectableError }
- #SCSI error : <0 0 0 0> return code = 0x8000002
- #sda: Current: sense key: Medium Error
+ # Buffer I/O error on device dm-2, logical block 8888896
+ # ata1: status=0x51 { DriveReady SeekComplete Error }
+ # ata1: error=0x40 { UncorrectableError }
+ # SCSI error : <0 0 0 0> return code = 0x8000002
+ # sda: Current: sense key: Medium Error
# Additional sense: Unrecovered read error - auto reallocate failed
- #SCSI error : <0 2 0 0> return code = 0x40001
- #end_request: I/O error, dev sda, sector 572489600
+ # SCSI error : <0 2 0 0> return code = 0x40001
+ # end_request: I/O error, dev sda, sector 572489600
]
id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
sequence.append(id)
mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
- conn.set_nodestate('diag')
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+ conn.set_nodestate('disable')
return False
print "...Downloading bm.log from %s" % node
('noinstall' , 'notinstalled'),
('bziperror' , 'bzip2: Data integrity error when decompressing.'),
('noblockdev' , "No block devices detected."),
+ ('dnserror' , 'Name or service not known'),
('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
('hardwarerequirefail' , 'Hardware requirements not met'),
# By using the sequence identifier, we guarantee that there will be no
# frequent loops. I'm guessing there is a better way to track loops,
# though.
- if not config.force and pflags.getRecentFlag(s):
- pflags.setRecentFlag(s)
- pflags.save()
- print "... flag is set or it has already run recently. Skipping %s" % node
- return True
+ #if not config.force and pflags.getRecentFlag(s):
+ # pflags.setRecentFlag(s)
+ # pflags.save()
+ # print "... flag is set or it has already run recently. Skipping %s" % node
+ # return True
sequences = {}
for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
+
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
+
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
"bminit-cfg-auth-getplc-update-debug-done",
"bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
"bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
"bminit-cfg-auth-protoerror-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
"bminit-cfg-auth-getplc-implementerror-update-debug-done",
]:
sequences.update({n : "restart_bootmanager_boot"})
# update_node_config_email
for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
- "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
]:
sequences.update({n : "update_node_config_email"})
- for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
+ for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
+ "bminit-cfg-update-exception-nodehostname-update-debug-done",
+ ]:
sequences.update({n : "nodenetwork_email"})
# update_bootcd_email
# broken_hardware_email
sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+ # bad_dns_email
+ for n in [
+ "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+ "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+ ]:
+ sequences.update( { n : "bad_dns_email"})
+
flag_set = True
m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
True, db='nodeid_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
conn.dump_plconf_file()
- conn.set_nodestate('diag')
+ conn.set_nodestate('disable')
elif sequences[s] == "nodenetwork_email":
print "...Sending message to LOOK AT NODE NETWORK"
m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args,
True, db='nodenet_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
conn.dump_plconf_file()
- conn.set_nodestate('diag')
+ conn.set_nodestate('disable')
elif sequences[s] == "update_bootcd_email":
print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
print "\tDisabling %s due to out-of-date BOOTCD" % hostname
conn.set_nodestate('disable')
mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
conn.set_nodestate('disable')
elif sequences[s] == "update_hardware_email":
mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
+ conn.set_nodestate('disable')
+
+ elif sequences[s] == "bad_dns_email":
+ print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
+ args = {}
+ try:
+ node = api.GetNodes(hostname)[0]
+ net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+ except:
+			traceback.print_exc()
+ # TODO: api error. skip email, b/c all info is not available,
+ # flag_set will not be recorded.
+ return False
+ nodenet_str = network_config_to_str(net)
+
+ args['hostname'] = hostname
+ args['network_config'] = nodenet_str
+ args['nodenetwork_id'] = net['nodenetwork_id']
+ m = PersistMessage(hostname, mailtxt.baddns[0] % args,
+ mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
+
+ loginbase = plc.siteId(hostname)
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
conn.set_nodestate('disable')
if flag_set:
# MAIN -------------------------------------------------------------------
def main():
- from config import config
- from optparse import OptionParser
- parser = OptionParser()
- parser.set_defaults(node=None, nodelist=None, child=False, collect=False, nosetup=False, verbose=False, force=None, quiet=False)
+ import parser as parsermodule
+ parser = parsermodule.getParser()
+
+ parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
+ force=None, quiet=False)
parser.add_option("", "--child", dest="child", action="store_true",
help="This is the child mode of this process.")
parser.add_option("", "--force", dest="force", metavar="boot_state",
help="Extra quiet output messages.")
parser.add_option("", "--verbose", dest="verbose", action="store_true",
help="Extra debug output messages.")
+ parser.add_option("", "--nonet", dest="nonet", action="store_true",
+ help="Do not setup the network, use existing log files to re-run a test pass.")
parser.add_option("", "--collect", dest="collect", action="store_true",
help="No action, just collect dmesg, and bm.log")
parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
help="Do not perform the orginary setup phase.")
- parser.add_option("", "--node", dest="node", metavar="nodename.edu",
- help="A single node name to try to bring out of debug mode.")
- parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt",
- help="A list of nodes to bring out of debug mode.")
- config = config(parser)
- config.parse_args()
+
+ parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
+ config = parsermodule.parse_args(parser)
if config.nodelist:
nodes = config.getListFromFile(config.nodelist)