changes for 3.0
[monitor.git] / bootman.py
index c3116bc..5e8b908 100755 (executable)
@@ -7,7 +7,7 @@ api = plc.getAuthAPI()
 
 import sys
 import os
-import policy
+import const
 
 from getsshkeys import SSHKnownHosts
 
@@ -24,7 +24,9 @@ from unified_model import *
 from emailTxt import mailtxt
 from nodeconfig import network_config_to_str
 import traceback
-import monitorconfig
+import config
+
+class ExceptionDoubleSSHError(Exception): pass
 
 import signal
 class Sopen(subprocess.Popen):
@@ -34,9 +36,12 @@ class Sopen(subprocess.Popen):
 #from Rpyc import SocketConnection, Async
 from Rpyc import SocketConnection, Async
 from Rpyc.Utils import *
+fb = None
 
 def get_fbnode(node):
-       fb = database.dbLoad("findbad")
+       global fb
+       if fb is None:
+               fb = database.dbLoad("findbad")
        fbnode = fb['nodes'][node]['values']
        return fbnode
 
@@ -55,14 +60,18 @@ class NodeConnection:
                        return "unknown"
 
        def get_dmesg(self):
+               t_stamp = time.strftime("%Y-%m-%d-%H:%M")
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
-               download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
+               download(self.c, "/var/log/dmesg.bm.log", "log/history/%s-dmesg.%s.log" % (t_stamp, self.node))
+               os.system("cp log/history/%s-dmesg.%s.log log/dmesg.%s.log" % (t_stamp, self.node, self.node))
                log = open("log/dmesg.%s.log" % self.node, 'r')
                return log
 
        def get_bootmanager_log(self):
-               download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
-               os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+               t_stamp = time.strftime("%Y-%m-%d-%H:%M")
+               download(self.c, "/tmp/bm.log", "log/history/%s-bm.%s.log" % (t_stamp, self.node))
+               #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+               os.system("cp log/history/%s-bm.%s.log log/bm.%s.log" % (t_stamp, self.node, self.node))
                log = open("log/bm.%s.log" % self.node, 'r')
                return log
 
@@ -204,7 +213,7 @@ class PlanetLabSession:
                args['port'] = self.port
                args['user'] = 'root'
                args['hostname'] = self.node
-               args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT
+               args['monitordir'] = config.MONITOR_SCRIPT_ROOT
                ssh_port = 22
 
                if self.nosetup:
@@ -229,7 +238,7 @@ class PlanetLabSession:
                        if ret != 0:
                                print "\tFAILED TWICE"
                                #sys.exit(1)
-                               raise Exception("Failed twice trying to login with updated ssh host key")
+                               raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
 
                t1 = time.time()
                # KILL any already running servers.
@@ -321,7 +330,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                        mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
 
                loginbase = plc.siteId(hostname)
-               m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+               emails = plc.getTechEmails(loginbase)
+               m.send(emails) 
 
                print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                api.UpdateNode(hostname, {'boot_state' : 'disable'})
@@ -334,6 +344,8 @@ def reboot(hostname, config=None, forced_action=None):
        try:
                k = SSHKnownHosts(); k.update(node); k.write(); del k
        except:
+               from nodecommon import email_exception
+               email_exception()
                print traceback.print_exc()
                return False
 
@@ -342,9 +354,16 @@ def reboot(hostname, config=None, forced_action=None):
                        session = PlanetLabSession(node, False, True)
                else:
                        session = PlanetLabSession(node, config.nosetup, config.verbose)
+       except ExceptionDoubleSSHError, e:
+               msg = "ERROR setting up session for %s" % hostname
+               print msg
+               return False
        except Exception, e:
-               print "ERROR setting up session for %s" % hostname
+               msg = "ERROR setting up session for %s" % hostname
+               print msg
                print traceback.print_exc()
+               from nodecommon import email_exception
+               email_exception(msg)
                print e
                return False
 
@@ -356,13 +375,18 @@ def reboot(hostname, config=None, forced_action=None):
                try:
                        time.sleep(session.timeout*4)
                        conn = session.get_connection(config)
+               except EOFError:
+                       # second failure (EOF from get_connection): the node is just in a
+                       # weird state, so give up quietly here without emailing a report.
+                       return False
                except:
                        print traceback.print_exc()
+                       from nodecommon import email_exception
+                       email_exception(node)
                        return False
-                       
 
        if forced_action == "reboot":
-               conn.restart_node('rins')
+               conn.restart_node('reinstall')
                return True
 
        boot_state = conn.get_boot_state()
@@ -400,25 +424,34 @@ def reboot(hostname, config=None, forced_action=None):
                        ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
 
                        ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
+
+                       ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
+                       ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
+
                        ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
                        ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
+
                        ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
                        ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
+
                        ('floppytimeout','floppy0: floppy timeout called'),
                        ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
 
+                       # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
+                       # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
+
                        # floppy0: floppy timeout called
                        # end_request: I/O error, dev fd0, sector 0
 
-                       #Buffer I/O error on device dm-2, logical block 8888896
-                       #ata1: status=0x51 { DriveReady SeekComplete Error }
-                       #ata1: error=0x40 { UncorrectableError }
-                       #SCSI error : <0 0 0 0> return code = 0x8000002
-                       #sda: Current: sense key: Medium Error
+                       # Buffer I/O error on device dm-2, logical block 8888896
+                       # ata1: status=0x51 { DriveReady SeekComplete Error }
+                       # ata1: error=0x40 { UncorrectableError }
+                       # SCSI error : <0 0 0 0> return code = 0x8000002
+                       # sda: Current: sense key: Medium Error
                        #       Additional sense: Unrecovered read error - auto reallocate failed
 
-                       #SCSI error : <0 2 0 0> return code = 0x40001
-                       #end_request: I/O error, dev sda, sector 572489600
+                       # SCSI error : <0 2 0 0> return code = 0x40001
+                       # end_request: I/O error, dev sda, sector 572489600
                ]
                id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
                sequence.append(id)
@@ -444,8 +477,9 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
-                       conn.set_nodestate('diag')
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
+                       conn.set_nodestate('disable')
                        return False
 
        print "...Downloading bm.log from %s" % node
@@ -503,6 +537,7 @@ def reboot(hostname, config=None, forced_action=None):
                        ('hardwarerequirefail' , 'Hardware requirements not met'),
                        ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
                        ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
+                       ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
                        ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
                        ('modulefail'   , 'Unable to get list of system modules'),
                        ('writeerror'   , 'write error: No space left on device'),
@@ -530,11 +565,11 @@ def reboot(hostname, config=None, forced_action=None):
        #  By using the sequence identifier, we guarantee that there will be no
        #  frequent loops.  I'm guessing there is a better way to track loops,
        #  though.
-       if not config.force and pflags.getRecentFlag(s):
-               pflags.setRecentFlag(s)
-               pflags.save() 
-               print "... flag is set or it has already run recently. Skipping %s" % node
-               return True
+       #if not config.force and pflags.getRecentFlag(s):
+       #       pflags.setRecentFlag(s)
+       #       pflags.save() 
+       #       print "... flag is set or it has already run recently. Skipping %s" % node
+       #       return True
 
        sequences = {}
 
@@ -558,7 +593,7 @@ def reboot(hostname, config=None, forced_action=None):
                        ]:
                sequences.update({n : "restart_bootmanager_boot"})
 
-       #       conn.restart_bootmanager('rins')
+       #       conn.restart_bootmanager('reinstall')
        for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
                        "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
@@ -572,14 +607,23 @@ def reboot(hostname, config=None, forced_action=None):
                        "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                        "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+                       "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                        "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+                       # actual solution appears to involve removing the bad files, and
+                       # continually trying to boot the node.
+                       "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
+                       "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+                       "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
                        ]:
                sequences.update({n : "restart_bootmanager_rins"})
 
        # repair_node_keys
        sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
 
-       #   conn.restart_node('rins')
+       #   conn.restart_node('reinstall')
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                        "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
                        "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
@@ -601,16 +645,20 @@ def reboot(hostname, config=None, forced_action=None):
                         "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+                        "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
                         ]:
                sequences.update({n: "restart_node_boot"})
 
        # update_node_config_email
        for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
-                       "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                         "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+                         "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
                        ]:
                sequences.update({n : "update_node_config_email"})
 
-       for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
+       for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
+                          "bminit-cfg-update-exception-nodehostname-update-debug-done", 
+                       ]:
                sequences.update({n : "nodenetwork_email"})
 
        # update_bootcd_email
@@ -634,7 +682,11 @@ def reboot(hostname, config=None, forced_action=None):
        sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
 
        # bad_dns_email
-       sequences.update({"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done" : "bad_dns_email"})
+       for n in [ 
+        "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+               "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+               ]:
+               sequences.update( { n : "bad_dns_email"})
 
        flag_set = True
 
@@ -650,7 +702,7 @@ def reboot(hostname, config=None, forced_action=None):
                m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
                                                                         mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
                m.reset()
-               m.send(['monitor-list@lists.planet-lab.org'])
+               m.send([config.cc_email]) 
 
                conn.restart_bootmanager('boot')
 
@@ -665,16 +717,16 @@ def reboot(hostname, config=None, forced_action=None):
                        conn.restart_bootmanager('boot')
                elif sequences[s] == "restart_bootmanager_rins":
                        if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
-                       conn.restart_bootmanager('rins')
+                       conn.restart_bootmanager('reinstall')
                elif sequences[s] == "restart_node_rins":
-                       conn.restart_node('rins')
+                       conn.restart_node('reinstall')
                elif sequences[s] == "restart_node_boot":
                        conn.restart_node('boot')
                elif sequences[s] == "repair_node_keys":
                        if conn.compare_and_repair_nodekeys():
                                # the keys either are in sync or were forced in sync.
                                # so try to reboot the node again.
-                               conn.restart_bootmanager('rins')
+                               conn.restart_bootmanager('reinstall')
                                pass
                        else:
                                # there was some failure to synchronize the keys.
@@ -688,7 +740,7 @@ def reboot(hostname, config=None, forced_action=None):
                        m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
                                                                                 mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
                        m.reset()
-                       m.send(['monitor-list@lists.planet-lab.org'])
+                       m.send([config.cc_email]) 
 
                        conn.restart_bootmanager('boot')
 
@@ -699,21 +751,23 @@ def reboot(hostname, config=None, forced_action=None):
                        m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                True, db='nodeid_persistmessages')
                        loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.dump_plconf_file()
-                       conn.set_nodestate('diag')
+                       conn.set_nodestate('disable')
 
                elif sequences[s] == "nodenetwork_email":
                        print "...Sending message to LOOK AT NODE NETWORK"
                        args = {}
                        args['hostname'] = hostname
                        args['bmlog'] = conn.get_bootmanager_log().read()
-                       m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
+                       m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
                                                                True, db='nodenet_persistmessages')
                        loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.dump_plconf_file()
-                       conn.set_nodestate('diag')
+                       conn.set_nodestate('disable')
 
                elif sequences[s] == "update_bootcd_email":
                        print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
@@ -726,7 +780,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
 
                        print "\tDisabling %s due to out-of-date BOOTCD" % hostname
                        conn.set_nodestate('disable')
@@ -744,7 +799,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.set_nodestate('disable')
 
                elif sequences[s] == "update_hardware_email":
@@ -756,7 +812,8 @@ def reboot(hostname, config=None, forced_action=None):
                                                                                 mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.set_nodestate('disable')
 
                elif sequences[s] == "bad_dns_email":
@@ -764,8 +821,10 @@ def reboot(hostname, config=None, forced_action=None):
                        args = {}
                        try:
                                node = api.GetNodes(hostname)[0]
-                               net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+                               net = api.GetInterfaces(node['interface_ids'])[0]
                        except:
+                               from nodecommon import email_exception
+                               email_exception()
                                print traceback.print_exc()
                                # TODO: api error. skip email, b/c all info is not available,
                                # flag_set will not be recorded.
@@ -774,12 +833,13 @@ def reboot(hostname, config=None, forced_action=None):
 
                        args['hostname'] = hostname
                        args['network_config'] = nodenet_str
-                       args['nodenetwork_id'] = net['nodenetwork_id']
+                       args['interface_id'] = net['interface_id']
                        m = PersistMessage(hostname, mailtxt.baddns[0] % args,
                                                                                 mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
 
                        loginbase = plc.siteId(hostname)
-                       m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+                       emails = plc.getTechEmails(loginbase)
+                       m.send(emails) 
                        conn.set_nodestate('disable')
 
        if flag_set:
@@ -792,10 +852,11 @@ def reboot(hostname, config=None, forced_action=None):
 # MAIN -------------------------------------------------------------------
 
 def main():
-       from config import config
-       from optparse import OptionParser
-       parser = OptionParser()
-       parser.set_defaults(node=None, nodelist=None, child=False, collect=False, nosetup=False, verbose=False, force=None, quiet=False)
+       import parser as parsermodule
+       parser = parsermodule.getParser()
+
+       parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
+                                               force=None, quiet=False)
        parser.add_option("", "--child", dest="child", action="store_true", 
                                                help="This is the child mode of this process.")
        parser.add_option("", "--force", dest="force", metavar="boot_state",
@@ -810,12 +871,9 @@ def main():
                                                help="No action, just collect dmesg, and bm.log")
        parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
                                                help="Do not perform the orginary setup phase.")
-       parser.add_option("", "--node", dest="node", metavar="nodename.edu", 
-                                               help="A single node name to try to bring out of debug mode.")
-       parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", 
-                                               help="A list of nodes to bring out of debug mode.")
-       config = config(parser)
-       config.parse_args()
+
+       parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
+       config = parsermodule.parse_args(parser)
 
        if config.nodelist:
                nodes = config.getListFromFile(config.nodelist)