X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=bootman.py;h=5e8b908359f025fe9f0cb1bd4e1fbbee5df5b365;hb=refs%2Fheads%2F1.0;hp=b9a161f5aeacf484590b2319d676d9b25d684f54;hpb=c3f2afdc81c6711c3825c82e2cd4970671575438;p=monitor.git diff --git a/bootman.py b/bootman.py index b9a161f..5e8b908 100755 --- a/bootman.py +++ b/bootman.py @@ -7,7 +7,7 @@ api = plc.getAuthAPI() import sys import os -import policy +import const from getsshkeys import SSHKnownHosts @@ -24,7 +24,9 @@ from unified_model import * from emailTxt import mailtxt from nodeconfig import network_config_to_str import traceback -import monitorconfig +import config + +class ExceptionDoubleSSHError(Exception): pass import signal class Sopen(subprocess.Popen): @@ -34,9 +36,12 @@ class Sopen(subprocess.Popen): #from Rpyc import SocketConnection, Async from Rpyc import SocketConnection, Async from Rpyc.Utils import * +fb = None def get_fbnode(node): - fb = database.dbLoad("findbad") + global fb + if fb is None: + fb = database.dbLoad("findbad") fbnode = fb['nodes'][node]['values'] return fbnode @@ -55,14 +60,18 @@ class NodeConnection: return "unknown" def get_dmesg(self): + t_stamp = time.strftime("%Y-%m-%d-%H:%M") self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log") - download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node) + download(self.c, "/var/log/dmesg.bm.log", "log/history/%s-dmesg.%s.log" % (t_stamp, self.node)) + os.system("cp log/history/%s-dmesg.%s.log log/dmesg.%s.log" % (t_stamp, self.node, self.node)) log = open("log/dmesg.%s.log" % self.node, 'r') return log def get_bootmanager_log(self): - download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node) - os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node)) + t_stamp = time.strftime("%Y-%m-%d-%H:%M") + download(self.c, "/tmp/bm.log", "log/history/%s-bm.%s.log" % (t_stamp, self.node)) + #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node)) + os.system("cp log/history/%s-bm.%s.log log/bm.%s.log" % (t_stamp, self.node, self.node)) log = open("log/bm.%s.log" % self.node, 'r') return log @@ -204,7 +213,7 @@ class PlanetLabSession: args['port'] = self.port args['user'] = 'root' args['hostname'] = self.node - args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT + args['monitordir'] = config.MONITOR_SCRIPT_ROOT ssh_port = 22 if self.nosetup: @@ -229,7 +238,7 @@ class PlanetLabSession: if ret != 0: print "\tFAILED TWICE" #sys.exit(1) - raise Exception("Failed twice trying to login with updated ssh host key") + raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key") t1 = time.time() # KILL any already running servers. @@ -321,7 +330,8 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) print "\tDisabling %s due to out-of-date BOOTCD" % hostname api.UpdateNode(hostname, {'boot_state' : 'disable'}) @@ -334,6 +344,8 @@ def reboot(hostname, config=None, forced_action=None): try: k = SSHKnownHosts(); k.update(node); k.write(); del k except: + from nodecommon import email_exception + email_exception() print traceback.print_exc() return False @@ -342,9 +354,16 @@ def reboot(hostname, config=None, forced_action=None): session = PlanetLabSession(node, False, True) else: session = PlanetLabSession(node, config.nosetup, config.verbose) + except ExceptionDoubleSSHError, e: + msg = "ERROR setting up session for %s" % hostname + print msg + return False except Exception, e: - print "ERROR setting up session for %s" % hostname + msg = "ERROR setting up session for %s" % hostname + print msg print traceback.print_exc() + from nodecommon import email_exception + email_exception(msg) print e return False @@ -356,13 +375,18 @@ def reboot(hostname, config=None, forced_action=None): try: time.sleep(session.timeout*4) conn = session.get_connection(config) + except EOFError: + # failed twice... no need to report this really, it's just in a + # weird state... + return False except: print traceback.print_exc() + from nodecommon import email_exception + email_exception(node) return False - if forced_action == "reboot": - conn.restart_node('rins') + conn.restart_node('reinstall') return True boot_state = conn.get_boot_state() @@ -400,25 +424,34 @@ def reboot(hostname, config=None, forced_action=None): ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'), ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'), + + ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'), + ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'), + ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'), ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'), + ('sdXerror' , 'sd\w: Current: sense key: Medium Error'), ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'), + ('floppytimeout','floppy0: floppy timeout called'), ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'), + # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error } + # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263 + # floppy0: floppy timeout called # end_request: I/O error, dev fd0, sector 0 - #Buffer I/O error on device dm-2, logical block 8888896 - #ata1: status=0x51 { DriveReady SeekComplete Error } - #ata1: error=0x40 { UncorrectableError } - #SCSI error : <0 0 0 0> return code = 0x8000002 - #sda: Current: sense key: Medium Error + # Buffer I/O error on device dm-2, logical block 8888896 + # ata1: status=0x51 { DriveReady SeekComplete Error } + # ata1: error=0x40 { UncorrectableError } + # SCSI error : <0 0 0 0> return code = 0x8000002 + # sda: Current: sense key: Medium Error # Additional sense: Unrecovered read error - auto reallocate failed - #SCSI error : <0 2 0 0> return code = 0x40001 - #end_request: I/O error, dev sda, sector 572489600 + # SCSI error : <0 2 0 0> return code = 0x40001 + # end_request: I/O error, dev sda, sector 572489600 ] id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) sequence.append(id) @@ -444,7 +477,8 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) conn.set_nodestate('disable') return False @@ -503,6 +537,7 @@ def reboot(hostname, config=None, forced_action=None): ('hardwarerequirefail' , 'Hardware requirements not met'), ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'), ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"), + ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"), ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'), ('modulefail' , 'Unable to get list of system modules'), ('writeerror' , 'write error: No space left on device'), @@ -530,11 +565,11 @@ def reboot(hostname, config=None, forced_action=None): # By using the sequence identifier, we guarantee that there will be no # frequent loops. I'm guessing there is a better way to track loops, # though. - if not config.force and pflags.getRecentFlag(s): - pflags.setRecentFlag(s) - pflags.save() - print "... flag is set or it has already run recently. Skipping %s" % node - return True + #if not config.force and pflags.getRecentFlag(s): + # pflags.setRecentFlag(s) + # pflags.save() + # print "... flag is set or it has already run recently. Skipping %s" % node + # return True sequences = {} @@ -558,7 +593,7 @@ def reboot(hostname, config=None, forced_action=None): ]: sequences.update({n : "restart_bootmanager_boot"}) - # conn.restart_bootmanager('rins') + # conn.restart_bootmanager('reinstall') for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", @@ -572,14 +607,23 @@ def reboot(hostname, config=None, forced_action=None): "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", + # actual solution appears to involve removing the bad files, and + # continually trying to boot the node. + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done", ]: sequences.update({n : "restart_bootmanager_rins"}) # repair_node_keys sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) - # conn.restart_node('rins') + # conn.restart_node('reinstall') for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", @@ -601,16 +645,20 @@ def reboot(hostname, config=None, forced_action=None): "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", ]: sequences.update({n: "restart_node_boot"}) # update_node_config_email for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", ]: sequences.update({n : "update_node_config_email"}) - for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]: + for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", + "bminit-cfg-update-exception-nodehostname-update-debug-done", + ]: sequences.update({n : "nodenetwork_email"}) # update_bootcd_email @@ -634,7 +682,11 @@ def reboot(hostname, config=None, forced_action=None): sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) # bad_dns_email - sequences.update({"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done" : "bad_dns_email"}) + for n in [ + "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + ]: + sequences.update( { n : "bad_dns_email"}) flag_set = True @@ -650,7 +702,7 @@ def reboot(hostname, config=None, forced_action=None): m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args, mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages') m.reset() - m.send(['monitor-list@lists.planet-lab.org']) + m.send([config.cc_email]) conn.restart_bootmanager('boot') @@ -665,16 +717,16 @@ def reboot(hostname, config=None, forced_action=None): conn.restart_bootmanager('boot') elif sequences[s] == "restart_bootmanager_rins": if config and not config.quiet: print "...Restarting BootManager.py on %s "% node - conn.restart_bootmanager('rins') + conn.restart_bootmanager('reinstall') elif sequences[s] == "restart_node_rins": - conn.restart_node('rins') + conn.restart_node('reinstall') elif sequences[s] == "restart_node_boot": conn.restart_node('boot') elif sequences[s] == "repair_node_keys": if conn.compare_and_repair_nodekeys(): # the keys either are in sync or were forced in sync. # so try to reboot the node again. - conn.restart_bootmanager('rins') + conn.restart_bootmanager('reinstall') pass else: # there was some failure to synchronize the keys. @@ -688,7 +740,7 @@ def reboot(hostname, config=None, forced_action=None): m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args, mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages') m.reset() - m.send(['monitor-list@lists.planet-lab.org']) + m.send([config.cc_email]) conn.restart_bootmanager('boot') @@ -699,7 +751,8 @@ def reboot(hostname, config=None, forced_action=None): m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, True, db='nodeid_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) conn.dump_plconf_file() conn.set_nodestate('disable') @@ -708,10 +761,11 @@ def reboot(hostname, config=None, forced_action=None): args = {} args['hostname'] = hostname args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args, + m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, True, db='nodenet_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) conn.dump_plconf_file() conn.set_nodestate('disable') @@ -726,7 +780,8 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) print "\tDisabling %s due to out-of-date BOOTCD" % hostname conn.set_nodestate('disable') @@ -744,7 +799,8 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) conn.set_nodestate('disable') elif sequences[s] == "update_hardware_email": @@ -756,7 +812,8 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) conn.set_nodestate('disable') elif sequences[s] == "bad_dns_email": @@ -764,8 +821,10 @@ def reboot(hostname, config=None, forced_action=None): args = {} try: node = api.GetNodes(hostname)[0] - net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] + net = api.GetInterfaces(node['interface_ids'])[0] except: + from nodecommon import email_exception + email_exception() print traceback.print_exc() # TODO: api error. skip email, b/c all info is not available, # flag_set will not be recorded. @@ -774,12 +833,13 @@ def reboot(hostname, config=None, forced_action=None): args['hostname'] = hostname args['network_config'] = nodenet_str - args['nodenetwork_id'] = net['nodenetwork_id'] + args['interface_id'] = net['interface_id'] m = PersistMessage(hostname, mailtxt.baddns[0] % args, mailtxt.baddns[1] % args, True, db='baddns_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) conn.set_nodestate('disable') if flag_set: