X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=bootman.py;h=22201cb69db34b69f0e91e8c732df91ef32f87be;hb=4d56ef5473c6486c321dd2797be45b45b0606dae;hp=ce9bb6ec2ff57e32560745b7e825ca0a784f021d;hpb=d0652340b89d51c6115edb13d5c7c72b34dea66f;p=monitor.git diff --git a/bootman.py b/bootman.py index ce9bb6e..22201cb 100755 --- a/bootman.py +++ b/bootman.py @@ -2,26 +2,29 @@ # Attempt to reboot a node in debug state. -import plc -import auth -api = plc.PLC(auth.auth, auth.plc) +from monitor import const +from monitor.database.info.model import * +from monitor.wrapper import plc +api = plc.getAuthAPI() import sys import os -import policy from getsshkeys import SSHKnownHosts import subprocess import time -import soltesz +from monitor.util import command as moncommands from sets import Set -import ssh.pxssh as pxssh -import ssh.fdpexpect as fdpexpect -import ssh.pexpect as pexpect -from unified_model import * -from emailTxt import mailtxt +from pcucontrol.transports.ssh import pxssh as pxssh +from pcucontrol.transports.ssh import fdpexpect as fdpexpect +from pcucontrol.transports.ssh import pexpect as pexpect +from monitor.model import * +from monitor.wrapper.emailTxt import mailtxt +from nodeconfig import network_config_to_str +import traceback +from monitor import config import signal class Sopen(subprocess.Popen): @@ -31,11 +34,7 @@ class Sopen(subprocess.Popen): #from Rpyc import SocketConnection, Async from Rpyc import SocketConnection, Async from Rpyc.Utils import * - -def get_fbnode(node): - fb = soltesz.dbLoad("findbad") - fbnode = fb['nodes'][node]['values'] - return fbnode +fb = None class NodeConnection: def __init__(self, connection, node, config): @@ -65,8 +64,8 @@ class NodeConnection: def dump_plconf_file(self): c = self.c - c.modules.sys.path.append("/tmp/source/") - c.modules.os.chdir('/tmp/source') + self.c.modules.sys.path.append("/tmp/source/") + self.c.modules.os.chdir('/tmp/source') log = c.modules.BootManager.log('/tmp/new.log') bm = c.modules.BootManager.BootManager(log,'boot') @@ -92,8 +91,8 @@ class NodeConnection: def compare_and_repair_nodekeys(self): c = self.c - c.modules.sys.path.append("/tmp/source/") - c.modules.os.chdir('/tmp/source') + self.c.modules.sys.path.append("/tmp/source/") + self.c.modules.os.chdir('/tmp/source') log = c.modules.BootManager.log('/tmp/new.log') bm = c.modules.BootManager.BootManager(log,'boot') @@ -201,7 +200,7 @@ class PlanetLabSession: args['port'] = self.port args['user'] = 'root' args['hostname'] = self.node - args['monitordir'] = "/home/soltesz/monitor" + args['monitordir'] = config.MONITOR_SCRIPT_ROOT ssh_port = 22 if self.nosetup: @@ -209,11 +208,11 @@ class PlanetLabSession: return # COPY Rpyc files to host - cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args + cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args if self.verbose: print cmd # TODO: Add timeout timeout = 120 - localos = soltesz.CMD() + localos = moncommands.CMD() ret = localos.system(cmd, timeout) print ret @@ -230,7 +229,7 @@ class PlanetLabSession: t1 = time.time() # KILL any already running servers. - ssh = soltesz.SSH(args['user'], args['hostname'], ssh_port) + ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port) (ov,ev) = ssh.run_noexcept2("""<<\EOF rm -f out.log echo "kill server" >> out.log @@ -270,7 +269,7 @@ EOF""") # TODO: the read() here may block indefinitely. Need a better # approach therefore, that includes a timeout. #ret = self.command.stdout.read(5) - ret = soltesz.read_t(self.command.stdout, 5) + ret = moncommands.read_t(self.command.stdout, 5) t2 = time.time() if 'READY' in ret: @@ -308,7 +307,7 @@ def reboot(hostname, config=None, forced_action=None): # NOTE: Nothing works if the bootcd is REALLY old. # So, this is the first step. - fbnode = get_fbnode(hostname) + fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() if fbnode['category'] == "OLDBOOTCD": print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" args = {} @@ -318,7 +317,8 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) print "\tDisabling %s due to out-of-date BOOTCD" % hostname api.UpdateNode(hostname, {'boot_state' : 'disable'}) @@ -331,7 +331,7 @@ def reboot(hostname, config=None, forced_action=None): try: k = SSHKnownHosts(); k.update(node); k.write(); del k except: - import traceback; print traceback.print_exc() + print traceback.print_exc() return False try: @@ -341,7 +341,7 @@ def reboot(hostname, config=None, forced_action=None): session = PlanetLabSession(node, config.nosetup, config.verbose) except Exception, e: print "ERROR setting up session for %s" % hostname - import traceback; print traceback.print_exc() + print traceback.print_exc() print e return False @@ -354,9 +354,8 @@ def reboot(hostname, config=None, forced_action=None): time.sleep(session.timeout*4) conn = session.get_connection(config) except: - import traceback; print traceback.print_exc() + print traceback.print_exc() return False - if forced_action == "reboot": conn.restart_node('rins') @@ -397,25 +396,34 @@ def reboot(hostname, config=None, forced_action=None): ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'), ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'), + + ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'), + ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'), + ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'), ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'), + ('sdXerror' , 'sd\w: Current: sense key: Medium Error'), ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'), + ('floppytimeout','floppy0: floppy timeout called'), ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'), + # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error } + # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263 + # floppy0: floppy timeout called # end_request: I/O error, dev fd0, sector 0 - #Buffer I/O error on device dm-2, logical block 8888896 - #ata1: status=0x51 { DriveReady SeekComplete Error } - #ata1: error=0x40 { UncorrectableError } - #SCSI error : <0 0 0 0> return code = 0x8000002 - #sda: Current: sense key: Medium Error + # Buffer I/O error on device dm-2, logical block 8888896 + # ata1: status=0x51 { DriveReady SeekComplete Error } + # ata1: error=0x40 { UncorrectableError } + # SCSI error : <0 0 0 0> return code = 0x8000002 + # sda: Current: sense key: Medium Error # Additional sense: Unrecovered read error - auto reallocate failed - #SCSI error : <0 2 0 0> return code = 0x40001 - #end_request: I/O error, dev sda, sector 572489600 + # SCSI error : <0 2 0 0> return code = 0x40001 + # end_request: I/O error, dev sda, sector 572489600 ] id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) sequence.append(id) @@ -441,8 +449,9 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) - conn.set_nodestate('diag') + emails = plc.getTechEmails(loginbase) + m.send(emails) + conn.set_nodestate('disable') return False print "...Downloading bm.log from %s" % node @@ -491,14 +500,17 @@ def reboot(hostname, config=None, forced_action=None): ('nodehostname' , 'Configured node hostname does not resolve'), ('implementerror', 'Implementation Error'), ('readonlyfs' , '[Errno 30] Read-only file system'), + ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"), ('noinstall' , 'notinstalled'), ('bziperror' , 'bzip2: Data integrity error when decompressing.'), ('noblockdev' , "No block devices detected."), + ('dnserror' , 'Name or service not known'), ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'), ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'), ('hardwarerequirefail' , 'Hardware requirements not met'), ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'), ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"), + ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"), ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'), ('modulefail' , 'Unable to get list of system modules'), ('writeerror' , 'write error: No space left on device'), @@ -526,11 +538,11 @@ def reboot(hostname, config=None, forced_action=None): # By using the sequence identifier, we guarantee that there will be no # frequent loops. I'm guessing there is a better way to track loops, # though. - if not config.force and pflags.getRecentFlag(s): - pflags.setRecentFlag(s) - pflags.save() - print "... flag is set or it has already run recently. Skipping %s" % node - return True + #if not config.force and pflags.getRecentFlag(s): + # pflags.setRecentFlag(s) + # pflags.save() + # print "... flag is set or it has already run recently. Skipping %s" % node + # return True sequences = {} @@ -539,6 +551,9 @@ def reboot(hostname, config=None, forced_action=None): for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", "bminit-cfg-auth-getplc-update-debug-done", @@ -546,6 +561,7 @@ def reboot(hostname, config=None, forced_action=None): "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", "bminit-cfg-auth-protoerror-exception-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", "bminit-cfg-auth-getplc-implementerror-update-debug-done", ]: sequences.update({n : "restart_bootmanager_boot"}) @@ -564,7 +580,13 @@ def reboot(hostname, config=None, forced_action=None): "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", + # actual solution appears to involve removing the bad files, and + # continually trying to boot the node. + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", ]: sequences.update({n : "restart_bootmanager_rins"}) @@ -593,16 +615,20 @@ def reboot(hostname, config=None, forced_action=None): "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", ]: sequences.update({n: "restart_node_boot"}) # update_node_config_email for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", ]: sequences.update({n : "update_node_config_email"}) - for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]: + for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", + "bminit-cfg-update-exception-nodehostname-update-debug-done", + ]: sequences.update({n : "nodenetwork_email"}) # update_bootcd_email @@ -625,6 +651,13 @@ def reboot(hostname, config=None, forced_action=None): # broken_hardware_email sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) + # bad_dns_email + for n in [ + "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + ]: + sequences.update( { n : "bad_dns_email"}) + flag_set = True @@ -639,7 +672,7 @@ def reboot(hostname, config=None, forced_action=None): m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args, mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages') m.reset() - m.send(['monitor-list@lists.planet-lab.org']) + m.send([config.cc_email]) conn.restart_bootmanager('boot') @@ -677,7 +710,7 @@ def reboot(hostname, config=None, forced_action=None): m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args, mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages') m.reset() - m.send(['monitor-list@lists.planet-lab.org']) + m.send([config.cc_email]) conn.restart_bootmanager('boot') @@ -688,9 +721,10 @@ def reboot(hostname, config=None, forced_action=None): m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, True, db='nodeid_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) conn.dump_plconf_file() - conn.set_nodestate('diag') + conn.set_nodestate('disable') elif sequences[s] == "nodenetwork_email": print "...Sending message to LOOK AT NODE NETWORK" @@ -700,9 +734,10 @@ def reboot(hostname, config=None, forced_action=None): m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args, True, db='nodenet_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) conn.dump_plconf_file() - conn.set_nodestate('diag') + conn.set_nodestate('disable') elif sequences[s] == "update_bootcd_email": print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" @@ -715,7 +750,8 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) print "\tDisabling %s due to out-of-date BOOTCD" % hostname conn.set_nodestate('disable') @@ -733,7 +769,8 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) conn.set_nodestate('disable') elif sequences[s] == "update_hardware_email": @@ -745,7 +782,32 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + emails = plc.getTechEmails(loginbase) + m.send(emails) + conn.set_nodestate('disable') + + elif sequences[s] == "bad_dns_email": + print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname + args = {} + try: + node = api.GetNodes(hostname)[0] + net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] + except: + print traceback.print_exc() + # TODO: api error. skip email, b/c all info is not available, + # flag_set will not be recorded. + return False + nodenet_str = network_config_to_str(net) + + args['hostname'] = hostname + args['network_config'] = nodenet_str + args['nodenetwork_id'] = net['nodenetwork_id'] + m = PersistMessage(hostname, mailtxt.baddns[0] % args, + mailtxt.baddns[1] % args, True, db='baddns_persistmessages') + + loginbase = plc.siteId(hostname) + emails = plc.getTechEmails(loginbase) + m.send(emails) conn.set_nodestate('disable') if flag_set: @@ -758,10 +820,11 @@ def reboot(hostname, config=None, forced_action=None): # MAIN ------------------------------------------------------------------- def main(): - from config import config - from optparse import OptionParser - parser = OptionParser() - parser.set_defaults(node=None, nodelist=None, child=False, collect=False, nosetup=False, verbose=False, force=None, quiet=False) + from monitor import parser as parsermodule + parser = parsermodule.getParser() + + parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, + force=None, quiet=False) parser.add_option("", "--child", dest="child", action="store_true", help="This is the child mode of this process.") parser.add_option("", "--force", dest="force", metavar="boot_state", @@ -770,16 +833,15 @@ def main(): help="Extra quiet output messages.") parser.add_option("", "--verbose", dest="verbose", action="store_true", help="Extra debug output messages.") + parser.add_option("", "--nonet", dest="nonet", action="store_true", + help="Do not setup the network, use existing log files to re-run a test pass.") parser.add_option("", "--collect", dest="collect", action="store_true", help="No action, just collect dmesg, and bm.log") parser.add_option("", "--nosetup", dest="nosetup", action="store_true", help="Do not perform the orginary setup phase.") - parser.add_option("", "--node", dest="node", metavar="nodename.edu", - help="A single node name to try to bring out of debug mode.") - parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", - help="A list of nodes to bring out of debug mode.") - config = config(parser) - config.parse_args() + + parser = parsermodule.getParser(['nodesets', 'defaults'], parser) + config = parsermodule.parse_args(parser) if config.nodelist: nodes = config.getListFromFile(config.nodelist)