X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=bootman.py;h=981a9118bf4485037e745e595a88ec8623ca709c;hb=da913fbd1629fc4669b186915df8ff3a340482d3;hp=d34e6ef0d40b778445e1d06f7ff3f281ef8825f9;hpb=46b482b101e87c6e01c954ae8085ad9785648ef0;p=monitor.git diff --git a/bootman.py b/bootman.py index d34e6ef..981a911 100755 --- a/bootman.py +++ b/bootman.py @@ -7,7 +7,7 @@ api = plc.getAuthAPI() import sys import os -import policy +import const from getsshkeys import SSHKnownHosts @@ -20,11 +20,11 @@ from sets import Set import ssh.pxssh as pxssh import ssh.fdpexpect as fdpexpect import ssh.pexpect as pexpect -from unified_model import * +from monitor.model import * from emailTxt import mailtxt from nodeconfig import network_config_to_str import traceback -import monitorconfig +import config import signal class Sopen(subprocess.Popen): @@ -34,11 +34,7 @@ class Sopen(subprocess.Popen): #from Rpyc import SocketConnection, Async from Rpyc import SocketConnection, Async from Rpyc.Utils import * - -def get_fbnode(node): - fb = database.dbLoad("findbad") - fbnode = fb['nodes'][node]['values'] - return fbnode +fb = None class NodeConnection: def __init__(self, connection, node, config): @@ -204,7 +200,7 @@ class PlanetLabSession: args['port'] = self.port args['user'] = 'root' args['hostname'] = self.node - args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT + args['monitordir'] = config.MONITOR_SCRIPT_ROOT ssh_port = 22 if self.nosetup: @@ -311,7 +307,7 @@ def reboot(hostname, config=None, forced_action=None): # NOTE: Nothing works if the bootcd is REALLY old. # So, this is the first step. - fbnode = get_fbnode(hostname) + fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() if fbnode['category'] == "OLDBOOTCD": print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" args = {} @@ -321,7 +317,7 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) print "\tDisabling %s due to out-of-date BOOTCD" % hostname api.UpdateNode(hostname, {'boot_state' : 'disable'}) @@ -359,7 +355,6 @@ def reboot(hostname, config=None, forced_action=None): except: print traceback.print_exc() return False - if forced_action == "reboot": conn.restart_node('rins') @@ -453,7 +448,7 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) conn.set_nodestate('disable') return False @@ -503,6 +498,7 @@ def reboot(hostname, config=None, forced_action=None): ('nodehostname' , 'Configured node hostname does not resolve'), ('implementerror', 'Implementation Error'), ('readonlyfs' , '[Errno 30] Read-only file system'), + ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"), ('noinstall' , 'notinstalled'), ('bziperror' , 'bzip2: Data integrity error when decompressing.'), ('noblockdev' , "No block devices detected."), @@ -512,6 +508,7 @@ def reboot(hostname, config=None, forced_action=None): ('hardwarerequirefail' , 'Hardware requirements not met'), ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'), ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"), + ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"), ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'), ('modulefail' , 'Unable to get list of system modules'), ('writeerror' , 'write error: No space left on device'), @@ -539,11 +536,11 @@ def reboot(hostname, config=None, forced_action=None): # By using the sequence identifier, we guarantee that there will be no # frequent loops. I'm guessing there is a better way to track loops, # though. - if not config.force and pflags.getRecentFlag(s): - pflags.setRecentFlag(s) - pflags.save() - print "... flag is set or it has already run recently. Skipping %s" % node - return True + #if not config.force and pflags.getRecentFlag(s): + # pflags.setRecentFlag(s) + # pflags.save() + # print "... flag is set or it has already run recently. Skipping %s" % node + # return True sequences = {} @@ -581,7 +578,13 @@ def reboot(hostname, config=None, forced_action=None): "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", + # actual solution appears to involve removing the bad files, and + # continually trying to boot the node. + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", ]: sequences.update({n : "restart_bootmanager_rins"}) @@ -615,11 +618,14 @@ def reboot(hostname, config=None, forced_action=None): # update_node_config_email for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", ]: sequences.update({n : "update_node_config_email"}) - for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]: + for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", + "bminit-cfg-update-exception-nodehostname-update-debug-done", + ]: sequences.update({n : "nodenetwork_email"}) # update_bootcd_email @@ -643,7 +649,11 @@ def reboot(hostname, config=None, forced_action=None): sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) # bad_dns_email - sequences.update({"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done" : "bad_dns_email"}) + for n in [ + "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + ]: + sequences.update( { n : "bad_dns_email"}) flag_set = True @@ -708,7 +718,7 @@ def reboot(hostname, config=None, forced_action=None): m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, True, db='nodeid_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) conn.dump_plconf_file() conn.set_nodestate('disable') @@ -720,7 +730,7 @@ def reboot(hostname, config=None, forced_action=None): m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args, True, db='nodenet_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) conn.dump_plconf_file() conn.set_nodestate('disable') @@ -735,7 +745,7 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) print "\tDisabling %s due to out-of-date BOOTCD" % hostname conn.set_nodestate('disable') @@ -753,7 +763,7 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) conn.set_nodestate('disable') elif sequences[s] == "update_hardware_email": @@ -765,7 +775,7 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) conn.set_nodestate('disable') elif sequences[s] == "bad_dns_email": @@ -788,7 +798,7 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.baddns[1] % args, True, db='baddns_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) conn.set_nodestate('disable') if flag_set: