X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=bootman.py;h=a43a95bf0ffb112283a2acdf2c57ad462fe9a864;hb=66c4742c05622d6c53368e2890670eaefa5345f3;hp=0cd88ecfcacddaafe626167a2920e14ee3aaccb0;hpb=df0d79b63101a2fc33cacd8c0457a5e0d1913279;p=monitor.git diff --git a/bootman.py b/bootman.py index 0cd88ec..a43a95b 100755 --- a/bootman.py +++ b/bootman.py @@ -2,40 +2,44 @@ # Attempt to reboot a node in debug state. -from monitor import const -from monitor.database.info.model import * -from monitor.wrapper import plc -api = plc.getAuthAPI() -import sys + import os +import sys +import time +import random +import signal +import traceback +import subprocess +from sets import Set from getsshkeys import SSHKnownHosts -import subprocess -import time -from pcucontrol.util import command as moncommands -from sets import Set +from Rpyc import SocketConnection, Async +from Rpyc.Utils import * +import getconf +from monitor import config +from monitor import const +from monitor.model import * +from monitor.common import email_exception, found_within +from monitor.database.info.model import * +from monitor.wrapper import plc +from monitor.wrapper.emailTxt import mailtxt + +from pcucontrol.util import command as moncommands +from pcucontrol.util.command import Sopen from pcucontrol.transports.ssh import pxssh as pxssh from pcucontrol.transports.ssh import fdpexpect as fdpexpect from pcucontrol.transports.ssh import pexpect as pexpect -from monitor.model import * -from monitor.wrapper.emailTxt import mailtxt + from nodeconfig import network_config_to_str -import traceback -from monitor import config -import signal -class Sopen(subprocess.Popen): - def kill(self, signal = signal.SIGTERM): - os.kill(self.pid, signal) -#from Rpyc import SocketConnection, Async -from Rpyc import SocketConnection, Async -from Rpyc.Utils import * +api = plc.getAuthAPI() fb = None + class NodeConnection: def __init__(self, connection, node, config): self.node = node @@ -43,12 +47,20 @@ class NodeConnection: self.config = config def get_boot_state(self): - if self.c.modules.os.path.exists('/tmp/source'): - return "dbg" - elif self.c.modules.os.path.exists('/vservers'): - return "boot" - else: - return "unknown" + try: + if self.c.modules.os.path.exists('/tmp/source'): + return "debug" + elif self.c.modules.os.path.exists('/vservers'): + return "boot" + else: + return "unknown" + except EOFError: + traceback.print_exc() + print self.c.modules.sys.path + except: + traceback.print_exc() + + return "unknown" def get_dmesg(self): self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log") @@ -177,7 +189,6 @@ class NodeConnection: return -import random class PlanetLabSession: globalport = 22000 + int(random.random()*1000) @@ -190,7 +201,14 @@ class PlanetLabSession: self.setup_host() def get_connection(self, config): - return NodeConnection(SocketConnection("localhost", self.port), self.node, config) + conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config) + #i = 0 + #while i < 3: + # print i, conn.c.modules.sys.path + # print conn.c.modules.os.path.exists('/tmp/source') + # i+=1 + # time.sleep(1) + return conn def setup_host(self): self.port = PlanetLabSession.globalport @@ -210,6 +228,7 @@ class PlanetLabSession: # COPY Rpyc files to host cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args if self.verbose: print cmd + print cmd # TODO: Add timeout timeout = 120 localos = moncommands.CMD() @@ -253,6 +272,7 @@ EOF""") #cmd = cmd % args #if self.verbose: print cmd #print localos.system(cmd,timeout) + print "setup rpyc server over ssh" print ssh.ret # TODO: Add timeout @@ -265,6 +285,7 @@ EOF""") """%(user)s@%(hostname)s""" cmd = cmd % args if self.verbose: print cmd + print cmd self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE) # TODO: the read() here may block indefinitely. Need a better # approach therefore, that includes a timeout. @@ -288,14 +309,12 @@ EOF""") def __del__(self): if self.command: if self.verbose: print "Killing SSH session %s" % self.port + print "Killing SSH session %s" % self.port self.command.kill() - -def steps_to_list(steps): - ret_list = [] - for (id,label) in steps: - ret_list.append(label) - return ret_list + +def steps_to_list(steps, index=1): + return map(lambda x: x[index], steps) def index_to_id(steps,index): if index < len(steps): @@ -303,101 +322,176 @@ def index_to_id(steps,index): else: return "done" -def reboot(hostname, config=None, forced_action=None): +class DebugInterface: + def __init__(self, hostname): + self.hostname = hostname + self.session = None - # NOTE: Nothing works if the bootcd is REALLY old. - # So, this is the first step. - fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() - print fbnode.keys() - if fbnode['observed_category'] == "OLDBOOTCD": - print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" - args = {} - args['hostname_list'] = " %s" % hostname - - m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname, - mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - - print "\tDisabling %s due to out-of-date BOOTCD" % hostname - api.UpdateNode(hostname, {'boot_state' : 'disable'}) - return True - - node = hostname - print "Creating session for %s" % node - # update known_hosts file (in case the node has rebooted since last run) - if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node - try: - k = SSHKnownHosts(); k.update(node); k.write(); del k - except: - from monitor.common import email_exception - email_exception() - print traceback.print_exc() - return False - - try: - if config == None: - session = PlanetLabSession(node, False, True) - else: - session = PlanetLabSession(node, config.nosetup, config.verbose) - except Exception, e: - msg = "ERROR setting up session for %s" % hostname - print msg - print traceback.print_exc() - from monitor.common import email_exception - email_exception(msg) - print e - return False - - try: - conn = session.get_connection(config) - except EOFError: - # NOTE: sometimes the wait in setup_host() is not long enough. - # So, here we try to wait a little longer before giving up entirely. + def getConnection(self): + print "Creating session for %s" % self.hostname + # update known_hosts file (in case the node has rebooted since last run) try: - time.sleep(session.timeout*4) - conn = session.get_connection(config) + k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k except: - print traceback.print_exc() - from monitor.common import email_exception email_exception() + print traceback.print_exc() return False - if forced_action == "reboot": - conn.restart_node('rins') - return True + try: + if config == None: + self.session = PlanetLabSession(self.hostname, False, True) + else: + self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose) + except Exception, e: + msg = "ERROR setting up session for %s" % self.hostname + print msg + traceback.print_exc() + email_exception(msg) + return False - boot_state = conn.get_boot_state() - if boot_state == "boot": - print "...Boot state of %s already completed : skipping..." % node - return True - elif boot_state == "unknown": - print "...Unknown bootstate for %s : skipping..."% node - return False - else: - pass + try: + conn = self.session.get_connection(config) + except EOFError: + # NOTE: sometimes the wait in setup_host() is not long enough. + # So, here we try to wait a little longer before giving up entirely. + try: + time.sleep(self.session.timeout*5) + conn = self.session.get_connection(config) + except: + traceback.print_exc() + email_exception(self.hostname) + return False + #print "trying to use conn before returning it." + #print conn.c.modules.sys.path + #print conn.c.modules.os.path.exists('/tmp/source') + #time.sleep(1) - if conn.bootmanager_running(): - print "...BootManager is currently running. Skipping host %s" % node - return True + #print "conn: %s" % conn + return conn - #if config != None: - # if config.force: - # conn.restart_bootmanager(config.force) - # return True + def getSequences(self): - # Read persistent flags, tagged on one week intervals. - pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags') + # TODO: This can be replaced with a DB definition at a future time. + # This would make it possible for an admin to introduce new + # patterns without touching code. + sequences = {} + # restart_bootmanager_boot + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", + "bminit-cfg-auth-protoerror-exception-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-implementerror-update-debug-done", + ]: + sequences.update({n : "restart_bootmanager_boot"}) + + # conn.restart_bootmanager('rins') + for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", + # actual solution appears to involve removing the bad files, and + # continually trying to boot the node. + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + ]: + sequences.update({n : "restart_bootmanager_rins"}) + + # repair_node_keys + sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) + + # conn.restart_node('rins') + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + ]: + sequences.update({n : "restart_node_rins"}) + + # restart_node_boot + for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", + ]: + sequences.update({n: "restart_node_boot"}) + + # update_node_config_email + for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", + ]: + sequences.update({n : "update_node_config_email"}) + + for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", + "bminit-cfg-update-exception-nodehostname-update-debug-done", + ]: + sequences.update({n : "nodenetwork_email"}) + + # update_bootcd_email + for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", + ]: + sequences.update({n : "update_bootcd_email"}) + + for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + ]: + sequences.update({n: "suspect_error_email"}) + + # update_hardware_email + sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + + # broken_hardware_email + sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) + + # bad_dns_email + for n in [ + "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + ]: + sequences.update( { n : "bad_dns_email"}) - if config and not config.quiet: print "...downloading dmesg from %s" % node - dmesg = conn.get_dmesg() - child = fdpexpect.fdspawn(dmesg) + return sequences - sequence = [] - while True: + def getDiskSteps(self): steps = [ ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'), ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'), @@ -433,51 +527,19 @@ def reboot(hostname, config=None, forced_action=None): # SCSI error : <0 2 0 0> return code = 0x40001 # end_request: I/O error, dev sda, sector 572489600 ] - id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) - sequence.append(id) - - if id == "done": - break - - s = Set(sequence) - if config and not config.quiet: print "\tSET: ", s + return steps - if len(s) > 1: - print "...Potential drive errors on %s" % node - if len(s) == 2 and 'floppyerror' in s: - print "...Should investigate. Continuing with node." - else: - print "...Should investigate. Skipping node." - # TODO: send message related to these errors. - args = {} - args['hostname'] = hostname - args['log'] = conn.get_dmesg().read() + def getDiskSequence(self, steps, child): + sequence = [] + while True: + id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) + sequence.append(id) - m = PersistMessage(hostname, mailtxt.baddisk[0] % args, - mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') - return False - - print "...Downloading bm.log from %s" % node - log = conn.get_bootmanager_log() - child = fdpexpect.fdspawn(log) - - try: - if config.collect: return True - except: - pass - - time.sleep(1) - - if config and not config.quiet: print "...Scanning bm.log for errors" - action_id = "dbg" - sequence = [] - while True: + if id == "done": + break + return sequence + def getBootManagerStepPatterns(self): steps = [ ('bminit' , 'Initializing the BootManager.'), ('cfg' , 'Reading node configuration file.'), @@ -528,147 +590,118 @@ def reboot(hostname, config=None, forced_action=None): ('bootcheckfail' , 'BootCheckAuthentication'), ('bootupdatefail' , 'BootUpdateNode'), ] - list = steps_to_list(steps) - index = child.expect( list + [ pexpect.EOF ]) - id = index_to_id(steps,index) - sequence.append(id) - - if id == "exception": - if config and not config.quiet: print "...Found An Exception!!!" - elif index == len(list): - #print "Reached EOF" - break + return steps + + def getBootManagerSequenceFromLog(self, steps, child): + sequence = [] + while True: + + index = child.expect( steps_to_list(steps) + [ pexpect.EOF ]) + id = index_to_id(steps,index) + sequence.append(id) + + if id == "exception": + print "...Found An Exception!!!" + elif id == "done": #index == len(steps_to_list(steps)): + #print "Reached EOF" + break + + return sequence - s = "-".join(sequence) - print " FOUND SEQUENCE: ", s - # NOTE: We get or set the flag based on the current sequence identifier. - # By using the sequence identifier, we guarantee that there will be no - # frequent loops. I'm guessing there is a better way to track loops, - # though. - #if not config.force and pflags.getRecentFlag(s): - # pflags.setRecentFlag(s) - # pflags.save() - # print "... flag is set or it has already run recently. Skipping %s" % node +def restore(sitehist, hostname, config=None, forced_action=None): + + # NOTE: Nothing works if the bootcd is REALLY old. + # So, this is the first step. + + fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() + recent_actions = sitehist.getRecentActions(hostname=hostname) + + if fbnode['observed_category'] == "OLDBOOTCD": + print "\t...Notify owner to update BootImage!!!" + + if not found_within(recent_actions, 'newbootcd_notice', 3): + sitehist.sendMessage('newbootcd_notice', hostname=hostname) + + print "\tDisabling %s due to out-of-date BootImage" % hostname + api.UpdateNode(hostname, {'boot_state' : 'disable'}) + + # NOTE: nothing else is possible. + return True + + debugnode = DebugInterface(hostname) + conn = debugnode.getConnection() + #print "conn: %s" % conn + #print "trying to use conn after returning it." + #print conn.c.modules.sys.path + #print conn.c.modules.os.path.exists('/tmp/source') + if type(conn) == type(False): return False + + #if forced_action == "reboot": + # conn.restart_node('rins') # return True - sequences = {} + boot_state = conn.get_boot_state() + if boot_state != "debug": + print "... %s in %s state: skipping..." % (hostname , boot_state) + return boot_state == "boot" + if conn.bootmanager_running(): + print "...BootManager is currently running. Skipping host %s" %hostname + return True - # restart_bootmanager_boot - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + # Read persistent flags, tagged on one week intervals. + #pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags') - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + if config and not config.quiet: print "...downloading dmesg from %s" %hostname + dmesg = conn.get_dmesg() + child = fdpexpect.fdspawn(dmesg) - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-debug-done", - "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", - "bminit-cfg-auth-protoerror-exception-update-debug-done", - "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-implementerror-update-debug-done", - ]: - sequences.update({n : "restart_bootmanager_boot"}) - - # conn.restart_bootmanager('rins') - for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", - # actual solution appears to involve removing the bad files, and - # continually trying to boot the node. - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", - ]: - sequences.update({n : "restart_bootmanager_rins"}) - - # repair_node_keys - sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) - - # conn.restart_node('rins') - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - ]: - sequences.update({n : "restart_node_rins"}) - - # restart_node_boot - for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", - ]: - sequences.update({n: "restart_node_boot"}) - - # update_node_config_email - for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", - ]: - sequences.update({n : "update_node_config_email"}) + steps = debugnode.getDiskSteps() + sequence = debugnode.getDiskSequence(steps, child) - for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", - "bminit-cfg-update-exception-nodehostname-update-debug-done", - ]: - sequences.update({n : "nodenetwork_email"}) - - # update_bootcd_email - for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", - ]: - sequences.update({n : "update_bootcd_email"}) + s = Set(sequence) + if config and not config.quiet: print "\tSET: ", s - for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - ]: - sequences.update({n: "suspect_error_email"}) + if len(s) > 1: + print "...Potential drive errors on %s" % hostname + if len(s) == 2 and 'floppyerror' in s: + print "...Should investigate. Continuing with node." + else: + print "...Should investigate. Skipping node." + # TODO: send message related to these errors. - # update_hardware_email - sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) - sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + if not found_within(recent_actions, 'newbootcd_notice', 3): - # broken_hardware_email - sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) + log=conn.get_dmesg().read() + sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log) + conn.set_nodestate('disable') - # bad_dns_email - for n in [ - "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - ]: - sequences.update( { n : "bad_dns_email"}) + return False - flag_set = True + print "...Downloading bm.log from %s" %hostname + log = conn.get_bootmanager_log() + child = fdpexpect.fdspawn(log) + + if hasattr(config, 'collect') and config.collect: return True + + if config and not config.quiet: print "...Scanning bm.log for errors" + + time.sleep(1) + steps = debugnode.getBootManagerStepPatterns() + sequence = debugnode.getBootManagerSequenceFromLog(steps, child) + + s = "-".join(sequence) + print " FOUND SEQUENCE: ", s + + # NOTE: We get or set the flag based on the current sequence identifier. + # By using the sequence identifier, we guarantee that there will be no + # frequent loops. I'm guessing there is a better way to track loops, + # though. + + sequences = debugnode.getSequences() + flag_set = True if s not in sequences: print " HOST %s" % hostname @@ -678,10 +711,9 @@ def reboot(hostname, config=None, forced_action=None): args['hostname'] = hostname args['sequence'] = s args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args, - mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages') - m.reset() - m.send([config.cc_email]) + args['viart'] = False + + sitehist.sendMessage('unknownsequence_notice', **args) conn.restart_bootmanager('boot') @@ -692,10 +724,10 @@ def reboot(hostname, config=None, forced_action=None): else: if sequences[s] == "restart_bootmanager_boot": - if config and not config.quiet: print "...Restarting BootManager.py on %s "% node + print "...Restarting BootManager.py on %s "%hostname conn.restart_bootmanager('boot') elif sequences[s] == "restart_bootmanager_rins": - if config and not config.quiet: print "...Restarting BootManager.py on %s "% node + print "...Restarting BootManager.py on %s "%hostname conn.restart_bootmanager('rins') elif sequences[s] == "restart_node_rins": conn.restart_node('rins') @@ -709,121 +741,89 @@ def reboot(hostname, config=None, forced_action=None): pass else: # there was some failure to synchronize the keys. - print "...Unable to repair node keys on %s" % node + print "...Unable to repair node keys on %s" %hostname elif sequences[s] == "suspect_error_email": args = {} args['hostname'] = hostname args['sequence'] = s args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args, - mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages') - m.reset() - m.send([config.cc_email]) + args['viart'] = False + sitehist.sendMessage('unknownsequence_notice', **args) conn.restart_bootmanager('boot') + # TODO: differentiate this and the 'nodenetwork_email' actions. elif sequences[s] == "update_node_config_email": - print "...Sending message to UPDATE NODE CONFIG" - args = {} - args['hostname'] = hostname - m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, - True, db='nodeid_persistmessages') - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.dump_plconf_file() - conn.set_nodestate('disable') + + if not found_within(recent_actions, 'nodeconfig_notice', 3): + args = {} + args['hostname'] = hostname + sitehist.sendMessage('nodeconfig_notice', **args) + conn.dump_plconf_file() elif sequences[s] == "nodenetwork_email": - print "...Sending message to LOOK AT NODE NETWORK" - args = {} - args['hostname'] = hostname - args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, - True, db='nodenet_persistmessages') - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.dump_plconf_file() - conn.set_nodestate('disable') - elif sequences[s] == "update_bootcd_email": - print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" - import getconf - args = {} - args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user: - args['hostname_list'] = "%s" % hostname + if not found_within(recent_actions, 'nodeconfig_notice', 3): + args = {} + args['hostname'] = hostname + args['bmlog'] = conn.get_bootmanager_log().read() + sitehist.sendMessage('nodeconfig_notice', **args) + conn.dump_plconf_file() - m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname, - mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages') + elif sequences[s] == "update_bootcd_email": - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) + if not found_within(recent_actions, 'newalphacd_notice', 3): + args = {} + args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user: + args['hostname'] = hostname + + sitehist.sendMessage('newalphacd_notice', **args) - print "\tDisabling %s due to out-of-date BOOTCD" % hostname - conn.set_nodestate('disable') + print "\tDisabling %s due to out-of-date BOOTCD" % hostname elif sequences[s] == "broken_hardware_email": # MAKE An ACTION record that this host has failed hardware. May # require either an exception "/minhw" or other manual intervention. # Definitely need to send out some more EMAIL. - print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname # TODO: email notice of broken hardware - args = {} - args['hostname'] = hostname - args['log'] = conn.get_dmesg().read() - m = PersistMessage(hostname, mailtxt.baddisk[0] % args, - mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') + if not found_within(recent_actions, 'baddisk_notice', 1): + print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname + args = {} + args['hostname'] = hostname + args['log'] = conn.get_dmesg().read() - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') + sitehist.sendMessage('baddisk_notice', **args) + conn.set_nodestate('disable') elif sequences[s] == "update_hardware_email": - print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname - args = {} - args['hostname'] = hostname - args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args, - mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') + if not found_within(recent_actions, 'minimalhardware_notice', 1): + print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname + args = {} + args['hostname'] = hostname + args['bmlog'] = conn.get_bootmanager_log().read() + sitehist.sendMessage('minimalhardware_notice', **args) elif sequences[s] == "bad_dns_email": - print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname - args = {} - try: - node = api.GetNodes(hostname)[0] - net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] - except: - from monitor.common import email_exception - email_exception() - print traceback.print_exc() - # TODO: api error. skip email, b/c all info is not available, - # flag_set will not be recorded. - return False - nodenet_str = network_config_to_str(net) + if not found_within(recent_actions, 'baddns_notice', 1): + print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname + args = {} + try: + node = api.GetNodes(hostname)[0] + net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] + except: + email_exception() + print traceback.print_exc() + # TODO: api error. skip email, b/c all info is not available, + # flag_set will not be recorded. + return False + nodenet_str = network_config_to_str(net) - args['hostname'] = hostname - args['network_config'] = nodenet_str - args['nodenetwork_id'] = net['nodenetwork_id'] - m = PersistMessage(hostname, mailtxt.baddns[0] % args, - mailtxt.baddns[1] % args, True, db='baddns_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') - - if flag_set: - pflags.setRecentFlag(s) - pflags.save() + args['hostname'] = hostname + args['network_config'] = nodenet_str + args['nodenetwork_id'] = net['nodenetwork_id'] + + sitehist.sendMessage('baddns_notice', **args) return True