From 8b0bb120a071aa27951eb0bf9435aadd9e1df9ed Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Mon, 19 May 2008 18:36:27 +0000 Subject: [PATCH] adding files --- getnodes.py | 37 ++++ nodecommon.py | 111 +++++++++++ nodehistory.py | 136 +++++++++++++ nodequery.py | 88 +++++++++ nodereboot.py | 528 +++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 900 insertions(+) create mode 100755 getnodes.py create mode 100644 nodecommon.py create mode 100755 nodehistory.py create mode 100755 nodequery.py create mode 100755 nodereboot.py diff --git a/getnodes.py b/getnodes.py new file mode 100755 index 0000000..60dad7c --- /dev/null +++ b/getnodes.py @@ -0,0 +1,37 @@ +#!/usr/bin/python + +import soltesz +import plc +from optparse import OptionParser +import sys +from reboot import pcu_name, get_pcu_values + +import sys +from config import config + +parser = OptionParser() +parser.set_defaults(withpcu=False, + refresh=False) +parser.add_option("", "--refresh", action="store_true", dest="refresh", + help="Refresh the cached values") + + +config = config(parser) +config.parse_args() + +if not config.run: + k = config.__dict__.keys() + k.sort() + for o in k: + print o, "=", config.__dict__[o] + print "Add --run to actually perform the command" + sys.exit(1) + +nodelist = soltesz.if_cached_else_refresh(1, + config.refresh, + "l_plcnodes", + lambda : plc.getNodes({'peer_id':None}, ['hostname'])) +nodes = [n['hostname'] for n in nodelist] + +for nodename in nodes: + print nodename diff --git a/nodecommon.py b/nodecommon.py new file mode 100644 index 0000000..db46e40 --- /dev/null +++ b/nodecommon.py @@ -0,0 +1,111 @@ + +import struct +import reboot +esc = struct.pack('i', 27) +RED = esc + "[1;31m" +GREEN = esc + "[1;32m" +YELLOW = esc + "[1;33m" +BLUE = esc + "[1;34m" +NORMAL = esc + "[0;39m" + +def red(str): + return RED + str + NORMAL + +def yellow(str): + return YELLOW + str + NORMAL + +def green(str): + return GREEN + str + NORMAL + +def blue(str): + return BLUE + str + NORMAL + +def get_current_state(fbnode): + if 'state' in fbnode: + state = fbnode['state'] + else: + state = "none" + l = state.lower() + if l == "debug": l = 'dbg ' + return l + +def color_pcu_state(fbnode): + import plc + + if 'plcnode' in fbnode and 'pcu_ids' in fbnode['plcnode'] and len(fbnode['plcnode']['pcu_ids']) > 0 : + values = reboot.get_pcu_values(fbnode['plcnode']['pcu_ids'][0]) + if values == None: + return fbnode['pcu'] + else: + return fbnode['pcu'] + + if 'reboot' in values: + rb = values['reboot'] + if rb == 0 or rb == "0": + return fbnode['pcu'] + "OK " + #return green(fbnode['pcu']) + elif "NetDown" == rb or "Not_Run" == rb: + return fbnode['pcu'] + "DOWN" + #return yellow(fbnode['pcu']) + else: + return fbnode['pcu'] + "BAD " + #return red(fbnode['pcu']) + else: + #return red(fbnode['pcu']) + return fbnode['pcu'] + "BAD " + +def color_boot_state(l): + if l == "dbg": return yellow("dbg ") + elif l == "dbg ": return yellow(l) + elif l == "down": return red(l) + elif l == "boot": return green(l) + elif l == "rins": return blue(l) + else: + return l + +def nodegroup_display(node, fb): + if node['hostname'] in fb['nodes']: + node['current'] = get_current_state(fb['nodes'][node['hostname']]['values']) + else: + node['current'] = 'none' + + if fb['nodes'][node['hostname']]['values'] == []: + return "" + + s = fb['nodes'][node['hostname']]['values']['kernel'].split() + if len(s) >=3: + node['kernel'] = s[2] + else: + node['kernel'] = fb['nodes'][node['hostname']]['values']['kernel'] + + if '2.6' not in node['kernel']: node['kernel'] = "" + node['boot_state'] = color_boot_state(node['boot_state']) + node['current'] = color_boot_state(node['current']) + #node['boot_state'] = node['boot_state'] + #node['current'] = node['current'] + node['pcu'] = fb['nodes'][node['hostname']]['values']['pcu'] + return "%(hostname)-38s %(boot_state)5s %(current)5s %(pcu)6s %(key)45s %(kernel)s" % node + +from model import * +import soltesz + +def node_end_record(node): + act_all = soltesz.dbLoad("act_all") + if node not in act_all: + del act_all + return False + a = Action(node, act_all[node][0]) + a.delField('rt') + a.delField('found_rt_ticket') + a.delField('second-mail-at-oneweek') + a.delField('second-mail-at-twoweeks') + a.delField('first-found') + rec = a.get() + rec['action'] = ["close_rt"] + rec['category'] = "UNKNOWN" + rec['stage'] = "monitor-end-record" + rec['time'] = time.time() - 7*60*60*24 + act_all[node].insert(0,rec) + soltesz.dbDump("act_all", act_all) + del act_all + return True diff --git a/nodehistory.py b/nodehistory.py new file mode 100755 index 0000000..f40ecc7 --- /dev/null +++ b/nodehistory.py @@ -0,0 +1,136 @@ +#!/usr/bin/python + +import plc +import auth +api = plc.PLC(auth.auth, auth.plc) + +import soltesz +import reboot +import time +from datetime import datetime, timedelta +import calendar + +import time +from model import * +from nodecommon import * + +from config import config +from optparse import OptionParser + +parser = OptionParser() +parser.set_defaults(node=None, fields='state', fromtime=None) +parser.add_option("", "--node", dest="node", metavar="nodename.edu", + help="A single node name to add to the nodegroup") +parser.add_option("", "--fields", dest="fields", metavar="key", + help="Which record field to extract from all files.") +parser.add_option("", "--fromtime", dest="fromtime", metavar="YYYY-MM-DD", + help="Specify a starting date from which to begin the query.") +config = config(parser) +config.parse_args() + +def datetime_fromstr(str): + if '-' in str: + tup = time.strptime(str, "%Y-%m-%d") + elif '/' in str: + tup = time.strptime(str, "%m/%d/%Y") + else: + tup = time.strptime(str, "%m/%d/%Y") + return datetime.fromtimestamp(calendar.timegm(tup)) + +def diff_time(timestamp): + now = time.time() + if timestamp == None: + return "unknown" + diff = now - timestamp + # return the number of seconds as a difference from current time. + t_str = "" + if diff < 60: # sec in min. + t = diff + t_str = "%s sec ago" % t + elif diff < 60*60: # sec in hour + t = diff // (60) + t_str = "%s min ago" % int(t) + elif diff < 60*60*24: # sec in day + t = diff // (60*60) + t_str = "%s hours ago" % int(t) + elif diff < 60*60*24*7: # sec in week + t = diff // (60*60*24) + t_str = "%s days ago" % int(t) + elif diff < 60*60*24*30: # approx sec in month + t = diff // (60*60*24*7) + t_str = "%s weeks ago" % int(t) + elif diff > 60*60*24*30: # approx sec in month + t = diff // (60*60*24*7*30) + t_str = "%s months ago" % int(t) + return t_str + +def fb_print_nodeinfo(fbnode, verbose, date=None): + if verbose: print " state | ssh | pcu | bootcd | category | kernel" + if 'checked' in fbnode: + print "%11.11s " % diff_time(fbnode['checked']), + else: + if date: print date, + else: print "Unknown", + + if fbnode['bootcd']: + fbnode['bootcd'] = fbnode['bootcd'].split()[-1] + else: + fbnode['bootcd'] = "unknown" + fbnode['state'] = color_boot_state(get_current_state(fbnode)) + fbnode['kernel'] = fbnode['kernel'].split()[2] + print " %(state)5s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode + +def pcu_print_info(pcuinfo, hostname): + print " Checked: ", + if 'checked' in pcuinfo: + print "%11.11s " % diff_time(pcuinfo['checked']) + else: + print "Unknown" + + print "\t user | password | port | hostname " + print "\t %17s | %17s | %4s | %30s | %s" % \ + (pcuinfo['username'], pcuinfo['password'], + pcuinfo[hostname], reboot.pcu_name(pcuinfo), pcuinfo['model']) + + if 'portstatus' in pcuinfo and pcuinfo['portstatus'] != {}: + if pcuinfo['portstatus']['22'] == "open": + print "\t ssh -o PasswordAuthentication=yes -o PubkeyAuthentication=no %s@%s" % (pcuinfo['username'], reboot.pcu_name(pcuinfo)) + if pcuinfo['portstatus']['23'] == "open": + print "\t telnet %s" % (reboot.pcu_name(pcuinfo)) + if pcuinfo['portstatus']['80'] == "open" or \ + pcuinfo['portstatus']['443'] == "open": + print "\t http://%s" % (reboot.pcu_name(pcuinfo)) + if pcuinfo['portstatus']['443'] == "open": + print "\t racadm.py -r %s -u %s -p '%s'" % (pcuinfo['ip'], pcuinfo['username'], pcuinfo['password']) + print "\t cmdhttps/locfg.pl -s %s -f iloxml/Reset_Server.xml -u %s -p '%s' | grep MESSAGE" % \ + (reboot.pcu_name(pcuinfo), pcuinfo['username'], pcuinfo['password']) + +path = "archive-pdb" +archive = soltesz.SPickle(path) + +if config.fromtime: + begin = config.fromtime +else: + begin = "2007-11-06" + +d = datetime_fromstr(begin) +tdelta = timedelta(1) +verbose = 1 + +while True: + file = "%s.production.findbad" % d.strftime("%Y-%m-%d") + + try: + fb = archive.load(file) + if config.node in fb['nodes']: + fb_nodeinfo = fb['nodes'][config.node]['values'] + fb_print_nodeinfo(fb_nodeinfo, verbose, d.strftime("%Y-%m-%d")) + + del fb + verbose = 0 + except: + print d.strftime("%Y-%m-%d"), "No record" + + d = d + tdelta + if d > datetime.now(): break + diff --git a/nodequery.py b/nodequery.py new file mode 100755 index 0000000..4091d2f --- /dev/null +++ b/nodequery.py @@ -0,0 +1,88 @@ +#!/usr/bin/python + +import plc +import auth +api = plc.PLC(auth.auth, auth.plc) + +import soltesz +fb = soltesz.dbLoad("findbad") +from nodecommon import * + +import time + +from config import config +from optparse import OptionParser +parser = OptionParser() +parser.set_defaults(node=None, category=None, nodelist=None) +parser.add_option("", "--category", dest="category", metavar="category", + help="List all nodes in the given category") +parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", + help="A list of nodes to bring out of debug mode.") +config = config(parser) +config.parse_args() + +def diff_time(timestamp): + now = time.time() + if timestamp == None: + return "unknown" + diff = now - timestamp + # return the number of seconds as a difference from current time. + t_str = "" + if diff < 60: # sec in min. + t = diff + t_str = "%s sec ago" % t + elif diff < 60*60: # sec in hour + t = diff // (60) + t_str = "%s min ago" % int(t) + elif diff < 60*60*24: # sec in day + t = diff // (60*60) + t_str = "%s hours ago" % int(t) + elif diff < 60*60*24*7: # sec in week + t = diff // (60*60*24) + t_str = "%s days ago" % int(t) + elif diff < 60*60*24*30: # approx sec in month + t = diff // (60*60*24*7) + t_str = "%s weeks ago" % int(t) + elif diff > 60*60*24*30: # approx sec in month + t = diff // (60*60*24*7*30) + t_str = "%s months ago" % int(t) + return t_str + + +def fb_print_nodeinfo(fbnode, hostname): + fbnode['hostname'] = hostname + fbnode['checked'] = diff_time(fbnode['checked']) + if fbnode['bootcd']: + fbnode['bootcd'] = fbnode['bootcd'].split()[-1] + else: + fbnode['bootcd'] = "unknown" + fbnode['kernel'] = fbnode['kernel'].split()[2] + fbnode['pcu'] = color_pcu_state(fbnode) + print "%(hostname)-39s | %(checked)11.11s | %(state)10.10s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode + +if config.nodelist: + nodelist = config.getListFromFile(config.nodelist) +else: + nodelist = fb['nodes'].keys() + + +for node in nodelist: + config.node = node + + if node not in fb['nodes']: + continue + + fb_nodeinfo = fb['nodes'][node]['values'] + + if config.category and \ + 'state' in fb_nodeinfo and \ + config.category == fb_nodeinfo['state']: + + fb_print_nodeinfo(fb_nodeinfo, node) + elif config.nodelist and 'state' in fb_nodeinfo: + fb_print_nodeinfo(fb_nodeinfo, node) + else: + pass + + + diff --git a/nodereboot.py b/nodereboot.py new file mode 100755 index 0000000..48e99ee --- /dev/null +++ b/nodereboot.py @@ -0,0 +1,528 @@ +#!/usr/bin/python + +# Attempt to reboot a node in debug state. + + +import plc +import auth +api = plc.PLC(auth.auth, auth.plc) + +import sys +import os + +from getsshkeys import SSHKnownHosts + +import subprocess +import time +import soltesz +from sets import Set + +import ssh.pxssh as pxssh +import ssh.fdpexpect as fdpexpect +import ssh.pexpect as pexpect + + + +import signal +class Sopen(subprocess.Popen): + def kill(self, signal = signal.SIGTERM): + os.kill(self.pid, signal) + +#from Rpyc import SocketConnection, Async +from Rpyc import SocketConnection, Async +from Rpyc.Utils import * + + +class NodeConnection: + def __init__(self, connection, node, config): + self.node = node + self.c = connection + self.config = config + + def get_boot_state(self): + if self.c.modules.os.path.exists('/tmp/source'): + return "dbg" + elif self.c.modules.os.path.exists('/vservers'): + return "boot" + else: + return "unknown" + + def get_dmesg(self): + self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log") + download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node) + log = open("log/dmesg.%s.log" % self.node, 'r') + return log + + def get_bootmanager_log(self): + download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node) + os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node)) + log = open("log/bm.%s.log" % self.node, 'r') + return log + + def dump_plconf_file(self): + c = self.c + c.modules.sys.path.append("/tmp/source/") + c.modules.os.chdir('/tmp/source') + + log = c.modules.BootManager.log('/tmp/new.log') + bm = c.modules.BootManager.BootManager(log,'boot') + + BootManagerException = c.modules.Exceptions.BootManagerException + InitializeBootManager = c.modules.BootManager.InitializeBootManager + ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration + bm_continue = True + + InitializeBootManager.Run(bm.VARS, bm.LOG) + try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG) + except Exception, x: + bm_continue = False + print " ERROR:", x + print " Possibly, unable to find valid configuration file" + + if bm_continue and self.config and not self.config.quiet: + for key in bm.VARS.keys(): + print key, " == ", bm.VARS[key] + else: + if self.config and not self.config.quiet: print " Unable to read Node Configuration" + + + def compare_and_repair_nodekeys(self): + c = self.c + c.modules.sys.path.append("/tmp/source/") + c.modules.os.chdir('/tmp/source') + + log = c.modules.BootManager.log('/tmp/new.log') + bm = c.modules.BootManager.BootManager(log,'boot') + + BootManagerException = c.modules.Exceptions.BootManagerException + InitializeBootManager = c.modules.BootManager.InitializeBootManager + ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration + bm_continue = True + + plcnode = api.GetNodes({'hostname': self.node}, None)[0] + + InitializeBootManager.Run(bm.VARS, bm.LOG) + try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG) + except Exception, x: + bm_continue = False + if not config.quiet: print "exception" + if not config.quiet: print x + print " Possibly, unable to find valid configuration file" + + if bm_continue: + print " NODE: %s" % bm.VARS['NODE_KEY'] + print " PLC : %s" % plcnode['key'] + + if bm.VARS['NODE_KEY'] == plcnode['key']: + return True + else: + if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}): + print " Successfully updated NODE_KEY with PLC" + return True + else: + return False + + #for key in bm.VARS.keys(): + # print key, " == ", bm.VARS[key] + else: + print " Unable to retrieve NODE_KEY" + + def bootmanager_running(self): + if self.c.modules.os.path.exists('/tmp/BM_RUNNING'): + return True + else: + return False + + def restart_node(self, state='boot'): + api.UpdateNode(self.node, {'boot_state' : state}) + + print " Killing all slice processes... : %s" % self.node + cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0" + self.c.modules.os.system(cmd_slicekill) + + cmd = """ shutdown -r +1 & """ + print " Restarting %s : %s" % ( self.node, cmd) + self.c.modules.os.system(cmd) + return + + def restart_bootmanager(self, forceState): + + self.c.modules.os.chdir('/tmp/source') + if self.c.modules.os.path.exists('/tmp/BM_RUNNING'): + print " BootManager is already running: try again soon..." + else: + print " Starting 'BootManager.py %s' on %s " % (forceState, self.node) + cmd = "( touch /tmp/BM_RUNNING ; " + \ + " python ./BootManager.py %s &> server.log < /dev/null ; " + \ + " rm -f /tmp/BM_RUNNING " + \ + ") &" + cmd = cmd % forceState + self.c.modules.os.system(cmd) + + return + + +class PlanetLabSession: + globalport = 22222 + + def __init__(self, node, nosetup, verbose): + self.verbose = verbose + self.node = node + self.port = None + self.nosetup = nosetup + self.command = None + self.setup_host() + + def get_connection(self, config): + return NodeConnection(SocketConnection("localhost", self.port), self.node, config) + + def setup_host(self): + self.port = PlanetLabSession.globalport + PlanetLabSession.globalport = PlanetLabSession.globalport + 1 + + args = {} + args['port'] = self.port + args['user'] = 'root' + args['hostname'] = self.node + args['monitordir'] = "/home/soltesz/monitor" + + if self.nosetup: + print "Skipping setup" + return + + # COPY Rpyc files to host + cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args + if self.verbose: print cmd + ret = os.system(cmd) + if ret != 0: + print "UNKNOWN SSH KEY FOR %s" % self.node + print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node + k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k + ret = os.system(cmd) + if ret != 0: + print "FAILED TWICE" + sys.exit(1) + + #cmd = "rsync -qv -az -e ssh %(monitordir)s/BootManager.py + # %(monitordir)s/ChainBoot.py %(user)s@%(hostname)s:/tmp/source" % args + #print cmd; os.system(cmd) + + # KILL any already running servers. + cmd = """ssh %(user)s@%(hostname)s """ + \ + """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """ + cmd = cmd % args + if self.verbose: print cmd + os.system(cmd) + + # START a new rpyc server. + cmd = """ssh %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \ + """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ + cmd = cmd % args + if self.verbose: print cmd + os.system(cmd) + + # This was tricky to make synchronous. The combination of ssh-clients-4.7p1, + # and the following options seems to work well. + cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \ + """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \ + """-o ConnectTimeout=120 """ + \ + """-n -N -L %(port)s:localhost:18812 """ + \ + """%(user)s@%(hostname)s""" + cmd = cmd % args + if self.verbose: print cmd + self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE) + ret = self.command.stdout.read(5) + if 'READY' in ret: + # We can return without delay. + time.sleep(1) + return + + if self.command.returncode is not None: + print "Failed to establish tunnel!" + raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode)) + + raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'") + + def __del__(self): + if self.command: + if self.verbose: print "Killing SSH session %s" % self.port + self.command.kill() + + +def steps_to_list(steps): + ret_list = [] + for (id,label) in steps: + ret_list.append(label) + return ret_list + +def index_to_id(steps,index): + if index < len(steps): + return steps[index][0] + else: + return "done" + +def reboot(hostname, config=None, forced_action=None): + + node = hostname + print "Creating session for %s" % node + # update known_hosts file (in case the node has rebooted since last run) + if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node + k = SSHKnownHosts(); k.update(node); k.write(); del k + + if config == None: + session = PlanetLabSession(node, False, False) + else: + session = PlanetLabSession(node, config.nosetup, config.verbose) + conn = session.get_connection(config) + + if forced_action == "reboot": + conn.restart_node('rins') + return True + + boot_state = conn.get_boot_state() + if boot_state == "boot": + print "...Boot state of %s already completed : skipping..." % node + return False + elif boot_state == "unknown": + print "...Unknown bootstate for %s : skipping..."% node + return False + else: + pass + + if conn.bootmanager_running(): + print "...BootManager is currently running. Skipping host %s" % node + return False + + if config != None: + if config.force: + conn.restart_bootmanager(config.force) + return True + + if config and not config.quiet: print "...downloading dmesg from %s" % node + dmesg = conn.get_dmesg() + child = fdpexpect.fdspawn(dmesg) + + sequence = [] + while True: + steps = [ + ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'), + ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'), + ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'), + ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'), + ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'), + ('sdXerror' , 'sd\w: Current: sense key: Medium Error'), + ('floppytimeout','floppy0: floppy timeout called'), + ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'), + + # floppy0: floppy timeout called + # end_request: I/O error, dev fd0, sector 0 + + #Buffer I/O error on device dm-2, logical block 8888896 + #ata1: status=0x51 { DriveReady SeekComplete Error } + #ata1: error=0x40 { UncorrectableError } + #SCSI error : <0 0 0 0> return code = 0x8000002 + #sda: Current: sense key: Medium Error + # Additional sense: Unrecovered read error - auto reallocate failed + + #SCSI error : <0 2 0 0> return code = 0x40001 + #end_request: I/O error, dev sda, sector 572489600 + ] + id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) + sequence.append(id) + + if id == "done": + break + + s = Set(sequence) + if config and not config.quiet: print "SET: ", s + + if len(s) > 1: + print "...Potential drive errors on %s" % node + if len(s) == 2 and 'floppyerror' in s: + print "...Should investigate. Continuing with node." + else: + print "...Should investigate. Skipping node." + return False + + print "...Downloading bm.log from %s" % node + log = conn.get_bootmanager_log() + child = fdpexpect.fdspawn(log) + + time.sleep(1) + + if config and not config.quiet: print "...Scanning bm.log for errors" + action_id = "dbg" + sequence = [] + while True: + + steps = [ + ('bminit' , 'Initializing the BootManager.'), + ('cfg' , 'Reading node configuration file.'), + ('auth' , 'Authenticating node with PLC.'), + ('getplc' , 'Retrieving details of node from PLC.'), + ('update' , 'Updating node boot state at PLC.'), + ('hardware' , 'Checking if hardware requirements met.'), + ('installinit' , 'Install: Initializing.'), + ('installdisk' , 'Install: partitioning disks.'), + ('installbootfs', 'Install: bootstrapfs tarball.'), + ('installcfg' , 'Install: Writing configuration files.'), + ('installstop' , 'Install: Shutting down installer.'), + ('update2' , 'Updating node boot state at PLC.'), + ('installinit2' , 'Install: Initializing.'), + ('validate' , 'Validating node installation.'), + ('rebuildinitrd', 'Rebuilding initrd'), + ('netcfg' , 'Install: Writing Network Configuration files.'), + ('update3' , 'Updating node configuration.'), + ('disk' , 'Checking for unused disks to add to LVM.'), + ('update4' , 'Sending hardware configuration to PLC.'), + ('debug' , 'Starting debug mode'), + ('bmexceptmount', 'BootManagerException during mount'), + ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'), + ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'), + ('exception' , 'Exception'), + ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'), + ('protoerror' , 'XML RPC protocol error'), + ('implementerror', 'Implementation Error'), + ('readonlyfs' , '[Errno 30] Read-only file system'), + ('noinstall' , 'notinstalled'), + ('bziperror' , 'bzip2: Data integrity error when decompressing.'), + ('noblockdev' , "No block devices detected."), + ('hardwarefail' , 'Hardware requirements not met'), + ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'), + ('modulefail' , 'Unable to get list of system modules'), + ('writeerror' , 'write error: No space left on device'), + ('nonode' , 'Failed to authenticate call: No such node'), + ('authfail' , 'Failed to authenticate call: Call could not be authenticated'), + ('bootcheckfail' , 'BootCheckAuthentication'), + ('bootupdatefail' , 'BootUpdateNode'), + ] + list = steps_to_list(steps) + index = child.expect( list + [ pexpect.EOF ]) + id = index_to_id(steps,index) + sequence.append(id) + + if id == "exception": + if config and not config.quiet: print "...Found An Exception!!!" + elif index == len(list): + #print "Reached EOF" + break + + s = "-".join(sequence) + print " FOUND SEQUENCE: ", s + + if s == "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done": + if config and not config.quiet: print "...Restarting BootManager.py on %s "% node + conn.restart_bootmanager('boot') + elif s == "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": + if conn.compare_and_repair_nodekeys(): + # the keys either are in sync or were forced in sync. + # so try to reboot the node again. + conn.restart_bootmanager('boot') + else: + # there was some failure to synchronize the keys. + print "...Unable to repair node keys on %s" % node + elif s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done" or \ + s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done": + conn.restart_bootmanager('boot') + elif s == "bminit-cfg-auth-getplc-update-debug-done": + conn.restart_bootmanager('boot') + elif s == "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done" or \ + s == "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done": + conn.restart_bootmanager('rins') + elif s == "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done": + conn.restart_bootmanager('boot') + elif s == "bminit-cfg-auth-protoerror-exception-update-debug-done": + conn.restart_bootmanager('boot') + elif s == "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done" or \ + s == "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done": + # reinstall b/c it is not installed. + conn.restart_bootmanager('rins') + elif s == "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done" or \ + s == "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done": + + conn.restart_bootmanager('rins') + elif s == "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done" or \ + s == "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done": + conn.restart_node('rins') + elif s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done": + conn.restart_node('rins') + elif s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done": + conn.restart_node('rins') + elif s == "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done": + conn.restart_bootmanager('rins') + elif s == "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done": + conn.restart_bootmanager('rins') + elif s == "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done" or \ + s == "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done": + conn.dump_plconf_file() + elif s == "bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarefail-update-debug-done" or \ + s == "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarefail-update-debug-done" or \ + s == "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarefail-update-debug-done": + print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" + pass + + elif s == "bminit-cfg-auth-getplc-update-hardware-exception-hardwarefail-update-debug-done": + # MAKE An ACTION record that this host has failed hardware. May + # require either an exception "/minhw" or other manual intervention. + # Definitely need to send out some more EMAIL. + print "...NOTIFY OWNER OF BROKEN HARDWARE!!!" + pass + + elif s == "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done" or \ + s == "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done" or \ + s == "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done" or \ + s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done": + conn.restart_node('rins') + #conn.restart_bootmanager('rins') + print "...Need to follow up on this one." + + ## If the disk is full, just start over. + #conn.restart_bootmanager('rins') + elif s == "": + pass + + else: + print " HOST %s" % hostname + print " UNKNOWN SEQUENCE: %s" % s + pass + + return True + + +# MAIN ------------------------------------------------------------------- + +def main(): + from config import config + from optparse import OptionParser + parser = OptionParser() + parser.set_defaults(node=None, nodelist=None, child=False, nosetup=False, verbose=False, force=None, quiet=False) + parser.add_option("", "--child", dest="child", action="store_true", + help="This is the child mode of this process.") + parser.add_option("", "--force", dest="force", metavar="boot_state", + help="Force a boot state passed to BootManager.py.") + parser.add_option("", "--quiet", dest="quiet", action="store_true", + help="Extra quiet output messages.") + parser.add_option("", "--verbose", dest="verbose", action="store_true", + help="Extra debug output messages.") + parser.add_option("", "--nosetup", dest="nosetup", action="store_true", + help="Do not perform the orginary setup phase.") + parser.add_option("", "--node", dest="node", metavar="nodename.edu", + help="A single node name to try to bring out of debug mode.") + parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", + help="A list of nodes to bring out of debug mode.") + config = config(parser) + config.parse_args() + + if config.nodelist: + nodes = config.getListFromFile(config.nodelist) + elif config.node: + nodes = [ config.node ] + else: + parser.print_help() + sys.exit(1) + + for node in nodes: + reboot(node, config) + +if __name__ == "__main__": + main() -- 2.47.0