From 66c4742c05622d6c53368e2890670eaefa5345f3 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Mon, 30 Mar 2009 20:00:48 +0000 Subject: [PATCH] added policy.py and updated bootman.py to work with the new policy framework. replaced old persistflags objects with node or site history queries. minor tweaks to web interface. --- bootman.py | 754 +++++++++--------- monitor/common.py | 21 +- monitor/wrapper/emailTxt.py | 4 +- nodeinfo.py | 2 +- policy.py | 432 ++++++++++ sitebad.py | 2 +- web/MonitorWeb/monitorweb/controllers.py | 2 + .../monitorweb/templates/pcuview.kid | 2 +- www/gadgets/sitemonitor.py | 6 +- 9 files changed, 838 insertions(+), 387 deletions(-) create mode 100755 policy.py diff --git a/bootman.py b/bootman.py index 0cd88ec..a43a95b 100755 --- a/bootman.py +++ b/bootman.py @@ -2,40 +2,44 @@ # Attempt to reboot a node in debug state. -from monitor import const -from monitor.database.info.model import * -from monitor.wrapper import plc -api = plc.getAuthAPI() -import sys + import os +import sys +import time +import random +import signal +import traceback +import subprocess +from sets import Set from getsshkeys import SSHKnownHosts -import subprocess -import time -from pcucontrol.util import command as moncommands -from sets import Set +from Rpyc import SocketConnection, Async +from Rpyc.Utils import * +import getconf +from monitor import config +from monitor import const +from monitor.model import * +from monitor.common import email_exception, found_within +from monitor.database.info.model import * +from monitor.wrapper import plc +from monitor.wrapper.emailTxt import mailtxt + +from pcucontrol.util import command as moncommands +from pcucontrol.util.command import Sopen from pcucontrol.transports.ssh import pxssh as pxssh from pcucontrol.transports.ssh import fdpexpect as fdpexpect from pcucontrol.transports.ssh import pexpect as pexpect -from monitor.model import * -from monitor.wrapper.emailTxt import mailtxt + from nodeconfig import network_config_to_str -import traceback -from monitor import config -import signal -class Sopen(subprocess.Popen): - def kill(self, signal = signal.SIGTERM): - os.kill(self.pid, signal) -#from Rpyc import SocketConnection, Async -from Rpyc import SocketConnection, Async -from Rpyc.Utils import * +api = plc.getAuthAPI() fb = None + class NodeConnection: def __init__(self, connection, node, config): self.node = node @@ -43,12 +47,20 @@ class NodeConnection: self.config = config def get_boot_state(self): - if self.c.modules.os.path.exists('/tmp/source'): - return "dbg" - elif self.c.modules.os.path.exists('/vservers'): - return "boot" - else: - return "unknown" + try: + if self.c.modules.os.path.exists('/tmp/source'): + return "debug" + elif self.c.modules.os.path.exists('/vservers'): + return "boot" + else: + return "unknown" + except EOFError: + traceback.print_exc() + print self.c.modules.sys.path + except: + traceback.print_exc() + + return "unknown" def get_dmesg(self): self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log") @@ -177,7 +189,6 @@ class NodeConnection: return -import random class PlanetLabSession: globalport = 22000 + int(random.random()*1000) @@ -190,7 +201,14 @@ class PlanetLabSession: self.setup_host() def get_connection(self, config): - return NodeConnection(SocketConnection("localhost", self.port), self.node, config) + conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config) + #i = 0 + #while i < 3: + # print i, conn.c.modules.sys.path + # print conn.c.modules.os.path.exists('/tmp/source') + # 
i+=1 + # time.sleep(1) + return conn def setup_host(self): self.port = PlanetLabSession.globalport @@ -210,6 +228,7 @@ class PlanetLabSession: # COPY Rpyc files to host cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args if self.verbose: print cmd + print cmd # TODO: Add timeout timeout = 120 localos = moncommands.CMD() @@ -253,6 +272,7 @@ EOF""") #cmd = cmd % args #if self.verbose: print cmd #print localos.system(cmd,timeout) + print "setup rpyc server over ssh" print ssh.ret # TODO: Add timeout @@ -265,6 +285,7 @@ EOF""") """%(user)s@%(hostname)s""" cmd = cmd % args if self.verbose: print cmd + print cmd self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE) # TODO: the read() here may block indefinitely. Need a better # approach therefore, that includes a timeout. @@ -288,14 +309,12 @@ EOF""") def __del__(self): if self.command: if self.verbose: print "Killing SSH session %s" % self.port + print "Killing SSH session %s" % self.port self.command.kill() - -def steps_to_list(steps): - ret_list = [] - for (id,label) in steps: - ret_list.append(label) - return ret_list + +def steps_to_list(steps, index=1): + return map(lambda x: x[index], steps) def index_to_id(steps,index): if index < len(steps): @@ -303,101 +322,176 @@ def index_to_id(steps,index): else: return "done" -def reboot(hostname, config=None, forced_action=None): +class DebugInterface: + def __init__(self, hostname): + self.hostname = hostname + self.session = None - # NOTE: Nothing works if the bootcd is REALLY old. - # So, this is the first step. - fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() - print fbnode.keys() - if fbnode['observed_category'] == "OLDBOOTCD": - print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" - args = {} - args['hostname_list'] = " %s" % hostname - - m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname, - mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - - print "\tDisabling %s due to out-of-date BOOTCD" % hostname - api.UpdateNode(hostname, {'boot_state' : 'disable'}) - return True - - node = hostname - print "Creating session for %s" % node - # update known_hosts file (in case the node has rebooted since last run) - if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node - try: - k = SSHKnownHosts(); k.update(node); k.write(); del k - except: - from monitor.common import email_exception - email_exception() - print traceback.print_exc() - return False - - try: - if config == None: - session = PlanetLabSession(node, False, True) - else: - session = PlanetLabSession(node, config.nosetup, config.verbose) - except Exception, e: - msg = "ERROR setting up session for %s" % hostname - print msg - print traceback.print_exc() - from monitor.common import email_exception - email_exception(msg) - print e - return False - - try: - conn = session.get_connection(config) - except EOFError: - # NOTE: sometimes the wait in setup_host() is not long enough. - # So, here we try to wait a little longer before giving up entirely. 
+ def getConnection(self): + print "Creating session for %s" % self.hostname + # update known_hosts file (in case the node has rebooted since last run) try: - time.sleep(session.timeout*4) - conn = session.get_connection(config) + k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k except: - print traceback.print_exc() - from monitor.common import email_exception email_exception() + print traceback.print_exc() return False - if forced_action == "reboot": - conn.restart_node('rins') - return True + try: + if config == None: + self.session = PlanetLabSession(self.hostname, False, True) + else: + self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose) + except Exception, e: + msg = "ERROR setting up session for %s" % self.hostname + print msg + traceback.print_exc() + email_exception(msg) + return False - boot_state = conn.get_boot_state() - if boot_state == "boot": - print "...Boot state of %s already completed : skipping..." % node - return True - elif boot_state == "unknown": - print "...Unknown bootstate for %s : skipping..."% node - return False - else: - pass + try: + conn = self.session.get_connection(config) + except EOFError: + # NOTE: sometimes the wait in setup_host() is not long enough. + # So, here we try to wait a little longer before giving up entirely. + try: + time.sleep(self.session.timeout*5) + conn = self.session.get_connection(config) + except: + traceback.print_exc() + email_exception(self.hostname) + return False + #print "trying to use conn before returning it." + #print conn.c.modules.sys.path + #print conn.c.modules.os.path.exists('/tmp/source') + #time.sleep(1) - if conn.bootmanager_running(): - print "...BootManager is currently running. Skipping host %s" % node - return True + #print "conn: %s" % conn + return conn - #if config != None: - # if config.force: - # conn.restart_bootmanager(config.force) - # return True + def getSequences(self): - # Read persistent flags, tagged on one week intervals. - pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags') + # TODO: This can be replaced with a DB definition at a future time. + # This would make it possible for an admin to introduce new + # patterns without touching code. 
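		# A minimal sketch of that DB-backed idea (hypothetical; no such
		# model exists in this patch).  An Elixir-style entity could hold
		# (sequence, action) pairs, and getSequences() would load them:
		#
		#   class BootmanSequenceRecord(Entity):
		#       sequence = Field(String, primary_key=True)
		#       action   = Field(String)   # e.g. "restart_bootmanager_boot"
		#
		#   def getSequences(self):
		#       sequences = {}
		#       for rec in BootmanSequenceRecord.query.all():
		#           sequences[rec.sequence] = rec.action
		#       return sequences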
+ sequences = {} + # restart_bootmanager_boot + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", + "bminit-cfg-auth-protoerror-exception-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-implementerror-update-debug-done", + ]: + sequences.update({n : "restart_bootmanager_boot"}) + + # conn.restart_bootmanager('rins') + for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", + # actual solution appears to involve removing the bad files, and + # continually trying to 
boot the node. + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + ]: + sequences.update({n : "restart_bootmanager_rins"}) + + # repair_node_keys + sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) + + # conn.restart_node('rins') + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + ]: + sequences.update({n : "restart_node_rins"}) + + # restart_node_boot + for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", + ]: + sequences.update({n: "restart_node_boot"}) + + # update_node_config_email + for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", + ]: + sequences.update({n : "update_node_config_email"}) + + for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", + "bminit-cfg-update-exception-nodehostname-update-debug-done", + ]: + sequences.update({n : "nodenetwork_email"}) + + # update_bootcd_email + for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + 
"bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", + ]: + sequences.update({n : "update_bootcd_email"}) + + for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + ]: + sequences.update({n: "suspect_error_email"}) + + # update_hardware_email + sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + + # broken_hardware_email + sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) + + # bad_dns_email + for n in [ + "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + ]: + sequences.update( { n : "bad_dns_email"}) - if config and not config.quiet: print "...downloading dmesg from %s" % node - dmesg = conn.get_dmesg() - child = fdpexpect.fdspawn(dmesg) + return sequences - sequence = [] - while True: + def getDiskSteps(self): steps = [ ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'), ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'), @@ -433,51 +527,19 @@ def reboot(hostname, config=None, forced_action=None): # SCSI error : <0 2 0 0> return code = 0x40001 # end_request: I/O error, dev sda, sector 572489600 ] - id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) - sequence.append(id) - - if id == "done": - break - - s = Set(sequence) - if config and not config.quiet: print "\tSET: ", s + return steps - if len(s) > 1: - print "...Potential drive errors on %s" % node - if len(s) == 2 and 'floppyerror' in s: - print "...Should investigate. Continuing with node." - else: - print "...Should investigate. Skipping node." - # TODO: send message related to these errors. 
- args = {} - args['hostname'] = hostname - args['log'] = conn.get_dmesg().read() + def getDiskSequence(self, steps, child): + sequence = [] + while True: + id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) + sequence.append(id) - m = PersistMessage(hostname, mailtxt.baddisk[0] % args, - mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') - return False - - print "...Downloading bm.log from %s" % node - log = conn.get_bootmanager_log() - child = fdpexpect.fdspawn(log) - - try: - if config.collect: return True - except: - pass - - time.sleep(1) - - if config and not config.quiet: print "...Scanning bm.log for errors" - action_id = "dbg" - sequence = [] - while True: + if id == "done": + break + return sequence + def getBootManagerStepPatterns(self): steps = [ ('bminit' , 'Initializing the BootManager.'), ('cfg' , 'Reading node configuration file.'), @@ -528,147 +590,118 @@ def reboot(hostname, config=None, forced_action=None): ('bootcheckfail' , 'BootCheckAuthentication'), ('bootupdatefail' , 'BootUpdateNode'), ] - list = steps_to_list(steps) - index = child.expect( list + [ pexpect.EOF ]) - id = index_to_id(steps,index) - sequence.append(id) - - if id == "exception": - if config and not config.quiet: print "...Found An Exception!!!" - elif index == len(list): - #print "Reached EOF" - break + return steps + + def getBootManagerSequenceFromLog(self, steps, child): + sequence = [] + while True: + + index = child.expect( steps_to_list(steps) + [ pexpect.EOF ]) + id = index_to_id(steps,index) + sequence.append(id) + + if id == "exception": + print "...Found An Exception!!!" + elif id == "done": #index == len(steps_to_list(steps)): + #print "Reached EOF" + break + + return sequence - s = "-".join(sequence) - print " FOUND SEQUENCE: ", s - # NOTE: We get or set the flag based on the current sequence identifier. - # By using the sequence identifier, we guarantee that there will be no - # frequent loops. I'm guessing there is a better way to track loops, - # though. - #if not config.force and pflags.getRecentFlag(s): - # pflags.setRecentFlag(s) - # pflags.save() - # print "... flag is set or it has already run recently. Skipping %s" % node +def restore(sitehist, hostname, config=None, forced_action=None): + + # NOTE: Nothing works if the bootcd is REALLY old. + # So, this is the first step. + + fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() + recent_actions = sitehist.getRecentActions(hostname=hostname) + + if fbnode['observed_category'] == "OLDBOOTCD": + print "\t...Notify owner to update BootImage!!!" + + if not found_within(recent_actions, 'newbootcd_notice', 3): + sitehist.sendMessage('newbootcd_notice', hostname=hostname) + + print "\tDisabling %s due to out-of-date BootImage" % hostname + api.UpdateNode(hostname, {'boot_state' : 'disable'}) + + # NOTE: nothing else is possible. + return True + + debugnode = DebugInterface(hostname) + conn = debugnode.getConnection() + #print "conn: %s" % conn + #print "trying to use conn after returning it." + #print conn.c.modules.sys.path + #print conn.c.modules.os.path.exists('/tmp/source') + if type(conn) == type(False): return False + + #if forced_action == "reboot": + # conn.restart_node('rins') # return True - sequences = {} + boot_state = conn.get_boot_state() + if boot_state != "debug": + print "... %s in %s state: skipping..." 
% (hostname , boot_state) + return boot_state == "boot" + if conn.bootmanager_running(): + print "...BootManager is currently running. Skipping host %s" %hostname + return True - # restart_bootmanager_boot - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + # Read persistent flags, tagged on one week intervals. + #pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags') - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + if config and not config.quiet: print "...downloading dmesg from %s" %hostname + dmesg = conn.get_dmesg() + child = fdpexpect.fdspawn(dmesg) - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-debug-done", - "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", - "bminit-cfg-auth-protoerror-exception-update-debug-done", - "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-implementerror-update-debug-done", - ]: - sequences.update({n : "restart_bootmanager_boot"}) - - # conn.restart_bootmanager('rins') - for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", - 
"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", - # actual solution appears to involve removing the bad files, and - # continually trying to boot the node. - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", - ]: - sequences.update({n : "restart_bootmanager_rins"}) - - # repair_node_keys - sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) - - # conn.restart_node('rins') - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - ]: - sequences.update({n : "restart_node_rins"}) - - # restart_node_boot - for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", - ]: - sequences.update({n: "restart_node_boot"}) - - # update_node_config_email - for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", - ]: - 
sequences.update({n : "update_node_config_email"}) + steps = debugnode.getDiskSteps() + sequence = debugnode.getDiskSequence(steps, child) - for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", - "bminit-cfg-update-exception-nodehostname-update-debug-done", - ]: - sequences.update({n : "nodenetwork_email"}) - - # update_bootcd_email - for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", - ]: - sequences.update({n : "update_bootcd_email"}) + s = Set(sequence) + if config and not config.quiet: print "\tSET: ", s - for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - ]: - sequences.update({n: "suspect_error_email"}) + if len(s) > 1: + print "...Potential drive errors on %s" % hostname + if len(s) == 2 and 'floppyerror' in s: + print "...Should investigate. Continuing with node." + else: + print "...Should investigate. Skipping node." + # TODO: send message related to these errors. - # update_hardware_email - sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) - sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + if not found_within(recent_actions, 'newbootcd_notice', 3): - # broken_hardware_email - sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) + log=conn.get_dmesg().read() + sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log) + conn.set_nodestate('disable') - # bad_dns_email - for n in [ - "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - ]: - sequences.update( { n : "bad_dns_email"}) + return False - flag_set = True + print "...Downloading bm.log from %s" %hostname + log = conn.get_bootmanager_log() + child = fdpexpect.fdspawn(log) + + if hasattr(config, 'collect') and config.collect: return True + + if config and not config.quiet: print "...Scanning bm.log for errors" + + time.sleep(1) + steps = debugnode.getBootManagerStepPatterns() + sequence = debugnode.getBootManagerSequenceFromLog(steps, child) + + s = "-".join(sequence) + print " FOUND SEQUENCE: ", s + + # NOTE: We get or set the flag based on the current sequence identifier. + # By using the sequence identifier, we guarantee that there will be no + # frequent loops. I'm guessing there is a better way to track loops, + # though. 
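	# One candidate for that "better way" (sketch only, not implemented):
	# every bootmanager_restore attempt is already recorded as an
	# ActionRecord by runBootManager() in policy.py, so a loop could be
	# detected by counting recent identical attempts instead of flags:
	#
	#   repeats = [ a for a in recent_actions
	#               if a.action_type == 'bootmanager_restore' ]
	#   if len(repeats) > 3: back off rather than restart again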
+ + sequences = debugnode.getSequences() + flag_set = True if s not in sequences: print " HOST %s" % hostname @@ -678,10 +711,9 @@ def reboot(hostname, config=None, forced_action=None): args['hostname'] = hostname args['sequence'] = s args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args, - mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages') - m.reset() - m.send([config.cc_email]) + args['viart'] = False + + sitehist.sendMessage('unknownsequence_notice', **args) conn.restart_bootmanager('boot') @@ -692,10 +724,10 @@ def reboot(hostname, config=None, forced_action=None): else: if sequences[s] == "restart_bootmanager_boot": - if config and not config.quiet: print "...Restarting BootManager.py on %s "% node + print "...Restarting BootManager.py on %s "%hostname conn.restart_bootmanager('boot') elif sequences[s] == "restart_bootmanager_rins": - if config and not config.quiet: print "...Restarting BootManager.py on %s "% node + print "...Restarting BootManager.py on %s "%hostname conn.restart_bootmanager('rins') elif sequences[s] == "restart_node_rins": conn.restart_node('rins') @@ -709,121 +741,89 @@ def reboot(hostname, config=None, forced_action=None): pass else: # there was some failure to synchronize the keys. - print "...Unable to repair node keys on %s" % node + print "...Unable to repair node keys on %s" %hostname elif sequences[s] == "suspect_error_email": args = {} args['hostname'] = hostname args['sequence'] = s args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args, - mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages') - m.reset() - m.send([config.cc_email]) + args['viart'] = False + sitehist.sendMessage('unknownsequence_notice', **args) conn.restart_bootmanager('boot') + # TODO: differentiate this and the 'nodenetwork_email' actions. elif sequences[s] == "update_node_config_email": - print "...Sending message to UPDATE NODE CONFIG" - args = {} - args['hostname'] = hostname - m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, - True, db='nodeid_persistmessages') - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.dump_plconf_file() - conn.set_nodestate('disable') + + if not found_within(recent_actions, 'nodeconfig_notice', 3): + args = {} + args['hostname'] = hostname + sitehist.sendMessage('nodeconfig_notice', **args) + conn.dump_plconf_file() elif sequences[s] == "nodenetwork_email": - print "...Sending message to LOOK AT NODE NETWORK" - args = {} - args['hostname'] = hostname - args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, - True, db='nodenet_persistmessages') - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.dump_plconf_file() - conn.set_nodestate('disable') - elif sequences[s] == "update_bootcd_email": - print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" 
- import getconf - args = {} - args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user: - args['hostname_list'] = "%s" % hostname + if not found_within(recent_actions, 'nodeconfig_notice', 3): + args = {} + args['hostname'] = hostname + args['bmlog'] = conn.get_bootmanager_log().read() + sitehist.sendMessage('nodeconfig_notice', **args) + conn.dump_plconf_file() - m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname, - mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages') + elif sequences[s] == "update_bootcd_email": - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) + if not found_within(recent_actions, 'newalphacd_notice', 3): + args = {} + args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user: + args['hostname'] = hostname + + sitehist.sendMessage('newalphacd_notice', **args) - print "\tDisabling %s due to out-of-date BOOTCD" % hostname - conn.set_nodestate('disable') + print "\tDisabling %s due to out-of-date BOOTCD" % hostname elif sequences[s] == "broken_hardware_email": # MAKE An ACTION record that this host has failed hardware. May # require either an exception "/minhw" or other manual intervention. # Definitely need to send out some more EMAIL. - print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname # TODO: email notice of broken hardware - args = {} - args['hostname'] = hostname - args['log'] = conn.get_dmesg().read() - m = PersistMessage(hostname, mailtxt.baddisk[0] % args, - mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') + if not found_within(recent_actions, 'baddisk_notice', 1): + print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname + args = {} + args['hostname'] = hostname + args['log'] = conn.get_dmesg().read() - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') + sitehist.sendMessage('baddisk_notice', **args) + conn.set_nodestate('disable') elif sequences[s] == "update_hardware_email": - print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname - args = {} - args['hostname'] = hostname - args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args, - mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') + if not found_within(recent_actions, 'minimalhardware_notice', 1): + print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname + args = {} + args['hostname'] = hostname + args['bmlog'] = conn.get_bootmanager_log().read() + sitehist.sendMessage('minimalhardware_notice', **args) elif sequences[s] == "bad_dns_email": - print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname - args = {} - try: - node = api.GetNodes(hostname)[0] - net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] - except: - from monitor.common import email_exception - email_exception() - print traceback.print_exc() - # TODO: api error. skip email, b/c all info is not available, - # flag_set will not be recorded. - return False - nodenet_str = network_config_to_str(net) + if not found_within(recent_actions, 'baddns_notice', 1): + print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" 
% hostname + args = {} + try: + node = api.GetNodes(hostname)[0] + net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] + except: + email_exception() + print traceback.print_exc() + # TODO: api error. skip email, b/c all info is not available, + # flag_set will not be recorded. + return False + nodenet_str = network_config_to_str(net) - args['hostname'] = hostname - args['network_config'] = nodenet_str - args['nodenetwork_id'] = net['nodenetwork_id'] - m = PersistMessage(hostname, mailtxt.baddns[0] % args, - mailtxt.baddns[1] % args, True, db='baddns_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') - - if flag_set: - pflags.setRecentFlag(s) - pflags.save() + args['hostname'] = hostname + args['network_config'] = nodenet_str + args['nodenetwork_id'] = net['nodenetwork_id'] + + sitehist.sendMessage('baddns_notice', **args) return True diff --git a/monitor/common.py b/monitor/common.py index aecd866..d082dbb 100644 --- a/monitor/common.py +++ b/monitor/common.py @@ -7,7 +7,8 @@ from monitor import database from monitor.wrapper import plc, plccache from datetime import datetime, timedelta -from monitor.model import PersistFlags, Message +from monitor.model import Message +from monitor.database.info import HistoryNodeRecord esc = struct.pack('i', 27) RED = esc + "[1;31m" @@ -85,6 +86,8 @@ def diff_time(timestamp, abstime=True): now = time.time() if timestamp == None: return "unknown" + if type(timestamp) == type(datetime.now()): + timestamp = time.mktime(timestamp.timetuple()) if abstime: diff = now - timestamp else: @@ -153,7 +156,7 @@ def nodegroup_display(node, fbdata, conf=None): node['pcu'] = "PCU" node['lastupdate'] = diff_time(node['last_contact']) - pf = PersistFlags(node['hostname'], 1, db='node_persistflags') + pf = HistoryNodeRecord.get_by(hostname=node['hostname']) try: node['lc'] = diff_time(pf.last_changed) except: @@ -237,11 +240,23 @@ def changed_greaterthan(last_changed, days): else: #print "last changed less than %s" % timedelta(days) return False + +def found_between(recent_actions, action_type, lower, upper): + return found_before(recent_actions, action_type, upper) and found_within(recent_actions, action_type, lower) + +def found_before(recent_actions, action_type, within): + for action in recent_actions: + if action_type == action.action_type and \ + action.date_created < (datetime.now() - timedelta(within)): + return True + return False def found_within(recent_actions, action_type, within): for action in recent_actions: + #print "%s - %s %s > %s - %s (%s) ==> %s" % (action.loginbase, action.action_type, action.date_created, datetime.now(), timedelta(within), datetime.now()-timedelta(within), action.date_created > (datetime.now() - timedelta(within)) ) if action_type == action.action_type and \ - datetime.now() - action.date_created < timedelta(within): + action.date_created > (datetime.now() - timedelta(within)): + #datetime.now() - action.date_created < timedelta(within): # recent action of given type. 
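			# Worked example of the three helpers (illustration only): for an
			# action created 2 days ago,
			#   found_within(acts, t, 3.5)     -> True   (newer than 3.5 days)
			#   found_before(acts, t, 3.5)     -> False  (not older than 3.5 days)
			#   found_between(acts, t, 3.5, 1) -> True   (between 1 and 3.5 days old)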
#print "%s found_within %s in recent_actions from %s" % (action_type, timedelta(within), action.date_created) return True diff --git a/monitor/wrapper/emailTxt.py b/monitor/wrapper/emailTxt.py index 98c8856..05afe6e 100644 --- a/monitor/wrapper/emailTxt.py +++ b/monitor/wrapper/emailTxt.py @@ -373,9 +373,9 @@ Thank you very much for your help, """) newalphacd_notice=(""" New Boot Images for %(hostname)s""", -"""As part of PlanetLab node monitoring, we noticed that your machine needs a new BootCD to fully support your hardware: +"""As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine. This means that it is so new that it needs a new BootCD, or that it is so old that it is no longer supported. -%(hostname)s + %(hostname)s To make this process as simple as possible, we have created All-in-One boot images that include the node configuration file. diff --git a/nodeinfo.py b/nodeinfo.py index e599d24..a237a8c 100755 --- a/nodeinfo.py +++ b/nodeinfo.py @@ -44,7 +44,7 @@ def plc_print_nodeinfo(plcnode): diff_time(plcnode['last_contact']), plcnode['key']) def fb_print_nodeinfo(fbnode): - pf = PersistFlags(fbnode['hostname'], 1, db='node_persistflags') + pf = HistoryNodeRecord.get_by(hostname= fbnode['hostname']) try: fbnode['last_change'] = diff_time(pf.last_changed) except: diff --git a/policy.py b/policy.py new file mode 100755 index 0000000..3d226f4 --- /dev/null +++ b/policy.py @@ -0,0 +1,432 @@ +#!/usr/bin/python + +# This script is used to manipulate the operational state of nodes in +# different node groups. These are basically set operations on nodes via the +# PLC api. +# +# Take the ng name as an argument.... +# optionally, +# * get a list of nodes in the given nodegroup. +# * set some or all in the set to rins. +# * restart them all. +# * do something else to them all. +# + +import os +import time +import traceback +import sys +from optparse import OptionParser + +import bootman # debug nodes + +from monitor import util +from monitor import const +from monitor import reboot +from monitor import config +from monitor import database +from monitor import parser as parsermodule +from monitor.common import * +from monitor.model import * +from monitor.wrapper import plc +from monitor.wrapper import plccache +from monitor.wrapper.emailTxt import mailtxt +from monitor.database.info.model import * + +from nodequery import verify,query_to_dict,node_select + +api = plc.getAuthAPI() + + +class SiteInterface(HistorySiteRecord): + @classmethod + def get_or_make(cls, if_new_set={}, **kwargs): + if 'hostname' in kwargs: + kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs['hostname']] + del kwargs['hostname'] + res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs) + return SiteInterface(res) + + def __init__(self, sitehist): + self.db = sitehist + + def getRecentActions(self, **kwargs): + # TODO: make query only return records within a certin time range, + # i.e. greater than 0.5 days ago. or 5 days, etc. 
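		# Sketch of that TODO, chaining a filter() onto the queries used
		# below (assumes the usual SQLAlchemy query interface):
		#
		#   since = datetime.now() - timedelta(0.5)
		#   recent_actions = ActionRecord.query.filter_by(loginbase=loginbase
		#                      ).filter(ActionRecord.date_created > since
		#                      ).order_by(ActionRecord.date_created.desc())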
+ + #print "kwargs: ", kwargs + + recent_actions = [] + if 'loginbase' in kwargs: + recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc()) + elif 'hostname' in kwargs: + recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc()) + return recent_actions + + def increasePenalty(self): + #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',) + self.db.penalty_level += 1 + # NOTE: this is to prevent overflow or index errors in applyPenalty. + # there's probably a better approach to this. + if self.db.penalty_level >= 2: + self.db.penalty_level = 2 + self.db.penalty_applied = True + + def applyPenalty(self): + penalty_map = [] + penalty_map.append( { 'name': 'noop', 'enable' : lambda site: None, + 'disable' : lambda site: None } ) + penalty_map.append( { 'name': 'nocreate', 'enable' : lambda site: plc.removeSiteSliceCreation(site), + 'disable' : lambda site: plc.enableSiteSliceCreation(site) } ) + penalty_map.append( { 'name': 'suspendslices', 'enable' : lambda site: plc.suspendSiteSlices(site), + 'disable' : lambda site: plc.enableSiteSlices(site) } ) + + for i in range(len(penalty_map)-1,self.db.penalty_level,-1): + print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase) + penalty_map[i]['disable'](self.db.loginbase) + + for i in range(0,self.db.penalty_level+1): + print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase) + penalty_map[i]['enable'](self.db.loginbase) + + return + + def pausePenalty(self): + act = ActionRecord(loginbase=self.db.loginbase, + action='penalty', + action_type='pause_penalty',) + + def clearPenalty(self): + #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='clear_penalty',) + self.db.penalty_level = 0 + self.db.penalty_applied = False + + def getTicketStatus(self): + if self.db.message_id != 0: + rtstatus = mailer.getTicketStatus(self.db.message_id) + self.db.message_status = rtstatus['Status'] + self.db.message_queue = rtstatus['Queue'] + self.db.message_created = datetime.fromtimestamp(rtstatus['Created']) + + def setTicketStatus(self, status): + print 'SETTING status %s' % status + if self.db.message_id != 0: + rtstatus = mailer.setTicketStatus(self.db.message_id, status) + + def getContacts(self): + contacts = [] + if self.db.penalty_level >= 0: + contacts += plc.getTechEmails(self.db.loginbase) + + if self.db.penalty_level >= 1: + contacts += plc.getPIEmails(self.db.loginbase) + + if self.db.penalty_level >= 2: + contacts += plc.getSliceUserEmails(self.db.loginbase) + + return contacts + + def sendMessage(self, type, **kwargs): + + # NOTE: evidently changing an RT message's subject opens the ticket. + # the logic in this policy depends up a ticket only being 'open' + # if a user has replied to it. + # So, to preserve these semantics, we check the status before + # sending, then after sending, reset the status to the + # previous status. + # There is a very tiny race here, where a user sends a reply + # within the time it takes to check, send, and reset. + # This sucks. It's almost certainly fragile. + + # + # TODO: catch any errors here, and add an ActionRecord that contains + # those errors. 
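		# Sketch of that TODO (illustration; reuses the error_string column
		# that runBootManager() and attemptReboot() below already write):
		#
		#   try:
		#       ret = m.send(contacts)
		#   except:
		#       ActionRecord(loginbase=self.db.loginbase, hostname=hostname,
		#                    action='notice', action_type=type,
		#                    error_string=traceback.format_exc())
		#       return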
+ + args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level} + args.update(kwargs) + + hostname = None + if 'hostname' in args: + hostname = args['hostname'] + + if hasattr(mailtxt, type): + + message = getattr(mailtxt, type) + viart = True + if 'viart' in kwargs: + viart = kwargs['viart'] + + if viart: + self.getTicketStatus() # get current message status + + m = Message(message[0] % args, message[1] % args, viart, self.db.message_id) + + contacts = self.getContacts() + contacts = [config.cc_email] # TODO: remove after testing... + + print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname) + + ret = m.send(contacts) + if viart: + self.db.message_id = ret + # reset to previous status, since a new subject 'opens' RT tickets. + self.setTicketStatus(self.db.message_status) + + # NOTE: only make a record of it if it's in RT. + act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice', + action_type=type, message_id=self.db.message_id) + + else: + print "+-- WARNING! ------------------------------" + print "| No such message name in emailTxt.mailtxt: %s" % type + print "+------------------------------------------" + + return + + def closeTicket(self): + # TODO: close the rt ticket before overwriting the message_id + mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor") + act = ActionRecord(loginbase=self.db.loginbase, action='notice', + action_type='end_notice', message_id=self.db.message_id) + self.db.message_id = 0 + self.db.message_status = "new" + + def runBootManager(self, hostname): + print "attempting BM reboot of %s" % hostname + ret = "" + try: + ret = bootman.restore(self, hostname) + err = "" + except: + err = traceback.format_exc() + print err + + act = ActionRecord(loginbase=self.db.loginbase, + hostname=hostname, + action='reboot', + action_type='bootmanager_restore', + error_string=err) + return ret + + def attemptReboot(self, hostname): + print "attempting PCU reboot of %s" % hostname + ret = reboot.reboot_str(hostname) + if ret == 0 or ret == "0": + ret = "" + act = ActionRecord(loginbase=self.db.loginbase, + hostname=hostname, + action='reboot', + action_type='first_try_reboot', + error_string=ret) + +def logic(): + + plc.nodeBootState(host, 'rins') + node_end_record(host) + + + + +def main(hostnames, sitenames): + l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) + # commands: + i = 1 + node_count = 1 + site_count = 1 + #print "hosts: %s" % hostnames + for host in hostnames: + try: + lb = plccache.plcdb_hn2lb[host] + except: + print "unknown host in plcdb_hn2lb %s" % host + continue + + sitehist = SiteInterface.get_or_make(loginbase=lb) + + recent_actions = sitehist.getRecentActions(hostname=host) + + nodehist = HistoryNodeRecord.findby_or_create(hostname=host) + + print "%s %s" % ( nodehist.hostname, nodehist.status) + if nodehist.status == 'good' and \ + changed_lessthan(nodehist.last_changed, 1.0) and \ + not found_within(recent_actions, 'online_notice', 0.5): + # NOTE: there is a narrow window in which this command must be + # evaluated, otherwise the notice will not go out. this is not ideal. 
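			# Concretely: the condition above only holds while last_changed is
			# less than 1.0 day old, and a sent notice is suppressed for just
			# 0.5 days by found_within, so a poller that runs less often than
			# daily can miss the online notice entirely.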
+ sitehist.sendMessage('online_notice', hostname=host) + print "send message for host %s online" % host + + pass + + if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \ + changed_greaterthan(nodehist.last_changed,1.0) and \ + not found_between(recent_actions, 'first_try_reboot', 3.5, 1): + + sitehist.attemptReboot(host) + print "send message for host %s first_try_reboot" % host + pass + + # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1) + # will be false for a day after the above condition is satisfied + if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \ + changed_greaterthan(nodehist.last_changed,1.5) and \ + found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \ + not found_within(recent_actions, 'pcufailed_notice', 3.5): + # found_within(recent_actions, 'first_try_reboot', 3.5) and \ + + # send pcu failure message + #act = ActionRecord(**kwargs) + sitehist.sendMessage('pcufailed_notice', hostname=host) + print "send message for host %s PCU Failure" % host + pass + + if nodehist.status == 'monitordebug' and \ + changed_greaterthan(nodehist.last_changed, 1) and \ + not found_between(recent_actions, 'bootmanager_restore', 0.5, 0): + # send down node notice + # delay 0.5 days before retrying... + + print "send message for host %s bootmanager_restore" % host + sitehist.runBootManager(host) + # sitehist.sendMessage('retry_bootman', hostname=host) + + if nodehist.status == 'down' and \ + changed_greaterthan(nodehist.last_changed, 2) and \ + not found_within(recent_actions, 'down_notice', 3.5): + # send down node notice + + sitehist.sendMessage('down_notice', hostname=host) + print "send message for host %s offline" % host + pass + + node_count = node_count + 1 + + for site in sitenames: + sitehist = SiteInterface.get_or_make(loginbase=site) + # TODO: make query only return records within a certin time range, + # i.e. greater than 0.5 days ago. or 5 days, etc. + recent_actions = sitehist.getRecentActions(loginbase=site) + + #sitehist.sendMessage('test_notice', host) + + print "%s %s" % ( sitehist.db.loginbase , sitehist.db.status) + if sitehist.db.status == 'down': + if not found_within(recent_actions, 'pause_penalty', 30) and \ + not found_within(recent_actions, 'increase_penalty', 7) and \ + changed_greaterthan(sitehist.db.last_changed, 7): + + # TODO: catch errors + sitehist.increasePenalty() + #sitehist.applyPenalty() + sitehist.sendMessage('increase_penalty') + + print "send message for site %s penalty increase" % site + + if sitehist.db.status == 'good': + # clear penalty + # NOTE: because 'all clear' should have an indefinite status, we + # have a boolean value rather than a 'recent action' + if sitehist.db.penalty_applied: + # send message that penalties are cleared. + + sitehist.clearPenalty() + #sitehist.applyPenalty() + sitehist.sendMessage('clear_penalty') + sitehist.closeTicket() + + print "send message for site %s penalty cleared" % site + + # find all ticket ids for site ( could be on the site record? ) + # determine if there are penalties within the last 30 days? + # if so, add a 'pause_penalty' action. 
+ if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0: + # pause escalation + print "Pausing penalties for %s" % site + sitehist.pausePenalty() + + site_count = site_count + 1 + + session.flush() + + return + + +if __name__ == "__main__": + parser = parsermodule.getParser(['nodesets']) + parser.set_defaults( timewait=0, + skip=0, + rins=False, + reboot=False, + findbad=False, + force=False, + nosetup=False, + verbose=False, + quiet=False, + ) + + parser.add_option("", "--stopselect", dest="stopselect", metavar="", + help="The select string that must evaluate to true for the node to be considered 'done'") + parser.add_option("", "--findbad", dest="findbad", action="store_true", + help="Re-run findbad on the nodes we're going to check before acting.") + parser.add_option("", "--force", dest="force", action="store_true", + help="Force action regardless of previous actions/logs.") + parser.add_option("", "--rins", dest="rins", action="store_true", + help="Set the boot_state to 'rins' for all nodes.") + parser.add_option("", "--reboot", dest="reboot", action="store_true", + help="Actively try to reboot the nodes, keeping a log of actions.") + + parser.add_option("", "--verbose", dest="verbose", action="store_true", + help="Extra debug output messages.") + parser.add_option("", "--nosetup", dest="nosetup", action="store_true", + help="Do not perform the orginary setup phase.") + parser.add_option("", "--skip", dest="skip", + help="Number of machines to skip on the input queue.") + parser.add_option("", "--timewait", dest="timewait", + help="Minutes to wait between iterations of 10 nodes.") + + parser = parsermodule.getParser(['defaults'], parser) + config = parsermodule.parse_args(parser) + +# # COLLECT nodegroups, nodes and node lists +# if config.nodegroup: +# ng = api.GetNodeGroups({'name' : config.nodegroup}) +# nodelist = api.GetNodes(ng[0]['node_ids']) +# hostnames = [ n['hostname'] for n in nodelist ] + +# if config.node or config.nodelist: +# if config.node: hostnames = [ config.node ] +# else: hostnames = util.file.getListFromFile(config.nodelist) +# +# fbquery = FindbadNodeRecord.get_all_latest() +# fb_nodelist = [ n.hostname for n in fbquery ] + +# if config.nodeselect: +# hostnames = node_select(config.nodeselect, fb_nodelist) + + fbquery = HistoryNodeRecord.query.all() + hostnames = [ n.hostname for n in fbquery ] + + fbquery = HistorySiteRecord.query.all() + sitenames = [ s.loginbase for s in fbquery ] + + if config.site: + site = api.GetSites(config.site) + l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname']) + filter_hostnames = [ n['hostname'] for n in l_nodes ] + + hostnames = filter(lambda x: x in filter_hostnames, hostnames) + sitenames = [config.site] + + if config.node: + hostnames = [ config.node ] + sitenames = [ plccache.plcdb_hn2lb[config.node] ] + + try: + main(hostnames, sitenames) + except KeyboardInterrupt: + print "Killed by interrupt" + sys.exit(0) + except: + #email_exception() + print traceback.print_exc(); + print "Continuing..." 
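# Worked example of applyPenalty() above (illustration): with
# penalty_level == 1 and the three-entry penalty_map, the first loop runs
# only i == 2 and disables 'suspendslices'; the second runs i == 0, 1 and
# (re)applies 'noop' and 'nocreate'.  increasePenalty() clamps
# penalty_level at 2, which keeps both loops inside penalty_map.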
diff --git a/sitebad.py b/sitebad.py index 5a2f3be..a0407c9 100755 --- a/sitebad.py +++ b/sitebad.py @@ -41,7 +41,7 @@ def getnodesup(nodelist): for node in nodelist: try: nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname']) - if nodehist is not None and nodehist.status == "good": + if nodehist is not None and nodehist.status != 'down': up = up + 1 except: import traceback diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index 0d4e703..774ad00 100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -218,6 +218,8 @@ class Root(controllers.RootController): query.append(node) elif filter == node.history.status: query.append(node) + elif filter == 'boot': + query.append(node) #if filter == node.observed_status: # if filter == "DOWN": diff --git a/web/MonitorWeb/monitorweb/templates/pcuview.kid b/web/MonitorWeb/monitorweb/templates/pcuview.kid index 694fc4d..e51c743 100644 --- a/web/MonitorWeb/monitorweb/templates/pcuview.kid +++ b/web/MonitorWeb/monitorweb/templates/pcuview.kid @@ -224,7 +224,7 @@ from links import * - ${act.message_id} + ${act.message_id}

 				
 			
diff --git a/www/gadgets/sitemonitor.py b/www/gadgets/sitemonitor.py
index c52b36b..3ec6231 100755
--- a/www/gadgets/sitemonitor.py
+++ b/www/gadgets/sitemonitor.py
@@ -108,7 +108,8 @@ def main():
 
 	fb = database.dbLoad("findbad")
 	lb2hn = database.dbLoad("plcdb_lb2hn")
-	pf = database.dbLoad("node_persistflags")
+	# TODO: pull from HistoryNodeRecord table instead
+	#pf = database.dbLoad("node_persistflags")
 
 	# SETUP header
 	t = TABLE(border="0", cellspacing="0", cellpadding="0")
@@ -135,7 +136,8 @@ def main():
 			url = 'http://www.planet-lab.org/db/nodes/index.php?nodepattern=%s' % host
 			td = TD(A(host, target='_blank', href=url), bgcolor=color)
 			r.append(td)
-			lc = pf[host].last_changed
+			#lc = pf[host].last_changed
+			lc = -1
 			td = TD(diff_time(lc))
 			r.append(td)
 			t.append(r)
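A possible follow-up for the TODO above, mirroring how nodeinfo.py now reads
last_changed (sketch; assumes the gadget can import HistoryNodeRecord from
the monitor package):

	pf = HistoryNodeRecord.get_by(hostname=host)
	if pf is not None:
		lc = pf.last_changed
	else:
		lc = -1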
-- 
2.43.0