From c9d06f3b274ecbc092a0b3eb1f5ceb6c0f734aad Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Thu, 16 Apr 2009 19:17:37 +0000 Subject: [PATCH] svn merge -r 12308:13112 https://svn.planet-lab.org/svn/Monitor/branches/2.0/ copying all monitor2 changes back into trunk to begin updates for 4.3 and updates to sortable columns. --- Makefile | 8 +- automate-default.sh | 24 +- blacklist.py | 51 +- bootman.py | 749 +++++++++--------- findall.py | 14 +- findbad.py | 38 +- findbadpcu.py | 47 +- get_metasite_nodes.py | 2 - grouprins.py | 379 --------- mailmonitor.py | 5 +- monitor/common.py | 62 +- monitor/database/info/__init__.py | 1 + monitor/database/info/action.py | 71 +- monitor/database/info/findbad.py | 74 +- monitor/database/info/history.py | 15 + monitor/database/info/interface.py | 198 +++++ monitor/database/info/model.py | 1 + monitor/database/info/plc.py | 33 + monitor/model.py | 2 + monitor/policy.py | 3 +- monitor/reboot.py | 144 ++++ monitor/scanapi.py | 47 +- monitor/wrapper/emailTxt.py | 96 ++- monitor/wrapper/plc.py | 78 +- monitor/wrapper/plccache.py | 142 ++-- nodebad.py | 128 ++- nodegroups.py | 15 +- nodeinfo.py | 8 +- nodequery.py | 9 +- pcubad.py | 108 ++- pcucontrol/models/APCControl.py | 14 +- pcucontrol/models/BayTech.py | 6 + pcucontrol/models/DRAC.py | 19 +- pcucontrol/models/HPiLO.py | 3 +- pcucontrol/models/IPAL.py | 18 +- pcucontrol/models/ePowerSwitch.py | 12 +- .../models/intelamt/RemoteControlSample.cpp | 14 +- pcucontrol/reboot.py | 189 ++--- pcucontrol/util/__init__.py | 0 {monitor => pcucontrol}/util/command.py | 107 ++- policy.py | 237 ++++++ setup.py | 15 +- sitebad.py | 104 ++- siteinfo.py | 7 +- testapi.py | 2 +- nodenetwork.py => tests/nodenetwork.py | 0 web/MonitorWeb/monitorweb/controllers.py | 158 +++- web/MonitorWeb/monitorweb/monitor_xmlrpc.py | 161 ++++ .../monitorweb/static/css/style.css | 15 +- web/MonitorWeb/monitorweb/templates/links.py | 2 + .../monitorweb/templates/nodehistory.kid | 60 ++ .../monitorweb/templates/nodelist.kid | 12 +- .../monitorweb/templates/pcuview.kid | 60 +- .../monitorweb/templates/sitehistory.kid | 55 ++ .../monitorweb/templates/sitelist.kid | 2 +- .../monitorweb/templates/sitemenu.kid | 8 +- www/gadgets/sitemonitor.py | 6 +- zabbix.spec | 37 + zabbix/zabbixsync.py | 4 +- 59 files changed, 2525 insertions(+), 1354 deletions(-) delete mode 100755 grouprins.py create mode 100644 monitor/database/info/interface.py create mode 100644 monitor/database/info/plc.py create mode 100755 monitor/reboot.py create mode 100644 pcucontrol/util/__init__.py rename {monitor => pcucontrol}/util/command.py (71%) create mode 100755 policy.py rename nodenetwork.py => tests/nodenetwork.py (100%) create mode 100644 web/MonitorWeb/monitorweb/monitor_xmlrpc.py create mode 100644 web/MonitorWeb/monitorweb/templates/nodehistory.kid create mode 100644 web/MonitorWeb/monitorweb/templates/sitehistory.kid diff --git a/Makefile b/Makefile index ec5927a..375baec 100644 --- a/Makefile +++ b/Makefile @@ -6,9 +6,11 @@ SHA1SUM = sha1sum SPECFILE = zabbix.spec #main.URL := http://voxel.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.1.tar.gz -#main.SHA1SUM:= 6e66efdbbdf23dc3de01379b30ded7b005fb49d9 -main.URL := http://superb-east.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.2.tar.gz -main.SHA1SUM:= 575c443adec1703c2c242dbf353de9dc3bb4cafb +#main.SHA1SUM := 6e66efdbbdf23dc3de01379b30ded7b005fb49d9 +#main.URL := http://superb-east.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.2.tar.gz +#main.SHA1SUM := 575c443adec1703c2c242dbf353de9dc3bb4cafb +main.URL 
:= http://build.planet-lab.org/third-party/zabbix-1.6.2.tar.gz +main.SHA1SUM := 575c443adec1703c2c242dbf353de9dc3bb4cafb main.FILE := $(notdir $(main.URL)) # Thierry - when called from within the build, PWD is /build diff --git a/automate-default.sh b/automate-default.sh index 046c1ac..24a9e61 100755 --- a/automate-default.sh +++ b/automate-default.sh @@ -61,30 +61,20 @@ fi source ${MONITOR_SCRIPT_ROOT}/agent.sh -echo "Performing Findbad Nodes" +echo "Performing FindAll Nodes" ######################### # 1. FINDBAD NODES -${MONITOR_SCRIPT_ROOT}/findbad.py --increment $DATE || : +${MONITOR_SCRIPT_ROOT}/findall.py --increment $DATE || : ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || : - -echo "Performing Findbad PCUs" -######################### -# 2. FINDBAD PCUS -${MONITOR_SCRIPT_ROOT}/findbadpcu.py --increment $DATE || : # clean up stray 'locfg' processes that hang around inappropriately... ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || : -echo "Performing uptime changes for sites, nodes, and pcus" -######################## -# 3. record last-changed for sites, nodes and pcus. -${MONITOR_SCRIPT_ROOT}/sitebad.py || : -${MONITOR_SCRIPT_ROOT}/nodebad.py || : -${MONITOR_SCRIPT_ROOT}/pcubad.py || : +${MONITOR_SCRIPT_ROOT}/policy.py $DATE echo "Archiving pkl files" ######################### # Archive pkl files. -for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do +for f in act_all l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ] ; then cp ${MONITOR_DATA_ROOT}/production.$f.pkl ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.production.$f.pkl else @@ -92,11 +82,5 @@ for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistfl fi done -#echo "Running grouprins on all dbg nodes" -############################ -# 5. Check if there are any nodes in dbg state. Clean up afterward. 
-#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DOWN&&boot_state=(boot|rins|dbg|diag)' --stopselect "state=BOOT" || : -#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' --stopselect 'state=BOOT' || : - cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log rm -f $MONITOR_PID diff --git a/blacklist.py b/blacklist.py index c96dc89..8704b59 100755 --- a/blacklist.py +++ b/blacklist.py @@ -4,8 +4,8 @@ import os import sys import string import time -import database -import plc +from monitor import database +from monitor.database.info.model import * import getopt def usage(): @@ -13,38 +13,61 @@ def usage(): def main(): + loginbase = False + try: - longopts = ["delete=", "help"] - (opts, argv) = getopt.getopt(sys.argv[1:], "d:h", longopts) + longopts = ["delete=", "loginbase", "help"] + (opts, argv) = getopt.getopt(sys.argv[1:], "d:lh", longopts) except getopt.GetoptError, err: print "Error: " + err.msg sys.exit(1) - l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) + hostnames_q = BlacklistRecord.getHostnameBlacklist() + loginbases_q = BlacklistRecord.getLoginbaseBlacklist() + hostnames = [ h.hostname for h in hostnames_q ] + loginbases = [ h.loginbase for h in loginbases_q ] for (opt, optval) in opts: if opt in ["-d", "--delete"]: - i = int(optval) - del l_blacklist[i] + i = optval + bl = BlacklistRecord.get_by(hostname=i) + bl.delete() + elif opt in ["-l", "--loginbase"]: + loginbase = True else: usage() sys.exit(0) i_cnt = 0 - for i in l_blacklist: - print i_cnt, " ", i - i_cnt += 1 + if not loginbase: + for i in hostnames: + print i + i_cnt += 1 + else: + for i in loginbases: + print i + i_cnt += 1 + + while 1: line = sys.stdin.readline() if not line: break line = line.strip() - if not line in l_blacklist: - l_blacklist.append(line) + if line not in hostnames and line not in loginbases: + if loginbase: + bl = BlacklistRecord(loginbase=line) + else: + bl = BlacklistRecord(hostname=line) + bl.flush() + i_cnt += 1 - print "Total %d nodes in blacklist" % (len(l_blacklist)) - database.dbDump("l_blacklist") + session.flush() + if loginbase: + print "Total %d loginbases in blacklist" % (i_cnt) + else: + print "Total %d nodes in blacklist" % (i_cnt) if __name__ == '__main__': import os diff --git a/bootman.py b/bootman.py index 22201cb..1a04ef0 100755 --- a/bootman.py +++ b/bootman.py @@ -2,40 +2,45 @@ # Attempt to reboot a node in debug state. 
-from monitor import const -from monitor.database.info.model import * -from monitor.wrapper import plc -api = plc.getAuthAPI() -import sys + import os +import sys +import time +import random +import signal +import traceback +import subprocess +from sets import Set from getsshkeys import SSHKnownHosts -import subprocess -import time -from monitor.util import command as moncommands -from sets import Set +from Rpyc import SocketConnection, Async +from Rpyc.Utils import * + +import getconf +from monitor import config +from monitor import const +from monitor.model import * +from monitor.common import email_exception, found_within +from monitor.database.info.model import * +from monitor.wrapper import plc +from monitor.wrapper import plccache +from monitor.wrapper.emailTxt import mailtxt +from pcucontrol.util import command as moncommands +from pcucontrol.util.command import Sopen from pcucontrol.transports.ssh import pxssh as pxssh from pcucontrol.transports.ssh import fdpexpect as fdpexpect from pcucontrol.transports.ssh import pexpect as pexpect -from monitor.model import * -from monitor.wrapper.emailTxt import mailtxt + from nodeconfig import network_config_to_str -import traceback -from monitor import config -import signal -class Sopen(subprocess.Popen): - def kill(self, signal = signal.SIGTERM): - os.kill(self.pid, signal) -#from Rpyc import SocketConnection, Async -from Rpyc import SocketConnection, Async -from Rpyc.Utils import * +api = plc.getAuthAPI() fb = None + class NodeConnection: def __init__(self, connection, node, config): self.node = node @@ -43,12 +48,20 @@ class NodeConnection: self.config = config def get_boot_state(self): - if self.c.modules.os.path.exists('/tmp/source'): - return "dbg" - elif self.c.modules.os.path.exists('/vservers'): - return "boot" - else: - return "unknown" + try: + if self.c.modules.os.path.exists('/tmp/source'): + return "debug" + elif self.c.modules.os.path.exists('/vservers'): + return "boot" + else: + return "unknown" + except EOFError: + traceback.print_exc() + print self.c.modules.sys.path + except: + traceback.print_exc() + + return "unknown" def get_dmesg(self): self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log") @@ -82,11 +95,11 @@ class NodeConnection: print " ERROR:", x print " Possibly, unable to find valid configuration file" - if bm_continue and self.config and not self.config.quiet: + if bm_continue: for key in bm.VARS.keys(): print key, " == ", bm.VARS[key] else: - if self.config and not self.config.quiet: print " Unable to read Node Configuration" + print " Unable to read Node Configuration" def compare_and_repair_nodekeys(self): @@ -102,7 +115,7 @@ class NodeConnection: ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration bm_continue = True - plcnode = api.GetNodes({'hostname': self.node}, None)[0] + plcnode = plccache.GetNodeByName(self.node) InitializeBootManager.Run(bm.VARS, bm.LOG) try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG) @@ -177,7 +190,6 @@ class NodeConnection: return -import random class PlanetLabSession: globalport = 22000 + int(random.random()*1000) @@ -190,7 +202,14 @@ class PlanetLabSession: self.setup_host() def get_connection(self, config): - return NodeConnection(SocketConnection("localhost", self.port), self.node, config) + conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config) + #i = 0 + #while i < 3: + # print i, conn.c.modules.sys.path + # print conn.c.modules.os.path.exists('/tmp/source') + # i+=1 + # time.sleep(1) + return conn def setup_host(self): self.port 
= PlanetLabSession.globalport @@ -210,6 +229,7 @@ class PlanetLabSession: # COPY Rpyc files to host cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args if self.verbose: print cmd + print cmd # TODO: Add timeout timeout = 120 localos = moncommands.CMD() @@ -253,6 +273,7 @@ EOF""") #cmd = cmd % args #if self.verbose: print cmd #print localos.system(cmd,timeout) + print "setup rpyc server over ssh" print ssh.ret # TODO: Add timeout @@ -265,6 +286,7 @@ EOF""") """%(user)s@%(hostname)s""" cmd = cmd % args if self.verbose: print cmd + print cmd self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE) # TODO: the read() here may block indefinitely. Need a better # approach therefore, that includes a timeout. @@ -288,14 +310,12 @@ EOF""") def __del__(self): if self.command: if self.verbose: print "Killing SSH session %s" % self.port + print "Killing SSH session %s" % self.port self.command.kill() - -def steps_to_list(steps): - ret_list = [] - for (id,label) in steps: - ret_list.append(label) - return ret_list + +def steps_to_list(steps, index=1): + return map(lambda x: x[index], steps) def index_to_id(steps,index): if index < len(steps): @@ -303,93 +323,176 @@ def index_to_id(steps,index): else: return "done" -def reboot(hostname, config=None, forced_action=None): +class DebugInterface: + def __init__(self, hostname): + self.hostname = hostname + self.session = None - # NOTE: Nothing works if the bootcd is REALLY old. - # So, this is the first step. - fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() - if fbnode['category'] == "OLDBOOTCD": - print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" - args = {} - args['hostname_list'] = " %s" % hostname - - m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname, - mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - - print "\tDisabling %s due to out-of-date BOOTCD" % hostname - api.UpdateNode(hostname, {'boot_state' : 'disable'}) - return True - - node = hostname - print "Creating session for %s" % node - # update known_hosts file (in case the node has rebooted since last run) - if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node - try: - k = SSHKnownHosts(); k.update(node); k.write(); del k - except: - print traceback.print_exc() - return False - - try: - if config == None: - session = PlanetLabSession(node, False, True) - else: - session = PlanetLabSession(node, config.nosetup, config.verbose) - except Exception, e: - print "ERROR setting up session for %s" % hostname - print traceback.print_exc() - print e - return False - - try: - conn = session.get_connection(config) - except EOFError: - # NOTE: sometimes the wait in setup_host() is not long enough. - # So, here we try to wait a little longer before giving up entirely. 
+ def getConnection(self): + print "Creating session for %s" % self.hostname + # update known_hosts file (in case the node has rebooted since last run) try: - time.sleep(session.timeout*4) - conn = session.get_connection(config) + k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k except: + email_exception() print traceback.print_exc() return False - if forced_action == "reboot": - conn.restart_node('rins') - return True + try: + if config == None: + self.session = PlanetLabSession(self.hostname, False, True) + else: + self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose) + except Exception, e: + msg = "ERROR setting up session for %s" % self.hostname + print msg + traceback.print_exc() + email_exception(msg) + return False - boot_state = conn.get_boot_state() - if boot_state == "boot": - print "...Boot state of %s already completed : skipping..." % node - return True - elif boot_state == "unknown": - print "...Unknown bootstate for %s : skipping..."% node - return False - else: - pass + try: + conn = self.session.get_connection(config) + except EOFError: + # NOTE: sometimes the wait in setup_host() is not long enough. + # So, here we try to wait a little longer before giving up entirely. + try: + time.sleep(self.session.timeout*5) + conn = self.session.get_connection(config) + except: + traceback.print_exc() + email_exception(self.hostname) + return False + #print "trying to use conn before returning it." + #print conn.c.modules.sys.path + #print conn.c.modules.os.path.exists('/tmp/source') + #time.sleep(1) - if conn.bootmanager_running(): - print "...BootManager is currently running. Skipping host %s" % node - return True + #print "conn: %s" % conn + return conn - #if config != None: - # if config.force: - # conn.restart_bootmanager(config.force) - # return True + def getSequences(self): - # Read persistent flags, tagged on one week intervals. - pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags') + # TODO: This can be replaced with a DB definition at a future time. + # This would make it possible for an admin to introduce new + # patterns without touching code. 
+ sequences = {} + # restart_bootmanager_boot + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", + "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", + "bminit-cfg-auth-protoerror-exception-update-debug-done", + "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", + "bminit-cfg-auth-getplc-implementerror-update-debug-done", + ]: + sequences.update({n : "restart_bootmanager_boot"}) + + # conn.restart_bootmanager('rins') + for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", + # actual solution appears to involve removing the bad files, and + # continually trying to 
boot the node. + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + ]: + sequences.update({n : "restart_bootmanager_rins"}) + + # repair_node_keys + sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) + + # conn.restart_node('rins') + for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + ]: + sequences.update({n : "restart_node_rins"}) + + # restart_node_boot + for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", + "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", + "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", + ]: + sequences.update({n: "restart_node_boot"}) + + # update_node_config_email + for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", + ]: + sequences.update({n : "update_node_config_email"}) + + for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", + "bminit-cfg-update-exception-nodehostname-update-debug-done", + ]: + sequences.update({n : "nodenetwork_email"}) + + # update_bootcd_email + for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", + 
"bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", + "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", + ]: + sequences.update({n : "update_bootcd_email"}) + + for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", + ]: + sequences.update({n: "suspect_error_email"}) + + # update_hardware_email + sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + + # broken_hardware_email + sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) + + # bad_dns_email + for n in [ + "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + ]: + sequences.update( { n : "bad_dns_email"}) - if config and not config.quiet: print "...downloading dmesg from %s" % node - dmesg = conn.get_dmesg() - child = fdpexpect.fdspawn(dmesg) + return sequences - sequence = [] - while True: + def getDiskSteps(self): steps = [ ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'), ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'), @@ -425,51 +528,19 @@ def reboot(hostname, config=None, forced_action=None): # SCSI error : <0 2 0 0> return code = 0x40001 # end_request: I/O error, dev sda, sector 572489600 ] - id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) - sequence.append(id) - - if id == "done": - break - - s = Set(sequence) - if config and not config.quiet: print "\tSET: ", s - - if len(s) > 1: - print "...Potential drive errors on %s" % node - if len(s) == 2 and 'floppyerror' in s: - print "...Should investigate. Continuing with node." - else: - print "...Should investigate. Skipping node." - # TODO: send message related to these errors. 
- args = {} - args['hostname'] = hostname - args['log'] = conn.get_dmesg().read() - - m = PersistMessage(hostname, mailtxt.baddisk[0] % args, - mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') - return False + return steps - print "...Downloading bm.log from %s" % node - log = conn.get_bootmanager_log() - child = fdpexpect.fdspawn(log) - - try: - if config.collect: return True - except: - pass + def getDiskSequence(self, steps, child): + sequence = [] + while True: + id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ])) + sequence.append(id) - time.sleep(1) - - if config and not config.quiet: print "...Scanning bm.log for errors" - action_id = "dbg" - sequence = [] - while True: + if id == "done": + break + return sequence + def getBootManagerStepPatterns(self): steps = [ ('bminit' , 'Initializing the BootManager.'), ('cfg' , 'Reading node configuration file.'), @@ -520,146 +591,117 @@ def reboot(hostname, config=None, forced_action=None): ('bootcheckfail' , 'BootCheckAuthentication'), ('bootupdatefail' , 'BootUpdateNode'), ] - list = steps_to_list(steps) - index = child.expect( list + [ pexpect.EOF ]) - id = index_to_id(steps,index) - sequence.append(id) - - if id == "exception": - if config and not config.quiet: print "...Found An Exception!!!" - elif index == len(list): - #print "Reached EOF" - break + return steps + + def getBootManagerSequenceFromLog(self, steps, child): + sequence = [] + while True: + + index = child.expect( steps_to_list(steps) + [ pexpect.EOF ]) + id = index_to_id(steps,index) + sequence.append(id) + + if id == "exception": + print "...Found An Exception!!!" + elif id == "done": #index == len(steps_to_list(steps)): + #print "Reached EOF" + break + + return sequence - s = "-".join(sequence) - print " FOUND SEQUENCE: ", s - # NOTE: We get or set the flag based on the current sequence identifier. - # By using the sequence identifier, we guarantee that there will be no - # frequent loops. I'm guessing there is a better way to track loops, - # though. - #if not config.force and pflags.getRecentFlag(s): - # pflags.setRecentFlag(s) - # pflags.save() - # print "... flag is set or it has already run recently. Skipping %s" % node +def restore(sitehist, hostname, config=None, forced_action=None): + + # NOTE: Nothing works if the bootcd is REALLY old. + # So, this is the first step. + + fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict() + recent_actions = sitehist.getRecentActions(hostname=hostname) + + if fbnode['observed_category'] == "OLDBOOTCD": + print "\t...Notify owner to update BootImage!!!" + + if not found_within(recent_actions, 'newbootcd_notice', 3): + sitehist.sendMessage('newbootcd_notice', hostname=hostname) + + print "\tDisabling %s due to out-of-date BootImage" % hostname + api.UpdateNode(hostname, {'boot_state' : 'disable'}) + + # NOTE: nothing else is possible. + return True + + debugnode = DebugInterface(hostname) + conn = debugnode.getConnection() + #print "conn: %s" % conn + #print "trying to use conn after returning it." + #print conn.c.modules.sys.path + #print conn.c.modules.os.path.exists('/tmp/source') + if type(conn) == type(False): return False + + #if forced_action == "reboot": + # conn.restart_node('rins') # return True - sequences = {} + boot_state = conn.get_boot_state() + if boot_state != "debug": + print "... %s in %s state: skipping..." 
% (hostname , boot_state) + return boot_state == "boot" + if conn.bootmanager_running(): + print "...BootManager is currently running. Skipping host %s" %hostname + return True - # restart_bootmanager_boot - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done", + # Read persistent flags, tagged on one week intervals. - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done", + if config and not config.quiet: print "...downloading dmesg from %s" %hostname + dmesg = conn.get_dmesg() + child = fdpexpect.fdspawn(dmesg) - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-debug-done", - "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", - "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done", - "bminit-cfg-auth-protoerror-exception-update-debug-done", - "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", - "bminit-cfg-auth-getplc-implementerror-update-debug-done", - ]: - sequences.update({n : "restart_bootmanager_boot"}) - - # conn.restart_bootmanager('rins') - for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - 
"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", - # actual solution appears to involve removing the bad files, and - # continually trying to boot the node. - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", - ]: - sequences.update({n : "restart_bootmanager_rins"}) - - # repair_node_keys - sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) - - # conn.restart_node('rins') - for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - ]: - sequences.update({n : "restart_node_rins"}) - - # restart_node_boot - for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done", - "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done", - "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", - "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", - "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", - ]: - sequences.update({n: "restart_node_boot"}) - - # update_node_config_email - for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", - ]: - sequences.update({n : "update_node_config_email"}) + steps = debugnode.getDiskSteps() + sequence = debugnode.getDiskSequence(steps, child) - for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", - "bminit-cfg-update-exception-nodehostname-update-debug-done", - ]: - sequences.update({n : "nodenetwork_email"}) - - # update_bootcd_email 
- for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done", - "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done", - ]: - sequences.update({n : "update_bootcd_email"}) + s = Set(sequence) + if config and not config.quiet: print "\tSET: ", s - for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", - ]: - sequences.update({n: "suspect_error_email"}) + if len(s) > 1: + print "...Potential drive errors on %s" % hostname + if len(s) == 2 and 'floppyerror' in s: + print "...Should investigate. Continuing with node." + else: + print "...Should investigate. Skipping node." + # TODO: send message related to these errors. - # update_hardware_email - sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) - sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"}) + if not found_within(recent_actions, 'newbootcd_notice', 3): - # broken_hardware_email - sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) + log=conn.get_dmesg().read() + sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log) + conn.set_nodestate('disable') - # bad_dns_email - for n in [ - "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", - ]: - sequences.update( { n : "bad_dns_email"}) + return False - flag_set = True + print "...Downloading bm.log from %s" %hostname + log = conn.get_bootmanager_log() + child = fdpexpect.fdspawn(log) + + if hasattr(config, 'collect') and config.collect: return True + + if config and not config.quiet: print "...Scanning bm.log for errors" + + time.sleep(1) + + steps = debugnode.getBootManagerStepPatterns() + sequence = debugnode.getBootManagerSequenceFromLog(steps, child) + + s = "-".join(sequence) + print " FOUND SEQUENCE: ", s + # NOTE: We get or set the flag based on the current sequence identifier. + # By using the sequence identifier, we guarantee that there will be no + # frequent loops. I'm guessing there is a better way to track loops, + # though. 
+ + sequences = debugnode.getSequences() + flag_set = True if s not in sequences: print " HOST %s" % hostname @@ -669,10 +711,9 @@ def reboot(hostname, config=None, forced_action=None): args['hostname'] = hostname args['sequence'] = s args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args, - mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages') - m.reset() - m.send([config.cc_email]) + args['viart'] = False + + sitehist.sendMessage('unknownsequence_notice', **args) conn.restart_bootmanager('boot') @@ -683,10 +724,10 @@ def reboot(hostname, config=None, forced_action=None): else: if sequences[s] == "restart_bootmanager_boot": - if config and not config.quiet: print "...Restarting BootManager.py on %s "% node + print "...Restarting BootManager.py on %s "%hostname conn.restart_bootmanager('boot') elif sequences[s] == "restart_bootmanager_rins": - if config and not config.quiet: print "...Restarting BootManager.py on %s "% node + print "...Restarting BootManager.py on %s "%hostname conn.restart_bootmanager('rins') elif sequences[s] == "restart_node_rins": conn.restart_node('rins') @@ -700,119 +741,89 @@ def reboot(hostname, config=None, forced_action=None): pass else: # there was some failure to synchronize the keys. - print "...Unable to repair node keys on %s" % node + print "...Unable to repair node keys on %s" %hostname elif sequences[s] == "suspect_error_email": args = {} args['hostname'] = hostname args['sequence'] = s args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args, - mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages') - m.reset() - m.send([config.cc_email]) + args['viart'] = False + sitehist.sendMessage('unknownsequence_notice', **args) conn.restart_bootmanager('boot') + # TODO: differentiate this and the 'nodenetwork_email' actions. elif sequences[s] == "update_node_config_email": - print "...Sending message to UPDATE NODE CONFIG" - args = {} - args['hostname'] = hostname - m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, - True, db='nodeid_persistmessages') - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.dump_plconf_file() - conn.set_nodestate('disable') + + if not found_within(recent_actions, 'nodeconfig_notice', 3): + args = {} + args['hostname'] = hostname + sitehist.sendMessage('nodeconfig_notice', **args) + conn.dump_plconf_file() elif sequences[s] == "nodenetwork_email": - print "...Sending message to LOOK AT NODE NETWORK" - args = {} - args['hostname'] = hostname - args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args, - True, db='nodenet_persistmessages') - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.dump_plconf_file() - conn.set_nodestate('disable') - elif sequences[s] == "update_bootcd_email": - print "...NOTIFY OWNER TO UPDATE BOOTCD!!!" 
- import getconf - args = {} - args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user: - args['hostname_list'] = "%s" % hostname + if not found_within(recent_actions, 'nodeconfig_notice', 3): + args = {} + args['hostname'] = hostname + args['bmlog'] = conn.get_bootmanager_log().read() + sitehist.sendMessage('nodeconfig_notice', **args) + conn.dump_plconf_file() - m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname, - mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages') + elif sequences[s] == "update_bootcd_email": - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) + if not found_within(recent_actions, 'newalphacd_notice', 3): + args = {} + args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user: + args['hostname'] = hostname + + sitehist.sendMessage('newalphacd_notice', **args) - print "\tDisabling %s due to out-of-date BOOTCD" % hostname - conn.set_nodestate('disable') + print "\tDisabling %s due to out-of-date BOOTCD" % hostname elif sequences[s] == "broken_hardware_email": # MAKE An ACTION record that this host has failed hardware. May # require either an exception "/minhw" or other manual intervention. # Definitely need to send out some more EMAIL. - print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname # TODO: email notice of broken hardware - args = {} - args['hostname'] = hostname - args['log'] = conn.get_dmesg().read() - m = PersistMessage(hostname, mailtxt.baddisk[0] % args, - mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') + if not found_within(recent_actions, 'baddisk_notice', 1): + print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname + args = {} + args['hostname'] = hostname + args['log'] = conn.get_dmesg().read() - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') + sitehist.sendMessage('baddisk_notice', **args) + conn.set_nodestate('disable') elif sequences[s] == "update_hardware_email": - print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname - args = {} - args['hostname'] = hostname - args['bmlog'] = conn.get_bootmanager_log().read() - m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args, - mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') + if not found_within(recent_actions, 'minimalhardware_notice', 1): + print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname + args = {} + args['hostname'] = hostname + args['bmlog'] = conn.get_bootmanager_log().read() + sitehist.sendMessage('minimalhardware_notice', **args) elif sequences[s] == "bad_dns_email": - print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname - args = {} - try: - node = api.GetNodes(hostname)[0] - net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] - except: - print traceback.print_exc() - # TODO: api error. skip email, b/c all info is not available, - # flag_set will not be recorded. - return False - nodenet_str = network_config_to_str(net) + if not found_within(recent_actions, 'baddns_notice', 1): + print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname + args = {} + try: + node = plccache.GetNodeByName(hostname) + net = api.GetNodeNetworks(node['nodenetwork_ids'])[0] + except: + email_exception() + print traceback.print_exc() + # TODO: api error. 
skip email, b/c all info is not available, + # flag_set will not be recorded. + return False + nodenet_str = network_config_to_str(net) - args['hostname'] = hostname - args['network_config'] = nodenet_str - args['nodenetwork_id'] = net['nodenetwork_id'] - m = PersistMessage(hostname, mailtxt.baddns[0] % args, - mailtxt.baddns[1] % args, True, db='baddns_persistmessages') - - loginbase = plc.siteId(hostname) - emails = plc.getTechEmails(loginbase) - m.send(emails) - conn.set_nodestate('disable') - - if flag_set: - pflags.setRecentFlag(s) - pflags.save() + args['hostname'] = hostname + args['network_config'] = nodenet_str + args['nodenetwork_id'] = net['nodenetwork_id'] + + sitehist.sendMessage('baddns_notice', **args) return True diff --git a/findall.py b/findall.py index 8be5b27..64c4987 100755 --- a/findall.py +++ b/findall.py @@ -4,6 +4,9 @@ from monitor import parser as parsermodule from findbad import main as findbad_main from findbadpcu import main as findbadpcu_main from sitebad import main as sitebad_main +from nodebad import main as nodebad_main +from pcubad import main as pcubad_main +from monitor.wrapper import plccache import sys if __name__ == '__main__': @@ -11,7 +14,7 @@ if __name__ == '__main__': parser = parsermodule.getParser(['nodesets']) parser.set_defaults( increment=False, dbname="findbad", cachenodes=False, - force=False,) + force=False, pcuselect=None, pcuid=None, pcu=None) parser.add_option("", "--cachenodes", action="store_true", help="Cache node lookup from PLC") parser.add_option("", "--dbname", dest="dbname", metavar="FILE", @@ -26,8 +29,17 @@ if __name__ == '__main__': cfg = parsermodule.parse_args(parser) try: + print "sync with plc" + plccache.sync() + print "findbad" findbad_main() + print "findbadpcu" findbadpcu_main() + print "nodebad" + nodebad_main() + print "pcubad" + pcubad_main() + print "sitebad" sitebad_main() except Exception, err: import traceback diff --git a/findbad.py b/findbad.py index 7bb31a0..7ae4b13 100755 --- a/findbad.py +++ b/findbad.py @@ -9,10 +9,10 @@ import threadpool import threading from monitor.util import file -from monitor.util import command +from pcucontrol.util import command from monitor import config -from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session +from monitor.database.info.model import FindbadNodeRecord, session from monitor.sources import comon from monitor.wrapper import plc, plccache @@ -53,9 +53,10 @@ def checkAndRecordState(l_nodes, cohash): # CREATE all the work requests for nodename in l_nodes: - fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0}) - node_round = fbnodesync.round - fbnodesync.flush() + #fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0}) + #node_round = fbnodesync.round + node_round = global_round - 1 + #fbnodesync.flush() if node_round < global_round or config.force: # recreate node stats when refreshed @@ -86,16 +87,16 @@ def checkAndRecordState(l_nodes, cohash): print "All results collected." 
break - print FindbadNodeRecordSync.query.count() + #print FindbadNodeRecordSync.query.count() print FindbadNodeRecord.query.count() session.flush() def main(): global global_round - fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - if_new_set={'round' : global_round}) - global_round = fbsync.round + #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", + # if_new_set={'round' : global_round}) + #global_round = fbsync.round if config.increment: # update global round number to force refreshes across all nodes @@ -118,24 +119,24 @@ def main(): l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes) elif config.nodegroup: ng = api.GetNodeGroups({'name' : config.nodegroup}) - l_nodes = api.GetNodes(ng[0]['node_ids']) + l_nodes = plccache.GetNodesByIds(ng[0]['node_ids']) elif config.site: - site = api.GetSites(config.site) - l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname']) + site = plccache.GetSitesByName([config.site]) + l_nodes = plccache.GetNodesByIds(site[0]['node_ids']) elif config.sitelist: site_list = config.sitelist.split(',') - sites = api.GetSites(site_list) + sites = plccache.GetSitesByName(site_list) node_ids = [] for s in sites: node_ids += s['node_ids'] - l_nodes = api.GetNodes(node_ids, ['hostname']) + l_nodes = plccache.GetNodesByIds(node_ids) l_nodes = [node['hostname'] for node in l_nodes] # perform this query after the above options, so that the filter above # does not break. if config.nodeselect: - plcnodes = api.GetNodes({'peer_id' : None}, ['hostname']) + plcnodes = plccache.l_nodes plcnodes = [ node['hostname'] for node in plcnodes ] l_nodes = node_select(config.nodeselect, plcnodes, None) @@ -145,8 +146,9 @@ def main(): if config.increment: # update global round number to force refreshes across all nodes - fbsync.round = global_round - fbsync.flush() + #fbsync.round = global_round + #fbsync.flush() + pass return 0 @@ -175,6 +177,8 @@ if __name__ == '__main__': main() except Exception, err: print traceback.print_exc() + from monitor.common import email_exception + email_exception() print "Exception: %s" % err print "Saving data... exitting." sys.exit(0) diff --git a/findbadpcu.py b/findbadpcu.py index 815a77e..ab4f5ff 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -13,9 +13,8 @@ import threadpool import threading import monitor -from pcucontrol import reboot from monitor import config -from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session +from monitor.database.info.model import FindbadPCURecord, session from monitor import database from monitor import util from monitor.wrapper import plc, plccache @@ -44,10 +43,11 @@ def checkPCUs(l_pcus, cohash): # CREATE all the work requests for pcuname in l_pcus: pcu_id = int(pcuname) - fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0}) - fbnodesync.flush() + #fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0}) + #fbnodesync.flush() - node_round = fbnodesync.round + #node_round = fbnodesync.round + node_round = global_round - 1 if node_round < global_round or config.force: # recreate node stats when refreshed #print "%s" % nodename @@ -76,7 +76,7 @@ def checkPCUs(l_pcus, cohash): print "All results collected." 
break - print FindbadPCURecordSync.query.count() + #print FindbadPCURecordSync.query.count() print FindbadPCURecord.query.count() session.flush() @@ -87,29 +87,38 @@ def main(): l_pcus = plccache.l_pcus cohash = {} - fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, - if_new_set={'round' : global_round}) + #fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, + #if_new_set={'round' : global_round}) - global_round = fbsync.round + #global_round = fbsync.round api = plc.getAuthAPI() if config.site is not None: - site = api.GetSites(config.site) - l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids']) + site = plccache.GetSitesByName([config.site]) + l_nodes = plccache.GetNodesByIds(site[0]['node_ids']) pcus = [] for node in l_nodes: pcus += node['pcu_ids'] # clear out dups. l_pcus = [pcu for pcu in sets.Set(pcus)] + + elif config.node is not None: + l_nodes = plcacche.GetNodeByName(config.node) + pcus = [] + for node in l_nodes: + pcus += node['pcu_ids'] + # clear out dups. + l_pcus = [pcu for pcu in sets.Set(pcus)] + elif config.sitelist: site_list = config.sitelist.split(',') - sites = api.GetSites(site_list) + sites = plccache.GetSitesByName(site_list) node_ids = [] for s in sites: node_ids += s['node_ids'] - l_nodes = api.GetNodes(node_ids, ['pcu_ids']) + l_nodes = plccache.GetNodeByIds(node_ids) pcus = [] for node in l_nodes: pcus += node['pcu_ids'] @@ -140,8 +149,8 @@ def main(): if config.increment: # update global round number to force refreshes across all nodes - fbsync.round = global_round - fbsync.flush() + #fbsync.round = global_round + #fbsync.flush() session.flush() return 0 @@ -164,6 +173,8 @@ if __name__ == '__main__': pcuid=None, pcuselect=None, site=None, + node=None, + sitelist=None, dbname="findbadpcus", cachenodes=False, cachecalls=True, @@ -171,8 +182,12 @@ if __name__ == '__main__': ) parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE", help="Provide the input file for the node list") + parser.add_option("", "--node", dest="node", metavar="FILE", + help="Get all pcus associated with the given node") parser.add_option("", "--site", dest="site", metavar="FILE", help="Get all pcus associated with the given site's nodes") + parser.add_option("", "--sitelist", dest="sitelist", metavar="FILE", + help="Get all pcus associated with the given site's nodes") parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE", help="Query string to apply to the findbad pcus") parser.add_option("", "--pcuid", dest="pcuid", metavar="id", @@ -203,6 +218,8 @@ if __name__ == '__main__': time.sleep(1) except Exception, err: traceback.print_exc() + from monitor.common import email_exception + email_exception() print "Exception: %s" % err print "Saving data... exitting." 
sys.exit(0) diff --git a/get_metasite_nodes.py b/get_metasite_nodes.py index 7fb46ef..e2d5764 100755 --- a/get_metasite_nodes.py +++ b/get_metasite_nodes.py @@ -7,7 +7,6 @@ import sys def main(): meta_sites = ['canarie', 'rnp', 'jgn2', 'i2', 'tp', 'princeton', 'princetondsl', 'plcolo', 'wide'] l_blacklist = [ "grouse.hpl.hp.com", "planet1.att.nodes.planet-lab.org"] - #l_blacklist = database.dbLoad("l_blacklist") l_sitelist = [] count = 0 # for each prefix above @@ -33,7 +32,6 @@ def main(): print "Found %d nodes" % count print "Found %d sites " % len(l_sitelist) - database.dbDump("l_blacklist") if __name__=="__main__": main() diff --git a/grouprins.py b/grouprins.py deleted file mode 100755 index ed6149d..0000000 --- a/grouprins.py +++ /dev/null @@ -1,379 +0,0 @@ -#!/usr/bin/python - -# This script is used to manipulate the operational state of nodes in -# different node groups. These are basically set operations on nodes via the -# PLC api. -# -# Take the ng name as an argument.... -# optionally, -# * get a list of nodes in the given nodegroup. -# * set some or all in the set to rins. -# * restart them all. -# * do something else to them all. -# - -from monitor import config -from monitor import util -from monitor import const -from monitor import database -from monitor import parser as parsermodule -from pcucontrol import reboot -from monitor.wrapper import plc -api = plc.getAuthAPI() - -import traceback -from optparse import OptionParser - -from monitor.common import * -from nodequery import verify,query_to_dict,node_select -from monitor.model import * -import os - -import time - -import bootman # debug nodes -import mailmonitor # down nodes without pcu -from monitor.wrapper.emailTxt import mailtxt -import sys - -class Reboot(object): - def __init__(self, fbnode): - self.fbnode = fbnode - - def _send_pcunotice(self, host): - args = {} - args['hostname'] = host - try: - args['pcu_id'] = plc.getpcu(host)['pcu_id'] - except: - args['pcu_id'] = host - - m = PersistMessage(host, mailtxt.pcudown_one[0] % args, - mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages') - - loginbase = plc.siteId(host) - m.send([const.TECHEMAIL % loginbase]) - - def pcu(self, host): - # TODO: It should be possible to diagnose the various conditions of - # the PCU here, and send different messages as appropriate. - print "'%s'" % self.fbnode['pcu'] - if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']: - self.action = "reboot.reboot('%s')" % host - - pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags') - #pflags.resetRecentFlag('pcutried') - if not pflags.getRecentFlag('pcutried'): - try: - print "CALLING REBOOT!!!" - ret = reboot.reboot(host) - - pflags.setRecentFlag('pcutried') - pflags.save() - return ret - - except Exception,e: - print traceback.print_exc(); print e - - # NOTE: this failure could be an implementation issue on - # our end. So, extra notices are confusing... - # self._send_pcunotice(host) - - pflags.setRecentFlag('pcufailed') - pflags.save() - return False - - elif not pflags.getRecentFlag('pcu_rins_tried'): - try: - # set node to 'rins' boot state. - print "CALLING REBOOT +++ RINS" - plc.nodeBootState(host, 'rins') - ret = reboot.reboot(host) - - pflags.setRecentFlag('pcu_rins_tried') - pflags.save() - return ret - - except Exception,e: - print traceback.print_exc(); print e - - # NOTE: this failure could be an implementation issue on - # our end. So, extra notices are confusing... 
- # self._send_pcunotice(host) - - pflags.setRecentFlag('pcufailed') - pflags.save() - return False - else: - # we've tried the pcu recently, but it didn't work, - # so did we send a message about it recently? - if not pflags.getRecentFlag('pcumessagesent'): - - self._send_pcunotice(host) - - pflags.setRecentFlag('pcumessagesent') - pflags.save() - - # This will result in mail() being called next, to try to - # engage the technical contact to take care of it also. - print "RETURNING FALSE" - return False - - else: - print "NO PCUOK" - self.action = "None" - return False - - def mail(self, host): - - # Reset every 4 weeks or so - pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags') - if not pflags.getRecentFlag('endrecord'): - node_end_record(host) - pflags.setRecentFlag('endrecord') - pflags.save() - - # Then in either case, run mailmonitor.reboot() - self.action = "mailmonitor.reboot('%s')" % host - try: - return mailmonitor.reboot(host) - except Exception, e: - print traceback.print_exc(); print e - return False - -class RebootDebug(Reboot): - - def direct(self, host): - self.action = "bootman.reboot('%s', config, None)" % host - return bootman.reboot(host, config, None) - -class RebootBoot(Reboot): - - def direct(self, host): - self.action = "bootman.reboot('%s', config, 'reboot')" % host - return bootman.reboot(host, config, 'reboot') - -class RebootDown(Reboot): - - def direct(self, host): - self.action = "None" - return False # this always fails, since the node will be down. - -def set_node_to_rins(host, fb): - - node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created']) - record = {'observation' : node[0], - 'model' : 'USER_REQUEST', - 'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host, - 'time' : time.time()} - l = Log(host, record) - - ret = api.UpdateNode(host, {'boot_state' : 'rins'}) - if ret: - # it's nice to see the current status rather than the previous status on the console - node = api.GetNodes(host)[0] - print l - print "%-2d" % (i-1), nodegroup_display(node, fb) - return l - else: - print "FAILED TO UPDATE NODE BOOT STATE : %s" % host - return None - - -try: - rebootlog = database.dbLoad("rebootlog") -except: - rebootlog = LogRoll() - -parser = parsermodule.getParser(['nodesets']) -parser.set_defaults( timewait=0, - skip=0, - rins=False, - reboot=False, - findbad=False, - force=False, - nosetup=False, - verbose=False, - quiet=False, - ) - -parser.add_option("", "--stopselect", dest="stopselect", metavar="", - help="The select string that must evaluate to true for the node to be considered 'done'") -parser.add_option("", "--findbad", dest="findbad", action="store_true", - help="Re-run findbad on the nodes we're going to check before acting.") -parser.add_option("", "--force", dest="force", action="store_true", - help="Force action regardless of previous actions/logs.") -parser.add_option("", "--rins", dest="rins", action="store_true", - help="Set the boot_state to 'rins' for all nodes.") -parser.add_option("", "--reboot", dest="reboot", action="store_true", - help="Actively try to reboot the nodes, keeping a log of actions.") - -parser.add_option("", "--verbose", dest="verbose", action="store_true", - help="Extra debug output messages.") -parser.add_option("", "--nosetup", dest="nosetup", action="store_true", - help="Do not perform the orginary setup phase.") -parser.add_option("", "--skip", dest="skip", - help="Number of machines to skip on the input queue.") -parser.add_option("", "--timewait", dest="timewait", - 
help="Minutes to wait between iterations of 10 nodes.") - -parser = parsermodule.getParser(['defaults'], parser) -config = parsermodule.parse_args(parser) - -# COLLECT nodegroups, nodes and node lists -if config.nodegroup: - ng = api.GetNodeGroups({'name' : config.nodegroup}) - nodelist = api.GetNodes(ng[0]['node_ids']) - hostnames = [ n['hostname'] for n in nodelist ] - -if config.site: - site = api.GetSites(config.site) - l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname']) - hostnames = [ n['hostname'] for n in l_nodes ] - -if config.node or config.nodelist: - if config.node: hostnames = [ config.node ] - else: hostnames = util.file.getListFromFile(config.nodelist) - -fbquery = FindbadNodeRecord.get_all_latest() -fb_nodelist = [ n.hostname for n in fbquery ] - -if config.nodeselect: - hostnames = node_select(config.nodeselect, fb_nodelist) - -if config.findbad: - # rerun findbad with the nodes in the given nodes. - file = "findbad.txt" - util.file.setFileFromList(file, hostnames) - os.system("./findbad.py --cachenodes --increment --nodelist %s" % file) - # TODO: shouldn't we reload the node list now? - -l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) -# commands: -i = 1 -count = 1 -#print "hosts: %s" % hostnames -for host in hostnames: - - #if 'echo' in host or 'hptest-1' in host: continue - - try: - try: - node = api.GetNodes(host)[0] - except: - print traceback.print_exc(); - print "FAILED GETNODES for host: %s" % host - continue - - print "%-2d" % i, nodegroup_display(node, fb) - i += 1 - if i-1 <= int(config.skip): continue - if host in l_blacklist: - print "%s is blacklisted. Skipping." % host - continue - - if config.stopselect: - dict_query = query_to_dict(config.stopselect) - fbnode = fb['nodes'][host]['values'] - observed_state = get_current_state(fbnode) - - if verify(dict_query, fbnode) and observed_state != "dbg ": - # evaluates to true, therefore skip. - print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host ) - try: - # todo: clean up act_all record here. - # todo: send thank you, etc. - mailmonitor.reboot(host) - except Exception, e: - print traceback.print_exc(); print e - - continue - #else: - #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state ) - #sys.exit(1) - - if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2): - print "recently rebooted %s. skipping... 
" % host - continue - - if config.reboot: - - fbnode = fb['nodes'][host]['values'] - observed_state = get_current_state(fbnode) - - if observed_state == "dbg ": - o = RebootDebug(fbnode) - - elif observed_state == "boot" : - if config.rins: - l = set_node_to_rins(host, fb) - if l: rebootlog.add(l) - - o = RebootBoot(fbnode) - - elif observed_state == "down": - if config.rins: - l = set_node_to_rins(host, fb) - if l: rebootlog.add(l) - - o = RebootDown(fbnode) - - - if o.direct(host): - record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state, - 'action' : o.action, - 'model' : "none", - 'time' : time.time()} - elif o.pcu(host): - record = {'observation' : "PCU_SUCCESS: %s" % observed_state, - 'action' : o.action, - 'model' : "none", - 'time' : time.time()} - elif o.mail(host): - record = {'observation' : "MAIL_SUCCESS: %s" % observed_state, - 'action' : o.action, - 'model' : "none", - 'time' : time.time()} - else: - record = {'observation' : "REBOOT_FAILED: %s" % observed_state, - 'action' : "log failure", - 'model' : "none", - 'time' : time.time()} - - print "ALL METHODS OF RESTARTING %s FAILED" % host - args = {} - args['hostname'] = host - #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args, - # "CANNOT CONTACT", False, db='suspect_persistmessages') - #m.reset() - #m.send(['monitor-list@lists.planet-lab.org']) - - l = Log(host, record) - print l - rebootlog.add(l) - except KeyboardInterrupt: - print "Killed by interrupt" - sys.exit(0) - except: - print traceback.print_exc(); - print "Continuing..." - - time.sleep(1) - if count % 10 == 0: - print "Saving rebootlog" - database.dbDump("rebootlog", rebootlog) - wait_time = int(config.timewait) - print "Sleeping %d minutes" % wait_time - ti = 0 - print "Minutes slept: ", - sys.stdout.flush() - while ti < wait_time: - print "%s" % ti, - sys.stdout.flush() - time.sleep(60) - ti = ti+1 - - count = count + 1 - -print "Saving rebootlog" -database.dbDump("rebootlog", rebootlog) diff --git a/mailmonitor.py b/mailmonitor.py index 8af368a..fab3e65 100644 --- a/mailmonitor.py +++ b/mailmonitor.py @@ -12,6 +12,7 @@ from monitor import database from monitor.wrapper import rt from monitor.wrapper import plc from monitor.policy import * +from monitor.database.info.model import * api = plc.getAuthAPI() @@ -22,9 +23,9 @@ def reboot(hostname): if len(l_nodes) == 0: raise Exception("No such host: %s" % hostname) - l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) - l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : []) + q_blacklist = BlacklistRecord.query.all() + l_blacklist = [ n.hostname for n in q_blacklist ] l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes) if len(l_nodes) == 0: raise Exception("Host removed via blacklist: %s" % hostname) diff --git a/monitor/common.py b/monitor/common.py index 051cd61..d082dbb 100644 --- a/monitor/common.py +++ b/monitor/common.py @@ -1,14 +1,14 @@ import time import struct -from pcucontrol import reboot - +from monitor import reboot from monitor import util from monitor import database from monitor.wrapper import plc, plccache -from datetime import datetime -from monitor.model import PersistFlags +from datetime import datetime, timedelta +from monitor.model import Message +from monitor.database.info import HistoryNodeRecord esc = struct.pack('i', 27) RED = esc + "[1;31m" @@ -86,6 +86,8 @@ def diff_time(timestamp, abstime=True): now = time.time() if timestamp == None: return "unknown" + if type(timestamp) == type(datetime.now()): 
+ timestamp = time.mktime(timestamp.timetuple()) if abstime: diff = now - timestamp else: @@ -154,7 +156,7 @@ def nodegroup_display(node, fbdata, conf=None): node['pcu'] = "PCU" node['lastupdate'] = diff_time(node['last_contact']) - pf = PersistFlags(node['hostname'], 1, db='node_persistflags') + pf = HistoryNodeRecord.get_by(hostname=node['hostname']) try: node['lc'] = diff_time(pf.last_changed) except: @@ -211,4 +213,54 @@ def get_nodeset(config): l_nodes = node_select(config.nodeselect, node_list, None) return l_nodes + +def email_exception(content=None): + import config + from monitor.model import Message + import traceback + msg=traceback.format_exc() + if content: + msg = content + "\n" + msg + m=Message("exception running monitor", msg, False) + m.send([config.cc_email]) + return + +def changed_lessthan(last_changed, days): + if datetime.now() - last_changed <= timedelta(days): + #print "last changed less than %s" % timedelta(days) + return True + else: + #print "last changed more than %s" % timedelta(days) + return False + +def changed_greaterthan(last_changed, days): + if datetime.now() - last_changed > timedelta(days): + #print "last changed more than %s" % timedelta(days) + return True + else: + #print "last changed less than %s" % timedelta(days) + return False + +def found_between(recent_actions, action_type, lower, upper): + return found_before(recent_actions, action_type, upper) and found_within(recent_actions, action_type, lower) + +def found_before(recent_actions, action_type, within): + for action in recent_actions: + if action_type == action.action_type and \ + action.date_created < (datetime.now() - timedelta(within)): + return True + return False + +def found_within(recent_actions, action_type, within): + for action in recent_actions: + #print "%s - %s %s > %s - %s (%s) ==> %s" % (action.loginbase, action.action_type, action.date_created, datetime.now(), timedelta(within), datetime.now()-timedelta(within), action.date_created > (datetime.now() - timedelta(within)) ) + if action_type == action.action_type and \ + action.date_created > (datetime.now() - timedelta(within)): + #datetime.now() - action.date_created < timedelta(within): + # recent action of given type. 
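# changed_lessthan()/changed_greaterthan() above treat their "days" argument as
# a timedelta of whole days, and fractional values work as well (0.5 is twelve
# hours), which the nodebad state transitions later in this patch rely on.
# A self-contained check of that behaviour, mirroring the helper for illustration:
from datetime import datetime, timedelta

def changed_greaterthan(last_changed, days):
    return datetime.now() - last_changed > timedelta(days)

assert changed_greaterthan(datetime.now() - timedelta(days=1), 0.5)
assert not changed_greaterthan(datetime.now(), 2)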
+ #print "%s found_within %s in recent_actions from %s" % (action_type, timedelta(within), action.date_created) + return True + + print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within) ) + return False diff --git a/monitor/database/info/__init__.py b/monitor/database/info/__init__.py index 9c3df82..03a1b74 100644 --- a/monitor/database/info/__init__.py +++ b/monitor/database/info/__init__.py @@ -44,4 +44,5 @@ Entity.findby_or_create = classmethod(findby_or_create) from monitor.database.info.action import * from monitor.database.info.findbad import * from monitor.database.info.history import * +from monitor.database.info.plc import * setup_all() diff --git a/monitor/database/info/action.py b/monitor/database/info/action.py index 2569e35..0abec62 100644 --- a/monitor/database/info/action.py +++ b/monitor/database/info/action.py @@ -1,6 +1,7 @@ from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany from elixir import options_defaults, using_options, setup_all, has_one from elixir import String, Integer, DateTime, PickleType, Boolean +from elixir.ext.versioned import * from datetime import datetime,timedelta import elixir import traceback @@ -38,6 +39,43 @@ __session__ = mon_session # issue_type = ManyToMany('IssueType') # actions = OneToMany('ActionRecord', order_by='-date_created') +class BlacklistRecord(Entity): + date_created = Field(DateTime,default=datetime.now) + hostname = Field(String,default=None) + loginbase = Field(String,default=None) + expires = Field(Integer,default=0) # seconds plus + acts_as_versioned(['hostname']) + + @classmethod + def getLoginbaseBlacklist(cls): + # TODO: need to sort on 'round' since actions will not be globally sync'd. + return cls.query.filter(cls.loginbase!=None).order_by(cls.loginbase.desc()) + + @classmethod + def getHostnameBlacklist(cls): + # TODO: need to sort on 'round' since actions will not be globally sync'd. + return cls.query.filter(cls.hostname!=None).order_by(cls.hostname.desc()) + + def neverExpires(self): + if self.expires == 0: + return True + else: + return False + + def expired(self): + if self.neverExpires(): + return False + else: + if self.date_created + timedelta(0,self.expires) > datetime.now(): + return True + else: + return False + + def willExpire(self): + if self.neverExpires(): + return "never" + else: + return self.date_created + timedelta(0, self.expires) class ActionRecord(Entity): @classmethod @@ -47,8 +85,27 @@ class ActionRecord(Entity): # ACCOUNTING date_created = Field(DateTime,default=datetime.now) + loginbase = Field(String,default=None) hostname = Field(String,default=None) - loginbase = Field(String) + # NOTE: + # the expected kinds of actions are: + # * reboot node + # * open ticket, send notice + # * close ticket + # * apply penalty to site + # * backoff penalty to site + action = Field(String) + + # NOTE: describes the kind of action. i.e. online-notice, offline-notice, + # reboot-first-try, reboot-second-try, penalty-pause, penalty-warning, penalty-no-create, + # penalty-disable-slices, + action_type = Field(String, default=None) + + message_id = Field(Integer, default=0) + penalty_level = Field(Integer, default=0) + + # NOTE: in case an exception is thrown while trying to perform an action. + error_string = Field(String, default=None) #issue = ManyToOne('IssueRecord') # NOTE: this is the parent relation to fb records. 
first create the @@ -61,15 +118,15 @@ class ActionRecord(Entity): # OR # - find fbnode records # - create action record with fbnodes as argument - findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked') + # findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked') # NOTE: can I move 'message_index, escellation_level, and penalty_level' # into the same value? Maybe not penalty level, since there are only two; # and, there may be additional message and escellation levels. - send_email_to = Field(PickleType, default=None) - action_description = Field(PickleType, default=None) - message_arguments = Field(PickleType, default=None) + #send_email_to = Field(PickleType, default=None) + #action_description = Field(PickleType, default=None) + #message_arguments = Field(PickleType, default=None) # NOTE: not sure this needs to be in the db. - escellation_level = Field(Integer, default=0) - stage = Field(String, default=None) + #escellation_level = Field(Integer, default=0) + #stage = Field(String, default=None) diff --git a/monitor/database/info/findbad.py b/monitor/database/info/findbad.py index e58ef3a..a5139eb 100644 --- a/monitor/database/info/findbad.py +++ b/monitor/database/info/findbad.py @@ -4,54 +4,58 @@ from elixir import String, Integer as Int, DateTime, PickleType, Boolean from datetime import datetime,timedelta import elixir import traceback +from elixir.ext.versioned import * from monitor.database.dborm import mon_metadata, mon_session __metadata__ = mon_metadata __session__ = mon_session -class FindbadNodeRecordSync(Entity): - hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname') - round = Field(Int,default=0) +#class FindbadNodeRecordSync(Entity): +# hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname') +# round = Field(Int,default=0) -class FindbadPCURecordSync(Entity): - plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid') - round = Field(Int,default=0) +#class FindbadPCURecordSync(Entity): +# plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid') +# round = Field(Int,default=0) class FindbadNodeRecord(Entity): @classmethod def get_all_latest(cls): - fbsync = FindbadNodeRecordSync.get_by(hostname="global") - if fbsync: - return cls.query.filter_by(round=fbsync.round) - else: - return [] + return cls.query.all() + #fbsync = FindbadNodeRecordSync.get_by(hostname="global") + #if fbsync: + # return cls.query.filter_by(round=fbsync.round) + #else: + # return [] @classmethod def get_latest_by(cls, **kwargs): - fbsync = FindbadNodeRecordSync.get_by(hostname="global") - if fbsync: - kwargs['round'] = fbsync.round - return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc()) - else: - return [] + return cls.query.filter_by(**kwargs).first() + #fbsync = FindbadNodeRecordSync.get_by(hostname="global") + #if fbsync: + # kwargs['round'] = fbsync.round + # return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc()) + #else: + # return [] @classmethod def get_latest_n_by(cls, n=3, **kwargs): - fbsync = FindbadNodeRecordSync.get_by(hostname="global") - kwargs['round'] = fbsync.round - ret = [] - for i in range(0,n): - kwargs['round'] = kwargs['round'] - i - f = cls.query.filter_by(**kwargs).first() - if f: - ret.append(f) - return ret + return cls.query.filter_by(**kwargs) + #fbsync = FindbadNodeRecordSync.get_by(hostname="global") + #kwargs['round'] = fbsync.round + #ret = [] + #for i in range(0,n): + # 
kwargs['round'] = kwargs['round'] - i + # f = cls.query.filter_by(**kwargs).first() + # if f: + # ret.append(f) + #return ret # ACCOUNTING date_checked = Field(DateTime,default=datetime.now) round = Field(Int,default=0) - hostname = Field(String,default=None) + hostname = Field(String,primary_key=True,default=None) loginbase = Field(String) # INTERNAL @@ -79,23 +83,19 @@ class FindbadNodeRecord(Entity): observed_category = Field(String,default=None) observed_status = Field(String,default=None) + acts_as_versioned(ignore=['date_checked']) # NOTE: this is the child relation - action = ManyToOne('ActionRecord', required=False) + #action = ManyToOne('ActionRecord', required=False) class FindbadPCURecord(Entity): @classmethod def get_all_latest(cls): - fbsync = FindbadPCURecordSync.get_by(plc_pcuid=0) - if fbsync: - return cls.query.filter_by(round=fbsync.round) - else: - return [] + return cls.query.all() @classmethod def get_latest_by(cls, **kwargs): - fbsync = FindbadPCURecordSync.get_by(plc_pcuid=0) - kwargs['round'] = fbsync.round - return cls.query.filter_by(**kwargs).order_by(FindbadPCURecord.date_checked.desc()) + return cls.query.filter_by(**kwargs).first() + # ACCOUNTING date_checked = Field(DateTime) round = Field(Int,default=0) @@ -110,3 +110,5 @@ class FindbadPCURecord(Entity): # INTERNAL # INFERRED reboot_trial_status = Field(String) + + acts_as_versioned(ignore=['date_checked']) diff --git a/monitor/database/info/history.py b/monitor/database/info/history.py index dc53860..3c5842a 100644 --- a/monitor/database/info/history.py +++ b/monitor/database/info/history.py @@ -1,6 +1,8 @@ from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany from elixir import options_defaults, using_options, setup_all from elixir import String, Integer as Int, DateTime, Boolean +from elixir.ext.versioned import * + from datetime import datetime,timedelta from monitor.database.dborm import mon_metadata, mon_session @@ -13,6 +15,7 @@ class HistoryNodeRecord(Entity): last_checked = Field(DateTime,default=datetime.now) last_changed = Field(DateTime,default=datetime.now) status = Field(String,default="unknown") + acts_as_versioned(ignore=['last_changed', 'last_checked']) @classmethod def by_hostname(cls, hostname): @@ -28,10 +31,13 @@ class HistoryPCURecord(Entity): last_valid = Field(DateTime,default=None) valid = Field(String,default="unknown") + acts_as_versioned(ignore=['last_changed', 'last_checked']) + @classmethod def by_pcuid(cls, pcuid): return cls.query.filter_by(pcuid=pcuid).first() + class HistorySiteRecord(Entity): loginbase = Field(String(250),primary_key=True) @@ -50,6 +56,15 @@ class HistorySiteRecord(Entity): status = Field(String,default="unknown") + message_id = Field(Int, default=0) + message_status = Field(String, default=None) + message_queue = Field(String, default=None) + message_created = Field(DateTime, default=None) + + penalty_level = Field(Int, default=0) + penalty_applied = Field(Boolean, default=False) + acts_as_versioned(ignore=['last_changed', 'last_checked']) + @classmethod def by_loginbase(cls, loginbase): return cls.query.filter_by(loginbase=loginbase).first() diff --git a/monitor/database/info/interface.py b/monitor/database/info/interface.py new file mode 100644 index 0000000..2e5064d --- /dev/null +++ b/monitor/database/info/interface.py @@ -0,0 +1,198 @@ +import bootman # debug nodes + +from monitor import reboot +from monitor.common import * +from monitor.model import * +from monitor.wrapper import plc +from monitor.wrapper import plccache +from 
monitor.wrapper.emailTxt import mailtxt +from monitor.database.info.model import * + +class SiteInterface(HistorySiteRecord): + @classmethod + def get_or_make(cls, if_new_set={}, **kwargs): + if 'hostname' in kwargs: + kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs['hostname']] + del kwargs['hostname'] + res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs) + return SiteInterface(res) + + def __init__(self, sitehist): + self.db = sitehist + + def getRecentActions(self, **kwargs): + # TODO: make query only return records within a certin time range, + # i.e. greater than 0.5 days ago. or 5 days, etc. + + #print "kwargs: ", kwargs + + recent_actions = [] + if 'loginbase' in kwargs: + recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc()) + elif 'hostname' in kwargs: + recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc()) + return recent_actions + + def increasePenalty(self): + #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',) + self.db.penalty_level += 1 + # NOTE: this is to prevent overflow or index errors in applyPenalty. + # there's probably a better approach to this. + if self.db.penalty_level >= 2: + self.db.penalty_level = 2 + self.db.penalty_applied = True + + def applyPenalty(self): + penalty_map = [] + penalty_map.append( { 'name': 'noop', 'enable' : lambda site: None, + 'disable' : lambda site: None } ) + penalty_map.append( { 'name': 'nocreate', 'enable' : lambda site: plc.removeSiteSliceCreation(site), + 'disable' : lambda site: plc.enableSiteSliceCreation(site) } ) + penalty_map.append( { 'name': 'suspendslices', 'enable' : lambda site: plc.suspendSiteSlices(site), + 'disable' : lambda site: plc.enableSiteSlices(site) } ) + + for i in range(len(penalty_map)-1,self.db.penalty_level,-1): + print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase) + penalty_map[i]['disable'](self.db.loginbase) + + for i in range(0,self.db.penalty_level+1): + print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase) + penalty_map[i]['enable'](self.db.loginbase) + + return + + def pausePenalty(self): + act = ActionRecord(loginbase=self.db.loginbase, + action='penalty', + action_type='pause_penalty',) + + def clearPenalty(self): + #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='clear_penalty',) + self.db.penalty_level = 0 + self.db.penalty_applied = False + + def getTicketStatus(self): + if self.db.message_id != 0: + rtstatus = mailer.getTicketStatus(self.db.message_id) + self.db.message_status = rtstatus['Status'] + self.db.message_queue = rtstatus['Queue'] + self.db.message_created = datetime.fromtimestamp(rtstatus['Created']) + + def setTicketStatus(self, status): + print 'SETTING status %s' % status + if self.db.message_id != 0: + rtstatus = mailer.setTicketStatus(self.db.message_id, status) + + def getContacts(self): + contacts = [] + if self.db.penalty_level >= 0: + contacts += plc.getTechEmails(self.db.loginbase) + + if self.db.penalty_level >= 1: + contacts += plc.getPIEmails(self.db.loginbase) + + if self.db.penalty_level >= 2: + contacts += plc.getSliceUserEmails(self.db.loginbase) + + return contacts + + def sendMessage(self, type, **kwargs): + + # NOTE: evidently changing an RT message's subject opens the ticket. + # the logic in this policy depends up a ticket only being 'open' + # if a user has replied to it. 
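# SiteInterface.applyPenalty() above walks a small penalty ladder: entries above
# the site's current penalty_level are disabled, entries up to and including it
# are re-applied. The same loop shape with stand-in enable/disable hooks, so it
# runs without a PLCAPI connection:
def apply_penalty(penalty_level, penalty_map, loginbase):
    applied = []
    for i in range(len(penalty_map) - 1, penalty_level, -1):
        penalty_map[i]['disable'](loginbase)
    for i in range(0, penalty_level + 1):
        penalty_map[i]['enable'](loginbase)
        applied.append(penalty_map[i]['name'])
    return applied

penalty_map = [
    {'name': 'noop',          'enable': lambda s: None, 'disable': lambda s: None},
    {'name': 'nocreate',      'enable': lambda s: None, 'disable': lambda s: None},
    {'name': 'suspendslices', 'enable': lambda s: None, 'disable': lambda s: None},
]
assert apply_penalty(1, penalty_map, 'examplesite') == ['noop', 'nocreate']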
+ # So, to preserve these semantics, we check the status before + # sending, then after sending, reset the status to the + # previous status. + # There is a very tiny race here, where a user sends a reply + # within the time it takes to check, send, and reset. + # This sucks. It's almost certainly fragile. + + # + # TODO: catch any errors here, and add an ActionRecord that contains + # those errors. + + args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level} + args.update(kwargs) + + hostname = None + if 'hostname' in args: + hostname = args['hostname'] + + if hasattr(mailtxt, type): + + message = getattr(mailtxt, type) + viart = True + if 'viart' in kwargs: + viart = kwargs['viart'] + + if viart: + self.getTicketStatus() # get current message status + + m = Message(message[0] % args, message[1] % args, viart, self.db.message_id) + + contacts = self.getContacts() + contacts = [config.cc_email] # TODO: remove after testing... + + print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname) + + ret = m.send(contacts) + if viart: + self.db.message_id = ret + # reset to previous status, since a new subject 'opens' RT tickets. + self.setTicketStatus(self.db.message_status) + + # NOTE: only make a record of it if it's in RT. + act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice', + action_type=type, message_id=self.db.message_id) + + else: + print "+-- WARNING! ------------------------------" + print "| No such message name in emailTxt.mailtxt: %s" % type + print "+------------------------------------------" + + return + + def closeTicket(self): + # TODO: close the rt ticket before overwriting the message_id + mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor") + act = ActionRecord(loginbase=self.db.loginbase, action='notice', + action_type='close_ticket', message_id=self.db.message_id) + self.db.message_id = 0 + self.db.message_status = "new" + + def runBootManager(self, hostname): + print "attempting BM reboot of %s" % hostname + ret = "" + try: + ret = bootman.restore(self, hostname) + err = "" + except: + err = traceback.format_exc() + print err + + act = ActionRecord(loginbase=self.db.loginbase, + hostname=hostname, + action='reboot', + action_type='bootmanager_restore', + error_string=err) + return ret + + def attemptReboot(self, hostname): + print "attempting PCU reboot of %s" % hostname + err = "" + try: + ret = reboot.reboot_str(hostname) + except Exception, e: + err = traceback.format_exc() + ret = str(e) + + if ret == 0 or ret == "0": + ret = "" + + act = ActionRecord(loginbase=self.db.loginbase, + hostname=hostname, + action='reboot', + action_type='first_try_reboot', + error_string=err) + diff --git a/monitor/database/info/model.py b/monitor/database/info/model.py index 151f428..c538c66 100644 --- a/monitor/database/info/model.py +++ b/monitor/database/info/model.py @@ -1,4 +1,5 @@ from monitor.database.info.action import * from monitor.database.info.findbad import * from monitor.database.info.history import * +from monitor.database.info.plc import * from monitor.database.dborm import mon_session as session diff --git a/monitor/database/info/plc.py b/monitor/database/info/plc.py new file mode 100644 index 0000000..0847057 --- /dev/null +++ b/monitor/database/info/plc.py @@ -0,0 +1,33 @@ +from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany +from elixir import options_defaults, using_options, setup_all +from elixir import PickleType, String, Integer, DateTime, Boolean 
+from elixir.ext.versioned import * + +from datetime import datetime,timedelta + +from monitor.database.dborm import mon_metadata, mon_session +__metadata__ = mon_metadata +__session__ = mon_session + +class PlcSite(Entity): + site_id = Field(Integer,primary_key=True) + loginbase = Field(String,default=None) + date_checked = Field(DateTime,default=datetime.now) + + plc_site_stats = Field(PickleType,default=None) + acts_as_versioned(ignore=['date_checked']) + +class PlcNode(Entity): + node_id = Field(Integer,primary_key=True) + hostname = Field(String,default=None) + date_checked = Field(DateTime,default=datetime.now) + + plc_node_stats = Field(PickleType,default=None) + acts_as_versioned(ignore=['date_checked']) + +class PlcPCU(Entity): + pcu_id = Field(Integer,primary_key=True) + date_checked = Field(DateTime,default=datetime.now) + + plc_pcu_stats = Field(PickleType,default=None) + acts_as_versioned(ignore=['date_checked']) diff --git a/monitor/model.py b/monitor/model.py index b4db483..2f2f5e3 100755 --- a/monitor/model.py +++ b/monitor/model.py @@ -527,6 +527,8 @@ class Record(object): else: print "takeAction: increasing penalty for %s"%self.hostname pp.increase() + + print "takeAction: applying penalty to %s as index %s"% (self.hostname, index) pp.index = index pp.apply(self.hostname) pp.save() diff --git a/monitor/policy.py b/monitor/policy.py index c23e7de..4574de7 100644 --- a/monitor/policy.py +++ b/monitor/policy.py @@ -171,10 +171,11 @@ class MonitorMergeDiagnoseSendEscellate: #### APPLY PENALTY if ( record.data['take_action'] and diag['Squeeze'] ): - print "action: taking action" + print "action: taking squeeze action" record.takeAction(record.data['penalty_level']) del diag['Squeeze'] if diag.getFlag('BackOff'): + print "action: taking backoff action" record.takeAction(0) del diag['BackOff'] diff --git a/monitor/reboot.py b/monitor/reboot.py new file mode 100755 index 0000000..15d5c52 --- /dev/null +++ b/monitor/reboot.py @@ -0,0 +1,144 @@ +#!/usr/bin/python +# +# Reboot specified nodes +# + +import getpass, getopt +import os, sys +import xml, xmlrpclib +import errno, time, traceback +import urllib2 +import urllib +import threading, popen2 +import array, struct +import base64 +from subprocess import PIPE, Popen +import pcucontrol.transports.ssh.pxssh as pxssh +import pcucontrol.transports.ssh.pexpect as pexpect +import socket + +# Use our versions of telnetlib and pyssh +sys.path.insert(0, os.path.dirname(sys.argv[0])) +import pcucontrol.transports.telnetlib as telnetlib +sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh") +import pcucontrol.transports.pyssh as pyssh + +from monitor import config +from monitor.wrapper import plc + +from pcucontrol.util import command +from pcucontrol.reboot import pcu_name, model_to_object, reboot_api, convert_oldmodelname_to_newmodelname, reboot_test_new + + +# Event class ID from pcu events +#NODE_POWER_CONTROL = 3 + +# Monitor user ID +#MONITOR_USER_ID = 11142 + +import logging +logger = logging.getLogger("monitor") +verbose = 1 +#dryrun = 0; + +def get_pcu_values(pcu_id): + from monitor.database.info.model import FindbadPCURecord + print "pcuid: %s" % pcu_id + try: + pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id) + if pcurec: + values = pcurec.to_dict() + else: + values = None + except: + values = None + + return values + +def reboot(nodename): + return reboot_policy(nodename, True, False) + +def reboot_str(nodename): + global verbose + continue_probe = True + dryrun=False + + pcu = plc.getpcu(nodename) + if not pcu: + 
logger.debug("no pcu for %s" % nodename) + print "no pcu for %s" % nodename + return "%s has no pcu" % nodename + + values = get_pcu_values(pcu['pcu_id']) + if values == None: + logger.debug("No values for pcu probe %s" % nodename) + print "No values for pcu probe %s" % nodename + return "no info for pcu_id %s" % pcu['pcu_id'] + + # Try the PCU first + logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model'])) + + ret = reboot_test_new(nodename, values, verbose, dryrun) + return ret + +def reboot_policy(nodename, continue_probe, dryrun): + global verbose + + pcu = plc.getpcu(nodename) + if not pcu: + logger.debug("no pcu for %s" % nodename) + print "no pcu for %s" % nodename + return False # "%s has no pcu" % nodename + + values = get_pcu_values(pcu['pcu_id']) + if values == None: + logger.debug("No values for pcu probe %s" % nodename) + print "No values for pcu probe %s" % nodename + return False #"no info for pcu_id %s" % pcu['pcu_id'] + + # Try the PCU first + logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model'])) + + ret = reboot_test_new(nodename, values, verbose, dryrun) + + if ret != 0: + print ret + return False + else: + print "return true" + return True + +def main(): + logger.setLevel(logging.DEBUG) + ch = logging.StreamHandler() + ch.setLevel(logging.DEBUG) + formatter = logging.Formatter('LOGGER - %(message)s') + ch.setFormatter(formatter) + logger.addHandler(ch) + + try: + if "test" in sys.argv: + dryrun = True + else: + dryrun = False + + for node in sys.argv[1:]: + if node == "test": continue + + print "Rebooting %s" % node + if reboot_policy(node, True, dryrun): + print "success" + else: + print "failed" + except Exception, err: + import traceback; traceback.print_exc() + from monitor.common import email_exception + email_exception(node) + print err + +if __name__ == '__main__': + logger = logging.getLogger("monitor") + main() + f = open("/tmp/rebootlog", 'a') + f.write("reboot %s\n" % sys.argv) + f.close() diff --git a/monitor/scanapi.py b/monitor/scanapi.py index 194ab40..963822d 100644 --- a/monitor/scanapi.py +++ b/monitor/scanapi.py @@ -11,8 +11,7 @@ import threading import socket from pcucontrol import reboot -from monitor import util -from monitor.util import command +from pcucontrol.util import command from monitor import config from monitor.database.info.model import * @@ -113,7 +112,7 @@ class ScanInterface(object): syncclass = None primarykey = 'hostname' - def __init__(self, round): + def __init__(self, round=1): self.round = round self.count = 1 @@ -134,22 +133,24 @@ class ScanInterface(object): try: if values is None: return - - fbnodesync = self.syncclass.findby_or_create( - if_new_set={'round' : self.round}, + + if self.syncclass: + fbnodesync = self.syncclass.findby_or_create( + #if_new_set={'round' : self.round}, **{ self.primarykey : nodename}) # NOTE: This code will either add a new record for the new self.round, # OR it will find the previous value, and update it with new information. # The data that is 'lost' is not that important, b/c older # history still exists. 
fbrec = self.recordclass.findby_or_create( - **{'round':self.round, self.primarykey:nodename}) + **{ self.primarykey:nodename}) fbrec.set( **values ) fbrec.flush() - fbnodesync.round = self.round - fbnodesync.flush() + if self.syncclass: + fbnodesync.round = self.round + fbnodesync.flush() print "%d %s %s" % (self.count, nodename, values) self.count += 1 @@ -161,13 +162,14 @@ class ScanInterface(object): class ScanNodeInternal(ScanInterface): recordclass = FindbadNodeRecord - syncclass = FindbadNodeRecordSync + #syncclass = FindbadNodeRecordSync + syncclass = None primarykey = 'hostname' def collectNMAP(self, nodename, cohash): #### RUN NMAP ############################### values = {} - nmap = util.command.CMD() + nmap = command.CMD() print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) # NOTE: an empty / error value for oval, will still work. @@ -209,7 +211,7 @@ class ScanNodeInternal(ScanInterface): echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' echo "}" - EOF """) +EOF """) values['ssh_error'] = errval if len(oval) > 0: @@ -376,9 +378,9 @@ class ScanNodeInternal(ScanInterface): return (nodename, values) def internalprobe(hostname): - fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - if_new_set={'round' : 1}) - scannode = ScanNodeInternal(fbsync.round) + #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", + # if_new_set={'round' : 1}) + scannode = ScanNodeInternal() # fbsync.round) try: (nodename, values) = scannode.collectInternal(hostname, {}) scannode.record(None, (nodename, values)) @@ -389,9 +391,9 @@ def internalprobe(hostname): return False def externalprobe(hostname): - fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - if_new_set={'round' : 1}) - scannode = ScanNodeInternal(fbsync.round) + #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", + # if_new_set={'round' : 1}) + scannode = ScanNodeInternal() # fbsync.round) try: (nodename, values) = scannode.collectNMAP(hostname, {}) scannode.record(None, (nodename, values)) @@ -403,7 +405,7 @@ def externalprobe(hostname): class ScanPCU(ScanInterface): recordclass = FindbadPCURecord - syncclass = FindbadPCURecordSync + syncclass = None primarykey = 'plc_pcuid' def collectInternal(self, pcuname, cohash): @@ -432,7 +434,7 @@ class ScanPCU(ScanInterface): #### RUN NMAP ############################### if continue_probe: - nmap = util.command.CMD() + nmap = command.CMD() print "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']) (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])) # NOTE: an empty / error value for oval, will still work. 
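# collectNMAP() above shells out through pcucontrol.util.command.CMD(); the
# probe itself is plain nmap in grepable (-oG) mode, filtered down to the
# "Host:" summary line. A rough standard-library equivalent (assumes nmap is
# installed; the hostname is a stand-in):
from subprocess import Popen, PIPE

def nmap_host_line(hostname):
    cmd = "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % hostname
    proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    oval, errval = proc.communicate()
    return oval.strip()

# e.g. nmap_host_line("node1.example.org") returns the "Host: ..." summary line,
# or an empty string when the probe fails, which the caller tolerates.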
@@ -494,7 +496,7 @@ class ScanPCU(ScanInterface): ###### DRY RUN ############################ - if 'node_ids' in values['plc_pcu_stats'] and \ + if continue_probe and 'node_ids' in values['plc_pcu_stats'] and \ len(values['plc_pcu_stats']['node_ids']) > 0: rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0], values, 1, True) @@ -510,7 +512,8 @@ class ScanPCU(ScanInterface): print "____________________________________" errors['traceback'] = traceback.format_exc() print errors['traceback'] - values['reboot_trial_status'] = errors['traceback'] + values['reboot_trial_status'] = str(errors['traceback']) + print values values['entry_complete']=" ".join(values['entry_complete']) diff --git a/monitor/wrapper/emailTxt.py b/monitor/wrapper/emailTxt.py index d1bccaa..220eb10 100644 --- a/monitor/wrapper/emailTxt.py +++ b/monitor/wrapper/emailTxt.py @@ -207,6 +207,84 @@ ERROR- This is an error state, where there is absolutely no contact with PlanetLab. """) + pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""", + +"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU +registered for %(hostname)s, but could not for some reason. + +Please help. + +Thank you very much for your help, + -- PlanetLab Central (support@planet-lab.org) +""") + online_notice=("""MONTEST: Host %(hostname)s is online""", + """ +This notice is simply to let you know that: + %(hostname)s + +is online and operational. Thank you very much for your help! + """) + test_notice=("""MONTEST: Host %(hostname)s is testing""", + """ +This notice is simply to test whether notices work. + %(hostname)s + +Thank you very much for your help! + """) + retry_bootman=("""MONTEST: Running BootManager on %(hostname)s""", + """ +This notice is simply to let you know that: + %(hostname)s + +appears stuck in a debug mode. To try to correct this, we're trying to rerun BootManager.py. +If any action is needed from you, you will recieve additional notices. Thank you! + """) + down_notice=("""MONTEST: Host %(hostname)s is down""", + """ +This notice is simply to let you know that: + %(hostname)s + +is down, disconnected from the network and/or non-operational. Please investigate, thank you very much for your help! + """) + + clear_penalty=("""MONTEST: All penalties have been cleared from site %(loginbase)s""", + """ +This notice is to let you know that any penalties previously applied to your site have +been removed: %(penalty_level)s. + +All privileges have been restored. If your slices were disabled, please allow +up to 30 minutes for them to return to enabled. + +Legend: + + 0 - no penalties applied + 1 - site is disabled. no new slices can be created. + 2+ - all existing slices will be disabled. + """) + + increase_penalty=("""MONTEST: Penalty increased for site %(loginbase)s""", + """ +This notice is to let you know that the penalty applied to your site has +increased: %(penalty_level)s. + +legend: + + 0 - no penalty applied + 1 - site is disabled. no new slices can be created. + 2+ - all existing slices will be disabled. + """) + + newbootcd_notice=("""MONTEST: Host %(hostname)s needs a new BootImage""", """ +As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: + + %(hostname)s + +This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (either the all-in-one ISO CD, floppy disk, or write-protected USB stick). 
+ +Thank you for your help, + -- PlanetLab Central (support@planet-lab.org) +""") + nmreset =("""NM Reset at %(loginbase)s""", """ Monitor restarted NM on the following machines: @@ -294,10 +372,10 @@ Thank you very much for your help, -- PlanetLab Central (support@planet-lab.org) """) - newalphacd_one=(""" Planetlab nodes need a new BootCD: %(loginbase)s""", -"""As part of PlanetLab node monitoring, we noticed that your machines needs a new BootCD to fully support your hardware: + newalphacd_notice=("""MONTEST: New Boot Images for %(hostname)s""", +"""As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine. This means that it is so new that it needs a new BootCD, or that it is so old that it is no longer supported. -%(hostname_list)s + %(hostname)s To make this process as simple as possible, we have created All-in-One boot images that include the node configuration file. @@ -318,14 +396,14 @@ Thank you for your help, # TODO: need reminder versions for repeats... newdown=[newdown_one, newdown_two, newdown_three] newbootcd=[newbootcd_one, newbootcd_two, newbootcd_three] - newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one] + #newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one] newthankyou=[thankyou,thankyou,thankyou] pcuthankyou=[pcuthankyou_one,pcuthankyou_one,pcuthankyou_one] NMReset=[nmreset,nmreset,nmreset] pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one] pcudown=[pcudown_one, pcudown_one, pcudown_one] - unknownsequence = ("""Unrecognized Error on PlanetLab host %(hostname)s""", + unknownsequence_notice = ("""MONTEST: Unrecognized Error on PlanetLab host %(hostname)s""", """ While trying to automatically recover this machine: @@ -411,7 +489,7 @@ Thank you for your help, donation_down = [ donation_down_one, donation_down_one, donation_down_one ] - minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", + minimalhardware_notice = ("""MONTEST: Hardware requirements not met on PlanetLab host %(hostname)s""", """ While trying to automatically recover this machine: @@ -431,7 +509,7 @@ BootManager.log output follows: %(bmlog)s """ ) - baddisk = ("""Bad Disk on PlanetLab node %(hostname)s""", + baddisk_notice = ("""MONTEST: Bad Disk on PlanetLab node %(hostname)s""", """As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node. Please verify the integrity of the disk, and order a replacement if needed. If you need to schedule downtime for the node, please let us know at support@planet-lab.org. @@ -497,7 +575,7 @@ BootManager.log output follows: %(bmlog)s """) - plnode_cfg=(""" Please Update Configuration file for PlanetLab node %(hostname)s""", + nodeconfig_notice=("""MONTEST: Please Update Configuration file for PlanetLab node %(hostname)s""", """As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME. This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade. To resolve the issue we require your assistance. All that is needed is to visit: https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s @@ -537,7 +615,7 @@ Thanks. 
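# Each entry added to mailtxt above is a (subject, body) pair of %-style
# templates; SiteInterface.sendMessage() earlier in this patch fills both parts
# from the same argument dict (message[0] % args, message[1] % args). A minimal
# rendering sketch using a trimmed copy of the test_notice pair:
test_notice = ("MONTEST: Host %(hostname)s is testing",
               "This notice is simply to test whether notices work.\n %(hostname)s\n")

def render(message, **args):
    return (message[0] % args, message[1] % args)

subject, body = render(test_notice, hostname="node1.example.org")
assert subject == "MONTEST: Host node1.example.org is testing"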
""") - baddns=("""Planetlab node down: broken DNS configuration for %(hostname)s""", + baddns_notice=("""MONTEST: Planetlab node down: broken DNS configuration for %(hostname)s""", """As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries. %(hostname)s diff --git a/monitor/wrapper/plc.py b/monitor/wrapper/plc.py index 2ab1808..2f0f19d 100644 --- a/monitor/wrapper/plc.py +++ b/monitor/wrapper/plc.py @@ -17,8 +17,12 @@ from monitor import database try: from monitor import config debug = config.debug + XMLRPC_SERVER=config.API_SERVER except: debug = False + # NOTE: this host is used by default when there are no auth files. + XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/" + logger = logging.getLogger("monitor") class Auth: @@ -34,8 +38,6 @@ class Auth: 'AuthMethod' : 'password', 'AuthString' : password} -# NOTE: this host is used by default when there are no auth files. -XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/" # NOTE: by default, use anonymous access, but if auth files are # configured, use them, with their auth definitions. @@ -54,7 +56,7 @@ except: auth = Auth() auth.server = XMLRPC_SERVER -api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) +global_error_count = 0 class PLC: def __init__(self, auth, url): @@ -67,11 +69,23 @@ class PLC: if method is None: raise AssertionError("method does not exist") - return lambda *params : method(self.auth, *params) + try: + return lambda *params : method(self.auth, *params) + except ProtocolError: + traceback.print_exc() + global_error_count += 1 + if global_error_count >= 10: + print "maximum error count exceeded; exiting..." + sys.exit(1) + else: + print "%s errors have occurred" % global_error_count + raise Exception("ProtocolError continuing") def __repr__(self): return self.api.__repr__() +api = PLC(auth.auth, auth.server) + class CachedPLC(PLC): def _param_to_str(self, name, *params): @@ -327,6 +341,19 @@ def nodePOD(nodename): except Exception, exc: logger.info("nodePOD: %s" % exc) +''' +Freeze all site slices. +''' +def suspendSiteSlices(loginbase): + api = xmlrpclib.Server(auth.server, verbose=False) + for slice in slices(loginbase): + logger.info("Suspending slice %s" % slice) + try: + if not debug: + api.AddSliceAttribute(auth.auth, slice, "enabled", "0") + except Exception, exc: + logger.info("suspendSlices: %s" % exc) + ''' Freeze all site slices. 
''' @@ -340,6 +367,25 @@ def suspendSlices(nodename): except Exception, exc: logger.info("suspendSlices: %s" % exc) +def enableSiteSlices(loginbase): + api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) + for slice in slices(loginbase): + logger.info("Enabling slices %s" % slice) + try: + if not debug: + slice_list = api.GetSlices(auth.auth, {'name': slice}, None) + if len(slice_list) == 0: + return + slice_id = slice_list[0]['slice_id'] + l_attr = api.GetSliceAttributes(auth.auth, {'slice_id': slice_id}, None) + for attr in l_attr: + if "enabled" == attr['name'] and attr['value'] == "0": + logger.info("Deleted enable=0 attribute from slice %s" % slice) + api.DeleteSliceAttribute(auth.auth, attr['slice_attribute_id']) + except Exception, exc: + logger.info("enableSiteSlices: %s" % exc) + print "exception: %s" % exc + def enableSlices(nodename): api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) for slice in slices(siteId(nodename)): @@ -369,6 +415,17 @@ def enableSlices(nodename): # logger.info("Suspending slice %s" % slice) # api.SliceAttributeAdd(auth.auth, slice, "plc_slice_state", {"state" : "suspended"}) # +def enableSiteSliceCreation(loginbase): + api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) + try: + logger.info("Enabling slice creation for site %s" % loginbase) + if not debug: + logger.info("\tcalling UpdateSite(%s, enabled=True)" % loginbase) + api.UpdateSite(auth.auth, loginbase, {'enabled': True}) + except Exception, exc: + print "ERROR: enableSiteSliceCreation: %s" % exc + logger.info("ERROR: enableSiteSliceCreation: %s" % exc) + def enableSliceCreation(nodename): api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) try: @@ -381,6 +438,19 @@ def enableSliceCreation(nodename): print "ERROR: enableSliceCreation: %s" % exc logger.info("ERROR: enableSliceCreation: %s" % exc) +''' +Removes site's ability to create slices. Returns previous max_slices +''' +def removeSiteSliceCreation(sitename): + print "removeSiteSliceCreation(%s)" % sitename + api = xmlrpclib.Server(auth.server, verbose=False) + try: + logger.info("Removing slice creation for site %s" % sitename) + if not debug: + api.UpdateSite(auth.auth, sitename, {'enabled': False}) + except Exception, exc: + logger.info("removeSiteSliceCreation: %s" % exc) + ''' Removes ability to create slices. Returns previous max_slices ''' diff --git a/monitor/wrapper/plccache.py b/monitor/wrapper/plccache.py index 3efd791..0645b18 100755 --- a/monitor/wrapper/plccache.py +++ b/monitor/wrapper/plccache.py @@ -2,8 +2,7 @@ import sys from monitor.wrapper import plc -from monitor import database -from monitor import config +from monitor.database.info.model import * def dsites_from_lsites(l_sites): d_sites = {} @@ -53,98 +52,107 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes): hn2lb[hostname] = login_base return (dsn, hn2lb, lb2hn) -def create_netid2ip(l_nodes, l_nodenetworks): - netid2ip = {} - for node in l_nodes: - for netid in node['nodenetwork_ids']: - found = False - for nn in l_nodenetworks: - if nn['nodenetwork_id'] == netid: - found = True - netid2ip[netid] = nn['ip'] - if not found: - print "ERROR! 
%s" % node - - return netid2ip - l_sites = None l_nodes = None l_pcus = None -l_nodenetworks = None plcdb_hn2lb = None plcdb_lb2hn = None -plcdb_netid2ip = None plcdb_id2lb = None def init(): global l_sites global l_nodes global l_pcus - global l_nodenetworks global plcdb_hn2lb global plcdb_lb2hn - global plcdb_netid2ip global plcdb_id2lb - api = plc.getCachedAuthAPI() - l_sites = api.GetSites({'peer_id':None}, - ['login_base', 'site_id', 'abbreviated_name', 'latitude', - 'longitude', 'max_slices', 'slice_ids', 'node_ids', 'enabled' ]) - l_nodes = api.GetNodes({'peer_id':None}, - ['hostname', 'node_id', 'ports', 'site_id', 'version', 'last_updated', - 'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids']) - l_pcus = api.GetPCUs() - l_nodenetworks = api.GetNodeNetworks() + dbsites = PlcSite.query.all() + l_sites = [ s.plc_site_stats for s in dbsites ] + + dbnodes = PlcNode.query.all() + l_nodes = [ s.plc_node_stats for s in dbnodes ] + + dbpcus = PlcPCU.query.all() + l_pcus = [ s.plc_pcu_stats for s in dbpcus ] (d_sites,id2lb) = dsites_from_lsites(l_sites) (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes) - netid2ip = create_netid2ip(l_nodes, l_nodenetworks) plcdb_hn2lb = hn2lb plcdb_lb2hn = lb2hn - plcdb_netid2ip = netid2ip plcdb_id2lb = id2lb - return l_nodes - - -def create_plcdb(): - - # get sites, and stats - l_sites = plc.getSites({'peer_id':None}, ['login_base', 'site_id', 'abbreviated_name', 'latitude', 'longitude', - 'max_slices', 'slice_ids', 'node_ids' ]) - if len(l_sites) == 0: - print "no sites! exiting..." - sys.exit(1) - (d_sites,id2lb) = dsites_from_lsites(l_sites) + return + +def GetNodesByIds(ids): + ret = [] + for node_id in ids: + node = PlcNode.get_by(node_id=node_id) + ret.append(node.plc_node_stats) + return ret + +def GetNodesBySite(loginbase): + site = PlcSite.get_by(loginbase=loginbase) + return GetNodesByIds(site.plc_site_stats['node_ids']) + +def GetNodeByName(hostname): + node = PlcNode.get_by(hostname=hostname) + return node.plc_node_stats + +def GetSitesByName(sitelist): + ret = [] + for site in sitelist: + site = PlcSite.get_by(loginbase=site) + ret.append(site.plc_site_stats) + return ret + +def sync(): + l_sites = plc.api.GetSites({'peer_id':None}, + ['login_base', 'site_id', 'abbreviated_name', 'latitude', + 'longitude', 'max_slices', 'slice_ids', 'node_ids', + 'enabled', 'date_created' ]) + l_nodes = plc.api.GetNodes({'peer_id':None}, + ['hostname', 'node_id', 'ports', 'site_id', + 'version', 'last_updated', 'date_created', + 'last_contact', 'pcu_ids', 'nodenetwork_ids']) + l_pcus = plc.api.GetPCUs() + + print "sync sites" + for site in l_sites: + dbsite = PlcSite.findby_or_create(site_id=site['site_id']) + dbsite.loginbase = site['login_base'] + dbsite.date_checked = datetime.now() + dbsite.plc_site_stats = site + #dbsite.flush() + # TODO: delete old records. + session.flush() + + print "sync nodes" + for node in l_nodes: + dbnode = PlcNode.findby_or_create(node_id=node['node_id']) + dbnode.hostname = node['hostname'] + dbnode.date_checked = datetime.now() + dbnode.plc_node_stats = node + #dbnode.flush() + # TODO: delete old records. + session.flush() + + print "sync pcus" + for pcu in l_pcus: + dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id']) + dbpcu.date_checked = datetime.now() + dbpcu.plc_pcu_stats = pcu + #dbpcu.flush() + # TODO: delete old records. 
+ session.flush() - # get nodes at each site, and - l_nodes = plc.getNodes({'peer_id':None}, ['hostname', 'node_id', 'ports', 'site_id', 'version', - 'last_updated', 'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids']) + init() - l_nodenetworks = plc.getNodeNetworks() - (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes) - netid2ip = create_netid2ip(l_nodes, l_nodenetworks) - - # save information for future. - id2lb = id2lb - hn2lb = hn2lb - db = plcdb - - if ('cachenodes' in dir(config) and config.cachenodes) or \ - 'cachenodes' not in dir(config): - database.dbDump("plcdb_hn2lb", hn2lb) - database.dbDump("plcdb_lb2hn", lb2hn) - database.dbDump("plcdb_netid2ip", netid2ip) - database.dbDump("l_plcnodenetworks", l_nodenetworks) - database.dbDump("l_plcnodes", l_nodes) - database.dbDump("l_plcsites", l_sites) - - return l_nodes + return if __name__ == '__main__': - create_plcdb() + sync() else: - #print "calling plccache init()" init() diff --git a/nodebad.py b/nodebad.py index 767a4fe..46ca879 100755 --- a/nodebad.py +++ b/nodebad.py @@ -22,33 +22,112 @@ api = plc.getAuthAPI() round = 1 count = 0 +def main(): + main2(config) -def main(config): +def main2(config): l_plcnodes = plccache.l_nodes l_nodes = get_nodeset(config) checkAndRecordState(l_nodes, l_plcnodes) +# Node states: + +def check_node_state(rec, node): + + node_state = rec.observed_status + if rec.plc_node_stats: + boot_state = rec.plc_node_stats['boot_state'] + last_contact = rec.plc_node_stats['last_contact'] + else: + boot_state = "unknown" + last_contact = None + + if boot_state == 'disable': boot_state = 'disabled' + if boot_state == 'diag': boot_state = 'diagnose' + + # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need + # 'translations' into the node.status state + # 'BOOT' is a permanent state, but we want it to have a bit of + # hysteresis (less than 0.5 days) + + ################################################################# + # "Initialize" the findbad states into nodebad status if they are not already set + + if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' : + print "changed status from %s to offline" % node.status + node.status = 'offline' + node.last_changed = datetime.now() + + if node_state == 'DEBUG' and node.status != 'monitordebug' and \ + node.status != 'disabled' and \ + node.status != 'diagnose': + if boot_state != 'disabled' and boot_state != 'diagnose': + + print "changed status from %s to monitordebug" % (node.status) + node.status = "monitordebug" + node.last_changed = datetime.now() + else: + print "changed status from %s to %s" % (node.status, boot_state) + node.status = boot_state + node.last_changed = datetime.now() + + if node_state == 'BOOT' and node.status != 'online' and node.status != 'good': + print "changed status from %s to online" % node.status + node.status = 'online' + node.last_changed = datetime.now() + + ################################################################# + # Switch temporary hystersis states into their 'firm' states. + # online -> good after half a day + # offline -> down after two days + # monitordebug -> down after 30 days + # diagnose -> monitordebug after 60 days + # disabled -> down after 60 days + + if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5): + print "changed status from %s to good" % node.status + node.status = 'good' + # NOTE: do not reset last_changed, or you lose how long it's been up. 
+ + if node.status == 'offline' and changed_greaterthan(node.last_changed, 2): + print "changed status from %s to down" % node.status + node.status = 'down' + # NOTE: do not reset last_changed, or you lose how long it's been down. + + if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30): + print "changed status from %s to down" % node.status + node.status = 'down' + # NOTE: do not reset last_changed, or you lose how long it's been down. + + if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60): + print "changed status from %s to down" % node.status + # NOTE: change an admin mode back into monitordebug after two months. + node.status = 'monitordebug' + node.last_changed = datetime.now() + + # extreme cases of offline nodes + if ( boot_state == 'disabled' or last_contact == None ) and \ + changed_greaterthan(node.last_changed, 2*30) and \ + node.status != 'down': + print "changed status from %s to down" % node.status + node.status = 'down' + node.last_changed = datetime.now() + def checkAndRecordState(l_nodes, l_plcnodes): global count for nodename in l_nodes: - d_node = None - for node in l_plcnodes: - if node['hostname'] == nodename: - d_node = node - break - if not d_node: - continue - pf = HistoryNodeRecord.findby_or_create(hostname=nodename) - pf.last_checked = datetime.now() + nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, + if_new_set={'status' : 'offline', + 'last_changed' : datetime.now()}) + nodehist.last_checked = datetime.now() try: # Find the most recent record - noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==nodename).order_by(FindbadNodeRecord.date_checked.desc()).first() - #print "NODEREC: ", noderec.date_checked + noderec = FindbadNodeRecord.get_latest_by(hostname=nodename) except: print "COULD NOT FIND %s" % nodename import traceback @@ -59,33 +138,16 @@ def checkAndRecordState(l_nodes, l_plcnodes): print "none object for %s"% nodename continue - node_state = noderec.observed_status - if noderec.plc_node_stats: - boot_state = noderec.plc_node_stats['boot_state'] - else: - boot_state = "unknown" - - if node_state == "BOOT": - if pf.status != "good": - pf.last_changed = datetime.now() - pf.status = "good" - elif node_state == "DEBUG": - if pf.status != boot_state: - pf.last_changed = datetime.now() - pf.status = boot_state - else: - if pf.status != "down": - pf.last_changed = datetime.now() - pf.status = "down" + check_node_state(noderec, nodehist) count += 1 - print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(time.mktime(pf.last_changed.timetuple()))) + print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple()))) # NOTE: this commits all pending operations to the DB. 
Do not remove, or # replace with another operations that also commits all pending ops, such # as session.commit() or flush() or something - print HistoryNodeRecord.query.count() session.flush() + print HistoryNodeRecord.query.count() return True @@ -97,7 +159,7 @@ if __name__ == '__main__': config = parsermodule.parse_args(parser) try: - main(config) + main2(config) except Exception, err: import traceback print traceback.print_exc() diff --git a/nodegroups.py b/nodegroups.py index d6beb54..999902f 100755 --- a/nodegroups.py +++ b/nodegroups.py @@ -59,16 +59,15 @@ def main(): # given to GetNodes nodelist = [] for h in hostlist: - nodelist += api.GetNodes(h) + nodelist.append( plccache.GetNodeByName(h) ) - #nodelist = api.GetNodes(hostlist) group_str = "Given" elif config.site: - site = api.GetSites(config.site) + site = plccache.GetSitesByName([config.site]) if len (site) > 0: site = site[0] - nodelist = api.GetNodes(site['node_ids']) + nodelist = plccache.GetNodesByIds(site['node_ids']) else: nodelist = [] @@ -76,13 +75,13 @@ def main(): elif config.nodeselect: hostlist = node_select(config.nodeselect) - nodelist = api.GetNodes(hostlist) + nodelist = [ plccache.GetNodeByName(h) for h in hostlist ] group_str = "selection" else: ng = api.GetNodeGroups({'name' : config.nodegroup}) - nodelist = api.GetNodes(ng[0]['node_ids']) + nodelist = plccache.GetNodesByIds(ng[0]['node_ids']) group_str = config.nodegroup @@ -91,7 +90,7 @@ def main(): ng_nodes = nodelist # Get all nodes - all_nodes = api.GetNodes({'peer_id': None}) + all_nodes = plccache.l_nodes # remove ngnodes from all node list ng_list = [ x['hostname'] for x in ng_nodes ] @@ -121,7 +120,7 @@ def main(): i = 1 for node in nodelist: print "%-2d" % i, - fbrec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first() + fbrec = FindbadNodeRecord.get_latest_by(hostname=node['hostname']) fbdata = fbrec.to_dict() print nodegroup_display(node, fbdata, config) i += 1 diff --git a/nodeinfo.py b/nodeinfo.py index 9afed5c..726f250 100755 --- a/nodeinfo.py +++ b/nodeinfo.py @@ -7,8 +7,8 @@ from monitor import * from monitor import util from monitor import parser as parsermodule -from monitor import database -from pcucontrol import reboot +from monitor.database.info.model import * +from monitor import reboot import time from monitor.model import * @@ -44,7 +44,7 @@ def plc_print_nodeinfo(plcnode): diff_time(plcnode['last_contact']), plcnode['key']) def fb_print_nodeinfo(fbnode): - pf = PersistFlags(fbnode['hostname'], 1, db='node_persistflags') + pf = HistoryNodeRecord.get_by(hostname= fbnode['hostname']) try: fbnode['last_change'] = diff_time(pf.last_changed) except: @@ -140,7 +140,7 @@ if config.findbad: for node in config.args: config.node = node - plc_nodeinfo = api.GetNodes({'hostname': config.node}, None)[0] + plc_nodeinfo = plccache.GetNodeByName(config.node) fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node) fb_nodeinfo = fb_noderec.to_dict() plc_print_nodeinfo(plc_nodeinfo) diff --git a/nodequery.py b/nodequery.py index dfe3f95..1f41ceb 100755 --- a/nodequery.py +++ b/nodequery.py @@ -13,11 +13,10 @@ import time import re import string -from pcucontrol import reboot from monitor.wrapper import plc, plccache api = plc.getAuthAPI() -from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, FindbadPCURecord, session +from monitor.database.info.model import FindbadNodeRecord, FindbadPCURecord, session from monitor import util from monitor 
import config @@ -270,6 +269,8 @@ def pcu_select(str_query, nodelist=None): fbquery = FindbadNodeRecord.get_all_latest() fb_nodelist = [ n.hostname for n in fbquery ] if True: + # NOTE: this doesn't work when there are only a few records current. + # pcu_select should apply to all pcus globally, not just the most recent records. fbpcuquery = FindbadPCURecord.get_all_latest() fbpcu_list = [ p.plc_pcuid for p in fbpcuquery ] @@ -381,8 +382,6 @@ def main(): #fbnodes = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname, orderBy='date_checked',distinct=True).reversed() fb = None - #reboot.fb = fbpcu - if config.nodelist: nodelist = util.file.getListFromFile(config.nodelist) else: @@ -413,7 +412,7 @@ def main(): try: # Find the most recent record - fb_noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node).order_by(FindbadNodeRecord.date_checked.desc()).first() + fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node) except: print traceback.print_exc() pass diff --git a/pcubad.py b/pcubad.py index 181f001..9f0468c 100755 --- a/pcubad.py +++ b/pcubad.py @@ -4,10 +4,11 @@ import os import sys import string import time +import sets from datetime import datetime,timedelta from monitor import database -from pcucontrol import reboot +from monitor import reboot from monitor import parser as parsermodule from monitor import config from monitor.database.info.model import HistoryPCURecord, FindbadPCURecord @@ -21,12 +22,32 @@ from monitor.model import * api = plc.getAuthAPI() -def main(config): +def main(): + main2(config) + +def main2(config): l_plcpcus = plccache.l_pcus l_pcus = None - if config.pcu: + if config.site is not None: + site = plccache.GetSitesByName([config.site]) + l_nodes = plccache.GetNodesByIds(site[0]['node_ids']) + pcus = [] + for node in l_nodes: + pcus += node['pcu_ids'] + # clear out dups. + l_pcus = [pcu for pcu in sets.Set(pcus)] + + elif config.node: + l_nodes = plccache.GetNodeByName(config.node) + pcus = [] + for node in l_nodes: + pcus += node['pcu_ids'] + # clear out dups. + l_pcus = [pcu for pcu in sets.Set(pcus)] + + elif config.pcu: for pcu in l_plcpcus: if ( pcu['hostname'] is not None and config.pcu in pcu['hostname'] ) or \ ( pcu['ip'] is not None and config.pcu in pcu['ip'] ): @@ -41,6 +62,38 @@ def main(config): hn2lb = plccache.plcdb_hn2lb +def check_pcu_state(rec, pcu): + + pcu_state = rec.reboot_trial_status + + if ( pcu_state == 'NetDown' or pcu_state == 'Not_Run' or not ( pcu_state == 0 or pcu_state == "0" ) ) and \ + ( pcu.status == 'online' or pcu.status == 'good' ): + print "changed status from %s to offline" % pcu.status + pcu.status = 'offline' + pcu.last_changed = datetime.now() + + if ( pcu_state == 0 or pcu_state == "0" ) and pcu.status not in [ 'online', 'good' ]: + print "changed status from %s to online" % pcu.status + pcu.status = 'online' + pcu.last_changed = datetime.now() + + if pcu.status == 'online' and changed_greaterthan(pcu.last_changed, 0.5): + #send thank you notice, or on-line notice. + print "changed status from %s to good" % pcu.status + pcu.status = 'good' + # NOTE: do not reset last_changed, or you lose how long it's been up. 
+ + if pcu.status == 'offline' and changed_greaterthan(pcu.last_changed, 2): + # send down pcu notice + print "changed status from %s to down" % pcu.status + pcu.status = 'down' + pcu.last_changed = datetime.now() + + if ( pcu.status == 'offline' or pcu.status == 'down' ) and changed_greaterthan(pcu.last_changed, 2*30): + print "changed status from %s to down" % pcu.status + pcu.status = 'down' + pcu.last_changed = datetime.now() + def checkAndRecordState(l_pcus, l_plcpcus): count = 0 for pcuname in l_pcus: @@ -53,65 +106,56 @@ def checkAndRecordState(l_pcus, l_plcpcus): if not d_pcu: continue - pf = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id']) - pf.last_checked = datetime.now() + pcuhist = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id'], + if_new_set={'status' : 'offline', + 'last_changed' : datetime.now()}) + pcuhist.last_checked = datetime.now() try: # Find the most recent record - pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).order_by(FindbadPCURecord.date_checked.desc()).first() - print "NODEREC: ", pcurec.date_checked + pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).first() except: - print "COULD NOT FIND FB record for %s" % reboot.pcu_name(pcu) + print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu) import traceback print traceback.print_exc() # don't have the info to create a new entry right now, so continue. continue - pcu_state = pcurec.reboot_trial_status - current_state = pcu_state - - if current_state == 0 or current_state == "0": - if pf.status != "good": - pf.last_changed = datetime.now() - pf.status = "good" - elif current_state == 'NetDown': - if pf.status != "netdown": - pf.last_changed = datetime.now() - pf.status = "netdown" - elif current_state == 'Not_Run': - if pf.status != "badconfig": - pf.last_changed = datetime.now() - pf.status = "badconfig" - else: - if pf.status != "error": - pf.last_changed = datetime.now() - pf.status = "error" + if not pcurec: + print "none object for pcu %s"% reboot.pcu_name(d_pcu) + continue + + check_pcu_state(pcurec, pcuhist) count += 1 - print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pf.status, diff_time(time.mktime(pf.last_changed.timetuple()))) + print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pcuhist.status, diff_time(time.mktime(pcuhist.last_changed.timetuple()))) # NOTE: this commits all pending operations to the DB. 
Do not remove, or # replace with another operations that also commits all pending ops, such # as session.commit() or flush() or something - print HistoryPCURecord.query.count() session.flush() + print HistoryPCURecord.query.count() return True if __name__ == '__main__': parser = parsermodule.getParser() - parser.set_defaults(filename=None, pcu=None, pcuselect=False, pcugroup=None, cachepcus=False) + parser.set_defaults(filename=None, pcu=None, node=None, site=None, pcuselect=False, pcugroup=None, cachepcus=False) parser.add_option("", "--pcu", dest="pcu", metavar="hostname", help="Provide a single pcu to operate on") + parser.add_option("", "--site", dest="site", metavar="sitename", + help="Provide a single sitename to operate on") + parser.add_option("", "--node", dest="node", metavar="nodename", + help="Provide a single node to operate on") parser.add_option("", "--pculist", dest="pculist", metavar="file.list", help="Provide a list of files to operate on") config = parsermodule.parse_args(parser) try: - main(config) + main2(config) except Exception, err: import traceback - print traceback.print_exc() + traceback.print_exc() print "Exception: %s" % err sys.exit(0) diff --git a/pcucontrol/models/APCControl.py b/pcucontrol/models/APCControl.py index 62f5f6f..59cc649 100644 --- a/pcucontrol/models/APCControl.py +++ b/pcucontrol/models/APCControl.py @@ -6,7 +6,7 @@ class APCControl(PCUControl): def run(self, node_port, dryrun): print "RUNNING!!!!!!!!!!!!" - if self.type == Transport.HTTPS or self.type == Transport.HTTP: + if self.transport.type == Transport.HTTPS or self.type == Transport.HTTP: print "APC via http...." return self.run_http_or_https(node_port, dryrun) else: @@ -58,9 +58,9 @@ class APCControl(PCUControl): else: # TODO: also send message for https, since that doesn't work this way... - if self.type == Transport.HTTPS: + if self.transport.type == Transport.HTTPS: cmd = self.get_https_cmd() - elif self.type == Transport.HTTP: + elif self.transport.type == Transport.HTTP: cmd = self.get_http_cmd() else: raise ExceptionNoTransport("Unsupported transport for http command") @@ -118,12 +118,12 @@ class APCControl(PCUControl): # NOTE: we may need to return software version, no model version to # know which file to request on the server. - if self.type == Transport.HTTP: + if self.transport.type == Transport.HTTP: cmd = """curl -s --anyauth --user '%s:%s' http://%s/about.htm """ + \ """ | sed -e "s/<[^>]*>//g" -e "s/ //g" -e "/^$/d" """ + \ """ | grep -E "AP[[:digit:]]+" """ #""" | grep -E "v[[:digit:]].*" """ - elif self.type == Transport.HTTPS: + elif self.transport.type == Transport.HTTPS: cmd = """curl -s --insecure --user '%s:%s' https://%s/about.htm """ + \ """ | sed -e "s/<[^>]*>//g" -e "s/ //g" -e "/^$/d" """ + \ """ | grep -E "AP[[:digit:]]+" """ @@ -138,10 +138,10 @@ class APCControl(PCUControl): def logout(self): # NOTE: log out again, to allow other uses to access the machine. 
- if self.type == Transport.HTTP: + if self.transport.type == Transport.HTTP: cmd = """curl -s --anyauth --user '%s:%s' http://%s/logout.htm """ + \ """ | grep -E '^[^<]+' """ - elif self.type == Transport.HTTPS: + elif self.transport.type == Transport.HTTPS: cmd = """curl -s --insecure --user '%s:%s' http://%s/logout.htm """ + \ """ | grep -E '^[^<]+' """ else: diff --git a/pcucontrol/models/BayTech.py b/pcucontrol/models/BayTech.py index 83de3a5..065cc28 100644 --- a/pcucontrol/models/BayTech.py +++ b/pcucontrol/models/BayTech.py @@ -1,6 +1,7 @@ from pcucontrol.reboot import * class BayTechRPC3NC(PCUControl): + supported_ports = [22,23] def run_telnet(self, node_port, dryrun): return self.run_ssh(node_port, dryrun) @@ -22,6 +23,7 @@ class BayTechRPC3NC(PCUControl): return 0 class BayTechRPC16(PCUControl): + supported_ports = [22,23] def run_telnet(self, node_port, dryrun): return self.run_ssh(node_port, dryrun) def run_ssh(self, node_port, dryrun): @@ -48,6 +50,7 @@ class BayTechCtrlCUnibe(PCUControl): indefinitely, unless you send a Ctrl-C after the password. No idea why. """ + supported_ports = [22] def run_ssh(self, node_port, dryrun): print "BayTechCtrlC %s" % self.host @@ -69,9 +72,11 @@ class BayTechCtrlCUnibe(PCUControl): if index == 0: print "3" s.send("3\r\n") + time.sleep(5) index = s.expect(["DS-RPC>", "Enter user name:"]) if index == 1: s.send(self.username + "\r\n") + time.sleep(5) index = s.expect(["DS-RPC>"]) if index == 0: @@ -112,6 +117,7 @@ class BayTechCtrlC(PCUControl): indefinitely, unless you send a Ctrl-C after the password. No idea why. """ + supported_ports = [22] def run_ssh(self, node_port, dryrun): print "BayTechCtrlC %s" % self.host diff --git a/pcucontrol/models/DRAC.py b/pcucontrol/models/DRAC.py index e7c030a..e3172b6 100644 --- a/pcucontrol/models/DRAC.py +++ b/pcucontrol/models/DRAC.py @@ -12,11 +12,14 @@ class DRAC(PCUControl): "-o PasswordAuthentication=yes "+\ "-o PubkeyAuthentication=no" s = pxssh.pxssh() - if not s.login(self.host, self.username, self.password, ssh_options, + try: + if not s.login(self.host, self.username, self.password, ssh_options, original_prompts="Dell", login_timeout=Transport.TELNET_TIMEOUT): - raise ExceptionPassword("Invalid Password") - - print "logging in..." + raise ExceptionPassword("Invalid Password") + except pexpect.EOF: + raise ExceptionPrompt("Disconnect before login prompt") + + print "logging in... %s" % self.host s.send("\r\n\r\n") try: # Testing Reboot ? 
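The DRAC change above wraps the pxssh login so that a connection dropped before the prompt is reported as ExceptionPrompt instead of escaping as an unhandled pexpect.EOF. A minimal sketch of that guard, assuming a pxssh-style client that (like the bundled transport) returns False on a failed login and raises pexpect.EOF on an early disconnect; the exception class and message mirror the patch:

    import pexpect
    from pexpect import pxssh

    class ExceptionPrompt(Exception):
        """Raised when the remote side disconnects before a login prompt appears."""

    def guarded_login(host, username, password, timeout=30):
        s = pxssh.pxssh()
        try:
            # EOF here means the PCU closed the connection before the
            # password/prompt exchange finished.
            if not s.login(host, username, password, login_timeout=timeout):
                raise ExceptionPrompt("login failed for %s" % host)
        except pexpect.EOF:
            raise ExceptionPrompt("Disconnect before login prompt")
        return s
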
@@ -148,11 +151,9 @@ def racadm_reboot(host, username, password, port, dryrun): print "RUNCMD: %s" % output if verbose: - logger.debug(output) + print output return 0 except Exception, err: - logger.debug("runcmd raised exception %s" % err) - if verbose: - logger.debug(err) - return err + print "runcmd raised exception %s" % err + return str(err) diff --git a/pcucontrol/models/HPiLO.py b/pcucontrol/models/HPiLO.py index 25d4331..78ceb0a 100644 --- a/pcucontrol/models/HPiLO.py +++ b/pcucontrol/models/HPiLO.py @@ -1,4 +1,5 @@ from pcucontrol.reboot import * +from distutils.sysconfig import get_python_lib; class HPiLO(PCUControl): supported_ports = [22,443] @@ -34,7 +35,7 @@ class HPiLO(PCUControl): locfg = command.CMD() - cmd_str = config.MONITOR_SCRIPT_ROOT + "/pcucontrol/models/hpilo/" + cmd_str = get_python_lib(1) + "/pcucontrol/models/hpilo/" cmd = cmd_str + "locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % ( self.host, cmd_str+"iloxml/Get_Network.xml", diff --git a/pcucontrol/models/IPAL.py b/pcucontrol/models/IPAL.py index 75668db..48394df 100644 --- a/pcucontrol/models/IPAL.py +++ b/pcucontrol/models/IPAL.py @@ -78,7 +78,9 @@ class IPAL(PCUControl): s.close() if e[0] == errno.ECONNREFUSED: # cannot connect to remote host - raise Exception(e[1]) + raise ExceptionNotFound(e[1]) + elif e[0] == errno.ETIMEDOUT: + raise ExceptionTimeout(e[1]) else: # TODO: what other conditions are there? raise Exception(e) @@ -90,7 +92,7 @@ class IPAL(PCUControl): print "Current status is '%s'" % ret if ret == '': - raise Exception("Status returned 'another session already open' %s : %s" % (node_port, ret)) + raise Exception("Status returned 'another session already open' on %s %s : %s" % (self.host, node_port, ret)) if node_port < len(ret): status = ret[node_port] @@ -100,10 +102,12 @@ class IPAL(PCUControl): elif status == '0': # down power_on = False + elif status == '6': + raise ExceptionPort("IPAL reported 'Cable Error' on %s socket %s : %s" % (self.host, node_port, ret)) else: - raise Exception("Unknown status for PCU socket %s : %s" % (node_port, ret)) + raise Exception("Unknown status for PCU %s socket %s : %s" % (self.host, node_port, ret)) else: - raise Exception("Mismatch between configured port and PCU status: %s %s" % (node_port, ret)) + raise Exception("Mismatch between configured port and PCU %s status: %s %s" % (self.host, node_port, ret)) if not dryrun: @@ -128,10 +132,12 @@ class IPAL(PCUControl): elif status == '0': # down power_on = False + elif status == '6': + raise ExceptionPort("IPAL reported 'Cable Error' on %s socket %s : %s" % (self.host, node_port, ret)) else: - raise Exception("Unknown status for PCU socket %s : %s" % (node_port, ret)) + raise Exception("Unknown status for PCU %s socket %s : %s" % (self.host, node_port, ret)) else: - raise Exception("Mismatch between configured port and PCU status: %s %s" % (node_port, ret)) + raise Exception("Mismatch between configured port and PCU %s status: %s %s" % (self.host, node_port, ret)) if power_on: return 0 diff --git a/pcucontrol/models/ePowerSwitch.py b/pcucontrol/models/ePowerSwitch.py index 7650689..edff5cc 100644 --- a/pcucontrol/models/ePowerSwitch.py +++ b/pcucontrol/models/ePowerSwitch.py @@ -50,14 +50,14 @@ class ePowerSwitchNew(PCUControl): req.add_header("Authorization", authheader) # add data to handler, f = urllib2.urlopen(req, data) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() except: import traceback; traceback.print_exc() # fetch url one more time on 
cmd.html, econtrol.html or whatever. # pass else: - if self.verbose: print f.read() + if self.transport.verbose: print f.read() return 0 @@ -74,12 +74,12 @@ class ePowerSwitchOld(PCUControl): # NOTE: it doesn't seem to matter whether this authinfo is here or not. transport = urllib2.build_opener(authinfo) f = transport.open(self.url) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() if not dryrun: transport = urllib2.build_opener(authhandler) f = transport.open(self.url + "cmd.html", "P%d=r" % node_port) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() self.transport.close() return 0 @@ -103,12 +103,12 @@ class ePowerSwitchOld(PCUControl): # NOTE: it doesn't seem to matter whether this authinfo is here or not. transport = urllib2.build_opener() f = transport.open(self.url + "elogin.html", "pwd=%s" % self.password) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() if not dryrun: transport = urllib2.build_opener(authhandler) f = transport.open(self.url + "econtrol.html", "P%d=r" % node_port) - if self.verbose: print f.read() + if self.transport.verbose: print f.read() # data= "P%d=r" % node_port #self.open(self.host, self.username, self.password) diff --git a/pcucontrol/models/intelamt/RemoteControlSample.cpp b/pcucontrol/models/intelamt/RemoteControlSample.cpp index c488b64..f12cab5 100644 --- a/pcucontrol/models/intelamt/RemoteControlSample.cpp +++ b/pcucontrol/models/intelamt/RemoteControlSample.cpp @@ -29,7 +29,7 @@ void DisplaySystemFirmwareCapabilities(uint32 systemFirmwareCapabilities); void DisplayOemDefinedCapabilities(uint32 OemDefinedCapabilities); bool ExecuteGetSystemPowerstate(Soap *server, bool verbose = true); bool ExecuteGetRemoteControlCapabilities(Soap *server, bool verbose = true); -bool ExecuteRemoteControl(Soap *server, bool default_val = false); +bool ExecuteRemoteControl(Soap *server, bool default_val = false, uint8 icommand=Reset); bool MainFlow(Soap *server,int option,bool verbose); bool ValidateOption(char *option, int *parameter); @@ -173,7 +173,13 @@ bool MainFlow(Soap *server, int option, bool verbose) { return status; } - if ((status = ExecuteRemoteControl(server,true)) == false) + /* Ensure that the machine is powered up before trying to + * 'reset' it, since a reset on a down node will fail. 
*/ + if ((status = ExecuteRemoteControl(server,true,PowerUp)) == false) + { + return status; + } + if ((status = ExecuteRemoteControl(server,true,Reset)) == false) { return status; } @@ -344,7 +350,7 @@ bool ExecuteGetRemoteControlCapabilities(Soap* server, bool verbose) * true - on success * false - on failure */ -bool ExecuteRemoteControl(Soap* server,bool def_values) +bool ExecuteRemoteControl(Soap* server,bool def_values, uint8 icommand) { int res; bool status = true; @@ -357,7 +363,7 @@ bool ExecuteRemoteControl(Soap* server,bool def_values) _rci__RemoteControlResponse response; // example values - uint8 *command = new uint8(Reset); + uint8 *command = new uint8(icommand); uint32 *ianaOemNumber = new uint32(IntelIanaNumber); uint8 *specialCommand = NULL; //none uint16 *oemParameter = NULL; //none diff --git a/pcucontrol/reboot.py b/pcucontrol/reboot.py index 9d171a2..5744141 100755 --- a/pcucontrol/reboot.py +++ b/pcucontrol/reboot.py @@ -11,13 +11,12 @@ import urllib2 import urllib import threading, popen2 import array, struct -from monitor.wrapper import plc import base64 from subprocess import PIPE, Popen import pcucontrol.transports.ssh.pxssh as pxssh import pcucontrol.transports.ssh.pexpect as pexpect import socket -from monitor.util import command + # Use our versions of telnetlib and pyssh @@ -25,8 +24,6 @@ sys.path.insert(0, os.path.dirname(sys.argv[0])) import pcucontrol.transports.telnetlib as telnetlib sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh") import pcucontrol.transports.pyssh as pyssh -from monitor import config - # Event class ID from pcu events #NODE_POWER_CONTROL = 3 @@ -35,7 +32,6 @@ from monitor import config #MONITOR_USER_ID = 11142 import logging -logger = logging.getLogger("monitor") verbose = 1 #dryrun = 0; @@ -135,7 +131,7 @@ class Transport: transport.set_debuglevel(self.verbose) if username is not None: self.transport = transport - self.transport.ifThenSend(prompt, username, ExceptionUsername) + self.ifThenSend(prompt, username, ExceptionUsername) elif self.type == self.SSH: if username is not None: @@ -206,7 +202,7 @@ class Transport: print r except urllib2.URLError,err: - logger.info('Could not open http connection', err) + print 'Could not open http connection', err return "http transport error" return 0 @@ -255,17 +251,25 @@ class PCUControl(PCUModel,PCURecord): def reboot(self, node_port, dryrun): port_list = [] + # There are two sources of potential ports. Those that are open and + # those that are part of the PCU's supported_ports. + # I think we should start with supported_ports and then filter that + # by the open ports. 
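The comment in PCUControl.reboot above spells out the port-selection strategy: start from the model's supported_ports, keep only the ports the scan reported open, and fail if nothing is left. A standalone sketch of that filtering, with porttypemap and the port_status layout taken from the patch (the helper name and exception type are illustrative):

    # Port -> transport name, as in Transport.porttypemap.
    porttypemap = {22: 'ssh', 23: 'telnet', 80: 'http', 443: 'https'}

    def select_ports(supported_ports, port_status=None):
        """Return (port, transport) pairs to try, in supported-port order."""
        ports = list(supported_ports)
        if port_status:
            open_ports = [int(p) for p, state in port_status.items() if state == "open"]
            # keep only the open ports that are also supported
            ports = [p for p in ports if p in open_ports]
        if not ports:
            raise RuntimeError("No Open Port: No transport from open ports")
        return [(p, porttypemap[p]) for p in ports if p in porttypemap]

    # select_ports([22, 23, 443], {'22': 'open', '80': 'open'}) -> [(22, 'ssh')]
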
+ + port_list = self.supported_ports + if hasattr(self, 'port_status') and self.port_status: + # get out the open ports port_list = filter(lambda x: self.port_status[x] == "open" , self.port_status.keys()) port_list = [ int(x) for x in port_list ] + # take only the open ports that are supported_ports + port_list = filter(lambda x: x in self.supported_ports, port_list) if port_list == []: - raise ExceptionPort("Unsupported Port: No transport from open ports") - else: - port_list = self.supported_ports + raise ExceptionPort("No Open Port: No transport from open ports") print port_list - ret = "could not run" + ret = "No implementation for open ports on selected PCU model" for port in port_list: if port not in Transport.porttypemap: continue @@ -273,7 +277,9 @@ class PCUControl(PCUModel,PCURecord): type = Transport.porttypemap[port] self.transport = Transport(type, verbose) + print "checking for run_%s" % type if hasattr(self, "run_%s" % type): + print "found run_%s" % type fxn = getattr(self, "run_%s" % type) ret = self.catcherror(fxn, node_port, dryrun) if ret == 0: # NOTE: success!, so stop @@ -316,14 +322,16 @@ class PCUControl(PCUModel,PCURecord): except urllib2.URLError, err: return "URLError: " + str(err) except EOFError, err: - if self.verbose: - logger.debug("reboot: EOF") - logger.debug(err) self.transport.close() import traceback traceback.print_exc() return "EOF connection reset" + str(err) + except Exception, err: + from monitor.common import email_exception + email_exception(self.host) + raise Exception(err) +from pcucontrol.util import command from pcucontrol.models import * def pcu_name(pcu): @@ -334,73 +342,6 @@ def pcu_name(pcu): else: return None -def get_pcu_values(pcu_id): - from monitor.database.info.model import FindbadPCURecord - print "pcuid: %s" % pcu_id - try: - pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id).first() - if pcurec: - values = pcurec.to_dict() - else: - values = None - except: - values = None - - return values - -def reboot(nodename): - return reboot_policy(nodename, True, False) - -def reboot_str(nodename): - global verbose - continue_probe = True - dryrun=False - - pcu = plc.getpcu(nodename) - if not pcu: - logger.debug("no pcu for %s" % nodename) - print "no pcu for %s" % nodename - return False # "%s has no pcu" % nodename - - values = get_pcu_values(pcu['pcu_id']) - if values == None: - logger.debug("No values for pcu probe %s" % nodename) - print "No values for pcu probe %s" % nodename - return False #"no info for pcu_id %s" % pcu['pcu_id'] - - # Try the PCU first - logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model'])) - - ret = reboot_test_new(nodename, values, verbose, dryrun) - return ret - -def reboot_policy(nodename, continue_probe, dryrun): - global verbose - - pcu = plc.getpcu(nodename) - if not pcu: - logger.debug("no pcu for %s" % nodename) - print "no pcu for %s" % nodename - return False # "%s has no pcu" % nodename - - values = get_pcu_values(pcu['pcu_id']) - if values == None: - logger.debug("No values for pcu probe %s" % nodename) - print "No values for pcu probe %s" % nodename - return False #"no info for pcu_id %s" % pcu['pcu_id'] - - # Try the PCU first - logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model'])) - - ret = reboot_test_new(nodename, values, verbose, dryrun) - - if ret != 0: - print ret - return False - else: - print "return true" - return True - class Unknown(PCUControl): supported_ports = [22,23,80,443,5869,9100,16992] @@ -435,7 +376,7 @@ def model_to_object(modelname): print 
"UNKNOWN model %s"%modelname return Unknown -def reboot_api(node, pcu): #, verbose, dryrun): +def reboot_api(node, pcu): rb_ret = "" try: @@ -452,19 +393,68 @@ def reboot_api(node, pcu): #, verbose, dryrun): rb_ret = "No modelname in PCU record." # TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults except Exception, err: - rb_ret = str(err) + rb_ret = "Exception Model(%s): " % modelname + rb_ret += str(err) return rb_ret +def convert_oldmodelname_to_newmodelname(oldmodelname, pcu_id): + newmodelname = None + update = { 'AP79xx' : 'APCControl13p13', + 'Masterswitch' : 'APCControl13p13', + 'DS4-RPC' : 'BayTech', + 'IP-41x_IP-81x' : 'IPAL', + 'DRAC3' : 'DRAC', + 'DRAC4' : 'DRAC', + 'ePowerSwitch' : 'ePowerSwitchOld', + 'ilo2' : 'HPiLO', + 'ilo1' : 'HPiLO', + 'PM211-MIP' : 'PM211MIP', + 'AMT2.5' : 'IntelAMT', + 'AMT3.0' : 'IntelAMT', + 'WTI_IPS-4' : 'WTIIPS4', + 'unknown' : 'ManualPCU', + 'DRAC5' : 'DRAC', + 'ipmi' : 'OpenIPMI', + 'bbsemaverick' : 'BlackBoxPSMaverick', + 'manualadmin' : 'ManualPCU', + } + + if oldmodelname in update: + newmodelname = update[oldmodelname] + else: + newmodelname = oldmodelname + + if pcu_id in [1102,1163,1055,1111,1231,1113,1127,1128,1148]: + newmodelname = 'APCControl12p3' + elif pcu_id in [1110,86]: + newmodelname = 'APCControl1p4' + elif pcu_id in [1221,1225,1220,1192]: + newmodelname = 'APCControl121p3' + elif pcu_id in [1173,1240,47,1363,1405,1401,1372,1371]: + newmodelname = 'APCControl121p1' + elif pcu_id in [1056,1237,1052,1209,1002,1008,1013,1022]: + newmodelname = 'BayTechCtrlC' + elif pcu_id in [93]: + newmodelname = 'BayTechRPC3NC' + elif pcu_id in [1057]: + newmodelname = 'BayTechCtrlCUnibe' + elif pcu_id in [1012]: + newmodelname = 'BayTechRPC16' + elif pcu_id in [1089, 1071, 1046, 1035, 1118]: + newmodelname = 'ePowerSwitchNew' + + return newmodelname + def reboot_test_new(nodename, values, verbose, dryrun): rb_ret = "" if 'plc_pcu_stats' in values: values.update(values['plc_pcu_stats']) try: - modelname = values['model'] + modelname = convert_oldmodelname_to_newmodelname(values['model'], values['pcu_id']) if modelname: - object = eval('%s(values, verbose, ["22", "23", "80", "443", "9100", "16992", "5869"])' % modelname) + object = eval('%s(values, verbose)' % modelname) rb_ret = object.reboot(values[nodename], dryrun) else: rb_ret = "Not_Run" @@ -477,34 +467,7 @@ def reboot_test_new(nodename, values, verbose, dryrun): return rb_ret def main(): - logger.setLevel(logging.DEBUG) - ch = logging.StreamHandler() - ch.setLevel(logging.DEBUG) - formatter = logging.Formatter('LOGGER - %(message)s') - ch.setFormatter(formatter) - logger.addHandler(ch) - - try: - if "test" in sys.argv: - dryrun = True - else: - dryrun = False - - for node in sys.argv[1:]: - if node == "test": continue - - print "Rebooting %s" % node - if reboot_policy(node, True, dryrun): - print "success" - else: - print "failed" - except Exception, err: - import traceback; traceback.print_exc() - print err + print "this does not work." 
if __name__ == '__main__': - logger = logging.getLogger("monitor") main() - f = open("/tmp/rebootlog", 'a') - f.write("reboot %s\n" % sys.argv) - f.close() diff --git a/pcucontrol/util/__init__.py b/pcucontrol/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/monitor/util/command.py b/pcucontrol/util/command.py similarity index 71% rename from monitor/util/command.py rename to pcucontrol/util/command.py index da7ddae..47627b4 100644 --- a/monitor/util/command.py +++ b/pcucontrol/util/command.py @@ -4,10 +4,12 @@ import subprocess import signal import time import traceback +import fcntl DEBUG= 0 class ExceptionTimeout(Exception): pass +class ExceptionReadTimeout(Exception): pass COMMAND_TIMEOUT = 60 ssh_options = { 'StrictHostKeyChecking':'no', 'BatchMode':'yes', @@ -15,15 +17,47 @@ ssh_options = { 'StrictHostKeyChecking':'no', 'ConnectTimeout':'%s' % COMMAND_TIMEOUT} class Sopen(subprocess.Popen): - def kill(self, signal = signal.SIGTERM): - os.kill(self.pid, signal) + def kill(self, sig = signal.SIGTERM): + try: + # NOTE: this also kills parent... so doesn't work like I want. + # NOTE: adding 'exec' before the cmd removes the extra sh, and + # partially addresses this problem. + #os.killpg(os.getpgid(self.pid), signal.SIGKILL) + os.kill(self.pid, sig) + except OSError: + # no such process, due to it already exiting... + pass + + +def read_t(stream, count=1, timeout=COMMAND_TIMEOUT*2): + if count == 1: + retstr = "" + + while True: + lin, lout, lerr = select([stream], [], [], timeout) + if len(lin) == 0: + print "timeout!" + raise ExceptionReadTimeout("TIMEOUT reading from command") -def read_t(stream, count, timeout=COMMAND_TIMEOUT*2): - lin, lout, lerr = select([stream], [], [], timeout) - if len(lin) == 0: - raise ExceptionTimeout("TIMEOUT Running: %s" % cmd) + try: + outbytes = stream.read(count) + except IOError, err: + print 'no content yet.' + # due to no content. + # the select timeout should catch this. + continue - return stream.read(count) + if not outbytes: + break + retstr += outbytes + + return retstr + else: + lin, lout, lerr = select([stream], [], [], timeout) + if len(lin) == 0: + raise ExceptionReadTimeout("TIMEOUT reading from command") + + return stream.read(count) class CMD: def __init__(self): @@ -31,12 +65,21 @@ class CMD: def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2): - #print "CMD.run_noexcept(%s)" % cmd try: return CMD.run(self,cmd,timeout) except ExceptionTimeout: print traceback.print_exc() - return ("", "SCRIPTTIMEOUT") + return ("", "ScriptTimeout") + except ExceptionReadTimeout: + print traceback.print_exc() + return ("", "RunningScriptTimeout") + except KeyboardInterrupt: + print "Interrupted, exiting..." + sys.exit(1) + except Exception, err: + from monitor.common import email_exception + email_exception() + return ("", str(err)) def system(self, cmd, timeout=COMMAND_TIMEOUT*2): (o,e) = self.run(cmd, timeout) @@ -48,16 +91,13 @@ class CMD: def run(self, cmd, timeout=COMMAND_TIMEOUT*2): - #print "CMD.run(%s)" % cmd s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) self.s = s (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr) - #print "calling select(%s)" % timeout lout, lin, lerr = select([f_out], [], [f_err], timeout) - #print "TIMEOUT!!!!!!!!!!!!!!!!!!!" if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0: # Reached a timeout! Nuke process so it does not hang. - #print "KILLING" + print "TIMEOUT!!!!!!!!!!!!!!!!!!!" 
s.kill(signal.SIGKILL) raise ExceptionTimeout("TIMEOUT Running: %s" % cmd) else: @@ -68,28 +108,26 @@ class CMD: o_value = "" e_value = "" - o_value = f_out.read() + #o_value = f_out.read() + flags = fcntl.fcntl(f_out, fcntl.F_GETFL) + fcntl.fcntl(f_out, fcntl.F_SETFL, flags | os.O_NONBLOCK) + + try: + o_value = read_t(f_out,1,30) + except ExceptionReadTimeout: + s.kill(signal.SIGKILL) + raise ExceptionReadTimeout("TIMEOUT: failed to read from cmd: %s" % cmd) + e_value = f_err.read() - #print "striping output" o_value = o_value.strip() e_value = e_value.strip() - #print "OUTPUT -%s-%s-" % (o_value, e_value) - - #print "closing files" f_out.close() f_in.close() f_err.close() - try: - #print "s.kill()" - s.kill() - #print "after s.kill()" - except OSError: - # no such process, due to it already exiting... - pass + s.kill(signal.SIGKILL) - #print o_value, e_value return (o_value, e_value) def runargs(self, args, timeout=COMMAND_TIMEOUT*2): @@ -114,11 +152,7 @@ class CMD: f_out.close() f_in.close() f_err.close() - try: - s.kill() - except OSError: - # no such process, due to it already exiting... - pass + s.kill(signal.SIGKILL) return (o_value, e_value) @@ -161,17 +195,10 @@ class SSH(CMD): return CMD.run_noexcept(self, cmd) def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2): - cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), + cmd = "exec ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), self.user, self.host, cmd) - #print "SSH.run_noexcept2(%s)" % cmd + #print cmd r = CMD.run_noexcept(self, cmd, timeout) - - # XXX: this may be resulting in deadlocks... not sure. - #if self.s.returncode is None: - # #self.s.kill() - # self.s.kill(signal.SIGKILL) - # self.s.wait() - # self.ret = self.s.returncode self.ret = -1 return r diff --git a/policy.py b/policy.py new file mode 100755 index 0000000..4befbd9 --- /dev/null +++ b/policy.py @@ -0,0 +1,237 @@ +#!/usr/bin/python + +# This script is used to manipulate the operational state of nodes in +# different node groups. These are basically set operations on nodes via the +# PLC api. +# +# Take the ng name as an argument.... +# optionally, +# * get a list of nodes in the given nodegroup. +# * set some or all in the set to rins. +# * restart them all. +# * do something else to them all. +# + +import os +import time +import traceback +import sys +from optparse import OptionParser + +from monitor import config +from monitor import parser as parsermodule +from monitor.common import * +from monitor.model import * +from monitor.wrapper import plc +from monitor.wrapper import plccache +from monitor.database.info.model import * +from monitor.database.info.interface import * + +from nodequery import verify,query_to_dict,node_select + +api = plc.getAuthAPI() + +def logic(): + + plc.nodeBootState(host, 'rins') + node_end_record(host) + +def main(hostnames, sitenames): + # commands: + i = 1 + node_count = 1 + site_count = 1 + #print "hosts: %s" % hostnames + for i,host in enumerate(hostnames): + try: + lb = plccache.plcdb_hn2lb[host] + except: + print "unknown host in plcdb_hn2lb %s" % host + continue + + nodeblack = BlacklistRecord.get_by(hostname=host) + + if nodeblack and not nodeblack.expired(): + print "skipping %s due to blacklist. 
will expire %s" % (host, nodeblack.willExpire() ) + continue + + sitehist = SiteInterface.get_or_make(loginbase=lb) + + recent_actions = sitehist.getRecentActions(hostname=host) + + nodehist = HistoryNodeRecord.findby_or_create(hostname=host) + + print "%s %s %s" % (i, nodehist.hostname, nodehist.status) + if nodehist.status == 'good' and \ + changed_lessthan(nodehist.last_changed, 1.0) and \ + not found_within(recent_actions, 'online_notice', 0.5): + # NOTE: there is a narrow window in which this command must be + # evaluated, otherwise the notice will not go out. this is not ideal. + sitehist.sendMessage('online_notice', hostname=host, viart=False) + print "send message for host %s online" % host + + pass + + if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \ + changed_greaterthan(nodehist.last_changed,1.0) and \ + not found_between(recent_actions, 'first_try_reboot', 3.5, 1): + + sitehist.attemptReboot(host) + print "send message for host %s first_try_reboot" % host + pass + + # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1) + # will be false for a day after the above condition is satisfied + if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \ + changed_greaterthan(nodehist.last_changed,1.5) and \ + found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \ + not found_within(recent_actions, 'pcufailed_notice', 3.5): + # found_within(recent_actions, 'first_try_reboot', 3.5) and \ + + # send pcu failure message + #act = ActionRecord(**kwargs) + sitehist.sendMessage('pcufailed_notice', hostname=host) + print "send message for host %s PCU Failure" % host + pass + + if nodehist.status == 'monitordebug' and \ + changed_greaterthan(nodehist.last_changed, 1) and \ + not found_between(recent_actions, 'bootmanager_restore', 0.5, 0): + # send down node notice + # delay 0.5 days before retrying... + + print "send message for host %s bootmanager_restore" % host + sitehist.runBootManager(host) + # sitehist.sendMessage('retry_bootman', hostname=host) + + if nodehist.status == 'down' and \ + changed_greaterthan(nodehist.last_changed, 2) and \ + not found_within(recent_actions, 'down_notice', 3.5): + # send down node notice + + sitehist.sendMessage('down_notice', hostname=host) + print "send message for host %s down" % host + pass + + node_count = node_count + 1 + session.flush() + + for i,site in enumerate(sitenames): + sitehist = SiteInterface.get_or_make(loginbase=site) + siteblack = BlacklistRecord.get_by(loginbase=site) + + if siteblack and not siteblack.expired(): + print "skipping %s due to blacklist. will expire %s" % (site, siteblack.willExpire() ) + continue + + # TODO: make query only return records within a certin time range, + # i.e. greater than 0.5 days ago. or 5 days, etc. 
+ recent_actions = sitehist.getRecentActions(loginbase=site) + + print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status) + if sitehist.db.status == 'down': + if not found_within(recent_actions, 'pause_penalty', 30) and \ + not found_within(recent_actions, 'increase_penalty', 7) and \ + changed_greaterthan(sitehist.db.last_changed, 7): + + # TODO: catch errors + sitehist.increasePenalty() + #sitehist.applyPenalty() + sitehist.sendMessage('increase_penalty') + + print "send message for site %s penalty increase" % site + + if sitehist.db.status == 'good': + # clear penalty + # NOTE: because 'all clear' should have an indefinite status, we + # have a boolean value rather than a 'recent action' + if sitehist.db.penalty_applied: + # send message that penalties are cleared. + + sitehist.clearPenalty() + #sitehist.applyPenalty() + sitehist.sendMessage('clear_penalty') + sitehist.closeTicket() + + print "send message for site %s penalty cleared" % site + + # find all ticket ids for site ( could be on the site record? ) + # determine if there are penalties within the last 30 days? + # if so, add a 'pause_penalty' action. + if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0: + # pause escalation + print "Pausing penalties for %s" % site + sitehist.pausePenalty() + + site_count = site_count + 1 + + session.flush() + + session.flush() + return + + +if __name__ == "__main__": + parser = parsermodule.getParser(['nodesets']) + parser.set_defaults( timewait=0, + skip=0, + rins=False, + reboot=False, + findbad=False, + force=False, + nosetup=False, + verbose=False, + quiet=False,) + + parser.add_option("", "--stopselect", dest="stopselect", metavar="", + help="The select string that must evaluate to true for the node to be considered 'done'") + parser.add_option("", "--findbad", dest="findbad", action="store_true", + help="Re-run findbad on the nodes we're going to check before acting.") + parser.add_option("", "--force", dest="force", action="store_true", + help="Force action regardless of previous actions/logs.") + parser.add_option("", "--rins", dest="rins", action="store_true", + help="Set the boot_state to 'rins' for all nodes.") + parser.add_option("", "--reboot", dest="reboot", action="store_true", + help="Actively try to reboot the nodes, keeping a log of actions.") + + parser.add_option("", "--verbose", dest="verbose", action="store_true", + help="Extra debug output messages.") + parser.add_option("", "--nosetup", dest="nosetup", action="store_true", + help="Do not perform the orginary setup phase.") + parser.add_option("", "--skip", dest="skip", + help="Number of machines to skip on the input queue.") + parser.add_option("", "--timewait", dest="timewait", + help="Minutes to wait between iterations of 10 nodes.") + + parser = parsermodule.getParser(['defaults'], parser) + config = parsermodule.parse_args(parser) + + fbquery = HistoryNodeRecord.query.all() + hostnames = [ n.hostname for n in fbquery ] + + fbquery = HistorySiteRecord.query.all() + sitenames = [ s.loginbase for s in fbquery ] + + if config.site: + # TODO: replace with calls to local db. the api fails so often that + # these calls should be regarded as unreliable. 
+ l_nodes = plccache.GetNodesBySite(config.site) + filter_hostnames = [ n['hostname'] for n in l_nodes ] + + hostnames = filter(lambda x: x in filter_hostnames, hostnames) + sitenames = [config.site] + + if config.node: + hostnames = [ config.node ] + sitenames = [ plccache.plcdb_hn2lb[config.node] ] + + try: + main(hostnames, sitenames) + except KeyboardInterrupt: + print "Killed by interrupt" + session.flush() + sys.exit(0) + except: + #email_exception() + print traceback.print_exc(); + print "fail all..." diff --git a/setup.py b/setup.py index 19532fa..f9cb03a 100644 --- a/setup.py +++ b/setup.py @@ -2,13 +2,17 @@ from distutils.core import setup -packages=['monitor', 'monitor.database', 'monitor.database.zabbixapi', - 'monitor.database.info', 'monitor.sources', - 'monitor.util', 'monitor.wrapper' ] +packages=[ 'monitor', + 'monitor.database', + 'monitor.database.zabbixapi', + 'monitor.database.info', + 'monitor.sources', + 'monitor.util', + 'monitor.wrapper' ] print packages setup(name='MonitorModule', - version='1.1', + version='2.0', description='Monitor Utility Module', author='Stephen Soltesz', author_email='soltesz@cs.princeton.edu', @@ -17,6 +21,7 @@ setup(name='MonitorModule', ) packages=['pcucontrol', + 'pcucontrol.util', 'pcucontrol.transports', 'pcucontrol.transports.ssh', 'pcucontrol.transports.pyssh', @@ -31,7 +36,7 @@ packages=['pcucontrol', # TODO: add data dir for intelamt and hpilo stuff print packages setup(name='PCUControlModule', - version='1.1', + version='2.0', description='PCU Control Module', author='Stephen Soltesz', author_email='soltesz@cs.princeton.edu', diff --git a/sitebad.py b/sitebad.py index f8524f0..4d9ee33 100755 --- a/sitebad.py +++ b/sitebad.py @@ -7,10 +7,9 @@ import time from datetime import datetime,timedelta from monitor import database -from pcucontrol import reboot from monitor import parser as parsermodule from monitor import config -from monitor.database.info.model import HistorySiteRecord, FindbadNodeRecord, session +from monitor.database.info.model import HistorySiteRecord, HistoryNodeRecord, session, BlacklistRecord from monitor.wrapper import plc, plccache from monitor.const import MINUP @@ -29,6 +28,8 @@ def main2(config): if config.site: l_sites = [config.site] + elif config.node: + l_sites = [plccache.plcdb_hn2lb[config.node]] elif config.sitelist: site_list = config.sitelist.split(',') l_sites = site_list @@ -37,33 +38,55 @@ def main2(config): checkAndRecordState(l_sites, l_plcsites) -def getnewsite(nodelist): - new = True - for node in nodelist: - try: - noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first() - if noderec is not None and \ - noderec.plc_node_stats['last_contact'] != None: - new = False - except: - import traceback - print traceback.print_exc() - return new - def getnodesup(nodelist): + # NOTE : assume that a blacklisted node is fine, since we're told not to + # ignore it, no policy actions should be taken for it. 
up = 0 for node in nodelist: try: - noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first() - #noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==node['hostname'], - # orderBy='date_checked').reversed()[0] - if noderec is not None and noderec.observed_status == "BOOT": + nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname']) + nodebl = BlacklistRecord.get_by(hostname=node['hostname']) + if (nodehist is not None and nodehist.status != 'down') or \ + (nodebl is not None and not nodebl.expired()): up = up + 1 except: import traceback print traceback.print_exc() return up +def check_site_state(rec, sitehist): + + if sitehist.new and sitehist.status not in ['new', 'online', 'good']: + sitehist.status = 'new' + sitehist.penalty_applied = True # because new sites are disabled by default, i.e. have a penalty. + sitehist.last_changed = datetime.now() + + if sitehist.nodes_up >= MINUP: + + if sitehist.status != 'online' and sitehist.status != 'good': + sitehist.last_changed = datetime.now() + + if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'online': + print "changed status from %s to online" % sitehist.status + sitehist.status = 'online' + + if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'good': + print "changed status from %s to good" % sitehist.status + sitehist.status = 'good' + + elif not sitehist.new: + + if sitehist.status != 'offline' and sitehist.status != 'down': + sitehist.last_changed = datetime.now() + + if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'offline': + print "changed status from %s to offline" % sitehist.status + sitehist.status = 'offline' + + if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'down': + print "changed status from %s to down" % sitehist.status + sitehist.status = 'down' + def checkAndRecordState(l_sites, l_plcsites): count = 0 lb2hn = plccache.plcdb_lb2hn @@ -77,27 +100,32 @@ def checkAndRecordState(l_sites, l_plcsites): continue if sitename in lb2hn: - pf = HistorySiteRecord.findby_or_create(loginbase=sitename) - - pf.last_checked = datetime.now() - pf.slices_total = d_site['max_slices'] - pf.slices_used = len(d_site['slice_ids']) - pf.nodes_total = len(lb2hn[sitename]) - pf.nodes_up = getnodesup(lb2hn[sitename]) - pf.new = getnewsite(lb2hn[sitename]) - pf.enabled = d_site['enabled'] - - if pf.nodes_up >= MINUP: - if pf.status != "good": pf.last_changed = datetime.now() - pf.status = "good" - else: - if pf.status != "down": pf.last_changed = datetime.now() - pf.status = "down" + sitehist = HistorySiteRecord.findby_or_create(loginbase=sitename, + if_new_set={'status' : 'unknown', + 'last_changed' : datetime.now(), + 'message_id': 0, + 'penalty_level' : 0}) + sitehist.last_checked = datetime.now() + + sitehist.slices_total = d_site['max_slices'] + sitehist.slices_used = len(d_site['slice_ids']) + sitehist.nodes_total = len(lb2hn[sitename]) + if sitehist.message_id != 0: + rtstatus = mailer.getTicketStatus(sitehist.message_id) + sitehist.message_status = rtstatus['Status'] + sitehist.message_queue = rtstatus['Queue'] + sitehist.message_created = datetime.fromtimestamp(rtstatus['Created']) + + sitehist.nodes_up = getnodesup(lb2hn[sitename]) + sitehist.new = changed_lessthan(datetime.fromtimestamp(d_site['date_created']), 30) # created < 30 days ago + sitehist.enabled = d_site['enabled'] + + check_site_state(d_site, sitehist) count += 1 - print "%d 
%15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used, - pf.nodes_total, pf.nodes_up, pf.status) - pf.flush() + print "%d %15s slices(%2s) nodes(%2s) notdown(%2s) %s" % (count, sitename, sitehist.slices_used, + sitehist.nodes_total, sitehist.nodes_up, sitehist.status) + sitehist.flush() print HistorySiteRecord.query.count() session.flush() diff --git a/siteinfo.py b/siteinfo.py index cfce458..4b4daf7 100755 --- a/siteinfo.py +++ b/siteinfo.py @@ -4,7 +4,6 @@ from monitor.wrapper import plc api = plc.getAuthAPI() from monitor import database -from pcucontrol import reboot import time from monitor.common import * @@ -63,7 +62,7 @@ def plc_print_siteinfo(plcsite): diff_time(plcsite['last_updated'])) print "" - nodes = api.GetNodes(plcsite['node_ids']) + nodes = plccache.GetNodesByIds(plcsite['node_ids']) print " Checked: %s" % time.ctime() print "\t host | state | obs | created | updated | last_contact " for plcnode in nodes: @@ -80,7 +79,7 @@ act_all = database.dbLoad("act_all") for site in config.args: config.site = site - plc_siteinfo = api.GetSites({'login_base': config.site})[0] + plc_siteinfo = plccache.GetSitesByName([config.site]) url = "https://www.planet-lab.org/db/sites/index.php?site_pattern=" plc_siteinfo['url'] = url + plc_siteinfo['login_base'] @@ -88,7 +87,7 @@ for site in config.args: # rerun findbad with the nodes in the given nodes. import os file = "findbad.txt" - nodes = api.GetNodes(plc_siteinfo['node_ids'], ['hostname']) + nodes = plccache.GetNodesByIds(plc_siteinfo['node_ids']) nodes = [ n['hostname'] for n in nodes ] util.file.setFileFromList(file, nodes) os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file) diff --git a/testapi.py b/testapi.py index f473d4b..d60effb 100755 --- a/testapi.py +++ b/testapi.py @@ -16,5 +16,5 @@ try: network = api.GetNodeNetworks(node['nodenetwork_ids']) print "ok" except: - sys.stderr.write(traceback.print_exc()) + sys.stderr.write(traceback.format_exc()) print "fail" diff --git a/nodenetwork.py b/tests/nodenetwork.py similarity index 100% rename from nodenetwork.py rename to tests/nodenetwork.py diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index bb0580b..1c4efe9 100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -11,15 +11,17 @@ from monitor.database.info.model import * from monitor.database.zabbixapi.model import * from monitor.database.dborm import zab_session as session from monitor.database.dborm import zab_metadata as metadata +from monitor_xmlrpc import MonitorXmlrpcServer + +from monitor import reboot +from monitor import scanapi -from pcucontrol import reboot from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb from monitor.wrapper.plccache import plcdb_lb2hn as site_lb2hn from monitorweb.templates.links import * -from monitor import scanapi def query_to_dict(query): @@ -103,7 +105,7 @@ class NodeWidget(widgets.Widget): def prep_node_for_display(node): if node.plc_pcuid: - pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first() + pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid) if pcu: node.pcu_status = pcu.reboot_trial_status node.pcu_short_status = format_pcu_shortstatus(pcu) @@ -132,6 +134,10 @@ def prep_node_for_display(node): if node.loginbase: node.site = HistorySiteRecord.by_loginbase(node.loginbase) + if node.site is None: + # TODO: need a cleaner fix 
for this... + node.site = HistorySiteRecord.by_loginbase("pl") + node.history = HistoryNodeRecord.by_hostname(node.hostname) @@ -144,7 +150,7 @@ def prep_node_for_display(node): -class Root(controllers.RootController): +class Root(controllers.RootController, MonitorXmlrpcServer): @expose(template="monitorweb.templates.welcome") def index(self): import time @@ -161,48 +167,84 @@ class Root(controllers.RootController): prep_node_for_display(node) nodequery += [node] - return self.pcuview(None, hostname) # dict(nodequery=nodequery) + return self.pcuview(None, None, hostname) # dict(nodequery=nodequery) @expose(template="monitorweb.templates.nodelist") - def node(self, filter='BOOT'): + def node(self, filter='boot'): import time fbquery = FindbadNodeRecord.get_all_latest() query = [] - filtercount = {'DOWN' : 0, 'BOOT': 0, 'DEBUG' : 0, 'neverboot' : 0, 'pending' : 0, 'all' : 0} + filtercount = {'down' : 0, 'boot': 0, 'debug' : 0, 'diagnose' : 0, 'disabled': 0, + 'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0} for node in fbquery: # NOTE: reformat some fields. prep_node_for_display(node) - # NOTE: count filters - if node.observed_status != 'DOWN': - filtercount[node.observed_status] += 1 - else: + node.history.status + + if node.history.status in ['down', 'offline']: if node.plc_node_stats and node.plc_node_stats['last_contact'] != None: - filtercount[node.observed_status] += 1 + filtercount['down'] += 1 else: filtercount['neverboot'] += 1 + elif node.history.status in ['good', 'online']: + filtercount['boot'] += 1 + elif node.history.status in ['debug', 'monitordebug']: + filtercount['debug'] += 1 + else: + filtercount[node.history.status] += 1 + + ## NOTE: count filters + #if node.observed_status != 'DOWN': + # print node.hostname, node.observed_status + # if node.observed_status == 'DEBUG': + # if node.plc_node_stats['boot_state'] in ['debug', 'diagnose', 'disabled']: + # filtercount[node.plc_node_stats['boot_state']] += 1 + # else: + # filtercount['debug'] += 1 + # + # else: + # filtercount[node.observed_status] += 1 + #else: + # if node.plc_node_stats and node.plc_node_stats['last_contact'] != None: + # filtercount[node.observed_status] += 1 + # else: + # filtercount['neverboot'] += 1 # NOTE: apply filter - if filter == node.observed_status: - if filter == "DOWN": - if node.plc_node_stats['last_contact'] != None: - query.append(node) - else: - query.append(node) - elif filter == "neverboot": + if filter == "neverboot": if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None: query.append(node) - elif filter == "pending": - # TODO: look in message logs... - pass elif filter == "all": query.append(node) + elif filter == node.history.status: + query.append(node) + elif filter == 'boot': + query.append(node) + + #if filter == node.observed_status: + # if filter == "DOWN": + # if node.plc_node_stats['last_contact'] != None: + # query.append(node) + # else: + # query.append(node) + #elif filter == "neverboot": + # if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None: + # query.append(node) + #elif filter == "pending": + # # TODO: look in message logs... 
+ # pass + #elif filter == node.plc_node_stats['boot_state']: + # query.append(node) + #elif filter == "all": + # query.append(node) widget = NodeWidget(template='monitorweb.templates.node_template') return dict(now=time.ctime(), query=query, fc=filtercount, nodewidget=widget) def nodeaction_handler(self, tg_exceptions=None): """Handle any kind of error.""" + print "NODEACTION_HANDLER------------------" if 'pcuid' in request.params: pcuid = request.params['pcuid'] @@ -217,7 +259,7 @@ class Root(controllers.RootController): if 'pcuid' in val: pcuid = val['pcuid'] elif 'hostname' in val: - pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).first().plc_pcuid + pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).plc_pcuid else: pcuid=None else: @@ -231,6 +273,7 @@ class Root(controllers.RootController): return self.pcuview(None, pcuid, **dict(exceptions=tg_exceptions)) def nodeaction(self, **data): + print "NODEACTION------------------" for item in data.keys(): print "%s %s" % ( item, data[item] ) @@ -254,7 +297,7 @@ class Root(controllers.RootController): ret = reboot.reboot_str(str(hostname)) print ret if ret: raise RuntimeError("Error using PCU: " + str(ret)) - flash("Reboot appeared to work. All at most 5 minutes. Run ExternalScan to check current status.") + flash("Reboot appeared to work. Allow at most 5 minutes. Then run ExternalScan to check current status.") elif action == "ExternalScan": scanapi.externalprobe(str(hostname)) @@ -271,9 +314,12 @@ class Root(controllers.RootController): @expose(template="monitorweb.templates.pcuview") @exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)") def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data): + print "PCUVIEW------------------" + session.clear() sitequery=[] pcuquery=[] nodequery=[] + actions=[] exceptions = None for key in data: @@ -286,15 +332,19 @@ class Root(controllers.RootController): exceptions = data['exceptions'] if loginbase: + actions = ActionRecord.query.filter_by(loginbase=loginbase + ).filter(ActionRecord.date_created >= datetime.now() - timedelta(7) + ).order_by(ActionRecord.date_created.desc()) + actions = [ a for a in actions ] sitequery = [HistorySiteRecord.by_loginbase(loginbase)] pcus = {} for plcnode in site_lb2hn[loginbase]: - for node in FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname']): + node = FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname']) # NOTE: reformat some fields. 
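# (prep_node_for_display(), defined near the top of this file, attaches
#  pcu_status/pcu_short_status from the latest FindbadPCURecord plus the
#  node's HistorySiteRecord and HistoryNodeRecord before rendering.)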
prep_node_for_display(node) nodequery += [node] if node.plc_pcuid: # not None - pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first() + pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid) prep_pcu_for_display(pcu) pcus[node.plc_pcuid] = pcu @@ -303,37 +353,61 @@ class Root(controllers.RootController): if pcuid and hostname is None: print "pcuid: %s" % pcuid - for pcu in FindbadPCURecord.get_latest_by(plc_pcuid=pcuid): - # NOTE: count filter - prep_pcu_for_display(pcu) - pcuquery += [pcu] + pcu = FindbadPCURecord.get_latest_by(plc_pcuid=pcuid) + # NOTE: count filter + prep_pcu_for_display(pcu) + pcuquery += [pcu] if 'site_id' in pcu.plc_pcu_stats: sitequery = [HistorySiteRecord.by_loginbase(pcu.loginbase)] if 'nodenames' in pcu.plc_pcu_stats: for nodename in pcu.plc_pcu_stats['nodenames']: print "query for %s" % nodename - q = FindbadNodeRecord.get_latest_by(hostname=nodename) - node = q.first() + node = FindbadNodeRecord.get_latest_by(hostname=nodename) print "%s" % node.port_status print "%s" % node.to_dict() - print "%s" % len(q.all()) if node: prep_node_for_display(node) nodequery += [node] if hostname and pcuid is None: - for node in FindbadNodeRecord.get_latest_by(hostname=hostname): + node = FindbadNodeRecord.get_latest_by(hostname=hostname) # NOTE: reformat some fields. prep_node_for_display(node) sitequery = [node.site] nodequery += [node] if node.plc_pcuid: # not None - pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first() + pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid) prep_pcu_for_display(pcu) pcuquery += [pcu] - return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, exceptions=exceptions) + return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, actions=actions, exceptions=exceptions) + + @expose(template="monitorweb.templates.nodehistory") + def nodehistory(self, hostname=None): + query = [] + if hostname: + fbnode = FindbadNodeRecord.get_by(hostname=hostname) + # TODO: add links for earlier history if desired. + l = fbnode.versions[-100:] + l.reverse() + for node in l: + prep_node_for_display(node) + query.append(node) + return dict(query=query, hostname=hostname) + + @expose(template="monitorweb.templates.sitehistory") + def sitehistory(self, loginbase=None): + query = [] + if loginbase: + fbsite = HistorySiteRecord.get_by(loginbase=loginbase) + # TODO: add links for earlier history if desired. 
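# (fbsite.versions appears to be the stored change history for these
#  versioned records; the slice below keeps only the 100 most recent
#  entries and the reverse() presents them newest-first.)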
+ l = fbsite.versions[-100:] + l.reverse() + for site in l: + query.append(site) + return dict(query=query, loginbase=loginbase) + @expose(template="monitorweb.templates.pculist") def pcu(self, filter='all'): @@ -384,7 +458,7 @@ class Root(controllers.RootController): @expose(template="monitorweb.templates.sitelist") def site(self, filter='all'): - filtercount = {'good' : 0, 'down': 0, 'new' : 0, 'pending' : 0, 'all' : 0} + filtercount = {'good' : 0, 'down': 0, 'online':0, 'offline' : 0, 'new' : 0, 'pending' : 0, 'all' : 0} fbquery = HistorySiteRecord.query.all() query = [] for site in fbquery: @@ -394,8 +468,10 @@ class Root(controllers.RootController): filtercount['new'] += 1 elif not site.enabled: filtercount['pending'] += 1 - else: - filtercount[site.status] += 1 + elif site.status in ['good', 'online']: + filtercount['good'] += 1 + elif site.status in ['down', 'offline']: + filtercount['down'] += 1 # apply filter if filter == "all": @@ -404,7 +480,9 @@ class Root(controllers.RootController): query.append(site) elif filter == "pending" and not site.enabled: query.append(site) - elif filter == site.status: + elif filter == 'good' and site.status in ['good', 'online']: + query.append(site) + elif filter == 'down' and site.status in ['down', 'offline']: query.append(site) return dict(query=query, fc=filtercount) diff --git a/web/MonitorWeb/monitorweb/monitor_xmlrpc.py b/web/MonitorWeb/monitorweb/monitor_xmlrpc.py new file mode 100644 index 0000000..a0c5052 --- /dev/null +++ b/web/MonitorWeb/monitorweb/monitor_xmlrpc.py @@ -0,0 +1,161 @@ +import sys +import xmlrpclib +import cherrypy +import turbogears +from datetime import datetime, timedelta +import time + +from monitor.database.info.model import * +from monitor.database.info.interface import * + +class MonitorXmlrpcServerMethods: + @cherrypy.expose + def listMethods(self): + mod = MonitorXmlrpcServer() + ret_list = [] + for f in dir(mod): + if isinstance(mod.__getattribute__(f),type(mod.__getattribute__('addDowntime'))): + ret_list += [f] + return ret_list + +def convert_datetime(d, keys=None): + ret = d.copy() + n = datetime.now() + if keys == None: + keys = d.keys() + for k in keys: + if type(d[k]) == type(n): + ret[k] = time.mktime(d[k].utctimetuple()) + + return ret + +class MonitorXmlrpcServer(object): + + @cherrypy.expose + def listMethods(self): + mod = MonitorXmlrpcServer() + ret_list = [] + for f in dir(mod): + if isinstance(mod.__getattribute__(f),type(mod.__getattribute__('addDowntime'))): + ret_list += [f] + return ret_list + + @turbogears.expose() + def XMLRPC(self): + params, method = xmlrpclib.loads(cherrypy.request.body.read()) + try: + if method == "xmlrpc": + # prevent recursion + raise AssertionError("method cannot be 'xmlrpc'") + # Get the function and make sure it's exposed. 
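# (getattr() falls back to None for unknown method names, and the 'exposed'
#  flag tested below is the attribute set by @cherrypy.expose, so only the
#  decorated methods further down are reachable through this endpoint.)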
+ method = getattr(self, method, None) + # Use the same error message to hide private method names + if method is None or not getattr(method, "exposed", False): + raise AssertionError("method does not exist") + + session.clear() + # Call the method, convert it into a 1-element tuple + # as expected by dumps + response = method(*params) + + session.flush() + response = xmlrpclib.dumps((response,), methodresponse=1, allow_none=1) + except xmlrpclib.Fault, fault: + # Can't marshal the result + response = xmlrpclib.dumps(fault, allow_none=1) + except: + # Some other error; send back some error info + response = xmlrpclib.dumps( + xmlrpclib.Fault(1, "%s:%s" % (sys.exc_type, sys.exc_value)) + ) + + cherrypy.response.headers["Content-Type"] = "text/xml" + return response + + # User-defined functions must use cherrypy.expose; turbogears.expose + # does additional checking of the response type that we don't want. + @cherrypy.expose + def upAndRunning(self): + return True + + # SITES ------------------------------------------------------------ + + @cherrypy.expose + def getSiteStatus(self, auth): + ret_list = [] + sites = HistorySiteRecord.query.all() + for q in sites: + d = q.to_dict(exclude=['timestamp', 'version', ]) + d = convert_datetime(d, ['last_checked', 'last_changed', 'message_created']) + ret_list.append(d) + return ret_list + + @cherrypy.expose + def clearSitePenalty(self, auth, loginbase): + sitehist = SiteInterface.get_or_make(loginbase=loginbase) + sitehist.clearPenalty() + #sitehist.applyPenalty() + #sitehist.sendMessage('clear_penalty') + sitehist.closeTicket() + return True + + @cherrypy.expose + def increaseSitePenalty(self, auth, loginbase): + sitehist = SiteInterface.get_or_make(loginbase=loginbase) + sitehist.increasePenalty() + #sitehist.applyPenalty() + #sitehist.sendMessage('increase_penalty') + return True + + # NODES ------------------------------------------------------------ + + @cherrypy.expose + def getNodeStatus(self, auth): + ret_list = [] + sites = HistoryNodeRecord.query.all() + for q in sites: + d = q.to_dict(exclude=['timestamp', 'version', ]) + d = convert_datetime(d, ['last_checked', 'last_changed',]) + ret_list.append(d) + return ret_list + + @cherrypy.expose + def getRecentActions(self, auth, loginbase=None, hostname=None): + ret_list = [] + return ret_list + + # BLACKLIST ------------------------------------------------------------ + + @cherrypy.expose + def getBlacklist(self, auth): + bl = BlacklistRecord.query.all() + ret_list = [] + for q in bl: + d = q.to_dict(exclude=['timestamp', 'version', 'id', ]) + d = convert_datetime(d, ['date_created']) + ret_list.append(d) + + return ret_list + # datetime.datetime.fromtimestamp(time.mktime(time.strptime(mytime, time_format))) + + @cherrypy.expose + def addHostToBlacklist(self, auth, hostname, expires=0): + bl = BlacklistRecord.findby_or_create(hostname=hostname, expires=expires) + return True + + @cherrypy.expose + def addSiteToBlacklist(self, auth, loginbase, expires=0): + bl = BlacklistRecord.findby_or_create(hostname=hostname, expires=expires) + return True + + @cherrypy.expose + def deleteFromBlacklist(self, auth, loginbase=None, hostname=None): + if (loginbase==None and hostname == None) or (loginbase != None and hostname != None): + raise Exception("Please specify a single record to delete: either hostname or loginbase") + elif loginbase != None: + bl = BlacklistRecord.get_by(loginbase=loginbase) + bl.delete() + elif hostname != None: + bl = BlacklistRecord.get_by(hostname=hostname) + bl.delete() + return 
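# A minimal client-side sketch, not part of this patch: it assumes the class
# above is mounted at the web root so that XMLRPC() answers at /XMLRPC, that
# "monitor.example.org" is a placeholder host, and that auth is only a
# placeholder argument since none of the methods above inspect it.
import xmlrpclib

server = xmlrpclib.ServerProxy("http://monitor.example.org/XMLRPC", allow_none=True)
auth = {}
for site in server.getSiteStatus(auth):
    # datetime fields arrive flattened to epoch seconds by convert_datetime() above
    print site['loginbase'], site['status']
server.clearSitePenalty(auth, "pl")
server.addHostToBlacklist(auth, "planetlab-1.example.org", 0)
# (Side note: addSiteToBlacklist above passes hostname= to findby_or_create;
#  loginbase= is presumably what was intended.)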
True diff --git a/web/MonitorWeb/monitorweb/static/css/style.css b/web/MonitorWeb/monitorweb/static/css/style.css index df07184..4367a0a 100644 --- a/web/MonitorWeb/monitorweb/static/css/style.css +++ b/web/MonitorWeb/monitorweb/static/css/style.css @@ -17,10 +17,10 @@ tr.even td {background-color:#fff;} #header { height: 40px; - width: 780px; + /*width: 780px;*/ /*background: blue URL('../images/header_inner.png') no-repeat;*/ - border-left: 1px solid #aaa; - border-right: 1px solid #aaa; + /*border-left: 1px solid #aaa;*/ + /*border-right: 1px solid #aaa;*/ margin: 0 auto 0 auto; text-align: center; font-size: 180%; @@ -102,9 +102,16 @@ a.right { float: right; } #status-error { background-color: indianred; } #status-none { background-color: white; } +#site-new { background-color: gold; } #site-good { background-color : darkseagreen; } +#site-online { background-color : lightgreen; } +#site-offline { background-color: red; } #site-down { background-color: indianred; } +/*#site-0 { background-color : white; }*/ +#site-1 { background-color: gold; } +#site-2 { background-color: indianred; } + #node-BOOT { background-color: darkseagreen; } #node-DOWN { background-color: indianred; } #node-DEBUG { background-color: gold; } @@ -182,7 +189,7 @@ h2 { } #footer { - border: 1px solid #aaa; + /*border: 1px solid #aaa;*/ border-top: 0px none; color: #999; background-color: white; diff --git a/web/MonitorWeb/monitorweb/templates/links.py b/web/MonitorWeb/monitorweb/templates/links.py index 6b47bb1..2bc6917 100644 --- a/web/MonitorWeb/monitorweb/templates/links.py +++ b/web/MonitorWeb/monitorweb/templates/links.py @@ -2,6 +2,8 @@ from monitor import config import turbogears as tg import urllib +def plc_mail_uri(ticketid): + return config.RT_WEB_SERVER + "/Ticket/Display.html?id=" + str(ticketid) def plc_node_uri(hostname): return "https://" + config.PLC_WWW_HOSTNAME + "/db/nodes/index.php?nodepattern=" + str(hostname) def plc_site_uri(loginbase): diff --git a/web/MonitorWeb/monitorweb/templates/nodehistory.kid b/web/MonitorWeb/monitorweb/templates/nodehistory.kid new file mode 100644 index 0000000..8fa825b --- /dev/null +++ b/web/MonitorWeb/monitorweb/templates/nodehistory.kid @@ -0,0 +1,60 @@ + + + + +
+

Node History : ${hostname}

+ + + + + + +
+ + + + + + + + + + + + + + + + + + + +
Hostname    kernel    last_contact
+ your.host.org
+
+
+ + diff --git a/web/MonitorWeb/monitorweb/templates/nodelist.kid b/web/MonitorWeb/monitorweb/templates/nodelist.kid index 5b4e7c3..53bbe5b 100644 --- a/web/MonitorWeb/monitorweb/templates/nodelist.kid +++ b/web/MonitorWeb/monitorweb/templates/nodelist.kid @@ -13,17 +13,19 @@ from links import * - - - + + + + + - + - + - + @@ -151,6 +154,7 @@ from links import * + - + diff --git a/web/MonitorWeb/monitorweb/templates/sitemenu.kid b/web/MonitorWeb/monitorweb/templates/sitemenu.kid index 4383b84..301e6ae 100644 --- a/web/MonitorWeb/monitorweb/templates/sitemenu.kid +++ b/web/MonitorWeb/monitorweb/templates/sitemenu.kid @@ -1,7 +1,7 @@ - App Name - ${page_title} + ${page_title} @@ -13,8 +13,8 @@ -
Production(${fc['BOOT']})  Debug(${fc['DEBUG']})  Down(${fc['DOWN']})  Prod(${fc['boot']})  Down(${fc['down']})  Errors(${fc['debug']})  Diagnose (${fc['diagnose']})  Disabled (${fc['disabled']})  Never Booted(${fc['neverboot']})  Pending Reply(${fc['pending']})  All
+ diff --git a/web/MonitorWeb/monitorweb/templates/pcuview.kid b/web/MonitorWeb/monitorweb/templates/pcuview.kid index 5bf82b8..fc471d9 100644 --- a/web/MonitorWeb/monitorweb/templates/pcuview.kid +++ b/web/MonitorWeb/monitorweb/templates/pcuview.kid @@ -16,6 +16,7 @@ from links import *
+ @@ -26,11 +27,12 @@ from links import * + - + @@ -131,7 +133,7 @@ from links import *
History Site name Enabled Penalty
history ${site.loginbase} n/a${site.penalty_level} ${site.slices_used}/${site.slices_total} ${site.nodes_up} / ${site.nodes_total}
-

Nodes

+

Nodes

There are no registered nodes for this site.

@@ -139,9 +141,10 @@ from links import *
History Hostname last_contact Last_checked last_checked Port Status
history ${node.hostname} @@ -193,21 +197,61 @@ from links import *
-

Convenience Calls

- -
+ +

Actions Over the Last Week

+

+ There are no recent actions for this site.

+ + + + + + + + + + + + + + + + + + + + + + + +
Date    Action taken on    Action Type    Message ID    Errors
+ + ${act.hostname} + + + ${act.loginbase} + + ${act.message_id}
+ + +

Convenience Calls

+
ssh -o PasswordAuthentication=yes -o PubkeyAuthentication=no ${pcu.plc_pcu_stats['username']}@${pcu_name(pcu.plc_pcu_stats)} +
telnet ${pcu_name(pcu.plc_pcu_stats)} +
http://${pcu_name(pcu.plc_pcu_stats)} +

diff --git a/web/MonitorWeb/monitorweb/templates/sitehistory.kid b/web/MonitorWeb/monitorweb/templates/sitehistory.kid new file mode 100644 index 0000000..66cc0d1 --- /dev/null +++ b/web/MonitorWeb/monitorweb/templates/sitehistory.kid @@ -0,0 +1,55 @@ + + + + +
+

Site History : ${loginbase}

+ + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
Site name    Enabled    Penalty    Slices/Max    Nodes/Total    Date Checked
+ + ${site.penalty_level}${site.slices_used}/${site.slices_total}${site.nodes_up} / ${site.nodes_total}
+
+
+ + diff --git a/web/MonitorWeb/monitorweb/templates/sitelist.kid b/web/MonitorWeb/monitorweb/templates/sitelist.kid index a9b7685..a2bac31 100644 --- a/web/MonitorWeb/monitorweb/templates/sitelist.kid +++ b/web/MonitorWeb/monitorweb/templates/sitelist.kid @@ -46,7 +46,7 @@ from links import *
n/a ${site.penalty_level} ${site.slices_used}/${site.slices_total} ${site.nodes_up} / ${site.nodes_total}
+ +
@@ -24,7 +24,7 @@ - + @@ -38,8 +38,8 @@
Sites PCUs NodesActionsActions
- diff --git a/www/gadgets/sitemonitor.py b/www/gadgets/sitemonitor.py index c52b36b..3ec6231 100755 --- a/www/gadgets/sitemonitor.py +++ b/www/gadgets/sitemonitor.py @@ -108,7 +108,8 @@ def main(): fb = database.dbLoad("findbad") lb2hn = database.dbLoad("plcdb_lb2hn") - pf = database.dbLoad("node_persistflags") + # todo: pull from HistoryNodeRecord table instead + #pf = database.dbLoad("node_persistflags") # SETUP header t = TABLE(border="0", cellspacing="0", cellpadding="0") @@ -135,7 +136,8 @@ def main(): url = 'http://www.planet-lab.org/db/nodes/index.php?nodepattern=%s' % host td = TD(A(host, target='_blank', href=url), bgcolor=color) r.append(td) - lc = pf[host].last_changed + #lc = pf[host].last_changed + lc=-1 td = TD(diff_time(lc)) r.append(td) t.append(r) diff --git a/zabbix.spec b/zabbix.spec index 2a408e3..3a91d20 100644 --- a/zabbix.spec +++ b/zabbix.spec @@ -290,6 +290,43 @@ rm -f %{zabbix_logdir}/zabbix_agentd.log %{zabbix_webdir} %changelog +* Fri Apr 03 2009 Stephen Soltesz - Monitor-2.0-9 +- added new models to db. +- major updates throughout. +- better unification. needs an install test. + +* Wed Apr 01 2009 Stephen Soltesz - Monitor-2.0-8 +- removed old pkl database references. +- added blacklist to db model +- added fix to IntelAMT remoteControl to start an power-down node +- added policy.py +- added global error count before bailing entirely. + +* Fri Mar 27 2009 Stephen Soltesz - Monitor-2.0-7 +- improved db model +- updated files that use db model +- updated web view based on node, site, and pcu states. +- added local mirror to zabbix Make file. + +* Tue Mar 24 2009 Stephen Soltesz - Monitor-2.0-6 +- added action view to gui +- added penalty_applied bit to db model. + +* Fri Mar 20 2009 Stephen Soltesz - Monitor-2.0-5 +- tag for updates to 2.0 db model + +* Fri Mar 13 2009 Stephen Soltesz - Monitor-2.0-4 +- splits reboot.py across pcucontrol and monitor modules +- moves command.py from monitor/util to pcucontrol/util + +* Tue Mar 10 2009 Stephen Soltesz - Monitor-2.0-3 +- add email exceptions +- other bug fixes. + +* Tue Mar 10 2009 Stephen Soltesz - Monitor-2.0-2 +- getting the pcucontrol and findall.py scripts to work in an integrated +- fashion. + * Fri Feb 27 2009 Stephen Soltesz - Monitor-2.0-1 - preparing to make a 2.0 branch for monitor. diff --git a/zabbix/zabbixsync.py b/zabbix/zabbixsync.py index 5cc2cd3..aaee4ff 100755 --- a/zabbix/zabbixsync.py +++ b/zabbix/zabbixsync.py @@ -44,7 +44,7 @@ if __name__=="__main__": from monitor import parser as parsermodule parser = parsermodule.getParser(['cacheset']) - parser.set_defaults( setupglobal=False, syncsite=True, site=None, setupids=False) + parser.set_defaults( setupglobal=False, syncsite=True, site=None, sitelist=None, setupids=False) parser.add_option("", "--setupids", action="store_true", dest="setupids", help="Setup global IDs.") parser.add_option("", "--setupglobal", action="store_true", dest="setupglobal", @@ -53,6 +53,8 @@ if __name__=="__main__": help="Do not sync sites.") parser.add_option("", "--site", dest="site", help="Sync only given site name.") + parser.add_option("", "--sitelist", dest="sitelist", + help="Sync only given site names in the list.") opts = parsermodule.parse_args(parser) os.system("""echo '' > /usr/share/monitor/nodelist.txt""") -- 2.43.0