SPECFILE = zabbix.spec
#main.URL := http://voxel.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.1.tar.gz
-#main.SHA1SUM:= 6e66efdbbdf23dc3de01379b30ded7b005fb49d9
-main.URL := http://superb-east.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.2.tar.gz
-main.SHA1SUM:= 575c443adec1703c2c242dbf353de9dc3bb4cafb
+#main.SHA1SUM := 6e66efdbbdf23dc3de01379b30ded7b005fb49d9
+#main.URL := http://superb-east.dl.sourceforge.net/sourceforge/zabbix/zabbix-1.6.2.tar.gz
+#main.SHA1SUM := 575c443adec1703c2c242dbf353de9dc3bb4cafb
+main.URL := http://build.planet-lab.org/third-party/zabbix-1.6.2.tar.gz
+main.SHA1SUM := 575c443adec1703c2c242dbf353de9dc3bb4cafb
main.FILE := $(notdir $(main.URL))
# Thierry - when called from within the build, PWD is /build
source ${MONITOR_SCRIPT_ROOT}/agent.sh
-echo "Performing Findbad Nodes"
+echo "Performing FindAll Nodes"
#########################
# 1. FINDBAD NODES
-${MONITOR_SCRIPT_ROOT}/findbad.py --increment $DATE || :
+${MONITOR_SCRIPT_ROOT}/findall.py --increment $DATE || :
ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || :
-
-echo "Performing Findbad PCUs"
-#########################
-# 2. FINDBAD PCUS
-${MONITOR_SCRIPT_ROOT}/findbadpcu.py --increment $DATE || :
# clean up stray 'locfg' processes that hang around inappropriately...
ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || :
-echo "Performing uptime changes for sites, nodes, and pcus"
-########################
-# 3. record last-changed for sites, nodes and pcus.
-${MONITOR_SCRIPT_ROOT}/sitebad.py || :
-${MONITOR_SCRIPT_ROOT}/nodebad.py || :
-${MONITOR_SCRIPT_ROOT}/pcubad.py || :
+${MONITOR_SCRIPT_ROOT}/policy.py $DATE
echo "Archiving pkl files"
#########################
# Archive pkl files.
-for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do
+for f in act_all l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do
if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ] ; then
cp ${MONITOR_DATA_ROOT}/production.$f.pkl ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.production.$f.pkl
else
fi
done
-#echo "Running grouprins on all dbg nodes"
-############################
-# 5. Check if there are any nodes in dbg state. Clean up afterward.
-#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DOWN&&boot_state=(boot|rins|dbg|diag)' --stopselect "state=BOOT" || :
-#${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' --stopselect 'state=BOOT' || :
-
cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log
rm -f $MONITOR_PID
import sys
import string
import time
-import database
-import plc
+from monitor import database
+from monitor.database.info.model import *
import getopt
def usage():
def main():
+ loginbase = False
+
try:
- longopts = ["delete=", "help"]
- (opts, argv) = getopt.getopt(sys.argv[1:], "d:h", longopts)
+ longopts = ["delete=", "loginbase", "help"]
+ (opts, argv) = getopt.getopt(sys.argv[1:], "d:lh", longopts)
except getopt.GetoptError, err:
print "Error: " + err.msg
sys.exit(1)
- l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
+ hostnames_q = BlacklistRecord.getHostnameBlacklist()
+ loginbases_q = BlacklistRecord.getLoginbaseBlacklist()
+ hostnames = [ h.hostname for h in hostnames_q ]
+ loginbases = [ h.loginbase for h in loginbases_q ]
for (opt, optval) in opts:
if opt in ["-d", "--delete"]:
- i = int(optval)
- del l_blacklist[i]
+ i = optval
+ bl = BlacklistRecord.get_by(hostname=i)
+ bl.delete()
+ elif opt in ["-l", "--loginbase"]:
+ loginbase = True
else:
usage()
sys.exit(0)
i_cnt = 0
- for i in l_blacklist:
- print i_cnt, " ", i
- i_cnt += 1
+ if not loginbase:
+ for i in hostnames:
+ print i
+ i_cnt += 1
+ else:
+ for i in loginbases:
+ print i
+ i_cnt += 1
+
+
while 1:
line = sys.stdin.readline()
if not line:
break
line = line.strip()
- if not line in l_blacklist:
- l_blacklist.append(line)
+ if line not in hostnames and line not in loginbases:
+ if loginbase:
+ bl = BlacklistRecord(loginbase=line)
+ else:
+ bl = BlacklistRecord(hostname=line)
+ bl.flush()
+ i_cnt += 1
- print "Total %d nodes in blacklist" % (len(l_blacklist))
- database.dbDump("l_blacklist")
+ session.flush()
+ if loginbase:
+ print "Total %d loginbases in blacklist" % (i_cnt)
+ else:
+ print "Total %d nodes in blacklist" % (i_cnt)
if __name__ == '__main__':
import os
# Attempt to reboot a node in debug state.
-from monitor import const
-from monitor.database.info.model import *
-from monitor.wrapper import plc
-api = plc.getAuthAPI()
-import sys
+
import os
+import sys
+import time
+import random
+import signal
+import traceback
+import subprocess
+from sets import Set
from getsshkeys import SSHKnownHosts
-import subprocess
-import time
-from monitor.util import command as moncommands
-from sets import Set
+from Rpyc import SocketConnection, Async
+from Rpyc.Utils import *
+
+import getconf
+from monitor import config
+from monitor import const
+from monitor.model import *
+from monitor.common import email_exception, found_within
+from monitor.database.info.model import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.wrapper.emailTxt import mailtxt
+from pcucontrol.util import command as moncommands
+from pcucontrol.util.command import Sopen
from pcucontrol.transports.ssh import pxssh as pxssh
from pcucontrol.transports.ssh import fdpexpect as fdpexpect
from pcucontrol.transports.ssh import pexpect as pexpect
-from monitor.model import *
-from monitor.wrapper.emailTxt import mailtxt
+
from nodeconfig import network_config_to_str
-import traceback
-from monitor import config
-import signal
-class Sopen(subprocess.Popen):
- def kill(self, signal = signal.SIGTERM):
- os.kill(self.pid, signal)
-#from Rpyc import SocketConnection, Async
-from Rpyc import SocketConnection, Async
-from Rpyc.Utils import *
+api = plc.getAuthAPI()
fb = None
+
class NodeConnection:
def __init__(self, connection, node, config):
self.node = node
self.config = config
def get_boot_state(self):
- if self.c.modules.os.path.exists('/tmp/source'):
- return "dbg"
- elif self.c.modules.os.path.exists('/vservers'):
- return "boot"
- else:
- return "unknown"
+ try:
+ if self.c.modules.os.path.exists('/tmp/source'):
+ return "debug"
+ elif self.c.modules.os.path.exists('/vservers'):
+ return "boot"
+ else:
+ return "unknown"
+ except EOFError:
+ traceback.print_exc()
+ print self.c.modules.sys.path
+ except:
+ traceback.print_exc()
+
+ return "unknown"
def get_dmesg(self):
self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
print " ERROR:", x
print " Possibly, unable to find valid configuration file"
- if bm_continue and self.config and not self.config.quiet:
+ if bm_continue:
for key in bm.VARS.keys():
print key, " == ", bm.VARS[key]
else:
- if self.config and not self.config.quiet: print " Unable to read Node Configuration"
+ print " Unable to read Node Configuration"
def compare_and_repair_nodekeys(self):
ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
bm_continue = True
- plcnode = api.GetNodes({'hostname': self.node}, None)[0]
+ plcnode = plccache.GetNodeByName(self.node)
InitializeBootManager.Run(bm.VARS, bm.LOG)
try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
return
-import random
class PlanetLabSession:
globalport = 22000 + int(random.random()*1000)
self.setup_host()
def get_connection(self, config):
- return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
+ conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
+ #i = 0
+ #while i < 3:
+ # print i, conn.c.modules.sys.path
+ # print conn.c.modules.os.path.exists('/tmp/source')
+ # i+=1
+ # time.sleep(1)
+ return conn
def setup_host(self):
self.port = PlanetLabSession.globalport
# COPY Rpyc files to host
cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
if self.verbose: print cmd
+ print cmd
# TODO: Add timeout
timeout = 120
localos = moncommands.CMD()
#cmd = cmd % args
#if self.verbose: print cmd
#print localos.system(cmd,timeout)
+ print "setup rpyc server over ssh"
print ssh.ret
# TODO: Add timeout
"""%(user)s@%(hostname)s"""
cmd = cmd % args
if self.verbose: print cmd
+ print cmd
self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
# TODO: the read() here may block indefinitely. Need a better
# approach therefore, that includes a timeout.
def __del__(self):
if self.command:
if self.verbose: print "Killing SSH session %s" % self.port
+ print "Killing SSH session %s" % self.port
self.command.kill()
-
-def steps_to_list(steps):
- ret_list = []
- for (id,label) in steps:
- ret_list.append(label)
- return ret_list
+
+def steps_to_list(steps, index=1):
+ return map(lambda x: x[index], steps)
def index_to_id(steps,index):
if index < len(steps):
else:
return "done"
-def reboot(hostname, config=None, forced_action=None):
+class DebugInterface:
+ def __init__(self, hostname):
+ self.hostname = hostname
+ self.session = None
- # NOTE: Nothing works if the bootcd is REALLY old.
- # So, this is the first step.
- fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
- if fbnode['category'] == "OLDBOOTCD":
- print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
- args = {}
- args['hostname_list'] = " %s" % hostname
-
- m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
- mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
-
- loginbase = plc.siteId(hostname)
- emails = plc.getTechEmails(loginbase)
- m.send(emails)
-
- print "\tDisabling %s due to out-of-date BOOTCD" % hostname
- api.UpdateNode(hostname, {'boot_state' : 'disable'})
- return True
-
- node = hostname
- print "Creating session for %s" % node
- # update known_hosts file (in case the node has rebooted since last run)
- if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
- try:
- k = SSHKnownHosts(); k.update(node); k.write(); del k
- except:
- print traceback.print_exc()
- return False
-
- try:
- if config == None:
- session = PlanetLabSession(node, False, True)
- else:
- session = PlanetLabSession(node, config.nosetup, config.verbose)
- except Exception, e:
- print "ERROR setting up session for %s" % hostname
- print traceback.print_exc()
- print e
- return False
-
- try:
- conn = session.get_connection(config)
- except EOFError:
- # NOTE: sometimes the wait in setup_host() is not long enough.
- # So, here we try to wait a little longer before giving up entirely.
+ def getConnection(self):
+ print "Creating session for %s" % self.hostname
+ # update known_hosts file (in case the node has rebooted since last run)
try:
- time.sleep(session.timeout*4)
- conn = session.get_connection(config)
+ k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
except:
+ email_exception()
print traceback.print_exc()
return False
- if forced_action == "reboot":
- conn.restart_node('rins')
- return True
+ try:
+ if config == None:
+ self.session = PlanetLabSession(self.hostname, False, True)
+ else:
+ self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
+ except Exception, e:
+ msg = "ERROR setting up session for %s" % self.hostname
+ print msg
+ traceback.print_exc()
+ email_exception(msg)
+ return False
- boot_state = conn.get_boot_state()
- if boot_state == "boot":
- print "...Boot state of %s already completed : skipping..." % node
- return True
- elif boot_state == "unknown":
- print "...Unknown bootstate for %s : skipping..."% node
- return False
- else:
- pass
+ try:
+ conn = self.session.get_connection(config)
+ except EOFError:
+ # NOTE: sometimes the wait in setup_host() is not long enough.
+ # So, here we try to wait a little longer before giving up entirely.
+ try:
+ time.sleep(self.session.timeout*5)
+ conn = self.session.get_connection(config)
+ except:
+ traceback.print_exc()
+ email_exception(self.hostname)
+ return False
+ #print "trying to use conn before returning it."
+ #print conn.c.modules.sys.path
+ #print conn.c.modules.os.path.exists('/tmp/source')
+ #time.sleep(1)
- if conn.bootmanager_running():
- print "...BootManager is currently running. Skipping host %s" % node
- return True
+ #print "conn: %s" % conn
+ return conn
- #if config != None:
- # if config.force:
- # conn.restart_bootmanager(config.force)
- # return True
+ def getSequences(self):
- # Read persistent flags, tagged on one week intervals.
- pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
+ # TODO: This can be replaced with a DB definition at a future time.
+ # This would make it possible for an admin to introduce new
+ # patterns without touching code.
+ sequences = {}
+ # restart_bootmanager_boot
+ for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
+
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
+
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-debug-done",
+ "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
+ "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
+ "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
+ "bminit-cfg-auth-protoerror-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
+ "bminit-cfg-auth-getplc-implementerror-update-debug-done",
+ ]:
+ sequences.update({n : "restart_bootmanager_boot"})
+
+ # conn.restart_bootmanager('rins')
+ for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+ # actual solution appears to involve removing the bad files, and
+ # continually trying to boot the node.
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+ ]:
+ sequences.update({n : "restart_bootmanager_rins"})
+
+ # repair_node_keys
+ sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
+
+ # conn.restart_node('rins')
+ for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+ ]:
+ sequences.update({n : "restart_node_rins"})
+
+ # restart_node_boot
+ for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
+ "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
+ "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
+ ]:
+ sequences.update({n: "restart_node_boot"})
+
+ # update_node_config_email
+ for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
+ ]:
+ sequences.update({n : "update_node_config_email"})
+
+ for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
+ "bminit-cfg-update-exception-nodehostname-update-debug-done",
+ ]:
+ sequences.update({n : "nodenetwork_email"})
+
+ # update_bootcd_email
+ for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
+ ]:
+ sequences.update({n : "update_bootcd_email"})
+
+ for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+ ]:
+ sequences.update({n: "suspect_error_email"})
+
+ # update_hardware_email
+ sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+ sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+
+ # broken_hardware_email
+ sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+
+ # bad_dns_email
+ for n in [
+ "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+ "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+ ]:
+ sequences.update( { n : "bad_dns_email"})
- if config and not config.quiet: print "...downloading dmesg from %s" % node
- dmesg = conn.get_dmesg()
- child = fdpexpect.fdspawn(dmesg)
+ return sequences
- sequence = []
- while True:
+ def getDiskSteps(self):
steps = [
('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
# SCSI error : <0 2 0 0> return code = 0x40001
# end_request: I/O error, dev sda, sector 572489600
]
- id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
- sequence.append(id)
-
- if id == "done":
- break
-
- s = Set(sequence)
- if config and not config.quiet: print "\tSET: ", s
-
- if len(s) > 1:
- print "...Potential drive errors on %s" % node
- if len(s) == 2 and 'floppyerror' in s:
- print "...Should investigate. Continuing with node."
- else:
- print "...Should investigate. Skipping node."
- # TODO: send message related to these errors.
- args = {}
- args['hostname'] = hostname
- args['log'] = conn.get_dmesg().read()
-
- m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
- mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
-
- loginbase = plc.siteId(hostname)
- emails = plc.getTechEmails(loginbase)
- m.send(emails)
- conn.set_nodestate('disable')
- return False
+ return steps
- print "...Downloading bm.log from %s" % node
- log = conn.get_bootmanager_log()
- child = fdpexpect.fdspawn(log)
-
- try:
- if config.collect: return True
- except:
- pass
+ def getDiskSequence(self, steps, child):
+ sequence = []
+ while True:
+ id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
+ sequence.append(id)
- time.sleep(1)
-
- if config and not config.quiet: print "...Scanning bm.log for errors"
- action_id = "dbg"
- sequence = []
- while True:
+ if id == "done":
+ break
+ return sequence
+ def getBootManagerStepPatterns(self):
steps = [
('bminit' , 'Initializing the BootManager.'),
('cfg' , 'Reading node configuration file.'),
('bootcheckfail' , 'BootCheckAuthentication'),
('bootupdatefail' , 'BootUpdateNode'),
]
- list = steps_to_list(steps)
- index = child.expect( list + [ pexpect.EOF ])
- id = index_to_id(steps,index)
- sequence.append(id)
-
- if id == "exception":
- if config and not config.quiet: print "...Found An Exception!!!"
- elif index == len(list):
- #print "Reached EOF"
- break
+ return steps
+
+ def getBootManagerSequenceFromLog(self, steps, child):
+ sequence = []
+ while True:
+
+ index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
+ id = index_to_id(steps,index)
+ sequence.append(id)
+
+ if id == "exception":
+ print "...Found An Exception!!!"
+ elif id == "done": #index == len(steps_to_list(steps)):
+ #print "Reached EOF"
+ break
+
+ return sequence
- s = "-".join(sequence)
- print " FOUND SEQUENCE: ", s
- # NOTE: We get or set the flag based on the current sequence identifier.
- # By using the sequence identifier, we guarantee that there will be no
- # frequent loops. I'm guessing there is a better way to track loops,
- # though.
- #if not config.force and pflags.getRecentFlag(s):
- # pflags.setRecentFlag(s)
- # pflags.save()
- # print "... flag is set or it has already run recently. Skipping %s" % node
+def restore(sitehist, hostname, config=None, forced_action=None):
+
+ # NOTE: Nothing works if the bootcd is REALLY old.
+ # So, this is the first step.
+
+ fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
+ recent_actions = sitehist.getRecentActions(hostname=hostname)
+
+ if fbnode['observed_category'] == "OLDBOOTCD":
+ print "\t...Notify owner to update BootImage!!!"
+
+ if not found_within(recent_actions, 'newbootcd_notice', 3):
+ sitehist.sendMessage('newbootcd_notice', hostname=hostname)
+
+ print "\tDisabling %s due to out-of-date BootImage" % hostname
+ api.UpdateNode(hostname, {'boot_state' : 'disable'})
+
+ # NOTE: nothing else is possible.
+ return True
+
+ debugnode = DebugInterface(hostname)
+ conn = debugnode.getConnection()
+ #print "conn: %s" % conn
+ #print "trying to use conn after returning it."
+ #print conn.c.modules.sys.path
+ #print conn.c.modules.os.path.exists('/tmp/source')
+ if type(conn) == type(False): return False
+
+ #if forced_action == "reboot":
+ # conn.restart_node('rins')
# return True
- sequences = {}
+ boot_state = conn.get_boot_state()
+ if boot_state != "debug":
+ print "... %s in %s state: skipping..." % (hostname , boot_state)
+ return boot_state == "boot"
+ if conn.bootmanager_running():
+ print "...BootManager is currently running. Skipping host %s" %hostname
+ return True
- # restart_bootmanager_boot
- for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
+ # Read persistent flags, tagged on one week intervals.
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
+ if config and not config.quiet: print "...downloading dmesg from %s" %hostname
+ dmesg = conn.get_dmesg()
+ child = fdpexpect.fdspawn(dmesg)
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-update-debug-done",
- "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
- "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
- "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
- "bminit-cfg-auth-protoerror-exception-update-debug-done",
- "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
- "bminit-cfg-auth-getplc-implementerror-update-debug-done",
- ]:
- sequences.update({n : "restart_bootmanager_boot"})
-
- # conn.restart_bootmanager('rins')
- for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
- # actual solution appears to involve removing the bad files, and
- # continually trying to boot the node.
- "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
- ]:
- sequences.update({n : "restart_bootmanager_rins"})
-
- # repair_node_keys
- sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
-
- # conn.restart_node('rins')
- for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
- "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
- ]:
- sequences.update({n : "restart_node_rins"})
-
- # restart_node_boot
- for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
- "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
- "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
- ]:
- sequences.update({n: "restart_node_boot"})
-
- # update_node_config_email
- for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
- "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
- "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
- ]:
- sequences.update({n : "update_node_config_email"})
+ steps = debugnode.getDiskSteps()
+ sequence = debugnode.getDiskSequence(steps, child)
- for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
- "bminit-cfg-update-exception-nodehostname-update-debug-done",
- ]:
- sequences.update({n : "nodenetwork_email"})
-
- # update_bootcd_email
- for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
- ]:
- sequences.update({n : "update_bootcd_email"})
+ s = Set(sequence)
+ if config and not config.quiet: print "\tSET: ", s
- for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
- ]:
- sequences.update({n: "suspect_error_email"})
+ if len(s) > 1:
+ print "...Potential drive errors on %s" % hostname
+ if len(s) == 2 and 'floppyerror' in s:
+ print "...Should investigate. Continuing with node."
+ else:
+ print "...Should investigate. Skipping node."
+ # TODO: send message related to these errors.
- # update_hardware_email
- sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
- sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+ if not found_within(recent_actions, 'baddisk_notice', 3):
- # broken_hardware_email
- sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+ log=conn.get_dmesg().read()
+ sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
+ conn.set_nodestate('disable')
- # bad_dns_email
- for n in [
- "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
- "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
- ]:
- sequences.update( { n : "bad_dns_email"})
+ return False
- flag_set = True
+ print "...Downloading bm.log from %s" %hostname
+ log = conn.get_bootmanager_log()
+ child = fdpexpect.fdspawn(log)
+
+ if hasattr(config, 'collect') and config.collect: return True
+
+ if config and not config.quiet: print "...Scanning bm.log for errors"
+
+ time.sleep(1)
+
+ steps = debugnode.getBootManagerStepPatterns()
+ sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
+
+ s = "-".join(sequence)
+ print " FOUND SEQUENCE: ", s
+ # NOTE: We get or set the flag based on the current sequence identifier.
+ # By using the sequence identifier, we guarantee that there will be no
+ # frequent loops. I'm guessing there is a better way to track loops,
+ # though.
+
+ sequences = debugnode.getSequences()
+ flag_set = True
if s not in sequences:
print " HOST %s" % hostname
args['hostname'] = hostname
args['sequence'] = s
args['bmlog'] = conn.get_bootmanager_log().read()
- m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
- mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
- m.reset()
- m.send([config.cc_email])
+ args['viart'] = False
+
+ sitehist.sendMessage('unknownsequence_notice', **args)
conn.restart_bootmanager('boot')
else:
if sequences[s] == "restart_bootmanager_boot":
- if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
+ print "...Restarting BootManager.py on %s "%hostname
conn.restart_bootmanager('boot')
elif sequences[s] == "restart_bootmanager_rins":
- if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
+ print "...Restarting BootManager.py on %s "%hostname
conn.restart_bootmanager('rins')
elif sequences[s] == "restart_node_rins":
conn.restart_node('rins')
pass
else:
# there was some failure to synchronize the keys.
- print "...Unable to repair node keys on %s" % node
+ print "...Unable to repair node keys on %s" %hostname
elif sequences[s] == "suspect_error_email":
args = {}
args['hostname'] = hostname
args['sequence'] = s
args['bmlog'] = conn.get_bootmanager_log().read()
- m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
- mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
- m.reset()
- m.send([config.cc_email])
+ args['viart'] = False
+ sitehist.sendMessage('unknownsequence_notice', **args)
conn.restart_bootmanager('boot')
+ # TODO: differentiate this and the 'nodenetwork_email' actions.
elif sequences[s] == "update_node_config_email":
- print "...Sending message to UPDATE NODE CONFIG"
- args = {}
- args['hostname'] = hostname
- m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
- True, db='nodeid_persistmessages')
- loginbase = plc.siteId(hostname)
- emails = plc.getTechEmails(loginbase)
- m.send(emails)
- conn.dump_plconf_file()
- conn.set_nodestate('disable')
+
+ if not found_within(recent_actions, 'nodeconfig_notice', 3):
+ args = {}
+ args['hostname'] = hostname
+ sitehist.sendMessage('nodeconfig_notice', **args)
+ conn.dump_plconf_file()
elif sequences[s] == "nodenetwork_email":
- print "...Sending message to LOOK AT NODE NETWORK"
- args = {}
- args['hostname'] = hostname
- args['bmlog'] = conn.get_bootmanager_log().read()
- m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args,
- True, db='nodenet_persistmessages')
- loginbase = plc.siteId(hostname)
- emails = plc.getTechEmails(loginbase)
- m.send(emails)
- conn.dump_plconf_file()
- conn.set_nodestate('disable')
- elif sequences[s] == "update_bootcd_email":
- print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
- import getconf
- args = {}
- args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
- args['hostname_list'] = "%s" % hostname
+ if not found_within(recent_actions, 'nodeconfig_notice', 3):
+ args = {}
+ args['hostname'] = hostname
+ args['bmlog'] = conn.get_bootmanager_log().read()
+ sitehist.sendMessage('nodeconfig_notice', **args)
+ conn.dump_plconf_file()
- m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
- mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
+ elif sequences[s] == "update_bootcd_email":
- loginbase = plc.siteId(hostname)
- emails = plc.getTechEmails(loginbase)
- m.send(emails)
+ if not found_within(recent_actions, 'newalphacd_notice', 3):
+ args = {}
+ args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
+ args['hostname'] = hostname
+
+ sitehist.sendMessage('newalphacd_notice', **args)
- print "\tDisabling %s due to out-of-date BOOTCD" % hostname
- conn.set_nodestate('disable')
+ print "\tDisabling %s due to out-of-date BOOTCD" % hostname
elif sequences[s] == "broken_hardware_email":
# MAKE An ACTION record that this host has failed hardware. May
# require either an exception "/minhw" or other manual intervention.
# Definitely need to send out some more EMAIL.
- print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
# TODO: email notice of broken hardware
- args = {}
- args['hostname'] = hostname
- args['log'] = conn.get_dmesg().read()
- m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
- mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
+ if not found_within(recent_actions, 'baddisk_notice', 1):
+ print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
+ args = {}
+ args['hostname'] = hostname
+ args['log'] = conn.get_dmesg().read()
- loginbase = plc.siteId(hostname)
- emails = plc.getTechEmails(loginbase)
- m.send(emails)
- conn.set_nodestate('disable')
+ sitehist.sendMessage('baddisk_notice', **args)
+ conn.set_nodestate('disable')
elif sequences[s] == "update_hardware_email":
- print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
- args = {}
- args['hostname'] = hostname
- args['bmlog'] = conn.get_bootmanager_log().read()
- m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
- mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
-
- loginbase = plc.siteId(hostname)
- emails = plc.getTechEmails(loginbase)
- m.send(emails)
- conn.set_nodestate('disable')
+ if not found_within(recent_actions, 'minimalhardware_notice', 1):
+ print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
+ args = {}
+ args['hostname'] = hostname
+ args['bmlog'] = conn.get_bootmanager_log().read()
+ sitehist.sendMessage('minimalhardware_notice', **args)
elif sequences[s] == "bad_dns_email":
- print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
- args = {}
- try:
- node = api.GetNodes(hostname)[0]
- net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
- except:
- print traceback.print_exc()
- # TODO: api error. skip email, b/c all info is not available,
- # flag_set will not be recorded.
- return False
- nodenet_str = network_config_to_str(net)
+ if not found_within(recent_actions, 'baddns_notice', 1):
+ print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
+ args = {}
+ try:
+ node = plccache.GetNodeByName(hostname)
+ net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+ except:
+ email_exception()
+ print traceback.print_exc()
+ # TODO: api error. skip email, b/c all info is not available,
+ # flag_set will not be recorded.
+ return False
+ nodenet_str = network_config_to_str(net)
- args['hostname'] = hostname
- args['network_config'] = nodenet_str
- args['nodenetwork_id'] = net['nodenetwork_id']
- m = PersistMessage(hostname, mailtxt.baddns[0] % args,
- mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
-
- loginbase = plc.siteId(hostname)
- emails = plc.getTechEmails(loginbase)
- m.send(emails)
- conn.set_nodestate('disable')
-
- if flag_set:
- pflags.setRecentFlag(s)
- pflags.save()
+ args['hostname'] = hostname
+ args['network_config'] = nodenet_str
+ args['nodenetwork_id'] = net['nodenetwork_id']
+
+ sitehist.sendMessage('baddns_notice', **args)
return True
from findbad import main as findbad_main
from findbadpcu import main as findbadpcu_main
from sitebad import main as sitebad_main
+from nodebad import main as nodebad_main
+from pcubad import main as pcubad_main
+from monitor.wrapper import plccache
import sys
if __name__ == '__main__':
parser = parsermodule.getParser(['nodesets'])
parser.set_defaults( increment=False, dbname="findbad", cachenodes=False,
- force=False,)
+ force=False, pcuselect=None, pcuid=None, pcu=None)
parser.add_option("", "--cachenodes", action="store_true",
help="Cache node lookup from PLC")
parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
cfg = parsermodule.parse_args(parser)
try:
+ print "sync with plc"
+ plccache.sync()
+ print "findbad"
findbad_main()
+ print "findbadpcu"
findbadpcu_main()
+ print "nodebad"
+ nodebad_main()
+ print "pcubad"
+ pcubad_main()
+ print "sitebad"
sitebad_main()
except Exception, err:
import traceback
import threading
from monitor.util import file
-from monitor.util import command
+from pcucontrol.util import command
from monitor import config
-from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, session
+from monitor.database.info.model import FindbadNodeRecord, session
from monitor.sources import comon
from monitor.wrapper import plc, plccache
# CREATE all the work requests
for nodename in l_nodes:
- fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
- node_round = fbnodesync.round
- fbnodesync.flush()
+ #fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round':0})
+ #node_round = fbnodesync.round
+ node_round = global_round - 1
+ #fbnodesync.flush()
if node_round < global_round or config.force:
# recreate node stats when refreshed
print "All results collected."
break
- print FindbadNodeRecordSync.query.count()
+ #print FindbadNodeRecordSync.query.count()
print FindbadNodeRecord.query.count()
session.flush()
def main():
global global_round
- fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
- if_new_set={'round' : global_round})
- global_round = fbsync.round
+ #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
+ # if_new_set={'round' : global_round})
+ #global_round = fbsync.round
if config.increment:
# update global round number to force refreshes across all nodes
l_nodes = filter(lambda x: x['hostname'] in f_nodes, l_nodes)
elif config.nodegroup:
ng = api.GetNodeGroups({'name' : config.nodegroup})
- l_nodes = api.GetNodes(ng[0]['node_ids'])
+ l_nodes = plccache.GetNodesByIds(ng[0]['node_ids'])
elif config.site:
- site = api.GetSites(config.site)
- l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
+ site = plccache.GetSitesByName([config.site])
+ l_nodes = plccache.GetNodesByIds(site[0]['node_ids'])
elif config.sitelist:
site_list = config.sitelist.split(',')
- sites = api.GetSites(site_list)
+ sites = plccache.GetSitesByName(site_list)
node_ids = []
for s in sites:
node_ids += s['node_ids']
- l_nodes = api.GetNodes(node_ids, ['hostname'])
+ l_nodes = plccache.GetNodesByIds(node_ids)
l_nodes = [node['hostname'] for node in l_nodes]
# perform this query after the above options, so that the filter above
# does not break.
if config.nodeselect:
- plcnodes = api.GetNodes({'peer_id' : None}, ['hostname'])
+ plcnodes = plccache.l_nodes
plcnodes = [ node['hostname'] for node in plcnodes ]
l_nodes = node_select(config.nodeselect, plcnodes, None)
if config.increment:
# update global round number to force refreshes across all nodes
- fbsync.round = global_round
- fbsync.flush()
+ #fbsync.round = global_round
+ #fbsync.flush()
+ pass
return 0
main()
except Exception, err:
print traceback.print_exc()
+ from monitor.common import email_exception
+ email_exception()
print "Exception: %s" % err
print "Saving data... exitting."
sys.exit(0)
import threading
import monitor
-from pcucontrol import reboot
from monitor import config
-from monitor.database.info.model import FindbadPCURecordSync, FindbadPCURecord, session
+from monitor.database.info.model import FindbadPCURecord, session
from monitor import database
from monitor import util
from monitor.wrapper import plc, plccache
# CREATE all the work requests
for pcuname in l_pcus:
pcu_id = int(pcuname)
- fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
- fbnodesync.flush()
+ #fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : 0})
+ #fbnodesync.flush()
- node_round = fbnodesync.round
+ #node_round = fbnodesync.round
+ node_round = global_round - 1
if node_round < global_round or config.force:
# recreate node stats when refreshed
#print "%s" % nodename
print "All results collected."
break
- print FindbadPCURecordSync.query.count()
+ #print FindbadPCURecordSync.query.count()
print FindbadPCURecord.query.count()
session.flush()
l_pcus = plccache.l_pcus
cohash = {}
- fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0,
- if_new_set={'round' : global_round})
+ #fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0,
+ #if_new_set={'round' : global_round})
- global_round = fbsync.round
+ #global_round = fbsync.round
api = plc.getAuthAPI()
if config.site is not None:
- site = api.GetSites(config.site)
- l_nodes = api.GetNodes(site[0]['node_ids'], ['pcu_ids'])
+ site = plccache.GetSitesByName([config.site])
+ l_nodes = plccache.GetNodesByIds(site[0]['node_ids'])
pcus = []
for node in l_nodes:
pcus += node['pcu_ids']
# clear out dups.
l_pcus = [pcu for pcu in sets.Set(pcus)]
+
+ elif config.node is not None:
+ l_nodes = [ plccache.GetNodeByName(config.node) ]
+ pcus = []
+ for node in l_nodes:
+ pcus += node['pcu_ids']
+ # clear out dups.
+ l_pcus = [pcu for pcu in sets.Set(pcus)]
+
elif config.sitelist:
site_list = config.sitelist.split(',')
- sites = api.GetSites(site_list)
+ sites = plccache.GetSitesByName(site_list)
node_ids = []
for s in sites:
node_ids += s['node_ids']
- l_nodes = api.GetNodes(node_ids, ['pcu_ids'])
+ l_nodes = plccache.GetNodesByIds(node_ids)
pcus = []
for node in l_nodes:
pcus += node['pcu_ids']
if config.increment:
# update global round number to force refreshes across all nodes
- fbsync.round = global_round
- fbsync.flush()
+ #fbsync.round = global_round
+ #fbsync.flush()
session.flush()
return 0
pcuid=None,
pcuselect=None,
site=None,
+ node=None,
+ sitelist=None,
dbname="findbadpcus",
cachenodes=False,
cachecalls=True,
)
parser.add_option("-f", "--nodelist", dest="nodelist", metavar="FILE",
help="Provide the input file for the node list")
+ parser.add_option("", "--node", dest="node", metavar="FILE",
+ help="Get all pcus associated with the given node")
parser.add_option("", "--site", dest="site", metavar="FILE",
help="Get all pcus associated with the given site's nodes")
+ parser.add_option("", "--sitelist", dest="sitelist", metavar="FILE",
+ help="Get all pcus associated with the given site's nodes")
parser.add_option("", "--pcuselect", dest="pcuselect", metavar="FILE",
help="Query string to apply to the findbad pcus")
parser.add_option("", "--pcuid", dest="pcuid", metavar="id",
time.sleep(1)
except Exception, err:
traceback.print_exc()
+ from monitor.common import email_exception
+ email_exception()
print "Exception: %s" % err
print "Saving data... exitting."
sys.exit(0)
def main():
meta_sites = ['canarie', 'rnp', 'jgn2', 'i2', 'tp', 'princeton', 'princetondsl', 'plcolo', 'wide']
l_blacklist = [ "grouse.hpl.hp.com", "planet1.att.nodes.planet-lab.org"]
- #l_blacklist = database.dbLoad("l_blacklist")
l_sitelist = []
count = 0
# for each prefix above
print "Found %d nodes" % count
print "Found %d sites " % len(l_sitelist)
- database.dbDump("l_blacklist")
if __name__=="__main__":
main()
+++ /dev/null
-#!/usr/bin/python
-
-# This script is used to manipulate the operational state of nodes in
-# different node groups. These are basically set operations on nodes via the
-# PLC api.
-#
-# Take the ng name as an argument....
-# optionally,
-# * get a list of nodes in the given nodegroup.
-# * set some or all in the set to rins.
-# * restart them all.
-# * do something else to them all.
-#
-
-from monitor import config
-from monitor import util
-from monitor import const
-from monitor import database
-from monitor import parser as parsermodule
-from pcucontrol import reboot
-from monitor.wrapper import plc
-api = plc.getAuthAPI()
-
-import traceback
-from optparse import OptionParser
-
-from monitor.common import *
-from nodequery import verify,query_to_dict,node_select
-from monitor.model import *
-import os
-
-import time
-
-import bootman # debug nodes
-import mailmonitor # down nodes without pcu
-from monitor.wrapper.emailTxt import mailtxt
-import sys
-
-class Reboot(object):
- def __init__(self, fbnode):
- self.fbnode = fbnode
-
- def _send_pcunotice(self, host):
- args = {}
- args['hostname'] = host
- try:
- args['pcu_id'] = plc.getpcu(host)['pcu_id']
- except:
- args['pcu_id'] = host
-
- m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
- mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')
-
- loginbase = plc.siteId(host)
- m.send([const.TECHEMAIL % loginbase])
-
- def pcu(self, host):
- # TODO: It should be possible to diagnose the various conditions of
- # the PCU here, and send different messages as appropriate.
- print "'%s'" % self.fbnode['pcu']
- if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
- self.action = "reboot.reboot('%s')" % host
-
- pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
- #pflags.resetRecentFlag('pcutried')
- if not pflags.getRecentFlag('pcutried'):
- try:
- print "CALLING REBOOT!!!"
- ret = reboot.reboot(host)
-
- pflags.setRecentFlag('pcutried')
- pflags.save()
- return ret
-
- except Exception,e:
- print traceback.print_exc(); print e
-
- # NOTE: this failure could be an implementation issue on
- # our end. So, extra notices are confusing...
- # self._send_pcunotice(host)
-
- pflags.setRecentFlag('pcufailed')
- pflags.save()
- return False
-
- elif not pflags.getRecentFlag('pcu_rins_tried'):
- try:
- # set node to 'rins' boot state.
- print "CALLING REBOOT +++ RINS"
- plc.nodeBootState(host, 'rins')
- ret = reboot.reboot(host)
-
- pflags.setRecentFlag('pcu_rins_tried')
- pflags.save()
- return ret
-
- except Exception,e:
- print traceback.print_exc(); print e
-
- # NOTE: this failure could be an implementation issue on
- # our end. So, extra notices are confusing...
- # self._send_pcunotice(host)
-
- pflags.setRecentFlag('pcufailed')
- pflags.save()
- return False
- else:
- # we've tried the pcu recently, but it didn't work,
- # so did we send a message about it recently?
- if not pflags.getRecentFlag('pcumessagesent'):
-
- self._send_pcunotice(host)
-
- pflags.setRecentFlag('pcumessagesent')
- pflags.save()
-
- # This will result in mail() being called next, to try to
- # engage the technical contact to take care of it also.
- print "RETURNING FALSE"
- return False
-
- else:
- print "NO PCUOK"
- self.action = "None"
- return False
-
- def mail(self, host):
-
- # Reset every 4 weeks or so
- pflags = PersistFlags(host, 27*60*60*24, db='mail_persistflags')
- if not pflags.getRecentFlag('endrecord'):
- node_end_record(host)
- pflags.setRecentFlag('endrecord')
- pflags.save()
-
- # Then in either case, run mailmonitor.reboot()
- self.action = "mailmonitor.reboot('%s')" % host
- try:
- return mailmonitor.reboot(host)
- except Exception, e:
- print traceback.print_exc(); print e
- return False
-
-class RebootDebug(Reboot):
-
- def direct(self, host):
- self.action = "bootman.reboot('%s', config, None)" % host
- return bootman.reboot(host, config, None)
-
-class RebootBoot(Reboot):
-
- def direct(self, host):
- self.action = "bootman.reboot('%s', config, 'reboot')" % host
- return bootman.reboot(host, config, 'reboot')
-
-class RebootDown(Reboot):
-
- def direct(self, host):
- self.action = "None"
- return False # this always fails, since the node will be down.
-
-def set_node_to_rins(host, fb):
-
- node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
- record = {'observation' : node[0],
- 'model' : 'USER_REQUEST',
- 'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host,
- 'time' : time.time()}
- l = Log(host, record)
-
- ret = api.UpdateNode(host, {'boot_state' : 'rins'})
- if ret:
- # it's nice to see the current status rather than the previous status on the console
- node = api.GetNodes(host)[0]
- print l
- print "%-2d" % (i-1), nodegroup_display(node, fb)
- return l
- else:
- print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
- return None
-
-
-try:
- rebootlog = database.dbLoad("rebootlog")
-except:
- rebootlog = LogRoll()
-
-parser = parsermodule.getParser(['nodesets'])
-parser.set_defaults( timewait=0,
- skip=0,
- rins=False,
- reboot=False,
- findbad=False,
- force=False,
- nosetup=False,
- verbose=False,
- quiet=False,
- )
-
-parser.add_option("", "--stopselect", dest="stopselect", metavar="",
- help="The select string that must evaluate to true for the node to be considered 'done'")
-parser.add_option("", "--findbad", dest="findbad", action="store_true",
- help="Re-run findbad on the nodes we're going to check before acting.")
-parser.add_option("", "--force", dest="force", action="store_true",
- help="Force action regardless of previous actions/logs.")
-parser.add_option("", "--rins", dest="rins", action="store_true",
- help="Set the boot_state to 'rins' for all nodes.")
-parser.add_option("", "--reboot", dest="reboot", action="store_true",
- help="Actively try to reboot the nodes, keeping a log of actions.")
-
-parser.add_option("", "--verbose", dest="verbose", action="store_true",
- help="Extra debug output messages.")
-parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
- help="Do not perform the orginary setup phase.")
-parser.add_option("", "--skip", dest="skip",
- help="Number of machines to skip on the input queue.")
-parser.add_option("", "--timewait", dest="timewait",
- help="Minutes to wait between iterations of 10 nodes.")
-
-parser = parsermodule.getParser(['defaults'], parser)
-config = parsermodule.parse_args(parser)
-
-# COLLECT nodegroups, nodes and node lists
-if config.nodegroup:
- ng = api.GetNodeGroups({'name' : config.nodegroup})
- nodelist = api.GetNodes(ng[0]['node_ids'])
- hostnames = [ n['hostname'] for n in nodelist ]
-
-if config.site:
- site = api.GetSites(config.site)
- l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
- hostnames = [ n['hostname'] for n in l_nodes ]
-
-if config.node or config.nodelist:
- if config.node: hostnames = [ config.node ]
- else: hostnames = util.file.getListFromFile(config.nodelist)
-
-fbquery = FindbadNodeRecord.get_all_latest()
-fb_nodelist = [ n.hostname for n in fbquery ]
-
-if config.nodeselect:
- hostnames = node_select(config.nodeselect, fb_nodelist)
-
-if config.findbad:
- # rerun findbad with the nodes in the given nodes.
- file = "findbad.txt"
- util.file.setFileFromList(file, hostnames)
- os.system("./findbad.py --cachenodes --increment --nodelist %s" % file)
- # TODO: shouldn't we reload the node list now?
-
-l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
-# commands:
-i = 1
-count = 1
-#print "hosts: %s" % hostnames
-for host in hostnames:
-
- #if 'echo' in host or 'hptest-1' in host: continue
-
- try:
- try:
- node = api.GetNodes(host)[0]
- except:
- print traceback.print_exc();
- print "FAILED GETNODES for host: %s" % host
- continue
-
- print "%-2d" % i, nodegroup_display(node, fb)
- i += 1
- if i-1 <= int(config.skip): continue
- if host in l_blacklist:
- print "%s is blacklisted. Skipping." % host
- continue
-
- if config.stopselect:
- dict_query = query_to_dict(config.stopselect)
- fbnode = fb['nodes'][host]['values']
- observed_state = get_current_state(fbnode)
-
- if verify(dict_query, fbnode) and observed_state != "dbg ":
- # evaluates to true, therefore skip.
- print "%s evaluates true for %s ; skipping..." % ( config.stopselect, host )
- try:
- # todo: clean up act_all record here.
- # todo: send thank you, etc.
- mailmonitor.reboot(host)
- except Exception, e:
- print traceback.print_exc(); print e
-
- continue
- #else:
- #print "%s failed to match %s: -%s-" % ( host, dict_query, observed_state )
- #sys.exit(1)
-
- if not config.force and rebootlog.find(host, {'action' : ".*reboot"}, 60*60*2):
- print "recently rebooted %s. skipping... " % host
- continue
-
- if config.reboot:
-
- fbnode = fb['nodes'][host]['values']
- observed_state = get_current_state(fbnode)
-
- if observed_state == "dbg ":
- o = RebootDebug(fbnode)
-
- elif observed_state == "boot" :
- if config.rins:
- l = set_node_to_rins(host, fb)
- if l: rebootlog.add(l)
-
- o = RebootBoot(fbnode)
-
- elif observed_state == "down":
- if config.rins:
- l = set_node_to_rins(host, fb)
- if l: rebootlog.add(l)
-
- o = RebootDown(fbnode)
-
-
- if o.direct(host):
- record = {'observation' : "DIRECT_SUCCESS: %s" % observed_state,
- 'action' : o.action,
- 'model' : "none",
- 'time' : time.time()}
- elif o.pcu(host):
- record = {'observation' : "PCU_SUCCESS: %s" % observed_state,
- 'action' : o.action,
- 'model' : "none",
- 'time' : time.time()}
- elif o.mail(host):
- record = {'observation' : "MAIL_SUCCESS: %s" % observed_state,
- 'action' : o.action,
- 'model' : "none",
- 'time' : time.time()}
- else:
- record = {'observation' : "REBOOT_FAILED: %s" % observed_state,
- 'action' : "log failure",
- 'model' : "none",
- 'time' : time.time()}
-
- print "ALL METHODS OF RESTARTING %s FAILED" % host
- args = {}
- args['hostname'] = host
- #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
- # "CANNOT CONTACT", False, db='suspect_persistmessages')
- #m.reset()
- #m.send(['monitor-list@lists.planet-lab.org'])
-
- l = Log(host, record)
- print l
- rebootlog.add(l)
- except KeyboardInterrupt:
- print "Killed by interrupt"
- sys.exit(0)
- except:
- print traceback.print_exc();
- print "Continuing..."
-
- time.sleep(1)
- if count % 10 == 0:
- print "Saving rebootlog"
- database.dbDump("rebootlog", rebootlog)
- wait_time = int(config.timewait)
- print "Sleeping %d minutes" % wait_time
- ti = 0
- print "Minutes slept: ",
- sys.stdout.flush()
- while ti < wait_time:
- print "%s" % ti,
- sys.stdout.flush()
- time.sleep(60)
- ti = ti+1
-
- count = count + 1
-
-print "Saving rebootlog"
-database.dbDump("rebootlog", rebootlog)
from monitor.wrapper import rt
from monitor.wrapper import plc
from monitor.policy import *
+from monitor.database.info.model import *
api = plc.getAuthAPI()
if len(l_nodes) == 0:
raise Exception("No such host: %s" % hostname)
- l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
- l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : [])
+ q_blacklist = BlacklistRecord.query.all()
+ l_blacklist = [ n.hostname for n in q_blacklist ]
l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
if len(l_nodes) == 0:
raise Exception("Host removed via blacklist: %s" % hostname)
import time
import struct
-from pcucontrol import reboot
-
+from monitor import reboot
from monitor import util
from monitor import database
from monitor.wrapper import plc, plccache
-from datetime import datetime
-from monitor.model import PersistFlags
+from datetime import datetime, timedelta
+from monitor.model import Message
+from monitor.database.info import HistoryNodeRecord
esc = struct.pack('i', 27)
RED = esc + "[1;31m"
now = time.time()
if timestamp == None:
return "unknown"
+ if type(timestamp) == type(datetime.now()):
+ timestamp = time.mktime(timestamp.timetuple())
if abstime:
diff = now - timestamp
else:
node['pcu'] = "PCU"
node['lastupdate'] = diff_time(node['last_contact'])
- pf = PersistFlags(node['hostname'], 1, db='node_persistflags')
+ pf = HistoryNodeRecord.get_by(hostname=node['hostname'])
try:
node['lc'] = diff_time(pf.last_changed)
except:
l_nodes = node_select(config.nodeselect, node_list, None)
return l_nodes
+
+def email_exception(content=None):
+ import config
+ from monitor.model import Message
+ import traceback
+ msg=traceback.format_exc()
+ if content:
+ msg = content + "\n" + msg
+ m=Message("exception running monitor", msg, False)
+ m.send([config.cc_email])
+ return
+
+def changed_lessthan(last_changed, days):
+ if datetime.now() - last_changed <= timedelta(days):
+ #print "last changed less than %s" % timedelta(days)
+ return True
+ else:
+ #print "last changed more than %s" % timedelta(days)
+ return False
+
+def changed_greaterthan(last_changed, days):
+ if datetime.now() - last_changed > timedelta(days):
+ #print "last changed more than %s" % timedelta(days)
+ return True
+ else:
+ #print "last changed less than %s" % timedelta(days)
+ return False
+
+def found_between(recent_actions, action_type, lower, upper):
+ return found_before(recent_actions, action_type, upper) and found_within(recent_actions, action_type, lower)
+
+def found_before(recent_actions, action_type, within):
+ for action in recent_actions:
+ if action_type == action.action_type and \
+ action.date_created < (datetime.now() - timedelta(within)):
+ return True
+ return False
+
+def found_within(recent_actions, action_type, within):
+ for action in recent_actions:
+ #print "%s - %s %s > %s - %s (%s) ==> %s" % (action.loginbase, action.action_type, action.date_created, datetime.now(), timedelta(within), datetime.now()-timedelta(within), action.date_created > (datetime.now() - timedelta(within)) )
+ if action_type == action.action_type and \
+ action.date_created > (datetime.now() - timedelta(within)):
+ #datetime.now() - action.date_created < timedelta(within):
+ # recent action of given type.
+ #print "%s found_within %s in recent_actions from %s" % (action_type, timedelta(within), action.date_created)
+ return True
+
+ print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within) )
+ return False
from monitor.database.info.action import *
from monitor.database.info.findbad import *
from monitor.database.info.history import *
+from monitor.database.info.plc import *
setup_all()
from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
from elixir import options_defaults, using_options, setup_all, has_one
from elixir import String, Integer, DateTime, PickleType, Boolean
+from elixir.ext.versioned import *
from datetime import datetime,timedelta
import elixir
import traceback
# issue_type = ManyToMany('IssueType')
# actions = OneToMany('ActionRecord', order_by='-date_created')
+class BlacklistRecord(Entity):
+ date_created = Field(DateTime,default=datetime.now)
+ hostname = Field(String,default=None)
+ loginbase = Field(String,default=None)
+ expires = Field(Integer,default=0) # seconds plus
+ acts_as_versioned(['hostname'])
+
+ @classmethod
+ def getLoginbaseBlacklist(cls):
+ # TODO: need to sort on 'round' since actions will not be globally sync'd.
+ return cls.query.filter(cls.loginbase!=None).order_by(cls.loginbase.desc())
+
+ @classmethod
+ def getHostnameBlacklist(cls):
+ # TODO: need to sort on 'round' since actions will not be globally sync'd.
+ return cls.query.filter(cls.hostname!=None).order_by(cls.hostname.desc())
+
+ def neverExpires(self):
+ if self.expires == 0:
+ return True
+ else:
+ return False
+
+ def expired(self):
+ if self.neverExpires():
+ return False
+ else:
+ if self.date_created + timedelta(0,self.expires) > datetime.now():
+ return True
+ else:
+ return False
+
+ def willExpire(self):
+ if self.neverExpires():
+ return "never"
+ else:
+ return self.date_created + timedelta(0, self.expires)
class ActionRecord(Entity):
@classmethod
# ACCOUNTING
date_created = Field(DateTime,default=datetime.now)
+ loginbase = Field(String,default=None)
hostname = Field(String,default=None)
- loginbase = Field(String)
+ # NOTE:
+ # the expected kinds of actions are:
+ # * reboot node
+ # * open ticket, send notice
+ # * close ticket
+ # * apply penalty to site
+ # * backoff penalty to site
+ action = Field(String)
+
+ # NOTE: describes the kind of action. i.e. online-notice, offline-notice,
+ # reboot-first-try, reboot-second-try, penalty-pause, penalty-warning, penalty-no-create,
+ # penalty-disable-slices,
+ action_type = Field(String, default=None)
+
+ message_id = Field(Integer, default=0)
+ penalty_level = Field(Integer, default=0)
+
+ # NOTE: in case an exception is thrown while trying to perform an action.
+ error_string = Field(String, default=None)
#issue = ManyToOne('IssueRecord')
# NOTE: this is the parent relation to fb records. first create the
# OR
# - find fbnode records
# - create action record with fbnodes as argument
- findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked')
+ # findbad_records = OneToMany('FindbadNodeRecord', order_by='-date_checked')
# NOTE: can I move 'message_index, escellation_level, and penalty_level'
# into the same value? Maybe not penalty level, since there are only two;
# and, there may be additional message and escellation levels.
- send_email_to = Field(PickleType, default=None)
- action_description = Field(PickleType, default=None)
- message_arguments = Field(PickleType, default=None)
+ #send_email_to = Field(PickleType, default=None)
+ #action_description = Field(PickleType, default=None)
+ #message_arguments = Field(PickleType, default=None)
# NOTE: not sure this needs to be in the db.
- escellation_level = Field(Integer, default=0)
- stage = Field(String, default=None)
+ #escellation_level = Field(Integer, default=0)
+ #stage = Field(String, default=None)
from datetime import datetime,timedelta
import elixir
import traceback
+from elixir.ext.versioned import *
from monitor.database.dborm import mon_metadata, mon_session
__metadata__ = mon_metadata
__session__ = mon_session
-class FindbadNodeRecordSync(Entity):
- hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname')
- round = Field(Int,default=0)
+#class FindbadNodeRecordSync(Entity):
+# hostname = Field(String(250),primary_key=True) #,alternateMethodName='by_hostname')
+# round = Field(Int,default=0)
-class FindbadPCURecordSync(Entity):
- plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid')
- round = Field(Int,default=0)
+#class FindbadPCURecordSync(Entity):
+# plc_pcuid = Field(Int,primary_key=True) #,alternateMethodName='by_pcuid')
+# round = Field(Int,default=0)
class FindbadNodeRecord(Entity):
@classmethod
def get_all_latest(cls):
- fbsync = FindbadNodeRecordSync.get_by(hostname="global")
- if fbsync:
- return cls.query.filter_by(round=fbsync.round)
- else:
- return []
+ return cls.query.all()
+ #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+ #if fbsync:
+ # return cls.query.filter_by(round=fbsync.round)
+ #else:
+ # return []
@classmethod
def get_latest_by(cls, **kwargs):
- fbsync = FindbadNodeRecordSync.get_by(hostname="global")
- if fbsync:
- kwargs['round'] = fbsync.round
- return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc())
- else:
- return []
+ return cls.query.filter_by(**kwargs).first()
+ #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+ #if fbsync:
+ # kwargs['round'] = fbsync.round
+ # return cls.query.filter_by(**kwargs).order_by(FindbadNodeRecord.date_checked.desc())
+ #else:
+ # return []
@classmethod
def get_latest_n_by(cls, n=3, **kwargs):
- fbsync = FindbadNodeRecordSync.get_by(hostname="global")
- kwargs['round'] = fbsync.round
- ret = []
- for i in range(0,n):
- kwargs['round'] = kwargs['round'] - i
- f = cls.query.filter_by(**kwargs).first()
- if f:
- ret.append(f)
- return ret
+ return cls.query.filter_by(**kwargs)
+ #fbsync = FindbadNodeRecordSync.get_by(hostname="global")
+ #kwargs['round'] = fbsync.round
+ #ret = []
+ #for i in range(0,n):
+ # kwargs['round'] = kwargs['round'] - i
+ # f = cls.query.filter_by(**kwargs).first()
+ # if f:
+ # ret.append(f)
+ #return ret
# ACCOUNTING
date_checked = Field(DateTime,default=datetime.now)
round = Field(Int,default=0)
- hostname = Field(String,default=None)
+ hostname = Field(String,primary_key=True,default=None)
loginbase = Field(String)
# INTERNAL
observed_category = Field(String,default=None)
observed_status = Field(String,default=None)
+ acts_as_versioned(ignore=['date_checked'])
# NOTE: this is the child relation
- action = ManyToOne('ActionRecord', required=False)
+ #action = ManyToOne('ActionRecord', required=False)
class FindbadPCURecord(Entity):
@classmethod
def get_all_latest(cls):
- fbsync = FindbadPCURecordSync.get_by(plc_pcuid=0)
- if fbsync:
- return cls.query.filter_by(round=fbsync.round)
- else:
- return []
+ return cls.query.all()
@classmethod
def get_latest_by(cls, **kwargs):
- fbsync = FindbadPCURecordSync.get_by(plc_pcuid=0)
- kwargs['round'] = fbsync.round
- return cls.query.filter_by(**kwargs).order_by(FindbadPCURecord.date_checked.desc())
+ return cls.query.filter_by(**kwargs).first()
+
# ACCOUNTING
date_checked = Field(DateTime)
round = Field(Int,default=0)
# INTERNAL
# INFERRED
reboot_trial_status = Field(String)
+
+ acts_as_versioned(ignore=['date_checked'])
from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
from elixir import options_defaults, using_options, setup_all
from elixir import String, Integer as Int, DateTime, Boolean
+from elixir.ext.versioned import *
+
from datetime import datetime,timedelta
from monitor.database.dborm import mon_metadata, mon_session
last_checked = Field(DateTime,default=datetime.now)
last_changed = Field(DateTime,default=datetime.now)
status = Field(String,default="unknown")
+ acts_as_versioned(ignore=['last_changed', 'last_checked'])
@classmethod
def by_hostname(cls, hostname):
last_valid = Field(DateTime,default=None)
valid = Field(String,default="unknown")
+ acts_as_versioned(ignore=['last_changed', 'last_checked'])
+
@classmethod
def by_pcuid(cls, pcuid):
return cls.query.filter_by(pcuid=pcuid).first()
+
class HistorySiteRecord(Entity):
loginbase = Field(String(250),primary_key=True)
status = Field(String,default="unknown")
+ message_id = Field(Int, default=0)
+ message_status = Field(String, default=None)
+ message_queue = Field(String, default=None)
+ message_created = Field(DateTime, default=None)
+
+ penalty_level = Field(Int, default=0)
+ penalty_applied = Field(Boolean, default=False)
+ acts_as_versioned(ignore=['last_changed', 'last_checked'])
+
@classmethod
def by_loginbase(cls, loginbase):
return cls.query.filter_by(loginbase=loginbase).first()
--- /dev/null
+import bootman # debug nodes
+
+from monitor import reboot
+from monitor.common import *
+from monitor.model import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.wrapper.emailTxt import mailtxt
+from monitor.database.info.model import *
+
+class SiteInterface(HistorySiteRecord):
+ @classmethod
+ def get_or_make(cls, if_new_set={}, **kwargs):
+ if 'hostname' in kwargs:
+ kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs['hostname']]
+ del kwargs['hostname']
+ res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs)
+ return SiteInterface(res)
+
+ def __init__(self, sitehist):
+ self.db = sitehist
+
+ def getRecentActions(self, **kwargs):
+ # TODO: make query only return records within a certin time range,
+ # i.e. greater than 0.5 days ago. or 5 days, etc.
+
+ #print "kwargs: ", kwargs
+
+ recent_actions = []
+ if 'loginbase' in kwargs:
+ recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc())
+ elif 'hostname' in kwargs:
+ recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc())
+ return recent_actions
+
+ def increasePenalty(self):
+ #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',)
+ self.db.penalty_level += 1
+ # NOTE: this is to prevent overflow or index errors in applyPenalty.
+ # there's probably a better approach to this.
+ if self.db.penalty_level >= 2:
+ self.db.penalty_level = 2
+ self.db.penalty_applied = True
+
+ def applyPenalty(self):
+ penalty_map = []
+ penalty_map.append( { 'name': 'noop', 'enable' : lambda site: None,
+ 'disable' : lambda site: None } )
+ penalty_map.append( { 'name': 'nocreate', 'enable' : lambda site: plc.removeSiteSliceCreation(site),
+ 'disable' : lambda site: plc.enableSiteSliceCreation(site) } )
+ penalty_map.append( { 'name': 'suspendslices', 'enable' : lambda site: plc.suspendSiteSlices(site),
+ 'disable' : lambda site: plc.enableSiteSlices(site) } )
+
+ for i in range(len(penalty_map)-1,self.db.penalty_level,-1):
+ print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
+ penalty_map[i]['disable'](self.db.loginbase)
+
+ for i in range(0,self.db.penalty_level+1):
+ print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase)
+ penalty_map[i]['enable'](self.db.loginbase)
+
+ return
+
+ def pausePenalty(self):
+ act = ActionRecord(loginbase=self.db.loginbase,
+ action='penalty',
+ action_type='pause_penalty',)
+
+ def clearPenalty(self):
+ #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='clear_penalty',)
+ self.db.penalty_level = 0
+ self.db.penalty_applied = False
+
+ def getTicketStatus(self):
+ if self.db.message_id != 0:
+ rtstatus = mailer.getTicketStatus(self.db.message_id)
+ self.db.message_status = rtstatus['Status']
+ self.db.message_queue = rtstatus['Queue']
+ self.db.message_created = datetime.fromtimestamp(rtstatus['Created'])
+
+ def setTicketStatus(self, status):
+ print 'SETTING status %s' % status
+ if self.db.message_id != 0:
+ rtstatus = mailer.setTicketStatus(self.db.message_id, status)
+
+ def getContacts(self):
+ contacts = []
+ if self.db.penalty_level >= 0:
+ contacts += plc.getTechEmails(self.db.loginbase)
+
+ if self.db.penalty_level >= 1:
+ contacts += plc.getPIEmails(self.db.loginbase)
+
+ if self.db.penalty_level >= 2:
+ contacts += plc.getSliceUserEmails(self.db.loginbase)
+
+ return contacts
+
+ def sendMessage(self, type, **kwargs):
+
+ # NOTE: evidently changing an RT message's subject opens the ticket.
+		# the logic in this policy depends upon a ticket only being 'open'
+ # if a user has replied to it.
+ # So, to preserve these semantics, we check the status before
+ # sending, then after sending, reset the status to the
+ # previous status.
+ # There is a very tiny race here, where a user sends a reply
+ # within the time it takes to check, send, and reset.
+ # This sucks. It's almost certainly fragile.
+
+ #
+ # TODO: catch any errors here, and add an ActionRecord that contains
+ # those errors.
+
+ args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level}
+ args.update(kwargs)
+
+ hostname = None
+ if 'hostname' in args:
+ hostname = args['hostname']
+
+ if hasattr(mailtxt, type):
+
+ message = getattr(mailtxt, type)
+ viart = True
+ if 'viart' in kwargs:
+ viart = kwargs['viart']
+
+ if viart:
+ self.getTicketStatus() # get current message status
+
+ m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)
+
+ contacts = self.getContacts()
+ contacts = [config.cc_email] # TODO: remove after testing...
+
+ print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname)
+
+ ret = m.send(contacts)
+ if viart:
+ self.db.message_id = ret
+ # reset to previous status, since a new subject 'opens' RT tickets.
+ self.setTicketStatus(self.db.message_status)
+
+ # NOTE: only make a record of it if it's in RT.
+ act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice',
+ action_type=type, message_id=self.db.message_id)
+
+ else:
+ print "+-- WARNING! ------------------------------"
+ print "| No such message name in emailTxt.mailtxt: %s" % type
+ print "+------------------------------------------"
+
+ return
+
+ def closeTicket(self):
+ # TODO: close the rt ticket before overwriting the message_id
+ mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor")
+ act = ActionRecord(loginbase=self.db.loginbase, action='notice',
+ action_type='close_ticket', message_id=self.db.message_id)
+ self.db.message_id = 0
+ self.db.message_status = "new"
+
+ def runBootManager(self, hostname):
+ print "attempting BM reboot of %s" % hostname
+ ret = ""
+ try:
+ ret = bootman.restore(self, hostname)
+ err = ""
+ except:
+ err = traceback.format_exc()
+ print err
+
+ act = ActionRecord(loginbase=self.db.loginbase,
+ hostname=hostname,
+ action='reboot',
+ action_type='bootmanager_restore',
+ error_string=err)
+ return ret
+
+ def attemptReboot(self, hostname):
+ print "attempting PCU reboot of %s" % hostname
+ err = ""
+ try:
+ ret = reboot.reboot_str(hostname)
+ except Exception, e:
+ err = traceback.format_exc()
+ ret = str(e)
+
+ if ret == 0 or ret == "0":
+ ret = ""
+
+ act = ActionRecord(loginbase=self.db.loginbase,
+ hostname=hostname,
+ action='reboot',
+ action_type='first_try_reboot',
+ error_string=err)
+
from monitor.database.info.action import *
from monitor.database.info.findbad import *
from monitor.database.info.history import *
+from monitor.database.info.plc import *
from monitor.database.dborm import mon_session as session
--- /dev/null
+from elixir import Entity, Field, OneToMany, ManyToOne, ManyToMany
+from elixir import options_defaults, using_options, setup_all
+from elixir import PickleType, String, Integer, DateTime, Boolean
+from elixir.ext.versioned import *
+
+from datetime import datetime,timedelta
+
+from monitor.database.dborm import mon_metadata, mon_session
+__metadata__ = mon_metadata
+__session__ = mon_session
+
+class PlcSite(Entity):
+ site_id = Field(Integer,primary_key=True)
+ loginbase = Field(String,default=None)
+ date_checked = Field(DateTime,default=datetime.now)
+
+ plc_site_stats = Field(PickleType,default=None)
+ acts_as_versioned(ignore=['date_checked'])
+
+class PlcNode(Entity):
+ node_id = Field(Integer,primary_key=True)
+ hostname = Field(String,default=None)
+ date_checked = Field(DateTime,default=datetime.now)
+
+ plc_node_stats = Field(PickleType,default=None)
+ acts_as_versioned(ignore=['date_checked'])
+
+class PlcPCU(Entity):
+ pcu_id = Field(Integer,primary_key=True)
+ date_checked = Field(DateTime,default=datetime.now)
+
+ plc_pcu_stats = Field(PickleType,default=None)
+ acts_as_versioned(ignore=['date_checked'])
else:
print "takeAction: increasing penalty for %s"%self.hostname
pp.increase()
+
+ print "takeAction: applying penalty to %s as index %s"% (self.hostname, index)
pp.index = index
pp.apply(self.hostname)
pp.save()
#### APPLY PENALTY
if ( record.data['take_action'] and diag['Squeeze'] ):
- print "action: taking action"
+ print "action: taking squeeze action"
record.takeAction(record.data['penalty_level'])
del diag['Squeeze']
if diag.getFlag('BackOff'):
+ print "action: taking backoff action"
record.takeAction(0)
del diag['BackOff']
--- /dev/null
+#!/usr/bin/python
+#
+# Reboot specified nodes
+#
+
+import getpass, getopt
+import os, sys
+import xml, xmlrpclib
+import errno, time, traceback
+import urllib2
+import urllib
+import threading, popen2
+import array, struct
+import base64
+from subprocess import PIPE, Popen
+import pcucontrol.transports.ssh.pxssh as pxssh
+import pcucontrol.transports.ssh.pexpect as pexpect
+import socket
+
+# Use our versions of telnetlib and pyssh
+sys.path.insert(0, os.path.dirname(sys.argv[0]))
+import pcucontrol.transports.telnetlib as telnetlib
+sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")
+import pcucontrol.transports.pyssh as pyssh
+
+from monitor import config
+from monitor.wrapper import plc
+
+from pcucontrol.util import command
+from pcucontrol.reboot import pcu_name, model_to_object, reboot_api, convert_oldmodelname_to_newmodelname, reboot_test_new
+
+
+# Event class ID from pcu events
+#NODE_POWER_CONTROL = 3
+
+# Monitor user ID
+#MONITOR_USER_ID = 11142
+
+import logging
+logger = logging.getLogger("monitor")
+verbose = 1
+#dryrun = 0;
+
+def get_pcu_values(pcu_id):
+ from monitor.database.info.model import FindbadPCURecord
+ print "pcuid: %s" % pcu_id
+ try:
+ pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id)
+ if pcurec:
+ values = pcurec.to_dict()
+ else:
+ values = None
+ except:
+ values = None
+
+ return values
+
+def reboot(nodename):
+ return reboot_policy(nodename, True, False)
+
+def reboot_str(nodename):
+ global verbose
+ continue_probe = True
+ dryrun=False
+
+ pcu = plc.getpcu(nodename)
+ if not pcu:
+ logger.debug("no pcu for %s" % nodename)
+ print "no pcu for %s" % nodename
+ return "%s has no pcu" % nodename
+
+ values = get_pcu_values(pcu['pcu_id'])
+ if values == None:
+ logger.debug("No values for pcu probe %s" % nodename)
+ print "No values for pcu probe %s" % nodename
+ return "no info for pcu_id %s" % pcu['pcu_id']
+
+ # Try the PCU first
+ logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
+
+ ret = reboot_test_new(nodename, values, verbose, dryrun)
+ return ret
+
+def reboot_policy(nodename, continue_probe, dryrun):
+ global verbose
+
+ pcu = plc.getpcu(nodename)
+ if not pcu:
+ logger.debug("no pcu for %s" % nodename)
+ print "no pcu for %s" % nodename
+ return False # "%s has no pcu" % nodename
+
+ values = get_pcu_values(pcu['pcu_id'])
+ if values == None:
+ logger.debug("No values for pcu probe %s" % nodename)
+ print "No values for pcu probe %s" % nodename
+ return False #"no info for pcu_id %s" % pcu['pcu_id']
+
+ # Try the PCU first
+ logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
+
+ ret = reboot_test_new(nodename, values, verbose, dryrun)
+
+ if ret != 0:
+ print ret
+ return False
+ else:
+ print "return true"
+ return True
+
+def main():
+ logger.setLevel(logging.DEBUG)
+ ch = logging.StreamHandler()
+ ch.setLevel(logging.DEBUG)
+ formatter = logging.Formatter('LOGGER - %(message)s')
+ ch.setFormatter(formatter)
+ logger.addHandler(ch)
+
+ try:
+ if "test" in sys.argv:
+ dryrun = True
+ else:
+ dryrun = False
+
+ for node in sys.argv[1:]:
+ if node == "test": continue
+
+ print "Rebooting %s" % node
+ if reboot_policy(node, True, dryrun):
+ print "success"
+ else:
+ print "failed"
+ except Exception, err:
+ import traceback; traceback.print_exc()
+ from monitor.common import email_exception
+ email_exception(node)
+ print err
+
+if __name__ == '__main__':
+ logger = logging.getLogger("monitor")
+ main()
+ f = open("/tmp/rebootlog", 'a')
+ f.write("reboot %s\n" % sys.argv)
+ f.close()
import socket
from pcucontrol import reboot
-from monitor import util
-from monitor.util import command
+from pcucontrol.util import command
from monitor import config
from monitor.database.info.model import *
syncclass = None
primarykey = 'hostname'
- def __init__(self, round):
+ def __init__(self, round=1):
self.round = round
self.count = 1
try:
if values is None:
return
-
- fbnodesync = self.syncclass.findby_or_create(
- if_new_set={'round' : self.round},
+
+ if self.syncclass:
+ fbnodesync = self.syncclass.findby_or_create(
+ #if_new_set={'round' : self.round},
**{ self.primarykey : nodename})
# NOTE: This code will either add a new record for the new self.round,
# OR it will find the previous value, and update it with new information.
# The data that is 'lost' is not that important, b/c older
# history still exists.
fbrec = self.recordclass.findby_or_create(
- **{'round':self.round, self.primarykey:nodename})
+ **{ self.primarykey:nodename})
fbrec.set( **values )
fbrec.flush()
- fbnodesync.round = self.round
- fbnodesync.flush()
+ if self.syncclass:
+ fbnodesync.round = self.round
+ fbnodesync.flush()
print "%d %s %s" % (self.count, nodename, values)
self.count += 1
class ScanNodeInternal(ScanInterface):
recordclass = FindbadNodeRecord
- syncclass = FindbadNodeRecordSync
+ #syncclass = FindbadNodeRecordSync
+ syncclass = None
primarykey = 'hostname'
def collectNMAP(self, nodename, cohash):
#### RUN NMAP ###############################
values = {}
- nmap = util.command.CMD()
+ nmap = command.CMD()
print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename
(oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename)
# NOTE: an empty / error value for oval, will still work.
echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
echo "}"
- EOF """)
+EOF """)
values['ssh_error'] = errval
if len(oval) > 0:
return (nodename, values)
def internalprobe(hostname):
- fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
- if_new_set={'round' : 1})
- scannode = ScanNodeInternal(fbsync.round)
+ #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
+ # if_new_set={'round' : 1})
+ scannode = ScanNodeInternal() # fbsync.round)
try:
(nodename, values) = scannode.collectInternal(hostname, {})
scannode.record(None, (nodename, values))
return False
def externalprobe(hostname):
- fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
- if_new_set={'round' : 1})
- scannode = ScanNodeInternal(fbsync.round)
+ #fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global",
+ # if_new_set={'round' : 1})
+ scannode = ScanNodeInternal() # fbsync.round)
try:
(nodename, values) = scannode.collectNMAP(hostname, {})
scannode.record(None, (nodename, values))
class ScanPCU(ScanInterface):
recordclass = FindbadPCURecord
- syncclass = FindbadPCURecordSync
+ syncclass = None
primarykey = 'plc_pcuid'
def collectInternal(self, pcuname, cohash):
#### RUN NMAP ###############################
if continue_probe:
- nmap = util.command.CMD()
+ nmap = command.CMD()
print "nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])
(oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']))
# NOTE: an empty / error value for oval, will still work.
###### DRY RUN ############################
- if 'node_ids' in values['plc_pcu_stats'] and \
+ if continue_probe and 'node_ids' in values['plc_pcu_stats'] and \
len(values['plc_pcu_stats']['node_ids']) > 0:
rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0],
values, 1, True)
print "____________________________________"
errors['traceback'] = traceback.format_exc()
print errors['traceback']
- values['reboot_trial_status'] = errors['traceback']
+ values['reboot_trial_status'] = str(errors['traceback'])
+ print values
values['entry_complete']=" ".join(values['entry_complete'])
with PlanetLab.
""")
+ pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""",
+
+"""As part of PlanetLab node monitoring and maintenance, we tried to use the PCU
+registered for %(hostname)s, but could not for some reason.
+
+Please help.
+
+Thank you very much for your help,
+ -- PlanetLab Central (support@planet-lab.org)
+""")
+ online_notice=("""MONTEST: Host %(hostname)s is online""",
+ """
+This notice is simply to let you know that:
+ %(hostname)s
+
+is online and operational. Thank you very much for your help!
+ """)
+ test_notice=("""MONTEST: Host %(hostname)s is testing""",
+ """
+This notice is simply to test whether notices work.
+ %(hostname)s
+
+Thank you very much for your help!
+ """)
+ retry_bootman=("""MONTEST: Running BootManager on %(hostname)s""",
+ """
+This notice is simply to let you know that:
+ %(hostname)s
+
+appears stuck in a debug mode. To try to correct this, we're trying to rerun BootManager.py.
+If any action is needed from you, you will receive additional notices. Thank you!
+ """)
+ down_notice=("""MONTEST: Host %(hostname)s is down""",
+ """
+This notice is simply to let you know that:
+ %(hostname)s
+
+is down, disconnected from the network and/or non-operational. Please investigate, thank you very much for your help!
+ """)
+
+ clear_penalty=("""MONTEST: All penalties have been cleared from site %(loginbase)s""",
+ """
+This notice is to let you know that any penalties previously applied to your site have
+been removed: %(penalty_level)s.
+
+All privileges have been restored. If your slices were disabled, please allow
+up to 30 minutes for them to return to enabled.
+
+Legend:
+
+ 0 - no penalties applied
+ 1 - site is disabled. no new slices can be created.
+ 2+ - all existing slices will be disabled.
+ """)
+
+ increase_penalty=("""MONTEST: Penalty increased for site %(loginbase)s""",
+ """
+This notice is to let you know that the penalty applied to your site has
+increased: %(penalty_level)s.
+
+Legend:
+
+ 0 - no penalty applied
+ 1 - site is disabled. no new slices can be created.
+ 2+ - all existing slices will be disabled.
+ """)
+
+ newbootcd_notice=("""MONTEST: Host %(hostname)s needs a new BootImage""", """
+As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD:
+
+ %(hostname)s
+
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
+
+Thank you for your help,
+ -- PlanetLab Central (support@planet-lab.org)
+""")
+
nmreset =("""NM Reset at %(loginbase)s""",
"""
Monitor restarted NM on the following machines:
-- PlanetLab Central (support@planet-lab.org)
""")
- newalphacd_one=(""" Planetlab nodes need a new BootCD: %(loginbase)s""",
-"""As part of PlanetLab node monitoring, we noticed that your machines needs a new BootCD to fully support your hardware:
+ newalphacd_notice=("""MONTEST: New Boot Images for %(hostname)s""",
+"""As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine. This means that it is so new that it needs a new BootCD, or that it is so old that it is no longer supported.
-%(hostname_list)s
+ %(hostname)s
To make this process as simple as possible, we have created All-in-One boot images that include the node configuration file.
# TODO: need reminder versions for repeats...
newdown=[newdown_one, newdown_two, newdown_three]
newbootcd=[newbootcd_one, newbootcd_two, newbootcd_three]
- newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
+ #newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
newthankyou=[thankyou,thankyou,thankyou]
pcuthankyou=[pcuthankyou_one,pcuthankyou_one,pcuthankyou_one]
NMReset=[nmreset,nmreset,nmreset]
pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one]
pcudown=[pcudown_one, pcudown_one, pcudown_one]
- unknownsequence = ("""Unrecognized Error on PlanetLab host %(hostname)s""",
+ unknownsequence_notice = ("""MONTEST: Unrecognized Error on PlanetLab host %(hostname)s""",
"""
While trying to automatically recover this machine:
donation_down = [ donation_down_one, donation_down_one, donation_down_one ]
- minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""",
+ minimalhardware_notice = ("""MONTEST: Hardware requirements not met on PlanetLab host %(hostname)s""",
"""
While trying to automatically recover this machine:
%(bmlog)s
""" )
- baddisk = ("""Bad Disk on PlanetLab node %(hostname)s""",
+ baddisk_notice = ("""MONTEST: Bad Disk on PlanetLab node %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node.
Please verify the integrity of the disk, and order a replacement if needed. If you need to schedule downtime for the node, please let us know at support@planet-lab.org.
%(bmlog)s
""")
- plnode_cfg=(""" Please Update Configuration file for PlanetLab node %(hostname)s""",
+ nodeconfig_notice=("""MONTEST: Please Update Configuration file for PlanetLab node %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME. This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade. To resolve the issue we require your assistance. All that is needed is to visit:
https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
""")
- baddns=("""Planetlab node down: broken DNS configuration for %(hostname)s""",
+ baddns_notice=("""MONTEST: Planetlab node down: broken DNS configuration for %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries.
%(hostname)s
try:
from monitor import config
debug = config.debug
+ XMLRPC_SERVER=config.API_SERVER
except:
debug = False
+ # NOTE: this host is used by default when there are no auth files.
+ XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/"
+
logger = logging.getLogger("monitor")
class Auth:
'AuthMethod' : 'password',
'AuthString' : password}
-# NOTE: this host is used by default when there are no auth files.
-XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/"
# NOTE: by default, use anonymous access, but if auth files are
# configured, use them, with their auth definitions.
auth = Auth()
auth.server = XMLRPC_SERVER
-api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
+global_error_count = 0
class PLC:
def __init__(self, auth, url):
if method is None:
raise AssertionError("method does not exist")
- return lambda *params : method(self.auth, *params)
+ try:
+ return lambda *params : method(self.auth, *params)
+ except ProtocolError:
+ traceback.print_exc()
+ global_error_count += 1
+ if global_error_count >= 10:
+ print "maximum error count exceeded; exiting..."
+ sys.exit(1)
+ else:
+ print "%s errors have occurred" % global_error_count
+ raise Exception("ProtocolError continuing")
def __repr__(self):
return self.api.__repr__()
+api = PLC(auth.auth, auth.server)
+
class CachedPLC(PLC):
def _param_to_str(self, name, *params):
except Exception, exc:
logger.info("nodePOD: %s" % exc)
+'''
+Freeze all site slices.
+'''
+def suspendSiteSlices(loginbase):
+ api = xmlrpclib.Server(auth.server, verbose=False)
+ for slice in slices(loginbase):
+ logger.info("Suspending slice %s" % slice)
+ try:
+ if not debug:
+ api.AddSliceAttribute(auth.auth, slice, "enabled", "0")
+ except Exception, exc:
+ logger.info("suspendSlices: %s" % exc)
+
'''
Freeze all site slices.
'''
except Exception, exc:
logger.info("suspendSlices: %s" % exc)
+def enableSiteSlices(loginbase):
+ api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
+ for slice in slices(loginbase):
+ logger.info("Enabling slices %s" % slice)
+ try:
+ if not debug:
+ slice_list = api.GetSlices(auth.auth, {'name': slice}, None)
+ if len(slice_list) == 0:
+ return
+ slice_id = slice_list[0]['slice_id']
+ l_attr = api.GetSliceAttributes(auth.auth, {'slice_id': slice_id}, None)
+ for attr in l_attr:
+ if "enabled" == attr['name'] and attr['value'] == "0":
+ logger.info("Deleted enable=0 attribute from slice %s" % slice)
+ api.DeleteSliceAttribute(auth.auth, attr['slice_attribute_id'])
+ except Exception, exc:
+ logger.info("enableSiteSlices: %s" % exc)
+ print "exception: %s" % exc
+
def enableSlices(nodename):
api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
for slice in slices(siteId(nodename)):
# logger.info("Suspending slice %s" % slice)
# api.SliceAttributeAdd(auth.auth, slice, "plc_slice_state", {"state" : "suspended"})
#
+def enableSiteSliceCreation(loginbase):
+ api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
+ try:
+ logger.info("Enabling slice creation for site %s" % loginbase)
+ if not debug:
+ logger.info("\tcalling UpdateSite(%s, enabled=True)" % loginbase)
+ api.UpdateSite(auth.auth, loginbase, {'enabled': True})
+ except Exception, exc:
+ print "ERROR: enableSiteSliceCreation: %s" % exc
+ logger.info("ERROR: enableSiteSliceCreation: %s" % exc)
+
def enableSliceCreation(nodename):
api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
try:
print "ERROR: enableSliceCreation: %s" % exc
logger.info("ERROR: enableSliceCreation: %s" % exc)
+'''
+Removes site's ability to create slices by disabling the site.
+'''
+def removeSiteSliceCreation(sitename):
+ print "removeSiteSliceCreation(%s)" % sitename
+ api = xmlrpclib.Server(auth.server, verbose=False)
+ try:
+ logger.info("Removing slice creation for site %s" % sitename)
+ if not debug:
+ api.UpdateSite(auth.auth, sitename, {'enabled': False})
+ except Exception, exc:
+ logger.info("removeSiteSliceCreation: %s" % exc)
+
'''
Removes ability to create slices. Returns previous max_slices
'''
import sys
from monitor.wrapper import plc
-from monitor import database
-from monitor import config
+from monitor.database.info.model import *
def dsites_from_lsites(l_sites):
d_sites = {}
hn2lb[hostname] = login_base
return (dsn, hn2lb, lb2hn)
-def create_netid2ip(l_nodes, l_nodenetworks):
- netid2ip = {}
- for node in l_nodes:
- for netid in node['nodenetwork_ids']:
- found = False
- for nn in l_nodenetworks:
- if nn['nodenetwork_id'] == netid:
- found = True
- netid2ip[netid] = nn['ip']
- if not found:
- print "ERROR! %s" % node
-
- return netid2ip
-
l_sites = None
l_nodes = None
l_pcus = None
-l_nodenetworks = None
plcdb_hn2lb = None
plcdb_lb2hn = None
-plcdb_netid2ip = None
plcdb_id2lb = None
def init():
global l_sites
global l_nodes
global l_pcus
- global l_nodenetworks
global plcdb_hn2lb
global plcdb_lb2hn
- global plcdb_netid2ip
global plcdb_id2lb
- api = plc.getCachedAuthAPI()
- l_sites = api.GetSites({'peer_id':None},
- ['login_base', 'site_id', 'abbreviated_name', 'latitude',
- 'longitude', 'max_slices', 'slice_ids', 'node_ids', 'enabled' ])
- l_nodes = api.GetNodes({'peer_id':None},
- ['hostname', 'node_id', 'ports', 'site_id', 'version', 'last_updated',
- 'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
- l_pcus = api.GetPCUs()
- l_nodenetworks = api.GetNodeNetworks()
+ dbsites = PlcSite.query.all()
+ l_sites = [ s.plc_site_stats for s in dbsites ]
+
+ dbnodes = PlcNode.query.all()
+ l_nodes = [ s.plc_node_stats for s in dbnodes ]
+
+ dbpcus = PlcPCU.query.all()
+ l_pcus = [ s.plc_pcu_stats for s in dbpcus ]
(d_sites,id2lb) = dsites_from_lsites(l_sites)
(plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
- netid2ip = create_netid2ip(l_nodes, l_nodenetworks)
plcdb_hn2lb = hn2lb
plcdb_lb2hn = lb2hn
- plcdb_netid2ip = netid2ip
plcdb_id2lb = id2lb
- return l_nodes
-
-
-def create_plcdb():
-
- # get sites, and stats
- l_sites = plc.getSites({'peer_id':None}, ['login_base', 'site_id', 'abbreviated_name', 'latitude', 'longitude',
- 'max_slices', 'slice_ids', 'node_ids' ])
- if len(l_sites) == 0:
- print "no sites! exiting..."
- sys.exit(1)
- (d_sites,id2lb) = dsites_from_lsites(l_sites)
+ return
+
+def GetNodesByIds(ids):
+ ret = []
+ for node_id in ids:
+ node = PlcNode.get_by(node_id=node_id)
+ ret.append(node.plc_node_stats)
+ return ret
+
+def GetNodesBySite(loginbase):
+ site = PlcSite.get_by(loginbase=loginbase)
+ return GetNodesByIds(site.plc_site_stats['node_ids'])
+
+def GetNodeByName(hostname):
+ node = PlcNode.get_by(hostname=hostname)
+ return node.plc_node_stats
+
+def GetSitesByName(sitelist):
+ ret = []
+ for site in sitelist:
+ site = PlcSite.get_by(loginbase=site)
+ ret.append(site.plc_site_stats)
+ return ret
+
+def sync():
+ l_sites = plc.api.GetSites({'peer_id':None},
+ ['login_base', 'site_id', 'abbreviated_name', 'latitude',
+ 'longitude', 'max_slices', 'slice_ids', 'node_ids',
+ 'enabled', 'date_created' ])
+ l_nodes = plc.api.GetNodes({'peer_id':None},
+ ['hostname', 'node_id', 'ports', 'site_id',
+ 'version', 'last_updated', 'date_created',
+ 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
+ l_pcus = plc.api.GetPCUs()
+
+ print "sync sites"
+ for site in l_sites:
+ dbsite = PlcSite.findby_or_create(site_id=site['site_id'])
+ dbsite.loginbase = site['login_base']
+ dbsite.date_checked = datetime.now()
+ dbsite.plc_site_stats = site
+ #dbsite.flush()
+ # TODO: delete old records.
+ session.flush()
+
+ print "sync nodes"
+ for node in l_nodes:
+ dbnode = PlcNode.findby_or_create(node_id=node['node_id'])
+ dbnode.hostname = node['hostname']
+ dbnode.date_checked = datetime.now()
+ dbnode.plc_node_stats = node
+ #dbnode.flush()
+ # TODO: delete old records.
+ session.flush()
+
+ print "sync pcus"
+ for pcu in l_pcus:
+ dbpcu = PlcPCU.findby_or_create(pcu_id=pcu['pcu_id'])
+ dbpcu.date_checked = datetime.now()
+ dbpcu.plc_pcu_stats = pcu
+ #dbpcu.flush()
+ # TODO: delete old records.
+ session.flush()
- # get nodes at each site, and
- l_nodes = plc.getNodes({'peer_id':None}, ['hostname', 'node_id', 'ports', 'site_id', 'version',
- 'last_updated', 'date_created', 'last_contact', 'pcu_ids', 'nodenetwork_ids'])
+ init()
- l_nodenetworks = plc.getNodeNetworks()
- (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
- netid2ip = create_netid2ip(l_nodes, l_nodenetworks)
-
- # save information for future.
- id2lb = id2lb
- hn2lb = hn2lb
- db = plcdb
-
- if ('cachenodes' in dir(config) and config.cachenodes) or \
- 'cachenodes' not in dir(config):
- database.dbDump("plcdb_hn2lb", hn2lb)
- database.dbDump("plcdb_lb2hn", lb2hn)
- database.dbDump("plcdb_netid2ip", netid2ip)
- database.dbDump("l_plcnodenetworks", l_nodenetworks)
- database.dbDump("l_plcnodes", l_nodes)
- database.dbDump("l_plcsites", l_sites)
-
- return l_nodes
+ return
if __name__ == '__main__':
- create_plcdb()
+ sync()
else:
- #print "calling plccache init()"
init()
round = 1
count = 0
+def main():
+ main2(config)
-def main(config):
+def main2(config):
l_plcnodes = plccache.l_nodes
l_nodes = get_nodeset(config)
checkAndRecordState(l_nodes, l_plcnodes)
+# Node states:
+
+def check_node_state(rec, node):
+
+ node_state = rec.observed_status
+ if rec.plc_node_stats:
+ boot_state = rec.plc_node_stats['boot_state']
+ last_contact = rec.plc_node_stats['last_contact']
+ else:
+ boot_state = "unknown"
+ last_contact = None
+
+ if boot_state == 'disable': boot_state = 'disabled'
+ if boot_state == 'diag': boot_state = 'diagnose'
+
+ # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need
+ # 'translations' into the node.status state
+ # 'BOOT' is a permanent state, but we want it to have a bit of
+ # hysteresis (less than 0.5 days)
+
+ #################################################################
+ # "Initialize" the findbad states into nodebad status if they are not already set
+
+ if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' :
+ print "changed status from %s to offline" % node.status
+ node.status = 'offline'
+ node.last_changed = datetime.now()
+
+ if node_state == 'DEBUG' and node.status != 'monitordebug' and \
+ node.status != 'disabled' and \
+ node.status != 'diagnose':
+ if boot_state != 'disabled' and boot_state != 'diagnose':
+
+ print "changed status from %s to monitordebug" % (node.status)
+ node.status = "monitordebug"
+ node.last_changed = datetime.now()
+ else:
+ print "changed status from %s to %s" % (node.status, boot_state)
+ node.status = boot_state
+ node.last_changed = datetime.now()
+
+ if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
+ print "changed status from %s to online" % node.status
+ node.status = 'online'
+ node.last_changed = datetime.now()
+
+ #################################################################
+	# Switch temporary hysteresis states into their 'firm' states.
+ # online -> good after half a day
+ # offline -> down after two days
+ # monitordebug -> down after 30 days
+ # diagnose -> monitordebug after 60 days
+ # disabled -> down after 60 days
+
+ if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
+ print "changed status from %s to good" % node.status
+ node.status = 'good'
+ # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+ if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
+ print "changed status from %s to down" % node.status
+ node.status = 'down'
+ # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+ if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
+ print "changed status from %s to down" % node.status
+ node.status = 'down'
+ # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+ if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
+		print "changed status from %s to monitordebug" % node.status
+ # NOTE: change an admin mode back into monitordebug after two months.
+ node.status = 'monitordebug'
+ node.last_changed = datetime.now()
+
+ # extreme cases of offline nodes
+ if ( boot_state == 'disabled' or last_contact == None ) and \
+ changed_greaterthan(node.last_changed, 2*30) and \
+ node.status != 'down':
+ print "changed status from %s to down" % node.status
+ node.status = 'down'
+ node.last_changed = datetime.now()
+
def checkAndRecordState(l_nodes, l_plcnodes):
global count
for nodename in l_nodes:
- d_node = None
- for node in l_plcnodes:
- if node['hostname'] == nodename:
- d_node = node
- break
- if not d_node:
- continue
- pf = HistoryNodeRecord.findby_or_create(hostname=nodename)
- pf.last_checked = datetime.now()
+ nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename,
+ if_new_set={'status' : 'offline',
+ 'last_changed' : datetime.now()})
+ nodehist.last_checked = datetime.now()
try:
# Find the most recent record
- noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==nodename).order_by(FindbadNodeRecord.date_checked.desc()).first()
- #print "NODEREC: ", noderec.date_checked
+ noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
except:
print "COULD NOT FIND %s" % nodename
import traceback
print "none object for %s"% nodename
continue
- node_state = noderec.observed_status
- if noderec.plc_node_stats:
- boot_state = noderec.plc_node_stats['boot_state']
- else:
- boot_state = "unknown"
-
- if node_state == "BOOT":
- if pf.status != "good":
- pf.last_changed = datetime.now()
- pf.status = "good"
- elif node_state == "DEBUG":
- if pf.status != boot_state:
- pf.last_changed = datetime.now()
- pf.status = boot_state
- else:
- if pf.status != "down":
- pf.last_changed = datetime.now()
- pf.status = "down"
+ check_node_state(noderec, nodehist)
count += 1
- print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(time.mktime(pf.last_changed.timetuple())))
+ print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
# NOTE: this commits all pending operations to the DB. Do not remove, or
# replace with another operations that also commits all pending ops, such
# as session.commit() or flush() or something
- print HistoryNodeRecord.query.count()
session.flush()
+ print HistoryNodeRecord.query.count()
return True
config = parsermodule.parse_args(parser)
try:
- main(config)
+ main2(config)
except Exception, err:
import traceback
print traceback.print_exc()
# given to GetNodes
nodelist = []
for h in hostlist:
- nodelist += api.GetNodes(h)
+ nodelist.append( plccache.GetNodeByName(h) )
- #nodelist = api.GetNodes(hostlist)
group_str = "Given"
elif config.site:
- site = api.GetSites(config.site)
+ site = plccache.GetSitesByName([config.site])
if len (site) > 0:
site = site[0]
- nodelist = api.GetNodes(site['node_ids'])
+ nodelist = plccache.GetNodesByIds(site['node_ids'])
else:
nodelist = []
elif config.nodeselect:
hostlist = node_select(config.nodeselect)
- nodelist = api.GetNodes(hostlist)
+ nodelist = [ plccache.GetNodeByName(h) for h in hostlist ]
group_str = "selection"
else:
ng = api.GetNodeGroups({'name' : config.nodegroup})
- nodelist = api.GetNodes(ng[0]['node_ids'])
+ nodelist = plccache.GetNodesByIds(ng[0]['node_ids'])
group_str = config.nodegroup
ng_nodes = nodelist
# Get all nodes
- all_nodes = api.GetNodes({'peer_id': None})
+ all_nodes = plccache.l_nodes
# remove ngnodes from all node list
ng_list = [ x['hostname'] for x in ng_nodes ]
i = 1
for node in nodelist:
print "%-2d" % i,
- fbrec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
+ fbrec = FindbadNodeRecord.get_latest_by(hostname=node['hostname'])
fbdata = fbrec.to_dict()
print nodegroup_display(node, fbdata, config)
i += 1
from monitor import util
from monitor import parser as parsermodule
-from monitor import database
-from pcucontrol import reboot
+from monitor.database.info.model import *
+from monitor import reboot
import time
from monitor.model import *
diff_time(plcnode['last_contact']), plcnode['key'])
def fb_print_nodeinfo(fbnode):
- pf = PersistFlags(fbnode['hostname'], 1, db='node_persistflags')
+ pf = HistoryNodeRecord.get_by(hostname= fbnode['hostname'])
try:
fbnode['last_change'] = diff_time(pf.last_changed)
except:
for node in config.args:
config.node = node
- plc_nodeinfo = api.GetNodes({'hostname': config.node}, None)[0]
+ plc_nodeinfo = plccache.GetNodeByName(config.node)
fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node)
fb_nodeinfo = fb_noderec.to_dict()
plc_print_nodeinfo(plc_nodeinfo)
import re
import string
-from pcucontrol import reboot
from monitor.wrapper import plc, plccache
api = plc.getAuthAPI()
-from monitor.database.info.model import FindbadNodeRecordSync, FindbadNodeRecord, FindbadPCURecord, session
+from monitor.database.info.model import FindbadNodeRecord, FindbadPCURecord, session
from monitor import util
from monitor import config
fbquery = FindbadNodeRecord.get_all_latest()
fb_nodelist = [ n.hostname for n in fbquery ]
if True:
+ # NOTE: this doesn't work when there are only a few records current.
+ # pcu_select should apply to all pcus globally, not just the most recent records.
fbpcuquery = FindbadPCURecord.get_all_latest()
fbpcu_list = [ p.plc_pcuid for p in fbpcuquery ]
#fbnodes = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname, orderBy='date_checked',distinct=True).reversed()
fb = None
- #reboot.fb = fbpcu
-
if config.nodelist:
nodelist = util.file.getListFromFile(config.nodelist)
else:
try:
# Find the most recent record
- fb_noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node).order_by(FindbadNodeRecord.date_checked.desc()).first()
+ fb_noderec = FindbadNodeRecord.get_latest_by(hostname=node)
except:
print traceback.print_exc()
pass
import sys
import string
import time
+import sets
from datetime import datetime,timedelta
from monitor import database
-from pcucontrol import reboot
+from monitor import reboot
from monitor import parser as parsermodule
from monitor import config
from monitor.database.info.model import HistoryPCURecord, FindbadPCURecord
api = plc.getAuthAPI()
-def main(config):
+def main():
+ main2(config)
+
+def main2(config):
l_plcpcus = plccache.l_pcus
l_pcus = None
- if config.pcu:
+ if config.site is not None:
+ site = plccache.GetSitesByName([config.site])
+ l_nodes = plccache.GetNodesByIds(site[0]['node_ids'])
+ pcus = []
+ for node in l_nodes:
+ pcus += node['pcu_ids']
+ # clear out dups.
+ l_pcus = [pcu for pcu in sets.Set(pcus)]
+
+ elif config.node:
+ l_nodes = plccache.GetNodeByName(config.node)
+ pcus = []
+ for node in l_nodes:
+ pcus += node['pcu_ids']
+ # clear out dups.
+ l_pcus = [pcu for pcu in sets.Set(pcus)]
+
+ elif config.pcu:
for pcu in l_plcpcus:
if ( pcu['hostname'] is not None and config.pcu in pcu['hostname'] ) or \
( pcu['ip'] is not None and config.pcu in pcu['ip'] ):
hn2lb = plccache.plcdb_hn2lb
+def check_pcu_state(rec, pcu):
+
+ pcu_state = rec.reboot_trial_status
+
+ if ( pcu_state == 'NetDown' or pcu_state == 'Not_Run' or not ( pcu_state == 0 or pcu_state == "0" ) ) and \
+ ( pcu.status == 'online' or pcu.status == 'good' ):
+ print "changed status from %s to offline" % pcu.status
+ pcu.status = 'offline'
+ pcu.last_changed = datetime.now()
+
+ if ( pcu_state == 0 or pcu_state == "0" ) and pcu.status not in [ 'online', 'good' ]:
+ print "changed status from %s to online" % pcu.status
+ pcu.status = 'online'
+ pcu.last_changed = datetime.now()
+
+ if pcu.status == 'online' and changed_greaterthan(pcu.last_changed, 0.5):
+ #send thank you notice, or on-line notice.
+ print "changed status from %s to good" % pcu.status
+ pcu.status = 'good'
+ # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+ if pcu.status == 'offline' and changed_greaterthan(pcu.last_changed, 2):
+ # send down pcu notice
+ print "changed status from %s to down" % pcu.status
+ pcu.status = 'down'
+ pcu.last_changed = datetime.now()
+
+ if ( pcu.status == 'offline' or pcu.status == 'down' ) and changed_greaterthan(pcu.last_changed, 2*30):
+ print "changed status from %s to down" % pcu.status
+ pcu.status = 'down'
+ pcu.last_changed = datetime.now()
+
def checkAndRecordState(l_pcus, l_plcpcus):
count = 0
for pcuname in l_pcus:
if not d_pcu:
continue
- pf = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id'])
- pf.last_checked = datetime.now()
+ pcuhist = HistoryPCURecord.findby_or_create(plc_pcuid=d_pcu['pcu_id'],
+ if_new_set={'status' : 'offline',
+ 'last_changed' : datetime.now()})
+ pcuhist.last_checked = datetime.now()
try:
# Find the most recent record
- pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).order_by(FindbadPCURecord.date_checked.desc()).first()
- print "NODEREC: ", pcurec.date_checked
+ pcurec = FindbadPCURecord.query.filter(FindbadPCURecord.plc_pcuid==pcuname).first()
except:
- print "COULD NOT FIND FB record for %s" % reboot.pcu_name(pcu)
+ print "COULD NOT FIND FB record for %s" % reboot.pcu_name(d_pcu)
import traceback
print traceback.print_exc()
# don't have the info to create a new entry right now, so continue.
continue
- pcu_state = pcurec.reboot_trial_status
- current_state = pcu_state
-
- if current_state == 0 or current_state == "0":
- if pf.status != "good":
- pf.last_changed = datetime.now()
- pf.status = "good"
- elif current_state == 'NetDown':
- if pf.status != "netdown":
- pf.last_changed = datetime.now()
- pf.status = "netdown"
- elif current_state == 'Not_Run':
- if pf.status != "badconfig":
- pf.last_changed = datetime.now()
- pf.status = "badconfig"
- else:
- if pf.status != "error":
- pf.last_changed = datetime.now()
- pf.status = "error"
+ if not pcurec:
+ print "none object for pcu %s"% reboot.pcu_name(d_pcu)
+ continue
+
+ check_pcu_state(pcurec, pcuhist)
count += 1
- print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pf.status, diff_time(time.mktime(pf.last_changed.timetuple())))
+ print "%d %35s %s since(%s)" % (count, reboot.pcu_name(d_pcu), pcuhist.status, diff_time(time.mktime(pcuhist.last_changed.timetuple())))
# NOTE: this commits all pending operations to the DB. Do not remove, or
# replace with another operations that also commits all pending ops, such
# as session.commit() or flush() or something
- print HistoryPCURecord.query.count()
session.flush()
+ print HistoryPCURecord.query.count()
return True
if __name__ == '__main__':
parser = parsermodule.getParser()
- parser.set_defaults(filename=None, pcu=None, pcuselect=False, pcugroup=None, cachepcus=False)
+ parser.set_defaults(filename=None, pcu=None, node=None, site=None, pcuselect=False, pcugroup=None, cachepcus=False)
parser.add_option("", "--pcu", dest="pcu", metavar="hostname",
help="Provide a single pcu to operate on")
+ parser.add_option("", "--site", dest="site", metavar="sitename",
+ help="Provide a single sitename to operate on")
+ parser.add_option("", "--node", dest="node", metavar="nodename",
+ help="Provide a single node to operate on")
parser.add_option("", "--pculist", dest="pculist", metavar="file.list",
help="Provide a list of files to operate on")
config = parsermodule.parse_args(parser)
try:
- main(config)
+ main2(config)
except Exception, err:
import traceback
- print traceback.print_exc()
+ traceback.print_exc()
print "Exception: %s" % err
sys.exit(0)
def run(self, node_port, dryrun):
print "RUNNING!!!!!!!!!!!!"
- if self.type == Transport.HTTPS or self.type == Transport.HTTP:
+ if self.transport.type == Transport.HTTPS or self.type == Transport.HTTP:
print "APC via http...."
return self.run_http_or_https(node_port, dryrun)
else:
else:
# TODO: also send message for https, since that doesn't work this way...
- if self.type == Transport.HTTPS:
+ if self.transport.type == Transport.HTTPS:
cmd = self.get_https_cmd()
- elif self.type == Transport.HTTP:
+ elif self.transport.type == Transport.HTTP:
cmd = self.get_http_cmd()
else:
raise ExceptionNoTransport("Unsupported transport for http command")
# NOTE: we may need to return software version, no model version to
# know which file to request on the server.
- if self.type == Transport.HTTP:
+ if self.transport.type == Transport.HTTP:
cmd = """curl -s --anyauth --user '%s:%s' http://%s/about.htm """ + \
""" | sed -e "s/<[^>]*>//g" -e "s/ //g" -e "/^$/d" """ + \
""" | grep -E "AP[[:digit:]]+" """
#""" | grep -E "v[[:digit:]].*" """
- elif self.type == Transport.HTTPS:
+ elif self.transport.type == Transport.HTTPS:
cmd = """curl -s --insecure --user '%s:%s' https://%s/about.htm """ + \
""" | sed -e "s/<[^>]*>//g" -e "s/ //g" -e "/^$/d" """ + \
""" | grep -E "AP[[:digit:]]+" """
def logout(self):
# NOTE: log out again, to allow other uses to access the machine.
- if self.type == Transport.HTTP:
+ if self.transport.type == Transport.HTTP:
cmd = """curl -s --anyauth --user '%s:%s' http://%s/logout.htm """ + \
""" | grep -E '^[^<]+' """
- elif self.type == Transport.HTTPS:
+ elif self.transport.type == Transport.HTTPS:
cmd = """curl -s --insecure --user '%s:%s' http://%s/logout.htm """ + \
""" | grep -E '^[^<]+' """
else:
from pcucontrol.reboot import *
class BayTechRPC3NC(PCUControl):
+ supported_ports = [22,23]
def run_telnet(self, node_port, dryrun):
return self.run_ssh(node_port, dryrun)
return 0
class BayTechRPC16(PCUControl):
+ supported_ports = [22,23]
def run_telnet(self, node_port, dryrun):
return self.run_ssh(node_port, dryrun)
def run_ssh(self, node_port, dryrun):
indefinitely, unless you send a Ctrl-C after the password. No idea
why.
"""
+ supported_ports = [22]
def run_ssh(self, node_port, dryrun):
print "BayTechCtrlC %s" % self.host
if index == 0:
print "3"
s.send("3\r\n")
+ time.sleep(5)
index = s.expect(["DS-RPC>", "Enter user name:"])
if index == 1:
s.send(self.username + "\r\n")
+ time.sleep(5)
index = s.expect(["DS-RPC>"])
if index == 0:
indefinitely, unless you send a Ctrl-C after the password. No idea
why.
"""
+ supported_ports = [22]
def run_ssh(self, node_port, dryrun):
print "BayTechCtrlC %s" % self.host
"-o PasswordAuthentication=yes "+\
"-o PubkeyAuthentication=no"
s = pxssh.pxssh()
- if not s.login(self.host, self.username, self.password, ssh_options,
+ try:
+ if not s.login(self.host, self.username, self.password, ssh_options,
original_prompts="Dell", login_timeout=Transport.TELNET_TIMEOUT):
- raise ExceptionPassword("Invalid Password")
-
- print "logging in..."
+ raise ExceptionPassword("Invalid Password")
+ except pexpect.EOF:
+ raise ExceptionPrompt("Disconnect before login prompt")
+
+ print "logging in... %s" % self.host
s.send("\r\n\r\n")
try:
# Testing Reboot ?
print "RUNCMD: %s" % output
if verbose:
- logger.debug(output)
+ print output
return 0
except Exception, err:
- logger.debug("runcmd raised exception %s" % err)
- if verbose:
- logger.debug(err)
- return err
+ print "runcmd raised exception %s" % err
+ return str(err)
from pcucontrol.reboot import *
+from distutils.sysconfig import get_python_lib;
class HPiLO(PCUControl):
supported_ports = [22,443]
locfg = command.CMD()
- cmd_str = config.MONITOR_SCRIPT_ROOT + "/pcucontrol/models/hpilo/"
+ cmd_str = get_python_lib(1) + "/pcucontrol/models/hpilo/"
cmd = cmd_str + "locfg.pl -s %s -f %s -u %s -p '%s' | grep 'MESSAGE' | grep -v 'No error'" % (
self.host, cmd_str+"iloxml/Get_Network.xml",
s.close()
if e[0] == errno.ECONNREFUSED:
# cannot connect to remote host
- raise Exception(e[1])
+ raise ExceptionNotFound(e[1])
+ elif e[0] == errno.ETIMEDOUT:
+ raise ExceptionTimeout(e[1])
else:
# TODO: what other conditions are there?
raise Exception(e)
print "Current status is '%s'" % ret
if ret == '':
- raise Exception("Status returned 'another session already open' %s : %s" % (node_port, ret))
+ raise Exception("Status returned 'another session already open' on %s %s : %s" % (self.host, node_port, ret))
if node_port < len(ret):
status = ret[node_port]
elif status == '0':
# down
power_on = False
+ elif status == '6':
+ raise ExceptionPort("IPAL reported 'Cable Error' on %s socket %s : %s" % (self.host, node_port, ret))
else:
- raise Exception("Unknown status for PCU socket %s : %s" % (node_port, ret))
+ raise Exception("Unknown status for PCU %s socket %s : %s" % (self.host, node_port, ret))
else:
- raise Exception("Mismatch between configured port and PCU status: %s %s" % (node_port, ret))
+ raise Exception("Mismatch between configured port and PCU %s status: %s %s" % (self.host, node_port, ret))
if not dryrun:
elif status == '0':
# down
power_on = False
+ elif status == '6':
+ raise ExceptionPort("IPAL reported 'Cable Error' on %s socket %s : %s" % (self.host, node_port, ret))
else:
- raise Exception("Unknown status for PCU socket %s : %s" % (node_port, ret))
+ raise Exception("Unknown status for PCU %s socket %s : %s" % (self.host, node_port, ret))
else:
- raise Exception("Mismatch between configured port and PCU status: %s %s" % (node_port, ret))
+ raise Exception("Mismatch between configured port and PCU %s status: %s %s" % (self.host, node_port, ret))
if power_on:
return 0
req.add_header("Authorization", authheader)
# add data to handler,
f = urllib2.urlopen(req, data)
- if self.verbose: print f.read()
+ if self.transport.verbose: print f.read()
except:
import traceback; traceback.print_exc()
# fetch url one more time on cmd.html, econtrol.html or whatever.
# pass
else:
- if self.verbose: print f.read()
+ if self.transport.verbose: print f.read()
return 0
# NOTE: it doesn't seem to matter whether this authinfo is here or not.
transport = urllib2.build_opener(authinfo)
f = transport.open(self.url)
- if self.verbose: print f.read()
+ if self.transport.verbose: print f.read()
if not dryrun:
transport = urllib2.build_opener(authhandler)
f = transport.open(self.url + "cmd.html", "P%d=r" % node_port)
- if self.verbose: print f.read()
+ if self.transport.verbose: print f.read()
self.transport.close()
return 0
# NOTE: it doesn't seem to matter whether this authinfo is here or not.
transport = urllib2.build_opener()
f = transport.open(self.url + "elogin.html", "pwd=%s" % self.password)
- if self.verbose: print f.read()
+ if self.transport.verbose: print f.read()
if not dryrun:
transport = urllib2.build_opener(authhandler)
f = transport.open(self.url + "econtrol.html", "P%d=r" % node_port)
- if self.verbose: print f.read()
+ if self.transport.verbose: print f.read()
# data= "P%d=r" % node_port
#self.open(self.host, self.username, self.password)
void DisplayOemDefinedCapabilities(uint32 OemDefinedCapabilities);
bool ExecuteGetSystemPowerstate(Soap *server, bool verbose = true);
bool ExecuteGetRemoteControlCapabilities(Soap *server, bool verbose = true);
-bool ExecuteRemoteControl(Soap *server, bool default_val = false);
+bool ExecuteRemoteControl(Soap *server, bool default_val = false, uint8 icommand=Reset);
bool MainFlow(Soap *server,int option,bool verbose);
bool ValidateOption(char *option, int *parameter);
{
return status;
}
- if ((status = ExecuteRemoteControl(server,true)) == false)
+ /* Ensure that the machine is powered up before trying to
+ * 'reset' it, since a reset on a down node will fail. */
+ if ((status = ExecuteRemoteControl(server,true,PowerUp)) == false)
+ {
+ return status;
+ }
+ if ((status = ExecuteRemoteControl(server,true,Reset)) == false)
{
return status;
}
* true - on success
* false - on failure
*/
-bool ExecuteRemoteControl(Soap* server,bool def_values)
+bool ExecuteRemoteControl(Soap* server,bool def_values, uint8 icommand)
{
int res;
bool status = true;
_rci__RemoteControlResponse response;
// example values
- uint8 *command = new uint8(Reset);
+ uint8 *command = new uint8(icommand);
uint32 *ianaOemNumber = new uint32(IntelIanaNumber);
uint8 *specialCommand = NULL; //none
uint16 *oemParameter = NULL; //none
import urllib
import threading, popen2
import array, struct
-from monitor.wrapper import plc
import base64
from subprocess import PIPE, Popen
import pcucontrol.transports.ssh.pxssh as pxssh
import pcucontrol.transports.ssh.pexpect as pexpect
import socket
-from monitor.util import command
+
# Use our versions of telnetlib and pyssh
import pcucontrol.transports.telnetlib as telnetlib
sys.path.insert(0, os.path.dirname(sys.argv[0]) + "/pyssh")
import pcucontrol.transports.pyssh as pyssh
-from monitor import config
-
# Event class ID from pcu events
#NODE_POWER_CONTROL = 3
#MONITOR_USER_ID = 11142
import logging
-logger = logging.getLogger("monitor")
verbose = 1
#dryrun = 0;
transport.set_debuglevel(self.verbose)
if username is not None:
self.transport = transport
- self.transport.ifThenSend(prompt, username, ExceptionUsername)
+ self.ifThenSend(prompt, username, ExceptionUsername)
elif self.type == self.SSH:
if username is not None:
print r
except urllib2.URLError,err:
- logger.info('Could not open http connection', err)
+ print 'Could not open http connection', err
return "http transport error"
return 0
def reboot(self, node_port, dryrun):
port_list = []
+ # There are two sources of potential ports. Those that are open and
+ # those that are part of the PCU's supported_ports.
+ # I think we should start with supported_ports and then filter that
+ # by the open ports.
+
+ port_list = self.supported_ports
+
if hasattr(self, 'port_status') and self.port_status:
+ # get out the open ports
port_list = filter(lambda x: self.port_status[x] == "open" , self.port_status.keys())
port_list = [ int(x) for x in port_list ]
+ # take only the open ports that are supported_ports
+ port_list = filter(lambda x: x in self.supported_ports, port_list)
if port_list == []:
- raise ExceptionPort("Unsupported Port: No transport from open ports")
- else:
- port_list = self.supported_ports
+ raise ExceptionPort("No Open Port: No transport from open ports")
print port_list
- ret = "could not run"
+ ret = "No implementation for open ports on selected PCU model"
for port in port_list:
if port not in Transport.porttypemap:
continue
type = Transport.porttypemap[port]
self.transport = Transport(type, verbose)
+ print "checking for run_%s" % type
if hasattr(self, "run_%s" % type):
+ print "found run_%s" % type
fxn = getattr(self, "run_%s" % type)
ret = self.catcherror(fxn, node_port, dryrun)
if ret == 0: # NOTE: success!, so stop
except urllib2.URLError, err:
return "URLError: " + str(err)
except EOFError, err:
- if self.verbose:
- logger.debug("reboot: EOF")
- logger.debug(err)
self.transport.close()
import traceback
traceback.print_exc()
return "EOF connection reset" + str(err)
+ except Exception, err:
+ from monitor.common import email_exception
+ email_exception(self.host)
+ raise Exception(err)
+from pcucontrol.util import command
from pcucontrol.models import *
def pcu_name(pcu):
else:
return None
-def get_pcu_values(pcu_id):
- from monitor.database.info.model import FindbadPCURecord
- print "pcuid: %s" % pcu_id
- try:
- pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=pcu_id).first()
- if pcurec:
- values = pcurec.to_dict()
- else:
- values = None
- except:
- values = None
-
- return values
-
-def reboot(nodename):
- return reboot_policy(nodename, True, False)
-
-def reboot_str(nodename):
- global verbose
- continue_probe = True
- dryrun=False
-
- pcu = plc.getpcu(nodename)
- if not pcu:
- logger.debug("no pcu for %s" % nodename)
- print "no pcu for %s" % nodename
- return False # "%s has no pcu" % nodename
-
- values = get_pcu_values(pcu['pcu_id'])
- if values == None:
- logger.debug("No values for pcu probe %s" % nodename)
- print "No values for pcu probe %s" % nodename
- return False #"no info for pcu_id %s" % pcu['pcu_id']
-
- # Try the PCU first
- logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
-
- ret = reboot_test_new(nodename, values, verbose, dryrun)
- return ret
-
-def reboot_policy(nodename, continue_probe, dryrun):
- global verbose
-
- pcu = plc.getpcu(nodename)
- if not pcu:
- logger.debug("no pcu for %s" % nodename)
- print "no pcu for %s" % nodename
- return False # "%s has no pcu" % nodename
-
- values = get_pcu_values(pcu['pcu_id'])
- if values == None:
- logger.debug("No values for pcu probe %s" % nodename)
- print "No values for pcu probe %s" % nodename
- return False #"no info for pcu_id %s" % pcu['pcu_id']
-
- # Try the PCU first
- logger.debug("Trying PCU %s %s" % (pcu['hostname'], pcu['model']))
-
- ret = reboot_test_new(nodename, values, verbose, dryrun)
-
- if ret != 0:
- print ret
- return False
- else:
- print "return true"
- return True
-
class Unknown(PCUControl):
supported_ports = [22,23,80,443,5869,9100,16992]
print "UNKNOWN model %s"%modelname
return Unknown
-def reboot_api(node, pcu): #, verbose, dryrun):
+def reboot_api(node, pcu):
rb_ret = ""
try:
rb_ret = "No modelname in PCU record."
# TODO: how to handle the weird, georgetown pcus, the drac faults, and ilo faults
except Exception, err:
- rb_ret = str(err)
+ rb_ret = "Exception Model(%s): " % modelname
+ rb_ret += str(err)
return rb_ret
+def convert_oldmodelname_to_newmodelname(oldmodelname, pcu_id):
+	# Translate a legacy PCU model name from the PLC database into the
+	# class name used by the new pcucontrol model hierarchy.  Names with
+	# no entry in the table pass through unchanged.  A handful of
+	# specific pcu_ids are then hard-coded to variant classes,
+	# overriding the name-based mapping.
+	newmodelname = None
+	update = {	'AP79xx' : 'APCControl13p13',
+				'Masterswitch' : 'APCControl13p13',
+				'DS4-RPC' : 'BayTech',
+				'IP-41x_IP-81x' : 'IPAL',
+				'DRAC3' : 'DRAC',
+				'DRAC4' : 'DRAC',
+				'ePowerSwitch' : 'ePowerSwitchOld',
+				'ilo2' : 'HPiLO',
+				'ilo1' : 'HPiLO',
+				'PM211-MIP' : 'PM211MIP',
+				'AMT2.5' : 'IntelAMT',
+				'AMT3.0' : 'IntelAMT',
+				'WTI_IPS-4' : 'WTIIPS4',
+				'unknown' : 'ManualPCU',
+				'DRAC5' : 'DRAC',
+				'ipmi' : 'OpenIPMI',
+				'bbsemaverick' : 'BlackBoxPSMaverick',
+				'manualadmin' : 'ManualPCU',
+	}
+
+	if oldmodelname in update:
+		newmodelname = update[oldmodelname]
+	else:
+		newmodelname = oldmodelname
+
+	# Per-PCU overrides: these specific PCU ids need variant control
+	# classes regardless of the model name recorded for them.
+	# NOTE(review): hard-coded id lists will go stale -- presumably these
+	# encode port/prompt quirks of individual deployments; verify.
+	if pcu_id in [1102,1163,1055,1111,1231,1113,1127,1128,1148]:
+		newmodelname = 'APCControl12p3'
+	elif pcu_id in [1110,86]:
+		newmodelname = 'APCControl1p4'
+	elif pcu_id in [1221,1225,1220,1192]:
+		newmodelname = 'APCControl121p3'
+	elif pcu_id in [1173,1240,47,1363,1405,1401,1372,1371]:
+		newmodelname = 'APCControl121p1'
+	elif pcu_id in [1056,1237,1052,1209,1002,1008,1013,1022]:
+		newmodelname = 'BayTechCtrlC'
+	elif pcu_id in [93]:
+		newmodelname = 'BayTechRPC3NC'
+	elif pcu_id in [1057]:
+		newmodelname = 'BayTechCtrlCUnibe'
+	elif pcu_id in [1012]:
+		newmodelname = 'BayTechRPC16'
+	elif pcu_id in [1089, 1071, 1046, 1035, 1118]:
+		newmodelname = 'ePowerSwitchNew'
+
+	return newmodelname
+
+
def reboot_test_new(nodename, values, verbose, dryrun):
+	# Instantiate the PCU-control class for this PCU record and ask it to
+	# reboot the port associated with 'nodename'.  Returns 0 on success,
+	# otherwise an error string (or "Not_Run" when no model is known).
	rb_ret = ""
	if 'plc_pcu_stats' in values:
		values.update(values['plc_pcu_stats'])
	try:
-		modelname = values['model']
+		# Map the legacy DB model name to the new class-name scheme first.
+		modelname = convert_oldmodelname_to_newmodelname(values['model'], values['pcu_id'])
		if modelname:
-			object = eval('%s(values, verbose, ["22", "23", "80", "443", "9100", "16992", "5869"])' % modelname)
+			# NOTE(review): eval() of a DB-supplied model name -- data is
+			# semi-trusted today, but a dict lookup of classes would be safer.
+			object = eval('%s(values, verbose)' % modelname)
			rb_ret = object.reboot(values[nodename], dryrun)
		else:
			rb_ret = "Not_Run"

	return rb_ret
def main():
-	logger.setLevel(logging.DEBUG)
-	ch = logging.StreamHandler()
-	ch.setLevel(logging.DEBUG)
-	formatter = logging.Formatter('LOGGER - %(message)s')
-	ch.setFormatter(formatter)
-	logger.addHandler(ch)
-
-	try:
-		if "test" in sys.argv:
-			dryrun = True
-		else:
-			dryrun = False
-
-		for node in sys.argv[1:]:
-			if node == "test": continue
-
-			print "Rebooting %s" % node
-			if reboot_policy(node, True, dryrun):
-				print "success"
-			else:
-				print "failed"
-	except Exception, err:
-		import traceback; traceback.print_exc()
-		print err
+	# The old interactive reboot driver was removed along with
+	# reboot_policy(); this entry point is intentionally a stub now.
+	print "this does not work."
if __name__ == '__main__':
- logger = logging.getLogger("monitor")
main()
- f = open("/tmp/rebootlog", 'a')
- f.write("reboot %s\n" % sys.argv)
- f.close()
import signal
import time
import traceback
+import fcntl
DEBUG= 0
class ExceptionTimeout(Exception): pass
+class ExceptionReadTimeout(Exception): pass
COMMAND_TIMEOUT = 60
ssh_options = { 'StrictHostKeyChecking':'no',
'BatchMode':'yes',
'ConnectTimeout':'%s' % COMMAND_TIMEOUT}
class Sopen(subprocess.Popen):
-	def kill(self, signal = signal.SIGTERM):
-		os.kill(self.pid, signal)
+	def kill(self, sig = signal.SIGTERM):
+		# Send 'sig' (default SIGTERM) to the child.  The parameter was
+		# renamed from 'signal' so it no longer shadows the signal module,
+		# and a vanished child no longer raises.
+		try:
+			# NOTE: this also kills parent... so doesn't work like I want.
+			# NOTE: adding 'exec' before the cmd removes the extra sh, and
+			# partially addresses this problem.
+			#os.killpg(os.getpgid(self.pid), signal.SIGKILL)
+			os.kill(self.pid, sig)
+		except OSError:
+			# no such process, due to it already exiting...
+			pass
+
+
+def read_t(stream, count=1, timeout=COMMAND_TIMEOUT*2):
+	# Read from 'stream' with a select() timeout, raising
+	# ExceptionReadTimeout if nothing becomes readable.
+	# count == 1: loop reading single bytes until EOF and return the
+	# accumulated string.  The IOError/continue path only makes sense on a
+	# non-blocking stream -- the caller (CMD.run) sets O_NONBLOCK on the
+	# pipe before calling; TODO confirm for any other callers.
+	# count != 1: a single select() followed by one read(count).
+	if count == 1:
+		retstr = ""
+
+		while True:
+			lin, lout, lerr = select([stream], [], [], timeout)
+			if len(lin) == 0:
+				print "timeout!"
+				raise ExceptionReadTimeout("TIMEOUT reading from command")
-def read_t(stream, count, timeout=COMMAND_TIMEOUT*2):
-	lin, lout, lerr = select([stream], [], [], timeout)
-	if len(lin) == 0:
-		raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
+			try:
+				outbytes = stream.read(count)
+			except IOError, err:
+				print 'no content yet.'
+				# due to no content.
+				# the select timeout should catch this.
+				continue
-	return stream.read(count)
+			# Empty read on a readable fd means EOF.
+			if not outbytes:
+				break
+			retstr += outbytes
+
+		return retstr
+	else:
+		lin, lout, lerr = select([stream], [], [], timeout)
+		if len(lin) == 0:
+			raise ExceptionReadTimeout("TIMEOUT reading from command")
+
+		return stream.read(count)
class CMD:
def __init__(self):
def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2):
- #print "CMD.run_noexcept(%s)" % cmd
try:
return CMD.run(self,cmd,timeout)
except ExceptionTimeout:
print traceback.print_exc()
- return ("", "SCRIPTTIMEOUT")
+ return ("", "ScriptTimeout")
+ except ExceptionReadTimeout:
+ print traceback.print_exc()
+ return ("", "RunningScriptTimeout")
+ except KeyboardInterrupt:
+ print "Interrupted, exiting..."
+ sys.exit(1)
+ except Exception, err:
+ from monitor.common import email_exception
+ email_exception()
+ return ("", str(err))
def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
(o,e) = self.run(cmd, timeout)
def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
- #print "CMD.run(%s)" % cmd
s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
self.s = s
(f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
- #print "calling select(%s)" % timeout
lout, lin, lerr = select([f_out], [], [f_err], timeout)
- #print "TIMEOUT!!!!!!!!!!!!!!!!!!!"
if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
# Reached a timeout! Nuke process so it does not hang.
- #print "KILLING"
+ print "TIMEOUT!!!!!!!!!!!!!!!!!!!"
s.kill(signal.SIGKILL)
raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
else:
o_value = ""
e_value = ""
- o_value = f_out.read()
+ #o_value = f_out.read()
+ flags = fcntl.fcntl(f_out, fcntl.F_GETFL)
+ fcntl.fcntl(f_out, fcntl.F_SETFL, flags | os.O_NONBLOCK)
+
+ try:
+ o_value = read_t(f_out,1,30)
+ except ExceptionReadTimeout:
+ s.kill(signal.SIGKILL)
+ raise ExceptionReadTimeout("TIMEOUT: failed to read from cmd: %s" % cmd)
+
e_value = f_err.read()
- #print "striping output"
o_value = o_value.strip()
e_value = e_value.strip()
- #print "OUTPUT -%s-%s-" % (o_value, e_value)
-
- #print "closing files"
f_out.close()
f_in.close()
f_err.close()
- try:
- #print "s.kill()"
- s.kill()
- #print "after s.kill()"
- except OSError:
- # no such process, due to it already exiting...
- pass
+ s.kill(signal.SIGKILL)
- #print o_value, e_value
return (o_value, e_value)
def runargs(self, args, timeout=COMMAND_TIMEOUT*2):
f_out.close()
f_in.close()
f_err.close()
- try:
- s.kill()
- except OSError:
- # no such process, due to it already exiting...
- pass
+ s.kill(signal.SIGKILL)
return (o_value, e_value)
return CMD.run_noexcept(self, cmd)
def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2):
- cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(),
+ cmd = "exec ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(),
self.user, self.host, cmd)
- #print "SSH.run_noexcept2(%s)" % cmd
+ #print cmd
r = CMD.run_noexcept(self, cmd, timeout)
-
- # XXX: this may be resulting in deadlocks... not sure.
- #if self.s.returncode is None:
- # #self.s.kill()
- # self.s.kill(signal.SIGKILL)
- # self.s.wait()
- # self.ret = self.s.returncode
self.ret = -1
return r
--- /dev/null
+#!/usr/bin/python
+
+# This script is used to manipulate the operational state of nodes in
+# different node groups. These are basically set operations on nodes via the
+# PLC api.
+#
+# Take the nodegroup name as an argument...
+# optionally,
+# * get a list of nodes in the given nodegroup.
+# * set some or all in the set to rins.
+# * restart them all.
+# * do something else to them all.
+#
+
+import os
+import time
+import traceback
+import sys
+from optparse import OptionParser
+
+from monitor import config
+from monitor import parser as parsermodule
+from monitor.common import *
+from monitor.model import *
+from monitor.wrapper import plc
+from monitor.wrapper import plccache
+from monitor.database.info.model import *
+from monitor.database.info.interface import *
+
+from nodequery import verify,query_to_dict,node_select
+
+api = plc.getAuthAPI()
+
+def logic():
+	# NOTE(review): 'host' is not defined in this scope or at module level
+	# in this file -- this helper looks unfinished/dead; calling it would
+	# raise NameError.  Confirm before wiring it into the policy loop.
+
+	plc.nodeBootState(host, 'rins')
+	node_end_record(host)
+
+def main(hostnames, sitenames):
+	# Apply the escalation policy.  First pass: every node -- send online
+	# notices, attempt PCU reboots, retry bootmanager, send down notices,
+	# gated on how long the node's history record has held its status and
+	# on which actions were taken recently (all windows in days).  Second
+	# pass: every site -- raise, clear, or pause penalties based on site
+	# status.  Blacklisted (unexpired) nodes and sites are skipped.
+	# commands:
+	i = 1
+	node_count = 1
+	site_count = 1
+	#print "hosts: %s" % hostnames
+	for i,host in enumerate(hostnames):
+		try:
+			lb = plccache.plcdb_hn2lb[host]
+		except:
+			print "unknown host in plcdb_hn2lb %s" % host
+			continue
+
+		nodeblack = BlacklistRecord.get_by(hostname=host)
+
+		if nodeblack and not nodeblack.expired():
+			print "skipping %s due to blacklist. will expire %s" % (host, nodeblack.willExpire() )
+			continue
+
+		sitehist = SiteInterface.get_or_make(loginbase=lb)
+
+		recent_actions = sitehist.getRecentActions(hostname=host)
+
+		nodehist = HistoryNodeRecord.findby_or_create(hostname=host)
+
+		print "%s %s %s" % (i, nodehist.hostname, nodehist.status)
+		# Node recently came back: 'good' for < 1 day and no online notice
+		# in the last half day.
+		if nodehist.status == 'good' and \
+			changed_lessthan(nodehist.last_changed, 1.0) and \
+			not found_within(recent_actions, 'online_notice', 0.5):
+			# NOTE: there is a narrow window in which this command must be
+			# evaluated, otherwise the notice will not go out. this is not ideal.
+			sitehist.sendMessage('online_notice', hostname=host, viart=False)
+			print "send message for host %s online" % host
+
+			pass
+
+		# Node down for > 1 day and no reboot attempt logged in the
+		# (3.5, 1)-day window: try the PCU once.
+		if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+			changed_greaterthan(nodehist.last_changed,1.0) and \
+			not found_between(recent_actions, 'first_try_reboot', 3.5, 1):
+
+			sitehist.attemptReboot(host)
+			print "send message for host %s first_try_reboot" % host
+			pass
+
+		# NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1)
+		# will be false for a day after the above condition is satisfied
+		if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \
+			changed_greaterthan(nodehist.last_changed,1.5) and \
+			found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \
+			not found_within(recent_actions, 'pcufailed_notice', 3.5):
+			#	found_within(recent_actions, 'first_try_reboot', 3.5) and \
+
+			# send pcu failure message
+			#act = ActionRecord(**kwargs)
+			sitehist.sendMessage('pcufailed_notice', hostname=host)
+			print "send message for host %s PCU Failure" % host
+			pass
+
+		if nodehist.status == 'monitordebug' and \
+			changed_greaterthan(nodehist.last_changed, 1) and \
+			not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
+			# send down node notice
+			# delay 0.5 days before retrying...
+
+			print "send message for host %s bootmanager_restore" % host
+			sitehist.runBootManager(host)
+		#	sitehist.sendMessage('retry_bootman', hostname=host)
+
+		if nodehist.status == 'down' and \
+			changed_greaterthan(nodehist.last_changed, 2) and \
+			not found_within(recent_actions, 'down_notice', 3.5):
+			# send down node notice
+
+			sitehist.sendMessage('down_notice', hostname=host)
+			print "send message for host %s down" % host
+			pass
+
+		node_count = node_count + 1
+		session.flush()
+
+	for i,site in enumerate(sitenames):
+		sitehist = SiteInterface.get_or_make(loginbase=site)
+		siteblack = BlacklistRecord.get_by(loginbase=site)
+
+		if siteblack and not siteblack.expired():
+			print "skipping %s due to blacklist. will expire %s" % (site, siteblack.willExpire() )
+			continue
+
+		# TODO: make query only return records within a certain time range,
+		# i.e. greater than 0.5 days ago. or 5 days, etc.
+		recent_actions = sitehist.getRecentActions(loginbase=site)
+
+		print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status)
+		# Site down for a week with no recent penalty activity: escalate.
+		if sitehist.db.status == 'down':
+			if not found_within(recent_actions, 'pause_penalty', 30) and \
+				not found_within(recent_actions, 'increase_penalty', 7) and \
+				changed_greaterthan(sitehist.db.last_changed, 7):
+
+				# TODO: catch errors
+				sitehist.increasePenalty()
+				#sitehist.applyPenalty()
+				sitehist.sendMessage('increase_penalty')
+
+				print "send message for site %s penalty increase" % site
+
+		if sitehist.db.status == 'good':
+			# clear penalty
+			# NOTE: because 'all clear' should have an indefinite status, we
+			# have a boolean value rather than a 'recent action'
+			if sitehist.db.penalty_applied:
+				# send message that penalties are cleared.
+
+				sitehist.clearPenalty()
+				#sitehist.applyPenalty()
+				sitehist.sendMessage('clear_penalty')
+				sitehist.closeTicket()
+
+				print "send message for site %s penalty cleared" % site
+
+		# find all ticket ids for site ( could be on the site record? )
+		# determine if there are penalties within the last 30 days?
+		# if so, add a 'pause_penalty' action.
+		if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0:
+			# pause escalation
+			print "Pausing penalties for %s" % site
+			sitehist.pausePenalty()
+
+		site_count = site_count + 1
+
+		session.flush()
+
+	session.flush()
+	return
+
+
+if __name__ == "__main__":
+	# Command-line entry: build the node/site lists (optionally narrowed
+	# to one site or one node) and run the policy pass.
+	# NOTE(review): most of the declared options (rins, reboot, findbad,
+	# skip, timewait, ...) are never read by main() above -- presumably
+	# carried over from an older script; confirm before relying on them.
+	parser = parsermodule.getParser(['nodesets'])
+	parser.set_defaults( timewait=0,
+						skip=0,
+						rins=False,
+						reboot=False,
+						findbad=False,
+						force=False,
+						nosetup=False,
+						verbose=False,
+						quiet=False,)
+
+	parser.add_option("", "--stopselect", dest="stopselect", metavar="",
+						help="The select string that must evaluate to true for the node to be considered 'done'")
+	parser.add_option("", "--findbad", dest="findbad", action="store_true",
+						help="Re-run findbad on the nodes we're going to check before acting.")
+	parser.add_option("", "--force", dest="force", action="store_true",
+						help="Force action regardless of previous actions/logs.")
+	parser.add_option("", "--rins", dest="rins", action="store_true",
+						help="Set the boot_state to 'rins' for all nodes.")
+	parser.add_option("", "--reboot", dest="reboot", action="store_true",
+						help="Actively try to reboot the nodes, keeping a log of actions.")
+
+	parser.add_option("", "--verbose", dest="verbose", action="store_true",
+						help="Extra debug output messages.")
+	# NOTE(review): typo "orginary" in the help text below (runtime string,
+	# left untouched here).
+	parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
+						help="Do not perform the orginary setup phase.")
+	parser.add_option("", "--skip", dest="skip",
+						help="Number of machines to skip on the input queue.")
+	parser.add_option("", "--timewait", dest="timewait",
+						help="Minutes to wait between iterations of 10 nodes.")
+
+	parser = parsermodule.getParser(['defaults'], parser)
+	config = parsermodule.parse_args(parser)
+
+	fbquery = HistoryNodeRecord.query.all()
+	hostnames = [ n.hostname for n in fbquery ]
+
+	fbquery = HistorySiteRecord.query.all()
+	sitenames = [ s.loginbase for s in fbquery ]
+
+	if config.site:
+		# TODO: replace with calls to local db. the api fails so often that
+		# these calls should be regarded as unreliable.
+		l_nodes = plccache.GetNodesBySite(config.site)
+		filter_hostnames = [ n['hostname'] for n in l_nodes ]
+
+		hostnames = filter(lambda x: x in filter_hostnames, hostnames)
+		sitenames = [config.site]
+
+	if config.node:
+		hostnames = [ config.node ]
+		sitenames = [ plccache.plcdb_hn2lb[config.node] ]
+
+	try:
+		main(hostnames, sitenames)
+	except KeyboardInterrupt:
+		print "Killed by interrupt"
+		session.flush()
+		sys.exit(0)
+	except:
+		#email_exception()
+		print traceback.print_exc();
+		print "fail all..."
from distutils.core import setup
-packages=['monitor', 'monitor.database', 'monitor.database.zabbixapi',
- 'monitor.database.info', 'monitor.sources',
- 'monitor.util', 'monitor.wrapper' ]
+packages=[ 'monitor',
+ 'monitor.database',
+ 'monitor.database.zabbixapi',
+ 'monitor.database.info',
+ 'monitor.sources',
+ 'monitor.util',
+ 'monitor.wrapper' ]
print packages
setup(name='MonitorModule',
- version='1.1',
+ version='2.0',
description='Monitor Utility Module',
author='Stephen Soltesz',
author_email='soltesz@cs.princeton.edu',
)
packages=['pcucontrol',
+ 'pcucontrol.util',
'pcucontrol.transports',
'pcucontrol.transports.ssh',
'pcucontrol.transports.pyssh',
# TODO: add data dir for intelamt and hpilo stuff
print packages
setup(name='PCUControlModule',
- version='1.1',
+ version='2.0',
description='PCU Control Module',
author='Stephen Soltesz',
author_email='soltesz@cs.princeton.edu',
from datetime import datetime,timedelta
from monitor import database
-from pcucontrol import reboot
from monitor import parser as parsermodule
from monitor import config
-from monitor.database.info.model import HistorySiteRecord, FindbadNodeRecord, session
+from monitor.database.info.model import HistorySiteRecord, HistoryNodeRecord, session, BlacklistRecord
from monitor.wrapper import plc, plccache
from monitor.const import MINUP
if config.site:
l_sites = [config.site]
+ elif config.node:
+ l_sites = [plccache.plcdb_hn2lb[config.node]]
elif config.sitelist:
site_list = config.sitelist.split(',')
l_sites = site_list
checkAndRecordState(l_sites, l_plcsites)
-def getnewsite(nodelist):
- new = True
- for node in nodelist:
- try:
- noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
- if noderec is not None and \
- noderec.plc_node_stats['last_contact'] != None:
- new = False
- except:
- import traceback
- print traceback.print_exc()
- return new
-
def getnodesup(nodelist):
+	# Count the nodes in 'nodelist' that are not considered down.
+	# NOTE: a blacklisted (unexpired) node is assumed fine: since we are
+	# told to leave it alone, no policy actions should be taken for it.
	up = 0
	for node in nodelist:
		try:
-			noderec = FindbadNodeRecord.query.filter(FindbadNodeRecord.hostname==node['hostname']).order_by(FindbadNodeRecord.date_checked.desc()).first()
-			#noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==node['hostname'],
-			#						orderBy='date_checked').reversed()[0]
-			if noderec is not None and noderec.observed_status == "BOOT":
+			# "Up" here really means "not down": any non-'down' history
+			# status counts, as does an unexpired blacklist entry.
+			nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname'])
+			nodebl = BlacklistRecord.get_by(hostname=node['hostname'])
+			if (nodehist is not None and nodehist.status != 'down') or \
+				(nodebl is not None and not nodebl.expired()):
				up = up + 1
		except:
			import traceback
			print traceback.print_exc()
	return up
+def check_site_state(rec, sitehist):
+	# Update sitehist.status according to nodes_up, with 0.5 days of
+	# hysteresis: enough nodes up -> 'online' immediately, promoted to
+	# 'good' after the status has held for half a day; not enough nodes
+	# up -> 'offline', demoted to 'down' after half a day.  Recently
+	# created ("new") sites are pinned to status 'new' (and counted as
+	# penalized, since new sites are disabled by default) and are exempt
+	# from the offline/down branch.
+	# NOTE(review): the 'rec' parameter is never used in this body.
+
+	if sitehist.new and sitehist.status not in ['new', 'online', 'good']:
+		sitehist.status = 'new'
+		sitehist.penalty_applied = True	# because new sites are disabled by default, i.e. have a penalty.
+		sitehist.last_changed = datetime.now()
+
+	if sitehist.nodes_up >= MINUP:
+
+		if sitehist.status != 'online' and sitehist.status != 'good':
+			sitehist.last_changed = datetime.now()
+
+		if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'online':
+			print "changed status from %s to online" % sitehist.status
+			sitehist.status = 'online'
+
+		if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'good':
+			print "changed status from %s to good" % sitehist.status
+			sitehist.status = 'good'
+
+	elif not sitehist.new:
+
+		if sitehist.status != 'offline' and sitehist.status != 'down':
+			sitehist.last_changed = datetime.now()
+
+		if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'offline':
+			print "changed status from %s to offline" % sitehist.status
+			sitehist.status = 'offline'
+
+		if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'down':
+			print "changed status from %s to down" % sitehist.status
+			sitehist.status = 'down'
+
def checkAndRecordState(l_sites, l_plcsites):
count = 0
lb2hn = plccache.plcdb_lb2hn
continue
if sitename in lb2hn:
- pf = HistorySiteRecord.findby_or_create(loginbase=sitename)
-
- pf.last_checked = datetime.now()
- pf.slices_total = d_site['max_slices']
- pf.slices_used = len(d_site['slice_ids'])
- pf.nodes_total = len(lb2hn[sitename])
- pf.nodes_up = getnodesup(lb2hn[sitename])
- pf.new = getnewsite(lb2hn[sitename])
- pf.enabled = d_site['enabled']
-
- if pf.nodes_up >= MINUP:
- if pf.status != "good": pf.last_changed = datetime.now()
- pf.status = "good"
- else:
- if pf.status != "down": pf.last_changed = datetime.now()
- pf.status = "down"
+ sitehist = HistorySiteRecord.findby_or_create(loginbase=sitename,
+ if_new_set={'status' : 'unknown',
+ 'last_changed' : datetime.now(),
+ 'message_id': 0,
+ 'penalty_level' : 0})
+ sitehist.last_checked = datetime.now()
+
+ sitehist.slices_total = d_site['max_slices']
+ sitehist.slices_used = len(d_site['slice_ids'])
+ sitehist.nodes_total = len(lb2hn[sitename])
+ if sitehist.message_id != 0:
+ rtstatus = mailer.getTicketStatus(sitehist.message_id)
+ sitehist.message_status = rtstatus['Status']
+ sitehist.message_queue = rtstatus['Queue']
+ sitehist.message_created = datetime.fromtimestamp(rtstatus['Created'])
+
+ sitehist.nodes_up = getnodesup(lb2hn[sitename])
+ sitehist.new = changed_lessthan(datetime.fromtimestamp(d_site['date_created']), 30) # created < 30 days ago
+ sitehist.enabled = d_site['enabled']
+
+ check_site_state(d_site, sitehist)
count += 1
- print "%d %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used,
- pf.nodes_total, pf.nodes_up, pf.status)
- pf.flush()
+ print "%d %15s slices(%2s) nodes(%2s) notdown(%2s) %s" % (count, sitename, sitehist.slices_used,
+ sitehist.nodes_total, sitehist.nodes_up, sitehist.status)
+ sitehist.flush()
print HistorySiteRecord.query.count()
session.flush()
api = plc.getAuthAPI()
from monitor import database
-from pcucontrol import reboot
import time
from monitor.common import *
diff_time(plcsite['last_updated']))
print ""
- nodes = api.GetNodes(plcsite['node_ids'])
+ nodes = plccache.GetNodesByIds(plcsite['node_ids'])
print " Checked: %s" % time.ctime()
print "\t host | state | obs | created | updated | last_contact "
for plcnode in nodes:
for site in config.args:
config.site = site
- plc_siteinfo = api.GetSites({'login_base': config.site})[0]
+ plc_siteinfo = plccache.GetSitesByName([config.site])
url = "https://www.planet-lab.org/db/sites/index.php?site_pattern="
plc_siteinfo['url'] = url + plc_siteinfo['login_base']
# rerun findbad with the nodes in the given nodes.
import os
file = "findbad.txt"
- nodes = api.GetNodes(plc_siteinfo['node_ids'], ['hostname'])
+ nodes = plccache.GetNodesByIds(plc_siteinfo['node_ids'])
nodes = [ n['hostname'] for n in nodes ]
util.file.setFileFromList(file, nodes)
os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
network = api.GetNodeNetworks(node['nodenetwork_ids'])
print "ok"
except:
- sys.stderr.write(traceback.print_exc())
+ sys.stderr.write(traceback.format_exc())
print "fail"
from monitor.database.zabbixapi.model import *
from monitor.database.dborm import zab_session as session
from monitor.database.dborm import zab_metadata as metadata
+from monitor_xmlrpc import MonitorXmlrpcServer
+
+from monitor import reboot
+from monitor import scanapi
-from pcucontrol import reboot
from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb
from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb
from monitor.wrapper.plccache import plcdb_lb2hn as site_lb2hn
from monitorweb.templates.links import *
-from monitor import scanapi
def query_to_dict(query):
def prep_node_for_display(node):
if node.plc_pcuid:
- pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+ pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
if pcu:
node.pcu_status = pcu.reboot_trial_status
node.pcu_short_status = format_pcu_shortstatus(pcu)
if node.loginbase:
node.site = HistorySiteRecord.by_loginbase(node.loginbase)
+ if node.site is None:
+ # TODO: need a cleaner fix for this...
+ node.site = HistorySiteRecord.by_loginbase("pl")
+
node.history = HistoryNodeRecord.by_hostname(node.hostname)
-class Root(controllers.RootController):
+class Root(controllers.RootController, MonitorXmlrpcServer):
@expose(template="monitorweb.templates.welcome")
def index(self):
import time
prep_node_for_display(node)
nodequery += [node]
- return self.pcuview(None, hostname) # dict(nodequery=nodequery)
+ return self.pcuview(None, None, hostname) # dict(nodequery=nodequery)
@expose(template="monitorweb.templates.nodelist")
- def node(self, filter='BOOT'):
+ def node(self, filter='boot'):
import time
fbquery = FindbadNodeRecord.get_all_latest()
query = []
- filtercount = {'DOWN' : 0, 'BOOT': 0, 'DEBUG' : 0, 'neverboot' : 0, 'pending' : 0, 'all' : 0}
+ filtercount = {'down' : 0, 'boot': 0, 'debug' : 0, 'diagnose' : 0, 'disabled': 0,
+ 'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0}
for node in fbquery:
# NOTE: reformat some fields.
prep_node_for_display(node)
- # NOTE: count filters
- if node.observed_status != 'DOWN':
- filtercount[node.observed_status] += 1
- else:
+ node.history.status
+
+ if node.history.status in ['down', 'offline']:
if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
- filtercount[node.observed_status] += 1
+ filtercount['down'] += 1
else:
filtercount['neverboot'] += 1
+ elif node.history.status in ['good', 'online']:
+ filtercount['boot'] += 1
+ elif node.history.status in ['debug', 'monitordebug']:
+ filtercount['debug'] += 1
+ else:
+ filtercount[node.history.status] += 1
+
+ ## NOTE: count filters
+ #if node.observed_status != 'DOWN':
+ # print node.hostname, node.observed_status
+ # if node.observed_status == 'DEBUG':
+ # if node.plc_node_stats['boot_state'] in ['debug', 'diagnose', 'disabled']:
+ # filtercount[node.plc_node_stats['boot_state']] += 1
+ # else:
+ # filtercount['debug'] += 1
+ #
+ # else:
+ # filtercount[node.observed_status] += 1
+ #else:
+ # if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
+ # filtercount[node.observed_status] += 1
+ # else:
+ # filtercount['neverboot'] += 1
# NOTE: apply filter
- if filter == node.observed_status:
- if filter == "DOWN":
- if node.plc_node_stats['last_contact'] != None:
- query.append(node)
- else:
- query.append(node)
- elif filter == "neverboot":
+ if filter == "neverboot":
if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
query.append(node)
- elif filter == "pending":
- # TODO: look in message logs...
- pass
elif filter == "all":
query.append(node)
+ elif filter == node.history.status:
+ query.append(node)
+ elif filter == 'boot':
+ query.append(node)
+
+ #if filter == node.observed_status:
+ # if filter == "DOWN":
+ # if node.plc_node_stats['last_contact'] != None:
+ # query.append(node)
+ # else:
+ # query.append(node)
+ #elif filter == "neverboot":
+ # if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
+ # query.append(node)
+ #elif filter == "pending":
+ # # TODO: look in message logs...
+ # pass
+ #elif filter == node.plc_node_stats['boot_state']:
+ # query.append(node)
+ #elif filter == "all":
+ # query.append(node)
widget = NodeWidget(template='monitorweb.templates.node_template')
return dict(now=time.ctime(), query=query, fc=filtercount, nodewidget=widget)
def nodeaction_handler(self, tg_exceptions=None):
"""Handle any kind of error."""
+ print "NODEACTION_HANDLER------------------"
if 'pcuid' in request.params:
pcuid = request.params['pcuid']
if 'pcuid' in val:
pcuid = val['pcuid']
elif 'hostname' in val:
- pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).first().plc_pcuid
+ pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).plc_pcuid
else:
pcuid=None
else:
return self.pcuview(None, pcuid, **dict(exceptions=tg_exceptions))
def nodeaction(self, **data):
+ print "NODEACTION------------------"
for item in data.keys():
print "%s %s" % ( item, data[item] )
ret = reboot.reboot_str(str(hostname))
print ret
if ret: raise RuntimeError("Error using PCU: " + str(ret))
- flash("Reboot appeared to work. All at most 5 minutes. Run ExternalScan to check current status.")
+ flash("Reboot appeared to work. Allow at most 5 minutes. Then run ExternalScan to check current status.")
elif action == "ExternalScan":
scanapi.externalprobe(str(hostname))
@expose(template="monitorweb.templates.pcuview")
@exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)")
def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data):
+ print "PCUVIEW------------------"
+ session.clear()
sitequery=[]
pcuquery=[]
nodequery=[]
+ actions=[]
exceptions = None
for key in data:
exceptions = data['exceptions']
if loginbase:
+ actions = ActionRecord.query.filter_by(loginbase=loginbase
+ ).filter(ActionRecord.date_created >= datetime.now() - timedelta(7)
+ ).order_by(ActionRecord.date_created.desc())
+ actions = [ a for a in actions ]
sitequery = [HistorySiteRecord.by_loginbase(loginbase)]
pcus = {}
for plcnode in site_lb2hn[loginbase]:
- for node in FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname']):
+ node = FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname'])
# NOTE: reformat some fields.
prep_node_for_display(node)
nodequery += [node]
if node.plc_pcuid: # not None
- pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+ pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
prep_pcu_for_display(pcu)
pcus[node.plc_pcuid] = pcu
if pcuid and hostname is None:
print "pcuid: %s" % pcuid
- for pcu in FindbadPCURecord.get_latest_by(plc_pcuid=pcuid):
- # NOTE: count filter
- prep_pcu_for_display(pcu)
- pcuquery += [pcu]
+ pcu = FindbadPCURecord.get_latest_by(plc_pcuid=pcuid)
+ # NOTE: count filter
+ prep_pcu_for_display(pcu)
+ pcuquery += [pcu]
if 'site_id' in pcu.plc_pcu_stats:
sitequery = [HistorySiteRecord.by_loginbase(pcu.loginbase)]
if 'nodenames' in pcu.plc_pcu_stats:
for nodename in pcu.plc_pcu_stats['nodenames']:
print "query for %s" % nodename
- q = FindbadNodeRecord.get_latest_by(hostname=nodename)
- node = q.first()
+ node = FindbadNodeRecord.get_latest_by(hostname=nodename)
print "%s" % node.port_status
print "%s" % node.to_dict()
- print "%s" % len(q.all())
if node:
prep_node_for_display(node)
nodequery += [node]
if hostname and pcuid is None:
- for node in FindbadNodeRecord.get_latest_by(hostname=hostname):
+ node = FindbadNodeRecord.get_latest_by(hostname=hostname)
# NOTE: reformat some fields.
prep_node_for_display(node)
sitequery = [node.site]
nodequery += [node]
if node.plc_pcuid: # not None
- pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+ pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
prep_pcu_for_display(pcu)
pcuquery += [pcu]
- return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, exceptions=exceptions)
+ return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, actions=actions, exceptions=exceptions)
+
+ @expose(template="monitorweb.templates.nodehistory")
+ def nodehistory(self, hostname=None):
+ query = []
+ if hostname:
+ fbnode = FindbadNodeRecord.get_by(hostname=hostname)
+ # TODO: add links for earlier history if desired.
+ l = fbnode.versions[-100:]
+ l.reverse()
+ for node in l:
+ prep_node_for_display(node)
+ query.append(node)
+ return dict(query=query, hostname=hostname)
+
+ @expose(template="monitorweb.templates.sitehistory")
+ def sitehistory(self, loginbase=None):
+ query = []
+ if loginbase:
+ fbsite = HistorySiteRecord.get_by(loginbase=loginbase)
+ # TODO: add links for earlier history if desired.
+ l = fbsite.versions[-100:]
+ l.reverse()
+ for site in l:
+ query.append(site)
+ return dict(query=query, loginbase=loginbase)
+
@expose(template="monitorweb.templates.pculist")
def pcu(self, filter='all'):
@expose(template="monitorweb.templates.sitelist")
def site(self, filter='all'):
- filtercount = {'good' : 0, 'down': 0, 'new' : 0, 'pending' : 0, 'all' : 0}
+ filtercount = {'good' : 0, 'down': 0, 'online':0, 'offline' : 0, 'new' : 0, 'pending' : 0, 'all' : 0}
fbquery = HistorySiteRecord.query.all()
query = []
for site in fbquery:
filtercount['new'] += 1
elif not site.enabled:
filtercount['pending'] += 1
- else:
- filtercount[site.status] += 1
+ elif site.status in ['good', 'online']:
+ filtercount['good'] += 1
+ elif site.status in ['down', 'offline']:
+ filtercount['down'] += 1
# apply filter
if filter == "all":
query.append(site)
elif filter == "pending" and not site.enabled:
query.append(site)
- elif filter == site.status:
+ elif filter == 'good' and site.status in ['good', 'online']:
+ query.append(site)
+ elif filter == 'down' and site.status in ['down', 'offline']:
query.append(site)
return dict(query=query, fc=filtercount)
--- /dev/null
+import sys
+import xmlrpclib
+import cherrypy
+import turbogears
+from datetime import datetime, timedelta
+import time
+
+from monitor.database.info.model import *
+from monitor.database.info.interface import *
+
+class MonitorXmlrpcServerMethods:
+ @cherrypy.expose
+ def listMethods(self):
+ mod = MonitorXmlrpcServer()
+ ret_list = []
+ for f in dir(mod):
+ if isinstance(mod.__getattribute__(f),type(mod.__getattribute__('addDowntime'))):
+ ret_list += [f]
+ return ret_list
+
+def convert_datetime(d, keys=None):
+ ret = d.copy()
+ n = datetime.now()
+ if keys == None:
+ keys = d.keys()
+ for k in keys:
+ if type(d[k]) == type(n):
+ ret[k] = time.mktime(d[k].utctimetuple())
+
+ return ret
+
+class MonitorXmlrpcServer(object):
+
+ @cherrypy.expose
+ def listMethods(self):
+ mod = MonitorXmlrpcServer()
+ ret_list = []
+ for f in dir(mod):
+ if isinstance(mod.__getattribute__(f),type(mod.__getattribute__('addDowntime'))):
+ ret_list += [f]
+ return ret_list
+
+ @turbogears.expose()
+ def XMLRPC(self):
+ params, method = xmlrpclib.loads(cherrypy.request.body.read())
+ try:
+ if method == "xmlrpc":
+ # prevent recursion
+ raise AssertionError("method cannot be 'xmlrpc'")
+ # Get the function and make sure it's exposed.
+ method = getattr(self, method, None)
+ # Use the same error message to hide private method names
+ if method is None or not getattr(method, "exposed", False):
+ raise AssertionError("method does not exist")
+
+ session.clear()
+ # Call the method, convert it into a 1-element tuple
+ # as expected by dumps
+ response = method(*params)
+
+ session.flush()
+ response = xmlrpclib.dumps((response,), methodresponse=1, allow_none=1)
+ except xmlrpclib.Fault, fault:
+ # Can't marshal the result
+ response = xmlrpclib.dumps(fault, allow_none=1)
+ except:
+ # Some other error; send back some error info
+ response = xmlrpclib.dumps(
+ xmlrpclib.Fault(1, "%s:%s" % (sys.exc_type, sys.exc_value))
+ )
+
+ cherrypy.response.headers["Content-Type"] = "text/xml"
+ return response
+
+ # User-defined functions must use cherrypy.expose; turbogears.expose
+ # does additional checking of the response type that we don't want.
+ @cherrypy.expose
+ def upAndRunning(self):
+ return True
+
+ # SITES ------------------------------------------------------------
+
+ @cherrypy.expose
+ def getSiteStatus(self, auth):
+ ret_list = []
+ sites = HistorySiteRecord.query.all()
+ for q in sites:
+ d = q.to_dict(exclude=['timestamp', 'version', ])
+ d = convert_datetime(d, ['last_checked', 'last_changed', 'message_created'])
+ ret_list.append(d)
+ return ret_list
+
+ @cherrypy.expose
+ def clearSitePenalty(self, auth, loginbase):
+ sitehist = SiteInterface.get_or_make(loginbase=loginbase)
+ sitehist.clearPenalty()
+ #sitehist.applyPenalty()
+ #sitehist.sendMessage('clear_penalty')
+ sitehist.closeTicket()
+ return True
+
+ @cherrypy.expose
+ def increaseSitePenalty(self, auth, loginbase):
+ sitehist = SiteInterface.get_or_make(loginbase=loginbase)
+ sitehist.increasePenalty()
+ #sitehist.applyPenalty()
+ #sitehist.sendMessage('increase_penalty')
+ return True
+
+ # NODES ------------------------------------------------------------
+
+ @cherrypy.expose
+ def getNodeStatus(self, auth):
+ ret_list = []
+ sites = HistoryNodeRecord.query.all()
+ for q in sites:
+ d = q.to_dict(exclude=['timestamp', 'version', ])
+ d = convert_datetime(d, ['last_checked', 'last_changed',])
+ ret_list.append(d)
+ return ret_list
+
+ @cherrypy.expose
+ def getRecentActions(self, auth, loginbase=None, hostname=None):
+ ret_list = []
+ return ret_list
+
+ # BLACKLIST ------------------------------------------------------------
+
+ @cherrypy.expose
+ def getBlacklist(self, auth):
+ bl = BlacklistRecord.query.all()
+ ret_list = []
+ for q in bl:
+ d = q.to_dict(exclude=['timestamp', 'version', 'id', ])
+ d = convert_datetime(d, ['date_created'])
+ ret_list.append(d)
+
+ return ret_list
+ # datetime.datetime.fromtimestamp(time.mktime(time.strptime(mytime, time_format)))
+
+ @cherrypy.expose
+ def addHostToBlacklist(self, auth, hostname, expires=0):
+ bl = BlacklistRecord.findby_or_create(hostname=hostname, expires=expires)
+ return True
+
+ @cherrypy.expose
+ def addSiteToBlacklist(self, auth, loginbase, expires=0):
+		bl = BlacklistRecord.findby_or_create(loginbase=loginbase, expires=expires)
+ return True
+
+ @cherrypy.expose
+ def deleteFromBlacklist(self, auth, loginbase=None, hostname=None):
+ if (loginbase==None and hostname == None) or (loginbase != None and hostname != None):
+ raise Exception("Please specify a single record to delete: either hostname or loginbase")
+ elif loginbase != None:
+ bl = BlacklistRecord.get_by(loginbase=loginbase)
+ bl.delete()
+ elif hostname != None:
+ bl = BlacklistRecord.get_by(hostname=hostname)
+ bl.delete()
+ return True
\r
#header {\r
height: 40px;\r
- width: 780px;\r
+ /*width: 780px;*/\r
/*background: blue URL('../images/header_inner.png') no-repeat;*/\r
- border-left: 1px solid #aaa;\r
- border-right: 1px solid #aaa;\r
+ /*border-left: 1px solid #aaa;*/\r
+ /*border-right: 1px solid #aaa;*/\r
margin: 0 auto 0 auto;\r
text-align: center;\r
font-size: 180%;\r
#status-error { background-color: indianred; }\r
#status-none { background-color: white; }\r
\r
+#site-new { background-color: gold; }\r
#site-good { background-color : darkseagreen; }\r
+#site-online { background-color : lightgreen; }\r
+#site-offline { background-color: red; }\r
#site-down { background-color: indianred; }\r
\r
+/*#site-0 { background-color : white; }*/\r
+#site-1 { background-color: gold; }\r
+#site-2 { background-color: indianred; }\r
+\r
#node-BOOT { background-color: darkseagreen; }\r
#node-DOWN { background-color: indianred; }\r
#node-DEBUG { background-color: gold; }\r
}\r
\r
#footer {\r
- border: 1px solid #aaa;\r
+ /*border: 1px solid #aaa;*/\r
border-top: 0px none;\r
color: #999;\r
background-color: white;\r
import turbogears as tg
import urllib
+def plc_mail_uri(ticketid):
+ return config.RT_WEB_SERVER + "/Ticket/Display.html?id=" + str(ticketid)
def plc_node_uri(hostname):
return "https://" + config.PLC_WWW_HOSTNAME + "/db/nodes/index.php?nodepattern=" + str(hostname)
def plc_site_uri(loginbase):
--- /dev/null
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<?python
+layout_params['page_title'] = "Monitor Node List"
+from monitor.util import diff_time
+from time import mktime
+from links import *
+?>
+<html py:layout="'sitemenu.kid'"
+ xmlns:py="http://purl.org/kid/ns#"
+ xmlns:mochi="http://www.mochi.org">
+
+ <div py:match="item.tag == 'content'">
+ <h3>Node History : ${hostname}</h3>
+ <table width="100%">
+ <tbody>
+ <tr>
+ <td>
+ <table id="sortable_table" class="datagrid" border="1" width="100%">
+ <thead>
+ <tr>
+ <th mochi:format="int"></th>
+ <!--th>Site</th>
+ <th>pcu</th-->
+ <th>Hostname</th>
+ <th>kernel</th>
+ <th>last_contact</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr py:for="i,node in enumerate(query)" class="${i%2 and 'odd' or 'even'}" >
+ <td></td>
+ <!--td id="site-${node.site.status}">
+ <a href="${link('pcuview', loginbase=node.loginbase)}">${node.loginbase}</a>
+ </td>
+ <td width="20%" nowrap='true' align='center' id="status-${node.pcu_short_status}">
+ <div id="links">
+ <a class="info" py:if="'error' in node.pcu_short_status"
+ href="${link('pcuview', pcuid=node.plc_pcuid)}">
+ Error<span><pre>${node.pcu.reboot_trial_status}</pre></span></a>
+ <a py:if="'error' not in node.pcu_short_status and 'none' not in node.pcu_short_status"
+ href="${link('pcuview', pcuid=node.plc_pcuid)}"
+ py:content="node.pcu_short_status">Reboot Status</a>
+ <span py:if="'none' in node.pcu_short_status"
+ py:content="node.pcu_short_status">Reboot Status</span>
+ </div>
+ </td-->
+ <td id="node-${node.observed_status}" nowrap="true">
+ <a target="_top" href="${link('pcuview', hostname=node.hostname)}" py:content="node.hostname">your.host.org</a></td>
+ <td nowrap="true" py:content="node.kernel"></td>
+ <td py:content="node.date_checked"></td>
+ </tr>
+ </tbody>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+
+</html>
<table width="100%">
<thead>
<tr>
- <th><a href="${link('node', filter='BOOT')}">Production(${fc['BOOT']})</a></th>
- <th><a href="${link('node', filter='DEBUG')}">Debug(${fc['DEBUG']})</a></th>
- <th><a href="${link('node', filter='DOWN')}">Down(${fc['DOWN']})</a></th>
+ <th><a href="${link('node', filter='boot')}">Prod(${fc['boot']})</a></th>
+ <th><a href="${link('node', filter='down')}">Down(${fc['down']})</a></th>
+ <th><a href="${link('node', filter='monitordebug')}">Errors(${fc['debug']})</a></th>
+ <th><a href="${link('node', filter='diagnose')}">Diagnose (${fc['diagnose']})</a></th>
+ <th><a href="${link('node', filter='disabled')}">Disabled (${fc['disabled']})</a></th>
<th><a href="${link('node', filter='neverboot')}">Never Booted(${fc['neverboot']})</a></th>
- <th><a href="${link('node', filter='pending')}">Pending Reply(${fc['pending']})</a></th>
+ <!--th><a href="${link('node', filter='pending')}">Pending Reply(${fc['pending']})</a></th-->
<th><a href="${link('node', filter='all')}">All</a></th>
</tr>
</thead>
<tbody>
<tr>
- <td colspan="5">
+ <td colspan="7">
<table id="sortable_table" class="datagrid" border="1" width="100%">
<thead>
<tr>
<table py:if="len(sitequery) > 0" id="sub-table" border="1" width="100%">
<thead>
<tr>
+ <th>History</th>
<th>Site name</th>
<th>Enabled</th>
<th>Penalty</th>
</thead>
<tbody>
<tr py:for="i,site in enumerate(sitequery)" class="${i%2 and 'odd' or 'even'}" >
+ <td><a href="sitehistory?loginbase=${site.loginbase}">history</a></td>
<td nowrap="true"><a class="ext-link" href="${plc_site_uri(site.loginbase)}">
<span class="icon">${site.loginbase}</span></a>
</td>
<td py:content="site.enabled"></td>
- <td>n/a</td>
+ <td id="site-${site.penalty_level}">${site.penalty_level}</td>
<td>${site.slices_used}/${site.slices_total}</td>
<td>${site.nodes_up} / ${site.nodes_total}</td>
<td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>
</table>
</span> </a>
</div>
- <h3>Nodes</h3>
+ <h3>Nodes</h3>
<p py:if="len(nodequery) == 0">
There are no registered nodes for this site.
</p>
<thead>
<tr>
<th mochi:format="int"></th>
+ <th>History</th>
<th>Hostname</th>
<th>last_contact</th>
- <th>Last_checked</th>
+ <th>last_checked</th>
<th nowrap='true'>Port Status</th>
<th></th>
<th></th>
<tbody>
<tr py:for="i,node in enumerate(nodequery)" class="${i%2 and 'odd' or 'even'}" >
<td></td>
+ <td><a href="nodehistory?hostname=${node.hostname}">history</a></td>
<td id="node-${node.observed_status}" nowrap="true" >
<a class="ext-link" href="${plc_node_uri(node.hostname)}">
<span class="icon">${node.hostname}</span></a>
</div>
<div id="status_block" class="flash"
py:if="value_of('tg_flash', None)" py:content="tg_flash"></div>
- <h4 py:if="len(pcuquery) > 0">Convenience Calls</h4>
- <?python
- if len(pcuquery) == 0: pcu = None
- ?>
- <div py:if="pcu is not None" class="code">
+
+ <h4>Actions Over the Last Week</h4>
+	  <p py:if="len(actions) == 0">
+ There are no recent actions taken for this site.
+ </p>
+ <table py:if="actions and len(actions) > 0" id="sortable_table" class="datagrid" border="1" width="100%">
+ <thead>
+ <tr>
+ <th mochi:format="int"></th>
+ <th>Date</th>
+ <th>Action taken on</th>
+ <th>Action Type</th>
+ <th>Message ID</th>
+ <th>Errors</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr py:for="i,act in enumerate(actions)" class="${i%2 and 'odd' or 'even'}" >
+ <td></td>
+ <td py:content="act.date_created"></td>
+ <td py:if="act.hostname is not None" nowrap="true" >
+ <a class="ext-link" href="${plc_node_uri(act.hostname)}">
+ <span class="icon">${act.hostname}</span></a>
+ </td>
+ <td py:if="act.hostname is None" nowrap="true">
+ <a class="ext-link" href="${plc_site_uri(act.loginbase)}">
+ <span class="icon">${act.loginbase}</span></a>
+ </td>
+ <!--td py : content="diff_time(mktime(node.date_checked.timetuple()))"></td-->
+ <td py:content="act.action_type"></td>
+ <td><a class="ext-link" href="${plc_mail_uri(act.message_id)}">
+ <span py:if="act.message_id != 0" class="icon">${act.message_id}</span></a></td>
+ <td><pre py:content="act.error_string"></pre></td>
+ </tr>
+ </tbody>
+ </table>
+
+ <!-- TODO: figure out how to make this conditional by model rather than port;
+ it is convenient to have links to ilo, drac, amt, etc.
+ regardless of whether the last PCU scan was successful. -->
+ <h4 py:if="len(pcuquery) != 0">Convenience Calls</h4>
+ <div py:if="len(pcuquery) != 0" class="code"> <!-- pcu is not None" class="code"-->
<span py:for="port,state in pcu.ports">
<span class="code" py:if="port == 22 and state == 'open'">
ssh -o PasswordAuthentication=yes -o PubkeyAuthentication=no
${pcu.plc_pcu_stats['username']}@${pcu_name(pcu.plc_pcu_stats)}
+ <br/>
</span>
<span class="code" py:if="port == 23 and state == 'open'">
telnet ${pcu_name(pcu.plc_pcu_stats)}
+ <br/>
</span>
<span class="code" py:if="port == 80 and state == 'open'">
<a href="http://${pcu_name(pcu.plc_pcu_stats)}">http://${pcu_name(pcu.plc_pcu_stats)}</a>
+ <br/>
</span>
<span class="code" py:if="port == 443 and state == 'open'">
<br/>
--- /dev/null
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<?python
+layout_params['page_title'] = "Monitor Site History List"
+from monitor.util import diff_time
+from time import mktime
+from links import *
+?>
+<html py:layout="'sitemenu.kid'"
+ xmlns:py="http://purl.org/kid/ns#"
+ xmlns:mochi="http://www.mochi.org">
+
+ <div py:match="item.tag == 'content'">
+ <h3>Site History : ${loginbase}</h3>
+ <table width="100%">
+ <tbody>
+ <tr>
+ <td>
+ <table id="sortable_table" class="datagrid" border="1" width="100%">
+ <thead>
+ <tr>
+ <th mochi:format="int"></th>
+ <th>Site name</th>
+ <th>Enabled</th>
+ <th>Penalty</th>
+ <th mochi:format="int">Slices/Max</th>
+ <th mochi:format="int">Nodes/Total</th>
+ <th>Date Checked</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr py:for="i,site in enumerate(query)" class="${i%2 and 'odd' or 'even'}" >
+ <td></td>
+ <td nowrap="true">
+ <div class='oneline'>
+ <a class='left' href="${link('pcuview', loginbase=site.loginbase)}">${site.loginbase}</a>
+ <a class='right' href="${plc_site_uri(site.loginbase)}">
+ <img style='display: inline' border='0' src="static/images/extlink.gif" align='right'/></a>
+ </div>
+ </td>
+ <td py:content="site.enabled"></td>
+ <td id="site-${site.penalty_level}">${site.penalty_level}</td>
+ <td>${site.slices_used}/${site.slices_total}</td>
+ <td>${site.nodes_up} / ${site.nodes_total}</td>
+ <td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>
+ <td py:content="site.timestamp"></td>
+ </tr>
+ </tbody>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+
+</html>
</div>
</td>
<td py:content="site.enabled"></td>
- <td>n/a</td>
+ <td id="site-${site.penalty_level}">${site.penalty_level}</td>
<td>${site.slices_used}/${site.slices_total}</td>
<td>${site.nodes_up} / ${site.nodes_total}</td>
<td id="site-${site.status}" py:content="diff_time(mktime(site.last_changed.timetuple()))"></td>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns:py="http://purl.org/kid/ns#">
<head>
- <title>App Name - ${page_title}</title>
+ <title>${page_title}</title>
<link href="static/css/style.css" type="text/css" rel="stylesheet" />
<script type="text/javascript" src="tg_js/MochiKit.js"></script>
<script type="text/javascript" src="static/javascript/sortable_tables.js"></script>
</head>
<body>
- <div id="header">Monitor : ${page_title}</div>
<table valign="top" border="1" bgcolor="white" align="center" width="700px">
+ <tr> <td> <div id="header">${page_title}</div> </td> </tr>
<tr>
<td>
<table id="nps-table" width="100%">
<th><a href="${link('site')}">Sites</a></th>
<th><a href="${link('pcu')}">PCUs</a></th>
<th><a href="${link('node')}">Nodes</a></th>
- <th><a href="${link('action')}">Actions</a></th>
+ <th><a href="">Actions</a></th>
</tr>
</thead>
<tbody>
</table>
</td>
</tr>
+	<tr> <td> <div id="footer">Copyright © 2007-2008 The Trustees of Princeton University</div> </td> </tr>
</table>
- <div id="footer">Copywrite © 2007-2008 The Trustees of Princeton University</div>
</body>
</html>
fb = database.dbLoad("findbad")
lb2hn = database.dbLoad("plcdb_lb2hn")
- pf = database.dbLoad("node_persistflags")
+ # todo: pull from HistoryNodeRecord table instead
+ #pf = database.dbLoad("node_persistflags")
# SETUP header
t = TABLE(border="0", cellspacing="0", cellpadding="0")
url = 'http://www.planet-lab.org/db/nodes/index.php?nodepattern=%s' % host
td = TD(A(host, target='_blank', href=url), bgcolor=color)
r.append(td)
- lc = pf[host].last_changed
+ #lc = pf[host].last_changed
+ lc=-1
td = TD(diff_time(lc))
r.append(td)
t.append(r)
%{zabbix_webdir}
%changelog
+* Fri Apr 03 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-9
+- added new models to db.
+- major updates throughout.
+- better unification. needs an install test.
+
+* Wed Apr 01 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-8
+- removed old pkl database references.
+- added blacklist to db model
+- added fix to IntelAMT remoteControl to start a powered-down node
+- added policy.py
+- added global error count before bailing entirely.
+
+* Fri Mar 27 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-7
+- improved db model
+- updated files that use db model
+- updated web view based on node, site, and pcu states.
+- added local mirror to zabbix Make file.
+
+* Tue Mar 24 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-6
+- added action view to gui
+- added penalty_applied bit to db model.
+
+* Fri Mar 20 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-5
+- tag for updates to 2.0 db model
+
+* Fri Mar 13 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-4
+- splits reboot.py across pcucontrol and monitor modules
+- moves command.py from monitor/util to pcucontrol/util
+
+* Tue Mar 10 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-3
+- add email exceptions
+- other bug fixes.
+
+* Tue Mar 10 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-2
+- getting the pcucontrol and findall.py scripts to work in an integrated
+- fashion.
+
* Fri Feb 27 2009 Stephen Soltesz <soltesz@cs.princeton.edu> - Monitor-2.0-1
- preparing to make a 2.0 branch for monitor.
from monitor import parser as parsermodule
parser = parsermodule.getParser(['cacheset'])
- parser.set_defaults( setupglobal=False, syncsite=True, site=None, setupids=False)
+ parser.set_defaults( setupglobal=False, syncsite=True, site=None, sitelist=None, setupids=False)
parser.add_option("", "--setupids", action="store_true", dest="setupids",
help="Setup global IDs.")
parser.add_option("", "--setupglobal", action="store_true", dest="setupglobal",
help="Do not sync sites.")
parser.add_option("", "--site", dest="site",
help="Sync only given site name.")
+ parser.add_option("", "--sitelist", dest="sitelist",
+ help="Sync only given site names in the list.")
opts = parsermodule.parse_args(parser)
os.system("""echo '' > /usr/share/monitor/nodelist.txt""")