From 0fabfc8dbe8f1f2c0d12397e1bc8c6ed686fb5ed Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Wed, 12 Nov 2008 23:02:56 +0000 Subject: [PATCH] Moved some files around and merged from 1.0 branch: svn merge -r 10858:11011 https://svn.planet-lab.org/svn/Monitor/branches/1.0/ [to trunk] --- Monitor.spec | 10 +- automate-default.sh | 59 +++-- automate.py => automate/automate.py | 0 fetch.py => automate/fetch.py | 0 query.py => automate/query.py | 0 vxargs.py => automate/vxargs.py | 0 bootman.py | 3 + clean_policy.py | 3 +- monitor-default.conf | 3 +- monitor-server.cron | 3 +- monitor/wrapper/mailer.py | 2 +- monitor/wrapper/plc.py | 17 +- nodeinfo.py | 5 +- printbadcsv.py | 8 +- soltesz.py | 368 ---------------------------- unified_model.py | 15 +- www/printbadnodes.py | 94 +++++-- www/printbadpcus.php | 4 +- 18 files changed, 156 insertions(+), 438 deletions(-) rename automate.py => automate/automate.py (100%) rename fetch.py => automate/fetch.py (100%) rename query.py => automate/query.py (100%) rename vxargs.py => automate/vxargs.py (100%) delete mode 100644 soltesz.py diff --git a/Monitor.spec b/Monitor.spec index 04dd860..10360b2 100644 --- a/Monitor.spec +++ b/Monitor.spec @@ -48,7 +48,11 @@ maintenance. %package server Summary: Monitor hooks for the PLC server. Group: Applications/System + Requires: python +Requires: python-sqlalchemy +Requires: python-elixir + Requires: openssh-clients Requires: perl-libwww-perl Requires: perl-IO-Socket-SSL @@ -78,7 +82,7 @@ cd .. rm -rf $RPM_BUILD_ROOT #################### CLIENT install -D -m 755 monitor.init $RPM_BUILD_ROOT/%{_initrddir}/monitor -install -D -m 755 monitor.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/monitor +install -D -m 644 monitor.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/monitor #################### SERVER install -d $RPM_BUILD_ROOT/usr/share/%{name} @@ -96,7 +100,7 @@ echo " * Installing web pages" rsync -a www/ $RPM_BUILD_ROOT/var/www/cgi-bin/monitor/ echo " * Installing cron job for automated polling" -install -D -m 644 %{name}.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/%{name}.cron +install -D -m 644 monitor-server.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/monitor-server.cron echo " * TODO: Setting up Monitor account in local MyPLC" # TODO: @@ -125,7 +129,7 @@ rm -rf $RPM_BUILD_ROOT /usr/share/%{name} /var/lib/%{name} /var/www/cgi-bin/monitor -%{_sysconfdir}/cron.d/%{name}.cron +%{_sysconfdir}/cron.d/monitor-server.cron %{python_sitearch}/threadpool.py %{python_sitearch}/threadpool.pyc %{python_sitearch}/threadpool.pyo diff --git a/automate-default.sh b/automate-default.sh index 8e7be9c..b5508c1 100755 --- a/automate-default.sh +++ b/automate-default.sh @@ -8,8 +8,9 @@ source $INSTALLPATH/monitorconfig.sh cd ${MONITOR_SCRIPT_ROOT} set -e DATE=`date +%Y-%m-%d-%T` -MONITOR_PID="$HOME/monitor/SKIP" +MONITOR_PID="${MONITOR_SCRIPT_ROOT}/SKIP" +echo "#######################################"; echo "Running Monitor at $DATE"; echo "######################################" echo "Performing API test" API=$(./testapi.py) if [ "$API" != "ok" ] ; then @@ -23,7 +24,12 @@ if [ -f $MONITOR_PID ] ; then echo "KILLING Monitor" PID=`cat $MONITOR_PID` rm -f $MONITOR_PID - ${MONITOR_SCRIPT_ROOT}/kill.cmd.sh $PID + if [ -z $PID ] ; then + ${MONITOR_SCRIPT_ROOT}/kill.cmd.sh $PID + echo "done." + else + echo "No PID to be killed." + fi else # skipping monitor echo "SKIPPING Monitor" @@ -35,13 +41,14 @@ echo $$ > $MONITOR_PID # SETUP act_all database if it's not there. if [ ! -f ${MONITOR_SCRIPT_ROOT}/actallsetup.flag ]; then if ! python -c 'import database; database.dbLoad("act_all")' 2>/dev/null ; then - python -c 'import database; database.dbDump("act_all", {})' 2>/dev/null ; then touch ${MONITOR_SCRIPT_ROOT}/actallsetup.flag fi fi +set +e AGENT=`ps ax | grep ssh-agent | grep -v grep` +set -e if [ -z "$AGENT" ] ; then echo "starting ssh agent" # if no agent is running, set it up. @@ -60,7 +67,7 @@ echo "Performing Findbad Nodes" rm -f ${MONITOR_DATA_ROOT}/production.findbad2.pkl ${MONITOR_SCRIPT_ROOT}/findbad.py --increment --cachenodes --debug=0 --dbname="findbad2" $DATE || : cp ${MONITOR_DATA_ROOT}/production.findbad2.pkl ${MONITOR_DATA_ROOT}/production.findbad.pkl -ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs kill || : +ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || : echo "Performing Findbad PCUs" ######################### @@ -69,13 +76,13 @@ rm -f ${MONITOR_DATA_ROOT}/production.findbadpcus2.pkl ${MONITOR_SCRIPT_ROOT}/findbadpcu.py --increment --refresh --debug=0 --dbname=findbadpcus2 $DATE || : cp ${MONITOR_DATA_ROOT}/production.findbadpcus2.pkl ${MONITOR_DATA_ROOT}/production.findbadpcus.pkl # clean up stray 'locfg' processes that hang around inappropriately... -ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs kill || : +ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || : -echo "Generating web data" +#echo "Generating web data" # badcsv.txt -${MONITOR_SCRIPT_ROOT}/printbadcsv.py | grep -v loading | tr -d ' ' > badcsv.txt -cp badcsv.txt /plc/data/var/www/html/monitor/ -${MONITOR_SCRIPT_ROOT}/showlatlon.py | head -9 | awk 'BEGIN {print ""} { print ""} END{print "
", $0, "
"}' | sed -e 's\|\\g' > /plc/data/var/www/html/monitor/regions.html +#${MONITOR_SCRIPT_ROOT}/printbadcsv.py | grep -v loading | tr -d ' ' > badcsv.txt +#cp badcsv.txt /plc/data/var/www/html/monitor/ +#${MONITOR_SCRIPT_ROOT}/showlatlon.py | head -9 | awk 'BEGIN {print ""} { print ""} END{print "
", $0, "
"}' | sed -e 's\|\\g' > /plc/data/var/www/html/monitor/regions.html echo "Performing uptime changes for sites, nodes, and pcus" ######################## @@ -88,31 +95,33 @@ echo "Converting pkl files to phpserial" ######################### # 4. convert pkl to php serialize format. ${MONITOR_SCRIPT_ROOT}/pkl2php.py -i findbadpcus2 -o findbadpcus -${MONITOR_SCRIPT_ROOT}/pkl2php.py -i act_all -o act_all -${MONITOR_SCRIPT_ROOT}/pkl2php.py -i plcdb_hn2lb -o plcdb_hn2lb +for f in act_all plcdb_hn2lb ; do + if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ]; then + ${MONITOR_SCRIPT_ROOT}/pkl2php.py -i $f -o $f + else + echo "Warning: ${MONITOR_DATA_ROOT}/production.$f.pkl does not exist." + fi +done ${MONITOR_SCRIPT_ROOT}/pkl2php.py -i findbad -o findbadnodes -${MONITOR_SCRIPT_ROOT}/pkl2php.py -i ad_dbTickets -o ad_dbTickets -${MONITOR_SCRIPT_ROOT}/pkl2php.py -i idTickets -o idTickets +#${MONITOR_SCRIPT_ROOT}/pkl2php.py -i ad_dbTickets -o ad_dbTickets +#${MONITOR_SCRIPT_ROOT}/pkl2php.py -i idTickets -o idTickets echo "Archiving pkl files" ######################### # Archive pkl files. -for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do - cp ${MONITOR_DATA_ROOT}/production.$f.pkl ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.production.$f.pkl +for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do + if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ] ; then + cp ${MONITOR_DATA_ROOT}/production.$f.pkl ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.production.$f.pkl + else + echo "Warning: It failed to archive ${MONITOR_DATA_ROOT}/production.$f.pkl" + fi done echo "Running grouprins on all dbg nodes" ############################ # 5. Check if there are any nodes in dbg state. Clean up afterward. -${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 \ - --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' \ - --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.9.planetlab' \ - --reboot || : -${MONITOR_SCRIPT_ROOT}/findbad.py --increment --cachenodes --debug=0 --dbname="findbad" --nodeselect 'state=DEBUG&&boot_state=dbg||state=DEBUG&&boot_state=boot' || : - -echo "Collecting RT database dump" -########################## -# 6. cache the RT db locally. -python ${MONITOR_SCRIPT_ROOT}/rt.py +${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DOWN&&boot_state=(boot|rins|dbg|diag)' --stopselect "state=BOOT" || : +${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' --stopselect 'state=BOOT' || : +cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log rm -f $MONITOR_PID diff --git a/automate.py b/automate/automate.py similarity index 100% rename from automate.py rename to automate/automate.py diff --git a/fetch.py b/automate/fetch.py similarity index 100% rename from fetch.py rename to automate/fetch.py diff --git a/query.py b/automate/query.py similarity index 100% rename from query.py rename to automate/query.py diff --git a/vxargs.py b/automate/vxargs.py similarity index 100% rename from vxargs.py rename to automate/vxargs.py diff --git a/bootman.py b/bootman.py index e8dc7b8..e3125d1 100755 --- a/bootman.py +++ b/bootman.py @@ -582,6 +582,9 @@ def reboot(hostname, config=None, forced_action=None): "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", + # actual solution appears to involve removing the bad files, and + # continually trying to boot the node. + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", ]: sequences.update({n : "restart_bootmanager_rins"}) diff --git a/clean_policy.py b/clean_policy.py index 516a8de..3ae3811 100644 --- a/clean_policy.py +++ b/clean_policy.py @@ -2,7 +2,6 @@ import config import database import time import mailer -from unified_model import cmpCategoryVal import sys import emailTxt import string @@ -201,7 +200,7 @@ class MonitorMergeDiagnoseSendEscellate: del diag['CloseRT'] else: - print "NOT sending email : %s %s" % (config.mail, record.data['rt']) + print "NOT sending email : %s" % config.mail return diff --git a/monitor-default.conf b/monitor-default.conf index 9d02b5e..407ad75 100644 --- a/monitor-default.conf +++ b/monitor-default.conf @@ -10,6 +10,7 @@ RT_WEB_TOOLS_PATH= RT_WEB_USER= RT_WEB_PASSWORD= RT_WEB_DEBUG=0 +RT_QUEUE= # PLC admin account API_SERVER=https://boot.planet-lab.org/PLCAPI/ @@ -20,7 +21,7 @@ API_AUTH_PASSWORD= MONITOR_HOSTNAME=monitor.planet-lab.org MONITOR_SCRIPT_ROOT=/usr/share/monitor-server MONITOR_DATA_ROOT=/var/lib/monitor-server -MONITOR_ARCHIVE_ROOT=/usr/share/monitor-server/archive-pdb +MONITOR_ARCHIVE_ROOT=/var/lib/monitor-server/archive-pdb email= diff --git a/monitor-server.cron b/monitor-server.cron index dc5302e..1433b79 100644 --- a/monitor-server.cron +++ b/monitor-server.cron @@ -1,5 +1,6 @@ # Runs every three hours to poll all nodes and PCUs, as well as take some # automated actions for debug nodes. -01 6,9,12,15,18,21 * * * root /usr/share/monitor-server/automate.sh +01 * * * * root /usr/share/monitor-server/automate.sh 2>&1 > /usr/share/monitor-server/monitor.log + diff --git a/monitor/wrapper/mailer.py b/monitor/wrapper/mailer.py index 142ba04..ace9971 100755 --- a/monitor/wrapper/mailer.py +++ b/monitor/wrapper/mailer.py @@ -218,7 +218,7 @@ def emailViaRT_NoTicket(subject, text, to): input_text = "Subject: %s\n" input_text += "Requestor: %s\n"% FROM input_text += "id: ticket/new\n" - input_text += "Queue: Monitor\n" + input_text += "Queue: %s\n" % config.RT_QUEUE for recipient in to: input_text += "AdminCc: %s\n" % recipient input_text += "Text: %s" diff --git a/monitor/wrapper/plc.py b/monitor/wrapper/plc.py index 63f17a4..76db7d8 100644 --- a/monitor/wrapper/plc.py +++ b/monitor/wrapper/plc.py @@ -122,7 +122,7 @@ def getTechEmails(loginbase): # get site details. s = api.GetSites(loginbase)[0] # get people at site - p = api.GetPersons(s['person_ids'])[0] + p = api.GetPersons(s['person_ids']) # pull out those with the right role. emails = [ person['email'] for person in filter(lambda x: 'tech' in x['roles'], p) ] return emails @@ -135,7 +135,7 @@ def getPIEmails(loginbase): # get site details. s = api.GetSites(loginbase)[0] # get people at site - p = api.GetPersons(s['person_ids'])[0] + p = api.GetPersons(s['person_ids']) # pull out those with the right role. emails = [ person['email'] for person in filter(lambda x: 'pi' in x['roles'], p) ] return emails @@ -174,19 +174,20 @@ def nodesDbg(): Returns loginbase for given nodename ''' def siteId(nodename): - api = xmlrpclib.Server(auth.server, verbose=False) - anon = {'AuthMethod': "anonymous"} - site_id = api.GetNodes (anon, {"hostname": nodename}, ['site_id']) + api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) + site_id = api.GetNodes (auth.auth, {"hostname": nodename}, ['site_id']) if len(site_id) == 1: - loginbase = api.GetSites (anon, site_id[0], ["login_base"]) + loginbase = api.GetSites (auth.auth, site_id[0], ["login_base"]) return loginbase[0]['login_base'] + else: + print "Not nodes returned!!!!" ''' Returns list of slices for a site. ''' def slices(loginbase): siteslices = [] - api = xmlrpclib.Server(auth.server, verbose=False) + api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) sliceids = api.GetSites (auth.auth, {"login_base" : loginbase}, ["slice_ids"])[0]['slice_ids'] for slice in api.GetSlices(auth.auth, {"slice_id" : sliceids}, ["name"]): siteslices.append(slice['name']) @@ -196,7 +197,7 @@ def slices(loginbase): Returns dict of PCU info of a given node. ''' def getpcu(nodename): - api = xmlrpclib.Server(auth.server, verbose=False) + api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True) anon = {'AuthMethod': "anonymous"} nodeinfo = api.GetNodes(auth.auth, {"hostname": nodename}, ["pcu_ids", "ports"])[0] if nodeinfo['pcu_ids']: diff --git a/nodeinfo.py b/nodeinfo.py index 4a946c5..9968b4b 100755 --- a/nodeinfo.py +++ b/nodeinfo.py @@ -45,7 +45,10 @@ def plc_print_nodeinfo(plcnode): def fb_print_nodeinfo(fbnode): pf = PersistFlags(fbnode['hostname'], 1, db='node_persistflags') - fbnode['last_change'] = diff_time(pf.last_changed) + try: + fbnode['last_change'] = diff_time(pf.last_changed) + except: + fbnode['last_change'] = diff_time(time.time()) print " Checked: ", if 'checked' in fbnode: print "%11.11s " % diff_time(fbnode['checked']) diff --git a/printbadcsv.py b/printbadcsv.py index cae8480..2f5036d 100755 --- a/printbadcsv.py +++ b/printbadcsv.py @@ -115,13 +115,13 @@ def main(): print str keys = categories.keys() - for cat in ['BOOT-ALPHA', 'BOOT-PROD', 'BOOT-OLDBOOTCD', 'DEBUG-ALPHA', - 'DEBUG-PROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']: + for cat in ['BOOT-PROD', 'BOOT-OLDPROD', 'BOOT-OLDBOOTCD', 'DEBUG-PROD', + 'DEBUG-OLDPROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']: if cat not in keys: categories[cat] = 0 keys = categories.keys() - for cat in ['BOOT-ALPHA', 'BOOT-PROD', 'BOOT-OLDBOOTCD', 'DEBUG-ALPHA', - 'DEBUG-PROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']: + for cat in ['BOOT-PROD', 'BOOT-OLDPROD', 'BOOT-OLDBOOTCD', 'DEBUG-PROD', + 'DEBUG-OLDPROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']: if cat in keys: print "%d," % categories[cat], print "" diff --git a/soltesz.py b/soltesz.py deleted file mode 100644 index 6fc714f..0000000 --- a/soltesz.py +++ /dev/null @@ -1,368 +0,0 @@ -import os -import sys -import pickle -noserial=False -try: - from PHPSerialize import * - from PHPUnserialize import * -except: - #print >>sys.stderr, "PHPSerial db type not allowed." - noserial=True - -import inspect -import shutil -import config - -import config - -DEBUG= 0 -PICKLE_PATH=config.MONITOR_DATA_ROOT - -class ExceptionTimeout(Exception): pass - -def dbLoad(name, type=None): - return SPickle().load(name, type) - -def dbExists(name, type=None): - #if self.config.debug: - # name = "debug.%s" % name - return SPickle().exists(name, type) - -def dbDump(name, obj=None, type=None): - # depth of the dump is 2 now, since we're redirecting to '.dump' - return SPickle().dump(name, obj, type, 2) - -def if_cached_else_refresh(cond, refresh, name, function, type=None): - s = SPickle() - if refresh: - if not config.debug and s.exists("production.%s" % name, type): - s.remove("production.%s" % name, type) - if config.debug and s.exists("debug.%s" % name, type): - s.remove("debug.%s" % name, type) - - return if_cached_else(cond, name, function, type) - -def if_cached_else(cond, name, function, type=None): - s = SPickle() - if (cond and s.exists("production.%s" % name, type)) or \ - (cond and config.debug and s.exists("debug.%s" % name, type)): - o = s.load(name, type) - else: - o = function() - if cond: - s.dump(name, o, type) # cache the object using 'name' - o = s.load(name, type) - # TODO: what if 'o' hasn't been converted... - return o - -class SPickle: - def __init__(self, path=PICKLE_PATH): - self.path = path - - def if_cached_else(self, cond, name, function, type=None): - if cond and self.exists("production.%s" % name, type): - o = self.load(name, type) - else: - o = function() - if cond: - self.dump(name, o, type) # cache the object using 'name' - return o - - def __file(self, name, type=None): - if type == None: - return "%s/%s.pkl" % (self.path, name) - else: - if noserial: - raise Exception("No PHPSerializer module available") - - return "%s/%s.phpserial" % (self.path, name) - - def exists(self, name, type=None): - return os.path.exists(self.__file(name, type)) - - def remove(self, name, type=None): - return os.remove(self.__file(name, type)) - - def load(self, name, type=None): - """ - In debug mode, we should fail if neither file exists. - if the debug file exists, reset name - elif the original file exists, make a copy, reset name - else neither exist, raise an error - Otherwise, it's normal mode, if the file doesn't exist, raise error - Load the file - """ - - if config.debug: - if self.exists("debug.%s" % name, type): - name = "debug.%s" % name - elif self.exists("production.%s" % name, type): - debugname = "debug.%s" % name - if not self.exists(debugname, type): - name = "production.%s" % name - shutil.copyfile(self.__file(name, type), self.__file(debugname, type)) - name = debugname - else: # neither exist - raise Exception, "No such pickle based on %s" % self.__file("debug.%s" % name, type) - else: - if self.exists("production.%s" % name, type): - name = "production.%s" % name - elif self.exists(name, type): - name = name - else: - raise Exception, "No such file %s" % name - - - #print "loading %s" % self.__file(name, type) - f = open(self.__file(name, type), 'r') - if type == None: - o = pickle.load(f) - else: - if noserial: - raise Exception("No PHPSerializer module available") - s = PHPUnserialize() - o = s.unserialize(f.read()) - f.close() - return o - - - # use the environment to extract the data associated with the local - # variable 'name' - def dump(self, name, obj=None, type=None, depth=1): - if obj == None: - o = inspect.getouterframes(inspect.currentframe()) - up1 = o[depth][0] # get the frame one prior to (up from) this frame - argvals = inspect.getargvalues(up1) - # TODO: check that 'name' is a local variable; otherwise this would fail. - obj = argvals[3][name] # extract the local variable name 'name' - if not os.path.isdir("%s/" % self.path): - os.mkdir("%s" % self.path) - if config.debug: - name = "debug.%s" % name - else: - name = "production.%s" % name - f = open(self.__file(name, type), 'w') - if type == None: - pickle.dump(obj, f) - else: - if noserial: - raise Exception("No PHPSerializer module available") - s = PHPSerialize() - f.write(s.serialize(obj)) - f.close() - return - - -COMMAND_TIMEOUT = 60 -ssh_options = { 'StrictHostKeyChecking':'no', - 'BatchMode':'yes', - 'PasswordAuthentication':'no', - 'ConnectTimeout':'%s' % COMMAND_TIMEOUT} -from select import select -import subprocess -import signal - -class Sopen(subprocess.Popen): - def kill(self, signal = signal.SIGTERM): - os.kill(self.pid, signal) - -def read_t(stream, count, timeout=COMMAND_TIMEOUT*2): - lin, lout, lerr = select([stream], [], [], timeout) - if len(lin) == 0: - raise ExceptionTimeout("TIMEOUT Running: %s" % cmd) - - return stream.read(count) - -class CMD: - def __init__(self): - pass - - def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2): - - #print "CMD.run_noexcept(%s)" % cmd - try: - return CMD.run(self,cmd,timeout) - except ExceptionTimeout: - import traceback; print traceback.print_exc() - return ("", "SCRIPTTIMEOUT") - - def system(self, cmd, timeout=COMMAND_TIMEOUT*2): - (o,e) = self.run(cmd, timeout) - self.output = o - self.error = e - if self.s.returncode is None: - self.s.wait() - return self.s.returncode - - def run(self, cmd, timeout=COMMAND_TIMEOUT*2): - - #print "CMD.run(%s)" % cmd - s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) - self.s = s - (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr) - #print "calling select(%s)" % timeout - lout, lin, lerr = select([f_out], [], [f_err], timeout) - #print "TIMEOUT!!!!!!!!!!!!!!!!!!!" - if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0: - # Reached a timeout! Nuke process so it does not hang. - #print "KILLING" - s.kill(signal.SIGKILL) - raise ExceptionTimeout("TIMEOUT Running: %s" % cmd) - else: - #print "RETURNING" - #print len(lin), len(lout), len(lerr) - pass - - o_value = "" - e_value = "" - - #print "reading from f_out" - if len(lout) > 0: o_value = f_out.read() - #print "reading from f_err" - if len(lerr) > 0: e_value = f_err.read() - - #print "striping output" - o_value = o_value.strip() - e_value = e_value.strip() - - #print "OUTPUT", o_value, e_value - - #print "closing files" - f_out.close() - f_in.close() - f_err.close() - try: - #print "s.kill()" - s.kill() - #print "after s.kill()" - except OSError: - # no such process, due to it already exiting... - pass - - #print o_value, e_value - return (o_value, e_value) - - def runargs(self, args, timeout=COMMAND_TIMEOUT*2): - - #print "CMD.run(%s)" % " ".join(args) - s = Sopen(args, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) - self.s = s - (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr) - lout, lin, lerr = select([f_out], [], [f_err], timeout) - if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0: - # Reached a timeout! Nuke process so it does not hang. - s.kill(signal.SIGKILL) - raise ExceptionTimeout("TIMEOUT Running: %s" % cmd) - o_value = f_out.read() - e_value = "" - if o_value == "": # An error has occured - e_value = f_err.read() - - o_value = o_value.strip() - e_value = e_value.strip() - - f_out.close() - f_in.close() - f_err.close() - try: - s.kill() - except OSError: - # no such process, due to it already exiting... - pass - - return (o_value, e_value) - - -class SSH(CMD): - def __init__(self, user, host, port=22, options = ssh_options): - self.options = options - self.user = user - self.host = host - self.port = port - return - - def __options_to_str(self): - options = "" - for o,v in self.options.iteritems(): - options = options + "-o %s=%s " % (o,v) - return options - - def run(self, cmd, timeout=COMMAND_TIMEOUT*2): - cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(), - self.user, self.host, cmd) - #print "SSH.run(%s)" % cmd - return CMD.run(self, cmd, timeout) - - def get_file(self, rmt_filename, local_filename=None): - if local_filename == None: - local_filename = "./" - cmd = "scp -P %s -B %s %s@%s:%s %s" % (self.port, self.__options_to_str(), - self.user, self.host, - rmt_filename, local_filename) - # output : - # errors will be on stderr, - # success will have a blank stderr... - return CMD.run_noexcept(self, cmd) - - def run_noexcept(self, cmd): - cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(), - self.user, self.host, cmd) - #print "SSH.run_noexcept(%s)" % cmd - return CMD.run_noexcept(self, cmd) - - def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2): - cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), - self.user, self.host, cmd) - #print "SSH.run_noexcept2(%s)" % cmd - r = CMD.run_noexcept(self, cmd, timeout) - - # XXX: this may be resulting in deadlocks... not sure. - #if self.s.returncode is None: - # #self.s.kill() - # self.s.kill(signal.SIGKILL) - # self.s.wait() - # self.ret = self.s.returncode - self.ret = -1 - - return r - - def system2(self, cmd, timeout=COMMAND_TIMEOUT*2): - cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), - self.user, self.host, cmd) - #print "SSH.system2(%s)" % cmd - return CMD.system(self, cmd, timeout) - - def runE(self, cmd): - cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(), - self.user, self.host, cmd) - if ( DEBUG == 1 ): - print cmd, - (f_in, f_out, f_err) = os.popen3(cmd) - - value = f_out.read() - if value == "": # An error has occured - value = f_err.read() - value = value.strip() - - if ( DEBUG == 1 ): - print " == %s" % value - f_out.close() - f_in.close() - f_err.close() - return value.strip() - -import time -class MyTimer: - def __init__(self): - self.start = time.time() - - def end(self): - self.end = time.time() - t = self.end-self.start - return t - - def diff(self): - self.end = time.time() - t = self.end-self.start - self.start = self.end - return t diff --git a/unified_model.py b/unified_model.py index 805dd0e..891bab0 100755 --- a/unified_model.py +++ b/unified_model.py @@ -36,6 +36,10 @@ def cmpValMap(v1, v2, map): raise Exception("No index %s or %s in map" % (v1, v2)) def cmpCategoryVal(v1, v2): + # Terrible hack to manage migration to no more 'ALPHA' states. + if v1 == 'ALPHA': v1 = "PROD" + if v2 == 'ALPHA': v2 = "PROD" + #map = array_to_priority_map([ None, 'PROD', 'ALPHA', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ]) map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ]) return cmpValMap(v1,v2,map) @@ -569,14 +573,17 @@ class Record(object): if ADMIN & roles: contacts += [config.email] if TECH & roles: - contacts += [TECHEMAIL % self.loginbase] + #contacts += [TECHEMAIL % self.loginbase] + contacts += plc.getTechEmails(self.loginbase) if PI & roles: - contacts += [PIEMAIL % self.loginbase] + #contacts += [PIEMAIL % self.loginbase] + contacts += plc.getSliceUserEmails(self.loginbase) if USER & roles: + contacts += plc.getSliceUserEmails(self.loginbase) slices = plc.slices(self.loginbase) if len(slices) >= 1: - for slice in slices: - contacts += [SLICEMAIL % slice] + #for slice in slices: + # contacts += [SLICEMAIL % slice] print "SLIC: %20s : %d slices" % (self.loginbase, len(slices)) else: print "SLIC: %20s : 0 slices" % self.loginbase diff --git a/www/printbadnodes.py b/www/printbadnodes.py index 9b5692c..47ef62e 100755 --- a/www/printbadnodes.py +++ b/www/printbadnodes.py @@ -3,6 +3,7 @@ from monitor import database from monitor import config import string import sys +import time categories = {} ssherror = False @@ -62,11 +63,11 @@ def cmpState(l1, l2): return cmpMap(l1,l2,'state', map) def cmpCategoryVal(v1, v2): - map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ]) + map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDPROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ]) return cmpValMap(v1,v2,map) def cmpCategory(l1, l2): - map = array_to_priority_map([ 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'ERROR', ]) + map = array_to_priority_map([ 'ALPHA', 'PROD', 'OLDPROD', 'OLDBOOTCD', 'UNKNOWN', 'ERROR', ]) return cmpMap(l1,l2,'category', map) def cmpPCU(l1, l2): @@ -225,7 +226,7 @@ def fields_to_html(fields, vals): #print "state: %s
" % pcu_state(vals['plcnode']['pcu_ids'][0]) #print "color: %s
" % pcu_colorMap[pcu_state(vals['plcnode']['pcu_ids'][0])] bgcolor = "bgcolor='%s'" % pcu_colorMap[pcu_state(vals['plcnode']['pcu_ids'][0])] - url = "PCU" % vals['plcnode']['pcu_ids'][0] + url = "PCU" % vals['plcnode']['pcu_ids'][0] r_str += "%s" % (bgcolor, url) else: r_str += "%s" % (bgcolor, f) @@ -234,10 +235,56 @@ def fields_to_html(fields, vals): return r_str +def my_diff_time(timestamp): + now = time.time() + if timestamp == None: + return "not yet contacted" + diff = now - timestamp + # return the number of seconds as a difference from current time. + t_str = "" + if diff < 60: # sec in min. + t = diff + t_str = "%s sec ago" % t + elif diff < 60*60: # sec in hour + t = diff // (60) + t_str = "%s min ago" % int(t) + elif diff < 60*60*24: # sec in day + t = diff // (60*60) + t_str = "%s hours ago" % int(t) + elif diff < 60*60*24*7: # sec in week + t = diff // (60*60*24) + t_str = "%s days ago" % int(t) + elif diff < 60*60*24*30: # approx sec in month + t = diff // (60*60*24*7) + t_str = "%s weeks ago" % int(t) + elif diff > 60*60*24*30 and diff < 60*60*24*30*2: # approx sec in month + month = int( diff // (60*60*24*30) ) + weeks = (diff - (month * (60*60*24*30))) // (60*60*24*7) + if weeks == 0: + t_str = "%s month ago" % int(month) + elif weeks == 4: + t_str = "2 months ago" + else: + t_str = "%s month and %s weeks ago" % ( int(month) , int(weeks) ) + elif diff >= 60*60*24*30*2: + month = diff // (60*60*24*30) + t_str = "%s months ago" % int(month) + return t_str def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter): global fb + import os + import datetime + if nodeonlyfilter == None: + print "\n" + + try: + mtime = os.stat("/var/lib/monitor-server/production.findbad.pkl")[-2] + print "Last Updated: %s GMT" % datetime.datetime.fromtimestamp(mtime) + except: + pass + db = database.dbLoad(config.dbname) fb = database.dbLoad("findbadpcus") @@ -251,6 +298,7 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter): 'state' : 5, 'kernel' : 10.65, 'comonstats' : 5, + 'last_contact' : 10.65, 'plcsite' : 12, 'bootcd' : 10.65} ## create format string based on config.fields @@ -259,6 +307,7 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter): format_fields = [] for f in config.fields.split(','): fields[f] = "%%(%s)s" % f + #print f #if f in maxFieldLengths: # fields[f] = "%%(%s)%ds" % (f, maxFieldLengths[f]) #else: @@ -356,16 +405,19 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter): if comonfilter != None: cmf = re.compile(comonfilter) else: cmf = None + + output_str = "" #l_loginbase = bysite.keys() #l_loginbase.sort() if nodeonlyfilter == None: - print "" + output_str += "
" prev_sitestring = "" for row in d2: vals = row + #added by guto about last contact information if (catfilter != None and cf.match(vals['category']) == None): continue @@ -376,16 +428,16 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter): continue if nodeonlyfilter != None: - print vals['nodename'] + output_str += vals['nodename'] continue site_string = row['site_string'] if site_string != prev_sitestring: - print "" + output_str += "" else: - print "" + output_str += "" prev_sitestring = site_string @@ -431,6 +483,12 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter): url = "%s" % (config.MONITOR_HOSTNAME, vals['nodename'], vals['nodename']) vals['nodename'] = url + if 'plcnode' in vals: + if vals['plcnode']['status'] == "GN_FAILED": + vals['last_contact'] = "UNKNOWN" + else: + vals['last_contact'] = my_diff_time(vals['plcnode']['last_contact']) + try: str_fields = [] count = 0 @@ -441,15 +499,15 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter): print >>sys.stderr, vals s = fields_to_html(str_fields, vals) - print s + output_str += s - print "\n" + output_str += "\n" if nodeonlyfilter == None: - print "
" - print site_string - print "
" + output_str += site_string + output_str += "
 
 
" - print "" + output_str += "
" keys = categories.keys() keys.sort() + print "" for cat in keys: print "" print "" % cat @@ -458,6 +516,10 @@ def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter): if nodeonlyfilter == None: print "
Total %s
" + print output_str + if nodeonlyfilter == None: + print "\n" + if __name__ == '__main__': @@ -496,7 +558,7 @@ if __name__ == '__main__': config.cmpdays=False config.comon="sshstatus" - config.fields="nodename,ping,ssh,pcu,category,state,comonstats,kernel,bootcd" + config.fields="nodename,ping,ssh,pcu,category,state,last_contact,kernel,bootcd" config.dbname="findbad" config.cmpping=False config.cmpdns=False @@ -505,11 +567,7 @@ if __name__ == '__main__': config.cmpcategory=False print "Content-Type: text/html\r\n" - if mynodeonly == None: - print "\n" if len(sys.argv) > 1: if sys.argv[1] == "ssherror": ssherror = True main(myfilter, mycategory, mystate, mycomon,mynodeonly) - if mynodeonly == None: - print "\n" diff --git a/www/printbadpcus.php b/www/printbadpcus.php index 7db3e8e..500be1f 100644 --- a/www/printbadpcus.php +++ b/www/printbadpcus.php @@ -2,12 +2,12 @@ function plc_site_link($site_name) { - return "https://www.planet-lab.org/db/sites/index.php?site_pattern=" . $site_name; + return "https://" . MONITOR_HOSTNAME . "/db/sites/index.php?site_pattern=" . $site_name; } function pcu_link($pcu) { - return "https://www.planet-lab.org/db/sites/pcu.php?id=" . $pcu['pcu_id']; + return "https://" . MONITOR_HOSTNAME . "/db/sites/pcu.php?id=" . $pcu['pcu_id']; } function pcu_site($pcu) -- 2.43.0