svn merge -r 10858:11011 https://svn.planet-lab.org/svn/Monitor/branches/1.0/ [to trunk]
%package server
Summary: Monitor hooks for the PLC server.
Group: Applications/System
+
Requires: python
+Requires: python-sqlalchemy
+Requires: python-elixir
+
Requires: openssh-clients
Requires: perl-libwww-perl
Requires: perl-IO-Socket-SSL
rm -rf $RPM_BUILD_ROOT
#################### CLIENT
install -D -m 755 monitor.init $RPM_BUILD_ROOT/%{_initrddir}/monitor
-install -D -m 755 monitor.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/monitor
+install -D -m 644 monitor.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/monitor
#################### SERVER
install -d $RPM_BUILD_ROOT/usr/share/%{name}
rsync -a www/ $RPM_BUILD_ROOT/var/www/cgi-bin/monitor/
echo " * Installing cron job for automated polling"
-install -D -m 644 %{name}.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/%{name}.cron
+install -D -m 644 monitor-server.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/monitor-server.cron
echo " * TODO: Setting up Monitor account in local MyPLC"
# TODO:
/usr/share/%{name}
/var/lib/%{name}
/var/www/cgi-bin/monitor
-%{_sysconfdir}/cron.d/%{name}.cron
+%{_sysconfdir}/cron.d/monitor-server.cron
%{python_sitearch}/threadpool.py
%{python_sitearch}/threadpool.pyc
%{python_sitearch}/threadpool.pyo
cd ${MONITOR_SCRIPT_ROOT}
set -e
DATE=`date +%Y-%m-%d-%T`
-MONITOR_PID="$HOME/monitor/SKIP"
+MONITOR_PID="${MONITOR_SCRIPT_ROOT}/SKIP"
+echo "#######################################"; echo "Running Monitor at $DATE"; echo "#######################################"
echo "Performing API test"
API=$(./testapi.py)
if [ "$API" != "ok" ] ; then
echo "KILLING Monitor"
PID=`cat $MONITOR_PID`
rm -f $MONITOR_PID
- ${MONITOR_SCRIPT_ROOT}/kill.cmd.sh $PID
+ if [ -n "$PID" ] ; then
+ ${MONITOR_SCRIPT_ROOT}/kill.cmd.sh $PID
+ echo "done."
+ else
+ echo "No PID to be killed."
+ fi
else
# skipping monitor
echo "SKIPPING Monitor"
# SETUP act_all database if it's not there.
if [ ! -f ${MONITOR_SCRIPT_ROOT}/actallsetup.flag ]; then
if ! python -c 'import database; database.dbLoad("act_all")' 2>/dev/null ; then
- python -c 'import database; database.dbDump("act_all", {})' 2>/dev/null ; then
+ python -c 'import database; database.dbDump("act_all", {})' 2>/dev/null
touch ${MONITOR_SCRIPT_ROOT}/actallsetup.flag
fi
fi
+set +e
AGENT=`ps ax | grep ssh-agent | grep -v grep`
+set -e
if [ -z "$AGENT" ] ; then
echo "starting ssh agent"
# if no agent is running, set it up.
rm -f ${MONITOR_DATA_ROOT}/production.findbad2.pkl
${MONITOR_SCRIPT_ROOT}/findbad.py --increment --cachenodes --debug=0 --dbname="findbad2" $DATE || :
cp ${MONITOR_DATA_ROOT}/production.findbad2.pkl ${MONITOR_DATA_ROOT}/production.findbad.pkl
-ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs kill || :
+ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || :
echo "Performing Findbad PCUs"
#########################
${MONITOR_SCRIPT_ROOT}/findbadpcu.py --increment --refresh --debug=0 --dbname=findbadpcus2 $DATE || :
cp ${MONITOR_DATA_ROOT}/production.findbadpcus2.pkl ${MONITOR_DATA_ROOT}/production.findbadpcus.pkl
# clean up stray 'locfg' processes that hang around inappropriately...
-ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs kill || :
+ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || :
-echo "Generating web data"
+#echo "Generating web data"
# badcsv.txt
-${MONITOR_SCRIPT_ROOT}/printbadcsv.py | grep -v loading | tr -d ' ' > badcsv.txt
-cp badcsv.txt /plc/data/var/www/html/monitor/
-${MONITOR_SCRIPT_ROOT}/showlatlon.py | head -9 | awk 'BEGIN {print "<table>"} { print "<tr><td>", $0, "</td></tr>"} END{print "</table>"}' | sed -e 's\|\</td><td>\g' > /plc/data/var/www/html/monitor/regions.html
+#${MONITOR_SCRIPT_ROOT}/printbadcsv.py | grep -v loading | tr -d ' ' > badcsv.txt
+#cp badcsv.txt /plc/data/var/www/html/monitor/
+#${MONITOR_SCRIPT_ROOT}/showlatlon.py | head -9 | awk 'BEGIN {print "<table>"} { print "<tr><td>", $0, "</td></tr>"} END{print "</table>"}' | sed -e 's\|\</td><td>\g' > /plc/data/var/www/html/monitor/regions.html
echo "Performing uptime changes for sites, nodes, and pcus"
########################
#########################
# 4. convert pkl to php serialize format.
${MONITOR_SCRIPT_ROOT}/pkl2php.py -i findbadpcus2 -o findbadpcus
-${MONITOR_SCRIPT_ROOT}/pkl2php.py -i act_all -o act_all
-${MONITOR_SCRIPT_ROOT}/pkl2php.py -i plcdb_hn2lb -o plcdb_hn2lb
+for f in act_all plcdb_hn2lb ; do
+ if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ]; then
+ ${MONITOR_SCRIPT_ROOT}/pkl2php.py -i $f -o $f
+ else
+ echo "Warning: ${MONITOR_DATA_ROOT}/production.$f.pkl does not exist."
+ fi
+done
${MONITOR_SCRIPT_ROOT}/pkl2php.py -i findbad -o findbadnodes
-${MONITOR_SCRIPT_ROOT}/pkl2php.py -i ad_dbTickets -o ad_dbTickets
-${MONITOR_SCRIPT_ROOT}/pkl2php.py -i idTickets -o idTickets
+#${MONITOR_SCRIPT_ROOT}/pkl2php.py -i ad_dbTickets -o ad_dbTickets
+#${MONITOR_SCRIPT_ROOT}/pkl2php.py -i idTickets -o idTickets
echo "Archiving pkl files"
#########################
# Archive pkl files.
-for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do
- cp ${MONITOR_DATA_ROOT}/production.$f.pkl ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.production.$f.pkl
+for f in findbad act_all findbadpcus l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do
+ if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ] ; then
+ cp ${MONITOR_DATA_ROOT}/production.$f.pkl ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.production.$f.pkl
+ else
+ echo "Warning: It failed to archive ${MONITOR_DATA_ROOT}/production.$f.pkl"
+ fi
done
echo "Running grouprins on all dbg nodes"
############################
# 5. Check if there are any nodes in dbg state. Clean up afterward.
-${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 \
- --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' \
- --stopselect 'state=BOOT&&kernel=2.6.22.19-vs2.3.0.34.9.planetlab' \
- --reboot || :
-${MONITOR_SCRIPT_ROOT}/findbad.py --increment --cachenodes --debug=0 --dbname="findbad" --nodeselect 'state=DEBUG&&boot_state=dbg||state=DEBUG&&boot_state=boot' || :
-
-echo "Collecting RT database dump"
-##########################
-# 6. cache the RT db locally.
-python ${MONITOR_SCRIPT_ROOT}/rt.py
+${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DOWN&&boot_state=(boot|rins|dbg|diag)' --stopselect "state=BOOT" || :
+${MONITOR_SCRIPT_ROOT}/grouprins.py --mail=1 --reboot --nodeselect 'state=DEBUG&&boot_state=(rins|dbg|boot)' --stopselect 'state=BOOT' || :
+cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log
rm -f $MONITOR_PID
"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+ # actual solution appears to involve removing the bad files, and
+ # continually trying to boot the node.
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
]:
sequences.update({n : "restart_bootmanager_rins"})
import database
import time
import mailer
-from unified_model import cmpCategoryVal
import sys
import emailTxt
import string
del diag['CloseRT']
else:
- print "NOT sending email : %s %s" % (config.mail, record.data['rt'])
+ print "NOT sending email : %s" % config.mail
return
RT_WEB_USER=
RT_WEB_PASSWORD=
RT_WEB_DEBUG=0
+RT_QUEUE=
# PLC admin account
API_SERVER=https://boot.planet-lab.org/PLCAPI/
MONITOR_HOSTNAME=monitor.planet-lab.org
MONITOR_SCRIPT_ROOT=/usr/share/monitor-server
MONITOR_DATA_ROOT=/var/lib/monitor-server
-MONITOR_ARCHIVE_ROOT=/usr/share/monitor-server/archive-pdb
+MONITOR_ARCHIVE_ROOT=/var/lib/monitor-server/archive-pdb
email=
# Runs every three hours to poll all nodes and PCUs, as well as take some
# automated actions for debug nodes.
-01 6,9,12,15,18,21 * * * root /usr/share/monitor-server/automate.sh
+01 * * * * root /usr/share/monitor-server/automate.sh > /usr/share/monitor-server/monitor.log 2>&1
+
input_text = "Subject: %s\n"
input_text += "Requestor: %s\n"% FROM
input_text += "id: ticket/new\n"
- input_text += "Queue: Monitor\n"
+ input_text += "Queue: %s\n" % config.RT_QUEUE
for recipient in to:
input_text += "AdminCc: %s\n" % recipient
input_text += "Text: %s"
# get site details.
s = api.GetSites(loginbase)[0]
# get people at site
- p = api.GetPersons(s['person_ids'])[0]
+ p = api.GetPersons(s['person_ids'])
# pull out those with the right role.
emails = [ person['email'] for person in filter(lambda x: 'tech' in x['roles'], p) ]
return emails
# get site details.
s = api.GetSites(loginbase)[0]
# get people at site
- p = api.GetPersons(s['person_ids'])[0]
+ p = api.GetPersons(s['person_ids'])
# pull out those with the right role.
emails = [ person['email'] for person in filter(lambda x: 'pi' in x['roles'], p) ]
return emails
Returns loginbase for given nodename
'''
def siteId(nodename):
- api = xmlrpclib.Server(auth.server, verbose=False)
- anon = {'AuthMethod': "anonymous"}
- site_id = api.GetNodes (anon, {"hostname": nodename}, ['site_id'])
+ api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
+ site_id = api.GetNodes (auth.auth, {"hostname": nodename}, ['site_id'])
if len(site_id) == 1:
- loginbase = api.GetSites (anon, site_id[0], ["login_base"])
+ loginbase = api.GetSites (auth.auth, site_id[0], ["login_base"])
return loginbase[0]['login_base']
+ else:
+ print "No nodes returned!!!!"
'''
Returns list of slices for a site.
'''
def slices(loginbase):
siteslices = []
- api = xmlrpclib.Server(auth.server, verbose=False)
+ api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
sliceids = api.GetSites (auth.auth, {"login_base" : loginbase}, ["slice_ids"])[0]['slice_ids']
for slice in api.GetSlices(auth.auth, {"slice_id" : sliceids}, ["name"]):
siteslices.append(slice['name'])
Returns dict of PCU info of a given node.
'''
def getpcu(nodename):
- api = xmlrpclib.Server(auth.server, verbose=False)
+ api = xmlrpclib.Server(auth.server, verbose=False, allow_none=True)
anon = {'AuthMethod': "anonymous"}
nodeinfo = api.GetNodes(auth.auth, {"hostname": nodename}, ["pcu_ids", "ports"])[0]
if nodeinfo['pcu_ids']:
def fb_print_nodeinfo(fbnode):
pf = PersistFlags(fbnode['hostname'], 1, db='node_persistflags')
- fbnode['last_change'] = diff_time(pf.last_changed)
+ try:
+ fbnode['last_change'] = diff_time(pf.last_changed)
+ except:
+ fbnode['last_change'] = diff_time(time.time())
print " Checked: ",
if 'checked' in fbnode:
print "%11.11s " % diff_time(fbnode['checked'])
print str
keys = categories.keys()
- for cat in ['BOOT-ALPHA', 'BOOT-PROD', 'BOOT-OLDBOOTCD', 'DEBUG-ALPHA',
- 'DEBUG-PROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']:
+ for cat in ['BOOT-PROD', 'BOOT-OLDPROD', 'BOOT-OLDBOOTCD', 'DEBUG-PROD',
+ 'DEBUG-OLDPROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']:
if cat not in keys:
categories[cat] = 0
keys = categories.keys()
- for cat in ['BOOT-ALPHA', 'BOOT-PROD', 'BOOT-OLDBOOTCD', 'DEBUG-ALPHA',
- 'DEBUG-PROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']:
+ for cat in ['BOOT-PROD', 'BOOT-OLDPROD', 'BOOT-OLDBOOTCD', 'DEBUG-PROD',
+ 'DEBUG-OLDPROD', 'DEBUG-OLDBOOTCD', 'DOWN-ERROR']:
if cat in keys:
print "%d," % categories[cat],
print ""
+++ /dev/null
-import os
-import sys
-import pickle
-noserial=False
-try:
- from PHPSerialize import *
- from PHPUnserialize import *
-except:
- #print >>sys.stderr, "PHPSerial db type not allowed."
- noserial=True
-
-import inspect
-import shutil
-import config
-
-import config
-
-DEBUG= 0
-PICKLE_PATH=config.MONITOR_DATA_ROOT
-
-class ExceptionTimeout(Exception): pass
-
-def dbLoad(name, type=None):
- return SPickle().load(name, type)
-
-def dbExists(name, type=None):
- #if self.config.debug:
- # name = "debug.%s" % name
- return SPickle().exists(name, type)
-
-def dbDump(name, obj=None, type=None):
- # depth of the dump is 2 now, since we're redirecting to '.dump'
- return SPickle().dump(name, obj, type, 2)
-
-def if_cached_else_refresh(cond, refresh, name, function, type=None):
- s = SPickle()
- if refresh:
- if not config.debug and s.exists("production.%s" % name, type):
- s.remove("production.%s" % name, type)
- if config.debug and s.exists("debug.%s" % name, type):
- s.remove("debug.%s" % name, type)
-
- return if_cached_else(cond, name, function, type)
-
-def if_cached_else(cond, name, function, type=None):
- s = SPickle()
- if (cond and s.exists("production.%s" % name, type)) or \
- (cond and config.debug and s.exists("debug.%s" % name, type)):
- o = s.load(name, type)
- else:
- o = function()
- if cond:
- s.dump(name, o, type) # cache the object using 'name'
- o = s.load(name, type)
- # TODO: what if 'o' hasn't been converted...
- return o
-
-class SPickle:
- def __init__(self, path=PICKLE_PATH):
- self.path = path
-
- def if_cached_else(self, cond, name, function, type=None):
- if cond and self.exists("production.%s" % name, type):
- o = self.load(name, type)
- else:
- o = function()
- if cond:
- self.dump(name, o, type) # cache the object using 'name'
- return o
-
- def __file(self, name, type=None):
- if type == None:
- return "%s/%s.pkl" % (self.path, name)
- else:
- if noserial:
- raise Exception("No PHPSerializer module available")
-
- return "%s/%s.phpserial" % (self.path, name)
-
- def exists(self, name, type=None):
- return os.path.exists(self.__file(name, type))
-
- def remove(self, name, type=None):
- return os.remove(self.__file(name, type))
-
- def load(self, name, type=None):
- """
- In debug mode, we should fail if neither file exists.
- if the debug file exists, reset name
- elif the original file exists, make a copy, reset name
- else neither exist, raise an error
- Otherwise, it's normal mode, if the file doesn't exist, raise error
- Load the file
- """
-
- if config.debug:
- if self.exists("debug.%s" % name, type):
- name = "debug.%s" % name
- elif self.exists("production.%s" % name, type):
- debugname = "debug.%s" % name
- if not self.exists(debugname, type):
- name = "production.%s" % name
- shutil.copyfile(self.__file(name, type), self.__file(debugname, type))
- name = debugname
- else: # neither exist
- raise Exception, "No such pickle based on %s" % self.__file("debug.%s" % name, type)
- else:
- if self.exists("production.%s" % name, type):
- name = "production.%s" % name
- elif self.exists(name, type):
- name = name
- else:
- raise Exception, "No such file %s" % name
-
-
- #print "loading %s" % self.__file(name, type)
- f = open(self.__file(name, type), 'r')
- if type == None:
- o = pickle.load(f)
- else:
- if noserial:
- raise Exception("No PHPSerializer module available")
- s = PHPUnserialize()
- o = s.unserialize(f.read())
- f.close()
- return o
-
-
- # use the environment to extract the data associated with the local
- # variable 'name'
- def dump(self, name, obj=None, type=None, depth=1):
- if obj == None:
- o = inspect.getouterframes(inspect.currentframe())
- up1 = o[depth][0] # get the frame one prior to (up from) this frame
- argvals = inspect.getargvalues(up1)
- # TODO: check that 'name' is a local variable; otherwise this would fail.
- obj = argvals[3][name] # extract the local variable name 'name'
- if not os.path.isdir("%s/" % self.path):
- os.mkdir("%s" % self.path)
- if config.debug:
- name = "debug.%s" % name
- else:
- name = "production.%s" % name
- f = open(self.__file(name, type), 'w')
- if type == None:
- pickle.dump(obj, f)
- else:
- if noserial:
- raise Exception("No PHPSerializer module available")
- s = PHPSerialize()
- f.write(s.serialize(obj))
- f.close()
- return
-
-
-COMMAND_TIMEOUT = 60
-ssh_options = { 'StrictHostKeyChecking':'no',
- 'BatchMode':'yes',
- 'PasswordAuthentication':'no',
- 'ConnectTimeout':'%s' % COMMAND_TIMEOUT}
-from select import select
-import subprocess
-import signal
-
-class Sopen(subprocess.Popen):
- def kill(self, signal = signal.SIGTERM):
- os.kill(self.pid, signal)
-
-def read_t(stream, count, timeout=COMMAND_TIMEOUT*2):
- lin, lout, lerr = select([stream], [], [], timeout)
- if len(lin) == 0:
- raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
-
- return stream.read(count)
-
-class CMD:
- def __init__(self):
- pass
-
- def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2):
-
- #print "CMD.run_noexcept(%s)" % cmd
- try:
- return CMD.run(self,cmd,timeout)
- except ExceptionTimeout:
- import traceback; print traceback.print_exc()
- return ("", "SCRIPTTIMEOUT")
-
- def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
- (o,e) = self.run(cmd, timeout)
- self.output = o
- self.error = e
- if self.s.returncode is None:
- self.s.wait()
- return self.s.returncode
-
- def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
-
- #print "CMD.run(%s)" % cmd
- s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
- self.s = s
- (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
- #print "calling select(%s)" % timeout
- lout, lin, lerr = select([f_out], [], [f_err], timeout)
- #print "TIMEOUT!!!!!!!!!!!!!!!!!!!"
- if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
- # Reached a timeout! Nuke process so it does not hang.
- #print "KILLING"
- s.kill(signal.SIGKILL)
- raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
- else:
- #print "RETURNING"
- #print len(lin), len(lout), len(lerr)
- pass
-
- o_value = ""
- e_value = ""
-
- #print "reading from f_out"
- if len(lout) > 0: o_value = f_out.read()
- #print "reading from f_err"
- if len(lerr) > 0: e_value = f_err.read()
-
- #print "striping output"
- o_value = o_value.strip()
- e_value = e_value.strip()
-
- #print "OUTPUT", o_value, e_value
-
- #print "closing files"
- f_out.close()
- f_in.close()
- f_err.close()
- try:
- #print "s.kill()"
- s.kill()
- #print "after s.kill()"
- except OSError:
- # no such process, due to it already exiting...
- pass
-
- #print o_value, e_value
- return (o_value, e_value)
-
- def runargs(self, args, timeout=COMMAND_TIMEOUT*2):
-
- #print "CMD.run(%s)" % " ".join(args)
- s = Sopen(args, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
- self.s = s
- (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
- lout, lin, lerr = select([f_out], [], [f_err], timeout)
- if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
- # Reached a timeout! Nuke process so it does not hang.
- s.kill(signal.SIGKILL)
- raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
- o_value = f_out.read()
- e_value = ""
- if o_value == "": # An error has occured
- e_value = f_err.read()
-
- o_value = o_value.strip()
- e_value = e_value.strip()
-
- f_out.close()
- f_in.close()
- f_err.close()
- try:
- s.kill()
- except OSError:
- # no such process, due to it already exiting...
- pass
-
- return (o_value, e_value)
-
-
-class SSH(CMD):
- def __init__(self, user, host, port=22, options = ssh_options):
- self.options = options
- self.user = user
- self.host = host
- self.port = port
- return
-
- def __options_to_str(self):
- options = ""
- for o,v in self.options.iteritems():
- options = options + "-o %s=%s " % (o,v)
- return options
-
- def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
- cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(),
- self.user, self.host, cmd)
- #print "SSH.run(%s)" % cmd
- return CMD.run(self, cmd, timeout)
-
- def get_file(self, rmt_filename, local_filename=None):
- if local_filename == None:
- local_filename = "./"
- cmd = "scp -P %s -B %s %s@%s:%s %s" % (self.port, self.__options_to_str(),
- self.user, self.host,
- rmt_filename, local_filename)
- # output :
- # errors will be on stderr,
- # success will have a blank stderr...
- return CMD.run_noexcept(self, cmd)
-
- def run_noexcept(self, cmd):
- cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(),
- self.user, self.host, cmd)
- #print "SSH.run_noexcept(%s)" % cmd
- return CMD.run_noexcept(self, cmd)
-
- def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2):
- cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(),
- self.user, self.host, cmd)
- #print "SSH.run_noexcept2(%s)" % cmd
- r = CMD.run_noexcept(self, cmd, timeout)
-
- # XXX: this may be resulting in deadlocks... not sure.
- #if self.s.returncode is None:
- # #self.s.kill()
- # self.s.kill(signal.SIGKILL)
- # self.s.wait()
- # self.ret = self.s.returncode
- self.ret = -1
-
- return r
-
- def system2(self, cmd, timeout=COMMAND_TIMEOUT*2):
- cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(),
- self.user, self.host, cmd)
- #print "SSH.system2(%s)" % cmd
- return CMD.system(self, cmd, timeout)
-
- def runE(self, cmd):
- cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(),
- self.user, self.host, cmd)
- if ( DEBUG == 1 ):
- print cmd,
- (f_in, f_out, f_err) = os.popen3(cmd)
-
- value = f_out.read()
- if value == "": # An error has occured
- value = f_err.read()
- value = value.strip()
-
- if ( DEBUG == 1 ):
- print " == %s" % value
- f_out.close()
- f_in.close()
- f_err.close()
- return value.strip()
-
-import time
-class MyTimer:
- def __init__(self):
- self.start = time.time()
-
- def end(self):
- self.end = time.time()
- t = self.end-self.start
- return t
-
- def diff(self):
- self.end = time.time()
- t = self.end-self.start
- self.start = self.end
- return t
raise Exception("No index %s or %s in map" % (v1, v2))
def cmpCategoryVal(v1, v2):
+ # Terrible hack to manage migration to no more 'ALPHA' states.
+ if v1 == 'ALPHA': v1 = "PROD"
+ if v2 == 'ALPHA': v2 = "PROD"
+ #map = array_to_priority_map([ None, 'PROD', 'ALPHA', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
return cmpValMap(v1,v2,map)
if ADMIN & roles:
contacts += [config.email]
if TECH & roles:
- contacts += [TECHEMAIL % self.loginbase]
+ #contacts += [TECHEMAIL % self.loginbase]
+ contacts += plc.getTechEmails(self.loginbase)
if PI & roles:
- contacts += [PIEMAIL % self.loginbase]
+ #contacts += [PIEMAIL % self.loginbase]
+ contacts += plc.getPIEmails(self.loginbase)
if USER & roles:
+ contacts += plc.getSliceUserEmails(self.loginbase)
slices = plc.slices(self.loginbase)
if len(slices) >= 1:
- for slice in slices:
- contacts += [SLICEMAIL % slice]
+ #for slice in slices:
+ # contacts += [SLICEMAIL % slice]
print "SLIC: %20s : %d slices" % (self.loginbase, len(slices))
else:
print "SLIC: %20s : 0 slices" % self.loginbase
from monitor import config
import string
import sys
+import time
categories = {}
ssherror = False
return cmpMap(l1,l2,'state', map)
def cmpCategoryVal(v1, v2):
- map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
+ map = array_to_priority_map([ None, 'ALPHA', 'PROD', 'OLDPROD', 'OLDBOOTCD', 'UNKNOWN', 'FORCED', 'ERROR', ])
return cmpValMap(v1,v2,map)
def cmpCategory(l1, l2):
- map = array_to_priority_map([ 'ALPHA', 'PROD', 'OLDBOOTCD', 'UNKNOWN', 'ERROR', ])
+ map = array_to_priority_map([ 'ALPHA', 'PROD', 'OLDPROD', 'OLDBOOTCD', 'UNKNOWN', 'ERROR', ])
return cmpMap(l1,l2,'category', map)
def cmpPCU(l1, l2):
#print "state: %s<br>" % pcu_state(vals['plcnode']['pcu_ids'][0])
#print "color: %s<br>" % pcu_colorMap[pcu_state(vals['plcnode']['pcu_ids'][0])]
bgcolor = "bgcolor='%s'" % pcu_colorMap[pcu_state(vals['plcnode']['pcu_ids'][0])]
- url = "<a href='/cgi-bin/printbadpcus.php#id%s'>PCU</a>" % vals['plcnode']['pcu_ids'][0]
+ url = "<a href='/cgi-bin/monitor/printbadpcus.php?id=%s'>PCU</a>" % vals['plcnode']['pcu_ids'][0]
r_str += "<td nowrap %s>%s</td>" % (bgcolor, url)
else:
r_str += "<td nowrap %s>%s</td>" % (bgcolor, f)
return r_str
+def my_diff_time(timestamp):
+ now = time.time()
+ if timestamp == None:
+ return "not yet contacted"
+ diff = now - timestamp
+ # return the number of seconds as a difference from current time.
+ t_str = ""
+ if diff < 60: # sec in min.
+ t = diff
+ t_str = "%s sec ago" % t
+ elif diff < 60*60: # sec in hour
+ t = diff // (60)
+ t_str = "%s min ago" % int(t)
+ elif diff < 60*60*24: # sec in day
+ t = diff // (60*60)
+ t_str = "%s hours ago" % int(t)
+ elif diff < 60*60*24*7: # sec in week
+ t = diff // (60*60*24)
+ t_str = "%s days ago" % int(t)
+ elif diff < 60*60*24*30: # approx sec in month
+ t = diff // (60*60*24*7)
+ t_str = "%s weeks ago" % int(t)
+ elif diff > 60*60*24*30 and diff < 60*60*24*30*2: # approx sec in month
+ month = int( diff // (60*60*24*30) )
+ weeks = (diff - (month * (60*60*24*30))) // (60*60*24*7)
+ if weeks == 0:
+ t_str = "%s month ago" % int(month)
+ elif weeks == 4:
+ t_str = "2 months ago"
+ else:
+ t_str = "%s month and %s weeks ago" % ( int(month) , int(weeks) )
+ elif diff >= 60*60*24*30*2:
+ month = diff // (60*60*24*30)
+ t_str = "%s months ago" % int(month)
+ return t_str
def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter):
global fb
+ import os
+ import datetime
+ if nodeonlyfilter == None:
+ print "<html><body>\n"
+
+ try:
+ mtime = os.stat("/var/lib/monitor-server/production.findbad.pkl")[-2]
+ print "Last Updated: %s GMT" % datetime.datetime.fromtimestamp(mtime)
+ except:
+ pass
+
db = database.dbLoad(config.dbname)
fb = database.dbLoad("findbadpcus")
'state' : 5,
'kernel' : 10.65,
'comonstats' : 5,
+ 'last_contact' : 10.65,
'plcsite' : 12,
'bootcd' : 10.65}
## create format string based on config.fields
format_fields = []
for f in config.fields.split(','):
fields[f] = "%%(%s)s" % f
+ #print f
#if f in maxFieldLengths:
# fields[f] = "%%(%s)%ds" % (f, maxFieldLengths[f])
#else:
if comonfilter != None: cmf = re.compile(comonfilter)
else: cmf = None
+
+ output_str = ""
#l_loginbase = bysite.keys()
#l_loginbase.sort()
if nodeonlyfilter == None:
- print "<table width=80% border=1>"
+ output_str += "<table width=80% border=1>"
prev_sitestring = ""
for row in d2:
vals = row
+ #added by guto about last contact information
if (catfilter != None and cf.match(vals['category']) == None):
continue
continue
if nodeonlyfilter != None:
- print vals['nodename']
+ output_str += vals['nodename']
continue
site_string = row['site_string']
if site_string != prev_sitestring:
- print "<tr><td bgcolor=lightblue nowrap>"
- print site_string
- print "</td>"
+ output_str += "<tr><td bgcolor=lightblue nowrap>"
+ output_str += site_string
+ output_str += "</td>"
else:
- print "<tr><td> </td>"
+ output_str += "<tr><td> </td>"
prev_sitestring = site_string
url = "<a href='https://%s/db/nodes/index.php?nodepattern=%s'>%s</a>" % (config.MONITOR_HOSTNAME, vals['nodename'], vals['nodename'])
vals['nodename'] = url
+ if 'plcnode' in vals:
+ if vals['plcnode']['status'] == "GN_FAILED":
+ vals['last_contact'] = "UNKNOWN"
+ else:
+ vals['last_contact'] = my_diff_time(vals['plcnode']['last_contact'])
+
try:
str_fields = []
count = 0
print >>sys.stderr, vals
s = fields_to_html(str_fields, vals)
- print s
+ output_str += s
- print "\n</tr>"
+ output_str += "\n</tr>"
if nodeonlyfilter == None:
- print "</table>"
- print "<table>"
+ output_str += "</table>"
keys = categories.keys()
keys.sort()
+ print "<table>"
for cat in keys:
print "<tr>"
print "<th nowrap align=left>Total %s</th>" % cat
if nodeonlyfilter == None:
print "</table>"
+ print output_str
+ if nodeonlyfilter == None:
+ print "</body></html>\n"
+
if __name__ == '__main__':
config.cmpdays=False
config.comon="sshstatus"
- config.fields="nodename,ping,ssh,pcu,category,state,comonstats,kernel,bootcd"
+ config.fields="nodename,ping,ssh,pcu,category,state,last_contact,kernel,bootcd"
config.dbname="findbad"
config.cmpping=False
config.cmpdns=False
config.cmpcategory=False
print "Content-Type: text/html\r\n"
- if mynodeonly == None:
- print "<html><body>\n"
if len(sys.argv) > 1:
if sys.argv[1] == "ssherror":
ssherror = True
main(myfilter, mycategory, mystate, mycomon,mynodeonly)
- if mynodeonly == None:
- print "</body></html>\n"
function plc_site_link($site_name)
{
- return "https://www.planet-lab.org/db/sites/index.php?site_pattern=" . $site_name;
+ return "https://" . MONITOR_HOSTNAME . "/db/sites/index.php?site_pattern=" . $site_name;
}
function pcu_link($pcu)
{
- return "https://www.planet-lab.org/db/sites/pcu.php?id=" . $pcu['pcu_id'];
+ return "https://" . MONITOR_HOSTNAME . "/db/sites/pcu.php?id=" . $pcu['pcu_id'];
}
function pcu_site($pcu)