# Nightly monitor driver: kills any lingering monitor instance, refreshes the
# findbad / findbadpcu databases, and archives the resulting pickle files.
# NOTE(review): this chunk is a diff fragment; lines prefixed '-'/'+' are the
# old/new versions of a line and are preserved verbatim below.
cd $HOME/monitor/
DATE=`date +%Y-%m-%d-%T`
-
# The SKIP file holds the PID of a still-running monitor instance
# (see `cat $HOME/monitor/SKIP` below).
if [ -f $HOME/monitor/SKIP ] ; then
+ # echo "SKIPPING Monitor"
+ # exit
# TODO: should be possible to kill the old version if
# desired and prevent lingering instances of automate.
if [ -z "$1" ] ; then
echo "KILLING Monitor"
- ./kill.cmd.sh `cat $HOME/monitor/SKIP`
# Capture the PID before removing SKIP so the kill target is not lost.
+ PID=`cat $HOME/monitor/SKIP`
rm -f $HOME/monitor/SKIP
+ ./kill.cmd.sh $PID
else
# skipping monitor
echo "SKIPPING Monitor"
#########################
# 1. FINDBAD NODES
rm -f pdb/production.findbad2.pkl
-./findbad.py --cachenodes --debug=0 --dbname="findbad2" $DATE
+./findbad.py --increment --cachenodes --debug=0 --dbname="findbad2" $DATE
+
# Kill leftover processes matching 'BatchMode' -- presumably stuck
# non-interactive ssh sessions from the scan; TODO confirm.
+ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs kill
########################
# COPY to golf for diagnose.py and action.py
rm -f pdb/production.findbadpcus2.pkl
./findbadpcu.py --increment --refresh --debug=0 --dbname=findbadpcus2 $DATE
+./sitebad.py --increment
+
# clean up stray 'locfg' processes that hang around inappropriately...
ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs kill
./pkl2php.py -i idTickets -o idTickets
# Archive today's pickles; the new name adds HH:MM so multiple runs in one
# day no longer overwrite each other.
for f in findbad act_all findbadpcus l_plcnodes; do
- cp pdb/production.$f.pkl archive-pdb/`date +%F`.production.$f.pkl
+ cp pdb/production.$f.pkl archive-pdb/`date +%F-%H:%M`.production.$f.pkl
done
rm -f $HOME/monitor/SKIP
sudo /usr/sbin/vps ax
-If you have a BootCD older than 3.0, you will need to create a new BootCD and configuration file. You can find instructions for this at the Technical Contact's Guide:
+If you have a BootCD older than 3.0, you will need to create and burn a new BootImage to CD or USB. You can find instructions for this at the Technical Contact's Guide:
https://www.planet-lab.org/doc/guides/bootcdsetup
If, after following these directions, you are able to either log in with your site_admin account or see the CoMon report of your machine, there is no need to respond to this message. However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue. Including this message in your reply will help us coordinate our records with the actions you've taken.
-After a week, we will disable your site's ability to create new slices. Because this action will directly affect your site's registered PI, we will also CC the PI for help at that time.
-
Thank you for your help,
-- PlanetLab Central (support@planet-lab.org)
""")
+#If no one responds, then after a week, we will disable your site's ability to create new slices. Because this action will directly affect your site's registered PI, we will also CC the PI for help at that time.
+
newdown_two=("""PlanetLab node(s) down: %(loginbase)s""",
"""
Hello,
If your node returns to normal operation after following these directions, then there's no need to respond to this message. However, if there are any console messages relating to the node's failure, please report them to PlanetLab support (support@planet-lab.org) so we can help resolve the issue. Including this message in your reply will help us coordinate our records with the actions you've taken.
-After a week, we will disable your site's ability to create new slices. Because this action will directly affect your site's registered PI, we will also CC the PI for help at that time.
-
Thank you for your help,
-- PlanetLab Central (support@planet-lab.org)
""")
+#After a week, we will disable your site's ability to create new slices. Because this action will directly affect your site's registered PI, we will also CC the PI for help at that time.
+
newbootcd_two=(""" Planetlab nodes need a new BootCD: %(loginbase)s""", # : %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD:
"""As part of our machine monitoring and maintenance, we tried to use the PCU
registered below, but could not for the following reason at the link below:
- https://pl-virtual-03.cs.princeton.edu/cgi-bin/printbadpcus.php#id%(pcu_id)s
+ https://pl-virtual-03.cs.princeton.edu/cgi-bin/printbadpcus.php?id=%(pcu_id)s
We need your help resolving this issue in two ways:
registered below, and though it appears to succeed, we do not subsequently
observe the associated nodes rebooting:
- https://pl-virtual-03.cs.princeton.edu/cgi-bin/printbadpcus.php#id%(pcu_id)s
+ https://pl-virtual-03.cs.princeton.edu/cgi-bin/printbadpcus.php?id=%(pcu_id)s
%(hostname_list)s
baddisk = ("""Bad Disk on PlanetLab node %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node.
-Please verify the integrity of the disk, and order a replacment if needed. If you need to schedule downtime for the node, please let us know at support@planet-lab.org.
+Please verify the integrity of the disk, and order a replacement if needed. If you need to schedule downtime for the node, please let us know at support@planet-lab.org.
Thanks.
-- PlanetLab Central (support@planet-lab.org)
""")
+ plnode_cfg=(""" Please Verify Network Configuration for PlanetLab node %(hostname)s""",
+"""Hello,
+
+As part of PlanetLab node monitoring, we noticed that %(hostname)s has a network configuration error related to DNS or hostname lookups. Often this can happen either due to local configuration changes, or a misconfiguration of the node's DNS servers. To resolve the issue we require your assistance. All that is needed is to visit:
+
+ https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
+
+Find the primary node network entry and confirm that the settings are correct.
+
+If you use 'static' network configuration, verify that the DNS servers are correct. If you are using 'dhcp' then you will need to confirm that the information returned for the node will allow it to perform lookups on its own hostname.
+
+If you change the network settings, then select, "Download -> Download plnode.txt file for %(hostname)s" menu. This will generate a new configuration file for your node. Copy this file to the appropriate read-only media, either floppy or USB stick, and reboot the machine. If you are using an All-in-One boot image, then you will need to download the All-in-One image instead, burn it to the appropriate media (CD or USB) and reboot.
+
+Please let us know if you need any assistance.
+
+Thank you for your help,
+ -- PlanetLab Central (support@planet-lab.org)
+
+BootManager.log output follows:
+---------------------------------------------------------
+%(bmlog)s
+""")
+
plnode_cfg=(""" Please Update Configuration file for PlanetLab node %(hostname)s""",
- """As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME. This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade. To resolve the issue we require your assistance. All that is needed is to visit:
+"""As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME. This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade. To resolve the issue we require your assistance. All that is needed is to visit:
https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
from optparse import OptionParser
from automate import *
-parser = OptionParser()
-parser.set_defaults(nodelist=None,
- node=None,
- outdir=None,
- querystr=None,
- timeout=0,
- simple=False,
- run=False,
- cmdfile=None,)
-
-parser.add_option("", "--nodelist", dest="nodelist", metavar="filename",
- help="Read list of nodes from specified file")
-parser.add_option("", "--node", dest="node", metavar="hostname",
- help="specify a single node name.")
-parser.add_option("", "--timeout", dest="timeout", metavar="seconds",
- help="Number of seconds to wait before timing out on host.")
-parser.add_option("", "--outdir", dest="outdir", metavar="dirname",
- help="Name of directory to place output")
-parser.add_option("", "--cmd", dest="cmdfile", metavar="filename",
- help="Name of file that contains a unix-to-csv command " + \
- "to run on the hosts.")
-
-config = config(parser)
-config.parse_args()
-
def build_vx_args(shell_cmd):
    """Build the argv list handed to vxargs for a parallel ssh sweep.

    shell_cmd -- the remote command string, appended as the final argument.

    Returns a list of words: the ssh invocation (including the vxargs
    '{}' hostname placeholder) followed by shell_cmd.
    """
    # Skip host-key verification: monitored nodes are reinstalled often, so
    # their host keys change; a throwaway known_hosts file avoids prompts.
    ssh_options = "-q -o UserKnownHostsFile=junkssh -o StrictHostKeyChecking=no"
    cmd = """ssh %s root@{} """ % ssh_options
    # BUG FIX: 'args' was referenced before assignment and 'cmd' was never
    # used; split the ssh command line into the argv list vxargs expects.
    args = cmd.split()
    args.append(shell_cmd)
    return args
# Launch vxargs over every (host, comment) pair in 'filelist', running 'cmd'
# on each host in parallel; per-host output is written under 'outdir'.
# timeout is in seconds (0 = no per-host timeout).
# NOTE(review): this hunk raises parallelism from 10 to 20 and replaces the
# global 'config.timeout' dependency with an explicit parameter.
-def vx_start(filelist,outdir,cmd):
+def vx_start(filelist,outdir,cmd, timeout=0):
args = build_vx_args(cmd)
- #vxargs.start(None, 10, filelist, outdir, False, args, 120)
- vxargs.start(None, 10, filelist, outdir, False, args, int(config.timeout))
+ vxargs.start(None, 20, filelist, outdir, False, args, timeout)
+
+if __name__ == "__main__":
+ parser = OptionParser()
+ parser.set_defaults(nodelist=None,
+ node=None,
+ outdir=None,
+ querystr=None,
+ timeout=0,
+ simple=False,
+ run=False,
+ cmdfile=None,)
+
+ parser.add_option("", "--nodelist", dest="nodelist", metavar="filename",
+ help="Read list of nodes from specified file")
+ parser.add_option("", "--node", dest="node", metavar="hostname",
+ help="specify a single node name.")
+ parser.add_option("", "--timeout", dest="timeout", metavar="seconds",
+ help="Number of seconds to wait before timing out on host.")
+ parser.add_option("", "--outdir", dest="outdir", metavar="dirname",
+ help="Name of directory to place output")
+ parser.add_option("", "--cmd", dest="cmdfile", metavar="filename",
+ help="Name of file that contains a unix-to-csv command " + \
+ "to run on the hosts.")
+
+ config = config(parser)
+ config.parse_args()
-if config.outdir == None:
- outdir="checkhosts"
-else:
- outdir=config.outdir
+ if config.outdir == None:
+ outdir="checkhosts"
+ else:
+ outdir=config.outdir
-if not os.path.exists(outdir):
- os.system('mkdir -p %s' % outdir)
+ if not os.path.exists(outdir):
+ os.system('mkdir -p %s' % outdir)
-if config.nodelist == None and config.node == None:
- filelist="nocomon.txt"
- filelist = vxargs.getListFromFile(open(filelist,'r'))
-elif os.path.exists(str(config.nodelist)) and os.path.isfile(config.nodelist):
- filelist = vxargs.getListFromFile(open(config.nodelist,'r'))
-elif os.path.exists(str(config.nodelist)) and os.path.isdir(config.nodelist):
- filelist = get_hostlist_from_dir(config.nodelist)
-elif config.node is not None:
- filelist = [(config.node, '')]
-else:
- # probably no such file.
- raise Exception("No such file %s" % config.nodelist)
+ if config.nodelist == None and config.node == None:
+ filelist="nocomon.txt"
+ filelist = vxargs.getListFromFile(open(filelist,'r'))
+ elif os.path.exists(str(config.nodelist)) and os.path.isfile(config.nodelist):
+ filelist = vxargs.getListFromFile(open(config.nodelist,'r'))
+ elif os.path.exists(str(config.nodelist)) and os.path.isdir(config.nodelist):
+ filelist = get_hostlist_from_dir(config.nodelist)
+ elif config.node is not None:
+ filelist = [(config.node, '')]
+ else:
+ # probably no such file.
+ raise Exception("No such file %s" % config.nodelist)
-if config.cmdfile == None:
- f = open("command.txt",'r')
- cmd = f.read()
-else:
- f = open(config.cmdfile,'r')
- cmd = f.read()
+ if config.cmdfile == None:
+ f = open("command.txt",'r')
+ cmd = f.read()
+ else:
+ f = open(config.cmdfile,'r')
+ cmd = f.read()
-vx_start(filelist, outdir, cmd)
+ vx_start(filelist, outdir, cmd, int(config.timeout))
# QUERY all nodes.
COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
"table=table_nodeview&" + \
- "dumpcols='name,resptime,sshstatus,uptime,lastcotop'&" + \
+ "dumpcols='name,resptime,sshstatus,uptime,lastcotop,cpuspeed,memsize,disksize'&" + \
"formatcsv"
#"formatcsv&" + \
#"select='lastcotop!=0'"
import comon
import threadpool
import syncplcdb
+from nodequery import verify,query_to_dict,node_select
import plc
import auth
def collectPingAndSSH(nodename, cohash):
### RUN PING ######################
ping = soltesz.CMD()
- (oval,eval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
+ (oval,errval) = ping.run_noexcept("ping -c 1 -q %s | grep rtt" % nodename)
values = {}
else:
values['ping'] = "PING"
- #uptime = soltesz.SSH('root', nodename)
- #(oval,eval) = uptime.run_noexcept("uptime | awk '{print $3,$4}' | tr , ' '")
+ try:
+ for port in [22, 806]:
+ ssh = soltesz.SSH('root', nodename, port)
+
+ (oval, errval) = ssh.run_noexcept2(""" <<\EOF
+ echo "{"
+ echo ' "kernel":"'`uname -a`'",'
+ echo ' "bmlog":"'`ls /tmp/bm.log`'",'
+ echo ' "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",'
+ echo ' "nm":"'`ps ax | grep nm.py | grep -v grep`'",'
+ echo ' "princeton_comon":"'`ls -d /vservers/princeton_comon`'",'
+
+ ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'`
+
+ echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
+ echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
+ echo "}"
+EOF """)
+
+ if len(oval) > 0:
+ values.update(eval(oval))
+ values['sshport'] = port
+ break
+ else:
+ values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '', 'nm' :
+ '', 'princeton_comon' : '', 'princeton_comon_running' : '',
+ 'princeton_comon_procs' : '', 'sshport' : None})
+ except:
+ import traceback; print traceback.print_exc()
+ sys.exit(1)
### RUN SSH ######################
b_getbootcd_id = True
- ssh = soltesz.SSH('root', nodename)
- oval = ""
- eval = ""
- (oval, eval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
- val = oval
+ #ssh = soltesz.SSH('root', nodename)
+ #oval = ""
+ #errval = ""
+ #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`')
+
+ oval = values['kernel']
if "2.6.17" in oval or "2.6.2" in oval:
values['ssh'] = 'SSH'
values['category'] = 'ALPHA'
- if "bm.log" in oval:
+ if "bm.log" in values['bmlog']:
values['state'] = 'DEBUG'
else:
values['state'] = 'BOOT'
elif "2.6.12" in oval or "2.6.10" in oval:
values['ssh'] = 'SSH'
values['category'] = 'PROD'
- if "bm.log" in oval:
+ if "bm.log" in values['bmlog']:
values['state'] = 'DEBUG'
else:
values['state'] = 'BOOT'
- elif "2.4" in oval:
+
+ # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails. I have no idea why.
+ elif "2.4" in oval or "2.6.8" in oval:
b_getbootcd_id = False
values['ssh'] = 'SSH'
values['category'] = 'OLDBOOTCD'
elif oval != "":
values['ssh'] = 'SSH'
values['category'] = 'UNKNOWN'
- if "bm.log" in oval:
+ if "bm.log" in values['bmlog']:
values['state'] = 'DEBUG'
else:
values['state'] = 'BOOT'
values['ssh'] = 'NOSSH'
values['category'] = 'ERROR'
values['state'] = 'DOWN'
- val = eval.strip()
+ val = errval.strip()
+ values['kernel'] = val
- values['kernel'] = val
+ #values['kernel'] = val
if b_getbootcd_id:
# try to get BootCD for all nodes that are not 2.4 nor inaccessible
- (oval, eval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
- val = oval
- if "BootCD" in val:
- values['bootcd'] = val
- if "v2" in val and \
+ #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID')
+ oval = values['bootcd']
+ if "BootCD" in oval:
+ values['bootcd'] = oval
+ if "v2" in oval and \
( nodename is not "planetlab1.cs.unc.edu" and \
nodename is not "planetlab2.cs.unc.edu" ):
values['category'] = 'OLDBOOTCD'
# TODO: get bm.log for debug nodes.
# 'zcat /tmp/bm.log'
+
+ #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep')
+ oval = values['nm']
+ if "nm.py" in oval:
+ values['nm'] = "Y"
+ else:
+ values['nm'] = "N"
+
+ continue_slice_check = True
+ #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon')
+ oval = values['princeton_comon']
+ if "princeton_comon" in oval:
+ values['princeton_comon'] = "Y"
+ else:
+ values['princeton_comon'] = "N"
+ continue_slice_check = False
+
+ if continue_slice_check:
+ #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; ls -d /proc/virtual/$ID')
+ oval = values['princeton_comon_running']
+ if len(oval) > len('/proc/virtual/'):
+ values['princeton_comon_running'] = "Y"
+ else:
+ values['princeton_comon_running'] = "N"
+ continue_slice_check = False
+ else:
+ values['princeton_comon_running'] = "-"
+
+ if continue_slice_check:
+ #(oval, errval) = ssh.run_noexcept('ID=`grep princeton_comon /etc/passwd | awk -F : "{if ( \\\$3 > 500 ) { print \\\$3}}"`; vps ax | grep $ID | grep -v grep | wc -l')
+ oval = values['princeton_comon_procs']
+ values['princeton_comon_procs'] = oval
+ else:
+ values['princeton_comon_procs'] = "-"
+
if nodename in cohash:
values['comonstats'] = cohash[nodename]
count += 1
print "%d %s %s" % (count, nodename, externalState['nodes'][nodename]['values'])
- soltesz.dbDump(config.dbname, externalState)
+ if count % 20 == 0:
+ soltesz.dbDump(config.dbname, externalState)
# this will be called when an exception occurs within a thread
def handle_exception(request, result):
print "All results collected."
break
+ soltesz.dbDump(config.dbname, externalState)
+
def main():
cotop_url = COMON_COTOPURL
# history information for all nodes
+ #cohash = {}
cohash = cotop.coget(cotop_url)
l_nodes = syncplcdb.create_plcdb()
if config.filename:
elif config.nodegroup:
ng = api.GetNodeGroups({'name' : config.nodegroup})
l_nodes = api.GetNodes(ng[0]['node_ids'])
-
+ elif config.site:
+ site = api.GetSites(config.site)
+ l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
+
l_nodes = [node['hostname'] for node in l_nodes]
+ # perform this query after the above options, so that the filter above
+ # does not break.
+ if config.nodeselect:
+ l_nodes = node_select(config.nodeselect)
+
print "fetching %s hosts" % len(l_nodes)
checkAndRecordState(l_nodes, cohash)
from config import config
from optparse import OptionParser
parser = OptionParser()
- parser.set_defaults(filename=None, node=None, nodegroup=None, increment=False, dbname="findbadnodes", cachenodes=False)
+ parser.set_defaults(filename=None, node=None, site=None, nodeselect=False, nodegroup=None,
+ increment=False, dbname="findbadnodes", cachenodes=False)
parser.add_option("", "--node", dest="node", metavar="hostname",
help="Provide a single node to operate on")
parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE",
help="Provide the input file for the node list")
+ parser.add_option("", "--nodeselect", dest="nodeselect", metavar="query string",
+ help="Provide a selection string to return a node list.")
parser.add_option("", "--nodegroup", dest="nodegroup", metavar="FILE",
help="Provide the nodegroup for the list of nodes.")
+ parser.add_option("", "--site", dest="site", metavar="site name",
+ help="Specify a site to view node status")
parser.add_option("", "--cachenodes", action="store_true",
help="Cache node lookup from PLC")
#
#orig_sig_handler = signal.signal(signal.SIGCHLD, sig_handler)
-from config import config
-from optparse import OptionParser
-parser = OptionParser()
-parser.set_defaults(filename=None,
- increment=False,
- pcuid=None,
- dbname="findbadpcus",
- cachenodes=False,
- refresh=False,
- )
-parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE",
- help="Provide the input file for the node list")
-parser.add_option("", "--pcuid", dest="pcuid", metavar="id",
- help="Provide the id for a single pcu")
-parser.add_option("", "--cachenodes", action="store_true",
- help="Cache node lookup from PLC")
-parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
- help="Specify the name of the database to which the information is saved")
-parser.add_option("", "--refresh", action="store_true", dest="refresh",
- help="Refresh the cached values")
-parser.add_option("-i", "--increment", action="store_true", dest="increment",
- help="Increment round number to force refresh or retry")
-config = config(parser)
-config.parse_args()
# QUERY all nodes.
COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
#### RUN NMAP ###############################
if continue_probe:
nmap = soltesz.CMD()
- (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,16992 %s | grep Host:" % pcu_name(values))
+ (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % pcu_name(values))
# NOTE: an empty / error value for oval, will still work.
(values['portstatus'], continue_probe) = nmap_portstatus(oval)
else:
return 0
-import logging
-logger = logging.getLogger("monitor")
-logger.setLevel(logging.DEBUG)
-fh = logging.FileHandler("monitor.log", mode = 'a')
-fh.setLevel(logging.DEBUG)
-formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
-fh.setFormatter(formatter)
-logger.addHandler(fh)
-
if __name__ == '__main__':
+ import logging
+ logger = logging.getLogger("monitor")
+ logger.setLevel(logging.DEBUG)
+ fh = logging.FileHandler("monitor.log", mode = 'a')
+ fh.setLevel(logging.DEBUG)
+ formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
+ fh.setFormatter(formatter)
+ logger.addHandler(fh)
+ from config import config
+ from optparse import OptionParser
+ parser = OptionParser()
+ parser.set_defaults(filename=None,
+ increment=False,
+ pcuid=None,
+ dbname="findbadpcus",
+ cachenodes=False,
+ refresh=False,
+ )
+ parser.add_option("-f", "--nodelist", dest="filename", metavar="FILE",
+ help="Provide the input file for the node list")
+ parser.add_option("", "--pcuid", dest="pcuid", metavar="id",
+ help="Provide the id for a single pcu")
+ parser.add_option("", "--cachenodes", action="store_true",
+ help="Cache node lookup from PLC")
+ parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
+ help="Specify the name of the database to which the information is saved")
+ parser.add_option("", "--refresh", action="store_true", dest="refresh",
+ help="Refresh the cached values")
+ parser.add_option("-i", "--increment", action="store_true", dest="increment",
+ help="Increment round number to force refresh or retry")
+ config = config(parser)
+ config.parse_args()
try:
# NOTE: evidently, there is a bizarre interaction between iLO and ssh
# when LANG is set... Do not know why. Unsetting LANG, fixes the problem.
import sys
import os
# Generate (or reuse) a node's plnode.txt config file under bootcd-alpha/ and
# return a dict with 'url_list': download URLs for the node's boot media.
# force -- regenerate the config file even if it already exists on disk.
# media -- 'usb', 'iso', or None/other for both URL forms.
# NOTE(review): diff hunk; '-'/'+' lines are old/new versions.
-def getconf(hostname):
+def getconf(hostname, force=False, media=None):
api = plc.PLC(auth.auth, auth.plc)
n = api.GetNodes(hostname)
filename = "bootcd-alpha/" + hostname + ".txt"
# Only hit the PLC API to regenerate the file when missing or forced.
- if not os.path.exists(filename):
+ if not os.path.exists(filename) or force:
f = open("bootcd-alpha/" + hostname + ".txt", 'w')
f.write( api.AdmGenerateNodeConfFile(n[0]['node_id']) )
f.close()
pass
args = {}
# Choose which boot-media URLs to advertise based on 'media'; any value
# other than 'usb'/'iso' falls back to listing both.
- args['url_list'] = " http://pl-virtual-03.cs.princeton.edu/bootcds/%s-partition.usb\n" % hostname
- args['url_list'] += " http://pl-virtual-03.cs.princeton.edu/bootcds/%s.iso" % hostname
+ if not media:
+ args['url_list'] = " http://pl-virtual-03.cs.princeton.edu/bootcds/%s-partition.usb\n" % hostname
+ args['url_list'] += " http://pl-virtual-03.cs.princeton.edu/bootcds/%s.iso" % hostname
+ else:
+ if media == "usb":
+ args['url_list'] = " http://pl-virtual-03.cs.princeton.edu/bootcds/%s-partition.usb\n" % hostname
+ elif media == "iso":
+ args['url_list'] = " http://pl-virtual-03.cs.princeton.edu/bootcds/%s.iso" % hostname
+ else:
+ args['url_list'] = " http://pl-virtual-03.cs.princeton.edu/bootcds/%s-partition.usb\n" % hostname
+ args['url_list'] += " http://pl-virtual-03.cs.princeton.edu/bootcds/%s.iso" % hostname
+
#print "http://pl-virtual-03.cs.princeton.edu/bootcds/%s.usb\n" % hostname
return args
from config import config as cfg
from optparse import OptionParser
parser = OptionParser()
- parser.set_defaults(media='both')
+ parser.set_defaults(media='both', force=False)
parser.add_option("", "--media", dest="media", metavar="usb, iso, both",
help="""Which media to generate the message for.""")
+ parser.add_option("", "--force", dest="force", action="store_true",
+ help="""Force the recreation of the usb images.""")
config = cfg(parser)
config.parse_args()
ret = {'url_list' : ''}
for i in config.args:
- conf = getconf(i)
+ conf = getconf(i, config.force, config.media)
ret['url_list'] += conf['url_list']
ret['hostname'] = i
""" % ret
- elif config.media == "cd":
+ elif config.media == "iso":
print """
Hello,
key = key.strip()
# TODO: check for '==' at end of key.
- if key[-1] != '=':
+ if len(key) > 0 and key[-1] != '=':
print "Host with corrupt key! for %s %s" % (node['boot_state'], node['hostname'])
s_date = time.strftime("%Y/%m/%d_%H:%M:%S",time.gmtime(time.time()))
import policy
from config import config as cfg
+import config as config2
from optparse import OptionParser
from nodecommon import *
import bootman # debug nodes
import monitor # down nodes with pcu
import reboot # down nodes without pcu
-reboot.verbose = 0
+from emailTxt import mailtxt
+#reboot.verbose = 0
import sys
class Reboot(object):
m = PersistMessage(host, mailtxt.pcudown_one[0] % args,
mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')
- loginbase = plc.siteId(hostname)
+ loginbase = plc.siteId(host)
m.send([policy.TECHEMAIL % loginbase])
def pcu(self, host):
if self.fbnode['pcu'] == "PCU":
self.action = "reboot.reboot('%s')" % host
- pflags = PersistFlags(host, 1*60*60*24, db='pcu_persistflags')
- if not pflags.getRecentFlag('pcutried'): # or not pflags.getFlag('pcufailed'):
+ pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
+ if not pflags.getRecentFlag('pcutried'):
pflags.setRecentFlag('pcutried')
try:
ret = reboot.reboot(host)
pflags.setRecentFlag('pcumessagesent')
pflags.save()
+ # NOTE: this will result in just one message sent at a time.
+ return True
else:
- pass # just skip it?
-
+ return False
else:
self.action = "None"
return False
self.action = "None"
return False # this always fails, since the node will be down.
# Set a node's boot_state to 'rins' (reinstall) via the PLC API, recording the
# action in a Log entry.  Returns the Log on success, None if the API update
# failed.  fb is the findbad database, used only for console display.
# NOTE(review): 'i' in the print below is not defined in this scope --
# presumably a loop counter inherited from the code this was extracted from;
# verify before standalone use.
+def set_node_to_rins(host, fb):
+
+ node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
+ record = {'observation' : node[0],
+ 'model' : 'USER_REQUEST',
+ 'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host,
+ 'time' : time.time()}
+ l = Log(host, record)
+
+ ret = api.UpdateNode(host, {'boot_state' : 'rins'})
+ if ret:
+ # it's nice to see the current status rather than the previous status on the console
+ node = api.GetNodes(host)[0]
+ print l
+ print "%-2d" % (i-1), nodegroup_display(node, fb)
+ return l
+ else:
+ print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
+ return None
+
try:
rebootlog = soltesz.dbLoad("rebootlog")
node=None,
nodelist=None,
nodeselect=None,
- timewait=30,
+ timewait=0,
skip=0,
rins=False,
reboot=False,
# rerun findbad with the nodes in the given nodes.
import os
file = "findbad.txt"
- config.setFileFromList(file, hostnames)
+ config2.setFileFromList(file, hostnames)
os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
fb = soltesz.dbLoad("findbad")
for host in hostnames:
#if 'echo' in host or 'hptest-1' in host: continue
-
try:
try:
node = api.GetNodes(host)[0]
print "recently rebooted %s. skipping... " % host
continue
- if config.rins:
- # reset the boot_state to 'rins'
- node = api.GetNodes(host, ['boot_state', 'last_contact', 'last_updated', 'date_created'])
- record = {'observation' : node[0],
- 'model' : 'USER_REQUEST',
- 'action' : 'api.UpdateNode(%s, {"boot_state" : "rins"})' % host,
- 'time' : time.time()}
- l = Log(host, record)
-
- ret = api.UpdateNode(host, {'boot_state' : 'rins'})
- if ret:
- # it's nice to see the current status rather than the previous status on the console
- node = api.GetNodes(host)[0]
- print l
- print "%-2d" % (i-1), nodegroup_display(node, fb)
- rebootlog.add(l)
- else:
- print "FAILED TO UPDATE NODE BOOT STATE : %s" % host
-
-
if config.reboot:
fbnode = fb['nodes'][host]['values']
o = RebootDebug(fbnode)
elif observed_state == "boot" :
+ if config.rins:
+ l = set_node_to_rins(host, fb)
+ if l: rebootlog.add(l)
+
o = RebootBoot(fbnode)
elif observed_state == "down":
+ if config.rins:
+ l = set_node_to_rins(host, fb)
+ if l: rebootlog.add(l)
+
o = RebootDown(fbnode)
'time' : time.time()}
print "ALL METHODS OF RESTARTING %s FAILED" % host
+ args = {}
+ args['hostname'] = host
+ m = PersistMessage(host, "ALL FAIL for %(hostname)s" % args,
+ "nada", False, db='suspect_persistmessages')
+ m.reset()
+ m.send(['monitor-list@lists.planet-lab.org'])
l = Log(host, record)
print l
from emailTxt import *
import smtplib
from config import config
+import calendar
import logging
import os
import time
if ticket_id == None or ticket_id == "":
return {}
- cmd = "rt show -t ticket -f id,subject,status,queue %s" % (ticket_id)
+ cmd = "rt show -t ticket -f id,subject,status,queue,created %s" % (ticket_id)
(f_in, f_out, f_err) = os.popen3(cmd)
value = f_out.read()
l_values = value.split('\n')
if len(line) == 0: continue
vals = line.split(':')
key = vals[0]
- r_values[key] = "".join(vals[1:])
+ r_values[key] = ":".join(vals[1:])
r_values[key] = r_values[key].strip()
+
+ r_values['Created'] = calendar.timegm(time.strptime(r_values['Created']))
return r_values
def setAdminCCViaRT(ticket_id, to):
cmp = re.compile(filter[key])
res = cmp.search(log.__getattribute__(key))
if res != None:
- print "found match in log: %s %s ~=~ %s" % (log, key, filter[key])
+ #print "found match in log: %s %s ~=~ %s" % (log, key, filter[key])
if log.time > time.time() - timerange:
print "returning log b/c it occured within time."
return log
from monitor_policy import *
import rt
+import sys
import plc
import auth
api = plc.PLC(auth.auth, auth.plc)
+from clean_policy import *
+
# Look up 'hostname' in PLC and run the merge/diagnose/escalate pipeline for
# it.  Always returns True when it reaches the end.
# NOTE(review): diff hunk -- the '+' lines replace the old inline
# Merge -> RT -> Diagnose -> Action pipeline with the consolidated
# MonitorMergeDiagnoseSendEscellate class from clean_policy; the old calls
# are kept commented out for reference.
def reboot(hostname):
+ print "calling reboot!!! %s " % hostname
l_nodes = api.GetNodes(hostname)
if len(l_nodes) == 0:
if ad_dbTickets == None:
raise Exception("Could not find cached dbTickets")
+ print "starting new thing"
+ mon = MonitorMergeDiagnoseSendEscellate(hostname, True)
+ mon.run()
+
#print "merge"
- merge = Merge( [node['hostname'] for node in l_nodes])
- record_list = merge.run()
- #print "rt"
- rt = RT(record_list, ad_dbTickets, l_ticket_blacklist)
- record_list = rt.run()
- #print "diagnose"
- diag = Diagnose(record_list)
- diagnose_out = diag.run()
+ #merge = Merge( [node['hostname'] for node in l_nodes])
+ #record_list = merge.run()
+ ##print "rt"
+ #rt = RT(record_list, ad_dbTickets, l_ticket_blacklist)
+ #record_list = rt.run()
+ ##print "diagnose"
+ #diag = Diagnose(record_list)
+ #diagnose_out = diag.run()
#print diagnose_out
#print "action"
- action = Action(diagnose_out)
- action.run()
+ #action = Action(diagnose_out)
+ #action.run()
return True
# Entry point: run reboot() for every hostname given on the command line.
# NOTE(review): the bare 'print "hello?"' and 'print "calling main"' lines
# look like leftover debug output -- consider removing once verified.
def main():
- pass
+ for host in sys.argv[1:]:
+ reboot(host)
+print "hello?"
if __name__ == '__main__':
+ print "calling main"
main()
USER=4
ADMIN=8
+from unified_model import *
+
class Merge:
def __init__(self, l_merge):
self.merge_list = l_merge
# NOTE: these settings can be overridden by command line arguments,
# or the state of a record, i.e. if already in RT's Support Queue.
- nodes_up = self.getUpAtSite(loginbase, d_diag_site)
+ pf = PersistFlags(loginbase, 1, db='site_persitflags')
+ nodes_up = pf.nodes_up
if nodes_up < MINUP:
d_diag_site[loginbase]['config']['squeeze'] = True
max_slices = self.getMaxSlices(loginbase)
- num_nodes = self.getNumNodes(loginbase)
+ num_nodes = pf.nodes_total #self.getNumNodes(loginbase)
# NOTE: when max_slices == 0, this is either a new site (the old way)
# or an old disabled site from previous monitor (before site['enabled'])
if nodes_up < num_nodes and max_slices != 0:
if config.policysavedb:
print "Saving Databases... act_all"
soltesz.dbDump("act_all", self.act_all)
+ soltesz.dbDump("diagnose_out", self.diagnose_db)
sys.exit(1)
#print_stats("sites_observed", stats)
# TODO: remove 'diagnose_out',
# or at least the entries that were acted on.
soltesz.dbDump("act_all", self.act_all)
+ soltesz.dbDump("diagnose_out", self.diagnose_db)
def accumSites(self):
"""
if ticket_id == 0:
# error.
print "ticket_id == 0 for %s %s" % (loginbase, act_record['nodename'])
+ import os
os._exit(1)
pass
i_nodes_actedon += 1
if config.policysavedb:
- print "Saving Databases... act_all, diagnose_out"
- soltesz.dbDump("act_all", self.act_all)
+ #print "Saving Databases... act_all, diagnose_out"
+ #soltesz.dbDump("act_all", self.act_all)
# remove site record from diagnose_out, it's in act_all as done.
del self.diagnose_db[loginbase]
#soltesz.dbDump("diagnose_out", self.diagnose_db)
GREEN = esc + "[1;32m"
YELLOW = esc + "[1;33m"
BLUE = esc + "[1;34m"
+LIGHTBLUE = esc + "[1;36m"
NORMAL = esc + "[0;39m"
def red(str):
def green(str):
return GREEN + str + NORMAL
+def lightblue(str):
+ return LIGHTBLUE + str + NORMAL
+
def blue(str):
return BLUE + str + NORMAL
if values == None:
return fbnode['pcu']
else:
- return fbnode['pcu']
+ if 'pcu' not in fbnode:
+ return 'NOPCU'
+ else:
+ return fbnode['pcu']
if 'reboot' in values:
rb = values['reboot']
if rb == 0 or rb == "0":
return fbnode['pcu'] + "OK "
+ #return fbnode['pcu'] + "OK "
#return green(fbnode['pcu'])
elif "NetDown" == rb or "Not_Run" == rb:
return fbnode['pcu'] + "DOWN"
return fbnode['pcu'] + "BAD "
def color_boot_state(l):
- if l == "dbg": return yellow("dbg ")
- elif l == "dbg ": return yellow(l)
+ if l == "dbg": return yellow("debg")
+ elif l == "dbg ": return yellow("debg")
+ elif l == "diag": return lightblue(l)
+ elif l == "disable": return red("dsbl")
elif l == "down": return red(l)
elif l == "boot": return green(l)
elif l == "rins": return blue(l)
return l
def diff_time(timestamp):
+ import math
now = time.time()
if timestamp == None:
return "unknown"
# return the number of seconds as a difference from current time.
t_str = ""
if diff < 60: # sec in min.
- t = diff // 1
- t_str = "%s sec ago" % t
+ t = diff / 1
+ t_str = "%s sec ago" % int(math.ceil(t))
elif diff < 60*60: # sec in hour
- t = diff // (60)
- t_str = "%s min ago" % int(t)
+ t = diff / (60)
+ t_str = "%s min ago" % int(math.ceil(t))
elif diff < 60*60*24: # sec in day
- t = diff // (60*60)
- t_str = "%s hrs ago" % int(t)
+ t = diff / (60*60)
+ t_str = "%s hrs ago" % int(math.ceil(t))
elif diff < 60*60*24*7: # sec in week
- t = diff // (60*60*24)
- t_str = "%s days ago" % int(t)
- elif diff < 60*60*24*30: # approx sec in month
- t = diff // (60*60*24*7)
- t_str = "%s wks ago" % int(t)
+ t = diff / (60*60*24)
+ t_str = "%s days ago" % int(math.ceil(t))
+ elif diff <= 60*60*24*30: # approx sec in month
+ t = diff / (60*60*24*7)
+ t_str = "%s wks ago" % int(math.ceil(t))
elif diff > 60*60*24*30: # approx sec in month
- t = diff // (60*60*24*7*30)
+ t = diff / (60*60*24*30)
t_str = "%s mnths ago" % int(t)
return t_str
-def nodegroup_display(node, fb):
+def nodegroup_display(node, fb, conf=None):
if node['hostname'] in fb['nodes']:
node['current'] = get_current_state(fb['nodes'][node['hostname']]['values'])
else:
node['kernel'] = fb['nodes'][node['hostname']]['values']['kernel']
if '2.6' not in node['kernel']: node['kernel'] = ""
- node['boot_state'] = color_boot_state(node['boot_state'])
- node['current'] = color_boot_state(node['current'])
+ if conf and not conf.nocolor:
+ node['boot_state'] = color_boot_state(node['boot_state'])
+ node['current'] = color_boot_state(node['current'])
#node['boot_state'] = node['boot_state']
#node['current'] = node['current']
node['pcu'] = fb['nodes'][node['hostname']]['values']['pcu']
node['lastupdate'] = diff_time(node['last_contact'])
- return "%(hostname)-38s %(boot_state)5s %(current)5s %(pcu)6s %(key)45s %(kernel)32s %(lastupdate)12s " % node
+ return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)20.20s... %(kernel)43s %(lastupdate)12s " % node
from model import *
import soltesz
soltesz.dbDump("act_all", act_all)
del act_all
return True
+
+def datetime_fromstr(str):
+ if '-' in str:
+ try:
+ tup = time.strptime(str, "%Y-%m-%d")
+ except:
+ tup = time.strptime(str, "%Y-%m-%d-%H:%M")
+ elif '/' in str:
+ tup = time.strptime(str, "%m/%d/%Y")
+ else:
+ tup = time.strptime(str, "%m/%d/%Y")
+ ret = datetime.fromtimestamp(time.mktime(tup))
+ return ret
try:
n = api.GetNodes(node)[0]
- print n
+ #print n
net = api.GetNodeNetworks(n['nodenetwork_ids'])[0]
- print net
+ #print net
node_keys = ['boot_state', 'key', 'last_updated', 'last_contact']
for k in node_keys:
if 'last' in k:
- print "%15s == %s" % (k, diff_time(net[k]))
+ print "%15s == %s" % (k, diff_time(n[k]))
else:
- print "%15s == %s" % (k, net[k])
+ print "%15s == %s" % (k, n[k])
static_keys = ['method', 'ip', 'gateway', 'network', 'broadcast', 'netmask', 'dns1', 'dns2', 'mac', 'is_primary']
for k in static_keys:
# print k, "==" , net[k]
except:
print "Error with %s" % node
+        import traceback; traceback.print_exc()
pass
# commands:
from optparse import OptionParser
from sets import Set
+from nodequery import verify,query_to_dict,node_select
from nodecommon import *
import soltesz
parser.set_defaults(nodegroup="Alpha",
node=None,
nodelist=None,
- list=False,
+ list=True,
add=False,
+ nocolor=False,
notng=False,
delete=False,
+ nodeselect=None,
)
parser.add_option("", "--not", dest="notng", action="store_true",
help="All nodes NOT in nodegroup.")
parser.add_option("", "--nodegroup", dest="nodegroup", metavar="NodegroupName",
help="Specify a nodegroup to perform actions on")
+ parser.add_option("", "--nodeselect", dest="nodeselect", metavar="querystring",
+ help="Specify a query to perform on findbad db")
+ parser.add_option("", "--site", dest="site", metavar="site name",
+ help="Specify a site to view node status")
+ parser.add_option("", "--nocolor", dest="nocolor", action="store_true",
+			help="Disable colored output")
parser.add_option("", "--list", dest="list", action="store_true",
help="List all nodes in the given nodegroup")
parser.add_option("", "--add", dest="add", action="store_true",
#nodelist = api.GetNodes(hostlist)
group_str = "Given"
+ elif config.site:
+ site = api.GetSites(config.site)
+ if len (site) > 0:
+ site = site[0]
+ nodelist = api.GetNodes(site['node_ids'])
+ else:
+ nodelist = []
+
+ group_str = config.site
+
+ elif config.nodeselect:
+ hostlist = node_select(config.nodeselect)
+ nodelist = api.GetNodes(hostlist)
+
+ group_str = "selection"
+
else:
ng = api.GetNodeGroups({'name' : config.nodegroup})
nodelist = api.GetNodes(ng[0]['node_ids'])
i = 1
for node in nodelist:
print "%-2d" % i,
- print nodegroup_display(node, fb)
+ print nodegroup_display(node, fb, config)
i += 1
elif config.add and config.nodegroup:
from datetime import datetime, timedelta
import calendar
+import sys
import time
from model import *
from nodecommon import *
config = config(parser)
config.parse_args()
-def datetime_fromstr(str):
- if '-' in str:
- tup = time.strptime(str, "%Y-%m-%d")
- elif '/' in str:
- tup = time.strptime(str, "%m/%d/%Y")
- else:
- tup = time.strptime(str, "%m/%d/%Y")
- return datetime.fromtimestamp(calendar.timegm(tup))
-
-def diff_time(timestamp):
- now = time.time()
- if timestamp == None:
- return "unknown"
- diff = now - timestamp
- # return the number of seconds as a difference from current time.
- t_str = ""
- if diff < 60: # sec in min.
- t = diff
- t_str = "%s sec ago" % t
- elif diff < 60*60: # sec in hour
- t = diff // (60)
- t_str = "%s min ago" % int(t)
- elif diff < 60*60*24: # sec in day
- t = diff // (60*60)
- t_str = "%s hours ago" % int(t)
- elif diff < 60*60*24*7: # sec in week
- t = diff // (60*60*24)
- t_str = "%s days ago" % int(t)
- elif diff < 60*60*24*30: # approx sec in month
- t = diff // (60*60*24*7)
- t_str = "%s weeks ago" % int(t)
- elif diff > 60*60*24*30: # approx sec in month
- t = diff // (60*60*24*7*30)
- t_str = "%s months ago" % int(t)
- return t_str
def fb_print_nodeinfo(fbnode, verbose, date=None):
if verbose: print " state | ssh | pcu | bootcd | category | kernel"
else:
fbnode['bootcd'] = "unknown"
fbnode['state'] = color_boot_state(get_current_state(fbnode))
- fbnode['kernel'] = fbnode['kernel'].split()[2]
+ if len(fbnode['kernel'].split()) >= 3:
+ fbnode['kernel'] = fbnode['kernel'].split()[2]
print " %(state)5s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
def pcu_print_info(pcuinfo, hostname):
else:
begin = "2007-11-06"
+if config.node is None and len(config.args) > 0:
+ config.node = config.args[0]
+elif config.node is None:
+ print "Add a hostname to arguments"
+ print "exit."
+ sys.exit(1)
+
d = datetime_fromstr(begin)
tdelta = timedelta(1)
verbose = 1
+def get_filefromglob(d, str):
+ import os
+ import glob
+	# TODO: This is awful.
+ path = "archive-pdb"
+ archive = soltesz.SPickle(path)
+ glob_str = "%s*.%s.pkl" % (d.strftime("%Y-%m-%d"), str)
+ os.chdir(path)
+ #print glob_str
+ file = glob.glob(glob_str)[0]
+ #print "loading %s" % file
+ os.chdir("..")
+ return file[:-4]
+ #fb = archive.load(file[:-4])
+
+
while True:
- file = "%s.production.findbad" % d.strftime("%Y-%m-%d")
+ file = get_filefromglob(d, "production.findbad")
+ #file = "%s.production.findbad" % d.strftime("%Y-%m-%d")
try:
fb = archive.load(file)
del fb
verbose = 0
+ except KeyboardInterrupt:
+ sys.exit(1)
except:
+ #import traceback; print traceback.print_exc()
print d.strftime("%Y-%m-%d"), "No record"
d = d + tdelta
fbnode['state'] = color_boot_state(get_current_state(fbnode))
else:
fbnode['state'] = "none"
- fbnode['kernel'] = fbnode['kernel'].split()[2]
+ if len(fbnode['kernel'].split()) > 2:
+ fbnode['kernel'] = fbnode['kernel'].split()[2]
print "\t %(state)5s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
def act_print_nodeinfo(actnode, header):
import auth
api = plc.PLC(auth.auth, auth.plc)
+import sys
import soltesz
-fb = soltesz.dbLoad("findbad")
-fbpcu = soltesz.dbLoad("findbadpcus")
from nodecommon import *
from policy import Diagnose
+import glob
+import os
+from reboot import pcu_name
import time
import re
-
+#fb = {}
+fb = soltesz.dbLoad("findbad")
+fbpcu = {}
def daysdown_print_nodeinfo(fbnode, hostname):
fbnode['hostname'] = hostname
print "%(intdaysdown)5s %(hostname)-44s | %(state)10.10s | %(daysdown)s" % fbnode
-def fb_print_nodeinfo(fbnode, hostname):
+def fb_print_nodeinfo(fbnode, hostname, fields=None):
fbnode['hostname'] = hostname
fbnode['checked'] = diff_time(fbnode['checked'])
if fbnode['bootcd']:
fbnode['bootcd'] = fbnode['bootcd'].split()[-1]
else:
fbnode['bootcd'] = "unknown"
- if 'ERROR' in fbnode['category']:
- fbnode['kernel'] = ""
- else:
- fbnode['kernel'] = fbnode['kernel'].split()[2]
fbnode['pcu'] = color_pcu_state(fbnode)
- print "%(hostname)-39s | %(checked)11.11s | %(state)10.10s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
+
+ if not fields:
+ if 'ERROR' in fbnode['category']:
+ fbnode['kernel'] = ""
+ else:
+ fbnode['kernel'] = fbnode['kernel'].split()[2]
+ fbnode['boot_state'] = fbnode['plcnode']['boot_state']
+
+ print "%(hostname)-39s | %(checked)11.11s | %(boot_state)5.5s| %(state)8.8s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
+ else:
+ format = ""
+ for f in fields:
+ format += "%%(%s)s " % f
+ print format % fbnode
def verify(constraints, data):
"""
value_re = re.compile(con[key])
con_and_true = con_and_true & (value_re.search(data[key]) is not None)
elif key not in data:
- print "missing key %s" % key
- con_and_true = False
+ print "missing key %s" % key,
+ pass
+ #print "missing key %s" % key
+ #con_and_true = False
con_or_true = con_or_true | con_and_true
return True
return False
-def pcu_select(str_query):
+def pcu_select(str_query, nodelist=None):
pcunames = []
- if str_query is None: return pcunames
+ nodenames = []
+ if str_query is None: return (nodenames, pcunames)
#print str_query
dict_query = query_to_dict(str_query)
#print dict_query
for node in fb['nodes'].keys():
+ if nodelist is not None:
+ if node not in nodelist: continue
fb_nodeinfo = fb['nodes'][node]['values']
if _pcu_in(fb_nodeinfo):
pcuinfo = fbpcu['nodes']['id_%s' % fb_nodeinfo['plcnode']['pcu_ids'][0]]['values']
if verify(dict_query, pcuinfo):
- pcunames.append(node)
-
- return pcunames
+ nodenames.append(node)
+ str = "cmdhttps/locfg.pl -s %s -f iloxml/License.xml -u %s -p '%s' | grep MESSAGE" % \
+ (pcu_name(pcuinfo), pcuinfo['username'], pcuinfo['password'])
+ pcunames.append(str)
+ return (nodenames, pcunames)
-def node_select(str_query):
+def node_select(str_query, nodelist=None):
hostnames = []
if str_query is None: return hostnames
#print str_query
dict_query = query_to_dict(str_query)
#print dict_query
+ global fb
for node in fb['nodes'].keys():
+ if nodelist is not None:
+ if node not in nodelist: continue
fb_nodeinfo = fb['nodes'][node]['values']
+ if fb_nodeinfo == []:
+ #print node, "has lost values"
+ continue
+ #sys.exit(1)
+ fb_nodeinfo['pcu'] = color_pcu_state(fb_nodeinfo)
+ fb_nodeinfo['hostname'] = node
+ if 'plcnode' in fb_nodeinfo:
+ fb_nodeinfo.update(fb_nodeinfo['plcnode'])
+
if verify(dict_query, fb_nodeinfo):
#print node #fb_nodeinfo
hostnames.append(node)
def main():
+ global fb
+ global fbpcu
+
from config import config
from optparse import OptionParser
parser = OptionParser()
- parser.set_defaults(node=None, select=None, pcuselect=None, nodelist=None, daysdown=None)
+ parser.set_defaults(node=None, fromtime=None, select=None, list=None, pcuselect=None, nodelist=None, daysdown=None, fields=None)
parser.add_option("", "--daysdown", dest="daysdown", action="store_true",
help="List the node state and days down...")
parser.add_option("", "--select", dest="select", metavar="key=value",
help="List all nodes with the given key=value pattern")
+ parser.add_option("", "--fields", dest="fields", metavar="key,list,...",
+ help="a list of keys to display for each entry.")
+ parser.add_option("", "--list", dest="list", action="store_true",
+ help="Write only the hostnames as output.")
parser.add_option("", "--pcuselect", dest="pcuselect", metavar="key=value",
help="List all nodes with the given key=value pattern")
parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt",
help="A list of nodes to bring out of debug mode.")
+ parser.add_option("", "--fromtime", dest="fromtime", metavar="YYYY-MM-DD",
+ help="Specify a starting date from which to begin the query.")
config = config(parser)
config.parse_args()
+
+ if config.fromtime:
+ path = "archive-pdb"
+ archive = soltesz.SPickle(path)
+ d = datetime_fromstr(config.fromtime)
+ glob_str = "%s*.production.findbad.pkl" % d.strftime("%Y-%m-%d")
+ os.chdir(path)
+ #print glob_str
+ file = glob.glob(glob_str)[0]
+ #print "loading %s" % file
+ os.chdir("..")
+ fb = archive.load(file[:-4])
+ else:
+ fb = soltesz.dbLoad("findbad")
+
+ fbpcu = soltesz.dbLoad("findbadpcus")
if config.nodelist:
nodelist = config.getListFromFile(config.nodelist)
- elif config.select is not None:
- nodelist = node_select(config.select)
- elif config.pcuselect is not None:
- nodelist = pcu_select(config.pcuselect)
else:
nodelist = fb['nodes'].keys()
+ pculist = None
+ if config.select is not None and config.pcuselect is not None:
+ nodelist = node_select(config.select, nodelist)
+ nodelist, pculist = pcu_select(config.pcuselect, nodelist)
+ elif config.select is not None:
+ nodelist = node_select(config.select, nodelist)
+ elif config.pcuselect is not None:
+ nodelist, pculist = pcu_select(config.pcuselect, nodelist)
+
+
+ if pculist:
+ for pcu in pculist:
+ print pcu
+
for node in nodelist:
config.node = node
fb_nodeinfo = fb['nodes'][node]['values']
- if config.daysdown:
- daysdown_print_nodeinfo(fb_nodeinfo, node)
+ if config.list:
+ print node
else:
- if config.select:
- fb_print_nodeinfo(fb_nodeinfo, node)
- elif not config.select and 'state' in fb_nodeinfo:
- fb_print_nodeinfo(fb_nodeinfo, node)
+ if config.daysdown:
+ daysdown_print_nodeinfo(fb_nodeinfo, node)
else:
- pass
+ if config.select:
+ if config.fields:
+ fields = config.fields.split(",")
+ else:
+ fields = None
+
+ fb_print_nodeinfo(fb_nodeinfo, node, fields)
+ elif not config.select and 'state' in fb_nodeinfo:
+ fb_print_nodeinfo(fb_nodeinfo, node)
+ else:
+ pass
if __name__ == "__main__":
main()
# Attempt to reboot a node in debug state.
-
import plc
import auth
api = plc.PLC(auth.auth, auth.plc)
from Rpyc import SocketConnection, Async
from Rpyc.Utils import *
+def get_fbnode(node):
+ fb = soltesz.dbLoad("findbad")
+ fbnode = fb['nodes'][node]['values']
+ return fbnode
class NodeConnection:
def __init__(self, connection, node, config):
try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
except Exception, x:
bm_continue = False
- if not config.quiet: print "exception"
- if not config.quiet: print x
+ print "exception"
+ print x
print " Possibly, unable to find valid configuration file"
if bm_continue:
else:
return False
+ def set_nodestate(self, state='boot'):
+ return api.UpdateNode(self.node, {'boot_state' : state})
+
def restart_node(self, state='boot'):
api.UpdateNode(self.node, {'boot_state' : state})
args['user'] = 'root'
args['hostname'] = self.node
args['monitordir'] = "/home/soltesz/monitor"
+ ssh_port = 22
if self.nosetup:
print "Skipping setup"
t1 = time.time()
# KILL any already running servers.
- cmd = """ssh %(user)s@%(hostname)s """ + \
- """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
- cmd = cmd % args
- if self.verbose: print cmd
- # TODO: Add timeout
- print localos.system(cmd,timeout)
-
- # START a new rpyc server.
- cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
- """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
- cmd = cmd % args
- if self.verbose: print cmd
- print localos.system(cmd,timeout)
+ ssh = soltesz.SSH(args['user'], args['hostname'], ssh_port)
+ (ov,ev) = ssh.run_noexcept2("""<<\EOF
+ rm -f out.log
+ echo "kill server" >> out.log
+ ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
+ echo "export" >> out.log
+ export PYTHONPATH=$HOME ;
+ echo "start server" >> out.log
+ python Rpyc/Servers/forking_server.py &> server.log &
+ echo "done" >> out.log
+EOF""")
+ #cmd = """ssh %(user)s@%(hostname)s """ + \
+ # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
+ #cmd = cmd % args
+ #if self.verbose: print cmd
+ ## TODO: Add timeout
+ #print localos.system(cmd,timeout)
+
+ ## START a new rpyc server.
+ #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
+ # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
+ #cmd = cmd % args
+ #if self.verbose: print cmd
+ #print localos.system(cmd,timeout)
+ print ssh.ret
# TODO: Add timeout
# This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
# TODO: the read() here may block indefinitely. Need a better
# approach therefore, that includes a timeout.
- ret = self.command.stdout.read(5)
+ #ret = self.command.stdout.read(5)
+ ret = soltesz.read_t(self.command.stdout, 5)
t2 = time.time()
if 'READY' in ret:
def reboot(hostname, config=None, forced_action=None):
+ # NOTE: Nothing works if the bootcd is REALLY old.
+ # So, this is the first step.
+ fbnode = get_fbnode(hostname)
+ if fbnode['category'] == "OLDBOOTCD":
+ print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
+ args = {}
+ args['hostname_list'] = " %s" % hostname
+
+ m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
+ mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
+
+ loginbase = plc.siteId(hostname)
+ m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+
+ print "\tDisabling %s due to out-of-date BOOTCD" % hostname
+ api.UpdateNode(hostname, {'boot_state' : 'disable'})
+ return True
+
node = hostname
print "Creating session for %s" % node
# update known_hosts file (in case the node has rebooted since last run)
steps = [
('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
+ ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
+
('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
+ ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
('floppytimeout','floppy0: floppy timeout called'),
('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
loginbase = plc.siteId(hostname)
m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ conn.set_nodestate('diag')
return False
print "...Downloading bm.log from %s" % node
log = conn.get_bootmanager_log()
child = fdpexpect.fdspawn(log)
+ try:
+ if config.collect: return True
+ except:
+ pass
+
time.sleep(1)
if config and not config.quiet: print "...Scanning bm.log for errors"
('noinstall' , 'notinstalled'),
('bziperror' , 'bzip2: Data integrity error when decompressing.'),
('noblockdev' , "No block devices detected."),
+ ('downloadfail' , 'Unable to download main tarball /boot-alpha/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
- ('hardwarefail' , 'Hardware requirements not met'),
+ ('hardwarerequirefail' , 'Hardware requirements not met'),
+ ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
+ ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
('modulefail' , 'Unable to get list of system modules'),
('writeerror' , 'write error: No space left on device'),
# By using the sequence identifier, we guarantee that there will be no
# frequent loops. I'm guessing there is a better way to track loops,
# though.
- if not config.force and ( pflags.getFlag(s) or pflags.isRecent() ):
- pflags.resetFlag(s)
- pflags.setRecent()
+ if not config.force and pflags.getRecentFlag(s):
+ pflags.setRecentFlag(s)
pflags.save()
print "... flag is set or it has already run recently. Skipping %s" % node
return True
"bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
"bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
]:
sequences.update({n : "restart_bootmanager_rins"})
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
]:
sequences.update({n : "restart_node_rins"})
"bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
"bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
]:
sequences.update({n: "restart_node_boot"})
# update_node_config_email
for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
"bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
- "bminit-cfg-exception-nodehostname-update-debug-done",
]:
sequences.update({n : "update_node_config_email"})
+ for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
+ sequences.update({n : "nodenetwork_email"})
+
# update_bootcd_email
- for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarefail-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarefail-update-debug-done",
- "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarefail-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarefail-update-debug-done",
- "bminit-cfg-auth-getplc-hardware-exception-hardwarefail-update-debug-done",
+ for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
]:
sequences.update({n : "update_bootcd_email"})
+ for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
+ ]:
+ sequences.update({n: "suspect_error_email"})
+
# update_hardware_email
- sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarefail-update-debug-done" : "update_hardware_email"})
+ sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
+ sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
# broken_hardware_email
- sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarefail-update-debug-done" : "broken_hardware_email"})
+ sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
+
+ flag_set = True
if s not in sequences:
conn.restart_bootmanager('boot')
+ # NOTE: Do not set the pflags value for this sequence if it's unknown.
+ # This way, we can check it again after we've fixed it.
+ flag_set = False
+
else:
if sequences[s] == "restart_bootmanager_boot":
else:
# there was some failure to synchronize the keys.
print "...Unable to repair node keys on %s" % node
+
+ elif sequences[s] == "suspect_error_email":
+ args = {}
+ args['hostname'] = hostname
+ args['sequence'] = s
+ args['bmlog'] = conn.get_bootmanager_log().read()
+			m = PersistMessage(hostname, "Suspicious error from BootManager on %s" % args['hostname'],
+ mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
+ m.reset()
+ m.send(['monitor-list@lists.planet-lab.org'])
+
+ conn.restart_bootmanager('boot')
+
elif sequences[s] == "update_node_config_email":
print "...Sending message to UPDATE NODE CONFIG"
args = {}
loginbase = plc.siteId(hostname)
m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
conn.dump_plconf_file()
+ conn.set_nodestate('diag')
+
+ elif sequences[s] == "nodenetwork_email":
+ print "...Sending message to LOOK AT NODE NETWORK"
+ args = {}
+ args['hostname'] = hostname
+ args['bmlog'] = conn.get_bootmanager_log().read()
+ m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args,
+ True, db='nodenet_persistmessages')
+ loginbase = plc.siteId(hostname)
+ m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ conn.dump_plconf_file()
+ conn.set_nodestate('diag')
elif sequences[s] == "update_bootcd_email":
print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
loginbase = plc.siteId(hostname)
m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ #print "\tDisabling %s due to out-of-date BOOTCD" % hostname
+ #conn.set_nodestate('disable')
+
elif sequences[s] == "broken_hardware_email":
# MAKE An ACTION record that this host has failed hardware. May
# require either an exception "/minhw" or other manual intervention.
loginbase = plc.siteId(hostname)
m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ conn.set_nodestate('disable')
elif sequences[s] == "update_hardware_email":
print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
loginbase = plc.siteId(hostname)
m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ conn.set_nodestate('disable')
- pflags.setFlag(s)
- pflags.save()
+ if flag_set:
+ pflags.setRecentFlag(s)
+ pflags.save()
return True
from config import config
from optparse import OptionParser
parser = OptionParser()
- parser.set_defaults(node=None, nodelist=None, child=False, nosetup=False, verbose=False, force=None, quiet=False)
+ parser.set_defaults(node=None, nodelist=None, child=False, collect=False, nosetup=False, verbose=False, force=None, quiet=False)
parser.add_option("", "--child", dest="child", action="store_true",
help="This is the child mode of this process.")
parser.add_option("", "--force", dest="force", metavar="boot_state",
help="Extra quiet output messages.")
parser.add_option("", "--verbose", dest="verbose", action="store_true",
help="Extra debug output messages.")
+ parser.add_option("", "--collect", dest="collect", action="store_true",
+ help="No action, just collect dmesg, and bm.log")
parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
help="Do not perform the orginary setup phase.")
parser.add_option("", "--node", dest="node", metavar="nodename.edu",
ret = s.recv(count, socket.MSG_DONTWAIT)
except socket.error, e:
if e[0] == errno.EAGAIN:
- return Exception(e[1])
+ raise Exception(e[1])
else:
# TODO: not other exceptions.
raise Exception(e)
s.close()
if e[0] == errno.ECONNREFUSED:
# cannot connect to remote host
- return Exception(e[1])
+ raise Exception(e[1])
else:
# TODO: what other conditions are there?
raise Exception(e)
s.send(self.format_msg("", 'O'))
ret = self.recv_noblock(s, 8)
print "Current status is '%s'" % ret
+
+ if ret == '':
+ raise Exception("Status returned 'another session already open' %s : %s" % (node_port, ret))
+
if node_port < len(ret):
status = ret[node_port]
if not dryrun:
- print "Pulsing %s" % node_port
if power_on:
+ print "Pulsing %s" % node_port
s.send(self.format_msg("%s" % node_port, 'P'))
else:
- # NOTE: turn power on before pulsing the port.
- print "power was off, so turning on then pulsing..."
+ # NOTE: turn power on ; do not pulse the port.
+ print "Power was off, so turning on ..."
s.send(self.format_msg("%s" % node_port, 'E'))
- s.send(self.format_msg("%s" % node_port, 'P'))
+ #s.send(self.format_msg("%s" % node_port, 'P'))
print "Receiving response."
ret = self.recv_noblock(s, 8)
# % (hostname,hostname)
# Queue == 10 is the spam Queue in RT.
+# SELECT Tk.* FROM Tickets AS Tk, Attachments AS At JOIN Transactions AS Tr ON Tk.id=Tr.ObjectId WHERE Tk.Queue != 10 AND Tk.id > 10000 AND Tr.id=At.TransactionID AND Tk.Status = 'open' ;
+#
+
sql = """SELECT distinct Tk.id, Tk.Status, Tk.Subject, At.Content
FROM Tickets AS Tk, Attachments AS At
JOIN Transactions AS Tr ON Tk.id=Tr.ObjectId
#WHERE Tk.Queue != 10 AND Tk.id > 10000 AND
#Tr.id=At.TransactionID AND ( Tk.Status = 'open' OR
#Tk.Status = 'new') """
- sqlall = """SELECT distinct Tk.id, Tk.Status, Tk.Subject, At.Content, Us.EmailAddress FROM Tickets AS Tk, Attachments AS At, Users as Us JOIN Transactions AS Tr ON Tk.id=Tr.ObjectId WHERE (Tk.Queue=3 OR Tk.Queue=22) AND Tk.id > 10000 AND Tr.id=At.TransactionID AND ( Tk.Status = 'open' OR Tk.Status = 'new') AND Us.id=Tk.LastUpdatedBy """
+ sqlall = """SELECT distinct Tk.id, Tk.Status, Tk.Subject, At.Content, Us.EmailAddress, Tk.LastUpdated FROM Tickets AS Tk, Attachments AS At, Users as Us JOIN Transactions AS Tr ON Tk.id=Tr.ObjectId WHERE (Tk.Queue=3 OR Tk.Queue=22) AND Tk.id > 10000 AND Tr.id=At.TransactionID AND ( Tk.Status = 'open' OR Tk.Status = 'new') AND Us.id=Tk.LastUpdatedBy """
raw = fetch_from_db(db, sql)
"status":x[1],
"subj":str(x[2]),
"content":str(x[3]),
- "email":str(x[4]) },
+ "email":str(x[4]),
+ "lastupdated":str(x[5]),
+ },
raw)
db.close()
def kill(self, signal = signal.SIGTERM):
os.kill(self.pid, signal)
+def read_t(stream, count, timeout=COMMAND_TIMEOUT*2):
+ lin, lout, lerr = select([stream], [], [], timeout)
+ if len(lin) == 0:
+ raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
+
+ return stream.read(count)
+
class CMD:
def __init__(self):
pass
def run_noexcept(self, cmd, timeout=COMMAND_TIMEOUT*2):
+ #print "CMD.run_noexcept(%s)" % cmd
try:
return CMD.run(self,cmd,timeout)
except ExceptionTimeout:
import traceback; print traceback.print_exc()
return ("", "SCRIPTTIMEOUT")
-
-# s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
-# #(f_in, f_out, f_err) = os.popen3(cmd)
-# (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
-# lout, lin, lerr = select([f_out,f_err], [], [], timeout)
-# if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
-# # Reached a timeout! Nuke process so it does not hang.
-# s.kill(signal.SIGKILL)
-# return ("", "SCRIPTTIMEOUT")
-# o_value = f_out.read()
-# e_value = ""
-# if o_value == "": # An error has occured
-# e_value = f_err.read()
-#
-# o_value = o_value.strip()
-# e_value = e_value.strip()
-#
-# f_out.close()
-# f_in.close()
-# f_err.close()
-# try:
-# s.kill()
-# except OSError:
-# # no such process, due to it already exiting...
-# pass
-#
-# return (o_value, e_value)
def system(self, cmd, timeout=COMMAND_TIMEOUT*2):
(o,e) = self.run(cmd, timeout)
self.output = o
def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
+ #print "CMD.run(%s)" % cmd
s = Sopen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
self.s = s
(f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
- lout, lin, lerr = select([f_out,f_err], [], [], timeout)
+ #print "calling select(%s)" % timeout
+ lout, lin, lerr = select([f_out], [], [f_err], timeout)
+ #print "TIMEOUT!!!!!!!!!!!!!!!!!!!"
+ if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
+ # Reached a timeout! Nuke process so it does not hang.
+ #print "KILLING"
+ s.kill(signal.SIGKILL)
+ raise ExceptionTimeout("TIMEOUT Running: %s" % cmd)
+ else:
+ #print "RETURNING"
+ #print len(lin), len(lout), len(lerr)
+ pass
+
+ o_value = ""
+ e_value = ""
+
+ #print "reading from f_out"
+ if len(lout) > 0: o_value = f_out.read()
+ #print "reading from f_err"
+ if len(lerr) > 0: e_value = f_err.read()
+
+ #print "striping output"
+ o_value = o_value.strip()
+ e_value = e_value.strip()
+
+ #print "OUTPUT", o_value, e_value
+
+ #print "closing files"
+ f_out.close()
+ f_in.close()
+ f_err.close()
+ try:
+ #print "s.kill()"
+ s.kill()
+ #print "after s.kill()"
+ except OSError:
+ # no such process, due to it already exiting...
+ pass
+
+ #print o_value, e_value
+ return (o_value, e_value)
+
+ def runargs(self, args, timeout=COMMAND_TIMEOUT*2):
+
+ #print "CMD.run(%s)" % " ".join(args)
+ s = Sopen(args, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
+ self.s = s
+ (f_in, f_out, f_err) = (s.stdin, s.stdout, s.stderr)
+ lout, lin, lerr = select([f_out], [], [f_err], timeout)
if len(lin) == 0 and len(lout) == 0 and len(lerr) == 0:
# Reached a timeout! Nuke process so it does not hang.
s.kill(signal.SIGKILL)
class SSH(CMD):
- def __init__(self, user, host, options = ssh_options):
+ def __init__(self, user, host, port=22, options = ssh_options):
self.options = options
self.user = user
self.host = host
+ self.port = port
return
def __options_to_str(self):
return options
def run(self, cmd, timeout=COMMAND_TIMEOUT*2):
- cmd = "ssh %s %s@%s '%s'" % (self.__options_to_str(),
+ cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(),
self.user, self.host, cmd)
+ #print "SSH.run(%s)" % cmd
return CMD.run(self, cmd, timeout)
def get_file(self, rmt_filename, local_filename=None):
if local_filename == None:
local_filename = "./"
- cmd = "scp -B %s %s@%s:%s %s" % (self.__options_to_str(),
+ cmd = "scp -P %s -B %s %s@%s:%s %s" % (self.port, self.__options_to_str(),
self.user, self.host,
rmt_filename, local_filename)
# output :
return CMD.run_noexcept(self, cmd)
def run_noexcept(self, cmd):
- cmd = "ssh %s %s@%s '%s'" % (self.__options_to_str(),
+ cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(),
self.user, self.host, cmd)
+ #print "SSH.run_noexcept(%s)" % cmd
return CMD.run_noexcept(self, cmd)
+ def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2):
+ cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(),
+ self.user, self.host, cmd)
+ #print "SSH.run_noexcept2(%s)" % cmd
+ r = CMD.run_noexcept(self, cmd, timeout)
+
+ # XXX: this may be resulting in deadlocks... not sure.
+ #if self.s.returncode is None:
+ # #self.s.kill()
+ # self.s.kill(signal.SIGKILL)
+ # self.s.wait()
+ # self.ret = self.s.returncode
+ self.ret = -1
+
+ return r
+
+ def system2(self, cmd, timeout=COMMAND_TIMEOUT*2):
+ cmd = "ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(),
+ self.user, self.host, cmd)
+ #print "SSH.system2(%s)" % cmd
+ return CMD.system(self, cmd, timeout)
+
def runE(self, cmd):
- cmd = "ssh %s %s@%s '%s'" % (self.__options_to_str(),
+ cmd = "ssh -p %s %s %s@%s '%s'" % (self.port, self.__options_to_str(),
self.user, self.host, cmd)
if ( DEBUG == 1 ):
print cmd,
import auth
api = plc.PLC(auth.auth, auth.plc)
-import config
import mailer
import time
+from nodecommon import *
+
+from const import *
def gethostlist(hostlist_file):
+ import config
return config.getListFromFile(hostlist_file)
#nodes = api.GetNodes({'peer_id' : None}, ['hostname'])
class Recent(object):
def __init__(self, withintime):
self.withintime = withintime
- self.time = time.time()
- self.action_taken = False
+
+ try:
+ self.time = self.__getattribute__('time')
+ except:
+ self.time = time.time()- 7*24*60*60
+
+ #self.time = time.time()
+ #self.action_taken = False
def isRecent(self):
if self.time + self.withintime < time.time():
obj = super(PersistFlags, typ).__new__(typ, *args, **kwargs)
for key in kwargs.keys():
obj.__setattr__(key, kwargs[key])
+ obj.time = time.time()
+ obj.action_taken = False
obj.db = db
return obj
self.__setattr__(name, False)
return False
+ def resetRecentFlag(self, name):
+ self.resetFlag(name)
+ self.unsetRecent()
+
def setRecentFlag(self, name):
self.setFlag(name)
self.setRecent()
self.__setattr__(name, False)
return False
+ def checkattr(self, name):
+ try:
+ x = self.__getattribute__(name)
+ return True
+ except:
+ return False
+
+
class PersistMessage(Message):
def __new__(typ, id, subject, message, via_rt, **kwargs):
if 'db' in kwargs:
obj.actiontracker = Recent(3*60*60*24)
obj.ticket_id = None
+ if 'ticket_id' in kwargs and kwargs['ticket_id'] is not None:
+ obj.ticket_id = kwargs['ticket_id']
+
obj.db = db
return obj
soltesz.dbDump(self.db, pm)
else:
# NOTE: only send a new message every week, regardless.
- print "Not sending to host b/c not within window of 6 days"
- pass
+ print "Not sending to host b/c not within window of %s days" % (self.actiontracker.withintime // 60*60*24)
class MonitorMessage(object):
def __new__(typ, id, *args, **kwargs):
def __init__(self, id, index, **kwargs):
self.id = id
- #SitePenalty.__init__(self, self.index)
def save(self):
pm = soltesz.dbLoad(self.db)
soltesz.dbDump(self.db, pm)
-
class Target:
"""
Each host has a target set of attributes. Some may be set manually,
return con_or_true
+class Record(object):
+
+ def __init__(self, hostname, data):
+ self.hostname = hostname
+ self.data = data
+ self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+ self.loginbase = self.plcdb_hn2lb[self.hostname]
+ return
+
+
+ def stageIswaitforever(self):
+ if 'waitforever' in self.data['stage']:
+ return True
+ else:
+ return False
+
+ def severity(self):
+ category = self.data['category']
+ prev_category = self.data['prev_category']
+ val = cmpCategoryVal(category, prev_category)
+ return val
+
+ def improved(self):
+ return self.severity() > 0
+
+ def end_record(self):
+ return node_end_record(self.hostname)
+
+ def reset_stage(self):
+ self.data['stage'] = 'findbad'
+ return True
+
+ def getCategory(self):
+ return self.data['category'].lower()
+
+ def getState(self):
+ return self.data['state'].lower()
+
+ def getDaysDown(cls, diag_record):
+ daysdown = -1
+ if diag_record['comonstats']['uptime'] != "null":
+ #print "uptime %s" % (int(float(diag_record['comonstats']['uptime'])) // (60*60*24))
+ daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
+ elif diag_record['comonstats']['sshstatus'] != "null":
+ daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
+ elif diag_record['comonstats']['lastcotop'] != "null":
+ daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
+ else:
+ now = time.time()
+ last_contact = diag_record['plcnode']['last_contact']
+ if last_contact == None:
+ # the node has never been up, so give it a break
+ daysdown = -1
+ else:
+ diff = now - last_contact
+ daysdown = diff // (60*60*24)
+ return daysdown
+ getDaysDown = classmethod(getDaysDown)
+
+ def getStrDaysDown(cls, diag_record):
+ daysdown = cls.getDaysDown(diag_record)
+ if daysdown > 0:
+ return "%d days down"%daysdown
+ elif daysdown == -1:
+ return "Unknown number of days"
+ else:
+ return "%d days up"% -daysdown
+ getStrDaysDown = classmethod(getStrDaysDown)
+
+ def takeAction(self):
+ pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames')
+ if 'improvement' in self.data['stage'] or self.improved():
+ print "decreasing penalty for %s"%self.hostname
+ pp.decrease()
+ else:
+ print "increasing penalty for %s"%self.hostname
+ pp.increase()
+ pp.apply(self.hostname)
+ pp.save()
+
+ def _format_diaginfo(self):
+ info = self.data['info']
+ if self.data['stage'] == 'monitor-end-record':
+ hlist = " %s went from '%s' to '%s'\n" % (info[0], info[1], info[2])
+ else:
+ hlist = " %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
+ return hlist
+
+ def getMessage(self, ticket_id=None):
+ self.data['args']['hostname'] = self.hostname
+ self.data['args']['loginbase'] = self.loginbase
+ self.data['args']['hostname_list'] = self._format_diaginfo()
+ message = PersistMessage(self.hostname,
+ self.data['message'][0] % self.data['args'],
+ self.data['message'][1] % self.data['args'],
+ True, db='monitor_persistmessages',
+ ticket_id=ticket_id)
+ return message
+
+ def getContacts(self):
+ from config import config
+ #print "policy"
+ config = config()
+
+ roles = self.data['email']
+
+ if not config.mail and not config.debug and config.bcc:
+ roles = ADMIN
+ if config.mail and config.debug:
+ roles = ADMIN
+
+ # build targets
+ contacts = []
+ if ADMIN & roles:
+ contacts += [config.email]
+ if TECH & roles:
+ contacts += [TECHEMAIL % self.loginbase]
+ if PI & roles:
+ contacts += [PIEMAIL % self.loginbase]
+ if USER & roles:
+ slices = plc.slices(self.loginbase)
+ if len(slices) >= 1:
+ for slice in slices:
+ contacts += [SLICEMAIL % slice]
+ print "SLIC: %20s : %d slices" % (self.loginbase, len(slices))
+ else:
+ print "SLIC: %20s : 0 slices" % self.loginbase
+
+ return contacts
+
+
class NodeRecord:
def __init__(self, hostname, target):
self.hostname = hostname
- self.pcu = PCU(hostname)
self.ticket = None
self.target = target
if hostname in fb['nodes']:
else:
raise Exception("Hostname not in scan database")
- def get(self):
- pass
+ def stageIswaitforever(self):
+ if 'waitforever' in self.data['stage']:
+ return True
+ else:
+ return False
+
def severity(self):
category = self.data['category']
prev_category = self.data['prev_category']
val = cmpCategoryVal(category, prev_category)
return val
+
+ def improved(self):
+ return self.severity() > 0
+
+ def end_record(self):
+ return node_end_record(self.hostname)
+
+ def reset_stage(self):
+ self.data['stage'] = 'findbad'
+ return True
+
def open_tickets(self):
if self.ticket and self.ticket.status['status'] == 'open':
return 1
if __name__ == "__main__":
#r = RT()
#r.email("test", "body of test message", ['soltesz@cs.princeton.edu'])
- from emailTxt import mailtxt
- soltesz.dbDump("persistmessages", {});
- args = {'url_list': 'http://www.planet-lab.org/bootcds/planet1.usb\n','hostname': 'planet1','hostname_list': ' blahblah - days down\n'}
- m = PersistMessage("blue", "test 1", mailtxt.newdown_one[1] % args, True)
- m.send(['soltesz@cs.utk.edu'])
- m = PersistMessage("blue", "test 1 - part 2", mailtxt.newalphacd_one[1] % args, True)
+ #from emailTxt import mailtxt
+ print "loaded"
+ #soltesz.dbDump("persistmessages", {});
+ #args = {'url_list': 'http://www.planet-lab.org/bootcds/planet1.usb\n','hostname': 'planet1','hostname_list': ' blahblah - days down\n'}
+ #m = PersistMessage("blue", "test 1", mailtxt.newdown_one[1] % args, True)
+ #m.send(['soltesz@cs.utk.edu'])
+ #m = PersistMessage("blue", "test 1 - part 2", mailtxt.newalphacd_one[1] % args, True)
# TRICK timer to thinking some time has passed.
- m.actiontracker.time = time.time() - 6*60*60*24
- m.send(['soltesz@cs.utk.edu'])
+ #m.actiontracker.time = time.time() - 6*60*60*24
+ #m.send(['soltesz@cs.utk.edu'])
$portstat = $pcu['portstatus'];
#foreach ( array('22', '23', '80', '443') $portstat as $port => $state)
- foreach ( array('22', '23', '80', '443') as $port)
+ foreach ( array('22', '23', '80', '443', '9100', '16992') as $port)
{
$state = $portstat[$port];
switch ($state)
}
}
+if ( $_REQUEST['id'] )
+{
+ $id = $_REQUEST['id'];
+} else{
+ $id = "all";
+}
+#print print_r($_SERVER) . "<BR>";
//array_multisort($protocols, SORT_ASC, SORT_STRING, $pculist);
?>
<html>
<body>
-Total PCUs : <?= $total ?>
-<table border=1>
- <tr>
- <th>Count</th>
- <th><?= get_category_link("pcu_id", "PCU ID") ?></th>
- <th><?= get_category_link("login_base", "Site") ?></th>
- <th><?= get_category_link("hostname", "PCU Name") ?></th>
- <th><?= get_category_link("complete_entry", "Incomplete Fields") ?></th>
- <th><?= get_category_link("dnsmatch", "DNS Status") ?></th>
- <th><?= get_category_link("portstatus", "Port Status") ?></th>
- <th><?= get_category_link("reboot", "Dry Run Results") ?></th>
- <th><?= get_category_link("model", "Model") ?></th>
- <th><?= get_category_link("node_ids", "Nodes") ?></th>
+<?php if ( $id == "all" ): ?>
+ Total PCUs : <?= $total ?>
+ <table border=1>
+ <tr>
+ <th>Count</th>
+ <th><?= get_category_link("pcu_id", "PCU ID") ?></th>
+ <th><?= get_category_link("login_base", "Site") ?></th>
+ <th><?= get_category_link("hostname", "PCU Name") ?></th>
+ <th><?= get_category_link("complete_entry", "Incomplete Fields") ?></th>
+ <th><?= get_category_link("dnsmatch", "DNS Status") ?></th>
+ <th><?= get_category_link("portstatus", "Port Status") ?></th>
+ <th><?= get_category_link("reboot", "Test Results") ?></th>
+ <th><?= get_category_link("model", "Model") ?></th>
+ <th><?= get_category_link("node_ids", "Nodes") ?></th>
+ </tr>
+ <?php $count = 0; ?>
+ <?php $reachable_nodes = 0; ?>
+ <?php foreach ( $pculist as $pcu ): ?>
+ <tr>
+ <td><?= $count ?></td>
+ <td id='id<?= $pcu['pcu_id'] ?>'><a href='<?= pcu_link($pcu) ?>'><?= $pcu['pcu_id'] ?></a></td>
+ <td><a href='<?= plc_site_link(pcu_site($pcu)) ?>'><?= pcu_site($pcu) ?></a></td>
+ <td><?= pcu_name($pcu) ?></td>
+ <td><?= pcu_entry($pcu) ?></td>
+ <td bgcolor='<?= DNS_to_color($pcu['dnsmatch']) ?>'><?= $pcu['dnsmatch'] ?></td>
+ <td><?= format_ports($pcu) ?></td>
+ <td bgcolor='<?= reboot_to_color($pcu['reboot']) ?>'><?= reboot_to_str($pcu['reboot']) ?></td>
+ <td nowrap><?= $pcu['model'] ?></td>
+ <td><?= count( $pcu['node_ids'] ) ?></td>
+ </tr>
+
+ <?php if ( $pcu['reboot'] == "0" ) $reachable_nodes+=count($pcu['node_ids']); ?>
+ <?php $count += 1; ?>
+ <?php endforeach; ?>
+ </table>
+ <b>Reachable Nodes:</b> <?= $reachable_nodes ?>
+<?php else: ?>
+ <table align=center border=1>
+ <tr>
+ <th><?= get_category_link("pcu_id", "PCU ID") ?></th>
+ <th><?= get_category_link("login_base", "Site") ?></th>
+ <th><?= get_category_link("hostname", "PCU Name") ?></th>
+ <th><?= get_category_link("complete_entry", "Incomplete Fields") ?></th>
+ <th><?= get_category_link("dnsmatch", "DNS Status") ?></th>
+ <th><?= get_category_link("portstatus", "Port Status") ?></th>
+ <th><?= get_category_link("reboot", "Test Results") ?></th>
+ <th><?= get_category_link("model", "Model") ?></th>
+ <th><?= get_category_link("node_ids", "Nodes") ?></th>
+ </tr>
+ <?php $count = 0; ?>
+ <?php $reachable_nodes = 0; ?>
+ <?php foreach ( $pculist as $pcu ): ?>
+ <?php if ( $pcu['pcu_id'] == $id ): ?>
+ <tr>
+ <td id='id<?= $pcu['pcu_id'] ?>'><a href='<?= pcu_link($pcu) ?>'><?= $pcu['pcu_id'] ?></a></td>
+ <td><a href='<?= plc_site_link(pcu_site($pcu)) ?>'><?= pcu_site($pcu) ?></a></td>
+ <td><?= pcu_name($pcu) ?></td>
+ <td><?= pcu_entry($pcu) ?></td>
+ <td bgcolor='<?= DNS_to_color($pcu['dnsmatch']) ?>'><?= $pcu['dnsmatch'] ?></td>
+ <td><?= format_ports($pcu) ?></td>
+ <td bgcolor='<?= reboot_to_color($pcu['reboot']) ?>'><?= reboot_to_str($pcu['reboot']) ?></td>
+ <td nowrap><?= $pcu['model'] ?></td>
+ <td><?= count( $pcu['node_ids'] ) ?></td>
+ </tr>
+ <?php endif; ?>
+ <?php endforeach; ?>
+ </table>
+ <br>
+ <table border=1 align=center>
+ <tr><th colspan=2>Legend for 'DNS Status'</th></tr>
+
+ <tr><td bgcolor=lightgreen>DNS-OK</td>
+ <td>This indicates that the DNS name and registered IP address match.</td>
+ </tr>
+ <tr><td bgcolor=lightgrey>DNS-MISMATCH</td>
+ <td>Sometimes, the registered IP and DNS IP address do not match. In these cases it is not clear which is correct,
+ so an error is flagged.</td>
+ </tr>
+ <tr><td bgcolor=lightgrey>DNS-NOENTRY</td>
+ <td>While a hostname is provided in the registration, the hostname is not actually registered in DNS.</td>
+ </tr>
+ <tr><td bgcolor=white>NOHOSTNAME</td>
+ <td>While we prefer that a hostname be registered, it is not
+ strictly required; the IP address alone, if it is static, is enough to access the PCU.</td>
+ </tr>
+ <!--/table>
+ <table border=1-->
+ <tr><th colspan=2>Legend for 'Port Status'</th></tr>
+
+ <tr><td bgcolor=lightgreen>Open</td>
+ <td>Green port numbers are believed to be open.</td>
</tr>
-<?php $count = 0; ?>
-<?php $reachable_nodes = 0; ?>
-<?php foreach ( $pculist as $pcu ): ?>
- <tr>
- <td><?= $count ?></td>
- <td id='id<?= $pcu['pcu_id'] ?>'><a href='<?= pcu_link($pcu) ?>'><?= $pcu['pcu_id'] ?></a></td>
- <td><a href='<?= plc_site_link(pcu_site($pcu)) ?>'><?= pcu_site($pcu) ?></a></td>
- <td><?= pcu_name($pcu) ?></td>
- <td><?= pcu_entry($pcu) ?></td>
- <td bgcolor='<?= DNS_to_color($pcu['dnsmatch']) ?>'><?= $pcu['dnsmatch'] ?></td>
- <td><?= format_ports($pcu) ?></td>
- <td bgcolor='<?= reboot_to_color($pcu['reboot']) ?>'><?= reboot_to_str($pcu['reboot']) ?></td>
- <td nowrap><?= $pcu['model'] ?></td>
- <td><?= count( $pcu['node_ids'] ) ?></td>
+ <tr><td bgcolor=gold>Filtered</td>
+ <td>Gold port numbers are believed to be filtered or simply offline.</td>
</tr>
+ <tr><td bgcolor=indianred>Closed</td>
+ <td>Finally, red ports appear to be closed.</td>
+ </tr>
+ <!--/table>
+ <table border=1-->
+ <tr><th colspan=2>Legend for 'Test Results'</th></tr>
-<?php if ( $pcu['reboot'] == "0" ) $reachable_nodes+=count($pcu['node_ids']); ?>
-<?php $count += 1; ?>
-<?php endforeach; ?>
-</table>
+ <tr><td bgcolor=darkseagreen>OK</td>
+ <td>The PCU is accessible, and short of actually rebooting the node, everything appears to work.</td>
+ </tr>
+ <tr><td bgcolor=lightgrey>NetDown</td>
+ <td>The PCU is inaccessible from the PlanetLab address block 128.112.139.0/25, or it is simply offline.</td>
+ </tr>
+ <tr><td bgcolor=lightgrey>Not_Run</td>
+ <td>Previous errors, such as DNS or an incomplete configuration, prevented the actual test from being performed.</td>
+ </tr>
+ <tr><td bgcolor=indianred>Other Errors</td>
+ <td>Other errors reported by the test are more specific to the problem encountered by the script.</td>
+ </tr>
+ </table>
+<?php endif; ?>
-<b>Reachable Nodes:</b> <?= $reachable_nodes ?>
</body>
</html>