#TODO: should add a call to ssh-add -l to check if the keys are loaded or not.
source ${MONITOR_SCRIPT_ROOT}/agent.sh
+${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :  # run checksync.py ahead of the PLC sync; '|| :' discards a non-zero exit status
${MONITOR_SCRIPT_ROOT}/syncwithplc.py $DATE || :
+service plc restart monitor  # NOTE(review): presumably restarts only the monitor component of the plc service - confirm
echo "Performing FindAll Nodes"
#########################
# 1. FINDBAD NODES
-${MONITOR_SCRIPT_ROOT}/findall.py --increment $DATE || :
+${MONITOR_SCRIPT_ROOT}/findall.py $DATE || :  # invoked with $DATE only (no --increment); exit status is discarded
ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || :  # kill every process whose ps line mentions BatchMode (presumably leftover ssh sessions - confirm)
# clean up stray 'locfg' processes that hang around inappropriately...
ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || :
${MONITOR_SCRIPT_ROOT}/policy.py $DATE
-
-echo "Archiving pkl files"
-#########################
-# Archive pkl files.
-for f in act_all l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do
- if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ] ; then
- cp ${MONITOR_DATA_ROOT}/production.$f.pkl ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.production.$f.pkl
- else
- echo "Warning: It failed to archive ${MONITOR_DATA_ROOT}/production.$f.pkl"
- fi
-done
+${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :  # second checksync.py pass, after policy.py above; exit status is discarded
+service plc restart monitor  # same restart command as the one issued before the FindAll step
cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log  # keep a copy of this run's log, named with the current date and hour:minute
rm -f $MONITOR_PID  # remove the file named by $MONITOR_PID (presumably the run's pid/lock file - confirm)
#############################################################################
#############################################################################
- pcumissing_notice =("""MONTEST: There is no PCU available to reboot %(hostname)s""",
+ pcumissing_notice =("""There is no PCU available to reboot %(hostname)s""",
"""We've noticed that there is no PCU associated with %(hostname)s, so we could
not reboot it ourselves.
-- PlanetLab Central (support@planet-lab.org)
""")
- pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""",
+ pcufailed_notice =("""Could not use PCU to reboot %(hostname)s""",
"""We tried to use the PCU registered for %(hostname)s, but for some reason
the host did not come back online. You may be able to learn more by visiting
this link:
- https://monitor.planet-lab.org/monitor/pcuview?loginbase=%(loginbase)s
+ https://monitor.planet-lab.org/monitor/pcuview?loginbase=%(loginbase)s
We need your help resolving this issue in a few ways:
1. First, we need your help rebooting %(hostname)s. Because the above PCU does
not appear to work, please manually reboot this machine. If it turns out
- that there is a problem with the PCU configuration, we can help you
+ that there is a problem with the PCU configuration, we can help you
resolve that independently.
2. If it is possible, please correcct the above PCU problem, or let us know
- what steps you are taking. By enabling us to take administrative actions
- automatically without your intervention, you will save time in the future
- the next time we need to reboot this machine, because we will be able to
- do so without disturbing you.
+ what steps you are taking. By enabling us to take administrative actions
+ automatically without your intervention, you will save time in the future
+ the next time we need to reboot this machine, because we will be able to
+ do so without disturbing you.
3. If there is nothing apparently wrong with the PCU, or the mapping between
the PCU and the host, then there is likely a problem with our bootstrap
-- PlanetLab Central (support@planet-lab.org)
""")
- online_notice=("""MONTEST: Host %(hostname)s is online""",
+ online_notice=("""Host %(hostname)s is online""",
"""
This notice is simply to let you know that:
%(hostname)s
Thank you very much for your help!
-- PlanetLab Central (support@planet-lab.org)
""")
- test_notice=("""MONTEST: Host %(hostname)s is testing""",
+ test_notice=("""Host %(hostname)s is testing""",
"""
This notice is simply to test whether notices work.
%(hostname)s
Thank you very much for your help!
""")
- retry_bootman=("""MONTEST: Running BootManager on %(hostname)s""",
+ retry_bootman=("""Running BootManager on %(hostname)s""",
"""
This notice is simply to let you know that:
%(hostname)s
appears stuck in a debug mode. To try to correct this, we're trying to rerun BootManager.py.
If any action is needed from you, you will recieve additional notices. Thank you!
""")
- down_notice=("""MONTEST: Host %(hostname)s is down""",
+ down_notice=("""Host %(hostname)s is down""",
"""
This notice is simply to let you know that:
%(hostname)s
-- PlanetLab Central (support@planet-lab.org)
""")
- clear_penalty=("""MONTEST: All privileges restored to site %(loginbase)s""",
+ clear_penalty=("""All privileges restored to site %(loginbase)s""",
"""
This notice is to let you know that any privileges previously reduced at your
site have been restored: %(penalty_level)s.
2+ - all existing slices will be disabled.
""")
- increase_penalty=("""MONTEST: Privilege reduced for site %(loginbase)s""",
+ increase_penalty=("""Privilege reduced for site %(loginbase)s""",
"""
This notice is to let you know that the privileges granted to your site as
a participating member of Planetlab have reduced: %(penalty_level)s.
2+ - all existing slices will be disabled.
""")
- newbootcd_notice=("""MONTEST: Host %(hostname)s needs a new BootImage""", """
+ newbootcd_notice=("""Host %(hostname)s needs a new BootImage""", """
We noticed the following node has an out-dated BootImage:
%(hostname)s
-- PlanetLab Central (support@planet-lab.org)
""")
- newalphacd_notice=("""MONTEST: New Boot Images for %(hostname)s""",
+ newalphacd_notice=("""New Boot Images for %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine. This means that it is so new that it needs a new BootCD, or that it is so old that it is no longer supported.
%(hostname)s
pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one]
pcudown=[pcudown_one, pcudown_one, pcudown_one]
- unknownsequence_notice = ("""MONTEST: Unrecognized Error on PlanetLab host %(hostname)s""",
+ unknownsequence_notice = ("""Unrecognized Error on PlanetLab host %(hostname)s""",
"""
While trying to automatically recover this machine:
donation_down = [ donation_down_one, donation_down_one, donation_down_one ]
- minimalhardware_notice = ("""MONTEST: Hardware requirements not met on PlanetLab host %(hostname)s""",
+ minimalhardware_notice = ("""Hardware requirements not met on PlanetLab host %(hostname)s""",
"""
While trying to automatically recover this machine:
%(bmlog)s
""" )
- baddisk_notice = ("""MONTEST: Bad Disk on PlanetLab node %(hostname)s""",
+ baddisk_notice = ("""Bad Disk on PlanetLab node %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node.
Please verify the integrity of the disk, and order a replacement if needed. If you need to schedule downtime for the node, please let us know at support@planet-lab.org.
%(bmlog)s
""")
- nodeconfig_notice=("""MONTEST: Please Update Configuration file for PlanetLab node %(hostname)s""",
+ nodeconfig_notice=(""" Please Update Configuration file for PlanetLab node %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME. This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade. To resolve the issue we require your assistance. All that is needed is to visit:
https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
""")
- baddns_notice=("""MONTEST: Planetlab node down: broken DNS configuration for %(hostname)s""",
+ baddns_notice=("""Planetlab node down: broken DNS configuration for %(hostname)s""",
"""As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries.
%(hostname)s
def _setupRTenvironment():
- os.environ['PATH'] = os.environ['PATH'] + ":" + config.RT_WEB_TOOLS_PATH
+ os.environ['PATH'] = config.RT_WEB_TOOLS_PATH + ":" + os.environ['PATH']  # prepend: programs in RT_WEB_TOOLS_PATH are found before same-named ones already on PATH
os.environ['RTSERVER'] = config.RT_WEB_SERVER  # server, user and password are exported through the process environment
os.environ['RTUSER'] = config.RT_WEB_USER
os.environ['RTPASSWD'] = config.RT_WEB_PASSWORD
return {}
cmd = "rt edit ticket/%s set status=%s" % (ticket_id, status)
+ print cmd
(f_in, f_out, f_err) = os.popen3(cmd)
value = f_out.read()
l_values = value.split('\n')
return {}
cmd = "rt show -t ticket -f id,subject,status,queue,created %s" % (ticket_id)
+ print cmd
(f_in, f_out, f_err) = os.popen3(cmd)
value = f_out.read()
l_values = value.split('\n')
# create a comma-separated list
s_to = ",".join(to)
cmd = "rt edit ticket/%s set admincc='%s'" % (ticket_id, s_to)
+ print cmd
(f_in, f_out, f_err) = os.popen3(cmd)
value = f_out.read()
l_values = value.split()
i_ticket_id = int(ticket_id)
cmd = "rt edit ticket/%s set subject='%s'" % (ticket_id, subject)
+ print cmd
(f_in, f_out, f_err) = os.popen3(cmd)
value = f_out.read()
l_values = value.split()
i_ticket_id = int(ticket_id)
cmd = "rt comment -m '%s' ticket/%s" % (comment, i_ticket_id)
+ print cmd
(f_in, f_out, f_err) = os.popen3(cmd)
value = f_out.read()
l_values = value.split()
+ l_err = f_err.read()
f_in.close() ; f_out.close() ; f_err.close()
if len(l_values) > 1 and "recorded" in l_values[1]:
# Success
# Error
f_in.close() ; f_out.close() ; f_err.close()
print "ERROR: RT failed to add comment to id %s" % ticket_id
+ print "ERROR: %s" % l_err
return
if not config.debug:
cmd = "rt edit ticket/%s set status=resolved" % i_ticket_id
+ print cmd
(f_in, f_out, f_err) = os.popen3(cmd)
f_in.close()
value = f_out.read()
setAdminCCViaRT(ticket_id, to)
cmd = "rt correspond -m - %s" % ticket_id
+ print cmd
(f_in, f_out, f_err) = os.popen3(cmd)
f_in.write(text)
f_in.flush()
if config.mail and not config.debug:
cmd = "rt create -i -t ticket"
+ print cmd
(f_in, f_out, f_err) = os.popen3(cmd)
f_in.write(input_text % (subject, spaced_text))
f_in.flush()
import sys
import os
import md5
+import re
+from monitor.util import file as fileutil
+
+purpose_message="""
+ This utility is designed to simplify the task of parsing and generating
+ statistics for the number of packages on PlanetLab nodes.
+"""
def list_to_md5(strlist):
- digest = md5.new()
- for f in strlist:
- digest.update(f)
-
- return digest.hexdigest()
-
-while True:
- line = sys.stdin.readline()
- if not line:
- break
- line = line.strip()
- fields = line.split()
- host = fields[1]
- rpms = fields[2:]
- rpms.sort()
- if len(rpms) != 0:
- sum = list_to_md5(rpms)
- print sum, host
+ digest = md5.new()  # one MD5 accumulator shared by every string in strlist
+ for f in strlist:
+ digest.update(f)  # strings are fed in list order, so the result depends on that order
+
+ return digest.hexdigest()  # hex MD5 of all strings taken together
+
+def pick_some_rpms(pattern, rpmlist):  # return, in their original order, the entries of rpmlist that the regex `pattern` matches
+ l = []
+ cpatt = re.compile(pattern)  # compiled once and reused for every entry
+ for rpm in rpmlist:
+ if cpatt.search(rpm):  # search(), so the pattern may match anywhere inside the name
+ l.append(rpm)
+ return l
+
+def main():  # command-line entry point: parse options, read the rpm list, then group hosts by the MD5 of their matching package names
+ global api
+ global config  # rebound below to the parsed optparse options object
+
+ from optparse import OptionParser
+ parser = OptionParser()
+
+ parser.set_defaults( select=None,
+ input=None,
+ frequency=False,
+ package=False,
+ )
+
+ parser.add_option("", "--input", dest="input",
+ help="the input file")
+ parser.add_option("", "--select", dest="select",
+ help="the pattern to pull out from rpm list")
+ parser.add_option("", "--frequency", dest="frequency", action="store_true",
+ help="print the frequency of packages matched by select")
+ parser.add_option("", "--package", dest="package", action="store_true",
+ help="print the frequency of each pl package")
+ (config, args) = parser.parse_args()
+ if len(sys.argv) == 1 or config.input is None:  # --input is mandatory: without it print the purpose text and usage, then exit with status 1
+ print purpose_message
+ parser.print_help()
+ sys.exit(1)
+
+ rpmlist = fileutil.getListFromFile(config.input)  # NOTE(review): presumably one string per line of the input file - confirm against monitor.util.file
+
+ current_packages = ['NodeManager-1.8-5.planetlab',  # reference package set; names a host reports beyond these end up in the 'diff' entry below
+ 'NodeUpdate-0.5-4.planetlab', 'codemux-0.1-13.planetlab',
+ 'fprobe-ulog-1.1.3-0.planetlab', 'ipod-2.2-1.planetlab',
+ 'iproute-2.6.16-2.planetlab', 'iptables-1.3.8-9.planetlab',
+ 'kernel-2.6.22.19-vs2.3.0.34.28.planetlab',
+ 'madwifi-0.9.4-2.6.22.19.3.planetlab', 'monitor-1.0-7.planetlab',
+ 'monitor-client-3.0-10.planetlab',
+ 'monitor-runlevelagent-3.0-10.planetlab', 'pl_mom-2.3-1.planetlab',
+ 'pl_sshd-1.0-11.planetlab', 'pyplnet-4.3-2.planetlab',
+ 'util-vserver-pl-0.3-16.planetlab',
+ 'vserver-planetlab-f8-i386-4.2-12.2009.05.27',
+ 'vserver-systemslices-planetlab-f8-i386-4.2-12.2009.05.27',
+ 'vsys-0.9-3.planetlab', 'vsys-scripts-0.95-3.planetlab']
+
+ # PL RPMS
+ if config.package:  # --package: one ".*name.*" pattern per package name listed below
+ all_patterns = map(lambda x: ".*" + x + ".*", [ 'NodeManager',
+ 'NodeUpdate', 'codemux', 'fprobe', 'ipod',
+ 'iproute', 'iptables', 'kernel', 'madwifi', 'monitor-client',
+ 'monitor-runlevelagent', 'monitor', 'oombailout', 'pl_mom',
+ 'pl_sshd', 'pyplnet', 'util-vserver-pl', 'vserver-planetlab-f8-i386',
+ 'vserver-systemslices-planetlab-f8-i386', 'vsys-scripts', 'vsys'])
+ else:
+ all_patterns = [config.select]  # NOTE(review): select defaults to None, so running without --select or --package hands None to re.compile() in pick_some_rpms, which raises TypeError
+
+ for pattern in all_patterns:
+ return_sums = {}  # md5 of the matched package list -> {'hosts': [...], 'diff': ...}
+ for line in rpmlist:
+ line = line.strip()
+ fields = line.split()
+ host = fields[1]  # NOTE(review): assumes at least two whitespace-separated fields (field 0 unused, field 1 host, rest package names); a shorter line raises IndexError
+ rpms = fields[2:]
+ rpms.sort()  # sorted first so the digest does not depend on the order within the input line
+ rpms = pick_some_rpms(pattern, rpms)
+ if len(rpms) != 0:
+ sum = list_to_md5(rpms)
+ try:
+ return_sums[sum]['hosts'].append(host)
+ except:  # first host seen for this digest; NOTE(review): the bare except would also hide any other error
+ return_sums[sum] = {'hosts' : [], 'diff' : []}
+ return_sums[sum]['hosts'].append(host)
+
+ return_sums[sum]['diff'] = set(rpms) - set(current_packages)  # matched packages that are not in current_packages
+
+ if config.frequency:  # NOTE(review): original indentation is not recoverable here; confirm how much of the report below is gated by --frequency
+ print "Frequency for packages that matched: %s" % pattern
+ sum_list = []
+ for sum in return_sums:
+ sum_list.append((len(return_sums[sum]['hosts']), sum))  # (host count, digest) pairs
+
+ sum_list.sort(lambda a,b: cmp(b[0], a[0]))  # largest host count first (Python 2 cmp-style sort)
+ for sum in sum_list:
+ print sum[0], sum[1], map(lambda x: x.replace('.planetlab', ''), return_sums[sum[1]]['diff'])  # host count, digest, diff names with '.planetlab' removed
+
+if __name__ == "__main__":
+ main()
return False
def add_to_file(filename, data):
os.system("echo '%s' >> %s" % (data, filename))
- print "echo '%s' >> %s" % (data, filename)
+ #print "echo '%s' >> %s" % (data, filename)
sites = api.GetSites({'login_base' : 'mlab*'}, ['node_ids'])  # only sites whose login_base matches mlab*; only node_ids is requested
for s in sites:
for node in nodes:
try:
i = api.GetInterfaces({ 'interface_id' : node['interface_ids'], 'is_primary' : True})  # interfaces of this node that are flagged primary
- print len(i), i
- print "%s %s" % (i[0]['ip'], node['hostname'])
- #if not is_in_file(HOSTS_FILE, node['hostname']):
- # add_to_file(HOSTS_FILE, "%s %s" % (i[0]['ip'], node['hostname']))
+ #print len(i), i
+ #print "%s %s" % (i[0]['ip'], node['hostname'])
+ if not is_in_file(HOSTS_FILE, node['hostname']):  # NOTE(review): presumably true when the hostname is not yet listed in HOSTS_FILE - confirm against is_in_file
+ add_to_file(HOSTS_FILE, "%s %s" % (i[0]['ip'], node['hostname']))  # entry format is "<ip> <hostname>", using the first interface returned
except:  # NOTE(review): bare except silently skips the node on any error, including an empty interface list (i[0])
pass