From 258e40c53d1f6e79ea92548765d65bbb74eba004 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Wed, 17 Jun 2009 20:29:40 +0000 Subject: [PATCH] updating with tweaks to live deployment. --- automate-default.sh | 17 ++--- monitor/wrapper/emailTxt.py | 42 ++++++------ monitor/wrapper/mailer.py | 12 +++- statistics/parserpms.py | 124 ++++++++++++++++++++++++++++++------ zabbix/hosts.py | 10 +-- 5 files changed, 148 insertions(+), 57 deletions(-) diff --git a/automate-default.sh b/automate-default.sh index 2858c63..958d578 100755 --- a/automate-default.sh +++ b/automate-default.sh @@ -60,28 +60,21 @@ fi #TODO: should add a call to ssh-add -l to check if the keys are loaded or not. source ${MONITOR_SCRIPT_ROOT}/agent.sh +${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || : ${MONITOR_SCRIPT_ROOT}/syncwithplc.py $DATE || : +service plc restart monitor echo "Performing FindAll Nodes" ######################### # 1. FINDBAD NODES -${MONITOR_SCRIPT_ROOT}/findall.py --increment $DATE || : +${MONITOR_SCRIPT_ROOT}/findall.py $DATE || : ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || : # clean up stray 'locfg' processes that hang around inappropriately... ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || : ${MONITOR_SCRIPT_ROOT}/policy.py $DATE - -echo "Archiving pkl files" -######################### -# Archive pkl files. -for f in act_all l_plcnodes site_persistflags node_persistflags pcu_persistflags ; do - if [ -f ${MONITOR_DATA_ROOT}/production.$f.pkl ] ; then - cp ${MONITOR_DATA_ROOT}/production.$f.pkl ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.production.$f.pkl - else - echo "Warning: It failed to archive ${MONITOR_DATA_ROOT}/production.$f.pkl" - fi -done +${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || : +service plc restart monitor cp ${MONITOR_SCRIPT_ROOT}/monitor.log ${MONITOR_ARCHIVE_ROOT}/`date +%F-%H:%M`.monitor.log rm -f $MONITOR_PID diff --git a/monitor/wrapper/emailTxt.py b/monitor/wrapper/emailTxt.py index 9a7fec3..77e8576 100644 --- a/monitor/wrapper/emailTxt.py +++ b/monitor/wrapper/emailTxt.py @@ -211,7 +211,7 @@ ERROR- This is an error state, where there is absolutely no contact ############################################################################# ############################################################################# - pcumissing_notice =("""MONTEST: There is no PCU available to reboot %(hostname)s""", + pcumissing_notice =("""There is no PCU available to reboot %(hostname)s""", """We've noticed that there is no PCU associated with %(hostname)s, so we could not reboot it ourselves. @@ -229,26 +229,26 @@ Thank you very much for your help, -- PlanetLab Central (support@planet-lab.org) """) - pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""", + pcufailed_notice =("""Could not use PCU to reboot %(hostname)s""", """We tried to use the PCU registered for %(hostname)s, but for some reason the host did not come back online. You may be able to learn more by visiting this link: - https://monitor.planet-lab.org/monitor/pcuview?loginbase=%(loginbase)s + https://monitor.planet-lab.org/monitor/pcuview?loginbase=%(loginbase)s We need your help resolving this issue in a few ways: 1. First, we need your help rebooting %(hostname)s. Because the above PCU does not appear to work, please manually reboot this machine. If it turns out - that there is a problem with the PCU configuration, we can help you + that there is a problem with the PCU configuration, we can help you resolve that independently. 2. If it is possible, please correcct the above PCU problem, or let us know - what steps you are taking. By enabling us to take administrative actions - automatically without your intervention, you will save time in the future - the next time we need to reboot this machine, because we will be able to - do so without disturbing you. + what steps you are taking. By enabling us to take administrative actions + automatically without your intervention, you will save time in the future + the next time we need to reboot this machine, because we will be able to + do so without disturbing you. 3. If there is nothing apparently wrong with the PCU, or the mapping between the PCU and the host, then there is likely a problem with our bootstrap @@ -265,7 +265,7 @@ Thank you very much for your help, -- PlanetLab Central (support@planet-lab.org) """) - online_notice=("""MONTEST: Host %(hostname)s is online""", + online_notice=("""Host %(hostname)s is online""", """ This notice is simply to let you know that: %(hostname)s @@ -277,14 +277,14 @@ is online and operational. Thank you very much for your help! -- PlanetLab Central (support@planet-lab.org) """) - test_notice=("""MONTEST: Host %(hostname)s is testing""", + test_notice=("""Host %(hostname)s is testing""", """ This notice is simply to test whether notices work. %(hostname)s Thank you very much for your help! """) - retry_bootman=("""MONTEST: Running BootManager on %(hostname)s""", + retry_bootman=("""Running BootManager on %(hostname)s""", """ This notice is simply to let you know that: %(hostname)s @@ -292,7 +292,7 @@ This notice is simply to let you know that: appears stuck in a debug mode. To try to correct this, we're trying to rerun BootManager.py. If any action is needed from you, you will recieve additional notices. Thank you! """) - down_notice=("""MONTEST: Host %(hostname)s is down""", + down_notice=("""Host %(hostname)s is down""", """ This notice is simply to let you know that: %(hostname)s @@ -309,7 +309,7 @@ Thank you very much for your help, -- PlanetLab Central (support@planet-lab.org) """) - clear_penalty=("""MONTEST: All privileges restored to site %(loginbase)s""", + clear_penalty=("""All privileges restored to site %(loginbase)s""", """ This notice is to let you know that any privileges previously reduced at your site have been restored: %(penalty_level)s. @@ -330,7 +330,7 @@ Legend: 2+ - all existing slices will be disabled. """) - increase_penalty=("""MONTEST: Privilege reduced for site %(loginbase)s""", + increase_penalty=("""Privilege reduced for site %(loginbase)s""", """ This notice is to let you know that the privileges granted to your site as a participating member of Planetlab have reduced: %(penalty_level)s. @@ -351,7 +351,7 @@ Legend: 2+ - all existing slices will be disabled. """) - newbootcd_notice=("""MONTEST: Host %(hostname)s needs a new BootImage""", """ + newbootcd_notice=("""Host %(hostname)s needs a new BootImage""", """ We noticed the following node has an out-dated BootImage: %(hostname)s @@ -453,7 +453,7 @@ Thank you very much for your help, -- PlanetLab Central (support@planet-lab.org) """) - newalphacd_notice=("""MONTEST: New Boot Images for %(hostname)s""", + newalphacd_notice=("""New Boot Images for %(hostname)s""", """As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine. This means that it is so new that it needs a new BootCD, or that it is so old that it is no longer supported. %(hostname)s @@ -483,7 +483,7 @@ Thank you for your help, pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one] pcudown=[pcudown_one, pcudown_one, pcudown_one] - unknownsequence_notice = ("""MONTEST: Unrecognized Error on PlanetLab host %(hostname)s""", + unknownsequence_notice = ("""Unrecognized Error on PlanetLab host %(hostname)s""", """ While trying to automatically recover this machine: @@ -569,7 +569,7 @@ Thank you for your help, donation_down = [ donation_down_one, donation_down_one, donation_down_one ] - minimalhardware_notice = ("""MONTEST: Hardware requirements not met on PlanetLab host %(hostname)s""", + minimalhardware_notice = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", """ While trying to automatically recover this machine: @@ -589,7 +589,7 @@ BootManager.log output follows: %(bmlog)s """ ) - baddisk_notice = ("""MONTEST: Bad Disk on PlanetLab node %(hostname)s""", + baddisk_notice = ("""Bad Disk on PlanetLab node %(hostname)s""", """As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node. Please verify the integrity of the disk, and order a replacement if needed. If you need to schedule downtime for the node, please let us know at support@planet-lab.org. @@ -655,7 +655,7 @@ BootManager.log output follows: %(bmlog)s """) - nodeconfig_notice=("""MONTEST: Please Update Configuration file for PlanetLab node %(hostname)s""", + nodeconfig_notice=(""" Please Update Configuration file for PlanetLab node %(hostname)s""", """As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME. This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade. To resolve the issue we require your assistance. All that is needed is to visit: https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s @@ -695,7 +695,7 @@ Thanks. """) - baddns_notice=("""MONTEST: Planetlab node down: broken DNS configuration for %(hostname)s""", + baddns_notice=("""Planetlab node down: broken DNS configuration for %(hostname)s""", """As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries. %(hostname)s diff --git a/monitor/wrapper/mailer.py b/monitor/wrapper/mailer.py index d3f5df4..4a0d661 100755 --- a/monitor/wrapper/mailer.py +++ b/monitor/wrapper/mailer.py @@ -28,7 +28,7 @@ def reformat_for_rt(text): def _setupRTenvironment(): - os.environ['PATH'] = os.environ['PATH'] + ":" + config.RT_WEB_TOOLS_PATH + os.environ['PATH'] = config.RT_WEB_TOOLS_PATH + ":" + os.environ['PATH'] os.environ['RTSERVER'] = config.RT_WEB_SERVER os.environ['RTUSER'] = config.RT_WEB_USER os.environ['RTPASSWD'] = config.RT_WEB_PASSWORD @@ -41,6 +41,7 @@ def setTicketStatus(ticket_id, status): return {} cmd = "rt edit ticket/%s set status=%s" % (ticket_id, status) + print cmd (f_in, f_out, f_err) = os.popen3(cmd) value = f_out.read() l_values = value.split('\n') @@ -52,6 +53,7 @@ def getTicketStatus(ticket_id): return {} cmd = "rt show -t ticket -f id,subject,status,queue,created %s" % (ticket_id) + print cmd (f_in, f_out, f_err) = os.popen3(cmd) value = f_out.read() l_values = value.split('\n') @@ -78,6 +80,7 @@ def setAdminCCViaRT(ticket_id, to): # create a comma-separated list s_to = ",".join(to) cmd = "rt edit ticket/%s set admincc='%s'" % (ticket_id, s_to) + print cmd (f_in, f_out, f_err) = os.popen3(cmd) value = f_out.read() l_values = value.split() @@ -101,6 +104,7 @@ def setSubjectViaRT(ticket_id, subject): i_ticket_id = int(ticket_id) cmd = "rt edit ticket/%s set subject='%s'" % (ticket_id, subject) + print cmd (f_in, f_out, f_err) = os.popen3(cmd) value = f_out.read() l_values = value.split() @@ -125,9 +129,11 @@ def addCommentViaRT(ticket_id, comment): i_ticket_id = int(ticket_id) cmd = "rt comment -m '%s' ticket/%s" % (comment, i_ticket_id) + print cmd (f_in, f_out, f_err) = os.popen3(cmd) value = f_out.read() l_values = value.split() + l_err = f_err.read() f_in.close() ; f_out.close() ; f_err.close() if len(l_values) > 1 and "recorded" in l_values[1]: # Success @@ -136,6 +142,7 @@ def addCommentViaRT(ticket_id, comment): # Error f_in.close() ; f_out.close() ; f_err.close() print "ERROR: RT failed to add comment to id %s" % ticket_id + print "ERROR: %s" % l_err return @@ -153,6 +160,7 @@ def closeTicketViaRT(ticket_id, comment): if not config.debug: cmd = "rt edit ticket/%s set status=resolved" % i_ticket_id + print cmd (f_in, f_out, f_err) = os.popen3(cmd) f_in.close() value = f_out.read() @@ -184,6 +192,7 @@ def emailViaRT(subject, text, to, ticket_id=None): setAdminCCViaRT(ticket_id, to) cmd = "rt correspond -m - %s" % ticket_id + print cmd (f_in, f_out, f_err) = os.popen3(cmd) f_in.write(text) f_in.flush() @@ -229,6 +238,7 @@ def emailViaRT_NoTicket(subject, text, to): if config.mail and not config.debug: cmd = "rt create -i -t ticket" + print cmd (f_in, f_out, f_err) = os.popen3(cmd) f_in.write(input_text % (subject, spaced_text)) f_in.flush() diff --git a/statistics/parserpms.py b/statistics/parserpms.py index 6ca34b2..64144d7 100755 --- a/statistics/parserpms.py +++ b/statistics/parserpms.py @@ -3,23 +3,111 @@ import sys import os import md5 +import re +from monitor.util import file as fileutil + +purpose_message=""" + This utility is designed to simplify the task of parsing and generating + statistics for the number of packages on PlanetLab nodes. +""" def list_to_md5(strlist): - digest = md5.new() - for f in strlist: - digest.update(f) - - return digest.hexdigest() - -while True: - line = sys.stdin.readline() - if not line: - break - line = line.strip() - fields = line.split() - host = fields[1] - rpms = fields[2:] - rpms.sort() - if len(rpms) != 0: - sum = list_to_md5(rpms) - print sum, host + digest = md5.new() + for f in strlist: + digest.update(f) + + return digest.hexdigest() + +def pick_some_rpms(pattern, rpmlist): + l = [] + cpatt = re.compile(pattern) + for rpm in rpmlist: + if cpatt.search(rpm): + l.append(rpm) + return l + +def main(): + global api + global config + + from optparse import OptionParser + parser = OptionParser() + + parser.set_defaults( select=None, + input=None, + frequency=False, + package=False, + ) + + parser.add_option("", "--input", dest="input", + help="the input file") + parser.add_option("", "--select", dest="select", + help="the pattern to pull out from rpm list") + parser.add_option("", "--frequency", dest="frequency", action="store_true", + help="print the frequency of packages matched by select") + parser.add_option("", "--package", dest="package", action="store_true", + help="print the frequency of each pl package") + (config, args) = parser.parse_args() + if len(sys.argv) == 1 or config.input is None: + print purpose_message + parser.print_help() + sys.exit(1) + + rpmlist = fileutil.getListFromFile(config.input) + + current_packages = ['NodeManager-1.8-5.planetlab', + 'NodeUpdate-0.5-4.planetlab', 'codemux-0.1-13.planetlab', + 'fprobe-ulog-1.1.3-0.planetlab', 'ipod-2.2-1.planetlab', + 'iproute-2.6.16-2.planetlab', 'iptables-1.3.8-9.planetlab', + 'kernel-2.6.22.19-vs2.3.0.34.28.planetlab', + 'madwifi-0.9.4-2.6.22.19.3.planetlab', 'monitor-1.0-7.planetlab', + 'monitor-client-3.0-10.planetlab', + 'monitor-runlevelagent-3.0-10.planetlab', 'pl_mom-2.3-1.planetlab', + 'pl_sshd-1.0-11.planetlab', 'pyplnet-4.3-2.planetlab', + 'util-vserver-pl-0.3-16.planetlab', + 'vserver-planetlab-f8-i386-4.2-12.2009.05.27', + 'vserver-systemslices-planetlab-f8-i386-4.2-12.2009.05.27', + 'vsys-0.9-3.planetlab', 'vsys-scripts-0.95-3.planetlab'] + + # PL RPMS + if config.package: + all_patterns = map(lambda x: ".*" + x + ".*", [ 'NodeManager', + 'NodeUpdate', 'codemux', 'fprobe', 'ipod', + 'iproute', 'iptables', 'kernel', 'madwifi', 'monitor-client', + 'monitor-runlevelagent', 'monitor', 'oombailout', 'pl_mom', + 'pl_sshd', 'pyplnet', 'util-vserver-pl', 'vserver-planetlab-f8-i386', + 'vserver-systemslices-planetlab-f8-i386', 'vsys-scripts', 'vsys']) + else: + all_patterns = [config.select] + + for pattern in all_patterns: + return_sums = {} + for line in rpmlist: + line = line.strip() + fields = line.split() + host = fields[1] + rpms = fields[2:] + rpms.sort() + rpms = pick_some_rpms(pattern, rpms) + if len(rpms) != 0: + sum = list_to_md5(rpms) + try: + return_sums[sum]['hosts'].append(host) + except: + return_sums[sum] = {'hosts' : [], 'diff' : []} + return_sums[sum]['hosts'].append(host) + + return_sums[sum]['diff'] = set(rpms) - set(current_packages) + + if config.frequency: + print "Frequency for packages that matched: %s" % pattern + sum_list = [] + for sum in return_sums: + sum_list.append((len(return_sums[sum]['hosts']), sum)) + + sum_list.sort(lambda a,b: cmp(b[0], a[0])) + for sum in sum_list: + print sum[0], sum[1], map(lambda x: x.replace('.planetlab', ''), return_sums[sum[1]]['diff']) + +if __name__ == "__main__": + main() diff --git a/zabbix/hosts.py b/zabbix/hosts.py index c2dbde7..35b10f3 100755 --- a/zabbix/hosts.py +++ b/zabbix/hosts.py @@ -16,7 +16,7 @@ def is_in_file(filename, pattern): return False def add_to_file(filename, data): os.system("echo '%s' >> %s" % (data, filename)) - print "echo '%s' >> %s" % (data, filename) + #print "echo '%s' >> %s" % (data, filename) sites = api.GetSites({'login_base' : 'mlab*'}, ['node_ids']) for s in sites: @@ -24,9 +24,9 @@ for s in sites: for node in nodes: try: i = api.GetInterfaces({ 'interface_id' : node['interface_ids'], 'is_primary' : True}) - print len(i), i - print "%s %s" % (i[0]['ip'], node['hostname']) - #if not is_in_file(HOSTS_FILE, node['hostname']): - # add_to_file(HOSTS_FILE, "%s %s" % (i[0]['ip'], node['hostname'])) + #print len(i), i + #print "%s %s" % (i[0]['ip'], node['hostname']) + if not is_in_file(HOSTS_FILE, node['hostname']): + add_to_file(HOSTS_FILE, "%s %s" % (i[0]['ip'], node['hostname'])) except: pass -- 2.43.0