From 7b3d462aa05fcc1892fd914db163143f36a05945 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Thu, 20 Nov 2008 20:16:06 +0000 Subject: [PATCH] removed hard-coded paths in favor of MONITOR_SCRIPT_ROOT. added cacheset to parser to allow for plccache refreshing. removed ' from emailZabbix.py due to runtime errors with zabbix_server --- Monitor.spec | 2 +- automate-default.sh | 2 +- monitor-default.conf | 6 ++--- monitor-server.cron | 2 +- monitor-server.init | 11 ++++---- monitor/database/dbpickle.py | 4 +++ monitor/database/zabbixapi/emailZabbix.py | 24 +++++++++-------- monitor/parser.py | 14 ++++++++++ zabbix/templates/zabbix_templates.xml | 4 +-- zabbix/zabbixsite.py | 33 +++++++++++++++-------- zabbix/zabbixsync.py | 4 +-- 11 files changed, 69 insertions(+), 37 deletions(-) diff --git a/Monitor.spec b/Monitor.spec index e0a58e7..15b24c1 100644 --- a/Monitor.spec +++ b/Monitor.spec @@ -152,7 +152,7 @@ rm -rf $RPM_BUILD_ROOT # an API server or so on. # TODO: create real monitorconfig.py from monitorconfig-default.py # TODO: create monitorconfig.php using phpconfig.py -# TODO: create symlink in /var/lib/monitor-server for chroot environments +# TODO: create symlink in /var/lib/monitor for chroot environments # TODO: update the content of automate_pl03.sh # TODO: Use the installed version of bootcd to create custom boot images. ( or, use the api now). diff --git a/automate-default.sh b/automate-default.sh index b5508c1..ef1cc2f 100755 --- a/automate-default.sh +++ b/automate-default.sh @@ -1,7 +1,7 @@ #!/bin/bash # NOTE: Must be an absolute path to guarantee it is read. 
-INSTALLPATH=/usr/share/monitor-server/ +INSTALLPATH=/usr/share/monitor/ # Generate an 'sh' style file full of variables in monitor.conf $INSTALLPATH/shconfig.py > $INSTALLPATH/monitorconfig.sh source $INSTALLPATH/monitorconfig.sh diff --git a/monitor-default.conf b/monitor-default.conf index fede16f..967a6ae 100644 --- a/monitor-default.conf +++ b/monitor-default.conf @@ -19,9 +19,9 @@ API_AUTH_PASSWORD= # SERVER PATHS MONITOR_HOSTNAME=monitor.planet-lab.org -MONITOR_SCRIPT_ROOT=/usr/share/monitor-server -MONITOR_DATA_ROOT=/var/lib/monitor-server -MONITOR_ARCHIVE_ROOT=/var/lib/monitor-server/archive-pdb +MONITOR_SCRIPT_ROOT=/usr/share/monitor +MONITOR_DATA_ROOT=/var/lib/monitor +MONITOR_ARCHIVE_ROOT=/var/lib/monitor/archive-pdb MONITOR_IP= MONITOR_HOSTNAME= diff --git a/monitor-server.cron b/monitor-server.cron index 1433b79..86e829c 100644 --- a/monitor-server.cron +++ b/monitor-server.cron @@ -1,6 +1,6 @@ # Runs every three hours to poll all nodes and PCUs, as well as take some # automated actions for debug nodes. -01 * * * * root /usr/share/monitor-server/automate.sh 2>&1 > /usr/share/monitor-server/monitor.log +01 * * * * root /usr/share/monitor/automate.sh 2>&1 > /usr/share/monitor/monitor.log diff --git a/monitor-server.init b/monitor-server.init index fd21161..f8e5a02 100644 --- a/monitor-server.init +++ b/monitor-server.init @@ -16,7 +16,7 @@ . 
/etc/planetlab/plc_config local_config=/etc/planetlab/configs/site.xml -MONITORPATH=/usr/share/monitor-server +MONITORPATH=/usr/share/monitor # Be verbose set -x @@ -101,7 +101,8 @@ function check_templates_and_import () { # LOG IN COOKIE_FILE=/tmp/cookiejar.txt - TEMPLATES_DIR=/usr/share/monitor/zabbix/templates + rm -f ${COOKIE_FILE} + TEMPLATES_DIR=${MONITORPATH}/zabbix/templates curl -s --cookie $COOKIE_FILE --cookie-jar $COOKIE_FILE \ --form "enter=Enter" \ --form "name=Admin" \ @@ -174,9 +175,9 @@ API_AUTH_USER=${PLC_MONITOR_EMAIL} API_AUTH_PASSWORD=${PLC_MONITOR_DBPASSWORD} # SERVER PATHS -MONITOR_SCRIPT_ROOT=/usr/share/monitor-server -MONITOR_DATA_ROOT=/var/lib/monitor-server -MONITOR_ARCHIVE_ROOT=/var/lib/monitor-server/archive-pdb +MONITOR_SCRIPT_ROOT=${MONITORPATH} +MONITOR_DATA_ROOT=/var/lib/monitor +MONITOR_ARCHIVE_ROOT=/var/lib/monitor/archive-pdb MONITOR_HOSTNAME=${PLC_MONITOR_HOST} MONITOR_IP=${PLC_MONITOR_IP} diff --git a/monitor/database/dbpickle.py b/monitor/database/dbpickle.py index e795658..074ff68 100644 --- a/monitor/database/dbpickle.py +++ b/monitor/database/dbpickle.py @@ -27,6 +27,10 @@ def cachedRecently(name, length=int(config.cachetime), type=None): return true or false based on whether the modified time of the cached file is within 'length' minutes. """ + if hasattr(config, 'cachecalls') and not config.cachecalls: + # don't use cached calls if cachecalls is false + return False + try: t = lastModified(name, type) except: diff --git a/monitor/database/zabbixapi/emailZabbix.py b/monitor/database/zabbixapi/emailZabbix.py index 44a1616..3f61fff 100644 --- a/monitor/database/zabbixapi/emailZabbix.py +++ b/monitor/database/zabbixapi/emailZabbix.py @@ -12,15 +12,17 @@ class mailtxt: #print getattr(cls,f) return + # NOTE: using the literal --> ' <-- character in messages will cause the db or zabbix_server to fail. 
+ nodedown_one_subject="Server {HOSTNAME} is unreachable: First Notice" nodedown_one = """ Hello, -We hope that you're having a good day. As part of PlanetLab node monitoring, we noticed the following node is down at your site: +We hope that you are having a good day. As part of PlanetLab node monitoring, we noticed the following node is down at your site: {HOSTNAME} : Since {EVENT.AGE} -We're writing because we need your help returning them to their regular operation. +We are writing because we need your help returning them to their regular operation. To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the monitor status page to verify that your node is accessible. @@ -28,11 +30,11 @@ If the machine has booted successfully, you may check directly by logging in wit sudo /usr/sbin/vps ax -If you have a BootCD older than 3.0, you will need to create a new BootImage on CD or USB. You can find instructions for this at the Technical Contact's Guide: +If you have a BootCD older than 3.0, you will need to create a new BootImage on CD or USB. You can find instructions for this at the Technical Contacts Guide: https://www.planet-lab.org/doc/guides/bootcdsetup -There is no need to respond to this message unless there are any console messages relating to the node's failure. In this case, please report them to PlanetLab support (%(support_email)s) so we can help resolve the issue. Including this message in your reply will help us coordinate our records with the actions you've taken. +There is no need to respond to this message unless there are any console messages relating to the node failure. In this case, please report them to PlanetLab support (%(support_email)s) so we can help resolve the issue. 
Including this message in your reply will help us coordinate our records with the actions you have taken. Finally, you can track the current status of your machines using this Google Gadget: @@ -46,18 +48,18 @@ Thank you for your help, nodedown_two = """ Hello, -We hope that you're having a good day. As part of PlanetLab node monitoring, we noticed the following node is down at your site: +We hope that you are having a good day. As part of PlanetLab node monitoring, we noticed the following node is down at your site: {HOSTNAME} : Since {EVENT.AGE} -We're writing again because our previous correspondence, sent only to the registered Technical Contact, has gone unacknowledged for at least a week, and we need your help returning these machines to their regular operation. We understand that machine maintenance can take time. So, while we wait for the machines to return to their regular operation slice creation has been suspended at your site. No new slices may be created, but the existing slices and services running within them will be unaffected. +We are writing again because our previous correspondence, sent only to the registered Technical Contact, has gone unacknowledged for at least a week, and we need your help returning these machines to their regular operation. We understand that machine maintenance can take time. So, while we wait for the machines to return to their regular operation slice creation has been suspended at your site. No new slices may be created, but the existing slices and services running within them will be unaffected. To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the monitor status page to verify that your node is accessible. 
If the machine has booted successfully, you may check directly by logging in with your site_admin account, and running: sudo /usr/sbin/vps ax -If you have a BootCD older than 3.0, you will need to create a new BootImage on CD or USB. You can find instructions for this at the Technical Contact's Guide: +If you have a BootCD older than 3.0, you will need to create a new BootImage on CD or USB. You can find instructions for this at the Technical Contacts Guide: https://www.planet-lab.org/doc/guides/bootcdsetup @@ -78,11 +80,11 @@ Thank you for your help, nodedown_three =""" Hello, -We hope that you're having a good day. As part of PlanetLab node monitoring, we noticed the following node is down at your site: +We hope that you are having a good day. As part of PlanetLab node monitoring, we noticed the following node is down at your site: {HOSTNAME} : Since {EVENT.AGE} -We understand that machine maintenance can take time. We're writing again because our previous correspondences, sent first to the registered Technical Contact then the the Site PI, have gone unacknowledged for at least two weeks, and we need your help returning these machines to their regular operation. This is the third time attempting to contact someone in regard to these machines at your site. So, while we wait for the machines to return to their regular operation all current slice activity will be suspended. Current experiments will be stopped and will not be be able to start again until there is evidence that you have begun to help with the maintenance of these machines. +We understand that machine maintenance can take time. We are writing again because our previous correspondences, sent first to the registered Technical Contact then the the Site PI, have gone unacknowledged for at least two weeks, and we need your help returning these machines to their regular operation. This is the third time attempting to contact someone in regard to these machines at your site. 
So, while we wait for the machines to return to their regular operation all current slice activity will be suspended. Current experiments will be stopped and will not be be able to start again until there is evidence that you have begun to help with the maintenance of these machines. To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the monitor status page to verify that your node is accessible. @@ -90,7 +92,7 @@ If the machine has booted successfully, you may check directly by logging in wit sudo /usr/sbin/vps ax -If you have a BootCD older than 3.0, you will need to create a new BootImage on CD or USB. You can find instructions for this at the Technical Contact's Guide: +If you have a BootCD older than 3.0, you will need to create a new BootImage on CD or USB. You can find instructions for this at the Technical Contacts Guide: https://www.planet-lab.org/doc/guides/bootcdsetup @@ -106,7 +108,7 @@ Thank you for your help, nodedown_four=""" Hello, -We hope that you're having a good day. As part of PlanetLab node monitoring, we noticed the following node is down at your site: +We hope that you are having a good day. 
As part of PlanetLab node monitoring, we noticed the following node is down at your site: {HOSTNAME} : Since {EVENT.AGE} diff --git a/monitor/parser.py b/monitor/parser.py index bd15197..e3365ee 100644 --- a/monitor/parser.py +++ b/monitor/parser.py @@ -71,6 +71,17 @@ def parseSetDefaults(parser=None): "debug is enabled or for bcc when it is not") return parser +def parseSetCacheSet(parser=None): + if parser == None: + parser = OptionParser() + + parser.set_defaults(cachecalls=True, cachetime=60) + parser.add_option("", "--nocache", dest="cachecalls", action="store_false", + help="When using PLCCache objects, temporarily disable the use of caching. i.e. refresh") + parser.add_option("", "--cachetime", dest="cachetime", type="int", + help="How long to preserve a cached value. Minutes") + return parser + def parseSetNodeSets(parser=None): if parser == None: parser = OptionParser() @@ -95,6 +106,9 @@ def getParser(parsesets=[], parser=None): else: p = parser + if 'cacheset' in parsesets: + p = parseSetCacheSet(p) + if 'nodesets' in parsesets: p = parseSetNodeSets(p) if 'defaults' in parsesets: diff --git a/zabbix/templates/zabbix_templates.xml b/zabbix/templates/zabbix_templates.xml index d956f42..9495cb1 100644 --- a/zabbix/templates/zabbix_templates.xml +++ b/zabbix/templates/zabbix_templates.xml @@ -2046,7 +2046,7 @@ Processes - + Number of running processes apache 60 @@ -3340,7 +3340,7 @@ Apache is not running on {HOSTNAME} 0 - {{HOSTNAME}:proc.num[httpd].last(0)}<1 + {{HOSTNAME}:proc.num[plc_httpd].last(0)}<1 0 3 diff --git a/zabbix/zabbixsite.py b/zabbix/zabbixsite.py index e3bcfb4..241e739 100755 --- a/zabbix/zabbixsite.py +++ b/zabbix/zabbixsite.py @@ -90,6 +90,8 @@ def setup_global(): print "checking zabbix server host info" zabbixserver = Host.get_by(host="ZABBIX Server") if zabbixserver: + # TODO: verify that this works. it has failed once on fresh + # install... not sure why. 
print "Removing default Zabbix server entry" zabbixserver.delete() @@ -98,7 +100,7 @@ def setup_global(): # copying that the php code does during a host add. # NOTE: Instead, reformat any *xml.in templates and import those # during /etc/plc.d/monitor sync - for file in glob.glob("/usr/share/monitor/templates/*.xml.in"): + for file in glob.glob("%s/zabbix/templates/*.xml.in" % config.MONITOR_SCRIPT_ROOT): if 'zabbix_server' in file: buf = loadFile(file) args = {'hostname' : config.MONITOR_HOSTNAME, 'ip' : config.MONITOR_IP} @@ -110,7 +112,7 @@ def setup_global(): print "checking scripts" script1 = Script.find_or_create(name="RebootNode", set_if_new = { - 'command':"/usr/share/monitor-server/reboot.py {HOST.CONN}", + 'command':"%s/reboot.py {HOST.CONN}" % config.MONITOR_SCRIPT_ROOT, 'host_access':3 # r/w) }) script2 = Script.find_or_create(name="NMap", @@ -129,8 +131,16 @@ def setup_site(loginbase, techemail, piemail, iplist): # TODO: remove old users that are no longer in the plcdb. # TODO: consider creating two user groups for Tech & PI emails + # NOTE: setup default values for EMAIL + mailtxt.reformat({'hostname' : config.MONITOR_HOSTNAME, + 'support_email' : config.support_email}) + + # NOTE: verify arguments + if len(iplist) > 255: + raise Exception("iplist length is too long!") + BI_WEEKLY_ESC_PERIOD = int(60*60*24) - BI_WEEKLY_ESC_PERIOD = int(60) # testing... + #BI_WEEKLY_ESC_PERIOD = int(60) # testing... 
# User Group site_user_group = UsrGrp.find_or_create(name=USERGROUP_NAME % loginbase) @@ -169,10 +179,7 @@ def setup_site(loginbase, techemail, piemail, iplist): key_="system.uname", ports=10050) ) ) if dr.iprange != iplist: - if len(iplist) < 255: - dr.iprange = iplist - else: - raise Exception("iplist length is too long!") + dr.iprange = iplist # DISCOVERY ACTION for these servers @@ -228,7 +235,11 @@ def setup_site(loginbase, techemail, piemail, iplist): ] else: # TODO: verify iplist is up-to-date - pass + # NOTE: len(a.actioncondition_list) > 0 + ip_condition = a.actioncondition_list[0] + assert ip_condition.conditiontype == defines.CONDITION_TYPE_DHOST_IP + if ip_condition.value != iplist: + ip_condition.value = iplist # ESCALATION ACTION for these servers ea = Action.find_or_create(name=escalation_action_name, @@ -276,7 +287,7 @@ def setup_site(loginbase, techemail, piemail, iplist): esc_step_from=10, esc_step_to=10, esc_period=0, shortdata="", - longdata="%s:/usr/share/monitor-server/checkslices.py {HOSTNAME} disablesite" % config.MONITOR_HOSTNAME, + longdata="%s:%s/checkslices.py {HOSTNAME} disablesite" % ( config.MONITOR_HOSTNAME, config.MONITOR_SCRIPT_ROOT ), operationcondition_list=[ OperationConditionNotAck() ]), ActionOperation(operationtype=defines.OPERATION_TYPE_MESSAGE, shortdata=mailtxt.nodedown_two_subject, @@ -300,7 +311,7 @@ def setup_site(loginbase, techemail, piemail, iplist): esc_step_from=17, esc_step_to=17, esc_period=0, shortdata="", - longdata="%s:/usr/share/monitor-server/checkslices.py {HOSTNAME} disableslices" % config.MONITOR_HOSTNAME, + longdata="%s:%s/checkslices.py {HOSTNAME} disableslices" % ( config.MONITOR_HOSTNAME, config.MONITOR_SCRIPT_ROOT ), # TODO: send notice to users of slices operationcondition_list=[ OperationConditionNotAck() ]), ActionOperation(operationtype=defines.OPERATION_TYPE_MESSAGE, @@ -316,7 +327,7 @@ def setup_site(loginbase, techemail, piemail, iplist): esc_step_from=21, esc_step_to=0, 
esc_period=int(BI_WEEKLY_ESC_PERIOD*3.5), shortdata="", - longdata="%s:/usr/share/monitor-server/checkslices.py {HOSTNAME} forever" % config.MONITOR_HOSTNAME, + longdata="%s:%s/checkslices.py {HOSTNAME} forever" % ( config.MONITOR_HOSTNAME, config.MONITOR_SCRIPT_ROOT), operationcondition_list=[ OperationConditionNotAck() ]), ActionOperation(operationtype=defines.OPERATION_TYPE_MESSAGE, shortdata=mailtxt.nodedown_four_subject, diff --git a/zabbix/zabbixsync.py b/zabbix/zabbixsync.py index 3af935b..1c0e405 100755 --- a/zabbix/zabbixsync.py +++ b/zabbix/zabbixsync.py @@ -38,7 +38,7 @@ def add_loginbase(loginbase): if __name__=="__main__": from monitor import parser as parsermodule - parser = parsermodule.getParser() + parser = parsermodule.getParser(['cacheset']) parser.set_defaults( setupglobal=False, syncsite=True, site=None) parser.add_option("", "--setupglobal", action="store_true", dest="setupglobal", help="Setup global settings.") @@ -53,7 +53,7 @@ if __name__=="__main__": session.flush() if opts.syncsite: - api = plc.getAuthAPI() + api = plc.getCachedAuthAPI() query = {'peer_id' : None} if opts.site: query.update({'login_base' : opts.site}) -- 2.43.0