From 3b89b6f104b7e4e93e4c32a4d188664bb6b3e34f Mon Sep 17 00:00:00 2001 From: root Date: Fri, 6 May 2011 20:33:44 +0000 Subject: [PATCH] First of a series of significant changes to how monitor is organized. mailer.py -- uses CC rather than AdminCC to filter messages that are copied to our support list. Requires additional scrips in RT. controllers_local.py -- supports local extensions to the web interface. --- Monitor.spec | 27 +- commands/findbad.py | 2 +- commands/findbadpcu.py | 2 +- commands/policy.py | 636 +++++++++--------- monitor/wrapper/mailer.py | 12 +- monitor/wrapper/plc.py | 3 +- web/MonitorWeb/monitorweb/controllers.py | 11 +- .../monitorweb/controllers_local.py | 14 + .../monitorweb/templates/master.kid | 2 +- .../monitorweb/templates/pculist_plain.tmpl | 4 + .../monitorweb/templates/sitemenu.kid | 12 +- .../monitorweb/templates/welcome.kid | 55 +- 12 files changed, 425 insertions(+), 355 deletions(-) create mode 100644 web/MonitorWeb/monitorweb/controllers_local.py create mode 100644 web/MonitorWeb/monitorweb/templates/pculist_plain.tmpl diff --git a/Monitor.spec b/Monitor.spec index 32ecb44..5b79c11 100644 --- a/Monitor.spec +++ b/Monitor.spec @@ -99,8 +99,11 @@ Requires: MySQL-python Requires: nmap Requires: nc Requires: rt3 +Requires: traceroute Requires: plewww-plekit +Requires: pcucontrol +Requires: TurboGears #Requires: zabbix-client #Requires: zabbix-gui @@ -118,8 +121,8 @@ Requires: python Requires: monitor-server-deps Requires: PLCWWW >= 4.2 -Requires: bootcd-%{pldistro}-%{distroname}-%{_arch} >= 5.0 -Requires: pcucontrol +# NOTE: removed b/c 'distroname' gets corrupted during build process. +# Requires: bootcd-%{pldistro}-%{distroname}-%{_arch} >= 5.0 %description server The server side include all python modules and scripts needed to fully @@ -277,9 +280,9 @@ rm -rf $RPM_BUILD_ROOT # by fedora 8 (our current deployment) doesn't match the version # requirements. export TMPDIR=/var/tmp/ -easy_install -UZ http://files.turbogears.org/eggs/TurboGears-1.0.7-py2.5.egg -easy_install -UZ http://pypi.python.org/packages/source/S/SQLAlchemy/SQLAlchemy-0.5.3.tar.gz -easy_install -UZ Elixir +#easy_install -UZ http://files.turbogears.org/eggs/TurboGears-1.0.7-py2.5.egg +#easy_install -UZ http://pypi.python.org/packages/source/S/SQLAlchemy/SQLAlchemy-0.5.3.tar.gz +#easy_install -UZ Elixir # crazy openssl libs for racadm binary ln -s /lib/libssl.so.0.9.8b /usr/lib/libssl.so.2 @@ -308,20 +311,6 @@ if ! plc-config --category plc_monitor --variable rt_queue ; then --save /etc/planetlab/configs/site.xml /etc/planetlab/configs/site.xml fi -# zabbix: -if ! plc-config --category plc_zabbix --variable enabled ; then - plc-config --category plc_zabbix --variable enabled --value false \ - --save /etc/planetlab/configs/site.xml /etc/planetlab/configs/site.xml -fi -if ! plc-config --category plc_zabbix --variable host ; then - plc-config --category plc_zabbix --variable host --value localhost.localdomain \ - --save /etc/planetlab/configs/site.xml /etc/planetlab/configs/site.xml -fi -if ! plc-config --category plc_zabbix --variable ip ; then - plc-config --category plc_zabbix --variable ip --value "" \ - --save /etc/planetlab/configs/site.xml /etc/planetlab/configs/site.xml -fi - %post nagios # TODO: do as much as possible to get the host setup and running. 
#chkconfig --add monitor-nagios diff --git a/commands/findbad.py b/commands/findbad.py index b76df4c..0302c07 100755 --- a/commands/findbad.py +++ b/commands/findbad.py @@ -48,7 +48,7 @@ def checkAndRecordState(l_nodes, cohash): global global_round global count - tp = threadpool.ThreadPool(20) + tp = threadpool.ThreadPool(10) scannode = ScanNodeInternal(global_round) # CREATE all the work requests diff --git a/commands/findbadpcu.py b/commands/findbadpcu.py index 9eb3be7..b3896cf 100755 --- a/commands/findbadpcu.py +++ b/commands/findbadpcu.py @@ -117,7 +117,7 @@ def main(): for s in sites: node_ids += s['node_ids'] - l_nodes = plccache.GetNodeByIds(node_ids) + l_nodes = plccache.GetNodesByIds(node_ids) pcus = [] for node in l_nodes: pcus += node['pcu_ids'] diff --git a/commands/policy.py b/commands/policy.py index 30b522a..7f8c5a2 100755 --- a/commands/policy.py +++ b/commands/policy.py @@ -34,324 +34,332 @@ api = plc.getAuthAPI() def logic(): - plc.nodeBootState(host, 'reinstall') - node_end_record(host) + plc.nodeBootState(host, 'reinstall') + node_end_record(host) def check_node_and_pcu_status_for(loginbase): - """ - this function checks whether all the nodes and associated pcus for a - given site are considered 'good'. - - If so, the function returns True. - Otherwise, the function returns False. - """ - - results = [] - for node in plccache.plcdb_lb2hn[loginbase]: - - noderec = FindbadNodeRecord.findby_or_create(hostname=node['hostname']) - nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname']) - nodebl = BlacklistRecord.get_by(hostname=node['hostname']) - pcuhist = HistoryPCURecord.get_by(plc_pcuid=noderec.plc_pcuid) - - if (nodehist is not None and nodehist.status == 'good' and \ - ((pcuhist is not None and pcuhist.status == 'good') or (pcuhist is None)) ): - if nodebl is None: # no entry in blacklist table - results.append(True) - elif nodebl is not None and nodebl.expired(): # expired entry in blacklist table - results.append(True) - else: - results.append(False) # entry that is not expired. - else: - results.append(False) - - try: - print "test: %s" % results - # NOTE: incase results is empty, reduce does not work on an empty set. - return reduce(lambda x,y: x&y, results) and len(results) > MINUP - except: - return False + """ + this function checks whether all the nodes and associated pcus for a + given site are considered 'good'. + + If so, the function returns True. + Otherwise, the function returns False. + """ + + results = [] + for node in plccache.plcdb_lb2hn[loginbase]: + + noderec = FindbadNodeRecord.findby_or_create(hostname=node['hostname']) + nodehist = HistoryNodeRecord.findby_or_create(hostname=node['hostname']) + nodebl = BlacklistRecord.get_by(hostname=node['hostname']) + pcuhist = HistoryPCURecord.get_by(plc_pcuid=noderec.plc_pcuid) + + if (nodehist is not None and nodehist.status == 'good' and \ + ((pcuhist is not None and pcuhist.status == 'good') or (pcuhist is None)) ): + if nodebl is None: # no entry in blacklist table + results.append(True) + elif nodebl is not None and nodebl.expired(): # expired entry in blacklist table + results.append(True) + else: + results.append(False) # entry that is not expired. + else: + results.append(False) + + try: + print "test: %s" % results + # NOTE: incase results is empty, reduce does not work on an empty set. 
+ return reduce(lambda x,y: x&y, results) and len(results) > MINUP + except: + return False def main(hostnames, sitenames): - # commands: - i = 1 - node_count = 1 - site_count = 1 - #print "hosts: %s" % hostnames - print "apply-policy" - for i,host in enumerate(hostnames): - try: - lb = plccache.plcdb_hn2lb[host] - except: - print "unknown host in plcdb_hn2lb %s" % host - email_exception("%s %s" % (i,host)) - continue - - nodeblack = BlacklistRecord.get_by(hostname=host) - - if nodeblack and not nodeblack.expired(): - print "skipping %s due to blacklist. will expire %s" % (host, nodeblack.willExpire() ) - continue - - sitehist = SiteInterface.get_or_make(loginbase=lb) - - recent_actions = sitehist.getRecentActions(hostname=host) - - nodehist = HistoryNodeRecord.findby_or_create(hostname=host) - - print "%s %s %s" % (i, nodehist.hostname, nodehist.status) - if nodehist.status == 'good' and \ - changed_lessthan(nodehist.last_changed, 1.0) and \ - found_within(recent_actions, 'down_notice', 7.0) and \ - not found_within(recent_actions, 'online_notice', 0.5): - # NOTE: chronicly flapping nodes will not get 'online' notices - # since, they are never up long enough to be 'good'. - # NOTE: searching for down_notice proves that the node has - # gone through a 'down' state first, rather than just - # flapping through: good, offline, online, ... - # - # NOTE: there is a narrow window in which this command must be - # evaluated, otherwise the notice will not go out. - # this is not ideal. - sitehist.sendMessage('online_notice', hostname=host, viart=False, saveact=True) - print "send message for host %s online" % host - - - # if a node is offline and doesn't have a PCU, remind the user that they should have one. - #if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ - # changed_greaterthan(nodehist.last_changed,1.0) and \ - # not found_within(recent_actions, 'pcumissing_notice', 7.0): - # - # sitehist.sendMessage('pcumissing_notice', hostname=host) - # print "send message for host %s pcumissing_notice" % host - - # if it is offline and HAS a PCU, then try to use it. - if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ - changed_greaterthan(nodehist.last_changed,1.0) and \ - not nodehist.firewall and \ - not found_between(recent_actions, 'try_reboot', 3.5, 1): - - # TODO: there MUST be a better way to do this... 
- # get fb node record for pcuid - fbpcu = None - fbnode = FindbadNodeRecord.get_latest_by(hostname=host) - if fbnode: - fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid) - - sitehist.attemptReboot(host) - print "send message for host %s try_reboot" % host - if False and not fbpcu.test_is_ok() and \ - not found_within(recent_actions, 'pcuerror_notice', 3.0): - - args = {} - if fbpcu: - args['pcu_name'] = fbpcu.pcu_name() - args['pcu_errors'] = fbpcu.pcu_errors() - args['plc_pcuid'] = fbpcu.plc_pcuid - else: - args['pcu_name'] = "error looking up pcu name" - args['pcu_errors'] = "" - args['plc_pcuid'] = 0 - - args['hostname'] = host - sitehist.sendMessage('pcuerror_notice', **args) - print "send message for host %s PCU Failure" % host - - - # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1) - # will be false for a day after the above condition is satisfied - if False and nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ - changed_greaterthan(nodehist.last_changed,1.5) and \ - not nodehist.firewall and \ - found_between(recent_actions, 'try_reboot', 3.5, 1) and \ - not found_within(recent_actions, 'pcufailed_notice', 3.5): - - # TODO: there MUST be a better way to do this... - # get fb node record for pcuid - fbpcu = None - fbnode = FindbadNodeRecord.get_latest_by(hostname=host) - if fbnode: - fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid) - if fbpcu: - pcu_name = fbpcu.pcu_name() - else: - pcu_name = "error looking up pcu name" - - # get fb pcu record for pcuid - # send pcu failure message - sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name) - print "send message for host %s PCU Failure" % host - - if nodehist.status == 'failboot' and \ - changed_greaterthan(nodehist.last_changed, 0.25) and \ - not found_between(recent_actions, 'bootmanager_restore', 0.5, 0): - # send down node notice - # delay 0.5 days before retrying... - - print "send message for host %s bootmanager_restore" % host - sitehist.runBootManager(host) - # sitehist.sendMessage('retry_bootman', hostname=host) - - if nodehist.status == 'down' and \ - changed_greaterthan(nodehist.last_changed, 2): - if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5): - # send down node notice - sitehist.sendMessage('down_notice', hostname=host) - print "send message for host %s down" % host - - #if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5): - # send down node notice - #email_exception(host, "firewall_notice") - # sitehist.sendMessage('firewall_notice', hostname=host) - # print "send message for host %s down" % host - - node_count = node_count + 1 - print "time: ", time.strftime('%Y-%m-%d %H:%M:%S') - sys.stdout.flush() - session.flush() - - for i,site in enumerate(sitenames): - sitehist = SiteInterface.get_or_make(loginbase=site) - siteblack = BlacklistRecord.get_by(loginbase=site) - skip_due_to_blacklist=False - - if siteblack and not siteblack.expired(): - print "skipping %s due to blacklist. will expire %s" % (site, siteblack.willExpire() ) - skip_due_to_blacklist=True - sitehist.clearPenalty() - sitehist.applyPenalty() - continue - - # TODO: make query only return records within a certin time range, - # i.e. greater than 0.5 days ago. or 5 days, etc. 
- recent_actions = sitehist.getRecentActions(loginbase=site) - - print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status) - - if sitehist.db.status == 'down': - if sitehist.db.penalty_pause and \ - changed_greaterthan(sitehist.db.penalty_pause_time, 30): - - email_exception("", "clear pause penalty for site: %s" % sitehist.db.loginbase) - sitehist.closeTicket() - # NOTE: but preserve the penalty status. - sitehist.clearPenaltyPause() - - if sitehist.db.message_id != 0 and \ - sitehist.db.message_status == 'open' and \ - not sitehist.db.penalty_pause: - - email_exception("", "pause penalty for site: %s" % sitehist.db.loginbase) - sitehist.setPenaltyPause() - - if not sitehist.db.penalty_pause and \ - not found_within(recent_actions, 'increase_penalty', 7) and \ - changed_greaterthan(sitehist.db.last_changed, 7): - - # TODO: catch errors - sitehist.increasePenalty() - sitehist.applyPenalty() - sitehist.sendMessage('increase_penalty') - - print "send message for site %s penalty increase" % site - - if sitehist.db.status == 'good': - # clear penalty - # NOTE: because 'all clear' should have an indefinite status, we - # have a boolean value rather than a 'recent action' - if sitehist.db.penalty_applied or sitehist.db.penalty_pause: - # send message that penalties are cleared. - - sitehist.clearPenalty() - sitehist.applyPenalty() - sitehist.sendMessage('clear_penalty') - sitehist.closeTicket() - - print "send message for site %s penalty cleared" % site - - # check all nodes and pcus for this site; if they're all ok, - # close the ticket, else leave it open. - # NOTE: in the case where a PCU reboots and fails, a message is - # sent, but the PCU may appear to be ok according to tests. - # NOTE: Also, bootmanager sends messages regarding disks, - # configuration, etc. So, the conditions here are 'good' - # rather than 'not down' as it is in sitebad. - close_ticket = check_node_and_pcu_status_for(site) - if close_ticket: - sitehist.closeTicket() - - site_count = site_count + 1 - - print "time: ", time.strftime('%Y-%m-%d %H:%M:%S') - sys.stdout.flush() - session.flush() - - session.flush() - return + # commands: + i = 1 + node_count = 1 + site_count = 1 + #print "hosts: %s" % hostnames + print "apply-policy" + for i,host in enumerate(hostnames): + try: + lb = plccache.plcdb_hn2lb[host] + except: + print "unknown host in plcdb_hn2lb %s" % host + email_exception("%s %s" % (i,host)) + continue + + nodeblack = BlacklistRecord.get_by(hostname=host) + + if nodeblack and not nodeblack.expired(): + print "skipping %s due to blacklist. will expire %s" % (host, nodeblack.willExpire() ) + continue + + sitehist = SiteInterface.get_or_make(loginbase=lb) + + recent_actions = sitehist.getRecentActions(hostname=host) + + nodehist = HistoryNodeRecord.findby_or_create(hostname=host) + + print "%s %s %s" % (i, nodehist.hostname, nodehist.status) + if nodehist.status == 'good' and \ + changed_lessthan(nodehist.last_changed, 1.0) and \ + found_within(recent_actions, 'down_notice', 7.0) and \ + not found_within(recent_actions, 'online_notice', 0.5): + # NOTE: chronicly flapping nodes will not get 'online' notices + # since, they are never up long enough to be 'good'. + # NOTE: searching for down_notice proves that the node has + # gone through a 'down' state first, rather than just + # flapping through: good, offline, online, ... + # + # NOTE: there is a narrow window in which this command must be + # evaluated, otherwise the notice will not go out. + # this is not ideal. 
+ sitehist.sendMessage('online_notice', hostname=host, viart=False, saveact=True) + print "send message for host %s online" % host + + + # if a node is offline and doesn't have a PCU, remind the user that they should have one. + #if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ + # changed_greaterthan(nodehist.last_changed,1.0) and \ + # not found_within(recent_actions, 'pcumissing_notice', 7.0): + # + # sitehist.sendMessage('pcumissing_notice', hostname=host) + # print "send message for host %s pcumissing_notice" % host + + # if it is offline and HAS a PCU, then try to use it. + if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ + changed_greaterthan(nodehist.last_changed,1.0) and \ + not nodehist.firewall and \ + not found_between(recent_actions, 'try_reboot', 3.5, 1): + + # TODO: there MUST be a better way to do this... + # get fb node record for pcuid + fbpcu = None + fbnode = FindbadNodeRecord.get_latest_by(hostname=host) + if fbnode: + fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid) + + sitehist.attemptReboot(host) + print "send message for host %s try_reboot" % host + if False and not fbpcu.test_is_ok() and \ + not found_within(recent_actions, 'pcuerror_notice', 3.0): + + args = {} + if fbpcu: + args['pcu_name'] = fbpcu.pcu_name() + args['pcu_errors'] = fbpcu.pcu_errors() + args['plc_pcuid'] = fbpcu.plc_pcuid + else: + args['pcu_name'] = "error looking up pcu name" + args['pcu_errors'] = "" + args['plc_pcuid'] = 0 + + args['hostname'] = host + sitehist.sendMessage('pcuerror_notice', **args) + print "send message for host %s PCU Failure" % host + + + # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1) + # will be false for a day after the above condition is satisfied + if False and nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ + changed_greaterthan(nodehist.last_changed,1.5) and \ + not nodehist.firewall and \ + found_between(recent_actions, 'try_reboot', 3.5, 1) and \ + not found_within(recent_actions, 'pcufailed_notice', 3.5): + + # TODO: there MUST be a better way to do this... + # get fb node record for pcuid + fbpcu = None + fbnode = FindbadNodeRecord.get_latest_by(hostname=host) + if fbnode: + fbpcu = FindbadPCURecord.get_latest_by(plc_pcuid=fbnode.plc_pcuid) + if fbpcu: + pcu_name = fbpcu.pcu_name() + else: + pcu_name = "error looking up pcu name" + + # get fb pcu record for pcuid + # send pcu failure message + sitehist.sendMessage('pcufailed_notice', hostname=host, pcu_name=pcu_name) + print "send message for host %s PCU Failure" % host + + if nodehist.status == 'failboot' and \ + changed_greaterthan(nodehist.last_changed, 0.25) and \ + not found_between(recent_actions, 'bootmanager_restore', 0.5, 0): + # send down node notice + # delay 0.5 days before retrying... 
+ + print "send message for host %s bootmanager_restore" % host + sitehist.runBootManager(host) + # sitehist.sendMessage('retry_bootman', hostname=host) + + if nodehist.status == 'down' and \ + changed_greaterthan(nodehist.last_changed, 2): + if not nodehist.firewall and not found_within(recent_actions, 'down_notice', 3.5): + # send down node notice + sitehist.sendMessage('down_notice', hostname=host) + print "send message for host %s down" % host + + #if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5): + # send down node notice + #email_exception(host, "firewall_notice") + # sitehist.sendMessage('firewall_notice', hostname=host) + # print "send message for host %s down" % host + + node_count = node_count + 1 + print "time: ", time.strftime('%Y-%m-%d %H:%M:%S') + sys.stdout.flush() + session.flush() + + for i,site in enumerate(sitenames): + sitehist = SiteInterface.get_or_make(loginbase=site) + siteblack = BlacklistRecord.get_by(loginbase=site) + skip_due_to_blacklist=False + + try: + site_exempt = plc.isSiteExempt(site) + except: + site_exempt = False + + if siteblack and not siteblack.expired() or site_exempt: + if siteblack: + print "skipping %s due to blacklist. will expire %s" % (site, siteblack.willExpire() ) + else: + print "skipping %s due to blacklist." % (site) + skip_due_to_blacklist=True + sitehist.clearPenalty() + sitehist.applyPenalty() + continue + + # TODO: make query only return records within a certin time range, + # i.e. greater than 0.5 days ago. or 5 days, etc. + recent_actions = sitehist.getRecentActions(loginbase=site) + + print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status) + + if sitehist.db.status == 'down': + if sitehist.db.penalty_pause and \ + changed_greaterthan(sitehist.db.penalty_pause_time, 30): + + email_exception("", "clear pause penalty for site: %s" % sitehist.db.loginbase) + sitehist.closeTicket() + # NOTE: but preserve the penalty status. + sitehist.clearPenaltyPause() + + if sitehist.db.message_id != 0 and \ + sitehist.db.message_status == 'open' and \ + not sitehist.db.penalty_pause: + + email_exception("", "pause penalty for site: %s" % sitehist.db.loginbase) + sitehist.setPenaltyPause() + + if not sitehist.db.penalty_pause and \ + not found_within(recent_actions, 'increase_penalty', 7) and \ + changed_greaterthan(sitehist.db.last_changed, 7): + + # TODO: catch errors + sitehist.increasePenalty() + sitehist.applyPenalty() + sitehist.sendMessage('increase_penalty') + + print "send message for site %s penalty increase" % site + + if sitehist.db.status == 'good': + # clear penalty + # NOTE: because 'all clear' should have an indefinite status, we + # have a boolean value rather than a 'recent action' + if sitehist.db.penalty_applied or sitehist.db.penalty_pause: + # send message that penalties are cleared. + + sitehist.clearPenalty() + sitehist.applyPenalty() + sitehist.sendMessage('clear_penalty') + sitehist.closeTicket() + + print "send message for site %s penalty cleared" % site + + # check all nodes and pcus for this site; if they're all ok, + # close the ticket, else leave it open. + # NOTE: in the case where a PCU reboots and fails, a message is + # sent, but the PCU may appear to be ok according to tests. + # NOTE: Also, bootmanager sends messages regarding disks, + # configuration, etc. So, the conditions here are 'good' + # rather than 'not down' as it is in sitebad. 
+ close_ticket = check_node_and_pcu_status_for(site) + if close_ticket: + sitehist.closeTicket() + + site_count = site_count + 1 + + print "time: ", time.strftime('%Y-%m-%d %H:%M:%S') + sys.stdout.flush() + session.flush() + + session.flush() + return if __name__ == "__main__": - parser = parsermodule.getParser(['nodesets']) - parser.set_defaults( timewait=0, - skip=0, - rins=False, - reboot=False, - findbad=False, - force=False, - nosetup=False, - verbose=False, - quiet=False,) - - parser.add_option("", "--stopselect", dest="stopselect", metavar="", - help="The select string that must evaluate to true for the node to be considered 'done'") - parser.add_option("", "--findbad", dest="findbad", action="store_true", - help="Re-run findbad on the nodes we're going to check before acting.") - parser.add_option("", "--force", dest="force", action="store_true", - help="Force action regardless of previous actions/logs.") - parser.add_option("", "--rins", dest="rins", action="store_true", - help="Set the boot_state to 'rins' for all nodes.") - parser.add_option("", "--reboot", dest="reboot", action="store_true", - help="Actively try to reboot the nodes, keeping a log of actions.") - - parser.add_option("", "--verbose", dest="verbose", action="store_true", - help="Extra debug output messages.") - parser.add_option("", "--nosetup", dest="nosetup", action="store_true", - help="Do not perform the orginary setup phase.") - parser.add_option("", "--skip", dest="skip", - help="Number of machines to skip on the input queue.") - parser.add_option("", "--timewait", dest="timewait", - help="Minutes to wait between iterations of 10 nodes.") - - parser = parsermodule.getParser(['defaults'], parser) - config = parsermodule.parse_args(parser) - - fbquery = HistoryNodeRecord.query.all() - hostnames = [ n.hostname for n in fbquery ] - - fbquery = HistorySiteRecord.query.all() - sitenames = [ s.loginbase for s in fbquery ] - - if config.site: - # TODO: replace with calls to local db. the api fails so often that - # these calls should be regarded as unreliable. - l_nodes = plccache.GetNodesBySite(config.site) - filter_hostnames = [ n['hostname'] for n in l_nodes ] - - hostnames = filter(lambda x: x in filter_hostnames, hostnames) - sitenames = [config.site] - - if config.node: - hostnames = [ config.node ] - sitenames = [ plccache.plcdb_hn2lb[config.node] ] - - try: - main(hostnames, sitenames) - session.flush() - except KeyboardInterrupt: - print "Killed by interrupt" - session.flush() - sys.exit(0) - except: - email_exception() - print traceback.print_exc(); - print "fail all..." 
+ parser = parsermodule.getParser(['nodesets']) + parser.set_defaults( timewait=0, + skip=0, + rins=False, + reboot=False, + findbad=False, + force=False, + nosetup=False, + verbose=False, + quiet=False,) + + parser.add_option("", "--stopselect", dest="stopselect", metavar="", + help="The select string that must evaluate to true for the node to be considered 'done'") + parser.add_option("", "--findbad", dest="findbad", action="store_true", + help="Re-run findbad on the nodes we're going to check before acting.") + parser.add_option("", "--force", dest="force", action="store_true", + help="Force action regardless of previous actions/logs.") + parser.add_option("", "--rins", dest="rins", action="store_true", + help="Set the boot_state to 'rins' for all nodes.") + parser.add_option("", "--reboot", dest="reboot", action="store_true", + help="Actively try to reboot the nodes, keeping a log of actions.") + + parser.add_option("", "--verbose", dest="verbose", action="store_true", + help="Extra debug output messages.") + parser.add_option("", "--nosetup", dest="nosetup", action="store_true", + help="Do not perform the orginary setup phase.") + parser.add_option("", "--skip", dest="skip", + help="Number of machines to skip on the input queue.") + parser.add_option("", "--timewait", dest="timewait", + help="Minutes to wait between iterations of 10 nodes.") + + parser = parsermodule.getParser(['defaults'], parser) + config = parsermodule.parse_args(parser) + + fbquery = HistoryNodeRecord.query.all() + hostnames = [ n.hostname for n in fbquery ] + + fbquery = HistorySiteRecord.query.all() + sitenames = [ s.loginbase for s in fbquery ] + + if config.site: + # TODO: replace with calls to local db. the api fails so often that + # these calls should be regarded as unreliable. + l_nodes = plccache.GetNodesBySite(config.site) + filter_hostnames = [ n['hostname'] for n in l_nodes ] + + hostnames = filter(lambda x: x in filter_hostnames, hostnames) + sitenames = [config.site] + + if config.node: + hostnames = [ config.node ] + sitenames = [ plccache.plcdb_hn2lb[config.node] ] + + try: + main(hostnames, sitenames) + session.flush() + except KeyboardInterrupt: + print "Killed by interrupt" + session.flush() + sys.exit(0) + except: + email_exception() + print traceback.print_exc(); + print "fail all..." diff --git a/monitor/wrapper/mailer.py b/monitor/wrapper/mailer.py index 34a8553..afa72eb 100755 --- a/monitor/wrapper/mailer.py +++ b/monitor/wrapper/mailer.py @@ -84,7 +84,7 @@ def setAdminCCViaRT(ticket_id, to): # create a comma-separated list s_to = ",".join(to) - cmd = "rt edit ticket/%s set admincc='%s'" % (ticket_id, s_to) + cmd = "rt edit ticket/%s set cc='%s'" % (ticket_id, s_to) print cmd (f_in, f_out, f_err) = os.popen3(cmd) value = f_out.read() @@ -95,7 +95,7 @@ def setAdminCCViaRT(ticket_id, to): pass else: print "VALUE:", value - print "ERROR: RT failed to update AdminCC for ticket %s" % ticket_id + print "ERROR: RT failed to update CC for ticket %s" % ticket_id return @@ -229,15 +229,15 @@ def emailViaRT_NoTicket(subject, text, to): # Set ENV Variables/PATH _setupRTenvironment() - # NOTE: AdminCc: (in PLC's RT configuration) gets an email sent. + # NOTE: Cc: (in PLC's RT configuration) gets an email sent. 
# This is not the case (surprisingly) for Cc: input_text = "Subject: %s\n" input_text += "Requestor: %s\n"% FROM input_text += "id: ticket/new\n" input_text += "Queue: %s\n" % config.RT_QUEUE for recipient in to: - input_text += "AdminCc: %s\n" % recipient - #input_text += "AdminCc: %s\n" % config.cc_email + input_text += "Cc: %s\n" % recipient + #input_text += "Cc: %s\n" % config.cc_email input_text += "Text: %s" # Add a space for each new line to get RT to accept the file. @@ -366,4 +366,4 @@ if __name__=="__main__": import smtplib import emailTxt import plc - emailViaRT("mail via RT", "Let's see if this succeeds...", [FROM]) + emailViaRT("test monitor mail 9", "Let's see if this succeeds...", ["stephen.soltesz@gmail.com"]) diff --git a/monitor/wrapper/plc.py b/monitor/wrapper/plc.py index 00632bf..ae37851 100644 --- a/monitor/wrapper/plc.py +++ b/monitor/wrapper/plc.py @@ -412,7 +412,8 @@ def enableSiteSlices(loginbase): for attr in l_attr: if "enabled" == attr['tagname'] and attr['value'] == "0": logger.info("Deleted enable=0 attribute from slice %s" % slice) - api.DeleteSliceTag(auth.auth, attr['slice_tag_id']) + if not isSliceExempt(slice): + api.DeleteSliceTag(auth.auth, attr['slice_tag_id']) except Exception, exc: logger.info("enableSiteSlices: %s" % exc) print "exception: %s" % exc diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index 6941327..68df2eb 100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -11,6 +11,7 @@ import os from monitor.database.info.model import * #from monitor.database.zabbixapi.model import * from monitor_xmlrpc import MonitorXmlrpcServer +from controllers_local import LocalExtensions from monitor import util from monitor import reboot @@ -59,6 +60,7 @@ class NodeQueryFields(widgets.WidgetsList): uptime = widgets.CheckBox(label="Uptime") traceroute = widgets.CheckBox(label="Traceroute") port_status = widgets.CheckBox(label="Port Status") + plc_pcuid = widgets.CheckBox(label="PCU ID") rpms = widgets.CheckBox(label="RPM") rpmvalue = widgets.TextField(label="RPM Pattern") @@ -264,7 +266,7 @@ def prep_node_for_display(node, pcuhash=None, preppcu=True, asofdate=None): return agg -class Root(controllers.RootController, MonitorXmlrpcServer): +class Root(controllers.RootController, MonitorXmlrpcServer, LocalExtensions): @expose(template="monitorweb.templates.welcome") def index(self): # log.debug("Happy TurboGears Controller Responding For Duty") @@ -669,7 +671,7 @@ class Root(controllers.RootController, MonitorXmlrpcServer): query = [] if pcu_id: fbnode = HistoryPCURecord.get_by(plc_pcuid=pcu_id) - l = fbnode.versions[-100:] + l = fbnode.versions[-1000:] l.reverse() for pcu in l: #prep_node_for_display(node) @@ -714,13 +716,15 @@ class Root(controllers.RootController, MonitorXmlrpcServer): if loginbase: fbsite = HistorySiteRecord.get_by(loginbase=loginbase) # TODO: add links for earlier history if desired. 
- l = fbsite.versions[-100:] + l = fbsite.versions[-1000:] l.reverse() for site in l: query.append(site) return dict(query=query, loginbase=loginbase) + @expose("cheetah:monitorweb.templates.pculist_plain", as_format="plain", + accept_format="text/plain", content_type="text/plain") @expose(template="monitorweb.templates.pculist") def pcu(self, filter='all'): print "PCUVIEW------------------" @@ -898,3 +902,4 @@ class Root(controllers.RootController, MonitorXmlrpcServer): print "redirecting 3" return dict() + diff --git a/web/MonitorWeb/monitorweb/controllers_local.py b/web/MonitorWeb/monitorweb/controllers_local.py new file mode 100644 index 0000000..09ad5e9 --- /dev/null +++ b/web/MonitorWeb/monitorweb/controllers_local.py @@ -0,0 +1,14 @@ +import sys +import xmlrpclib +import cherrypy +import turbogears +from datetime import datetime, timedelta +import time +from monitor.wrapper import plc +import os, errno + +class LocalExtensions(object): + + @cherrypy.expose() + def example(self, **keywords): + pass diff --git a/web/MonitorWeb/monitorweb/templates/master.kid b/web/MonitorWeb/monitorweb/templates/master.kid index a786f33..5900e00 100644 --- a/web/MonitorWeb/monitorweb/templates/master.kid +++ b/web/MonitorWeb/monitorweb/templates/master.kid @@ -41,7 +41,7 @@

 MyOps is an open source monitoring and management framework for MyPLC
-Copyright © 2007-2009 Princeton
+Copyright © 2007-2011 Princeton

diff --git a/web/MonitorWeb/monitorweb/templates/pculist_plain.tmpl b/web/MonitorWeb/monitorweb/templates/pculist_plain.tmpl new file mode 100644 index 0000000..5851488 --- /dev/null +++ b/web/MonitorWeb/monitorweb/templates/pculist_plain.tmpl @@ -0,0 +1,4 @@ +pcuname,status,username,password,model +#for pcu in $query +$pcu.pcu.plc_pcu_stats['ip'],$pcu.status,$pcu.pcu.plc_pcu_stats['username'],$pcu.pcu.plc_pcu_stats['password'],$pcu.pcu.plc_pcu_stats['model'] +#end for diff --git a/web/MonitorWeb/monitorweb/templates/sitemenu.kid b/web/MonitorWeb/monitorweb/templates/sitemenu.kid index 7dc567c..12808dd 100644 --- a/web/MonitorWeb/monitorweb/templates/sitemenu.kid +++ b/web/MonitorWeb/monitorweb/templates/sitemenu.kid @@ -66,6 +66,16 @@ - + + diff --git a/web/MonitorWeb/monitorweb/templates/welcome.kid b/web/MonitorWeb/monitorweb/templates/welcome.kid index 4328642..695fccd 100644 --- a/web/MonitorWeb/monitorweb/templates/welcome.kid +++ b/web/MonitorWeb/monitorweb/templates/welcome.kid @@ -5,39 +5,77 @@ from links import * + Welcome to MyOps + + + - -
-
    + + +
    +
    +

    + MyOps observes the operational state of PlanetLab and enforces global + policies at member sites. MyOps observes three principle objects in + PlanetLab: Sites, Nodes, and Power Control Units (PCUs). Based on + the observed state of these objects, MyOps applies policies to member + sites that are expected to improve the operational state of the whole + system. +

    + +

    If you'd like to track things a little more informally, you can install these Google Gadgets for summaries of the entire system or a specific site.

    @@ -45,6 +83,7 @@ from links import *
  1. MyOps Summary
  2. Site Summary
  3. +
    -- 2.43.0
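
Note on the controllers_local.py change: Root (in controllers.py) now also inherits
LocalExtensions, so a deployment can add its own pages by exposing methods in
controllers_local.py without touching controllers.py. Below is a minimal sketch of
such an extension, assuming the TurboGears 1.x / CherryPy setup used above; the
down_nodes method and its plain-text output are illustrative only and are not part
of this patch — the patch itself only ships the empty example() stub.

    # controllers_local.py (hypothetical local extension, not part of this patch)
    import cherrypy

    # HistoryNodeRecord is the Elixir model already used by commands/policy.py
    # and web/MonitorWeb/monitorweb/controllers.py.
    from monitor.database.info.model import HistoryNodeRecord


    class LocalExtensions(object):
        """Mixed into Root, so every exposed method becomes a URL on the site."""

        @cherrypy.expose()
        def down_nodes(self, **keywords):
            # Same query style as commands/policy.py: fetch all node history
            # records, keep only those currently marked 'down', and return a
            # plain-text hostname list.
            records = [r for r in HistoryNodeRecord.query.all()
                       if r.status == 'down']
            return "\n".join(r.hostname for r in records)

With Root inheriting LocalExtensions, the method above would be reachable at
/down_nodes, mirroring how the example() stub in this patch is exposed.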