From: Stephen Soltesz Date: Thu, 2 Apr 2009 17:57:58 +0000 (+0000) Subject: add loginbase to blacklist X-Git-Tag: Monitor-2.0-9~4 X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=commitdiff_plain;h=924b7e5c530ecf25d4c5b002fa89ff73ef11f53c add loginbase to blacklist add requires to pcucontrol module checked new-site-enable policy --- diff --git a/Monitor.spec b/Monitor.spec index 3c2a2d6..fe9e2b2 100644 --- a/Monitor.spec +++ b/Monitor.spec @@ -79,7 +79,12 @@ as Zabbix DB. %package pcucontrol summary: pcu controls for monitor and plcapi group: applications/system -requires: python +Requires: python +Requires: OpenIPMI-tools +Requires: openssh-clients +Requires: perl-libwww-perl +Requires: perl-IO-Socket-SSL +Requires: curl %description pcucontrol both monitor and the plcapi use a set of common commands to reboot machines diff --git a/blacklist.py b/blacklist.py index 4869879..8704b59 100755 --- a/blacklist.py +++ b/blacklist.py @@ -13,29 +13,41 @@ def usage(): def main(): + loginbase = False + try: - longopts = ["delete=", "help"] - (opts, argv) = getopt.getopt(sys.argv[1:], "d:h", longopts) + longopts = ["delete=", "loginbase", "help"] + (opts, argv) = getopt.getopt(sys.argv[1:], "d:lh", longopts) except getopt.GetoptError, err: print "Error: " + err.msg sys.exit(1) - blacklist = BlacklistRecord.query.all() - hostnames = [ h.hostname for h in blacklist ] + hostnames_q = BlacklistRecord.getHostnameBlacklist() + loginbases_q = BlacklistRecord.getLoginbaseBlacklist() + hostnames = [ h.hostname for h in hostnames_q ] + loginbases = [ h.loginbase for h in loginbases_q ] for (opt, optval) in opts: if opt in ["-d", "--delete"]: i = optval bl = BlacklistRecord.get_by(hostname=i) bl.delete() + elif opt in ["-l", "--loginbase"]: + loginbase = True else: usage() sys.exit(0) i_cnt = 0 - for i in blacklist: - print i.hostname - i_cnt += 1 + if not loginbase: + for i in hostnames: + print i + i_cnt += 1 + else: + for i in loginbases: + print i + i_cnt += 1 + while 1: @@ -43,13 +55,19 @@ def main(): if not line: break line = line.strip() - if line not in hostnames: - bl = BlacklistRecord(hostname=line) + if line not in hostnames and line not in loginbases: + if loginbase: + bl = BlacklistRecord(loginbase=line) + else: + bl = BlacklistRecord(hostname=line) bl.flush() i_cnt += 1 session.flush() - print "Total %d nodes in blacklist" % (i_cnt) + if loginbase: + print "Total %d loginbases in blacklist" % (i_cnt) + else: + print "Total %d nodes in blacklist" % (i_cnt) if __name__ == '__main__': import os diff --git a/monitor/database/info/action.py b/monitor/database/info/action.py index caef06f..0abec62 100644 --- a/monitor/database/info/action.py +++ b/monitor/database/info/action.py @@ -41,10 +41,21 @@ __session__ = mon_session class BlacklistRecord(Entity): date_created = Field(DateTime,default=datetime.now) - hostname = Field(String,default=None, primary_key=True) + hostname = Field(String,default=None) + loginbase = Field(String,default=None) expires = Field(Integer,default=0) # seconds plus acts_as_versioned(['hostname']) + @classmethod + def getLoginbaseBlacklist(cls): + # TODO: need to sort on 'round' since actions will not be globally sync'd. + return cls.query.filter(cls.loginbase!=None).order_by(cls.loginbase.desc()) + + @classmethod + def getHostnameBlacklist(cls): + # TODO: need to sort on 'round' since actions will not be globally sync'd. + return cls.query.filter(cls.hostname!=None).order_by(cls.hostname.desc()) + def neverExpires(self): if self.expires == 0: return True diff --git a/monitor/wrapper/emailTxt.py b/monitor/wrapper/emailTxt.py index 05afe6e..220eb10 100644 --- a/monitor/wrapper/emailTxt.py +++ b/monitor/wrapper/emailTxt.py @@ -207,7 +207,7 @@ ERROR- This is an error state, where there is absolutely no contact with PlanetLab. """) - pcufailed_notice =("""Could not use PCU to reboot %(hostname)s""", + pcufailed_notice =("""MONTEST: Could not use PCU to reboot %(hostname)s""", """As part of PlanetLab node monitoring and maintenance, we tried to use the PCU registered for %(hostname)s, but could not for some reason. @@ -217,21 +217,21 @@ Please help. Thank you very much for your help, -- PlanetLab Central (support@planet-lab.org) """) - online_notice=("""Host %(hostname)s is online""", + online_notice=("""MONTEST: Host %(hostname)s is online""", """ This notice is simply to let you know that: %(hostname)s is online and operational. Thank you very much for your help! """) - test_notice=("""Host %(hostname)s is testing""", + test_notice=("""MONTEST: Host %(hostname)s is testing""", """ This notice is simply to test whether notices work. %(hostname)s Thank you very much for your help! """) - retry_bootman=("""Running BootManager on %(hostname)s""", + retry_bootman=("""MONTEST: Running BootManager on %(hostname)s""", """ This notice is simply to let you know that: %(hostname)s @@ -239,7 +239,7 @@ This notice is simply to let you know that: appears stuck in a debug mode. To try to correct this, we're trying to rerun BootManager.py. If any action is needed from you, you will recieve additional notices. Thank you! """) - down_notice=("""Host %(hostname)s is down""", + down_notice=("""MONTEST: Host %(hostname)s is down""", """ This notice is simply to let you know that: %(hostname)s @@ -247,7 +247,7 @@ This notice is simply to let you know that: is down, disconnected from the network and/or non-operational. Please investigate, thank you very much for your help! """) - clear_penalty=("""All penalties have been cleared from site %(loginbase)s""", + clear_penalty=("""MONTEST: All penalties have been cleared from site %(loginbase)s""", """ This notice is to let you know that any penalties previously applied to your site have been removed: %(penalty_level)s. @@ -262,7 +262,7 @@ Legend: 2+ - all existing slices will be disabled. """) - increase_penalty=("""Penalty increased for site %(loginbase)s""", + increase_penalty=("""MONTEST: Penalty increased for site %(loginbase)s""", """ This notice is to let you know that the penalty applied to your site has increased: %(penalty_level)s. @@ -274,7 +274,7 @@ legend: 2+ - all existing slices will be disabled. """) - newbootcd_notice=(""" Host %(hostname)s needs a new BootImage""", """ + newbootcd_notice=("""MONTEST: Host %(hostname)s needs a new BootImage""", """ As part of PlanetLab node monitoring, we noticed the following nodes have an out-dated BootCD: %(hostname)s @@ -372,7 +372,7 @@ Thank you very much for your help, -- PlanetLab Central (support@planet-lab.org) """) - newalphacd_notice=(""" New Boot Images for %(hostname)s""", + newalphacd_notice=("""MONTEST: New Boot Images for %(hostname)s""", """As part of PlanetLab node monitoring, we noticed that we were not able to recognize all the hardware in your machine. This means that it is so new that it needs a new BootCD, or that it is so old that it is no longer supported. %(hostname)s @@ -403,7 +403,7 @@ Thank you for your help, pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one] pcudown=[pcudown_one, pcudown_one, pcudown_one] - unknownsequence_notice = ("""Unrecognized Error on PlanetLab host %(hostname)s""", + unknownsequence_notice = ("""MONTEST: Unrecognized Error on PlanetLab host %(hostname)s""", """ While trying to automatically recover this machine: @@ -489,7 +489,7 @@ Thank you for your help, donation_down = [ donation_down_one, donation_down_one, donation_down_one ] - minimalhardware_notice = ("""Hardware requirements not met on PlanetLab host %(hostname)s""", + minimalhardware_notice = ("""MONTEST: Hardware requirements not met on PlanetLab host %(hostname)s""", """ While trying to automatically recover this machine: @@ -509,7 +509,7 @@ BootManager.log output follows: %(bmlog)s """ ) - baddisk_notice = ("""Bad Disk on PlanetLab node %(hostname)s""", + baddisk_notice = ("""MONTEST: Bad Disk on PlanetLab node %(hostname)s""", """As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node. Please verify the integrity of the disk, and order a replacement if needed. If you need to schedule downtime for the node, please let us know at support@planet-lab.org. @@ -575,7 +575,7 @@ BootManager.log output follows: %(bmlog)s """) - nodeconfig_notice=(""" Please Update Configuration file for PlanetLab node %(hostname)s""", + nodeconfig_notice=("""MONTEST: Please Update Configuration file for PlanetLab node %(hostname)s""", """As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME. This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade. To resolve the issue we require your assistance. All that is needed is to visit: https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s @@ -615,7 +615,7 @@ Thanks. """) - baddns_notice=("""Planetlab node down: broken DNS configuration for %(hostname)s""", + baddns_notice=("""MONTEST: Planetlab node down: broken DNS configuration for %(hostname)s""", """As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries. %(hostname)s diff --git a/policy.py b/policy.py index a20da09..fcbbb94 100755 --- a/policy.py +++ b/policy.py @@ -210,14 +210,21 @@ class SiteInterface(HistorySiteRecord): def attemptReboot(self, hostname): print "attempting PCU reboot of %s" % hostname - ret = reboot.reboot_str(hostname) + err = "" + try: + ret = reboot.reboot_str(hostname) + except Exception, e: + err = traceback.format_exc() + ret = str(e) + if ret == 0 or ret == "0": ret = "" + act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='reboot', action_type='first_try_reboot', - error_string=ret) + error_string=err) def logic(): @@ -301,10 +308,11 @@ def main(hostnames, sitenames): # send down node notice sitehist.sendMessage('down_notice', hostname=host) - print "send message for host %s offline" % host + print "send message for host %s down" % host pass node_count = node_count + 1 + session.flush() for site in sitenames: sitehist = SiteInterface.get_or_make(loginbase=site) @@ -351,8 +359,9 @@ def main(hostnames, sitenames): site_count = site_count + 1 - session.flush() + session.flush() + session.flush() return @@ -422,8 +431,9 @@ if __name__ == "__main__": main(hostnames, sitenames) except KeyboardInterrupt: print "Killed by interrupt" + session.flush() sys.exit(0) except: #email_exception() print traceback.print_exc(); - print "Continuing..." + print "fail all..." diff --git a/sitebad.py b/sitebad.py index f90f887..cf5ab4e 100755 --- a/sitebad.py +++ b/sitebad.py @@ -56,37 +56,36 @@ def getnodesup(nodelist): def check_site_state(rec, sitehist): - if sitehist.new and sitehist.status != 'new': + if sitehist.new and sitehist.status not in ['new', 'online', 'good']: sitehist.status = 'new' + sitehist.penalty_applied = True # because new sites are disabled by default, i.e. have a penalty. sitehist.last_changed = datetime.now() - if not sitehist.new: + if sitehist.nodes_up >= MINUP: - if sitehist.nodes_up >= MINUP: + if sitehist.status != 'online' and sitehist.status != 'good': + sitehist.last_changed = datetime.now() - if sitehist.status != 'online' and sitehist.status != 'good': - sitehist.last_changed = datetime.now() + if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'online': + print "changed status from %s to online" % sitehist.status + sitehist.status = 'online' - if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'online': - print "changed status from %s to online" % sitehist.status - sitehist.status = 'online' + if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'good': + print "changed status from %s to good" % sitehist.status + sitehist.status = 'good' - if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'good': - print "changed status from %s to good" % sitehist.status - sitehist.status = 'good' + if not sitehist.new: - else: # sitehist.nodes_up < MINUP: - - if sitehist.status != 'offline' and sitehist.status != 'down': - sitehist.last_changed = datetime.now() + if sitehist.status != 'offline' and sitehist.status != 'down': + sitehist.last_changed = datetime.now() - if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'offline': - print "changed status from %s to offline" % sitehist.status - sitehist.status = 'offline' + if changed_lessthan(sitehist.last_changed, 0.5) and sitehist.status != 'offline': + print "changed status from %s to offline" % sitehist.status + sitehist.status = 'offline' - if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'down': - print "changed status from %s to down" % sitehist.status - sitehist.status = 'down' + if changed_greaterthan(sitehist.last_changed, 0.5) and sitehist.status != 'down': + print "changed status from %s to down" % sitehist.status + sitehist.status = 'down' def checkAndRecordState(l_sites, l_plcsites): count = 0 diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index 647c9e8..337139f 100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -166,7 +166,7 @@ class Root(controllers.RootController): prep_node_for_display(node) nodequery += [node] - return self.pcuview(None, hostname) # dict(nodequery=nodequery) + return self.pcuview(None, None, hostname) # dict(nodequery=nodequery) @expose(template="monitorweb.templates.nodelist") def node(self, filter='boot'): @@ -243,6 +243,7 @@ class Root(controllers.RootController): def nodeaction_handler(self, tg_exceptions=None): """Handle any kind of error.""" + print "NODEACTION_HANDLER------------------" if 'pcuid' in request.params: pcuid = request.params['pcuid'] @@ -271,6 +272,7 @@ class Root(controllers.RootController): return self.pcuview(None, pcuid, **dict(exceptions=tg_exceptions)) def nodeaction(self, **data): + print "NODEACTION------------------" for item in data.keys(): print "%s %s" % ( item, data[item] ) @@ -294,7 +296,7 @@ class Root(controllers.RootController): ret = reboot.reboot_str(str(hostname)) print ret if ret: raise RuntimeError("Error using PCU: " + str(ret)) - flash("Reboot appeared to work. All at most 5 minutes. Run ExternalScan to check current status.") + flash("Reboot appeared to work. Allow at most 5 minutes. Then run ExternalScan to check current status.") elif action == "ExternalScan": scanapi.externalprobe(str(hostname)) @@ -311,6 +313,7 @@ class Root(controllers.RootController): @expose(template="monitorweb.templates.pcuview") @exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)") def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data): + print "PCUVIEW------------------" session.clear() sitequery=[] pcuquery=[] diff --git a/web/MonitorWeb/monitorweb/static/css/style.css b/web/MonitorWeb/monitorweb/static/css/style.css index 473a4d9..40a1691 100644 --- a/web/MonitorWeb/monitorweb/static/css/style.css +++ b/web/MonitorWeb/monitorweb/static/css/style.css @@ -17,10 +17,10 @@ tr.even td {background-color:#fff;} #header { height: 40px; - width: 780px; + /*width: 780px;*/ /*background: blue URL('../images/header_inner.png') no-repeat;*/ - border-left: 1px solid #aaa; - border-right: 1px solid #aaa; + /*border-left: 1px solid #aaa;*/ + /*border-right: 1px solid #aaa;*/ margin: 0 auto 0 auto; text-align: center; font-size: 180%; @@ -189,7 +189,7 @@ h2 { } #footer { - border: 1px solid #aaa; + /*border: 1px solid #aaa;*/ border-top: 0px none; color: #999; background-color: white; diff --git a/web/MonitorWeb/monitorweb/templates/sitemenu.kid b/web/MonitorWeb/monitorweb/templates/sitemenu.kid index 4383b84..301e6ae 100644 --- a/web/MonitorWeb/monitorweb/templates/sitemenu.kid +++ b/web/MonitorWeb/monitorweb/templates/sitemenu.kid @@ -1,7 +1,7 @@ - App Name - ${page_title} + ${page_title} @@ -13,8 +13,8 @@ - + +
@@ -24,7 +24,7 @@ - + @@ -38,8 +38,8 @@
Sites PCUs NodesActionsActions
-