From b6b1491cb6611a63a012206d2f932a4784b4508f Mon Sep 17 00:00:00 2001
From: Stephen Soltesz
Date: Fri, 27 Mar 2009 17:07:07 +0000
Subject: [PATCH] moved found_within to common.py

renamed email messages in emailTxt to reflect action types
updated findbad model to perform single-node queries correctly.
added node.status categories to nodelist.kid since this is the primary
difference between nodes now.
---
 monitor/common.py                        | 11 +++
 monitor/database/info/findbad.py         |  2 +-
 monitor/wrapper/emailTxt.py              | 29 ++++---
 nodebad.py                               | 41 +++++++---
 pcucontrol/util/command.py               |  1 +
 web/MonitorWeb/monitorweb/controllers.py | 80 +++++++++++++------
 .../monitorweb/templates/nodelist.kid    | 12 +--
 7 files changed, 127 insertions(+), 49 deletions(-)

diff --git a/monitor/common.py b/monitor/common.py
index 0f6dd40..aecd866 100644
--- a/monitor/common.py
+++ b/monitor/common.py
@@ -238,3 +238,14 @@ def changed_greaterthan(last_changed, days):
 		#print "last changed less than %s" % timedelta(days)
 		return False
 
+def found_within(recent_actions, action_type, within):
+	for action in recent_actions:
+		if action_type == action.action_type and \
+			datetime.now() - action.date_created < timedelta(within):
+			# recent action of given type.
+			#print "%s found_within %s in recent_actions from %s" % (action_type, timedelta(within), action.date_created)
+			return True
+
+	print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within))
+	return False
+
diff --git a/monitor/database/info/findbad.py b/monitor/database/info/findbad.py
index b437842..a5139eb 100644
--- a/monitor/database/info/findbad.py
+++ b/monitor/database/info/findbad.py
@@ -94,7 +94,7 @@ class FindbadPCURecord(Entity):
 
 	@classmethod
 	def get_latest_by(cls, **kwargs):
-		return cls.query.filter_by(**kwargs)
+		return cls.query.filter_by(**kwargs).first()
 
 	# ACCOUNTING
 	date_checked = Field(DateTime)
diff --git a/monitor/wrapper/emailTxt.py b/monitor/wrapper/emailTxt.py
index 385ac63..98c8856 100644
--- a/monitor/wrapper/emailTxt.py
+++ b/monitor/wrapper/emailTxt.py
@@ -274,6 +274,17 @@ legend:
 	2+  - all existing slices will be disabled.
 """)
 
+	newbootcd_notice=(""" Host %(hostname)s needs a new BootImage""", """
+As part of PlanetLab node monitoring, we noticed the following nodes have an outdated BootCD:
+
+	%(hostname)s
+
+This usually implies that you need to update the BootCD and node configuration file stored on the read-only media (either the all-in-one ISO CD, floppy disk, or write-protected USB stick).
+
+Thank you for your help,
+  -- PlanetLab Central (support@planet-lab.org)
+""")
+
 	nmreset =("""NM Reset at %(loginbase)s""", """
 Monitor restarted NM on the following machines:
 
@@ -361,10 +372,10 @@ Thank you very much for your help,
   -- PlanetLab Central (support@planet-lab.org)
 """)
 
-	newalphacd_one=(""" Planetlab nodes need a new BootCD: %(loginbase)s""",
-"""As part of PlanetLab node monitoring, we noticed that your machines needs a new BootCD to fully support your hardware:
+	newalphacd_notice=(""" New Boot Images for %(hostname)s""",
+"""As part of PlanetLab node monitoring, we noticed that your machine needs a new BootCD to fully support your hardware:
 
-%(hostname_list)s
+%(hostname)s
 
 To make this process as simple as possible, we have created All-in-One boot images that include the node configuration file.
 
@@ -385,14 +396,14 @@ Thank you for your help,
 
 	# TODO: need reminder versions for repeats...
 	newdown=[newdown_one, newdown_two, newdown_three]
 	newbootcd=[newbootcd_one, newbootcd_two, newbootcd_three]
-	newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
+	#newalphacd=[newalphacd_one, newalphacd_one, newalphacd_one]
 	newthankyou=[thankyou,thankyou,thankyou]
 	pcuthankyou=[pcuthankyou_one,pcuthankyou_one,pcuthankyou_one]
 	NMReset=[nmreset,nmreset,nmreset]
 	pcutonodemapping=[pcutonodemapping_one, pcutonodemapping_one, pcutonodemapping_one]
 	pcudown=[pcudown_one, pcudown_one, pcudown_one]
 
-	unknownsequence = ("""Unrecognized Error on PlanetLab host %(hostname)s""",
+	unknownsequence_notice = ("""Unrecognized Error on PlanetLab host %(hostname)s""",
 	"""
 While trying to automatically recover this machine:
 
@@ -478,7 +489,7 @@ Thank you for your help,
 
 	donation_down = [ donation_down_one, donation_down_one, donation_down_one ]
 
-	minimalhardware = ("""Hardware requirements not met on PlanetLab host %(hostname)s""",
+	minimalhardware_notice = ("""Hardware requirements not met on PlanetLab host %(hostname)s""",
 	"""
 While trying to automatically recover this machine:
 
@@ -498,7 +509,7 @@ BootManager.log output follows:
 %(bmlog)s
 """)
 
-	baddisk = ("""Bad Disk on PlanetLab node %(hostname)s""",
+	baddisk_notice = ("""Bad Disk on PlanetLab node %(hostname)s""",
 	"""As part of PlanetLab node monitoring, we noticed %(hostname)s has a number of disk or media related I/O errors, that prevent it from either booting or reliably running as a PlanetLab node.
 
 Please verify the integrity of the disk, and order a replacement if needed. If you need to schedule downtime for the node, please let us know at support@planet-lab.org.
@@ -564,7 +575,7 @@ BootManager.log output follows:
 %(bmlog)s
 """)
 
-	plnode_cfg=(""" Please Update Configuration file for PlanetLab node %(hostname)s""",
+	nodeconfig_notice=(""" Please Update Configuration file for PlanetLab node %(hostname)s""",
 	"""As part of PlanetLab node monitoring, we noticed %(hostname)s has an out-dated plnode.txt file with no NODE_ID or a mis-matched HOSTNAME. This can happen either due to an initial configuration failure at your site, with information entered into our database, or after a software upgrade. To resolve the issue we require your assistance. All that is needed is to visit:
 
 	https://www.planet-lab.org/db/nodes/index.php?nodepattern=%(hostname)s
@@ -604,7 +615,7 @@ Thanks.
 
 """)
 
-	baddns=("""Planetlab node down: broken DNS configuration for %(hostname)s""",
+	baddns_notice=("""PlanetLab node down: broken DNS configuration for %(hostname)s""",
 	"""As part of PlanetLab node monitoring, we noticed the DNS servers used by the following machine(s) are not responding to queries.
 
 %(hostname)s
diff --git a/nodebad.py b/nodebad.py
index a0490e4..46ca879 100755
--- a/nodebad.py
+++ b/nodebad.py
@@ -44,31 +44,47 @@ def check_node_state(rec, node):
 		boot_state = "unknown"
 		last_contact = None
 
+	if boot_state == 'disable': boot_state = 'disabled'
+	if boot_state == 'diag': boot_state = 'diagnose'
+
 	# NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need
 	# 'translations' into the node.status state
 	# 'BOOT' is a permanent state, but we want it to have a bit of
 	# hysteresis (less than 0.5 days)
-	#################################################################3
-	# "Translate" the findbad states into nodebad status.
+	#################################################################
+	# "Initialize" the findbad states into nodebad status if they are not already set
 
-	if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disable' :
+	if node_state == 'DOWN' and ( node.status != 'offline' and node.status != 'down' ) and boot_state != 'disabled' :
 		print "changed status from %s to offline" % node.status
 		node.status = 'offline'
 		node.last_changed = datetime.now()
-	if node_state == 'DEBUG' and node.status != 'monitordebug':
-		print "changed status from %s to monitordebug" % (node.status)
-		node.status = "monitordebug"
-		node.last_changed = datetime.now()
+	if node_state == 'DEBUG' and node.status != 'monitordebug' and \
+		node.status != 'disabled' and \
+		node.status != 'diagnose':
+		if boot_state != 'disabled' and boot_state != 'diagnose':
+
+			print "changed status from %s to monitordebug" % (node.status)
+			node.status = "monitordebug"
+			node.last_changed = datetime.now()
+		else:
+			print "changed status from %s to %s" % (node.status, boot_state)
+			node.status = boot_state
+			node.last_changed = datetime.now()
 
 	if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
 		print "changed status from %s to online" % node.status
 		node.status = 'online'
 		node.last_changed = datetime.now()
 
-	#################################################################3
+	#################################################################
 	# Switch temporary hystersis states into their 'firm' states.
+	# online -> good after half a day
+	# offline -> down after two days
+	# monitordebug -> down after 30 days
+	# diagnose -> monitordebug after 60 days
+	# disabled -> down after 60 days
 
 	if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
 		print "changed status from %s to good" % node.status
@@ -80,11 +96,16 @@ def check_node_state(rec, node):
 			node.status = 'down'
 			# NOTE: do not reset last_changed, or you lose how long it's been down.
 
-	if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 14):
+	if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
 		print "changed status from %s to down" % node.status
 		node.status = 'down'
 		# NOTE: do not reset last_changed, or you lose how long it's been down.
-		#node.last_changed = datetime.now()
+
+	if node.status == 'diagnose' and changed_greaterthan(node.last_changed, 60):
+		print "changed status from %s to monitordebug" % node.status
+		# NOTE: change an admin mode back into monitordebug after two months.
+ node.status = 'monitordebug' + node.last_changed = datetime.now() # extreme cases of offline nodes if ( boot_state == 'disabled' or last_contact == None ) and \ diff --git a/pcucontrol/util/command.py b/pcucontrol/util/command.py index 899d667..47627b4 100644 --- a/pcucontrol/util/command.py +++ b/pcucontrol/util/command.py @@ -197,6 +197,7 @@ class SSH(CMD): def run_noexcept2(self, cmd, timeout=COMMAND_TIMEOUT*2): cmd = "exec ssh -p %s %s %s@%s %s" % (self.port, self.__options_to_str(), self.user, self.host, cmd) + #print cmd r = CMD.run_noexcept(self, cmd, timeout) self.ret = -1 diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index 1178aa1..0d4e703 100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -12,14 +12,15 @@ from monitor.database.zabbixapi.model import * from monitor.database.dborm import zab_session as session from monitor.database.dborm import zab_metadata as metadata -from pcucontrol import reboot +from monitor import reboot +from monitor import scanapi + from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb from monitor.wrapper.plccache import plcdb_lb2hn as site_lb2hn from monitorweb.templates.links import * -from monitor import scanapi def query_to_dict(query): @@ -103,7 +104,7 @@ class NodeWidget(widgets.Widget): def prep_node_for_display(node): if node.plc_pcuid: - pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first() + pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid) if pcu: node.pcu_status = pcu.reboot_trial_status node.pcu_short_status = format_pcu_shortstatus(pcu) @@ -168,40 +169,72 @@ class Root(controllers.RootController): return self.pcuview(None, hostname) # dict(nodequery=nodequery) @expose(template="monitorweb.templates.nodelist") - def node(self, filter='BOOT'): + def node(self, filter='boot'): import time fbquery = FindbadNodeRecord.get_all_latest() query = [] - filtercount = {'DOWN' : 0, 'BOOT': 0, 'DEBUG' : 0, 'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0} + filtercount = {'down' : 0, 'boot': 0, 'debug' : 0, 'diagnose' : 0, 'disabled': 0, + 'neverboot' : 0, 'pending' : 0, 'all' : 0, None : 0} for node in fbquery: # NOTE: reformat some fields. 
 			prep_node_for_display(node)
 
-			# NOTE: count filters
-			if node.observed_status != 'DOWN':
-				print node.hostname, node.observed_status
-				filtercount[node.observed_status] += 1
-			else:
+			node.history.status	# touch the attribute so the lazy 'history' relation is loaded
+
+			if node.history.status in ['down', 'offline']:
 				if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
-					filtercount[node.observed_status] += 1
+					filtercount['down'] += 1
 				else:
 					filtercount['neverboot'] += 1
+			elif node.history.status in ['good', 'online']:
+				filtercount['boot'] += 1
+			elif node.history.status in ['debug', 'monitordebug']:
+				filtercount['debug'] += 1
+			else:
+				filtercount[node.history.status] += 1
+
+			## NOTE: count filters
+			#if node.observed_status != 'DOWN':
+			#	print node.hostname, node.observed_status
+			#	if node.observed_status == 'DEBUG':
+			#		if node.plc_node_stats['boot_state'] in ['debug', 'diagnose', 'disabled']:
+			#			filtercount[node.plc_node_stats['boot_state']] += 1
+			#		else:
+			#			filtercount['debug'] += 1
+			#
+			#	else:
+			#		filtercount[node.observed_status] += 1
+			#else:
+			#	if node.plc_node_stats and node.plc_node_stats['last_contact'] != None:
+			#		filtercount[node.observed_status] += 1
+			#	else:
+			#		filtercount['neverboot'] += 1
 
 			# NOTE: apply filter
-			if filter == node.observed_status:
-				if filter == "DOWN":
-					if node.plc_node_stats['last_contact'] != None:
-						query.append(node)
-				else:
-					query.append(node)
-			elif filter == "neverboot":
+			if filter == "neverboot":
 				if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
 					query.append(node)
-			elif filter == "pending":
-				# TODO: look in message logs...
-				pass
-			elif filter == "all":
+			elif filter == "all":
 				query.append(node)
+			elif filter == node.history.status:
+				query.append(node)
+
+			#if filter == node.observed_status:
+			#	if filter == "DOWN":
+			#		if node.plc_node_stats['last_contact'] != None:
+			#			query.append(node)
+			#	else:
+			#		query.append(node)
+			#elif filter == "neverboot":
+			#	if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None:
+			#		query.append(node)
+			#elif filter == "pending":
+			#	# TODO: look in message logs...
+			#	pass
+			#elif filter == node.plc_node_stats['boot_state']:
+			#	query.append(node)
+			#elif filter == "all":
+			#	query.append(node)
 
 		widget = NodeWidget(template='monitorweb.templates.node_template')
 		return dict(now=time.ctime(), query=query, fc=filtercount, nodewidget=widget)
@@ -222,7 +255,7 @@ class Root(controllers.RootController):
 			if 'pcuid' in val:
 				pcuid = val['pcuid']
 			elif 'hostname' in val:
-				pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).first().plc_pcuid
+				pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).plc_pcuid
 			else:
 				pcuid=None
 		else:
@@ -304,7 +337,7 @@ class Root(controllers.RootController):
 				prep_node_for_display(node)
 				nodequery += [node]
 				if node.plc_pcuid: # not None
-					pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first()
+					pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid)
 					prep_pcu_for_display(pcu)
 					pcus[node.plc_pcuid] = pcu
 
@@ -326,7 +359,6 @@ class Root(controllers.RootController):
 			node = FindbadNodeRecord.get_latest_by(hostname=nodename)
 			print "%s" % node.port_status
 			print "%s" % node.to_dict()
-			print "%s" % len(q.all())
 			if node:
 				prep_node_for_display(node)
 				nodequery += [node]
diff --git a/web/MonitorWeb/monitorweb/templates/nodelist.kid b/web/MonitorWeb/monitorweb/templates/nodelist.kid
index 5b4e7c3..53bbe5b 100644
--- a/web/MonitorWeb/monitorweb/templates/nodelist.kid
+++ b/web/MonitorWeb/monitorweb/templates/nodelist.kid
@@ -13,17 +13,19 @@ from links import *
 	<tr>
-		<td>Production(${fc['BOOT']})</td>
-		<td>Debug(${fc['DEBUG']})</td>
-		<td>Down(${fc['DOWN']})</td>
+		<td>Prod(${fc['boot']})</td>
+		<td>Down(${fc['down']})</td>
+		<td>Errors(${fc['debug']})</td>
+		<td>Diagnose (${fc['diagnose']})</td>
+		<td>Disabled (${fc['disabled']})</td>
 		<td>Never Booted(${fc['neverboot']})</td>
 		<td>Pending Reply(${fc['pending']})</td>
 		<td>All</td>
 	</tr>
-- 
2.43.0
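Usage sketch (not part of the patch, placed after the signature so git-am
ignores it): found_within(), added to monitor/common.py above, answers "was an
action of this type taken within the last N days?", which lets policy code
avoid re-sending a notice too often. A minimal, self-contained illustration
follows; the RecentAction stand-in class and the 'newbootcd_notice'
action-type string are assumptions for the example, not names taken from the
monitor database model.

from datetime import datetime, timedelta

class RecentAction:
    # Stand-in for a stored action record; found_within() reads only
    # .action_type and .date_created.
    def __init__(self, action_type, date_created):
        self.action_type = action_type
        self.date_created = date_created

def found_within(recent_actions, action_type, within):
    # Same logic as the helper added to monitor/common.py above:
    # True if any recent action of the given type is younger than
    # 'within' days.
    for action in recent_actions:
        if action_type == action.action_type and \
                datetime.now() - action.date_created < timedelta(within):
            return True
    return False

# A notice sent three days ago is "found" inside a 7-day window,
# but not inside a 2-day window, so a resend would only fire in the
# latter case.
actions = [RecentAction('newbootcd_notice', datetime.now() - timedelta(3))]
assert found_within(actions, 'newbootcd_notice', 7)
assert not found_within(actions, 'newbootcd_notice', 2)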