From 6df6b8cf9b9a5e78f4f68445e1b2dabc2ae272e6 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 23 Sep 2008 19:53:34 +0000 Subject: [PATCH] M emailTxt.py updated description of error message from CoMon since it has changed. M showlatlon.py updated hardware spec thresholds to include more machines. M clean_policy.py stricter activation of the 'action-levels' that each event triggers. Previously things were out of sorts. M unified_model.py works with the 'action-level' changes above. M nodesets.py M grouprins.py add a site option M nodecommon.py add missing module M bootman.py M rtinfo.py M todo --- bootman.py | 10 +++++----- clean_policy.py | 30 +++++++++++++++++++++++++----- emailTxt.py | 6 +++--- grouprins.py | 13 +++++++++---- nodecommon.py | 1 + nodesets.py | 8 ++++++-- rtinfo.py | 2 +- showlatlon.py | 12 ++++++------ todo | 2 +- unified_model.py | 10 +++------- 10 files changed, 60 insertions(+), 34 deletions(-) diff --git a/bootman.py b/bootman.py index 87d8b71..faf77a2 100755 --- a/bootman.py +++ b/bootman.py @@ -541,11 +541,11 @@ def reboot(hostname, config=None, forced_action=None): # By using the sequence identifier, we guarantee that there will be no # frequent loops. I'm guessing there is a better way to track loops, # though. - if not config.force and pflags.getRecentFlag(s): - pflags.setRecentFlag(s) - pflags.save() - print "... flag is set or it has already run recently. Skipping %s" % node - return True + #if not config.force and pflags.getRecentFlag(s): + # pflags.setRecentFlag(s) + # pflags.save() + # print "... flag is set or it has already run recently. Skipping %s" % node + # return True sequences = {} diff --git a/clean_policy.py b/clean_policy.py index a14016e..8e35903 100644 --- a/clean_policy.py +++ b/clean_policy.py @@ -84,6 +84,7 @@ class MonitorMergeDiagnoseSendEscellate: fbnode['log'] = None fbnode['time'] = time.time() fbnode['email'] = TECH + fbnode['action-level'] = 0 fbnode['action'] = ['noop'] fbnode['date_created'] = time.time() @@ -171,7 +172,7 @@ class MonitorMergeDiagnoseSendEscellate: print "diagnose: checkStageAndTime Returned Valid Record" site = PersistFlags(self.loginbase, 1, db='site_persistflags') - if site.status != "good": + if "good" not in site.status: # != "good": print "diagnose: Setting site %s for 'squeeze'" % self.loginbase diag.setFlag('Squeeze') else: @@ -191,7 +192,9 @@ class MonitorMergeDiagnoseSendEscellate: #print record.data['stage'] #print "improvement" in record.data['stage'] #print self.getSendEmailFlag(record) - if self.getSendEmailFlag(record) or "monitor-end-record" in record.data['stage']: + print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) ) + if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \ + "monitor-end-record" in record.data['stage']: print "action: getting message" message = record.getMessage(record.data['ticket_id']) if message: @@ -206,10 +209,13 @@ class MonitorMergeDiagnoseSendEscellate: print "action: setting record ticket_id" record.data['ticket_id'] = message.rt.ticket_id - if (record.data['takeaction'] and diag.getFlag('Squeeze') ) or diag.getFlag('BackOff'): + if ( record.data['takeaction'] and diag.getFlag('Squeeze') ): print "action: taking action" - record.takeAction() + record.takeAction(record.data['action-level']) diag.resetFlag('Squeeze') + diag.save() + if diag.getFlag('BackOff'): + record.takeAction(0) diag.resetFlag('BackOff') diag.save() @@ -306,6 +312,7 @@ class MonitorMergeDiagnoseSendEscellate: record.data['message'] = record.data['message_series'][0] record.data['stage'] = 'stage_actinoneweek' record.data['save-act-all'] = True + record.data['action-level'] = 0 elif 'reboot_node' in record.data['stage']: record.data['email'] = TECH @@ -314,6 +321,7 @@ class MonitorMergeDiagnoseSendEscellate: record.data['stage'] = 'stage_actinoneweek' record.data['takeaction'] = False record.data['save-act-all'] = False + record.data['action-level'] = 0 elif 'improvement' in record.data['stage']: print "checkStageAndTime: backing off of %s" % self.hostname @@ -322,6 +330,7 @@ class MonitorMergeDiagnoseSendEscellate: record.data['message'] = record.data['message_series'][0] record.data['stage'] = 'monitor-end-record' record.data['save-act-all'] = True + record.data['action-level'] = 0 elif 'actinoneweek' in record.data['stage']: if delta >= 7 * SPERDAY: @@ -333,6 +342,7 @@ class MonitorMergeDiagnoseSendEscellate: record.data['time'] = current_time # reset clock for waitforever record.data['takeaction'] = True record.data['save-act-all'] = True + record.data['action-level'] = 1 elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data: print "checkStageAndTime: second message in one week" record.data['email'] = TECH @@ -341,11 +351,13 @@ class MonitorMergeDiagnoseSendEscellate: record.data['second-mail-at-oneweek'] = True record.data['takeaction'] = False record.data['save-act-all'] = True + record.data['action-level'] = 0 else: record.data['message'] = None record.data['action'] = ['waitforoneweekaction' ] record.data['takeaction'] = False record.data['save-act-all'] = False + record.data['action-level'] = 0 print "checkStageAndTime: ignoring this record for: %s" % self.hostname #return None # don't send if there's no action @@ -359,6 +371,7 @@ class MonitorMergeDiagnoseSendEscellate: record.data['time'] = current_time # reset clock for waitforever record.data['takeaction'] = True record.data['save-act-all'] = True + record.data['action-level'] = 2 elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data: print "checkStageAndTime: second message in one week for stage two" record.data['email'] = TECH | PI @@ -367,12 +380,14 @@ class MonitorMergeDiagnoseSendEscellate: record.data['second-mail-at-twoweeks'] = True record.data['takeaction'] = False record.data['save-act-all'] = True + record.data['action-level'] = 1 else: record.data['message'] = None record.data['takeaction'] = False record.data['action'] = ['waitfortwoweeksaction'] record.data['save-act-all'] = False print "checkStageAndTime: second message in one week for stage two" + record.data['action-level'] = 1 #return None # don't send if there's no action elif 'ticket_waitforever' in record.data['stage']: @@ -385,18 +400,21 @@ class MonitorMergeDiagnoseSendEscellate: record.data['message'] = None record.data['time'] = current_time record.data['save-act-all'] = True + record.data['action-level'] = 2 else: if delta >= 7*SPERDAY: record.data['action'] = ['ticket_waitforever'] record.data['message'] = None record.data['time'] = current_time # reset clock record.data['save-act-all'] = True + record.data['action-level'] = 2 else: record.data['action'] = ['ticket_waitforever'] record.data['message'] = None record.data['takeaction'] = False record.data['save-act-all'] = False - return None + record.data['action-level'] = 2 + #return None elif 'waitforever' in record.data['stage']: # more than 3 days since last action @@ -408,11 +426,13 @@ class MonitorMergeDiagnoseSendEscellate: record.data['message'] = record.data['message_series'][2] record.data['time'] = current_time # reset clock record.data['save-act-all'] = True + record.data['action-level'] = 2 else: record.data['action'] = ['waitforever'] record.data['message'] = None record.data['takeaction'] = False record.data['save-act-all'] = False + record.data['action-level'] = 2 #return None # don't send if there's no action else: diff --git a/emailTxt.py b/emailTxt.py index cfbf112..f764a41 100644 --- a/emailTxt.py +++ b/emailTxt.py @@ -22,7 +22,7 @@ As part of PlanetLab node monitoring, we noticed the following nodes were down a %(hostname_list)s We're writing because we need your help returning them to their regular operation. -To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network. It may take several minutes before Comon registers your node. Until that time, visiting the link below will return an 'Internal Server Error'. +To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network. It may take several minutes before Comon registers your node. Until that time, visiting the link below will return the message 'could not find requested table - probably empty'. http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50 @@ -51,7 +51,7 @@ As part of PlanetLab node monitoring, we noticed the following nodes were down a %(hostname_list)s We're writing again because our previous correspondence, sent only to the registered Technical Contact, has gone unacknowledged for at least a week, and we need your help returning these machines to their regular operation. We understand that machine maintenance can take time. So, while we wait for the machines to return to their regular operation slice creation has been suspended at your site. No new slices may be created, but the existing slices and services running within them will be unaffected. -To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network. It may take several minutes before Comon registers your node. Until that time, visiting the link below will return an 'Internal Server Error'. +To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network. It may take several minutes before Comon registers your node. Until that time, visiting the link below will return the message 'could not find requested table - probably empty'. http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50 @@ -80,7 +80,7 @@ As part of PlanetLab node monitoring, we noticed the following nodes were down a %(hostname_list)s We understand that machine maintenance can take time. We're writing again because our previous correspondences, sent first to the registered Technical Contact then the the Site PI, have gone unacknowledged for at least two weeks, and we need your help returning these machines to their regular operation. This is the third time attempting to contact someone in regard to these machines at your site. So, while we wait for the machines to return to their regular operation all current slice activity will be suspended. Current experiments will be stopped and will not be be able to start again until there is evidence that you have begun to help with the maintenance of these machines. -To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network. It may take several minutes before Comon registers your node. Until that time, visiting the link below will return an 'Internal Server Error'. +To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine. Then, after checking that the node is properly networked, power cycle the machine. Note that rebooting the machine may not fully resolve the problems we are seeing. Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network. It may take several minutes before Comon registers your node. Until that time, visiting the link below will return the message 'could not find requested table - probably empty'. http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50 diff --git a/grouprins.py b/grouprins.py index d859727..cfefc6a 100755 --- a/grouprins.py +++ b/grouprins.py @@ -228,6 +228,11 @@ if config.nodegroup: nodelist = api.GetNodes(ng[0]['node_ids']) hostnames = [ n['hostname'] for n in nodelist ] +if config.site: + site = api.GetSites(config.site) + l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname']) + hostnames = [ n['hostname'] for n in l_nodes ] + if config.node or config.nodelist: if config.node: hostnames = [ config.node ] else: hostnames = util.file.getListFromFile(config.nodelist) @@ -339,10 +344,10 @@ for host in hostnames: print "ALL METHODS OF RESTARTING %s FAILED" % host args = {} args['hostname'] = host - m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args, - "CANNOT CONTACT", False, db='suspect_persistmessages') - m.reset() - m.send(['monitor-list@lists.planet-lab.org']) + #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args, + # "CANNOT CONTACT", False, db='suspect_persistmessages') + #m.reset() + #m.send(['monitor-list@lists.planet-lab.org']) l = Log(host, record) print l diff --git a/nodecommon.py b/nodecommon.py index a8b82ea..624ee2c 100644 --- a/nodecommon.py +++ b/nodecommon.py @@ -4,6 +4,7 @@ import reboot import time import util.file import plc +from datetime import datetime from monitor import database from unified_model import PersistFlags esc = struct.pack('i', 27) diff --git a/nodesets.py b/nodesets.py index 63b049c..ea69d6b 100755 --- a/nodesets.py +++ b/nodesets.py @@ -4,6 +4,7 @@ import sys import os from sets import Set import parser as parsermodule +import util.file def main(): parser = parsermodule.getParser() @@ -16,8 +17,8 @@ def main(): f1 = config.args[0] f2 = config.args[1] - s1 = config.getListFromFile(f1) - s2 = config.getListFromFile(f2) + s1 = util.file.getListFromFile(f1) + s2 = util.file.getListFromFile(f2) s = nodesets(config.operation, s1, s2) @@ -44,3 +45,6 @@ def nodesets(operation, s1, s2): print "Unknown operation: %s " % operation return [] + +if __name__ == "__main__": + main() diff --git a/rtinfo.py b/rtinfo.py index 35d6973..bdbc993 100755 --- a/rtinfo.py +++ b/rtinfo.py @@ -11,7 +11,7 @@ for id in sql.keys(): #print sql[id].keys() #sys.exit(1) key = "%(queue)s-%(owner)s-%(status)s-%(lastupdated)s-%(email)-30s-%(subj)s" % sql[id] - sortkeys[key] = "%(queue)s %(lastupdated)s %(status)6s %(email)-25s %(owner)6s %(subj)26.26s https://rt.planet-lab.org/Ticket/Display.html?id=%(ticket_id)s" % sql[id] + sortkeys[key] = "%(ticket_id)s %(queue)s %(lastupdated)s %(status)6s %(email)-25s %(owner)6s %(subj)26.26s https://rt.planet-lab.org/Ticket/Display.html?id=%(ticket_id)s" % sql[id] #sortkeys[key] = "%(ticket_id)s %(status)6s %(email)-30s %(lastupdated)s %(subj)s" % sql[id] keys = sortkeys.keys() diff --git a/showlatlon.py b/showlatlon.py index 10367e4..af01bd7 100755 --- a/showlatlon.py +++ b/showlatlon.py @@ -29,11 +29,11 @@ def gethardwarequality(nodename, fb): for field in ['cpuspeed', 'memsize', 'disksize']: if field not in cstat: cstat[field] = "null" - if cstat['cpuspeed'] != "null" and float(cstat['cpuspeed']) < 2.4: + if cstat['cpuspeed'] != "null" and float(cstat['cpuspeed']) < 2.2: return "BAD" # "cpu_slow", - if cstat['memsize'] != "null" and float(cstat['memsize']) < 2.9: + if cstat['memsize'] != "null" and float(cstat['memsize']) < 2.8: return "BAD" # "mem_small", - if cstat['disksize'] != "null" and float(cstat['disksize']) < 320.0: + if cstat['disksize'] != "null" and float(cstat['disksize']) < 300.0: return "BAD" # "disk_small", if cstat['disksize'] == "null" and \ @@ -42,9 +42,9 @@ def gethardwarequality(nodename, fb): return "N/A" try: - if float(cstat['cpuspeed']) >= 2.4 and \ - float(cstat['memsize']) >= 2.9 and \ - (cstat['disksize'] == "null" or float(cstat['disksize']) >= 320.0): + if float(cstat['cpuspeed']) >= 2.2 and \ + float(cstat['memsize']) >= 2.8 and \ + (cstat['disksize'] == "null" or float(cstat['disksize']) >= 300.0): return "A-OK" except: print cstat diff --git a/todo b/todo index d7370ef..ae180a8 100644 --- a/todo +++ b/todo @@ -14,9 +14,9 @@ TODO: - testapi.py - findbad.py on sample site. - nodebad.py + - findbadpcus.py - nodequery.py - nodegroups.py - - findbadpcus.py - loads webpage for those retreived values to confirm setup succeeded. * reimplement the config.py / .config mechanism. I'd like for many commands diff --git a/unified_model.py b/unified_model.py index 8c5fb7f..e237bc9 100755 --- a/unified_model.py +++ b/unified_model.py @@ -3,8 +3,6 @@ from monitor import database import plc -api = plc.getAuthAPI() - import mailer import time @@ -15,9 +13,6 @@ import config def gethostlist(hostlist_file): return util.file.getListFromFile(hostlist_file) - - #nodes = api.GetNodes({'peer_id' : None}, ['hostname']) - #return [ n['hostname'] for n in nodes ] def array_to_priority_map(array): """ Create a mapping where each entry of array is given a priority equal @@ -450,7 +445,7 @@ class Record(object): def getDaysDown(cls, diag_record): daysdown = -1 - if diag_record['comonstats']['uptime'] != "null": + if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1": daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24) #elif diag_record['comonstats']['sshstatus'] != "null": # daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24) @@ -504,7 +499,7 @@ class Record(object): # return "%d days up"% -daysdown #getStrDaysDown = classmethod(getStrDaysDown) - def takeAction(self): + def takeAction(self, index=0): pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames') if 'improvement' in self.data['stage'] or self.improved() or \ 'monitor-end-record' in self.data['stage']: @@ -514,6 +509,7 @@ class Record(object): else: print "takeAction: increasing penalty for %s"%self.hostname pp.increase() + pp.index = index pp.apply(self.hostname) pp.save() -- 2.45.2