M emailTxt.py
author Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 23 Sep 2008 19:53:34 +0000 (19:53 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 23 Sep 2008 19:53:34 +0000 (19:53 +0000)
updated description of error message from CoMon since it has changed.
M    showlatlon.py
updated hardware spec thresholds to include more machines.
M    clean_policy.py
stricter activation of the 'action-levels' that each event triggers.
Previously things were out of sorts.
M    unified_model.py
works with the 'action-level' changes above.
M    nodesets.py
M    grouprins.py
add a site option
M    nodecommon.py
add missing module import (datetime)
M    bootman.py
M    rtinfo.py
M    todo

bootman.py
clean_policy.py
emailTxt.py
grouprins.py
nodecommon.py
nodesets.py
rtinfo.py
showlatlon.py
todo
unified_model.py

index 87d8b71..faf77a2 100755 (executable)
@@ -541,11 +541,11 @@ def reboot(hostname, config=None, forced_action=None):
        #  By using the sequence identifier, we guarantee that there will be no
        #  frequent loops.  I'm guessing there is a better way to track loops,
        #  though.
-       if not config.force and pflags.getRecentFlag(s):
-               pflags.setRecentFlag(s)
-               pflags.save() 
-               print "... flag is set or it has already run recently. Skipping %s" % node
-               return True
+       #if not config.force and pflags.getRecentFlag(s):
+       #       pflags.setRecentFlag(s)
+       #       pflags.save() 
+       #       print "... flag is set or it has already run recently. Skipping %s" % node
+       #       return True
 
        sequences = {}
 
index a14016e..8e35903 100644 (file)
@@ -84,6 +84,7 @@ class MonitorMergeDiagnoseSendEscellate:
                fbnode['log'] = None
                fbnode['time'] = time.time()
                fbnode['email'] = TECH
+               fbnode['action-level'] = 0
                fbnode['action'] = ['noop']
                fbnode['date_created'] = time.time()
 
@@ -171,7 +172,7 @@ class MonitorMergeDiagnoseSendEscellate:
                print "diagnose: checkStageAndTime Returned Valid Record"
                site = PersistFlags(self.loginbase, 1, db='site_persistflags')
 
-               if site.status != "good":
+               if "good" not in site.status: #  != "good":
                        print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
                        diag.setFlag('Squeeze')
                else:
@@ -191,7 +192,9 @@ class MonitorMergeDiagnoseSendEscellate:
                #print record.data['stage']
                #print "improvement" in record.data['stage']
                #print self.getSendEmailFlag(record)
-               if self.getSendEmailFlag(record) or "monitor-end-record" in record.data['stage']: 
+               print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) )
+               if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \
+                       "monitor-end-record" in record.data['stage']:
                        print "action: getting message"
                        message = record.getMessage(record.data['ticket_id'])
                        if message:
@@ -206,10 +209,13 @@ class MonitorMergeDiagnoseSendEscellate:
                                        print "action: setting record ticket_id"
                                        record.data['ticket_id'] = message.rt.ticket_id
 
-                       if (record.data['takeaction'] and diag.getFlag('Squeeze') ) or diag.getFlag('BackOff'):
+                       if ( record.data['takeaction'] and diag.getFlag('Squeeze') ): 
                                print "action: taking action"
-                               record.takeAction()
+                               record.takeAction(record.data['action-level'])
                                diag.resetFlag('Squeeze')
+                               diag.save()
+                       if diag.getFlag('BackOff'):
+                               record.takeAction(0)
                                diag.resetFlag('BackOff')
                                diag.save()
 
@@ -306,6 +312,7 @@ class MonitorMergeDiagnoseSendEscellate:
                        record.data['message'] = record.data['message_series'][0]
                        record.data['stage'] = 'stage_actinoneweek'
                        record.data['save-act-all'] = True
+                       record.data['action-level'] = 0
 
                elif 'reboot_node' in record.data['stage']:
                        record.data['email'] = TECH
@@ -314,6 +321,7 @@ class MonitorMergeDiagnoseSendEscellate:
                        record.data['stage'] = 'stage_actinoneweek'
                        record.data['takeaction'] = False
                        record.data['save-act-all'] = False
+                       record.data['action-level'] = 0
                        
                elif 'improvement' in record.data['stage']:
                        print "checkStageAndTime: backing off of %s" % self.hostname
@@ -322,6 +330,7 @@ class MonitorMergeDiagnoseSendEscellate:
                        record.data['message'] = record.data['message_series'][0]
                        record.data['stage'] = 'monitor-end-record'
                        record.data['save-act-all'] = True
+                       record.data['action-level'] = 0
 
                elif 'actinoneweek' in record.data['stage']:
                        if delta >= 7 * SPERDAY: 
@@ -333,6 +342,7 @@ class MonitorMergeDiagnoseSendEscellate:
                                record.data['time'] = current_time              # reset clock for waitforever
                                record.data['takeaction'] = True
                                record.data['save-act-all'] = True
+                               record.data['action-level'] = 1
                        elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
                                print "checkStageAndTime: second message in one week"
                                record.data['email'] = TECH 
@@ -341,11 +351,13 @@ class MonitorMergeDiagnoseSendEscellate:
                                record.data['second-mail-at-oneweek'] = True
                                record.data['takeaction'] = False
                                record.data['save-act-all'] = True
+                               record.data['action-level'] = 0
                        else:
                                record.data['message'] = None
                                record.data['action'] = ['waitforoneweekaction' ]
                                record.data['takeaction'] = False
                                record.data['save-act-all'] = False
+                               record.data['action-level'] = 0
                                print "checkStageAndTime: ignoring this record for: %s" % self.hostname
                                #return None                    # don't send if there's no action
 
@@ -359,6 +371,7 @@ class MonitorMergeDiagnoseSendEscellate:
                                record.data['time'] = current_time              # reset clock for waitforever
                                record.data['takeaction'] = True
                                record.data['save-act-all'] = True
+                               record.data['action-level'] = 2
                        elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
                                print "checkStageAndTime: second message in one week for stage two"
                                record.data['email'] = TECH | PI
@@ -367,12 +380,14 @@ class MonitorMergeDiagnoseSendEscellate:
                                record.data['second-mail-at-twoweeks'] = True
                                record.data['takeaction'] = False
                                record.data['save-act-all'] = True
+                               record.data['action-level'] = 1
                        else:
                                record.data['message'] = None
                                record.data['takeaction'] = False
                                record.data['action'] = ['waitfortwoweeksaction']
                                record.data['save-act-all'] = False
                                print "checkStageAndTime: second message in one week for stage two"
+                               record.data['action-level'] = 1
                                #return None                    # don't send if there's no action
 
                elif 'ticket_waitforever' in record.data['stage']:
@@ -385,18 +400,21 @@ class MonitorMergeDiagnoseSendEscellate:
                                record.data['message'] = None
                                record.data['time'] = current_time
                                record.data['save-act-all'] = True
+                               record.data['action-level'] = 2
                        else:
                                if delta >= 7*SPERDAY:
                                        record.data['action'] = ['ticket_waitforever']
                                        record.data['message'] = None
                                        record.data['time'] = current_time              # reset clock
                                        record.data['save-act-all'] = True
+                                       record.data['action-level'] = 2
                                else:
                                        record.data['action'] = ['ticket_waitforever']
                                        record.data['message'] = None
                                        record.data['takeaction'] = False
                                        record.data['save-act-all'] = False
-                                       return None
+                                       record.data['action-level'] = 2
+                                       #return None
 
                elif 'waitforever' in record.data['stage']:
                        # more than 3 days since last action
@@ -408,11 +426,13 @@ class MonitorMergeDiagnoseSendEscellate:
                                record.data['message'] = record.data['message_series'][2]
                                record.data['time'] = current_time              # reset clock
                                record.data['save-act-all'] = True
+                               record.data['action-level'] = 2
                        else:
                                record.data['action'] = ['waitforever']
                                record.data['message'] = None
                                record.data['takeaction'] = False
                                record.data['save-act-all'] = False
+                               record.data['action-level'] = 2
                                #return None                    # don't send if there's no action
 
                else:
index cfbf112..f764a41 100644 (file)
@@ -22,7 +22,7 @@ As part of PlanetLab node monitoring, we noticed the following nodes were down a
 %(hostname_list)s 
 We're writing because we need your help returning them to their regular operation.
 
-To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return an 'Internal Server Error'.
+To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return the message 'could not find requested table - probably empty'.
 
        http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
 
@@ -51,7 +51,7 @@ As part of PlanetLab node monitoring, we noticed the following nodes were down a
 %(hostname_list)s 
 We're writing again because our previous correspondence, sent only to the registered Technical Contact, has gone unacknowledged for at least a week, and we need your help returning these machines to their regular operation.  We understand that machine maintenance can take time.  So, while we wait for the machines to return to their regular operation slice creation has been suspended at your site.  No new slices may be created, but the existing slices and services running within them will be unaffected.
 
-To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return an 'Internal Server Error'.
+To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return the message 'could not find requested table - probably empty'.
 
        http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
 
@@ -80,7 +80,7 @@ As part of PlanetLab node monitoring, we noticed the following nodes were down a
 %(hostname_list)s 
 We understand that machine maintenance can take time.  We're writing again because our previous correspondences, sent first to the registered Technical Contact then the the Site PI, have gone unacknowledged for at least two weeks, and we need your help returning these machines to their regular operation.  This is the third time attempting to contact someone in regard to these machines at your site.  So, while we wait for the machines to return to their regular operation all current slice activity will be suspended.  Current experiments will be stopped and will not be be able to start again until there is evidence that you have begun to help with the maintenance of these machines.
 
-To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return an 'Internal Server Error'.
+To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return the message 'could not find requested table - probably empty'.
 
        http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
 
index d859727..cfefc6a 100755 (executable)
@@ -228,6 +228,11 @@ if config.nodegroup:
        nodelist = api.GetNodes(ng[0]['node_ids'])
        hostnames = [ n['hostname'] for n in nodelist ]
 
+if config.site:
+       site = api.GetSites(config.site)
+       l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
+       hostnames = [ n['hostname'] for n in l_nodes ]
+
 if config.node or config.nodelist:
        if config.node: hostnames = [ config.node ] 
        else: hostnames = util.file.getListFromFile(config.nodelist)
@@ -339,10 +344,10 @@ for host in hostnames:
                                print "ALL METHODS OF RESTARTING %s FAILED" % host
                                args = {}
                                args['hostname'] = host
-                               m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
-                                                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
-                               m.reset()
-                               m.send(['monitor-list@lists.planet-lab.org'])
+                               #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
+                               #                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
+                               #m.reset()
+                               #m.send(['monitor-list@lists.planet-lab.org'])
 
                        l = Log(host, record)
                        print l
index a8b82ea..624ee2c 100644 (file)
@@ -4,6 +4,7 @@ import reboot
 import time
 import util.file
 import plc
+from datetime import datetime 
 from monitor import database
 from unified_model import PersistFlags
 esc = struct.pack('i', 27)
index 63b049c..ea69d6b 100755 (executable)
@@ -4,6 +4,7 @@ import sys
 import os
 from sets import Set
 import parser as parsermodule
+import util.file
 
 def main():
        parser = parsermodule.getParser()
@@ -16,8 +17,8 @@ def main():
        f1 = config.args[0]
        f2 = config.args[1]
 
-       s1 = config.getListFromFile(f1)
-       s2 = config.getListFromFile(f2)
+       s1 = util.file.getListFromFile(f1)
+       s2 = util.file.getListFromFile(f2)
 
        s = nodesets(config.operation, s1, s2)
 
@@ -44,3 +45,6 @@ def nodesets(operation, s1, s2):
                print "Unknown operation: %s " % operation
        
        return []
+
+if __name__ == "__main__":
+       main()
index 35d6973..bdbc993 100755 (executable)
--- a/rtinfo.py
+++ b/rtinfo.py
@@ -11,7 +11,7 @@ for id in sql.keys():
        #print sql[id].keys()
        #sys.exit(1)
        key = "%(queue)s-%(owner)s-%(status)s-%(lastupdated)s-%(email)-30s-%(subj)s" % sql[id]
-       sortkeys[key] = "%(queue)s %(lastupdated)s %(status)6s %(email)-25s %(owner)6s %(subj)26.26s https://rt.planet-lab.org/Ticket/Display.html?id=%(ticket_id)s" % sql[id]
+       sortkeys[key] = "%(ticket_id)s %(queue)s %(lastupdated)s %(status)6s %(email)-25s %(owner)6s %(subj)26.26s https://rt.planet-lab.org/Ticket/Display.html?id=%(ticket_id)s" % sql[id]
        #sortkeys[key] = "%(ticket_id)s %(status)6s %(email)-30s %(lastupdated)s %(subj)s" % sql[id]
 
 keys = sortkeys.keys()
index 10367e4..af01bd7 100755 (executable)
@@ -29,11 +29,11 @@ def gethardwarequality(nodename, fb):
                for field in ['cpuspeed', 'memsize', 'disksize']:
                        if field not in cstat: cstat[field] = "null"
 
-               if cstat['cpuspeed'] != "null" and float(cstat['cpuspeed']) < 2.4:
+               if cstat['cpuspeed'] != "null" and float(cstat['cpuspeed']) < 2.2:
                        return "BAD" # "cpu_slow",
-               if cstat['memsize'] != "null" and float(cstat['memsize']) < 2.9:
+               if cstat['memsize'] != "null" and float(cstat['memsize']) < 2.8:
                        return "BAD" # "mem_small",
-               if cstat['disksize'] != "null" and float(cstat['disksize']) < 320.0:
+               if cstat['disksize'] != "null" and float(cstat['disksize']) < 300.0:
                        return "BAD" # "disk_small",
 
                if cstat['disksize'] == "null" and \
@@ -42,9 +42,9 @@ def gethardwarequality(nodename, fb):
                        return "N/A"
 
                try:
-                       if  float(cstat['cpuspeed']) >= 2.4 and \
-                               float(cstat['memsize']) >= 2.9 and \
-                               (cstat['disksize'] == "null" or float(cstat['disksize']) >= 320.0):
+                       if  float(cstat['cpuspeed']) >= 2.2 and \
+                               float(cstat['memsize']) >= 2.8 and \
+                               (cstat['disksize'] == "null" or float(cstat['disksize']) >= 300.0):
                                return "A-OK"
                except:
                        print cstat
diff --git a/todo b/todo
index d7370ef..ae180a8 100644 (file)
--- a/todo
+++ b/todo
@@ -14,9 +14,9 @@ TODO:
        - testapi.py
        - findbad.py on sample site.
        - nodebad.py
+       - findbadpcus.py
        - nodequery.py
        - nodegroups.py
-       - findbadpcus.py
        - loads webpage for those retreived values to confirm setup succeeded.
 
  * reimplement the config.py / .config mechanism.  I'd like for many commands
index 8c5fb7f..e237bc9 100755 (executable)
@@ -3,8 +3,6 @@
 from monitor import database
 
 import plc
-api = plc.getAuthAPI()
-
 import mailer
 import time
 
@@ -15,9 +13,6 @@ import config
 
 def gethostlist(hostlist_file):
        return util.file.getListFromFile(hostlist_file)
-       
-       #nodes = api.GetNodes({'peer_id' : None}, ['hostname'])
-       #return [ n['hostname'] for n in nodes ]
 
 def array_to_priority_map(array):
        """ Create a mapping where each entry of array is given a priority equal
@@ -450,7 +445,7 @@ class Record(object):
 
        def getDaysDown(cls, diag_record):
                daysdown = -1
-               if diag_record['comonstats']['uptime'] != "null":
+               if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
                        daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
                #elif diag_record['comonstats']['sshstatus'] != "null":
                #       daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
@@ -504,7 +499,7 @@ class Record(object):
        #               return "%d days up"% -daysdown
        #getStrDaysDown = classmethod(getStrDaysDown)
 
-       def takeAction(self):
+       def takeAction(self, index=0):
                pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames')
                if 'improvement' in self.data['stage'] or self.improved() or \
                        'monitor-end-record' in self.data['stage']:
@@ -514,6 +509,7 @@ class Record(object):
                else:
                        print "takeAction: increasing penalty for %s"%self.hostname
                        pp.increase()
+               pp.index = index
                pp.apply(self.hostname)
                pp.save()