M emailTxt.py

author Stephen Soltesz <soltesz@cs.princeton.edu>

Tue, 23 Sep 2008 19:53:34 +0000 (19:53 +0000)

committer Stephen Soltesz <soltesz@cs.princeton.edu>

Tue, 23 Sep 2008 19:53:34 +0000 (19:53 +0000)
author Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 23 Sep 2008 19:53:34 +0000 (19:53 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Tue, 23 Sep 2008 19:53:34 +0000 (19:53 +0000)
diff --git a/bootman.py b/bootman.py

index 87d8b71..faf77a2 100755 (executable)
--- a/bootman.py
+++ b/bootman.py
@@ -541,11 +541,11 @@ def reboot(hostname, config=None, forced_action=None):
         #  By using the sequence identifier, we guarantee that there will be no
         #  frequent loops.  I'm guessing there is a better way to track loops,
         #  though.
-       if not config.force and pflags.getRecentFlag(s):
-               pflags.setRecentFlag(s)
-               pflags.save() 
-               print "... flag is set or it has already run recently. Skipping %s" % node
-               return True
+       #if not config.force and pflags.getRecentFlag(s):
+       #       pflags.setRecentFlag(s)
+       #       pflags.save() 
+       #       print "... flag is set or it has already run recently. Skipping %s" % node
+       #       return True
  
         sequences = {}
  
diff --git a/clean_policy.py b/clean_policy.py

index a14016e..8e35903 100644 (file)
--- a/clean_policy.py
+++ b/clean_policy.py
@@ -84,6 +84,7 @@ class MonitorMergeDiagnoseSendEscellate:
                 fbnode['log'] = None
                 fbnode['time'] = time.time()
                 fbnode['email'] = TECH
+               fbnode['action-level'] = 0
                 fbnode['action'] = ['noop']
                 fbnode['date_created'] = time.time()
  
@@ -171,7 +172,7 @@ class MonitorMergeDiagnoseSendEscellate:
                 print "diagnose: checkStageAndTime Returned Valid Record"
                 site = PersistFlags(self.loginbase, 1, db='site_persistflags')
  
-               if site.status != "good":
+               if "good" not in site.status: #  != "good":
                         print "diagnose: Setting site %s for 'squeeze'" % self.loginbase
                         diag.setFlag('Squeeze')
                 else:
@@ -191,7 +192,9 @@ class MonitorMergeDiagnoseSendEscellate:
                 #print record.data['stage']
                 #print "improvement" in record.data['stage']
                 #print self.getSendEmailFlag(record)
-               if self.getSendEmailFlag(record) or "monitor-end-record" in record.data['stage']: 
+               print "%s %s DAYS DOWN" % ( self.hostname, Record.getDaysDown(record.data) )
+               if ( self.getSendEmailFlag(record) and Record.getDaysDown(record.data) >= 2 ) or \
+                       "monitor-end-record" in record.data['stage']:
                         print "action: getting message"
                         message = record.getMessage(record.data['ticket_id'])
                         if message:
@@ -206,10 +209,13 @@ class MonitorMergeDiagnoseSendEscellate:
                                         print "action: setting record ticket_id"
                                         record.data['ticket_id'] = message.rt.ticket_id
  
-                       if (record.data['takeaction'] and diag.getFlag('Squeeze') ) or diag.getFlag('BackOff'):
+                       if ( record.data['takeaction'] and diag.getFlag('Squeeze') ): 
                                 print "action: taking action"
-                               record.takeAction()
+                               record.takeAction(record.data['action-level'])
                                 diag.resetFlag('Squeeze')
+                               diag.save()
+                       if diag.getFlag('BackOff'):
+                               record.takeAction(0)
                                 diag.resetFlag('BackOff')
                                 diag.save()
  
@@ -306,6 +312,7 @@ class MonitorMergeDiagnoseSendEscellate:
                         record.data['message'] = record.data['message_series'][0]
                         record.data['stage'] = 'stage_actinoneweek'
                         record.data['save-act-all'] = True
+                       record.data['action-level'] = 0
  
                 elif 'reboot_node' in record.data['stage']:
                         record.data['email'] = TECH
@@ -314,6 +321,7 @@ class MonitorMergeDiagnoseSendEscellate:
                         record.data['stage'] = 'stage_actinoneweek'
                         record.data['takeaction'] = False
                         record.data['save-act-all'] = False
+                       record.data['action-level'] = 0
                         
                 elif 'improvement' in record.data['stage']:
                         print "checkStageAndTime: backing off of %s" % self.hostname
@@ -322,6 +330,7 @@ class MonitorMergeDiagnoseSendEscellate:
                         record.data['message'] = record.data['message_series'][0]
                         record.data['stage'] = 'monitor-end-record'
                         record.data['save-act-all'] = True
+                       record.data['action-level'] = 0
  
                 elif 'actinoneweek' in record.data['stage']:
                         if delta >= 7 * SPERDAY: 
@@ -333,6 +342,7 @@ class MonitorMergeDiagnoseSendEscellate:
                                 record.data['time'] = current_time              # reset clock for waitforever
                                 record.data['takeaction'] = True
                                 record.data['save-act-all'] = True
+                               record.data['action-level'] = 1
                         elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in record.data:
                                 print "checkStageAndTime: second message in one week"
                                 record.data['email'] = TECH 
@@ -341,11 +351,13 @@ class MonitorMergeDiagnoseSendEscellate:
                                 record.data['second-mail-at-oneweek'] = True
                                 record.data['takeaction'] = False
                                 record.data['save-act-all'] = True
+                               record.data['action-level'] = 0
                         else:
                                 record.data['message'] = None
                                 record.data['action'] = ['waitforoneweekaction' ]
                                 record.data['takeaction'] = False
                                 record.data['save-act-all'] = False
+                               record.data['action-level'] = 0
                                 print "checkStageAndTime: ignoring this record for: %s" % self.hostname
                                 #return None                    # don't send if there's no action
  
@@ -359,6 +371,7 @@ class MonitorMergeDiagnoseSendEscellate:
                                 record.data['time'] = current_time              # reset clock for waitforever
                                 record.data['takeaction'] = True
                                 record.data['save-act-all'] = True
+                               record.data['action-level'] = 2
                         elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in record.data:
                                 print "checkStageAndTime: second message in one week for stage two"
                                 record.data['email'] = TECH | PI
@@ -367,12 +380,14 @@ class MonitorMergeDiagnoseSendEscellate:
                                 record.data['second-mail-at-twoweeks'] = True
                                 record.data['takeaction'] = False
                                 record.data['save-act-all'] = True
+                               record.data['action-level'] = 1
                         else:
                                 record.data['message'] = None
                                 record.data['takeaction'] = False
                                 record.data['action'] = ['waitfortwoweeksaction']
                                 record.data['save-act-all'] = False
                                 print "checkStageAndTime: second message in one week for stage two"
+                               record.data['action-level'] = 1
                                 #return None                    # don't send if there's no action
  
                 elif 'ticket_waitforever' in record.data['stage']:
@@ -385,18 +400,21 @@ class MonitorMergeDiagnoseSendEscellate:
                                 record.data['message'] = None
                                 record.data['time'] = current_time
                                 record.data['save-act-all'] = True
+                               record.data['action-level'] = 2
                         else:
                                 if delta >= 7*SPERDAY:
                                         record.data['action'] = ['ticket_waitforever']
                                         record.data['message'] = None
                                         record.data['time'] = current_time              # reset clock
                                         record.data['save-act-all'] = True
+                                       record.data['action-level'] = 2
                                 else:
                                         record.data['action'] = ['ticket_waitforever']
                                         record.data['message'] = None
                                         record.data['takeaction'] = False
                                         record.data['save-act-all'] = False
-                                       return None
+                                       record.data['action-level'] = 2
+                                       #return None
  
                 elif 'waitforever' in record.data['stage']:
                         # more than 3 days since last action
@@ -408,11 +426,13 @@ class MonitorMergeDiagnoseSendEscellate:
                                 record.data['message'] = record.data['message_series'][2]
                                 record.data['time'] = current_time              # reset clock
                                 record.data['save-act-all'] = True
+                               record.data['action-level'] = 2
                         else:
                                 record.data['action'] = ['waitforever']
                                 record.data['message'] = None
                                 record.data['takeaction'] = False
                                 record.data['save-act-all'] = False
+                               record.data['action-level'] = 2
                                 #return None                    # don't send if there's no action
  
                 else:
diff --git a/emailTxt.py b/emailTxt.py

index cfbf112..f764a41 100644 (file)
--- a/emailTxt.py
+++ b/emailTxt.py
@@ -22,7 +22,7 @@ As part of PlanetLab node monitoring, we noticed the following nodes were down a
  %(hostname_list)s 
  We're writing because we need your help returning them to their regular operation.
  
-To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return an 'Internal Server Error'.
+To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return the message 'could not find requested table - probably empty'.
  
         http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
  
@@ -51,7 +51,7 @@ As part of PlanetLab node monitoring, we noticed the following nodes were down a
  %(hostname_list)s 
  We're writing again because our previous correspondence, sent only to the registered Technical Contact, has gone unacknowledged for at least a week, and we need your help returning these machines to their regular operation.  We understand that machine maintenance can take time.  So, while we wait for the machines to return to their regular operation slice creation has been suspended at your site.  No new slices may be created, but the existing slices and services running within them will be unaffected.
  
-To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return an 'Internal Server Error'.
+To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return the message 'could not find requested table - probably empty'.
  
         http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
  
@@ -80,7 +80,7 @@ As part of PlanetLab node monitoring, we noticed the following nodes were down a
  %(hostname_list)s 
  We understand that machine maintenance can take time.  We're writing again because our previous correspondences, sent first to the registered Technical Contact then the the Site PI, have gone unacknowledged for at least two weeks, and we need your help returning these machines to their regular operation.  This is the third time attempting to contact someone in regard to these machines at your site.  So, while we wait for the machines to return to their regular operation all current slice activity will be suspended.  Current experiments will be stopped and will not be be able to start again until there is evidence that you have begun to help with the maintenance of these machines.
  
-To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return an 'Internal Server Error'.
+To help, please confirm that a verison 3.0 or greater BootCD is installed in the machine.  Then, after checking that the node is properly networked, power cycle the machine.  Note that rebooting the machine may not fully resolve the problems we are seeing.  Once the machine has come back up, please visit the Comon status page to verify that your node is accessible from the network.  It may take several minutes before Comon registers your node.  Until that time, visiting the link below will return the message 'could not find requested table - probably empty'.
  
         http://summer.cs.princeton.edu/status/tabulator.cgi?table=nodes/table_%(hostname)s&limit=50
  
diff --git a/grouprins.py b/grouprins.py

index d859727..cfefc6a 100755 (executable)
--- a/grouprins.py
+++ b/grouprins.py
@@ -228,6 +228,11 @@ if config.nodegroup:
         nodelist = api.GetNodes(ng[0]['node_ids'])
         hostnames = [ n['hostname'] for n in nodelist ]
  
+if config.site:
+       site = api.GetSites(config.site)
+       l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname'])
+       hostnames = [ n['hostname'] for n in l_nodes ]
+
  if config.node or config.nodelist:
         if config.node: hostnames = [ config.node ] 
         else: hostnames = util.file.getListFromFile(config.nodelist)
@@ -339,10 +344,10 @@ for host in hostnames:
                                 print "ALL METHODS OF RESTARTING %s FAILED" % host
                                 args = {}
                                 args['hostname'] = host
-                               m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
-                                                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
-                               m.reset()
-                               m.send(['monitor-list@lists.planet-lab.org'])
+                               #m = PersistMessage(host, "ALL METHODS FAILED for %(hostname)s" % args,
+                               #                                                        "CANNOT CONTACT", False, db='suspect_persistmessages')
+                               #m.reset()
+                               #m.send(['monitor-list@lists.planet-lab.org'])
  
                         l = Log(host, record)
                         print l
diff --git a/nodecommon.py b/nodecommon.py

index a8b82ea..624ee2c 100644 (file)
--- a/nodecommon.py
+++ b/nodecommon.py
@@ -4,6 +4,7 @@ import reboot
  import time
  import util.file
  import plc
+from datetime import datetime 
  from monitor import database
  from unified_model import PersistFlags
  esc = struct.pack('i', 27)
diff --git a/nodesets.py b/nodesets.py

index 63b049c..ea69d6b 100755 (executable)
--- a/nodesets.py
+++ b/nodesets.py
@@ -4,6 +4,7 @@ import sys
  import os
  from sets import Set
  import parser as parsermodule
+import util.file
  
  def main():
         parser = parsermodule.getParser()
@@ -16,8 +17,8 @@ def main():
         f1 = config.args[0]
         f2 = config.args[1]
  
-       s1 = config.getListFromFile(f1)
-       s2 = config.getListFromFile(f2)
+       s1 = util.file.getListFromFile(f1)
+       s2 = util.file.getListFromFile(f2)
  
         s = nodesets(config.operation, s1, s2)
  
@@ -44,3 +45,6 @@ def nodesets(operation, s1, s2):
                 print "Unknown operation: %s " % operation
         
         return []
+
+if __name__ == "__main__":
+       main()
diff --git a/rtinfo.py b/rtinfo.py

index 35d6973..bdbc993 100755 (executable)
--- a/rtinfo.py
+++ b/rtinfo.py
@@ -11,7 +11,7 @@ for id in sql.keys():
         #print sql[id].keys()
         #sys.exit(1)
         key = "%(queue)s-%(owner)s-%(status)s-%(lastupdated)s-%(email)-30s-%(subj)s" % sql[id]
-       sortkeys[key] = "%(queue)s %(lastupdated)s %(status)6s %(email)-25s %(owner)6s %(subj)26.26s https://rt.planet-lab.org/Ticket/Display.html?id=%(ticket_id)s" % sql[id]
+       sortkeys[key] = "%(ticket_id)s %(queue)s %(lastupdated)s %(status)6s %(email)-25s %(owner)6s %(subj)26.26s https://rt.planet-lab.org/Ticket/Display.html?id=%(ticket_id)s" % sql[id]
         #sortkeys[key] = "%(ticket_id)s %(status)6s %(email)-30s %(lastupdated)s %(subj)s" % sql[id]
  
  keys = sortkeys.keys()
diff --git a/showlatlon.py b/showlatlon.py

index 10367e4..af01bd7 100755 (executable)
--- a/showlatlon.py
+++ b/showlatlon.py
@@ -29,11 +29,11 @@ def gethardwarequality(nodename, fb):
                 for field in ['cpuspeed', 'memsize', 'disksize']:
                         if field not in cstat: cstat[field] = "null"
  
-               if cstat['cpuspeed'] != "null" and float(cstat['cpuspeed']) < 2.4:
+               if cstat['cpuspeed'] != "null" and float(cstat['cpuspeed']) < 2.2:
                         return "BAD" # "cpu_slow",
-               if cstat['memsize'] != "null" and float(cstat['memsize']) < 2.9:
+               if cstat['memsize'] != "null" and float(cstat['memsize']) < 2.8:
                         return "BAD" # "mem_small",
-               if cstat['disksize'] != "null" and float(cstat['disksize']) < 320.0:
+               if cstat['disksize'] != "null" and float(cstat['disksize']) < 300.0:
                         return "BAD" # "disk_small",
  
                 if cstat['disksize'] == "null" and \
@@ -42,9 +42,9 @@ def gethardwarequality(nodename, fb):
                         return "N/A"
  
                 try:
-                       if  float(cstat['cpuspeed']) >= 2.4 and \
-                               float(cstat['memsize']) >= 2.9 and \
-                               (cstat['disksize'] == "null" or float(cstat['disksize']) >= 320.0):
+                       if  float(cstat['cpuspeed']) >= 2.2 and \
+                               float(cstat['memsize']) >= 2.8 and \
+                               (cstat['disksize'] == "null" or float(cstat['disksize']) >= 300.0):
                                 return "A-OK"
                 except:
                         print cstat
diff --git a/todo b/todo

index d7370ef..ae180a8 100644 (file)
--- a/todo
+++ b/todo
@@ -14,9 +14,9 @@ TODO:
         - testapi.py
         - findbad.py on sample site.
         - nodebad.py
+       - findbadpcus.py
         - nodequery.py
         - nodegroups.py
-       - findbadpcus.py
         - loads webpage for those retreived values to confirm setup succeeded.
  
   * reimplement the config.py / .config mechanism.  I'd like for many commands
diff --git a/unified_model.py b/unified_model.py

index 8c5fb7f..e237bc9 100755 (executable)
--- a/unified_model.py
+++ b/unified_model.py
@@ -3,8 +3,6 @@
  from monitor import database
  
  import plc
-api = plc.getAuthAPI()
-
  import mailer
  import time
  
@@ -15,9 +13,6 @@ import config
  
  def gethostlist(hostlist_file):
         return util.file.getListFromFile(hostlist_file)
-       
-       #nodes = api.GetNodes({'peer_id' : None}, ['hostname'])
-       #return [ n['hostname'] for n in nodes ]
  
  def array_to_priority_map(array):
         """ Create a mapping where each entry of array is given a priority equal
@@ -450,7 +445,7 @@ class Record(object):
  
         def getDaysDown(cls, diag_record):
                 daysdown = -1
-               if diag_record['comonstats']['uptime'] != "null":
+               if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
                         daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
                 #elif diag_record['comonstats']['sshstatus'] != "null":
                 #       daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
@@ -504,7 +499,7 @@ class Record(object):
         #               return "%d days up"% -daysdown
         #getStrDaysDown = classmethod(getStrDaysDown)
  
-       def takeAction(self):
+       def takeAction(self, index=0):
                 pp = PersistSitePenalty(self.hostname, 0, db='persistpenalty_hostnames')
                 if 'improvement' in self.data['stage'] or self.improved() or \
                         'monitor-end-record' in self.data['stage']:
@@ -514,6 +509,7 @@ class Record(object):
                 else:
                         print "takeAction: increasing penalty for %s"%self.hostname
                         pp.increase()
+               pp.index = index
                 pp.apply(self.hostname)
                 pp.save()
author	Stephen Soltesz <soltesz@cs.princeton.edu>
	Tue, 23 Sep 2008 19:53:34 +0000 (19:53 +0000)
committer	Stephen Soltesz <soltesz@cs.princeton.edu>
	Tue, 23 Sep 2008 19:53:34 +0000 (19:53 +0000)
bootman.py		patch \| blob \| history
clean_policy.py		patch \| blob \| history
emailTxt.py		patch \| blob \| history
grouprins.py		patch \| blob \| history
nodecommon.py		patch \| blob \| history
nodesets.py		patch \| blob \| history
rtinfo.py		patch \| blob \| history
showlatlon.py		patch \| blob \| history
todo		patch \| blob \| history
unified_model.py		patch \| blob \| history