- merge revision 1.6
author Mark Huang <mlhuang@cs.princeton.edu>
Mon, 17 Jul 2006 21:34:20 +0000 (21:34 +0000)
committer Mark Huang <mlhuang@cs.princeton.edu>
Mon, 17 Jul 2006 21:34:20 +0000 (21:34 +0000)
date: 2006/07/17 19:31:27;  author: faiyaza;  state: Exp;  lines: +139 -25
Added per-slice state information.  A slice is now killed once it has exceeded
the memory threshold more than twice.  Also fixed the slice restart issue where
a slice would be restarted repeatedly, sending a restart email each time.

swapmon.py

index 907f72f..7122481 100755 (executable)
@@ -7,9 +7,10 @@
 #
 # Mark Huang <mlhuang@cs.princeton.edu>
 # Andy Bavier <acb@cs.princeton.edu>
+# Faiyaz Ahmed <faiyaza@cs.princeton.edu>
 # Copyright (C) 2004-2006 The Trustees of Princeton University
 #
-# $Id: swapmon.py,v 1.5 2006/05/09 03:23:57 mlhuang Exp $
+# $Id: swapmon.py,v 1.6 2006/07/17 19:31:27 faiyaza Exp $
 #
 
 import syslog
@@ -44,11 +45,20 @@ period = 30
 change_thresh = 5
 
 # Swap utilization at which the largest consumer of physical memory is reset
-reset_thresh = 85
+reset_thresh = 75
 
 # Swap utilization at which the machine is rebooted
 reboot_thresh = 95
 
+# Time to wait before checking slice again after reset
+reset_timeout = 10
+
+# Number of strikes before killing (strike, strike, kill)
+kill_thresh = 2
+
+# Time to wait before removing slice from kill queue 
+kill_timeout = 30 
+
 # Minimum physical memory utilization to be considered the largest consumer
 min_thresh = 10
 
@@ -108,6 +118,117 @@ behavior.
 %(date)s %(hostname)s alarm %(slice)s
 """.lstrip()
 
+# Message sent after a slice has been killed (see Reset.checkkill)
+kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
+kill_body = \
+"""
+Sometime before %(date)s, swap space was
+nearly exhausted on %(hostname)s.
+
+Slice %(slice)s was killed since it was the largest consumer of
+physical memory at %(rss)s (%(percent)4.1f%%) after repeated restarts.
+
+Please reply to this message explaining the nature of your experiment,
+and what you are doing to address the problem.
+
+%(slice)s processes prior to kill:
+
+%(table)s
+
+%(date)s %(hostname)s kill %(slice)s
+""".lstrip()
+
+
+
+class Reset:
+    """
+    Keeps track of state information for resets and kills of one slice.
+
+    resettimeleft - update() calls left before the slice may be reset again
+    resetcount - number of strikes (resets counted toward a kill)
+    killtimeleft - update() calls left in the probation period, after
+                   which the strikes are cleared
+    """
+
+    def __init__(self, name):
+        self.name = name
+        self.resettimeleft = reset_timeout
+        self.resetcount = 0
+        self.killtimeleft = kill_timeout
+
+    def __repr__(self):
+        return self.name
+
+    def update(self):
+        """Count both timers down by one; clear strikes after probation."""
+        # Count down for next check of reset slice.
+        if self.resettimeleft > 0:
+            self.resettimeleft -= 1
+            if debug and verbose:
+                print "%s has %s seconds to die and has been reset %s times" \
+                    % (self.name, self.resettimeleft, self.resetcount)
+                print "%s has %s seconds in probation" \
+                    % (self.name, self.killtimeleft)
+        if self.killtimeleft > 0:
+            # Count down kill probation timer (killtimeleft)
+            self.killtimeleft -= 1
+        else:
+            # Once out of probation period (killtimeleft), remove strike
+            self.resetcount = 0
+
+    def _fork_and_wait(self, child, warning):
+        """
+        Run child() in a forked process and wait for it to finish.
+
+        warning is printed, followed by the exception, if forking, waiting
+        or child() itself fails.  The forked process always ends in
+        os._exit(), even when child() raised, so it can never return into
+        the caller's monitoring loop as a second daemon.
+        """
+        pid = None
+        try:
+            pid = os.fork()
+            if pid == 0:
+                child()
+            else:
+                os.waitpid(pid, 0)
+        except Exception, err:
+            print warning, err
+        # pid stays None when fork() itself failed; only the child exits.
+        if pid == 0:
+            os._exit(0)
+
+    def checkkill(self, params):
+        """
+        Check to see if the slice needs to be killed.  If it already has
+        kill_thresh or more strikes within the probation period
+        (kill_timeout), stop its vserver without restarting it and send
+        the kill message through slicemail().
+
+        params - values substituted into kill_subject and kill_body
+
+        Returns True if the slice was killed, False otherwise.
+        """
+        if self.killtimeleft <= 0 or self.resetcount < kill_thresh:
+            return False
+        if debug:
+            print kill_subject % params
+            print kill_body % params
+
+        def kill():
+            print "Slice %s is being killed." % self.name
+            VServer(self.name).stop()
+
+        self._fork_and_wait(kill,
+            "Warning: Exception received while killing slice %s:" % self.name)
+        slicemail(self.name, kill_subject % params, kill_body % params)
+        return True
+
+    def reset(self, params):
+        """
+        Restart the slice and send the reset message through slicemail(),
+        unless it has been reset before and is still inside the reset
+        timeout.  A slice that checkkill() decides to kill is not
+        restarted; otherwise the reset counts as one more strike.
+
+        params - values substituted into the reset and kill messages
+        """
+        # Only act on the first reset, or once the reset timeout of the
+        # previous reset has run out.
+        if self.resetcount != 0 and self.resettimeleft != 0:
+            return
+        # Do we need to kill this slice?  Check history first.
+        if self.checkkill(params):
+            return
+        # Update counters
+        self.resetcount += 1
+        self.killtimeleft = kill_timeout
+        self.resettimeleft = reset_timeout
+        # NOTE(review): the inline code this method replaces only printed
+        # the message in debug mode and skipped the restart and the mail;
+        # confirm that acting in debug mode is intended.
+        if debug:
+            print reset_subject % params
+            print reset_body % params
+
+        def restart():
+            print "Resetting slice " + self.name
+            vserver = VServer(self.name)
+            vserver.stop()
+            vserver.start(wait = False)
+
+        self._fork_and_wait(restart,
+            "Warning: Exception received while resetting slice %s:" % self.name)
+        slicemail(self.name, reset_subject % params, reset_body % params)
+
+
+
 def usage():
     print """
 Usage: %s [OPTIONS]...
@@ -349,7 +470,7 @@ def main():
         (version, slices) = pickle.load(f)
         f.close()
         # Check version of data file
-        if version != "$Id: swapmon.py,v 1.5 2006/05/09 03:23:57 mlhuang Exp $":
+        if version != "$Id: swapmon.py,v 1.6 2006/07/17 19:31:27 faiyaza Exp $":
             print "Not using old version '%s' data file %s" % (version, datafile)
             raise Exception
 
@@ -366,7 +487,7 @@ def main():
         # Delete data file
         os.unlink(datafile)
     except Exception:
-        version = "$Id: swapmon.py,v 1.5 2006/05/09 03:23:57 mlhuang Exp $"
+        version = "$Id: swapmon.py,v 1.6 2006/07/17 19:31:27 faiyaza Exp $"
         slices = {}
 
     # Query process table every 30 seconds, or when a large change in
@@ -378,11 +499,18 @@ def main():
     # System slices that we have warned but could not reset
     warned = []
 
+    # Slices that were reset
+    resetlist = {}
+
     while True:
         used = swap_used()
+
+       for resetslice in resetlist.keys():
+               resetlist[resetslice].update()
+       
         if last_used is None:
             last_used = used
-        if verbose:
+       if verbose:
             print "%d%% swap consumed" % used
 
         if used >= reboot_thresh:
@@ -404,9 +532,10 @@ def main():
             slicelist.sort(lambda a, b: b['rss'] - a['rss'])
             for slice in slicelist:
                 percent = 100. * slice['rss'] / total_rss
+
                 if percent < min_thresh:
                     continue
-
+               
                 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
                       (used,
                        slice['name'],
@@ -444,25 +573,10 @@ def main():
                             print "Warning slice " + slice['name']
                             slicemail(slice['name'], alarm_subject % params, alarm_body % params)
                 else:
-                    # Otherwise, reset
-                    if debug:
-                        print reset_subject % params
-                        print reset_body % params
-                    else:
-                        try:
-                            pid = os.fork()
-                            if pid == 0:
-                                print "Resetting slice " + slice['name']
-                                vserver = VServer(slice['name'])
-                                vserver.stop()
-                                vserver.start(wait = False)
-                                os._exit(0)
-                            else:
-                                os.waitpid(pid, 0)
-                        except Exception, err:
-                            print "Warning: Exception received while resetting slice %s:" % slice['name'], err
-                        slicemail(slice['name'], reset_subject % params, reset_body % params)
-                    break
+                       # Reset slice
+                       if not resetlist.has_key(slice['name']):
+                               resetlist[slice['name']] = Reset(slice['name'])
+                        resetlist[slice['name']].reset(params)
 
         elif timer <= 0 or used >= (last_used + change_thresh):
             if used >= (last_used + change_thresh):