From d12d2ba5e6dce633dc95fd53f07448f77ac0fe9c Mon Sep 17 00:00:00 2001
From: Faiyaz Ahmed
Date: Mon, 17 Jul 2006 19:31:27 +0000
Subject: [PATCH] Added state information per slice. Slices are now killed
 after consuming more memory than the threshold more than twice. Also fixed
 the slice restart issue where slices would be restarted and repeated restart
 emails would be sent.

---
Reviewer notes (text in this section is not part of the commit message):
 - Reset.checkkill: the forked child now calls os._exit(0) after stopping
   the vserver, as Reset.reset already does, so it no longer returns into
   the monitoring loop.
 - Reset.checkkill: the warning printed in the except clause had two %s
   but only one argument; the arguments are now passed as a tuple.
 - The comments above the Reset methods were turned into docstrings.
 - The two lines that differed only in whitespace ("if verbose:" and one
   blank line) are kept as context lines.
 - The blob id in the "index" line predates these changes.

 swapmon.py | 178 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 155 insertions(+), 23 deletions(-)

diff --git a/swapmon.py b/swapmon.py
index 3f6304e..a29a59e 100755
--- a/swapmon.py
+++ b/swapmon.py
@@ -7,9 +7,10 @@
 #
 # Mark Huang
 # Andy Bavier
+# Faiyaz Ahmed
 # Copyright (C) 2004-2006 The Trustees of Princeton University
 #
-# $Id: swapmon.py,v 1.4 2006/05/02 17:23:14 mlhuang Exp $
+# $Id: swapmon.py,v 1.5 2006/05/09 03:23:57 mlhuang Exp $
 #
 
 import syslog
@@ -44,11 +45,20 @@ period = 30
 change_thresh = 5
 
 # Swap utilization at which the largest consumer of physical memory is reset
-reset_thresh = 85
+reset_thresh = 75
 
 # Swap utilization at which the machine is rebooted
 reboot_thresh = 95
 
+# Time to wait before checking slice again after reset
+reset_timeout = 10
+
+# Number of strikes before killing (strike, strike, kill)
+kill_thresh = 2
+
+# Time to wait before removing slice from kill queue
+kill_timeout = 30
+
 # Minimum physical memory utilization to be considered the largest consumer
 min_thresh = 10
 
@@ -108,6 +118,135 @@ behavior.
 %(date)s %(hostname)s alarm %(slice)s
 """.lstrip()
 
+# Message sent after a slice has been killed
+kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
+kill_body = \
+"""
+Sometime before %(date)s, swap space was
+nearly exhausted on %(hostname)s.
+
+Slice %(slice)s was killed since it was the largest consumer of
+physical memory at %(rss)s (%(percent)4.1f%%) after repeated restarts.
+
+Please reply to this message explaining the nature of your experiment,
+and what you are doing to address the problem.
+
+%(slice)s processes prior to reset:
+
+%(table)s
+
+%(date)s %(hostname)s reset %(slice)s
+""".lstrip()
+
+
+class Reset:
+    """
+    Keeps track of state information for resets and kills of one slice
+
+    name          - name of the slice this state belongs to
+    resettimeleft - timeout before checking for next reset
+    resetcount    - number of strikes
+    killtimeleft  - time out before removing from kill queue
+    """
+
+    def __init__(self, name):
+        self.name = name
+        self.resettimeleft = reset_timeout
+        self.resetcount = 0
+        self.killtimeleft = kill_timeout
+
+    def __repr__(self):
+        return self.name
+
+    def update(self):
+        """
+        Advance both timers by one tick.  Once the probation period
+        (killtimeleft) has run out, the strikes are cleared.
+        """
+        # Count down for next check of reset slice.
+        if self.resettimeleft > 0:
+            self.resettimeleft -= 1
+            if debug and verbose:
+                print "%s has %s seconds to die and has been reset %s times" \
+                    % (self.name, self.resettimeleft, self.resetcount)
+                print "%s has %s seconds in probation" \
+                    % (self.name, self.killtimeleft)
+        if self.killtimeleft > 0:
+            # Count down kill probation timer (killtimeleft)
+            self.killtimeleft -= 1
+        else:
+            # Once out of probation period (killtimeleft), remove strike
+            self.resetcount = 0
+
+    def checkkill(self, params):
+        """
+        Check to see if a slice needs to be killed.  If it has been reset
+        at least kill_thresh times and is still in the probation period
+        (killtimeleft > 0), stop the slice and notify via slicemail.
+
+        params - values substituted into kill_subject and kill_body
+
+        Returns True if a kill was attempted, False otherwise.
+        """
+        if self.killtimeleft > 0 and self.resetcount >= kill_thresh:
+            if debug:
+                print kill_subject % params
+                print kill_body % params
+            try:
+                pid = os.fork()
+                if pid == 0:
+                    print "Slice %s is being killed." % self.name
+                    vserver = VServer(self.name)
+                    vserver.stop()
+                    # The child must exit here; otherwise it falls through
+                    # and keeps running as a second copy of the monitor.
+                    os._exit(0)
+                else:
+                    os.waitpid(pid, 0)
+            except Exception, err:
+                print "Warning: Exception received while killing slice %s: %s" \
+                    % (self.name, err)
+            slicemail(self.name, kill_subject % params, kill_body % params)
+            return True
+        return False
+
+    def reset(self, params):
+        """
+        Stop and restart the slice and notify via slicemail, unless it was
+        reset before and is still inside its reset timeout.  The history is
+        checked first: a slice that is out of strikes is killed instead.
+
+        params - values substituted into the subject and body templates
+        """
+        # If its the first reset or if its been reset before
+        # and we are out of the reset timeout.
+        if self.resetcount == 0 or self.resettimeleft == 0:
+            # Do we need to kill this slice?  Check history first.
+            if self.checkkill(params):
+                return
+            # Update counters
+            self.resetcount += 1
+            self.killtimeleft = kill_timeout
+            self.resettimeleft = reset_timeout
+            if debug:
+                print reset_subject % params
+                print reset_body % params
+            try:
+                pid = os.fork()
+                if pid == 0:
+                    print "Resetting slice " + self.name
+                    vserver = VServer(self.name)
+                    vserver.stop()
+                    vserver.start(wait = False)
+                    os._exit(0)
+                else:
+                    os.waitpid(pid, 0)
+            except Exception, err:
+                print "Warning: Exception received while resetting slice %s:" \
+                    % self.name, err
+            slicemail(self.name, reset_subject % params, reset_body % params)
+
+
 def usage():
     print """
 Usage: %s [OPTIONS]...
@@ -349,7 +488,7 @@ def main():
         (version, slices) = pickle.load(f)
         f.close()
         # Check version of data file
-        if version != "$Id: swapmon.py,v 1.4 2006/05/02 17:23:14 mlhuang Exp $":
+        if version != "$Id: swapmon.py,v 1.5 2006/05/09 03:23:57 mlhuang Exp $":
             print "Not using old version '%s' data file %s" % (version, datafile)
             raise Exception
 
@@ -366,7 +505,7 @@ def main():
         # Delete data file
         os.unlink(datafile)
     except Exception:
-        version = "$Id: swapmon.py,v 1.4 2006/05/02 17:23:14 mlhuang Exp $"
+        version = "$Id: swapmon.py,v 1.5 2006/05/09 03:23:57 mlhuang Exp $"
         slices = {}
 
     # Query process table every 30 seconds, or when a large change in
@@ -378,11 +517,18 @@ def main():
     # System slices that we have warned but could not reset
     warned = []
 
+    # Slices that were reset
+    resetlist = {}
+
     while True:
         used = swap_used()
+
+        for resetslice in resetlist.keys():
+            resetlist[resetslice].update()
+
         if last_used is None:
             last_used = used
         if verbose:
             print "%d%% swap consumed" % used
 
         if used >= reboot_thresh:
@@ -404,9 +550,10 @@ def main():
             slicelist.sort(lambda a, b: b['rss'] - a['rss'])
             for slice in slicelist:
                 percent = 100. * slice['rss'] / total_rss
+
                 if percent < min_thresh:
                     continue
 
                 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
                       (used,
                        slice['name'],
@@ -444,25 +591,10 @@ def main():
                     print "Warning slice " + slice['name']
                     slicemail(slice['name'], alarm_subject % params, alarm_body % params)
                 else:
-                    # Otherwise, reset
-                    if debug:
-                        print reset_subject % params
-                        print reset_body % params
-                    else:
-                        try:
-                            pid = os.fork()
-                            if pid == 0:
-                                print "Resetting slice " + slice['name']
-                                vserver = VServer(slice['name'])
-                                vserver.stop()
-                                vserver.start(wait = False)
-                                os._exit(0)
-                            else:
-                                os.waitpid(pid, 0)
-                        except Exception, err:
-                            print "Warning: Exception received while resetting slice %s:" % slice['name'], err
-                        slicemail(slice['name'], reset_subject % params, reset_body % params)
-                    break
+                    # Reset slice
+                    if not resetlist.has_key(slice['name']):
+                        resetlist[slice['name']] = Reset(slice['name'])
+                    resetlist[slice['name']].reset(params)
 
         elif timer <= 0 or used >= (last_used + change_thresh):
             if used >= (last_used + change_thresh):
-- 
2.43.0