From: Mark Huang Date: Mon, 21 Aug 2006 21:27:43 +0000 (+0000) Subject: merge to HEAD as of 2006-08-21 X-Git-Tag: myplc-0_4-rc3~1 X-Git-Url: http://git.onelab.eu/?p=mom.git;a=commitdiff_plain;h=refs%2Fheads%2Fmyplc-0_4-branch merge to HEAD as of 2006-08-21 --- diff --git a/leak.c b/leak.c index 271d41b..d5798ec 100644 --- a/leak.c +++ b/leak.c @@ -4,7 +4,7 @@ * Mark Huang * Copyright (C) 2006 The Trustees of Princeton University * - * $Id$ + * $Id: leak.c,v 1.2 2006/07/25 18:09:19 faiyaza Exp $ */ #include @@ -16,17 +16,19 @@ int main(int argc, char *argv[]) { int rate = 16; + int size = 1; int leaked; for (;;) { int c, option_index = 0; static struct option long_options[] = { { "rate", required_argument, NULL, 'r' }, + { "size", optional_argument, NULL, 's' }, { "help", no_argument, NULL, 'h' }, { 0, 0, 0, 0 } }; - c = getopt_long(argc, argv, "r:h", long_options, &option_index); + c = getopt_long(argc, argv, "r:h:s", long_options, &option_index); if (c == -1) break; @@ -34,10 +36,14 @@ main(int argc, char *argv[]) case 'r': rate = atoi(optarg); break; + case 's': + size = atoi(optarg); + break; case 'h': default: fprintf(stderr, "Usage: %s [OPTION]...\n", argv[0]); fprintf(stderr, "\t-r, --rate=MiB/sec\tRate to leak memory in MiB/sec\n"); + fprintf(stderr, "\t-s, --size=MiB\tGrow to size and wait.\n"); return 0; } } @@ -46,7 +52,7 @@ main(int argc, char *argv[]) for (;;) { int i, bufsize = rate * 1024 * 1024; char *buf = malloc(bufsize); - if (buf) { + if (buf && (leaked <= size)) { /* Touch every page in the buffer */ for (i = 0; i < bufsize; i += 4096) buf[i] = 1; diff --git a/nm_mom.py b/nm_mom.py index dc18817..6aca03a 100755 --- a/nm_mom.py +++ b/nm_mom.py @@ -5,7 +5,7 @@ # Faiyaz Ahmed # Copyright (C) 2006 The Trustees of Princeton University # -# $Id: nm_mom.py,v 1.2 2006/08/17 20:12:09 faiyaza Exp $ +# $Id: nm_mom.py,v 1.3 2006/08/17 20:21:09 faiyaza Exp $ # import syslog diff --git a/pl_mom.cron b/pl_mom.cron index eb47892..4ef46c9 100644 --- a/pl_mom.cron +++ b/pl_mom.cron @@ -4,8 +4,9 @@ # Mark Huang # Copyright (C) 2005 The Trustees of Princeton University # -# $Id: pl_mop.cron,v 1.1 2005/10/11 17:34:57 mlhuang Exp $ +# $Id: pl_mom.cron,v 1.2 2006/08/17 20:06:26 faiyaza Exp $ # @M@ @H@ * * * root /usr/local/planetlab/bin/pl_mop.sh */15 * * * * root /usr/share/pl_mom/bwmon.py +*/15 * * * * root /usr/share/pl_mom/nm_mom.py diff --git a/swapmon.py b/swapmon.py index 671663d..c43dccb 100755 --- a/swapmon.py +++ b/swapmon.py @@ -10,7 +10,7 @@ # Faiyaz Ahmed # Copyright (C) 2004-2006 The Trustees of Princeton University # -# $Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $ +# $Id: swapmon.py,v 1.10 2006/08/16 16:18:45 faiyaza Exp $ # import syslog @@ -62,8 +62,9 @@ kill_timeout = 120 # Don't email the same message more than once in the same emailtimeout interval email_timeout = 1800 -# Minimum physical memory utilization to be considered the largest consumer -min_thresh = 10 +# Physical size threshold to be considered a consumer. Rationale is if there are no procs +# with a size at least as large as this, then there is a slow leaker; better to just reboot. +rss_min = 150 * 1024 # System slices that should not be reset (regexps) system_slices = ['root', PLC_SLICE_PREFIX + '_'] @@ -159,7 +160,6 @@ class Reset: self.resettimeleft = reset_timeout self.resetcount = 0 self.resetmail = 0 - self.kill = False self.killtimeleft = kill_timeout self.killmail = 0 @@ -181,15 +181,12 @@ class Reset: else: # Once out of probation period (killtimeleft), remove strikes self.resetcount = 0 - self.kill = False - # Check to see if a slice needs to be killed. If it has rules more than kill_thresh in - # the probation period (kill_timeout) send an email, kill the slice. + # Check to see if a slice needs to be killed. If it has been killed more + # than kill_thresh in the probation period (kill_timeout) send an email, kill the slice. def checkkill(self,params): - if self.killtimeleft > 0 and self.resetcount >= kill_thresh and \ - self.kill == False: - self.kill = True + if self.killtimeleft > 0 and self.resetcount >= kill_thresh: if debug: print kill_subject % params print kill_body % params @@ -215,7 +212,8 @@ class Reset: # Reset slice after checking to see if slice is out of timeout. # Increment resetcount, check to see if larger than kill_thresh. def reset(self, params): - # If its the first reset or if its been reset before + # If its the first reset (came back after kill) + # or if its been reset before # and we are out of the reset timeout. if self.resetcount == 0 or self.resettimeleft == 0: # Do we need to kill this slice? Check history first. @@ -428,7 +426,7 @@ def summary(names = None, total_rss = memtotal()): def main(): # Defaults global debug, verbose, datafile - global period, change_thresh, reset_thresh, reboot_thresh, min_thresh, system_slices + global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices # All slices names = [] @@ -458,8 +456,6 @@ def main(): reset_thresh = int(optval) elif opt == "--reboot-thresh": reboot_thresh = int(optval) - elif opt == "--min-thresh": - min_thresh = int(optval) elif opt == "--system-slice": system_slices.append(optval) elif opt == "--status": @@ -490,7 +486,7 @@ def main(): (version, slices) = pickle.load(f) f.close() # Check version of data file - if version != "$Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $": + if version != "$Id: swapmon.py,v 1.10 2006/08/16 16:18:45 faiyaza Exp $": print "Not using old version '%s' data file %s" % (version, datafile) raise Exception @@ -507,7 +503,7 @@ def main(): # Delete data file os.unlink(datafile) except Exception: - version = "$Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $" + version = "$Id: swapmon.py,v 1.10 2006/08/16 16:18:45 faiyaza Exp $" slices = {} # Query process table every 30 seconds, or when a large change in @@ -530,6 +526,7 @@ def main(): if last_used is None: last_used = used + if verbose: print "%d%% swap consumed" % used @@ -547,13 +544,15 @@ def main(): bwlimit.run("/bin/sync; /sbin/reboot -f") elif used >= reset_thresh: + if debug: + print "Memory used = %s" %(used) # Try and find a hog slicelist = slices.values() slicelist.sort(lambda a, b: b['rss'] - a['rss']) for slice in slicelist: percent = 100. * slice['rss'] / total_rss - if percent < min_thresh: + if slice['rss'] < rss_min: continue print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \ @@ -597,8 +596,9 @@ def main(): if not resetlist.has_key(slice['name']): resetlist[slice['name']] = Reset(slice['name']) resetlist[slice['name']].reset(params) + slices = slicestat(names) - elif timer <= 0 or used >= (last_used + change_thresh): + if timer <= 0 or used >= (last_used + change_thresh): if used >= (last_used + change_thresh): print "%d%% swap consumed, %d%% in last %d seconds" % \ (used, used - last_used, period - timer)