3 # Swap monitoring daemon. Every 30 seconds, checks process memory
4 # usage. At 90% utilization, resets the slice that is consuming the
5 # most physical memory. At 95% utilization, reboots the machine to
8 # Mark Huang <mlhuang@cs.princeton.edu>
9 # Andy Bavier <acb@cs.princeton.edu>
10 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
11 # Copyright (C) 2004-2006 The Trustees of Princeton University
25 # util-vserver/python/vserver.py allows us to control slices directly
27 from vserver import VServer
29 # bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
38 DATAFILE = "/var/lib/misc/swapmon.dat"
40 # Seconds between process analysis
43 # Minimum change in swap utilization over 30 seconds that will trigger
44 # early process analysis.
47 # Swap utilization at which the largest consumer of physical memory is reset
50 # Swap utilization at which the machine is rebooted
53 # Time to wait before checking slice again after reset
56 # Don't email the same message more than once in the same emailtimeout interval
59 # Physical size threshold to be considered a consumer. Rationale is if there are no procs
60 # with a size at least as large as this, then there is a slow leaker; better to just reboot.
63 # System slices that should not be reset (regexps)
64 system_slices = ['root', PLC_SLICE_PREFIX + '_']
66 # Message sent after a critical reboot
67 rebooted_subject = "pl_mom rebooted %(hostname)s"
70 Sometime before %(date)s, swap space was
71 nearly exhausted on %(hostname)s, so pl_mom rebooted it.
73 Slices active prior to reboot are listed below. Memory usage
74 statistics are not entirely accurate due to threading.
78 %(date)s %(hostname)s reboot
81 # Message sent after a hog is reset
82 reset_subject = "pl_mom reset slice %(slice)s on %(hostname)s"
85 Sometime before %(date)s, swap space was
86 nearly exhausted on %(hostname)s.
88 Slice %(slice)s was reset since it was the largest consumer of
89 physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable).
91 Please reply to this message explaining the nature of your experiment,
92 and what you are doing to address the problem.
94 http://summer.cs.princeton.edu/status/tabulator.cgi?table=slices/table_%(slice)s
96 %(slice)s processes prior to reset:
100 %(date)s %(hostname)s reset %(slice)s
103 # Message sent to system slices that should not be reset
104 alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
107 Sometime before %(date)s, swap space was
108 nearly exhausted on %(hostname)s.
110 System slice %(slice)s was the largest consumer of physical memory at
111 %(rss)s (%(percent)4.1f%%) (%(sz)s writable). It was not reset,
112 but please verify its behavior.
114 %(slice)s processes prior to alarm:
118 %(date)s %(hostname)s alarm %(slice)s
121 # Message sent after a slice has been killed
122 kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
125 Sometime before %(date)s, swap space was
126 nearly exhausted on %(hostname)s.
128 Slice %(slice)s was killed since it was the largest consumer of
129 physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable)
130 after repeated restarts.
132 Please reply to this message explaining the nature of your experiment,
133 and what you are doing to address the problem.
135 %(slice)s processes prior to reset:
139 %(date)s %(hostname)s reset %(slice)s
146 Keeps track of state information for resets and kills
148 resettimeleft - timeout before checking for next reset
149 resetcount - number of strikes
150 killtimeleft - time out before removing from kill queue
151 {kill,reset}mail - Time of last email
152 kill - State of kill. If slice is already being killed, wait before retry.
155 def __init__(self,name):
164 def reset(self, params):
165 if self.resetcount == 0 or self.resettimeleft == 0:
166 print "%s has %s seconds to die and has been reset %s times" \
167 %(self.name, self.resettimeleft, self.resetcount)
169 print reset_subject % params
170 print reset_body % params
174 print "Resetting slice " + self.name
175 vserver = VServer(self.name)
181 except Exception, err:
182 print "Warning: Exception received while resetting slice %s:" \
184 if (time.time() - self.resetmail) > email_timeout:
185 slicemail(self.name, reset_subject % params, reset_body % params)
186 print "Sending Reset email for slice %s" % self.name
187 self.resetmail = time.time()
191 Usage: %s [OPTIONS]...
194 -d, --debug Enable debugging (default: %s)
195 -v, --verbose Increase verbosity level (default: %d)
196 -f, --file=FILE Data file (default: %s)
197 -s, --slice=SLICE Constrain monitoring to these slices (default: all)
198 -p, --period=SECONDS Seconds between normal process analysis (default: %s)
199 --reset-thresh=PERCENT Swap utilization at which slice reset is attempted
200 --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted
201 --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog
202 --system-slice=SLICE System slice that should not be reset
203 --status Print memory usage statistics and exit
204 -h, --help This message
205 """.lstrip() % (sys.argv[0], debug, verbose, DATAFILE, format_period(period))
207 def slicestat(names = None):
209 Get status of specified slices (if names is None or empty, all
210 slices). vsize, sz, and rss are in KiB. Returns
211 PID CONTEXT VSZ SZ RSS %MEM CMD
212 {xid: {'xid': slice_id,
214 'procs': [{'pid': pid, 'xid': slice_id, 'cmd': command,
215 'vsize': virtual_kib, 'sz': potential_kib, 'rss': physical_kib,
216 'pcpu': cpu_percent, 'pmem': mem_percent}]
217 'vsize': total_virtual_kib,
218 'sz': total_potential_kib,
219 'rss': total_physical_kib}}
222 # Mandatory fields. xid is a virtual field inserted by vps. Make
223 # sure cmd is last so that it does not get truncated
225 fields = ['pid', 'xid', 'vsize', 'sz', 'rss', 'pmem', 'cmd']
227 # vps inserts xid after pid in the output, but ps doesn't know
228 # what the field means.
229 ps_fields = list(fields)
230 ps_fields.remove('xid')
234 # Eat the header line. vps depends on the header to figure out
235 # which column is the PID column, so we can't just tell ps not to
237 for line in bwlimit.run("/usr/sbin/vps -e -o " + ":16,".join(ps_fields))[1:]:
241 # Replace "0 MAIN" and "1 ALL_PROC" (the special monikers that
242 # vps uses to denote the root context and the "all contexts"
243 # context) with "0" so that we can just split() on whitespace.
244 line = line.replace("0 MAIN", "0").replace("1 ALL_PROC", "0")
246 # Represent process as a dict of fields
247 values = line.split(None, len(fields) - 1)
248 if len(values) != len(fields):
250 proc = dict(zip(fields, values))
252 # Convert ints and floats
255 proc[field] = int(proc[field])
258 proc[field] = float(proc[field])
262 # vps sometimes prints ERR or the name of the slice
263 # instead of a context ID if it
264 # cannot identify the context of an orphaned (usually dying)
265 # process. Skip these processes.
266 if (type(proc['xid']) != int) or (type(proc['vsize']) !=int):
269 # Assign (pl_)sshd processes to slice instead of root
270 m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd'])
273 xid = bwlimit.get_xid(m.group(1))
277 name = bwlimit.get_slice(proc['xid'])
279 # Orphaned (not associated with a slice) class
280 name = "%d?" % proc['xid']
282 # Monitor only the specified slices
283 if names and name not in names:
286 # Additional overhead calculations from slicestat
288 # Include 12 KiB of process overhead =
289 # 4 KiB top-level page table +
290 # 4 KiB kernel structure +
291 # 4 KiB basic page table
294 # Include additional page table overhead
296 if proc['vsize'] > 4096:
297 proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)
300 if slices.has_key(proc['xid']):
301 slice = slices[proc['xid']]
303 slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'sz': 0, 'rss': 0}
305 slice['procs'].append(proc)
306 slice['vsize'] += proc['vsize']
307 slice['sz'] += proc['sz']
308 slice['rss'] += proc['rss']
310 slices[proc['xid']] = slice
316 Returns total physical and swap memory on the system in KiB.
320 meminfo = open("/proc/meminfo", "r")
321 for line in meminfo.readlines():
323 (name, value, kb) = line.split()
326 if name == "MemTotal:":
328 elif name == "SwapTotal:":
335 Returns swap utilization on the system as a whole percentage (0-100).
340 swaps = open("/proc/swaps", "r")
342 lines = swaps.readlines()[1:]
345 # /dev/mapper/planetlab-swap partition 1048568 3740 -1
346 (filename, type, size, used, priority) = line.strip().split()
348 total_swap += int(size)
349 total_used += int(used)
350 except ValueEror, err:
352 except (IOError, KeyError), err: pass
354 swapused = 100 * total_used / total_swap
355 if debug: print "%s percent swap used" % swapused
358 def summary(slices = None, total_mem = None, total_swap = None):
360 Return a summary of memory usage by slice.
362 if not slices: slices = slicestat()
363 slicelist = slices.values()
364 slicelist.sort(lambda a, b: b['sz'] - a['sz'])
365 if total_mem is None or total_swap is None:
366 (total_mem, total_swap) = memtotal()
368 table = "%-20s%10s%24s%24s\n\n" % ("Slice", "Processes", "Memory Usage", "Potential Usage")
369 for slice in slicelist:
370 table += "%-20s%10d%16s (%4.1f%%)%16s (%4.1f%%)\n" % \
371 (slice['name'], len(slice['procs']),
372 format_bytes(slice['rss'] * 1024, si = False),
373 100. * slice['rss'] / total_mem,
374 format_bytes(slice['sz'] * 1024, si = False),
375 100. * slice['sz'] / (total_mem + total_swap))
378 def formtable(slice, percent):
380 Makes pretty message to email with human readable ps values.
382 table = "%5s %10s %10s %10s %4s %4s %s\n\n" % \
383 ("PID", "VIRT", "SZ", "RES", '%CPU', '%MEM', 'COMMAND')
384 for proc in slice['procs']:
385 table += "%5s %10s %10s %10s %4.1f %s\n" % \
387 format_bytes(proc['vsize'] * 1024, si = False),
388 format_bytes(proc['sz'] * 1024, si = False),
389 format_bytes(proc['rss'] * 1024, si = False),
393 prettytable = {'hostname': socket.gethostname(),
394 'date': time.asctime(time.gmtime()) + " GMT",
396 'slice': slice['name'],
397 'rss': format_bytes(slice['rss'] * 1024, si = False),
398 'sz': format_bytes(slice['sz'] * 1024, si = False),
404 Return dictionary of vps (slicestat) from datfile left behind by OOM
405 before rebooting. If none file, just grab the latest dict (slicestat)
406 and return that. If dat file found, means we rebooted, send an email to
410 f = open(DATAFILE, "r+")
412 print "Loading %s" % DATAFILE
413 (v, slices) = pickle.load(f)
415 # Check version of data file
417 print "Not using old version '%s' data file %s" % (v, DATAFILE)
420 params = {'hostname': socket.gethostname(),
421 'date': time.asctime(time.gmtime()) + " GMT",
422 'table': summary(slices, total_mem, total_swap)}
424 print rebooted_subject % params
425 print rebooted_body % params
427 slicemail(None, rebooted_subject % params, rebooted_body % params)
437 def writedat(slices):
439 Write (slices) to pickled datfile.
441 if verbose: print "Saving %s" % DATAFILE
442 f = open(DATAFILE, "w")
443 pickle.dump((VERSION, slices), f)
449 global debug, verbose, DATAFILE, VERSION
450 global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
455 longopts = ["debug", "verbose", "file=", "slice=", "status", "help"]
456 longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="]
457 (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts)
458 except getopt.GetoptError, err:
459 print "Error: " + err.msg
463 for (opt, optval) in opts:
464 if opt == "-d" or opt == "--debug":
466 elif opt == "-v" or opt == "--verbose":
468 elif opt == "-f" or opt == "--file":
470 elif opt == "-s" or opt == "--slice":
472 elif opt == "-p" or opt == "--period":
474 elif opt == "--change-thresh":
475 change_thresh = int(optval)
476 elif opt == "--reset-thresh":
477 reset_thresh = int(optval)
478 elif opt == "--reboot-thresh":
479 reboot_thresh = int(optval)
480 elif opt == "--min-thresh":
481 rss_min = int(optval)
482 elif opt == "--system-slice":
483 system_slices.append(optval)
484 elif opt == "--status":
485 print summary(slicestat(names))
491 # Check if we are already running
498 # Redirect stdout and stderr to syslog
499 syslog.openlog("swapmon")
500 sys.stdout = sys.stderr = Logger()
503 (total_mem, total_swap) = memtotal()
506 # Query process table every 30 seconds, or when a large change in
507 # swap utilization is detected.
514 if last_used is None: last_used = used
517 if used >= reboot_thresh:
518 # Dump slice state before rebooting
520 # Goodbye, cruel world
521 print "%d%% swap consumed, rebooting" % used
522 if not debug: bwlimit.run("/bin/sync; /sbin/reboot -f")
523 elif used >= reset_thresh:
525 slicelist = slices.values()
526 # Puts largest on top.
527 slicelist.sort(lambda a, b: b['rss'] - a['rss'])
528 for slice in slicelist:
529 percent = 100. * slice['rss'] / total_mem
530 if slice['rss'] < rss_min: continue
531 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
534 format_bytes(slice['rss'] * 1024, si = False),
536 slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
537 # Make a pretty table.
538 params = formtable(slice, percent)
539 # Match slice name against system slice patterns
540 is_system_slice = filter(None,
541 [re.match(pattern, slice['name']) for pattern in system_slices])
543 # Do not reset system slices, just warn once
545 if slice['name'] not in warned:
546 warned.append(slice['name'])
547 print "Warning slice " + slice['name']
549 print alarm_subject % params
550 print alarm_body % params
552 slicemail(slice['name'], alarm_subject % params,
556 if not debug: slicemail(self.name, reset_subject % params, reset_body % params)
558 # wait period before recalculating swap. If in danger, recalc.
559 if timer <= 0 or used >= (last_used + change_thresh):
560 if used >= (last_used + change_thresh):
561 print "%d%% swap consumed, %d%% in last %d seconds" % \
562 (used, used - last_used, period - timer)
564 slices = slicestat(names)
567 # Keep track of large changes in swap utilization
574 if __name__ == '__main__':