3 # Swap monitoring daemon. Every 30 seconds, checks process memory
4 # usage. At 90% utilization, resets the slice that is consuming the
5 # most physical memory. At 95% utilization, reboots the machine.
8 # Mark Huang <mlhuang@cs.princeton.edu>
9 # Andy Bavier <acb@cs.princeton.edu>
10 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
11 # Copyright (C) 2004-2006 The Trustees of Princeton University
25 # util-vserver/python/vserver.py allows us to control slices directly
27 from vserver import VServer
29 # bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
38 DATAFILE = "/var/lib/misc/swapmon.dat"
40 # Seconds between process analysis
43 # Minimum change in swap utilization over 30 seconds that will trigger
44 # early process analysis.
47 # Swap utilization at which the largest consumer of physical memory is reset
50 # Swap utilization at which the machine is rebooted
53 # Don't email the same message more than once in the same emailtimeout interval
56 # Physical size threshold to be considered a consumer. Rationale is if there are no procs
57 # with a size at least as large as this, then there is a slow leaker; better to just reboot.
60 # System slices that should not be reset (regexps)
61 system_slices = ['root', PLC_SLICE_PREFIX + '_']
63 # Message sent after a critical reboot
64 rebooted_subject = "pl_mom rebooted %(hostname)s"
67 Sometime before %(date)s, swap space was
68 nearly exhausted on %(hostname)s, so pl_mom rebooted it.
70 Slices active prior to reboot are listed below. Memory usage
71 statistics are not entirely accurate due to threading.
75 %(date)s %(hostname)s reboot
78 # Message sent to system slices that should not be reset
79 alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
82 Sometime before %(date)s, swap space was
83 nearly exhausted on %(hostname)s.
85 System slice %(slice)s was the largest consumer of physical memory at
86 %(rss)s (%(percent)4.1f%%) (%(sz)s writable). It was not reset,
87 but please verify its behavior.
89 %(slice)s processes prior to alarm:
93 %(date)s %(hostname)s alarm %(slice)s
96 # Message sent after a slice has been killed
97 kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
100 Sometime before %(date)s, swap space was
101 nearly exhausted on %(hostname)s.
103 Slice %(slice)s was killed since it was the largest consumer of
104 physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable)
105 after repeated restarts.
107 Please reply to this message explaining the nature of your experiment,
108 and what you are doing to address the problem.
110 %(slice)s processes prior to reset:
114 %(date)s %(hostname)s reset %(slice)s
def killsliverprocs(xid):
    """
    Kill the processes of a slice by running vkill with signal 9
    against the given context.

    xid is substituted into the -c argument of the vkill command line.
    """
    command = "/usr/sbin/vkill -s 9 -c %s 0" % xid
    bwlimit.run(command)
123 Usage: %s [OPTIONS]...
126 -d, --debug Enable debugging (default: %s)
127 -v, --verbose Increase verbosity level (default: %d)
128 -f, --file=FILE Data file (default: %s)
129 -s, --slice=SLICE Constrain monitoring to these slices (default: all)
130 -p, --period=SECONDS Seconds between normal process analysis (default: %s)
131 --reset-thresh=PERCENT Swap utilization at which slice reset is attempted
132 --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted
133 --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog
134 --system-slice=SLICE System slice that should not be reset
135 --status Print memory usage statistics and exit
136 -h, --help This message
137 """.lstrip() % (sys.argv[0], debug, verbose, DATAFILE, format_period(period))
139 def slicestat(names = None):
141 Get status of specified slices (if names is None or empty, all
142 slices). vsize, sz, and rss are in KiB. Returns
143 PID CONTEXT VSZ SZ RSS %MEM CMD
144 {xid: {'xid': slice_id,
146 'procs': [{'pid': pid, 'xid': slice_id, 'cmd': command,
147 'vsize': virtual_kib, 'sz': potential_kib, 'rss': physical_kib,
148 'pcpu': cpu_percent, 'pmem': mem_percent}]
149 'vsize': total_virtual_kib,
150 'sz': total_potential_kib,
151 'rss': total_physical_kib}}
154 # Mandatory fields. xid is a virtual field inserted by vps. Make
155 # sure cmd is last so that it does not get truncated
157 fields = ['pid', 'xid', 'vsize', 'sz', 'rss', 'pmem', 'cmd']
159 # vps inserts xid after pid in the output, but ps doesn't know
160 # what the field means.
161 ps_fields = list(fields)
162 ps_fields.remove('xid')
166 # Eat the header line. vps depends on the header to figure out
167 # which column is the PID column, so we can't just tell ps not to
169 for line in bwlimit.run("/usr/sbin/vps -e -o " + ":16,".join(ps_fields))[1:]:
173 # Replace "0 MAIN" and "1 ALL_PROC" (the special monikers that
174 # vps uses to denote the root context and the "all contexts"
175 # context) with "0" so that we can just split() on whitespace.
176 line = line.replace("0 MAIN", "0").replace("1 ALL_PROC", "0")
178 # Represent process as a dict of fields
179 values = line.split(None, len(fields) - 1)
180 if len(values) != len(fields):
182 proc = dict(zip(fields, values))
184 # Convert ints and floats
187 proc[field] = int(proc[field])
190 proc[field] = float(proc[field])
194 # vps sometimes prints ERR or the name of the slice
195 # instead of a context ID if it
196 # cannot identify the context of an orphaned (usually dying)
197 # process. Skip these processes.
198 if (type(proc['xid']) != int) or (type(proc['vsize']) !=int):
201 # Assign (pl_)sshd processes to slice instead of root
202 m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd'])
205 xid = bwlimit.get_xid(m.group(1))
209 name = bwlimit.get_slice(proc['xid'])
211 # Orphaned (not associated with a slice) class
212 name = "%d?" % proc['xid']
214 # Monitor only the specified slices
215 if names and name not in names:
218 # Additional overhead calculations from slicestat
220 # Include 12 KiB of process overhead =
221 # 4 KiB top-level page table +
222 # 4 KiB kernel structure +
223 # 4 KiB basic page table
226 # Include additional page table overhead
228 if proc['vsize'] > 4096:
229 proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)
232 if slices.has_key(proc['xid']):
233 slice = slices[proc['xid']]
235 slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'sz': 0, 'rss': 0}
237 slice['procs'].append(proc)
238 slice['vsize'] += proc['vsize']
239 slice['sz'] += proc['sz']
240 slice['rss'] += proc['rss']
242 slices[proc['xid']] = slice
def memtotal():
    """
    Returns total physical and swap memory on the system in KiB.

    Reads the MemTotal: and SwapTotal: lines of /proc/meminfo and
    returns them as a (total_mem, total_swap) tuple of ints. A value
    whose line is not present is reported as 0.
    """
    total_mem = 0
    total_swap = 0

    meminfo = open("/proc/meminfo", "r")
    try:
        for line in meminfo:
            fields = line.split()
            # Only "Name: value kB" lines are of interest; anything else
            # (e.g. counters without a unit) is skipped.
            if len(fields) != 3:
                continue
            name = fields[0]
            value = fields[1]
            if name == "MemTotal:":
                total_mem = int(value)
            elif name == "SwapTotal:":
                total_swap = int(value)
    finally:
        # Release the handle even if a value fails to parse
        meminfo.close()

    return (total_mem, total_swap)
def swapused():
    """
    Returns swap utilization on the system as a whole percentage (0-100).

    Sums the size and used columns of every entry listed in /proc/swaps.
    Returns 0 when /proc/swaps cannot be read or lists no swap space,
    instead of failing with a division by zero.
    """
    total_swap = 0
    total_used = 0

    try:
        swaps = open("/proc/swaps", "r")
        try:
            # Eat the header line
            lines = swaps.readlines()[1:]
        finally:
            swaps.close()
    except IOError:
        lines = []

    for line in lines:
        # /dev/mapper/planetlab-swap partition 1048568 3740 -1
        try:
            (filename, kind, size, used, priority) = line.strip().split()
            size = int(size)
            used = int(used)
        except ValueError:
            # Wrong number of columns or non-numeric counters: skip entry
            continue
        total_swap += size
        total_used += used

    if total_swap <= 0:
        # No usable swap entries: nothing can be in use
        return 0

    # Floor division keeps the result a whole percentage
    percent_used = 100 * total_used // total_swap
    if debug:
        print("%s percent swap used" % percent_used)
    return percent_used
def summary(slices = None, total_mem = None, total_swap = None):
    """
    Return a summary of memory usage by slice.

    slices is a dictionary of per-slice statistics as produced by
    slicestat(); when it is None or empty, slicestat() is called.
    total_mem and total_swap are the system totals; when either is
    None, both are taken from memtotal().

    Returns a text table with one row per slice, ordered by decreasing
    potential usage (sz), showing the process count, the physical
    memory usage as a share of total_mem, and the potential usage as a
    share of total_mem + total_swap.
    """
    if not slices:
        slices = slicestat()
    if total_mem is None or total_swap is None:
        (total_mem, total_swap) = memtotal()

    # Largest potential consumers first. reverse = True keeps slices
    # with equal sz in their original relative order.
    slicelist = sorted(slices.values(), key = lambda entry: entry['sz'],
                       reverse = True)

    rows = ["%-20s%10s%24s%24s\n\n" % ("Slice", "Processes", "Memory Usage", "Potential Usage")]
    for entry in slicelist:
        rows.append("%-20s%10d%16s (%4.1f%%)%16s (%4.1f%%)\n" % \
                    (entry['name'], len(entry['procs']),
                     format_bytes(entry['rss'] * 1024, si = False),
                     100. * entry['rss'] / total_mem,
                     format_bytes(entry['sz'] * 1024, si = False),
                     100. * entry['sz'] / (total_mem + total_swap)))

    return "".join(rows)
def formtable(slice, percent):
    """
    Makes pretty message to email with human readable ps values.

    slice is one per-slice statistics dictionary (its 'name', 'procs',
    'rss' and 'sz' entries are read); percent is the slice's share of
    physical memory and is passed through unchanged.

    Returns a dictionary of parameters for the alarm and kill message
    templates: hostname, date, table, slice, rss, sz and percent.
    """
    # One heading per value printed for each process below. The process
    # records carry no CPU figure, so there is no %CPU column.
    rows = ["%5s %10s %10s %10s %4s %s\n\n" % \
            ("PID", "VIRT", "SZ", "RES", '%MEM', 'COMMAND')]
    for proc in slice['procs']:
        rows.append("%5s %10s %10s %10s %4.1f %s\n" % \
                    (proc['pid'],
                     format_bytes(proc['vsize'] * 1024, si = False),
                     format_bytes(proc['sz'] * 1024, si = False),
                     format_bytes(proc['rss'] * 1024, si = False),
                     proc['pmem'],
                     proc['cmd']))

    prettytable = {'hostname': socket.gethostname(),
                   'date': time.asctime(time.gmtime()) + " GMT",
                   'table': "".join(rows),
                   'slice': slice['name'],
                   'rss': format_bytes(slice['rss'] * 1024, si = False),
                   'sz': format_bytes(slice['sz'] * 1024, si = False),
                   'percent': percent}
    return prettytable
336 Return dictionary of vps (slicestat) from datfile left behind by OOM
337 before rebooting. If none file, just grab the latest dict (slicestat)
338 and return that. If dat file found, means we rebooted, send an email to
342 f = open(DATAFILE, "r+")
344 print "Loading %s" % DATAFILE
345 (v, slices) = pickle.load(f)
347 # Check version of data file
349 print "Not using old version '%s' data file %s" % (v, DATAFILE)
352 params = {'hostname': socket.gethostname(),
353 'date': time.asctime(time.gmtime()) + " GMT",
354 'table': summary(slices, total_mem, total_swap)}
356 print rebooted_subject % params
357 print rebooted_body % params
359 slicemail(None, rebooted_subject % params, rebooted_body % params)
def writedat(slices):
    """
    Write (slices) to pickled datfile.

    The data is stored in DATAFILE as a (VERSION, slices) tuple so that
    a reader can check which version wrote the file.
    """
    if verbose:
        print("Saving %s" % DATAFILE)
    f = open(DATAFILE, "w")
    try:
        pickle.dump((VERSION, slices), f)
    finally:
        # Always flush and release the file, even if pickling fails
        f.close()
381 global debug, verbose, DATAFILE, VERSION
382 global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
392 longopts = ["debug", "verbose", "file=", "slice=", "status", "help"]
393 longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="]
394 (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts)
395 except getopt.GetoptError, err:
396 print "Error: " + err.msg
400 for (opt, optval) in opts:
401 if opt == "-d" or opt == "--debug":
403 elif opt == "-v" or opt == "--verbose":
405 elif opt == "-f" or opt == "--file":
407 elif opt == "-s" or opt == "--slice":
409 elif opt == "-p" or opt == "--period":
411 elif opt == "--change-thresh":
412 change_thresh = int(optval)
413 elif opt == "--reset-thresh":
414 reset_thresh = int(optval)
415 elif opt == "--reboot-thresh":
416 reboot_thresh = int(optval)
417 elif opt == "--min-thresh":
418 rss_min = int(optval)
419 elif opt == "--system-slice":
420 system_slices.append(optval)
421 elif opt == "--status":
422 print summary(slicestat(names))
428 # Check if we are already running
435 # Redirect stdout and stderr to syslog
436 syslog.openlog("swapmon")
437 sys.stdout = sys.stderr = Logger()
440 (total_mem, total_swap) = memtotal()
443 # Query process table every 30 seconds, or when a large change in
444 # swap utilization is detected.
448 if last_used is None: last_used = used
451 if used >= reboot_thresh:
452 # Dump slice state before rebooting
454 # Goodbye, cruel world
455 print "%d%% swap consumed, rebooting" % used
456 if not debug: bwlimit.run("/bin/sync; /sbin/reboot -f")
457 elif used >= reset_thresh:
459 slicelist = slices.values()
460 # Puts largest on top.
461 slicelist.sort(lambda a, b: b['rss'] - a['rss'])
462 for slice in slicelist:
463 percent = 100. * slice['rss'] / total_mem
464 if slice['rss'] < rss_min: continue
465 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
468 format_bytes(slice['rss'] * 1024, si = False),
470 slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
471 # Make a pretty table.
472 params = formtable(slice, percent)
473 # Match slice name against system slice patterns
474 is_system_slice = filter(None,
475 [re.match(pattern, slice['name']) for pattern in system_slices])
477 # Do not reset system slices, just warn once
479 if slice['name'] not in warned:
480 warned.append(slice['name'])
481 print "Warning slice " + slice['name']
483 print alarm_subject % params
484 print alarm_body % params
486 slicemail(slice['name'], alarm_subject % params,
491 if emailed.get(slice['name'], (time.time() + email_timeout + 1)) > (time.time() + email_timeout):
492 slicemail(slice['name'], kill_subject % params, kill_body % params)
493 emailed[slice['name']] = time.time()
495 print kill_subject % params
496 print kill_body % params
497 print "Killing procs in %s" % slice['name']
498 killsliverprocs(slice['xid'])
500 # wait period before recalculating swap. If in danger, recalc.
501 if timer <= 0 or used >= (last_used + change_thresh):
502 if used >= (last_used + change_thresh):
503 print "%d%% swap consumed, %d%% in last %d seconds" % \
504 (used, used - last_used, period - timer)
506 slices = slicestat(names)
509 # Keep track of large changes in swap utilization
516 if __name__ == '__main__':