3 # Swap monitoring daemon. Every 30 seconds, checks process memory
4 # usage. At 90% utilization, resets the slice that is consuming the
5 # most physical memory. At 95% utilization, reboots the machine.
8 # Mark Huang <mlhuang@cs.princeton.edu>
9 # Andy Bavier <acb@cs.princeton.edu>
10 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
11 # Copyright (C) 2004-2006 The Trustees of Princeton University
25 # bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
34 DATAFILE = "/var/lib/misc/swapmon.dat"
36 # Seconds between process analysis
39 # Minimum change in swap utilization over 30 seconds that will trigger
40 # early process analysis.
43 # Swap utilization at which the largest consumer of physical memory is reset
46 # Swap utilization at which the machine is rebooted
49 # Don't email the same message more than once in the same emailtimeout interval
52 # Physical size threshold to be considered a consumer. Rationale is if there are no procs
53 # with a size at least as large as this, then there is a slow leaker; better to just reboot.
56 # System slices that should not be reset (regexps)
57 system_slices = ['root', PLC_SLICE_PREFIX + '_']
59 # Message sent after a critical reboot
60 rebooted_subject = "pl_mom rebooted %(hostname)s"
63 Sometime before %(date)s, swap space was
64 nearly exhausted on %(hostname)s, so pl_mom rebooted it.
66 Slices active prior to reboot are listed below. Memory usage
67 statistics are not entirely accurate due to threading.
71 %(date)s %(hostname)s reboot
74 # Message sent to system slices that should not be reset
75 alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
78 Sometime before %(date)s, swap space was
79 nearly exhausted on %(hostname)s.
81 System slice %(slice)s was the largest consumer of physical memory at
82 %(rss)s (%(percent)4.1f%%) (%(sz)s writable). It was not reset,
83 but please verify its behavior.
85 %(slice)s processes prior to alarm:
89 %(date)s %(hostname)s alarm %(slice)s
92 # Message sent after a slice has been killed
93 kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
96 Sometime before %(date)s, swap space was
97 nearly exhausted on %(hostname)s.
99 Slice %(slice)s was killed since it was the largest consumer of
100 physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable)
101 after repeated restarts.
103 Please reply to this message explaining the nature of your experiment,
104 and what you are doing to address the problem.
106 %(slice)s processes prior to reset:
110 %(date)s %(hostname)s reset %(slice)s
def killsliverprocs(xid):
    """
    Kill the processes of a sliver by running vkill with signal 9
    against the context ID xid.
    """
    vkill_command = "/usr/sbin/vkill -s 9 -c %s 0" % xid
    bwlimit.run(vkill_command)
119 Usage: %s [OPTIONS]...
122 -d, --debug Enable debugging (default: %s)
123 -v, --verbose Increase verbosity level (default: %d)
124 -f, --file=FILE Data file (default: %s)
125 -s, --slice=SLICE Constrain monitoring to these slices (default: all)
126 -p, --period=SECONDS Seconds between normal process analysis (default: %s)
127 --reset-thresh=PERCENT Swap utilization at which slice reset is attempted
128 --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted
129 --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog
130 --system-slice=SLICE System slice that should not be reset
131 --status Print memory usage statistics and exit
132 --memstatus Print total memory, total swap, and swap used
133 -h, --help This message
134 """.lstrip() % (sys.argv[0], debug, verbose, DATAFILE, format_period(period))
# Runs vps, parses one process per output line, and accumulates the
# vsize, sz and rss of each process into a per-slice entry of a dict
# keyed by xid. NOTE(review): the return statement is not among the
# visible lines; presumably the accumulated dict is returned -- confirm.
136 def slicestat(names = None):
138 Get status of specified slices (if names is None or empty, all
139 slices). vsize, sz, and rss are in KiB. Returns
140 PID CONTEXT VSZ SZ RSS %MEM CMD
141 {xid: {'xid': slice_id,
143 'procs': [{'pid': pid, 'xid': slice_id, 'cmd': command,
144 'vsize': virtual_kib, 'sz': potential_kib, 'rss': physical_kib,
145 'pcpu': cpu_percent, 'pmem': mem_percent}]
146 'vsize': total_virtual_kib,
147 'sz': total_potential_kib,
148 'rss': total_physical_kib}}
151 # Mandatory fields. xid is a virtual field inserted by vps. Make
152 # sure cmd is last so that it does not get truncated
154 fields = ['pid', 'xid', 'vsname', 'vsize', 'sz', 'rss', 'pmem', 'cmd']
156 # vps inserts xid after pid in the output, but ps doesn't know
157 # what the field means.
158 ps_fields = list(fields)
159 ps_fields.remove('xid')
160 ps_fields.remove('vsname')
164 # Eat the header line. vps depends on the header to figure out
165 # which column is the PID column, so we can't just tell ps not to print it.
167 for line in bwlimit.run("/usr/sbin/vps -e -o " + ":16,".join(ps_fields))[1:]:
171 # Represent process as a dict of fields
172 values = line.split(None, len(fields) - 1)
173 if len(values) != len(fields):
175 pass # ignore spurious error message from vps
177 print "slicestat: failed to parse line: " + line
179 proc = dict(zip(fields, values))
181 # Convert ints and floats
184 proc[field] = int(proc[field])
187 proc[field] = float(proc[field])
191 # vps sometimes prints ERR or the name of the slice
192 # instead of a context ID if it
193 # cannot identify the context of an orphaned (usually dying)
194 # process. Skip these processes.
195 if (type(proc['xid']) != int) or (type(proc['vsize']) !=int):
197 pass # ignore spurious error message from vps
199 print "slicestat: failed to parse line: " + line
202 # Assign (pl_)sshd processes to slice instead of root
# NOTE(review): the capture group accepts only letters and underscores,
# so a slice name containing a digit or hyphen would be captured only up
# to that character before being passed to get_xid() -- confirm whether
# such slice names can occur.
203 m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd'])
206 xid = bwlimit.get_xid(m.group(1))
210 name = bwlimit.get_slice(proc['xid'])
212 # Orphaned (not associated with a slice) class
213 name = "%d?" % proc['xid']
215 # Monitor only the specified slices
216 if names and name not in names:
219 # Additional overhead calculations from slicestat
221 # Include 12 KiB of process overhead =
222 # 4 KiB top-level page table +
223 # 4 KiB kernel structure +
224 # 4 KiB basic page table
227 # Include additional page table overhead
229 if proc['vsize'] > 4096:
# vsize is an int at this point, so under Python 2 the "/" below is a
# truncating division: 4 KiB is added per whole 4096 KiB of (vsize - 1).
230 proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)
233 if slices.has_key(proc['xid']):
234 slice = slices[proc['xid']]
236 slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'sz': 0, 'rss': 0}
238 slice['procs'].append(proc)
239 slice['vsize'] += proc['vsize']
240 slice['sz'] += proc['sz']
241 slice['rss'] += proc['rss']
243 slices[proc['xid']] = slice
249 Returns total physical and swap memory on the system in KiB.
253 meminfo = open("/proc/meminfo", "r")
254 for line in meminfo.readlines():
256 (name, value, kb) = line.split()
259 if name == "MemTotal:":
261 elif name == "SwapTotal:":
def swap_used():
    """
    Returns swap utilization on the system as a whole percentage (0-100).

    Adds up the size and used columns of every entry in /proc/swaps.
    Entries that cannot be parsed are skipped. Returns 0 when
    /proc/swaps cannot be read or lists no swap space.
    """

    total_swap = 0
    total_used = 0

    try:
        swaps = open("/proc/swaps", "r")
        try:
            # Eat the header line
            lines = swaps.readlines()[1:]
        finally:
            swaps.close()
    except IOError:
        lines = []

    for line in lines:
        # /dev/mapper/planetlab-swap partition 1048568 3740 -1
        columns = line.strip().split()
        if len(columns) != 5:
            # Not a "filename type size used priority" entry
            continue
        try:
            size = int(columns[2])
            used = int(columns[3])
        except ValueError:
            # Non-numeric size/used column: ignore this entry rather than
            # let one bad line take the daemon down
            continue
        total_swap += size
        total_used += used

    if total_swap > 0:
        # Floor division keeps the result a whole percentage
        swapused = 100 * total_used // total_swap
    else:
        # No swap space found; dividing would raise ZeroDivisionError
        swapused = 0

    if debug: print("%s percent swap used" % swapused)
    return swapused
def summary(slices = None, total_mem = None, total_swap = None):
    """
    Return a summary of memory usage by slice.

    The result is a text table with one row per slice, ordered by
    potential usage (sz), largest first. Slice statistics and memory
    totals are looked up when the caller does not supply them.
    """

    if not slices:
        slices = slicestat()
    by_potential = sorted(slices.values(), key = lambda entry: entry['sz'], reverse = True)
    if total_mem is None or total_swap is None:
        (total_mem, total_swap) = memtotal()

    heading = "%-20s%10s%24s%24s\n\n" % ("Slice", "Processes", "Memory Usage", "Potential Usage")
    row_format = "%-20s%10d%16s (%4.1f%%)%16s (%4.1f%%)\n"

    rows = [heading]
    for entry in by_potential:
        # rss and sz are kept in KiB; format_bytes() is handed bytes
        physical = format_bytes(entry['rss'] * 1024, si = False)
        physical_pct = 100. * entry['rss'] / total_mem
        potential = format_bytes(entry['sz'] * 1024, si = False)
        potential_pct = 100. * entry['sz'] / (total_mem + total_swap)
        rows.append(row_format % (entry['name'], len(entry['procs']),
                                  physical, physical_pct,
                                  potential, potential_pct))
    return "".join(rows)
# Builds the e-mail parameters for one slice: a per-process table with
# human-readable sizes, plus hostname, date, slice name and the slice's
# rss/sz totals. The caller uses the result as the mapping for the
# alarm/kill subject and body templates. NOTE(review): the return
# statement is not among the visible lines; presumably prettytable is
# returned -- confirm.
311 def formtable(slice, percent):
313 Makes pretty message to email with human readable ps values.
315 table = "%5s %10s %10s %10s %4s %4s %s\n\n" % \
316 ("PID", "VIRT", "SZ", "RES", '%CPU', '%MEM', 'COMMAND')
317 for proc in slice['procs']:
# NOTE(review): the header row above names seven columns (including
# %CPU), but this row format has only six conversions, so data rows
# presumably stop lining up with the header from the %CPU column
# onward -- confirm and either drop the %CPU title or add the value.
318 table += "%5s %10s %10s %10s %4.1f %s\n" % \
320 format_bytes(proc['vsize'] * 1024, si = False),
321 format_bytes(proc['sz'] * 1024, si = False),
322 format_bytes(proc['rss'] * 1024, si = False),
# Values substituted into the message templates by the caller
326 prettytable = {'hostname': socket.gethostname(),
327 'date': time.asctime(time.gmtime()) + " GMT",
329 'slice': slice['name'],
330 'rss': format_bytes(slice['rss'] * 1024, si = False),
331 'sz': format_bytes(slice['sz'] * 1024, si = False),
337 Return dictionary of vps (slicestat) from datfile left behind by OOM
338 before rebooting. If there is no such file, just grab the latest dict (slicestat)
339 and return that. If the dat file is found, it means we rebooted, so send a notification email.
343 f = open(DATAFILE, "r+")
345 print "Loading %s" % DATAFILE
346 (v, slices) = pickle.load(f)
348 # Check version of data file
350 print "Not using old version '%s' data file %s" % (v, DATAFILE)
353 params = {'hostname': socket.gethostname(),
354 'date': time.asctime(time.gmtime()) + " GMT",
355 'table': summary(slices, total_mem, total_swap)}
357 print rebooted_subject % params
358 print rebooted_body % params
360 slicemail(None, rebooted_subject % params, rebooted_body % params)
def writedat(slices):
    """
    Write (slices) to pickled datfile.

    The pickle holds the tuple (VERSION, slices) and is written to
    DATAFILE, replacing any previous contents. Errors from open() or
    pickle.dump() propagate to the caller.
    """

    if verbose: print("Saving %s" % DATAFILE)
    f = open(DATAFILE, "w")
    try:
        pickle.dump((VERSION, slices), f)
    finally:
        # Always release the descriptor, even if pickling fails, so a
        # long-running daemon does not accumulate open files
        f.close()
382 global debug, verbose, DATAFILE, VERSION
383 global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
# NOTE(review): two mismatches between this option table and the option
# loop that follows it:
#  - "p" has no trailing ":" in the short-option string, so getopt treats
#    -p as a flag without a value, although the usage text documents
#    "-p, --period=SECONDS"; "p:" looks intended.
#  - "change-thresh=" is not listed in longopts, so getopt rejects
#    --change-thresh before the branch that handles it can run.
393 longopts = ["debug", "verbose", "file=", "slice=", "status", "memstatus", "help"]
394 longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="]
395 (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts)
396 except getopt.GetoptError, err:
397 print "Error: " + err.msg
401 for (opt, optval) in opts:
402 if opt == "-d" or opt == "--debug":
404 elif opt == "-v" or opt == "--verbose":
406 elif opt == "-f" or opt == "--file":
408 elif opt == "-s" or opt == "--slice":
410 elif opt == "-p" or opt == "--period":
412 elif opt == "--change-thresh":
413 change_thresh = int(optval)
414 elif opt == "--reset-thresh":
415 reset_thresh = int(optval)
416 elif opt == "--reboot-thresh":
417 reboot_thresh = int(optval)
418 elif opt == "--min-thresh":
419 rss_min = int(optval)
420 elif opt == "--system-slice":
421 system_slices.append(optval)
422 elif opt == "--status":
423 print summary(slicestat(names))
425 elif opt == "--memstatus":
426 (mem, swap) = memtotal()
427 swap_pct = swap_used()
428 print "memory total:", mem
429 print "swap total:", swap
430 print "swap used:", swap_pct
436 # Check if we are already running
443 # Redirect stdout and stderr to syslog
444 syslog.openlog("swapmon")
445 sys.stdout = sys.stderr = Logger()
448 (total_mem, total_swap) = memtotal()
451 # Query process table every 30 seconds, or when a large change in
452 # swap utilization is detected.
456 if last_used is None: last_used = used
458 if used >= reboot_thresh:
459 # Dump slice state before rebooting
461 # Goodbye, cruel world
462 print "%d%% swap consumed, rebooting" % used
463 if not debug: bwlimit.run("/bin/sync; /sbin/reboot -f")
464 elif used >= reset_thresh:
466 slicelist = slices.values()
467 # Puts largest on top.
468 slicelist.sort(lambda a, b: b['rss'] - a['rss'])
469 for slice in slicelist:
470 percent = 100. * slice['rss'] / total_mem
471 if slice['rss'] < rss_min: continue
472 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
475 format_bytes(slice['rss'] * 1024, si = False),
477 slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
478 # Make a pretty table.
479 params = formtable(slice, percent)
480 # Match slice name against system slice patterns
481 is_system_slice = filter(None,
482 [re.match(pattern, slice['name']) for pattern in system_slices])
484 # Do not reset system slices, just warn once
486 if slice['name'] not in warned:
487 warned.append(slice['name'])
488 print "Warning slice " + slice['name']
490 print alarm_subject % params
491 print alarm_body % params
493 slicemail(slice['name'], alarm_subject % params,
# NOTE(review): emailed[name] holds the time of the last e-mail, which is
# always in the past, so once a slice has an entry it can never exceed
# now + email_timeout again. As written, the kill e-mail goes out only
# the first time for each slice (unless its entry is removed elsewhere).
# If the intent is "at most once per email_timeout interval", the test
# would be time.time() - last_sent > email_timeout -- confirm.
498 if emailed.get(slice['name'], (time.time() + email_timeout + 1)) > (time.time() + email_timeout):
499 slicemail(slice['name'], kill_subject % params, kill_body % params)
500 emailed[slice['name']] = time.time()
502 print kill_subject % params
503 print kill_body % params
504 print "Killing procs in %s" % slice['name']
505 killsliverprocs(slice['xid'])
507 # wait period before recalculating swap. If in danger, recalc.
508 if timer <= 0 or used >= (last_used + change_thresh):
509 if used >= (last_used + change_thresh):
510 print "%d%% swap consumed, %d%% in last %d seconds" % \
511 (used, used - last_used, period - timer)
513 slices = slicestat(names)
516 # Keep track of large changes in swap utilization
523 if __name__ == '__main__':