3 # Swap monitoring daemon. Every 30 seconds, checks process memory
4 # usage. At 90% utilization, resets the slice that is consuming the
5 # most physical memory. At 95% utilization, reboots the machine to
8 # Mark Huang <mlhuang@cs.princeton.edu>
9 # Andy Bavier <acb@cs.princeton.edu>
10 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
11 # Copyright (C) 2004-2006 The Trustees of Princeton University
25 # bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
34 DATAFILE = "/var/lib/misc/swapmon.dat"
36 # Seconds between process analysis
39 # Minimum change in swap utilization over 30 seconds that will trigger
40 # early process analysis.
43 # Swap utilization at which the largest consumer of physical memory is reset
46 # Swap utilization at which the machine is rebooted
49 # Don't email the same message more than once in the same emailtimeout interval
52 # Physical size threshold to be considered a consumer. Rationale is if there are no procs
53 # with a size at least as large as this, then there is a slow leaker; better to just reboot.
56 # System slices that should not be reset (regexps)
57 system_slices = ['root', PLC_SLICE_PREFIX + '_']
59 # Message sent after a critical reboot
60 rebooted_subject = "pl_mom rebooted %(hostname)s"
63 Sometime before %(date)s, swap space was
64 nearly exhausted on %(hostname)s, so pl_mom rebooted it.
66 Slices active prior to reboot are listed below. Memory usage
67 statistics are not entirely accurate due to threading.
71 %(date)s %(hostname)s reboot
74 # Message sent to system slices that should not be reset
75 alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
78 Sometime before %(date)s, swap space was
79 nearly exhausted on %(hostname)s.
81 System slice %(slice)s was the largest consumer of physical memory at
82 %(rss)s (%(percent)4.1f%%) (%(sz)s writable). It was not reset,
83 but please verify its behavior.
85 %(slice)s processes prior to alarm:
89 %(date)s %(hostname)s alarm %(slice)s
92 # Message sent after a slice has been killed
93 kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
96 Sometime before %(date)s, swap space was
97 nearly exhausted on %(hostname)s.
99 Slice %(slice)s was killed since it was the largest consumer of
100 physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable)
101 after repeated restarts.
103 Please reply to this message explaining the nature of your experiment,
104 and what you are doing to address the problem.
106 %(slice)s processes prior to reset:
110 %(date)s %(hostname)s reset %(slice)s
def killsliverprocs(xid):
    """
    Send signal 9 to context `xid` by running
    "/usr/sbin/vkill -s 9 -c <xid> 0" through bwlimit.run().

    NOTE(review): the trailing 0 presumably selects every process in
    the context -- confirm against the vkill manual.
    """
    command = "/usr/sbin/vkill -s 9 -c %s 0" % xid
    bwlimit.run(command)
119 Usage: %s [OPTIONS]...
122 -d, --debug Enable debugging (default: %s)
123 -v, --verbose Increase verbosity level (default: %d)
124 -f, --file=FILE Data file (default: %s)
125 -s, --slice=SLICE Constrain monitoring to these slices (default: all)
126 -p, --period=SECONDS Seconds between normal process analysis (default: %s)
127 --reset-thresh=PERCENT Swap utilization at which slice reset is attempted
128 --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted
129 --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog
130 --system-slice=SLICE System slice that should not be reset
131 --status Print memory usage statistics and exit
132 -h, --help This message
133 """.lstrip() % (sys.argv[0], debug, verbose, DATAFILE, format_period(period))
135 def slicestat(names = None):
137 Get status of specified slices (if names is None or empty, all
138 slices). vsize, sz, and rss are in KiB. Returns
139 PID CONTEXT VSZ SZ RSS %MEM CMD
140 {xid: {'xid': slice_id,
142 'procs': [{'pid': pid, 'xid': slice_id, 'cmd': command,
143 'vsize': virtual_kib, 'sz': potential_kib, 'rss': physical_kib,
144 'pcpu': cpu_percent, 'pmem': mem_percent}]
145 'vsize': total_virtual_kib,
146 'sz': total_potential_kib,
147 'rss': total_physical_kib}}
150 # Mandatory fields. xid is a virtual field inserted by vps. Make
151 # sure cmd is last so that it does not get truncated
153 fields = ['pid', 'xid', 'vsize', 'sz', 'rss', 'pmem', 'cmd']
155 # vps inserts xid after pid in the output, but ps doesn't know
156 # what the field means.
157 ps_fields = list(fields)
158 ps_fields.remove('xid')
162 # Eat the header line. vps depends on the header to figure out
163 # which column is the PID column, so we can't just tell ps not to
165 for line in bwlimit.run("/usr/sbin/vps -e -o " + ":16,".join(ps_fields))[1:]:
169 # Replace "0 MAIN" and "1 ALL_PROC" (the special monikers that
170 # vps uses to denote the root context and the "all contexts"
171 # context) with "0" so that we can just split() on whitespace.
172 line = line.replace("0 MAIN", "0").replace("1 ALL_PROC", "0")
174 # Represent process as a dict of fields
175 values = line.split(None, len(fields) - 1)
176 if len(values) != len(fields):
178 proc = dict(zip(fields, values))
180 # Convert ints and floats
183 proc[field] = int(proc[field])
186 proc[field] = float(proc[field])
190 # vps sometimes prints ERR or the name of the slice
191 # instead of a context ID if it
192 # cannot identify the context of an orphaned (usually dying)
193 # process. Skip these processes.
194 if (type(proc['xid']) != int) or (type(proc['vsize']) !=int):
197 # Assign (pl_)sshd processes to slice instead of root
198 m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd'])
201 xid = bwlimit.get_xid(m.group(1))
205 name = bwlimit.get_slice(proc['xid'])
207 # Orphaned (not associated with a slice) class
208 name = "%d?" % proc['xid']
210 # Monitor only the specified slices
211 if names and name not in names:
214 # Additional overhead calculations from slicestat
216 # Include 12 KiB of process overhead =
217 # 4 KiB top-level page table +
218 # 4 KiB kernel structure +
219 # 4 KiB basic page table
222 # Include additional page table overhead
224 if proc['vsize'] > 4096:
225 proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)
228 if slices.has_key(proc['xid']):
229 slice = slices[proc['xid']]
231 slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'sz': 0, 'rss': 0}
233 slice['procs'].append(proc)
234 slice['vsize'] += proc['vsize']
235 slice['sz'] += proc['sz']
236 slice['rss'] += proc['rss']
238 slices[proc['xid']] = slice
244 Returns total physical and swap memory on the system in KiB.
248 meminfo = open("/proc/meminfo", "r")
249 for line in meminfo.readlines():
251 (name, value, kb) = line.split()
254 if name == "MemTotal:":
256 elif name == "SwapTotal:":
263 Returns swap utilization on the system as a whole percentage (0-100).
268 swaps = open("/proc/swaps", "r")
270 lines = swaps.readlines()[1:]
273 # /dev/mapper/planetlab-swap partition 1048568 3740 -1
274 (filename, type, size, used, priority) = line.strip().split()
276 total_swap += int(size)
277 total_used += int(used)
278 except ValueEror, err:
280 except (IOError, KeyError), err: pass
282 swapused = 100 * total_used / total_swap
283 if debug: print "%s percent swap used" % swapused
def summary(slices = None, total_mem = None, total_swap = None):
    """
    Return a summary of memory usage by slice.

    slices     -- per-slice records keyed by xid, as produced by
                  slicestat(); if None or empty, slicestat() is called
    total_mem  -- total physical memory, same unit as the 'rss' values
    total_swap -- total swap, same unit as the 'sz' values; if either
                  total is None both are obtained from memtotal()

    Returns the table as a single string: a header line followed by one
    row per slice, ordered by potential usage ('sz'), largest first.
    """
    if not slices:
        slices = slicestat()
    if total_mem is None or total_swap is None:
        (total_mem, total_swap) = memtotal()

    # Largest 'sz' first. A key function replaces the cmp-style lambda;
    # the sort is stable, so slices with equal 'sz' keep their order.
    slicelist = sorted(slices.values(), key = lambda entry: entry['sz'], reverse = True)

    # Collect the rows and join once instead of growing a string in a loop.
    rows = ["%-20s%10s%24s%24s\n\n" % ("Slice", "Processes", "Memory Usage", "Potential Usage")]
    for entry in slicelist:
        # 'rss' and 'sz' are scaled by 1024 before being handed to
        # format_bytes() (presumably KiB -> bytes; confirm against slicestat()).
        rows.append("%-20s%10d%16s (%4.1f%%)%16s (%4.1f%%)\n" % (
            entry['name'],
            len(entry['procs']),
            format_bytes(entry['rss'] * 1024, si = False),
            100. * entry['rss'] / total_mem,
            format_bytes(entry['sz'] * 1024, si = False),
            100. * entry['sz'] / (total_mem + total_swap)))

    return "".join(rows)
306 def formtable(slice, percent):
308 Makes pretty message to email with human readable ps values.
310 table = "%5s %10s %10s %10s %4s %4s %s\n\n" % \
311 ("PID", "VIRT", "SZ", "RES", '%CPU', '%MEM', 'COMMAND')
312 for proc in slice['procs']:
313 table += "%5s %10s %10s %10s %4.1f %s\n" % \
315 format_bytes(proc['vsize'] * 1024, si = False),
316 format_bytes(proc['sz'] * 1024, si = False),
317 format_bytes(proc['rss'] * 1024, si = False),
321 prettytable = {'hostname': socket.gethostname(),
322 'date': time.asctime(time.gmtime()) + " GMT",
324 'slice': slice['name'],
325 'rss': format_bytes(slice['rss'] * 1024, si = False),
326 'sz': format_bytes(slice['sz'] * 1024, si = False),
332 Return dictionary of vps (slicestat) from datfile left behind by OOM
before rebooting. If there is no such file, just grab the latest dict (slicestat)
334 and return that. If dat file found, means we rebooted, send an email to
338 f = open(DATAFILE, "r+")
340 print "Loading %s" % DATAFILE
341 (v, slices) = pickle.load(f)
343 # Check version of data file
345 print "Not using old version '%s' data file %s" % (v, DATAFILE)
348 params = {'hostname': socket.gethostname(),
349 'date': time.asctime(time.gmtime()) + " GMT",
350 'table': summary(slices, total_mem, total_swap)}
352 print rebooted_subject % params
353 print rebooted_body % params
355 slicemail(None, rebooted_subject % params, rebooted_body % params)
def writedat(slices):
    """
    Write (slices) to pickled datfile.

    The tuple (VERSION, slices) is pickled into DATAFILE, replacing any
    previous contents. Errors from open() or pickle.dump() propagate to
    the caller.
    """
    if verbose:
        print("Saving %s" % DATAFILE)
    f = open(DATAFILE, "w")
    try:
        pickle.dump((VERSION, slices), f)
    finally:
        # Close even when pickling fails so the handle is not leaked and
        # whatever was buffered is flushed to the file.
        f.close()
377 global debug, verbose, DATAFILE, VERSION
378 global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
longopts = ["debug", "verbose", "file=", "slice=", "status", "help"]
# --change-thresh is handled in the option loop, so it has to be declared
# here; otherwise getopt rejects it before that branch can ever run.
longopts += ["period=", "change-thresh=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="]
# -p is the short form of --period=SECONDS and takes an argument ("p:").
(opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:p:h", longopts)
391 except getopt.GetoptError, err:
392 print "Error: " + err.msg
396 for (opt, optval) in opts:
397 if opt == "-d" or opt == "--debug":
399 elif opt == "-v" or opt == "--verbose":
401 elif opt == "-f" or opt == "--file":
403 elif opt == "-s" or opt == "--slice":
405 elif opt == "-p" or opt == "--period":
407 elif opt == "--change-thresh":
408 change_thresh = int(optval)
409 elif opt == "--reset-thresh":
410 reset_thresh = int(optval)
411 elif opt == "--reboot-thresh":
412 reboot_thresh = int(optval)
413 elif opt == "--min-thresh":
414 rss_min = int(optval)
415 elif opt == "--system-slice":
416 system_slices.append(optval)
417 elif opt == "--status":
418 print summary(slicestat(names))
424 # Check if we are already running
431 # Redirect stdout and stderr to syslog
432 syslog.openlog("swapmon")
433 sys.stdout = sys.stderr = Logger()
436 (total_mem, total_swap) = memtotal()
439 # Query process table every 30 seconds, or when a large change in
440 # swap utilization is detected.
444 if last_used is None: last_used = used
446 if used >= reboot_thresh:
447 # Dump slice state before rebooting
449 # Goodbye, cruel world
450 print "%d%% swap consumed, rebooting" % used
451 if not debug: bwlimit.run("/bin/sync; /sbin/reboot -f")
452 elif used >= reset_thresh:
454 slicelist = slices.values()
455 # Puts largest on top.
456 slicelist.sort(lambda a, b: b['rss'] - a['rss'])
457 for slice in slicelist:
458 percent = 100. * slice['rss'] / total_mem
459 if slice['rss'] < rss_min: continue
460 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
463 format_bytes(slice['rss'] * 1024, si = False),
465 slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
466 # Make a pretty table.
467 params = formtable(slice, percent)
468 # Match slice name against system slice patterns
469 is_system_slice = filter(None,
470 [re.match(pattern, slice['name']) for pattern in system_slices])
472 # Do not reset system slices, just warn once
474 if slice['name'] not in warned:
475 warned.append(slice['name'])
476 print "Warning slice " + slice['name']
478 print alarm_subject % params
479 print alarm_body % params
481 slicemail(slice['name'], alarm_subject % params,
# Mail the slice again only once email_timeout seconds have passed since the
# last kill notice. A slice that was never mailed defaults to timestamp 0 and
# is therefore always notified. The previous test compared the stored
# timestamp with "now + email_timeout", which could never succeed after the
# first mail.
if (time.time() - emailed.get(slice['name'], 0)) > email_timeout:
    slicemail(slice['name'], kill_subject % params, kill_body % params)
    emailed[slice['name']] = time.time()
490 print kill_subject % params
491 print kill_body % params
492 print "Killing procs in %s" % slice['name']
493 killsliverprocs(slice['xid'])
495 # wait period before recalculating swap. If in danger, recalc.
496 if timer <= 0 or used >= (last_used + change_thresh):
497 if used >= (last_used + change_thresh):
498 print "%d%% swap consumed, %d%% in last %d seconds" % \
499 (used, used - last_used, period - timer)
501 slices = slicestat(names)
504 # Keep track of large changes in swap utilization
511 if __name__ == '__main__':