3 # Swap monitoring daemon. Every 30 seconds, checks process memory
4 # usage. At 90% utilization, resets the slice that is consuming the
5 # most physical memory. At 95% utilization, reboots the machine to
8 # Mark Huang <mlhuang@cs.princeton.edu>
9 # Andy Bavier <acb@cs.princeton.edu>
10 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
11 # Copyright (C) 2004-2006 The Trustees of Princeton University
25 # bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
34 DATAFILE = "/var/lib/misc/swapmon.dat"
36 # Seconds between process analysis
39 # Minimum change in swap utilization over 30 seconds that will trigger
40 # early process analysis.
43 # Swap utilization at which the largest consumer of physical memory is reset
46 # Swap utilization at which the machine is rebooted
49 # Don't email the same message more than once in the same emailtimeout interval
52 # Physical size threshold to be considered a consumer. Rationale is if there are no procs
53 # with a size at least as large as this, then there is a slow leaker; better to just reboot.
56 # System slices that should not be reset (regexps)
57 system_slices = ['root', PLC_SLICE_PREFIX + '_']
59 # Message sent after a critical reboot
60 rebooted_subject = "pl_mom rebooted %(hostname)s"
63 Sometime before %(date)s, swap space was
64 nearly exhausted on %(hostname)s, so pl_mom rebooted it.
66 Slices active prior to reboot are listed below. Memory usage
67 statistics are not entirely accurate due to threading.
71 %(date)s %(hostname)s reboot
74 # Message sent to system slices that should not be reset
75 alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
78 Sometime before %(date)s, swap space was
79 nearly exhausted on %(hostname)s.
81 System slice %(slice)s was the largest consumer of physical memory at
82 %(rss)s (%(percent)4.1f%%) (%(sz)s writable). It was not reset,
83 but please verify its behavior.
85 %(slice)s processes prior to alarm:
89 %(date)s %(hostname)s alarm %(slice)s
92 # Message sent after a slice has been killed
93 kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
96 Sometime before %(date)s, swap space was
97 nearly exhausted on %(hostname)s.
99 Slice %(slice)s was killed since it was the largest consumer of
100 physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable)
101 after repeated restarts.
103 Please reply to this message explaining the nature of your experiment,
104 and what you are doing to address the problem.
106 %(slice)s processes prior to reset:
110 %(date)s %(hostname)s reset %(slice)s
def killsliverprocs(xid):
    """
    Kill the processes of the sliver running in context xid.

    Runs /usr/sbin/vkill with signal 9 (-s 9) against context xid (-c).
    NOTE(review): the trailing 0 is the pid argument; presumably it
    selects every process in the context -- confirm against vkill usage.
    """
    vkill_cmd = "/usr/sbin/vkill -s 9 -c %s 0" % xid
    bwlimit.run(vkill_cmd)
119 Usage: %s [OPTIONS]...
122 -d, --debug Enable debugging (default: %s)
123 -v, --verbose Increase verbosity level (default: %d)
124 -f, --file=FILE Data file (default: %s)
125 -s, --slice=SLICE Constrain monitoring to these slices (default: all)
126 -p, --period=SECONDS Seconds between normal process analysis (default: %s)
127 --reset-thresh=PERCENT Swap utilization at which slice reset is attempted
128 --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted
129 --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog
130 --system-slice=SLICE System slice that should not be reset
131 --status Print memory usage statistics and exit
132 --memstatus Print total memory, total swap, and swap used
133 -h, --help This message
134 """.lstrip() % (sys.argv[0], debug, verbose, DATAFILE, format_period(period))
136 def slicestat(names = None):
138 Get status of specified slices (if names is None or empty, all
139 slices). vsize, sz, and rss are in KiB. Returns
140 PID CONTEXT VSZ SZ RSS %MEM CMD
141 {xid: {'xid': slice_id,
143 'procs': [{'pid': pid, 'xid': slice_id, 'cmd': command,
144 'vsize': virtual_kib, 'sz': potential_kib, 'rss': physical_kib,
145 'pcpu': cpu_percent, 'pmem': mem_percent}]
146 'vsize': total_virtual_kib,
147 'sz': total_potential_kib,
148 'rss': total_physical_kib}}
151 # Mandatory fields. xid is a virtual field inserted by vps. Make
152 # sure cmd is last so that it does not get truncated
154 fields = ['pid', 'xid', 'vsname', 'vsize', 'sz', 'rss', 'pmem', 'cmd']
156 # vps inserts xid after pid in the output, but ps doesn't know
157 # what the field means.
158 ps_fields = list(fields)
159 ps_fields.remove('xid')
160 ps_fields.remove('vsname')
164 # Eat the header line. vps depends on the header to figure out
165 # which column is the PID column, so we can't just tell ps not to
167 for line in bwlimit.run("/usr/sbin/vps -e -o " + ":16,".join(ps_fields))[1:]:
171 # Represent process as a dict of fields
172 values = line.split(None, len(fields) - 1)
173 if len(values) != len(fields):
174 print "slicestat: failed to parse line:", line
176 proc = dict(zip(fields, values))
178 # Convert ints and floats
181 proc[field] = int(proc[field])
184 proc[field] = float(proc[field])
188 # vps sometimes prints ERR or the name of the slice
189 # instead of a context ID if it
190 # cannot identify the context of an orphaned (usually dying)
191 # process. Skip these processes.
192 if (type(proc['xid']) != int) or (type(proc['vsize']) !=int):
193 print "slicestat: failed to parse line:", line
196 # Assign (pl_)sshd processes to slice instead of root
197 m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd'])
200 xid = bwlimit.get_xid(m.group(1))
204 name = bwlimit.get_slice(proc['xid'])
206 # Orphaned (not associated with a slice) class
207 name = "%d?" % proc['xid']
209 # Monitor only the specified slices
210 if names and name not in names:
213 # Additional overhead calculations from slicestat
215 # Include 12 KiB of process overhead =
216 # 4 KiB top-level page table +
217 # 4 KiB kernel structure +
218 # 4 KiB basic page table
221 # Include additional page table overhead
223 if proc['vsize'] > 4096:
224 proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)
227 if slices.has_key(proc['xid']):
228 slice = slices[proc['xid']]
230 slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'sz': 0, 'rss': 0}
232 slice['procs'].append(proc)
233 slice['vsize'] += proc['vsize']
234 slice['sz'] += proc['sz']
235 slice['rss'] += proc['rss']
237 slices[proc['xid']] = slice
243 Returns total physical and swap memory on the system in KiB.
247 meminfo = open("/proc/meminfo", "r")
248 for line in meminfo.readlines():
250 (name, value, kb) = line.split()
253 if name == "MemTotal:":
255 elif name == "SwapTotal:":
262 Returns swap utilization on the system as a whole percentage (0-100).
267 swaps = open("/proc/swaps", "r")
269 lines = swaps.readlines()[1:]
272 # /dev/mapper/planetlab-swap partition 1048568 3740 -1
273 (filename, type, size, used, priority) = line.strip().split()
275 total_swap += int(size)
276 total_used += int(used)
277 except ValueEror, err:
279 except (IOError, KeyError), err: pass
281 swapused = 100 * total_used / total_swap
282 if debug: print "%s percent swap used" % swapused
def summary(slices = None, total_mem = None, total_swap = None):
    """
    Return a summary of memory usage by slice.

    slices is a dict shaped like the one slicestat() documents; when it
    is None or empty, slicestat() is called to obtain one. total_mem and
    total_swap are in KiB; when either is None, both are taken from
    memtotal(). The result is a text table with one row per slice,
    ordered by descending 'sz'.
    """
    if not slices:
        slices = slicestat()

    # Largest potential ('sz') consumer first.
    ordered = slices.values()
    ordered.sort(lambda a, b: b['sz'] - a['sz'])

    if total_mem is None or total_swap is None:
        (total_mem, total_swap) = memtotal()

    header = "%-20s%10s%24s%24s\n\n" % ("Slice", "Processes", "Memory Usage", "Potential Usage")
    rows = [header]
    for entry in ordered:
        rss_kib = entry['rss']
        sz_kib = entry['sz']
        # Sizes are tracked in KiB; format_bytes() is handed bytes.
        rows.append("%-20s%10d%16s (%4.1f%%)%16s (%4.1f%%)\n" % (
            entry['name'],
            len(entry['procs']),
            format_bytes(rss_kib * 1024, si = False),
            100. * rss_kib / total_mem,
            format_bytes(sz_kib * 1024, si = False),
            100. * sz_kib / (total_mem + total_swap)))

    return "".join(rows)
305 def formtable(slice, percent):
307 Makes pretty message to email with human readable ps values.
309 table = "%5s %10s %10s %10s %4s %4s %s\n\n" % \
310 ("PID", "VIRT", "SZ", "RES", '%CPU', '%MEM', 'COMMAND')
311 for proc in slice['procs']:
312 table += "%5s %10s %10s %10s %4.1f %s\n" % \
314 format_bytes(proc['vsize'] * 1024, si = False),
315 format_bytes(proc['sz'] * 1024, si = False),
316 format_bytes(proc['rss'] * 1024, si = False),
320 prettytable = {'hostname': socket.gethostname(),
321 'date': time.asctime(time.gmtime()) + " GMT",
323 'slice': slice['name'],
324 'rss': format_bytes(slice['rss'] * 1024, si = False),
325 'sz': format_bytes(slice['sz'] * 1024, si = False),
331 Return dictionary of vps (slicestat) from datfile left behind by OOM
332 before rebooting. If none file, just grab the latest dict (slicestat)
333 and return that. If dat file found, means we rebooted, send an email to
337 f = open(DATAFILE, "r+")
339 print "Loading %s" % DATAFILE
340 (v, slices) = pickle.load(f)
342 # Check version of data file
344 print "Not using old version '%s' data file %s" % (v, DATAFILE)
347 params = {'hostname': socket.gethostname(),
348 'date': time.asctime(time.gmtime()) + " GMT",
349 'table': summary(slices, total_mem, total_swap)}
351 print rebooted_subject % params
352 print rebooted_body % params
354 slicemail(None, rebooted_subject % params, rebooted_body % params)
def writedat(slices):
    """
    Write (slices) to pickled datfile.

    The data is stored in DATAFILE as the tuple (VERSION, slices).
    Errors from open() or pickle.dump() propagate to the caller.
    """
    if verbose:
        print("Saving %s" % DATAFILE)
    f = open(DATAFILE, "w")
    try:
        pickle.dump((VERSION, slices), f)
    finally:
        # Close the handle even if pickle.dump() raises, so buffered
        # data is flushed and the descriptor is not left open.
        f.close()
376 global debug, verbose, DATAFILE, VERSION
377 global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
387 longopts = ["debug", "verbose", "file=", "slice=", "status", "memstatus", "help"]
388 longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="]
389 (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts)
390 except getopt.GetoptError, err:
391 print "Error: " + err.msg
395 for (opt, optval) in opts:
396 if opt == "-d" or opt == "--debug":
398 elif opt == "-v" or opt == "--verbose":
400 elif opt == "-f" or opt == "--file":
402 elif opt == "-s" or opt == "--slice":
404 elif opt == "-p" or opt == "--period":
406 elif opt == "--change-thresh":
407 change_thresh = int(optval)
408 elif opt == "--reset-thresh":
409 reset_thresh = int(optval)
410 elif opt == "--reboot-thresh":
411 reboot_thresh = int(optval)
412 elif opt == "--min-thresh":
413 rss_min = int(optval)
414 elif opt == "--system-slice":
415 system_slices.append(optval)
416 elif opt == "--status":
417 print summary(slicestat(names))
419 elif opt == "--memstatus":
420 (mem, swap) = memtotal()
421 swap_pct = swap_used()
422 print "memory total:", mem
423 print "swap total:", swap
424 print "swap used:", swap_pct
430 # Check if we are already running
437 # Redirect stdout and stderr to syslog
438 syslog.openlog("swapmon")
439 sys.stdout = sys.stderr = Logger()
442 (total_mem, total_swap) = memtotal()
445 # Query process table every 30 seconds, or when a large change in
446 # swap utilization is detected.
450 if last_used is None: last_used = used
452 if used >= reboot_thresh:
453 # Dump slice state before rebooting
455 # Goodbye, cruel world
456 print "%d%% swap consumed, rebooting" % used
457 if not debug: bwlimit.run("/bin/sync; /sbin/reboot -f")
458 elif used >= reset_thresh:
460 slicelist = slices.values()
461 # Puts largest on top.
462 slicelist.sort(lambda a, b: b['rss'] - a['rss'])
463 for slice in slicelist:
464 percent = 100. * slice['rss'] / total_mem
465 if slice['rss'] < rss_min: continue
466 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
469 format_bytes(slice['rss'] * 1024, si = False),
471 slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
472 # Make a pretty table.
473 params = formtable(slice, percent)
474 # Match slice name against system slice patterns
475 is_system_slice = filter(None,
476 [re.match(pattern, slice['name']) for pattern in system_slices])
478 # Do not reset system slices, just warn once
480 if slice['name'] not in warned:
481 warned.append(slice['name'])
482 print "Warning slice " + slice['name']
484 print alarm_subject % params
485 print alarm_body % params
487 slicemail(slice['name'], alarm_subject % params,
492 if emailed.get(slice['name'], (time.time() + email_timeout + 1)) > (time.time() + email_timeout):
493 slicemail(slice['name'], kill_subject % params, kill_body % params)
494 emailed[slice['name']] = time.time()
496 print kill_subject % params
497 print kill_body % params
498 print "Killing procs in %s" % slice['name']
499 killsliverprocs(slice['xid'])
501 # wait period before recalculating swap. If in danger, recalc.
502 if timer <= 0 or used >= (last_used + change_thresh):
503 if used >= (last_used + change_thresh):
504 print "%d%% swap consumed, %d%% in last %d seconds" % \
505 (used, used - last_used, period - timer)
507 slices = slicestat(names)
510 # Keep track of large changes in swap utilization
517 if __name__ == '__main__':