3 # Swap monitoring daemon. Every 30 seconds, checks process memory
4 # usage. At 90% utilization, resets the slice that is consuming the
5 # most physical memory. At 95% utilization, reboots the machine to
8 # Mark Huang <mlhuang@cs.princeton.edu>
9 # Andy Bavier <acb@cs.princeton.edu>
10 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
11 # Copyright (C) 2004-2006 The Trustees of Princeton University
23 # bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
24 import plnode.bwlimit as bwlimit
32 DATAFILE = "/var/lib/misc/swapmon.dat"
33 # xxx fixme - this is broken under git
35 # Seconds between process analysis
38 # Minimum change in swap utilization over 30 seconds that will trigger
39 # early process analysis.
42 # Swap utilization at which the largest consumer of physical memory is reset
45 # Swap utilization at which the machine is rebooted
48 # Don't email the same message more than once in the same emailtimeout interval
51 # Physical size threshold to be considered a consumer. Rationale is if there are no procs
52 # with a size at least as large as this, then there is a slow leaker; better to just reboot.
55 # System slices that should not be reset (regexps)
56 system_slices = ['root', PLC_SLICE_PREFIX + '_']
58 # Message sent after a critical reboot
59 rebooted_subject = "pl_mom rebooted %(hostname)s"
62 Sometime before %(date)s, swap space was
63 nearly exhausted on %(hostname)s, so pl_mom rebooted it.
65 Slices active prior to reboot are listed below. Memory usage
66 statistics are not entirely accurate due to threading.
70 %(date)s %(hostname)s reboot
73 # Message sent to system slices that should not be reset
74 alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
77 Sometime before %(date)s, swap space was
78 nearly exhausted on %(hostname)s.
80 System slice %(slice)s was the largest consumer of physical memory at
81 %(rss)s (%(percent)4.1f%%) (%(sz)s writable). It was not reset,
82 but please verify its behavior.
84 %(slice)s processes prior to alarm:
88 %(date)s %(hostname)s alarm %(slice)s
91 # Message sent after a slice has been killed
92 kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
95 Sometime before %(date)s, swap space was
96 nearly exhausted on %(hostname)s.
98 Slice %(slice)s was killed since it was the largest consumer of
99 physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable)
100 after repeated restarts.
102 Please reply to this message explaining the nature of your experiment,
103 and what you are doing to address the problem.
105 %(slice)s processes prior to reset:
109 %(date)s %(hostname)s reset %(slice)s
def killsliverprocs(xid):
    """
    Kill the processes of one slice: hands the context id (xid) to
    vkill together with signal 9.
    """
    vkill_cmd = "/usr/sbin/vkill -s 9 -c %s 0" % xid
    bwlimit.run(vkill_cmd)
118 Usage: %s [OPTIONS]...
121 -d, --debug Enable debugging (default: %s)
122 -v, --verbose Increase verbosity level (default: %d)
123 -f, --file=FILE Data file (default: %s)
124 -s, --slice=SLICE Constrain monitoring to these slices (default: all)
125 -p, --period=SECONDS Seconds between normal process analysis (default: %s)
126 --reset-thresh=PERCENT Swap utilization at which slice reset is attempted
127 --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted
128 --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog
129 --system-slice=SLICE System slice that should not be reset
130 --status Print memory usage statistics and exit
131 --memstatus Print total memory, total swap, and swap used
132 -h, --help This message
133 """.lstrip() % (sys.argv[0], debug, verbose, DATAFILE, format_period(period))
135 def slicestat(names = None):
137 Get status of specified slices (if names is None or empty, all
138 slices). vsize, sz, and rss are in KiB. Returns
139 PID CONTEXT VSZ SZ RSS %MEM CMD
140 {xid: {'xid': slice_id,
142 'procs': [{'pid': pid, 'xid': slice_id, 'cmd': command,
143 'vsize': virtual_kib, 'sz': potential_kib, 'rss': physical_kib,
144 'pcpu': cpu_percent, 'pmem': mem_percent}]
145 'vsize': total_virtual_kib,
146 'sz': total_potential_kib,
147 'rss': total_physical_kib}}
150 # Mandatory fields. xid is a virtual field inserted by vps. Make
151 # sure cmd is last so that it does not get truncated
153 fields = ['pid', 'xid', 'vsname', 'vsize', 'sz', 'rss', 'pmem', 'cmd']
155 # vps inserts xid after pid in the output, but ps doesn't know
156 # what the field means.
157 ps_fields = list(fields)
158 ps_fields.remove('xid')
159 ps_fields.remove('vsname')
163 # Eat the header line. vps depends on the header to figure out
164 # which column is the PID column, so we can't just tell ps not to
166 for line in bwlimit.run("/usr/sbin/vps -e -o " + ":16,".join(ps_fields))[1:]:
170 # Represent process as a dict of fields
171 values = line.split(None, len(fields) - 1)
172 if len(values) != len(fields):
174 pass # ignore spurious error message from vps
176 print "slicestat: failed to parse line: " + line
178 proc = dict(zip(fields, values))
180 # Convert ints and floats
183 proc[field] = int(proc[field])
186 proc[field] = float(proc[field])
190 # vps sometimes prints ERR or the name of the slice
191 # instead of a context ID if it
192 # cannot identify the context of an orphaned (usually dying)
193 # process. Skip these processes.
194 if (type(proc['xid']) != int) or (type(proc['vsize']) !=int):
196 pass # ignore spurious error message from vps
198 print "slicestat: failed to parse line: " + line
201 # Assign (pl_)sshd processes to slice instead of root
202 m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd'])
205 xid = bwlimit.get_xid(m.group(1))
209 name = bwlimit.get_slice(proc['xid'])
211 # Orphaned (not associated with a slice) class
212 name = "%d?" % proc['xid']
214 # Monitor only the specified slices
215 if names and name not in names:
218 # Additional overhead calculations from slicestat
220 # Include 12 KiB of process overhead =
221 # 4 KiB top-level page table +
222 # 4 KiB kernel structure +
223 # 4 KiB basic page table
226 # Include additional page table overhead
228 if proc['vsize'] > 4096:
229 proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)
232 if slices.has_key(proc['xid']):
233 slice = slices[proc['xid']]
235 slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'sz': 0, 'rss': 0}
237 slice['procs'].append(proc)
238 slice['vsize'] += proc['vsize']
239 slice['sz'] += proc['sz']
240 slice['rss'] += proc['rss']
242 slices[proc['xid']] = slice
248 Returns total physical and swap memory on the system in KiB.
252 meminfo = open("/proc/meminfo", "r")
253 for line in meminfo.readlines():
255 (name, value, kb) = line.split()
258 if name == "MemTotal:":
260 elif name == "SwapTotal:":
267 Returns swap utilization on the system as a whole percentage (0-100).
272 swaps = open("/proc/swaps", "r")
274 lines = swaps.readlines()[1:]
277 # /dev/mapper/planetlab-swap partition 1048568 3740 -1
278 (filename, type, size, used, priority) = line.strip().split()
280 total_swap += int(size)
281 total_used += int(used)
282 except ValueEror, err:
284 except (IOError, KeyError), err: pass
286 swapused = 100 * total_used / total_swap
287 if debug: print "%s percent swap used" % swapused
def summary(slices = None, total_mem = None, total_swap = None):
    """
    Return a summary of memory usage by slice.

    slices     -- mapping of slice records in the shape produced by
                  slicestat(); when None or empty, slicestat() is
                  called to take a fresh snapshot
    total_mem  -- total physical memory in KiB
    total_swap -- total swap space in KiB
                  (if either total is None, both come from memtotal())

    Returns a text table: a header line followed by one row per slice
    giving its name, process count, physical usage ('rss') as a share
    of total_mem and potential usage ('sz') as a share of
    total_mem + total_swap. Rows are ordered by 'sz', largest first.
    """
    if not slices:
        slices = slicestat()
    if total_mem is None or total_swap is None:
        (total_mem, total_swap) = memtotal()

    # Stable descending sort: slices with equal 'sz' keep their
    # original relative order, exactly as the old cmp-based sort did.
    ranked = sorted(slices.values(), key = lambda rec: rec['sz'], reverse = True)

    # Collect rows and join once instead of growing a string in the loop.
    rows = ["%-20s%10s%24s%24s\n\n" % ("Slice", "Processes", "Memory Usage", "Potential Usage")]
    for entry in ranked:
        # rss/sz are kept in KiB; format_bytes() is given bytes.
        rows.append("%-20s%10d%16s (%4.1f%%)%16s (%4.1f%%)\n" %
                    (entry['name'], len(entry['procs']),
                     format_bytes(entry['rss'] * 1024, si = False),
                     100. * entry['rss'] / total_mem,
                     format_bytes(entry['sz'] * 1024, si = False),
                     100. * entry['sz'] / (total_mem + total_swap)))

    # Callers print this value or embed it in the reboot mail.
    return "".join(rows)
310 def formtable(slice, percent):
312 Makes pretty message to email with human readable ps values.
314 table = "%5s %10s %10s %10s %4s %4s %s\n\n" % \
315 ("PID", "VIRT", "SZ", "RES", '%CPU', '%MEM', 'COMMAND')
316 for proc in slice['procs']:
317 table += "%5s %10s %10s %10s %4.1f %s\n" % \
319 format_bytes(proc['vsize'] * 1024, si = False),
320 format_bytes(proc['sz'] * 1024, si = False),
321 format_bytes(proc['rss'] * 1024, si = False),
325 prettytable = {'hostname': socket.gethostname(),
326 'date': time.asctime(time.gmtime()) + " GMT",
328 'slice': slice['name'],
329 'rss': format_bytes(slice['rss'] * 1024, si = False),
330 'sz': format_bytes(slice['sz'] * 1024, si = False),
336 Return dictionary of vps (slicestat) from datfile left behind by OOM
337 before rebooting. If none file, just grab the latest dict (slicestat)
338 and return that. If dat file found, means we rebooted, send an email to
342 f = open(DATAFILE, "r+")
344 print "Loading %s" % DATAFILE
345 (v, slices) = pickle.load(f)
347 # Check version of data file
349 print "Not using old version '%s' data file %s" % (v, DATAFILE)
352 params = {'hostname': socket.gethostname(),
353 'date': time.asctime(time.gmtime()) + " GMT",
354 'table': summary(slices, total_mem, total_swap)}
356 print rebooted_subject % params
357 print rebooted_body % params
359 slicemail(None, rebooted_subject % params, rebooted_body % params)
def writedat(slices):
    """
    Write (slices) to pickled datfile.

    The payload is the tuple (VERSION, slices), pickled into DATAFILE,
    which is opened for writing and therefore truncated first.
    Errors from open() or pickle.dump() propagate to the caller.
    """
    if verbose: print("Saving %s" % DATAFILE)
    f = open(DATAFILE, "w")
    try:
        pickle.dump((VERSION, slices), f)
    finally:
        # Always release the handle, even if pickling fails part-way.
        f.close()
381 global debug, verbose, DATAFILE, VERSION
382 global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
longopts = ["debug", "verbose", "file=", "slice=", "status", "memstatus", "help"]
# Every long option handled by the dispatch loop must be registered
# here, otherwise getopt rejects it before the loop ever sees it
# (this was the case for --change-thresh).
longopts += ["period=", "change-thresh=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="]
# -p takes a value (seconds), hence "p:" rather than a bare "p".
(opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:p:h", longopts)
395 except getopt.GetoptError, err:
396 print "Error: " + err.msg
400 for (opt, optval) in opts:
401 if opt == "-d" or opt == "--debug":
403 elif opt == "-v" or opt == "--verbose":
405 elif opt == "-f" or opt == "--file":
407 elif opt == "-s" or opt == "--slice":
409 elif opt == "-p" or opt == "--period":
411 elif opt == "--change-thresh":
412 change_thresh = int(optval)
413 elif opt == "--reset-thresh":
414 reset_thresh = int(optval)
415 elif opt == "--reboot-thresh":
416 reboot_thresh = int(optval)
417 elif opt == "--min-thresh":
418 rss_min = int(optval)
419 elif opt == "--system-slice":
420 system_slices.append(optval)
421 elif opt == "--status":
422 print summary(slicestat(names))
424 elif opt == "--memstatus":
425 (mem, swap) = memtotal()
426 swap_pct = swap_used()
427 print "memory total:", mem
428 print "swap total:", swap
429 print "swap used:", swap_pct
435 # Check if we are already running
442 # Redirect stdout and stderr to syslog
443 syslog.openlog("swapmon")
444 sys.stdout = sys.stderr = Logger()
447 (total_mem, total_swap) = memtotal()
450 # Query process table every 30 seconds, or when a large change in
451 # swap utilization is detected.
455 if last_used is None: last_used = used
457 if used >= reboot_thresh:
458 # Dump slice state before rebooting
460 # Goodbye, cruel world
461 print "%d%% swap consumed, rebooting" % used
462 if not debug: bwlimit.run("/bin/sync; /sbin/reboot -f")
463 elif used >= reset_thresh:
465 slicelist = slices.values()
466 # Puts largest on top.
467 slicelist.sort(lambda a, b: b['rss'] - a['rss'])
468 for slice in slicelist:
469 percent = 100. * slice['rss'] / total_mem
470 if slice['rss'] < rss_min: continue
471 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
474 format_bytes(slice['rss'] * 1024, si = False),
476 slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
477 # Make a pretty table.
478 params = formtable(slice, percent)
479 # Match slice name against system slice patterns
480 is_system_slice = filter(None,
481 [re.match(pattern, slice['name']) for pattern in system_slices])
483 # Do not reset system slices, just warn once
485 if slice['name'] not in warned:
486 warned.append(slice['name'])
487 print "Warning slice " + slice['name']
489 print alarm_subject % params
490 print alarm_body % params
492 slicemail(slice['name'], alarm_subject % params,
497 if emailed.get(slice['name'], (time.time() + email_timeout + 1)) > (time.time() + email_timeout):
498 slicemail(slice['name'], kill_subject % params, kill_body % params)
499 emailed[slice['name']] = time.time()
501 print kill_subject % params
502 print kill_body % params
503 print "Killing procs in %s" % slice['name']
504 killsliverprocs(slice['xid'])
506 # wait period before recalculating swap. If in danger, recalc.
507 if timer <= 0 or used >= (last_used + change_thresh):
508 if used >= (last_used + change_thresh):
509 print "%d%% swap consumed, %d%% in last %d seconds" % \
510 (used, used - last_used, period - timer)
512 slices = slicestat(names)
515 # Keep track of large changes in swap utilization
522 if __name__ == '__main__':