3 # Swap monitoring daemon. Every 30 seconds, checks process memory
4 # usage. At 90% utilization, resets the slice that is consuming the
5 # most physical memory. At 95% utilization, reboots the machine to
8 # Mark Huang <mlhuang@cs.princeton.edu>
9 # Andy Bavier <acb@cs.princeton.edu>
10 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
11 # Copyright (C) 2004-2006 The Trustees of Princeton University
25 # util-vserver/python/vserver.py allows us to control slices directly
27 from vserver import VServer
29 # bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
# Pickled (VERSION, slices) snapshot: written by writedat() and read back
# with pickle.load().
38 DATAFILE = "/var/lib/misc/swapmon.dat"
40 # Seconds between process analysis
43 # Minimum change in swap utilization over 30 seconds that will trigger
44 # early process analysis.
47 # Swap utilization at which the largest consumer of physical memory is reset
50 # Swap utilization at which the machine is rebooted
53 # Time to wait before checking slice again after reset
56 # Number of strikes before killing (strike, strike, kill)
59 # Time to wait before removing slice from kill queue (probation)
62 # Don't email the same message more than once in the same email_timeout interval
65 # Physical size threshold to be considered a consumer. Rationale is if there are no procs
66 # with a size at least as large as this, then there is a slow leaker; better to just reboot.
69 # System slices that should not be reset (regexps)
# Each entry is applied with re.match() to the slice name in the monitoring
# loop, so it is anchored at the start only: 'root' also matches any slice
# whose name merely begins with "root".
70 system_slices = ['root', PLC_SLICE_PREFIX + '_']
72 # Message sent after a critical reboot
# Subject and body are filled with the % operator from a dict; the visible
# placeholders here are hostname and date.
73 rebooted_subject = "pl_mom rebooted %(hostname)s"
76 Sometime before %(date)s, swap space was
77 nearly exhausted on %(hostname)s, so pl_mom rebooted it.
79 Slices active prior to reboot are listed below. Memory usage
80 statistics are not entirely accurate due to threading.
84 %(date)s %(hostname)s reboot
87 # Message sent after a hog is reset
# Filled from the dict built by formtable(): hostname, date, slice, rss,
# percent (formatted as a float) and sz.
88 reset_subject = "pl_mom reset slice %(slice)s on %(hostname)s"
91 Sometime before %(date)s, swap space was
92 nearly exhausted on %(hostname)s.
94 Slice %(slice)s was reset since it was the largest consumer of
95 physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable).
97 Please reply to this message explaining the nature of your experiment,
98 and what you are doing to address the problem.
100 http://summer.cs.princeton.edu/status/tabulator.cgi?table=slices/table_%(slice)s
102 %(slice)s processes prior to reset:
106 %(date)s %(hostname)s reset %(slice)s
109 # Message sent to system slices that should not be reset
# Same placeholder set as the reset message.
110 alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
113 Sometime before %(date)s, swap space was
114 nearly exhausted on %(hostname)s.
116 System slice %(slice)s was the largest consumer of physical memory at
117 %(rss)s (%(percent)4.1f%%) (%(sz)s writable). It was not reset,
118 but please verify its behavior.
120 %(slice)s processes prior to alarm:
124 %(date)s %(hostname)s alarm %(slice)s
127 # Message sent after a slice has been killed
# Same placeholder set as the reset message.
# NOTE(review): the kill body still reads "processes prior to reset" and its
# last line is tagged "reset" rather than "kill" -- confirm whether that is
# intentional before changing the text.
128 kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
131 Sometime before %(date)s, swap space was
132 nearly exhausted on %(hostname)s.
134 Slice %(slice)s was killed since it was the largest consumer of
135 physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable)
136 after repeated restarts.
138 Please reply to this message explaining the nature of your experiment,
139 and what you are doing to address the problem.
141 %(slice)s processes prior to reset:
145 %(date)s %(hostname)s reset %(slice)s
152 Keeps track of state information for resets and kills
154 resettimeleft - timeout before checking for next reset
155 resetcount - number of strikes
156 killtimeleft - time out before removing from kill queue
157 {kill,reset}mail - Time of last email
158 kill - State of kill. If slice is already being killed, wait before retry.
161 def __init__(self,name):
# Both countdowns start at their full module-level values (reset_timeout and
# kill_timeout); update() decrements them by one per call.
163 self.resettimeleft = reset_timeout
166 self.killtimeleft = kill_timeout
# Called once per monitoring-loop pass for every tracked slice; each call
# takes one tick off whichever timers are still running.
173 # Count down for next check of reset slice.
174 if self.resettimeleft > 0:
175 self.resettimeleft -= 1
# NOTE(review): this message is printed while counting down resettimeleft
# but reports killtimeleft -- confirm which timer was meant.
176 if debug and verbose: print "%s has %s seconds in probation" \
177 %(self.name, self.killtimeleft)
178 if self.killtimeleft > 0:
179 # Count down kill probation timer (killtimeleft)
180 self.killtimeleft -= 1
# NOTE(review): the message below fires when one tick is still left
# (== 1), not when the timer reaches 0 -- confirm this is intended.
181 if self.killtimeleft == 1:
182 print "%s is out of probation" % self.name
184 # Once out of probation period (killtimeleft), remove strikes
188 # Check to see if a slice needs to be killed. If it has been reset at least
189 # kill_thresh times while still on probation (killtimeleft > 0), kill the slice.
# params is the dict that kill_subject and kill_body are %-formatted with.
# The KILL email is sent at most once per email_timeout seconds; the time of
# the last one is kept in self.killmail.
# reset() stops processing the slice when this returns a true value.
190 def checkkill(self,params):
191 if self.killtimeleft > 0 and self.resetcount >= kill_thresh:
193 print kill_subject % params
194 print kill_body % params
198 print "Slice %s is being killed." % self.name
199 vserver = VServer(self.name)
204 except Exception, err:
205 print "Warning: Exception received while killing slice %s: %s" \
207 if (time.time() - self.killmail) > email_timeout:
208 slicemail(self.name, kill_subject % params, kill_body % params)
209 print "Sending KILL email for slice %s" % self.name
210 self.killmail = time.time()
214 # Reset slice after checking to see if slice is out of timeout.
215 # Increment resetcount, check to see if larger than kill_thresh.
# params is the dict that reset_subject and reset_body are %-formatted with.
# The Reset email is sent at most once per email_timeout seconds; the time
# of the last one is kept in self.resetmail.
216 def reset(self, params):
217 # If it's the first reset (came back after kill)
218 # or if it's been reset before
219 # and we are out of the reset timeout.
220 if self.resetcount == 0 or self.resettimeleft == 0:
221 # Do we need to kill this slice? Check history first.
222 if self.checkkill(params): return
# Restart both timers: a fresh probation window and a fresh back-off
# before this slice may be reset again.
225 self.killtimeleft = kill_timeout
226 self.resettimeleft = reset_timeout
227 print "%s has %s seconds to die and has been reset %s times" \
228 %(self.name, self.resettimeleft, self.resetcount)
230 print reset_subject % params
231 print reset_body % params
235 print "Resetting slice " + self.name
236 vserver = VServer(self.name)
238 vserver.start(wait = False)
242 except Exception, err:
243 print "Warning: Exception received while resetting slice %s:" \
245 if (time.time() - self.resetmail) > email_timeout:
246 slicemail(self.name, reset_subject % params, reset_body % params)
247 print "Sending Reset email for slice %s" % self.name
248 self.resetmail = time.time()
252 Usage: %s [OPTIONS]...
255 -d, --debug Enable debugging (default: %s)
256 -v, --verbose Increase verbosity level (default: %d)
257 -f, --file=FILE Data file (default: %s)
258 -s, --slice=SLICE Constrain monitoring to these slices (default: all)
259 -p, --period=SECONDS Seconds between normal process analysis (default: %s)
260 --reset-thresh=PERCENT Swap utilization at which slice reset is attempted
261 --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted
262 --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog
263 --system-slice=SLICE System slice that should not be reset
264 --status Print memory usage statistics and exit
265 -h, --help This message
266 """.lstrip() % (sys.argv[0], debug, verbose, DATAFILE, format_period(period))
# The five %-arguments fill, in order: program name, debug default, verbose
# default, data file default and the formatted period default.
# NOTE(review): the option parser also has a branch for --change-thresh,
# which is not listed in the text above.
# Parse the output of /usr/sbin/vps into per-slice records keyed by context
# id (xid), summing vsize, sz and rss over each slice's processes.
268 def slicestat(names = None):
270 Get status of specified slices (if names is None or empty, all
271 slices). vsize, sz, and rss are in KiB. Returns
273 {xid: {'xid': slice_id,
275 'procs': [{'pid': pid, 'xid': slice_id, 'user', username, 'cmd': command,
276 'vsize': virtual_kib, 'sz': potential_kib, 'rss': physical_kib,
277 'pcpu': cpu_percent, 'pmem': mem_percent}]
278 'vsize': total_virtual_kib,
279 'sz': total_potential_kib,
280 'rss': total_physical_kib}}
283 # Mandatory fields. xid is a virtual field inserted by vps. Make
284 # sure cmd is last so that it does not get truncated
286 fields = ['pid', 'xid', 'user', 'vsize', 'sz', 'rss', 'pcpu', 'pmem', 'cmd']
288 # vps inserts xid after pid in the output, but ps doesn't know
289 # what the field means.
290 ps_fields = list(fields)
291 ps_fields.remove('xid')
295 # Eat the header line. vps depends on the header to figure out
296 # which column is the PID column, so we can't just tell ps not to
298 for line in bwlimit.run("/usr/sbin/vps -e -o " + ",".join(ps_fields))[1:]:
302 # Replace "0 MAIN" and "1 ALL_PROC" (the special monikers that
303 # vps uses to denote the root context and the "all contexts"
304 # context) with "0" so that we can just split() on whitespace.
305 line = line.replace("0 MAIN", "0").replace("1 ALL_PROC", "0")
307 # Represent process as a dict of fields
# Limiting the split to len(fields) - 1 cuts keeps the whole command
# line, embedded spaces included, in the final 'cmd' field.
308 values = line.split(None, len(fields) - 1)
309 if len(values) != len(fields):
311 proc = dict(zip(fields, values))
313 # Convert ints and floats
316 proc[field] = int(proc[field])
319 proc[field] = float(proc[field])
323 # vps sometimes prints ERR instead of a context ID if it
324 # cannot identify the context of an orphaned (usually dying)
325 # process. Skip these processes.
326 if type(proc['xid']) != int:
329 # Assign (pl_)sshd processes to slice instead of root
# NOTE(review): the character class has no digits, so a slice name that
# contains a digit is captured only up to that digit -- confirm that
# slice names never contain digits, or widen the pattern.
330 m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd'])
332 xid = bwlimit.get_xid(m.group(1))
336 name = bwlimit.get_slice(proc['xid'])
338 # Orphaned (not associated with a slice) class
339 name = "%d?" % proc['xid']
341 # Monitor only the specified slices
342 if names and name not in names:
345 # Additional overhead calculations from slicestat
347 # Include 12 KiB of process overhead =
348 # 4 KiB top-level page table +
349 # 4 KiB kernel structure +
350 # 4 KiB basic page table
353 # Include additional page table overhead
# i.e. 4 KiB per full 4096 KiB of virtual size; both operands are ints,
# so this relies on Python 2 integer division.
354 if proc['vsize'] > 4096:
355 proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)
# Fetch the record for this xid, or start a new one with zeroed totals,
# then add this process to it.
357 if slices.has_key(proc['xid']):
358 slice = slices[proc['xid']]
360 slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'sz': 0, 'rss': 0}
362 slice['procs'].append(proc)
363 slice['vsize'] += proc['vsize']
364 slice['sz'] += proc['sz']
365 slice['rss'] += proc['rss']
367 slices[proc['xid']] = slice
373 Returns total physical and swap memory on the system in KiB.
377 meminfo = open("/proc/meminfo", "r")
378 for line in meminfo.readlines():
# Only lines that split into exactly three whitespace-separated fields
# (label, number, unit) unpack cleanly; the MemTotal: and SwapTotal:
# labels are the two that are picked out below.
380 (name, value, kb) = line.split()
383 if name == "MemTotal:":
385 elif name == "SwapTotal:":
392 Returns swap utilization on the system as a whole percentage (0-100).
397 swaps = open("/proc/swaps", "r")
399 lines = swaps.readlines()[1:]
402 # /dev/mapper/planetlab-swap partition 1048568 3740 -1
403 (filename, type, size, used, priority) = line.strip().split()
405 total_swap += int(size)
406 total_used += int(used)
407 except ValueEror, err:
409 except (IOError, KeyError), err: pass
411 swapused = 100 * total_used / total_swap
412 if debug: print "%s percent swap used" % swapused
# Build a plain-text table with one row per slice: name, process count, rss
# with its share of total_mem, and sz with its share of total_mem + total_swap.
# Arguments that are not supplied are obtained from slicestat() and memtotal().
415 def summary(slices = None, total_mem = None, total_swap = None):
417 Return a summary of memory usage by slice.
# An empty dict is treated like None here and triggers a fresh slicestat().
419 if not slices: slices = slicestat()
420 slicelist = slices.values()
# Largest potential usage (sz) first.
421 slicelist.sort(lambda a, b: b['sz'] - a['sz'])
422 if total_mem is None or total_swap is None:
423 (total_mem, total_swap) = memtotal()
425 table = "%-20s%10s%24s%24s\n\n" % ("Slice", "Processes", "Memory Usage", "Potential Usage")
426 for slice in slicelist:
427 table += "%-20s%10d%16s (%4.1f%%)%16s (%4.1f%%)\n" % \
428 (slice['name'], len(slice['procs']),
429 format_bytes(slice['rss'] * 1024, si = False),
430 100. * slice['rss'] / total_mem,
431 format_bytes(slice['sz'] * 1024, si = False),
432 100. * slice['sz'] / (total_mem + total_swap))
# Build the values used to fill the reset/alarm/kill message templates for
# one slice: a per-process table plus hostname, date, slice name and the
# slice's formatted rss and sz totals.
# slice is one record as produced by slicestat().
435 def formtable(slice, percent):
437 Makes pretty message to email with human readable ps values.
439 table = "%5s %10s %10s %10s %4s %4s %s\n\n" % \
440 ("PID", "VIRT", "SZ", "RES", '%CPU', '%MEM', 'COMMAND')
441 for proc in slice['procs']:
442 table += "%5s %10s %10s %10s %4.1f %4.1f %s\n" % \
444 format_bytes(proc['vsize'] * 1024, si = False),
445 format_bytes(proc['sz'] * 1024, si = False),
446 format_bytes(proc['rss'] * 1024, si = False),
# Sizes are multiplied by 1024 before being handed to format_bytes();
# slicestat() documents vsize, sz and rss as KiB.
451 prettytable = {'hostname': socket.gethostname(),
452 'date': time.asctime(time.gmtime()) + " GMT",
454 'slice': slice['name'],
455 'rss': format_bytes(slice['rss'] * 1024, si = False),
456 'sz': format_bytes(slice['sz'] * 1024, si = False),
462 Return dictionary of vps (slicestat) from datfile left behind by OOM
463 before rebooting. If none file, just grab the latest dict (slicestat)
464 and return that. If dat file found, means we rebooted, send an email to
468 f = open(DATAFILE, "r+")
470 print "Loading %s" % DATAFILE
# NOTE(review): pickle.load() executes whatever the file encodes; this is
# only safe if DATAFILE can be written by nobody but this daemon -- confirm
# the file's ownership and permissions.
# The file holds the (VERSION, slices) tuple stored by writedat().
471 (v, slices) = pickle.load(f)
473 # Check version of data file
475 print "Not using old version '%s' data file %s" % (v, DATAFILE)
# Values for the rebooted_subject / rebooted_body templates.
478 params = {'hostname': socket.gethostname(),
479 'date': time.asctime(time.gmtime()) + " GMT",
480 'table': summary(slices, total_mem, total_swap)}
482 print rebooted_subject % params
483 print rebooted_body % params
# A slice name of None is passed for the reboot notice.
485 slicemail(None, rebooted_subject % params, rebooted_body % params)
# Save the slice table to DATAFILE as a pickled (VERSION, slices) tuple; the
# version tag lets the loader refuse files written by another version.
# The file is opened with mode "w", so any previous contents are replaced.
495 def writedat(slices):
497 Write (slices) to pickled datfile.
499 if verbose: print "Saving %s" % DATAFILE
500 f = open(DATAFILE, "w")
501 pickle.dump((VERSION, slices), f)
507 global debug, verbose, DATAFILE, VERSION
508 global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
513 longopts = ["debug", "verbose", "file=", "slice=", "status", "help"]
514 longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="]
515 (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts)
516 except getopt.GetoptError, err:
517 print "Error: " + err.msg
521 for (opt, optval) in opts:
522 if opt == "-d" or opt == "--debug":
524 elif opt == "-v" or opt == "--verbose":
526 elif opt == "-f" or opt == "--file":
528 elif opt == "-s" or opt == "--slice":
530 elif opt == "-p" or opt == "--period":
532 elif opt == "--change-thresh":
533 change_thresh = int(optval)
534 elif opt == "--reset-thresh":
535 reset_thresh = int(optval)
536 elif opt == "--reboot-thresh":
537 reboot_thresh = int(optval)
538 elif opt == "--min-thresh":
539 rss_min = int(optval)
540 elif opt == "--system-slice":
541 system_slices.append(optval)
542 elif opt == "--status":
543 print summary(slicestat(names))
549 # Check if we are already running
556 # Redirect stdout and stderr to syslog
557 syslog.openlog("swapmon")
558 sys.stdout = sys.stderr = Logger()
561 (total_mem, total_swap) = memtotal()
564 # Query process table every 30 seconds, or when a large change in
565 # swap utilization is detected.
570 # System slices that we have warned but could not reset
573 # Slices that were reset
578 if last_used is None: last_used = used
580 # If we've reset you recently, update timers.
581 for resetslice in resetlist.keys():
582 resetlist[resetslice].update()
583 # If you've been good, remove you from our list.
584 if resetlist[resetslice].killtimeleft == 0 and \
585 resetlist[resetslice].resettimeleft == 0:
586 del resetlist[resetslice]
588 if verbose: print "%d%% swap consumed" % used
590 if used >= reboot_thresh:
591 # Dump slice state before rebooting
593 # Goodbye, cruel world
594 print "%d%% swap consumed, rebooting" % used
595 if not debug: bwlimit.run("/bin/sync; /sbin/reboot -f")
596 elif used >= reset_thresh:
598 slicelist = slices.values()
599 # Puts largest on top.
600 slicelist.sort(lambda a, b: b['rss'] - a['rss'])
601 for slice in slicelist:
602 percent = 100. * slice['rss'] / total_mem
603 if slice['rss'] < rss_min: continue
604 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
607 format_bytes(slice['rss'] * 1024, si = False),
609 slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
610 # Make a pretty table.
611 params = formtable(slice, percent)
612 # Match slice name against system slice patterns
613 is_system_slice = filter(None,
614 [re.match(pattern, slice['name']) for pattern in system_slices])
616 # Do not reset system slices, just warn once
618 if slice['name'] not in warned:
619 warned.append(slice['name'])
620 print "Warning slice " + slice['name']
622 print alarm_subject % params
623 print alarm_body % params
625 slicemail(slice['name'], alarm_subject % params,
629 if not resetlist.has_key(slice['name']):
630 resetlist[slice['name']] = Reset(slice['name'])
631 resetlist[slice['name']].reset(params)
633 # wait period vefore recalculating swap. If in danger, recalc.
634 if timer <= 0 or used >= (last_used + change_thresh):
635 if used >= (last_used + change_thresh):
636 print "%d%% swap consumed, %d%% in last %d seconds" % \
637 (used, used - last_used, period - timer)
639 slices = slicestat(names)
642 # Keep track of large changes in swap utilization
649 if __name__ == '__main__':