3 # Swap monitoring daemon. Every 30 seconds, checks process memory
4 # usage. At 90% utilization, resets the slice that is consuming the
5 # most physical memory. At 95% utilization, reboots the machine to
8 # Mark Huang <mlhuang@cs.princeton.edu>
9 # Andy Bavier <acb@cs.princeton.edu>
10 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
11 # Copyright (C) 2004-2006 The Trustees of Princeton University
25 # util-vserver/python/vserver.py allows us to control slices directly
27 from vserver import VServer
29 # bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
38 datafile = "/var/lib/misc/swapmon.dat"
40 # Seconds between process analysis
43 # Minimum change in swap utilization over 30 seconds that will trigger
44 # early process analysis.
47 # Swap utilization at which the largest consumer of physical memory is reset
50 # Swap utilization at which the machine is rebooted
53 # Time to wait before checking slice again after reset
56 # Number of strikes before killing (strike, strike, kill)
59 # Time to wait before removing slice from kill queue (probation)
62 # Don't email the same message more than once in the same email_timeout interval
65 # Physical size threshold to be considered a consumer. Rationale is if there are no procs
66 # with a size at least as large as this, then there is a slow leaker; better to just reboot.
69 # System slices that should not be reset (regexps)
70 system_slices = ['root', PLC_SLICE_PREFIX + '_']
72 # Message sent after a critical reboot
73 rebooted_subject = "pl_mom rebooted %(hostname)s"
76 Sometime before %(date)s, swap space was
77 nearly exhausted on %(hostname)s, so pl_mom rebooted it.
79 Slices active prior to reboot are listed below. Memory usage
80 statistics are not entirely accurate due to threading.
84 %(date)s %(hostname)s reboot
87 # Message sent after a hog is reset
88 reset_subject = "pl_mom reset slice %(slice)s on %(hostname)s"
91 Sometime before %(date)s, swap space was
92 nearly exhausted on %(hostname)s.
94 Slice %(slice)s was reset since it was the largest consumer of
95 physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable).
97 Please reply to this message explaining the nature of your experiment,
98 and what you are doing to address the problem.
100 http://summer.cs.princeton.edu/status/tabulator.cgi?table=slices/table_%(slice)s
102 %(slice)s processes prior to reset:
106 %(date)s %(hostname)s reset %(slice)s
109 # Message sent to system slices that should not be reset
110 alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
113 Sometime before %(date)s, swap space was
114 nearly exhausted on %(hostname)s.
116 System slice %(slice)s was the largest consumer of physical memory at
117 %(rss)s (%(percent)4.1f%%) (%(sz)s writable). It was not reset,
118 but please verify its behavior.
120 %(slice)s processes prior to alarm:
124 %(date)s %(hostname)s alarm %(slice)s
127 # Message sent after a slice has been killed
128 kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
131 Sometime before %(date)s, swap space was
132 nearly exhausted on %(hostname)s.
134 Slice %(slice)s was killed since it was the largest consumer of
135 physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable)
136 after repeated restarts.
138 Please reply to this message explaining the nature of your experiment,
139 and what you are doing to address the problem.
141 %(slice)s processes prior to reset:
145 %(date)s %(hostname)s reset %(slice)s
152 Keeps track of state information for resets and kills
154 resettimeleft - timeout before checking for next reset
155 resetcount - number of strikes
156 killtimeleft - time out before removing from kill queue
157 {kill,reset}mail - Time of last email
158 kill - State of kill. If slice is already being killed, wait before retry.
161 def __init__(self,name):
163 self.resettimeleft = reset_timeout
166 self.killtimeleft = kill_timeout
173 # Count down for next check of reset slice.
174 if self.resettimeleft > 0:
175 self.resettimeleft -= 1
176 if debug and verbose: print "%s has %s seconds in probation" \
177 %(self.name, self.killtimeleft)
178 if self.killtimeleft > 0:
179 # Count down kill probation timer (killtimeleft)
180 self.killtimeleft -= 1
181 if self.killtimeleft == 1:
182 print "%s is out of probation" % self.name
184 # Once out of probation period (killtimeleft), remove strikes
188 # Check to see if a slice needs to be killed. If it has been killed more
189 # than kill_thresh in the probation period (kill_timeout) send an email, kill the slice.
190 def checkkill(self,params):
191 if self.killtimeleft > 0 and self.resetcount >= kill_thresh:
193 print kill_subject % params
194 print kill_body % params
198 print "Slice %s is being killed." % self.name
199 vserver = VServer(self.name)
204 except Exception, err:
205 print "Warning: Exception received while killing slice %s: %s" \
207 if (time.time() - self.killmail) > email_timeout:
208 slicemail(self.name, kill_subject % params, kill_body % params)
209 print "Sending KILL email for slice %s" % self.name
210 self.killmail = time.time()
214 # Reset slice after checking to see if slice is out of timeout.
215 # Increment resetcount, check to see if larger than kill_thresh.
216 def reset(self, params):
217 # If it's the first reset (came back after kill),
218 # or if it's been reset before
219 # and we are out of the reset timeout.
220 if self.resetcount == 0 or self.resettimeleft == 0:
221 # Do we need to kill this slice? Check history first.
222 if self.checkkill(params): return
225 self.killtimeleft = kill_timeout
226 self.resettimeleft = reset_timeout
227 print "%s has %s seconds to die and has been reset %s times" \
228 %(self.name, self.resettimeleft, self.resetcount)
230 print reset_subject % params
231 print reset_body % params
235 print "Resetting slice " + self.name
236 vserver = VServer(self.name)
238 vserver.start(wait = False)
242 except Exception, err:
243 print "Warning: Exception received while resetting slice %s:" \
245 if (time.time() - self.resetmail) > email_timeout:
246 slicemail(self.name, reset_subject % params, reset_body % params)
247 print "Sending Reset email for slice %s" % self.name
248 self.resetmail = time.time()
252 Usage: %s [OPTIONS]...
255 -d, --debug Enable debugging (default: %s)
256 -v, --verbose Increase verbosity level (default: %d)
257 -f, --file=FILE Data file (default: %s)
258 -s, --slice=SLICE Constrain monitoring to these slices (default: all)
259 -p, --period=SECONDS Seconds between normal process analysis (default: %s)
260 --reset-thresh=PERCENT Swap utilization at which slice reset is attempted
261 --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted
262 --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog
263 --system-slice=SLICE System slice that should not be reset
264 --status Print memory usage statistics and exit
265 -h, --help This message
266 """.lstrip() % (sys.argv[0], debug, verbose, datafile, format_period(period))
268 def slicestat(names = None):
270 Get status of specified slices (if names is None or empty, all
271 slices). vsize, sz, and rss are in KiB. Returns
273 {xid: {'xid': slice_id,
275 'procs': [{'pid': pid, 'xid': slice_id, 'user', username, 'cmd': command,
276 'vsize': virtual_kib, 'sz': potential_kib, 'rss': physical_kib,
277 'pcpu': cpu_percent, 'pmem': mem_percent}]
278 'vsize': total_virtual_kib,
279 'sz': total_potential_kib,
280 'rss': total_physical_kib}}
283 # Mandatory fields. xid is a virtual field inserted by vps. Make
284 # sure cmd is last so that it does not get truncated
286 fields = ['pid', 'xid', 'user', 'vsize', 'sz', 'rss', 'pcpu', 'pmem', 'cmd']
288 # vps inserts xid after pid in the output, but ps doesn't know
289 # what the field means.
290 ps_fields = list(fields)
291 ps_fields.remove('xid')
295 # Eat the header line. vps depends on the header to figure out
296 # which column is the PID column, so we can't just tell ps not to
298 for line in bwlimit.run("/usr/sbin/vps -e -o " + ",".join(ps_fields))[1:]:
302 # Replace "0 MAIN" and "1 ALL_PROC" (the special monikers that
303 # vps uses to denote the root context and the "all contexts"
304 # context) with "0" so that we can just split() on whitespace.
305 line = line.replace("0 MAIN", "0").replace("1 ALL_PROC", "0")
307 # Represent process as a dict of fields
308 values = line.split(None, len(fields) - 1)
309 if len(values) != len(fields):
311 proc = dict(zip(fields, values))
313 # Convert ints and floats
316 proc[field] = int(proc[field])
319 proc[field] = float(proc[field])
323 # vps sometimes prints ERR instead of a context ID if it
324 # cannot identify the context of an orphaned (usually dying)
325 # process. Skip these processes.
326 if type(proc['xid']) != int:
329 # Assign (pl_)sshd processes to slice instead of root
330 m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd'])
332 xid = bwlimit.get_xid(m.group(1))
336 name = bwlimit.get_slice(proc['xid'])
338 # Orphaned (not associated with a slice) class
339 name = "%d?" % proc['xid']
341 # Monitor only the specified slices
342 if names and name not in names:
345 # Additional overhead calculations from slicestat
347 # Include 12 KiB of process overhead =
348 # 4 KiB top-level page table +
349 # 4 KiB kernel structure +
350 # 4 KiB basic page table
353 # Include additional page table overhead
354 if proc['vsize'] > 4096:
355 proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)
357 if slices.has_key(proc['xid']):
358 slice = slices[proc['xid']]
360 slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'sz': 0, 'rss': 0}
362 slice['procs'].append(proc)
363 slice['vsize'] += proc['vsize']
364 slice['sz'] += proc['sz']
365 slice['rss'] += proc['rss']
367 slices[proc['xid']] = slice
373 Returns total physical and swap memory on the system in KiB.
377 meminfo = open("/proc/meminfo", "r")
378 for line in meminfo.readlines():
380 (name, value, kb) = line.split()
383 if name == "MemTotal:":
385 elif name == "SwapTotal:":
392 Returns swap utilization on the system as a whole percentage (0-100).
397 swaps = open("/proc/swaps", "r")
399 lines = swaps.readlines()[1:]
402 # /dev/mapper/planetlab-swap partition 1048568 3740 -1
403 (filename, type, size, used, priority) = line.strip().split()
405 total_swap += int(size)
406 total_used += int(used)
407 except ValueEror, err:
409 except (IOError, KeyError), err: pass
411 swapused = 100 * total_used / total_swap
412 if debug: print "%s percent swap used" % swapused
def summary(slices = None, total_mem = None, total_swap = None):
    """
    Return a summary of memory usage by slice.

    slices     - dict of per-slice records as produced by slicestat(); each
                 record supplies 'name', 'procs', 'rss' and 'sz' (sizes in
                 KiB). When None or empty, slicestat() is called to get the
                 current state.
    total_mem  - total physical memory in KiB
    total_swap - total swap space in KiB. If either total is None, both are
                 taken from memtotal().

    Returns a text table: a header line, a blank line, then one line per
    slice ordered by potential usage ('sz'), largest first.
    """
    if not slices:
        slices = slicestat()
    if total_mem is None or total_swap is None:
        (total_mem, total_swap) = memtotal()

    # Sort with key/reverse rather than a cmp function: same descending
    # order, equal entries keep their relative order, and no reliance on
    # the positional cmp argument of list.sort().
    ranked = sorted(slices.values(), key = lambda entry: entry['sz'], reverse = True)

    # Collect the lines and join once instead of growing a string in a loop.
    lines = ["%-20s%10s%24s%24s\n\n" % ("Slice", "Processes", "Memory Usage", "Potential Usage")]
    for entry in ranked:
        lines.append("%-20s%10d%16s (%4.1f%%)%16s (%4.1f%%)\n" % \
                     (entry['name'], len(entry['procs']),
                      format_bytes(entry['rss'] * 1024, si = False),
                      100. * entry['rss'] / total_mem,
                      format_bytes(entry['sz'] * 1024, si = False),
                      100. * entry['sz'] / (total_mem + total_swap)))
    return "".join(lines)
def formtable(slice, percent):
    """
    Makes pretty message to email with human readable ps values.

    slice   - slice record with 'name', 'rss' and 'sz' (KiB) plus 'procs',
              a list of per-process dicts
    percent - the slice's share of physical memory; passed through
              unchanged for the message templates

    Returns a dict of substitution parameters for the *_subject and *_body
    message templates.
    """
    # NOTE(review): this block was restored from a listing that had dropped
    # several short lines. The pid/pcpu/pmem/cmd row columns and the
    # 'table'/'percent' dict entries are inferred from the column header and
    # from the placeholders used by the message templates -- confirm against
    # the pristine file.

    def human(kib):
        # ps sizes are in KiB; format_bytes() takes bytes
        return format_bytes(kib * 1024, si = False)

    # Collect the lines and join once instead of growing a string in a loop.
    rows = ["%5s %10s %10s %10s %4s %4s %s\n\n" % \
            ("PID", "VIRT", "SZ", "RES", '%CPU', '%MEM', 'COMMAND')]
    for proc in slice['procs']:
        rows.append("%5s %10s %10s %10s %4.1f %4.1f %s\n" % \
                    (proc['pid'],
                     human(proc['vsize']),
                     human(proc['sz']),
                     human(proc['rss']),
                     proc['pcpu'],
                     proc['pmem'],
                     proc['cmd']))

    return {'hostname': socket.gethostname(),
            'date': time.asctime(time.gmtime()) + " GMT",
            'table': "".join(rows),
            'slice': slice['name'],
            'rss': human(slice['rss']),
            'sz': human(slice['sz']),
            'percent': percent}
462 Return dictionary of vps (slicestat) from datfile left behind by OOM
463 before rebooting. If none file, just grab the latest dict (slicestat)
464 and return that. If dat file found, means we rebooted, send an email to
468 f = open(datafile, "r+")
470 print "Loading %s" % datafile
471 (version, slices) = pickle.load(f)
473 # Check version of data file
474 if version != "$Id$":
475 print "Not using old version '%s' data file %s" % (version, datafile)
478 params = {'hostname': socket.gethostname(),
479 'date': time.asctime(time.gmtime()) + " GMT",
480 'table': summary(slices, total_mem, total_swap)}
482 print rebooted_subject % params
483 print rebooted_body % params
485 slicemail(None, rebooted_subject % params, rebooted_body % params)
def writedat(slices):
    """
    Write (slices) to the pickled data file.

    The pickled payload is the tuple (version, slices), written to the
    module-level path `datafile`.

    slices - per-slice state to save
    """
    # NOTE(review): `version` is not a local of this function; confirm it is
    # defined at module level. The loader compares the stored value against
    # "$Id$".
    if verbose:
        print("Saving %s" % datafile)
    f = open(datafile, "w")
    try:
        pickle.dump((version, slices), f)
    finally:
        # Always close (and thereby flush) the file, even if pickling raises,
        # so the handle is never leaked.
        f.close()
508 global debug, verbose, datafile
509 global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
514 longopts = ["debug", "verbose", "file=", "slice=", "status", "help"]
515 longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="]
516 (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts)
517 except getopt.GetoptError, err:
518 print "Error: " + err.msg
522 for (opt, optval) in opts:
523 if opt == "-d" or opt == "--debug":
525 elif opt == "-v" or opt == "--verbose":
527 elif opt == "-f" or opt == "--file":
529 elif opt == "-s" or opt == "--slice":
531 elif opt == "-p" or opt == "--period":
533 elif opt == "--change-thresh":
534 change_thresh = int(optval)
535 elif opt == "--reset-thresh":
536 reset_thresh = int(optval)
537 elif opt == "--reboot-thresh":
538 reboot_thresh = int(optval)
539 elif opt == "--min-thresh":
540 rss_min = int(optval)
541 elif opt == "--system-slice":
542 system_slices.append(optval)
543 elif opt == "--status":
544 print summary(slicestat(names))
550 # Check if we are already running
557 # Redirect stdout and stderr to syslog
558 syslog.openlog("swapmon")
559 sys.stdout = sys.stderr = Logger()
562 (total_mem, total_swap) = memtotal()
565 # Query process table every 30 seconds, or when a large change in
566 # swap utilization is detected.
571 # System slices that we have warned but could not reset
574 # Slices that were reset
579 if last_used is None: last_used = used
581 # If we've reset you recently, update timers.
582 for resetslice in resetlist.keys():
583 resetlist[resetslice].update()
584 # If you've been good, remove you from our list.
585 if resetlist[resetslice].killtimeleft == 0 and \
586 resetlist[resetslice].resettimeleft == 0:
587 del resetlist[resetslice]
589 if verbose: print "%d%% swap consumed" % used
591 if used >= reboot_thresh:
592 # Dump slice state before rebooting
594 # Goodbye, cruel world
595 print "%d%% swap consumed, rebooting" % used
596 if not debug: bwlimit.run("/bin/sync; /sbin/reboot -f")
597 elif used >= reset_thresh:
599 slicelist = slices.values()
600 # Puts largest on top.
601 slicelist.sort(lambda a, b: b['rss'] - a['rss'])
602 for slice in slicelist:
603 percent = 100. * slice['rss'] / total_mem
604 if slice['rss'] < rss_min: continue
605 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
608 format_bytes(slice['rss'] * 1024, si = False),
610 slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
611 # Make a pretty table.
612 params = formtable(slice, percent)
613 # Match slice name against system slice patterns
614 is_system_slice = filter(None,
615 [re.match(pattern, slice['name']) for pattern in system_slices])
617 # Do not reset system slices, just warn once
619 if slice['name'] not in warned:
620 warned.append(slice['name'])
621 print "Warning slice " + slice['name']
623 print alarm_subject % params
624 print alarm_body % params
626 slicemail(slice['name'], alarm_subject % params,
630 if not resetlist.has_key(slice['name']):
631 resetlist[slice['name']] = Reset(slice['name'])
632 resetlist[slice['name']].reset(params)
634 # Wait period before recalculating swap. If in danger, recalc.
635 if timer <= 0 or used >= (last_used + change_thresh):
636 if used >= (last_used + change_thresh):
637 print "%d%% swap consumed, %d%% in last %d seconds" % \
638 (used, used - last_used, period - timer)
640 slices = slicestat(names)
643 # Keep track of large changes in swap utilization
650 if __name__ == '__main__':