3 # Swap monitoring daemon. Every 30 seconds, checks process memory
4 # usage. At 90% utilization, resets the slice that is consuming the
5 # most physical memory. At 95% utilization, reboots the machine to
8 # Mark Huang <mlhuang@cs.princeton.edu>
9 # Andy Bavier <acb@cs.princeton.edu>
10 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
11 # Copyright (C) 2004-2006 The Trustees of Princeton University
25 # util-vserver/python/vserver.py allows us to control slices directly
27 from vserver import VServer
29 # bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
38 DATAFILE = "/var/lib/misc/swapmon.dat"
40 # Seconds between process analysis
43 # Minimum change in swap utilization over 30 seconds that will trigger
44 # early process analysis.
47 # Swap utilization at which the largest consumer of physical memory is reset
50 # Swap utilization at which the machine is rebooted
53 # Time to wait before checking slice again after reset
56 # Number of strikes before killing (strike, strike, kill)
59 # Time to wait before removing slice from kill queue (probation)
62 # Don't email the same message more than once in the same emailtimeout interval
65 # Physical size threshold to be considered a consumer. Rationale is if there are no procs
66 # with a size at least as large as this, then there is a slow leaker; better to just reboot.
69 # System slices that should not be reset (regexps)
70 system_slices = ['root', PLC_SLICE_PREFIX + '_']
72 # Message sent after a critical reboot
73 rebooted_subject = "pl_mom rebooted %(hostname)s"
76 Sometime before %(date)s, swap space was
77 nearly exhausted on %(hostname)s, so pl_mom rebooted it.
79 Slices active prior to reboot are listed below. Memory usage
80 statistics are not entirely accurate due to threading.
84 %(date)s %(hostname)s reboot
87 # Message sent after a hog is reset
88 reset_subject = "pl_mom reset slice %(slice)s on %(hostname)s"
91 Sometime before %(date)s, swap space was
92 nearly exhausted on %(hostname)s.
94 Slice %(slice)s was reset since it was the largest consumer of
95 physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable).
97 Please reply to this message explaining the nature of your experiment,
98 and what you are doing to address the problem.
100 http://summer.cs.princeton.edu/status/tabulator.cgi?table=slices/table_%(slice)s
102 %(slice)s processes prior to reset:
106 %(date)s %(hostname)s reset %(slice)s
109 # Message sent to system slices that should not be reset
110 alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
113 Sometime before %(date)s, swap space was
114 nearly exhausted on %(hostname)s.
116 System slice %(slice)s was the largest consumer of physical memory at
117 %(rss)s (%(percent)4.1f%%) (%(sz)s writable). It was not reset,
118 but please verify its behavior.
120 %(slice)s processes prior to alarm:
124 %(date)s %(hostname)s alarm %(slice)s
127 # Message sent after a slice has been killed
128 kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
131 Sometime before %(date)s, swap space was
132 nearly exhausted on %(hostname)s.
134 Slice %(slice)s was killed since it was the largest consumer of
135 physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable)
136 after repeated restarts.
138 Please reply to this message explaining the nature of your experiment,
139 and what you are doing to address the problem.
141 %(slice)s processes prior to reset:
145 %(date)s %(hostname)s reset %(slice)s
152 Keeps track of state information for resets and kills
154 resettimeleft - timeout before checking for next reset
155 resetcount - number of strikes
156 killtimeleft - time out before removing from kill queue
157 {kill,reset}mail - Time of last email
158 kill - State of kill. If slice is already being killed, wait before retry.
161 def __init__(self,name):
163 self.resettimeleft = reset_timeout
166 self.killtimeleft = kill_timeout
173 # Count down for next check of reset slice.
174 if self.resettimeleft > 0:
175 self.resettimeleft -= 1
176 if debug and verbose: print "%s has %s seconds in probation" \
177 %(self.name, self.killtimeleft)
178 if self.killtimeleft > 0:
179 # Count down kill probation timer (killtimeleft)
180 self.killtimeleft -= 1
181 if self.killtimeleft == 1:
182 print "%s is out of probation" % self.name
184 # Once out of probation period (killtimeleft), remove strikes
188 # Check to see if a slice needs to be killed. If it has been reset at least
189 # kill_thresh times within the probation period (kill_timeout), send an email and kill the slice.
190 def checkkill(self,params):
191 if self.killtimeleft > 0 and self.resetcount >= kill_thresh:
193 print kill_subject % params
194 print kill_body % params
198 print "Slice %s is being killed." % self.name
199 vserver = VServer(self.name)
201 # ignore initscripts. Don't run anything at start.
202 vserver.INITSCRIPTS = []
207 except Exception, err:
208 print "Warning: Exception received while killing slice %s: %s" \
210 if (time.time() - self.killmail) > email_timeout:
211 slicemail(self.name, kill_subject % params, kill_body % params)
212 print "Sending KILL email for slice %s" % self.name
213 self.killmail = time.time()
217 # Reset slice after checking to see if slice is out of timeout.
218 # Increment resetcount, check to see if larger than kill_thresh.
219 def reset(self, params):
220 # If it's the first reset (came back after kill),
221 # or if it's been reset before
222 # and we are out of the reset timeout.
223 if self.resetcount == 0 or self.resettimeleft == 0:
224 # Do we need to kill this slice? Check history first.
225 if self.checkkill(params): return
228 self.killtimeleft = kill_timeout
229 self.resettimeleft = reset_timeout
230 print "%s has %s seconds to die and has been reset %s times" \
231 %(self.name, self.resettimeleft, self.resetcount)
233 print reset_subject % params
234 print reset_body % params
238 print "Resetting slice " + self.name
239 vserver = VServer(self.name)
245 except Exception, err:
246 print "Warning: Exception received while resetting slice %s:" \
248 if (time.time() - self.resetmail) > email_timeout:
249 slicemail(self.name, reset_subject % params, reset_body % params)
250 print "Sending Reset email for slice %s" % self.name
251 self.resetmail = time.time()
255 Usage: %s [OPTIONS]...
258 -d, --debug Enable debugging (default: %s)
259 -v, --verbose Increase verbosity level (default: %d)
260 -f, --file=FILE Data file (default: %s)
261 -s, --slice=SLICE Constrain monitoring to these slices (default: all)
262 -p, --period=SECONDS Seconds between normal process analysis (default: %s)
263 --reset-thresh=PERCENT Swap utilization at which slice reset is attempted
264 --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted
265 --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog
266 --system-slice=SLICE System slice that should not be reset
267 --status Print memory usage statistics and exit
268 -h, --help This message
269 """.lstrip() % (sys.argv[0], debug, verbose, DATAFILE, format_period(period))
271 def slicestat(names = None):
273 Get status of specified slices (if names is None or empty, all
274 slices). vsize, sz, and rss are in KiB. Returns
275 PID CONTEXT VSZ SZ RSS %MEM CMD
276 {xid: {'xid': slice_id,
278 'procs': [{'pid': pid, 'xid': slice_id, 'cmd': command,
279 'vsize': virtual_kib, 'sz': potential_kib, 'rss': physical_kib,
280 'pmem': mem_percent}]
281 'vsize': total_virtual_kib,
282 'sz': total_potential_kib,
283 'rss': total_physical_kib}}
286 # Mandatory fields. xid is a virtual field inserted by vps. Make
287 # sure cmd is last so that it does not get truncated
289 fields = ['pid', 'xid', 'vsize', 'sz', 'rss', 'pmem', 'cmd']
291 # vps inserts xid after pid in the output, but ps doesn't know
292 # what the field means.
293 ps_fields = list(fields)
294 ps_fields.remove('xid')
298 # Eat the header line. vps depends on the header to figure out
299 # which column is the PID column, so we can't just tell ps not to
301 for line in bwlimit.run("/usr/sbin/vps -e -o " + ":16,".join(ps_fields))[1:]:
305 # Replace "0 MAIN" and "1 ALL_PROC" (the special monikers that
306 # vps uses to denote the root context and the "all contexts"
307 # context) with "0" so that we can just split() on whitespace.
308 line = line.replace("0 MAIN", "0").replace("1 ALL_PROC", "0")
310 # Represent process as a dict of fields
311 values = line.split(None, len(fields) - 1)
312 if len(values) != len(fields):
314 proc = dict(zip(fields, values))
316 # Convert ints and floats
319 proc[field] = int(proc[field])
322 proc[field] = float(proc[field])
326 # vps sometimes prints ERR or the name of the slice
327 # instead of a context ID if it
328 # cannot identify the context of an orphaned (usually dying)
329 # process. Skip these processes.
330 if (type(proc['xid']) != int) or (type(proc['vsize']) !=int):
333 # Assign (pl_)sshd processes to slice instead of root
334 m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd'])
337 xid = bwlimit.get_xid(m.group(1))
341 name = bwlimit.get_slice(proc['xid'])
343 # Orphaned (not associated with a slice) class
344 name = "%d?" % proc['xid']
346 # Monitor only the specified slices
347 if names and name not in names:
350 # Additional overhead calculations from slicestat
352 # Include 12 KiB of process overhead =
353 # 4 KiB top-level page table +
354 # 4 KiB kernel structure +
355 # 4 KiB basic page table
358 # Include additional page table overhead
360 if proc['vsize'] > 4096:
361 proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)
364 if slices.has_key(proc['xid']):
365 slice = slices[proc['xid']]
367 slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'sz': 0, 'rss': 0}
369 slice['procs'].append(proc)
370 slice['vsize'] += proc['vsize']
371 slice['sz'] += proc['sz']
372 slice['rss'] += proc['rss']
374 slices[proc['xid']] = slice
380 Returns total physical and swap memory on the system in KiB.
384 meminfo = open("/proc/meminfo", "r")
385 for line in meminfo.readlines():
387 (name, value, kb) = line.split()
390 if name == "MemTotal:":
392 elif name == "SwapTotal:":
399 Returns swap utilization on the system as a whole percentage (0-100).
404 swaps = open("/proc/swaps", "r")
406 lines = swaps.readlines()[1:]
409 # /dev/mapper/planetlab-swap partition 1048568 3740 -1
410 (filename, type, size, used, priority) = line.strip().split()
412 total_swap += int(size)
413 total_used += int(used)
414 except ValueEror, err:
416 except (IOError, KeyError), err: pass
418 swapused = 100 * total_used / total_swap
419 if debug: print "%s percent swap used" % swapused
422 def summary(slices = None, total_mem = None, total_swap = None):
424 Return a summary of memory usage by slice.
426 if not slices: slices = slicestat()
427 slicelist = slices.values()
428 slicelist.sort(lambda a, b: b['sz'] - a['sz'])
429 if total_mem is None or total_swap is None:
430 (total_mem, total_swap) = memtotal()
432 table = "%-20s%10s%24s%24s\n\n" % ("Slice", "Processes", "Memory Usage", "Potential Usage")
433 for slice in slicelist:
434 table += "%-20s%10d%16s (%4.1f%%)%16s (%4.1f%%)\n" % \
435 (slice['name'], len(slice['procs']),
436 format_bytes(slice['rss'] * 1024, si = False),
437 100. * slice['rss'] / total_mem,
438 format_bytes(slice['sz'] * 1024, si = False),
439 100. * slice['sz'] / (total_mem + total_swap))
442 def formtable(slice, percent):
444 Makes pretty message to email with human readable ps values.
446 table = "%5s %10s %10s %10s %4s %4s %s\n\n" % \
447 ("PID", "VIRT", "SZ", "RES", '%CPU', '%MEM', 'COMMAND')
448 for proc in slice['procs']:
449 table += "%5s %10s %10s %10s %4.1f %s\n" % \
451 format_bytes(proc['vsize'] * 1024, si = False),
452 format_bytes(proc['sz'] * 1024, si = False),
453 format_bytes(proc['rss'] * 1024, si = False),
457 prettytable = {'hostname': socket.gethostname(),
458 'date': time.asctime(time.gmtime()) + " GMT",
460 'slice': slice['name'],
461 'rss': format_bytes(slice['rss'] * 1024, si = False),
462 'sz': format_bytes(slice['sz'] * 1024, si = False),
468 Return the dictionary of vps output (slicestat) from the data file left behind
469 before an out-of-memory reboot. If there is no such file, just grab the latest dict (slicestat)
470 and return that. If the data file is found, it means we rebooted, so send an email to
474 f = open(DATAFILE, "r+")
476 print "Loading %s" % DATAFILE
477 (v, slices) = pickle.load(f)
479 # Check version of data file
481 print "Not using old version '%s' data file %s" % (v, DATAFILE)
484 params = {'hostname': socket.gethostname(),
485 'date': time.asctime(time.gmtime()) + " GMT",
486 'table': summary(slices, total_mem, total_swap)}
488 print rebooted_subject % params
489 print rebooted_body % params
491 slicemail(None, rebooted_subject % params, rebooted_body % params)
501 def writedat(slices):
503 Write (slices) to pickled datfile.
505 if verbose: print "Saving %s" % DATAFILE
506 f = open(DATAFILE, "w")
507 pickle.dump((VERSION, slices), f)
513 global debug, verbose, DATAFILE, VERSION
514 global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
519 longopts = ["debug", "verbose", "file=", "slice=", "status", "help"]
520 longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="]
521 (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts)
522 except getopt.GetoptError, err:
523 print "Error: " + err.msg
527 for (opt, optval) in opts:
528 if opt == "-d" or opt == "--debug":
530 elif opt == "-v" or opt == "--verbose":
532 elif opt == "-f" or opt == "--file":
534 elif opt == "-s" or opt == "--slice":
536 elif opt == "-p" or opt == "--period":
538 elif opt == "--change-thresh":
539 change_thresh = int(optval)
540 elif opt == "--reset-thresh":
541 reset_thresh = int(optval)
542 elif opt == "--reboot-thresh":
543 reboot_thresh = int(optval)
544 elif opt == "--min-thresh":
545 rss_min = int(optval)
546 elif opt == "--system-slice":
547 system_slices.append(optval)
548 elif opt == "--status":
549 print summary(slicestat(names))
555 # Check if we are already running
562 # Redirect stdout and stderr to syslog
563 syslog.openlog("swapmon")
564 sys.stdout = sys.stderr = Logger()
567 (total_mem, total_swap) = memtotal()
570 # Query process table every 30 seconds, or when a large change in
571 # swap utilization is detected.
576 # System slices that we have warned but could not reset
579 # Slices that were reset
584 if last_used is None: last_used = used
586 # If we've reset you recently, update timers.
587 for resetslice in resetlist.keys():
588 resetlist[resetslice].update()
589 # If you've been good, remove you from our list.
590 if resetlist[resetslice].killtimeleft == 0 and \
591 resetlist[resetslice].resettimeleft == 0:
592 del resetlist[resetslice]
594 if used >= reboot_thresh:
595 # Dump slice state before rebooting
597 # Goodbye, cruel world
598 print "%d%% swap consumed, rebooting" % used
599 if not debug: bwlimit.run("/bin/sync; /sbin/reboot -f")
600 elif used >= reset_thresh:
602 slicelist = slices.values()
603 # Puts largest on top.
604 slicelist.sort(lambda a, b: b['rss'] - a['rss'])
605 for slice in slicelist:
606 percent = 100. * slice['rss'] / total_mem
607 if slice['rss'] < rss_min: continue
608 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
611 format_bytes(slice['rss'] * 1024, si = False),
613 slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
614 # Make a pretty table.
615 params = formtable(slice, percent)
616 # Match slice name against system slice patterns
617 is_system_slice = filter(None,
618 [re.match(pattern, slice['name']) for pattern in system_slices])
620 # Do not reset system slices, just warn once
622 if slice['name'] not in warned:
623 warned.append(slice['name'])
624 print "Warning slice " + slice['name']
626 print alarm_subject % params
627 print alarm_body % params
629 slicemail(slice['name'], alarm_subject % params,
633 if not resetlist.has_key(slice['name']):
634 resetlist[slice['name']] = Reset(slice['name'])
635 resetlist[slice['name']].reset(params)
637 # Wait period before recalculating swap. If in danger, recalc.
638 if timer <= 0 or used >= (last_used + change_thresh):
639 if used >= (last_used + change_thresh):
640 print "%d%% swap consumed, %d%% in last %d seconds" % \
641 (used, used - last_used, period - timer)
643 slices = slicestat(names)
646 # Keep track of large changes in swap utilization
653 if __name__ == '__main__':