3 # Swap monitoring daemon. Every 30 seconds, checks process memory
4 # usage. At 90% utilization, resets the slice that is consuming the
5 # most physical memory. At 95% utilization, reboots the machine.
8 # Mark Huang <mlhuang@cs.princeton.edu>
9 # Andy Bavier <acb@cs.princeton.edu>
10 # Faiyaz Ahmed <faiyaza@cs.princeton.edu>
11 # Copyright (C) 2004-2006 The Trustees of Princeton University
13 # $Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $
25 # util-vserver/python/vserver.py allows us to control slices directly
27 from vserver import VServer
29 # bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
38 datafile = "/var/lib/misc/swapmon.dat"
40 # Seconds between process analysis
43 # Minimum change in swap utilization over 30 seconds that will trigger
44 # early process analysis.
47 # Swap utilization at which the largest consumer of physical memory is reset
50 # Swap utilization at which the machine is rebooted
53 # Time to wait before checking slice again after reset
56 # Number of strikes before killing (strike, strike, kill)
59 # Time to wait before removing slice from kill queue (probation)
62 # Don't email the same message more than once in the same emailtimeout interval
65 # Physical size threshold to be considered a consumer. Rationale is if there are no procs
66 # with a size at least as large as this, then there is a slow leaker; better to just reboot.
69 # System slices that should not be reset (regexps)
70 system_slices = ['root', PLC_SLICE_PREFIX + '_']
72 # Message sent after a critical reboot
73 rebooted_subject = "pl_mom rebooted %(hostname)s"
76 Sometime before %(date)s, swap space was
77 nearly exhausted on %(hostname)s, so pl_mom rebooted it.
79 Slices active prior to reboot are listed below. Memory usage
80 statistics are not entirely accurate due to threading.
84 %(date)s %(hostname)s reboot
87 # Message sent after a hog is reset
88 reset_subject = "pl_mom reset slice %(slice)s on %(hostname)s"
91 Sometime before %(date)s, swap space was
92 nearly exhausted on %(hostname)s.
94 Slice %(slice)s was reset since it was the largest consumer of
95 physical memory at %(rss)s (%(percent)4.1f%%).
97 Please reply to this message explaining the nature of your experiment,
98 and what you are doing to address the problem.
100 %(slice)s processes prior to reset:
104 %(date)s %(hostname)s reset %(slice)s
107 # Message sent to system slices that should not be reset
108 alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
111 Sometime before %(date)s, swap space was
112 nearly exhausted on %(hostname)s.
114 System slice %(slice)s was the largest consumer of physical memory at
115 %(rss)s (%(percent)4.1f%%). It was not reset, but please verify its
118 %(slice)s processes prior to alarm:
122 %(date)s %(hostname)s alarm %(slice)s
125 # Message sent after a slice has been killed
126 kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
129 Sometime before %(date)s, swap space was
130 nearly exhausted on %(hostname)s.
132 Slice %(slice)s was killed since it was the largest consumer of
133 physical memory at %(rss)s (%(percent)4.1f%%) after repeated restarts.
135 Please reply to this message explaining the nature of your experiment,
136 and what you are doing to address the problem.
138 %(slice)s processes prior to reset:
142 %(date)s %(hostname)s reset %(slice)s
149 Keeps track of state information for resets and kills
151 resettimeleft - timeout before checking for next reset
152 resetcount - number of strikes
153 killtimeleft - time out before removing from kill queue
154 {kill,reset}mail - Time of last email
155 kill - State of kill. If slice is already being killed, wait before retry.
# Create the bookkeeping record for one slice. Both countdown timers start
# at their full configured values (reset_timeout and kill_timeout).
# NOTE(review): initialisation of the other attributes described in the
# class docstring (resetcount, resetmail, killmail) is not visible here --
# confirm they are set before update()/reset() read them.
158 def __init__(self,name):
160 self.resettimeleft = reset_timeout
163 self.killtimeleft = kill_timeout
# Timer tick for one tracked slice: each call decrements resettimeleft and
# killtimeleft by one while they are still positive. The main loop calls
# this for every entry in resetlist.
170 # Count down for next check of reset slice.
171 if self.resettimeleft > 0:
172 self.resettimeleft -= 1
173 if debug and verbose:
# NOTE(review): the value printed here is killtimeleft, although this branch
# is counting down resettimeleft -- confirm that is intended.
174 print "%s has %s seconds in probation" \
175 %(self.name, self.killtimeleft)
176 if self.killtimeleft > 0:
177 # Count down kill probation timer (killtimeleft)
178 self.killtimeleft -= 1
# The message is emitted when the timer reaches 1, i.e. one tick before it
# actually expires.
179 if self.killtimeleft == 1:
180 print "%s is out of probation" % self.name
182 # Once out of probation period (killtimeleft), remove strikes
186 # Check to see if a slice needs to be killed. If it has been reset at least
187 # kill_thresh times while still in probation (killtimeleft > 0), kill the slice and send an email.
#
# params is the mapping used to fill in the kill_subject / kill_body
# templates. The caller (reset) tests the result for truth.
# NOTE(review): the return statements are not visible here -- presumably a
# true value when the slice was killed; confirm.
188 def checkkill(self,params):
189 if self.killtimeleft > 0 and self.resetcount >= kill_thresh:
191 print kill_subject % params
192 print kill_body % params
196 print "Slice %s is being killed." % self.name
197 vserver = VServer(self.name)
# A failure while killing the slice is logged as a warning, not propagated.
202 except Exception, err:
203 print "Warning: Exception received while killing slice %s: %s" \
205 if (time.time() - self.killmail) > email_timeout:
# Rate limit: at most one kill email per email_timeout seconds, tracked by
# the time of the last one (killmail).
206 slicemail(self.name, kill_subject % params, kill_body % params)
207 print "Sending KILL email for slice %s" % self.name
208 self.killmail = time.time()
212 # Reset slice after checking to see if slice is out of timeout.
213 # Increment resetcount, check to see if larger than kill_thresh.
#
# params is the mapping used to fill in the reset_subject / reset_body
# templates. Nothing happens while a previously reset slice is still inside
# its reset timeout (resetcount != 0 and resettimeleft != 0).
214 def reset(self, params):
215 # If it's the first reset (came back after kill)
216 # or if it's been reset before
217 # and we are out of the reset timeout.
218 if self.resetcount == 0 or self.resettimeleft == 0:
219 # Do we need to kill this slice? Check history first.
220 if self.checkkill(params):
# Every reset attempt restarts both the kill-probation and the reset
# countdown from their configured values.
224 self.killtimeleft = kill_timeout
225 self.resettimeleft = reset_timeout
226 print "%s has %s seconds to die and has been reset %s times" \
227 %(self.name, self.resettimeleft, self.resetcount)
229 print reset_subject % params
230 print reset_body % params
234 print "Resetting slice " + self.name
235 vserver = VServer(self.name)
# wait = False is passed to start(); presumably so the daemon does not block
# on slice start-up -- confirm against util-vserver's VServer.start.
237 vserver.start(wait = False)
# A failure while resetting the slice is logged as a warning, not propagated.
241 except Exception, err:
242 print "Warning: Exception received while resetting slice %s:" \
244 if (time.time() - self.resetmail) > email_timeout:
# Rate limit: at most one reset email per email_timeout seconds, tracked by
# the time of the last one (resetmail).
245 slicemail(self.name, reset_subject % params, reset_body % params)
246 print "Sending Reset email for slice %s" % self.name
247 self.resetmail = time.time()
252 Usage: %s [OPTIONS]...
255 -d, --debug Enable debugging (default: %s)
256 -v, --verbose Increase verbosity level (default: %d)
257 -f, --file=FILE Data file (default: %s)
258 -s, --slice=SLICE Constrain monitoring to these slices (default: all)
259 -p, --period=SECONDS Seconds between normal process analysis (default: %s)
260 --reset-thresh=PERCENT Swap utilization at which slice reset is attempted
261 --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted
262 --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog
263 --system-slice=SLICE System slice that should not be reset
264 --status Print memory usage statistics and exit
265 -h, --help This message
266 """.lstrip() % (sys.argv[0], debug, verbose, datafile, format_period(period))
268 def slicestat(names = None):
270 Get status of specified slices (if names is None or empty, all
271 slices). vsize and rss are in KiB. Returns
273 {xid: {'xid': slice_id,
275 'procs': [{'pid': pid, 'xid': slice_id, 'user': username, 'cmd': command,
276 'vsize': virtual_kib, 'rss': physical_kib,
277 'pcpu': cpu_percent, 'pmem': mem_percent}]
278 'vsize': total_virtual_kib,
279 'rss': total_physical_kib}}
# Body of slicestat(): run vps once, turn every output line into a process
# dict keyed by the names in `fields`, attribute the process to a slice, and
# accumulate per-slice vsize/rss totals in `slices`, keyed by context id.
282 # Mandatory fields. xid is a virtual field inserted by vps. Make
283 # sure cmd is last so that it does not get truncated
285 fields = ['pid', 'xid', 'user', 'vsize', 'rss', 'pcpu', 'pmem', 'cmd']
287 # vps inserts xid after pid in the output, but ps doesn't know
288 # what the field means.
289 ps_fields = list(fields)
290 ps_fields.remove('xid')
294 # Eat the header line. vps depends on the header to figure out
295 # which column is the PID column, so we can't just tell ps not to
297 for line in bwlimit.run("/usr/sbin/vps -e -o " + ",".join(ps_fields))[1:]:
301 # Replace "0 MAIN" and "1 ALL_PROC" (the special monikers that
302 # vps uses to denote the root context and the "all contexts"
303 # context) with "0" so that we can just split() on whitespace.
304 line = line.replace("0 MAIN", "0").replace("1 ALL_PROC", "0")
306 # Represent process as a dict of fields
# maxsplit = len(fields) - 1 keeps the trailing cmd column in one piece even
# when the command line itself contains whitespace.
307 values = line.split(None, len(fields) - 1)
308 if len(values) != len(fields):
310 proc = dict(zip(fields, values))
312 # Convert ints and floats
315 proc[field] = int(proc[field])
318 proc[field] = float(proc[field])
322 # vps sometimes prints ERR instead of a context ID if it
323 # cannot identify the context of an orphaned (usually dying)
324 # process. Skip these processes.
# An xid that is still a string at this point was not converted above.
325 if type(proc['xid']) != int:
328 # Assign (pl_)sshd processes to slice instead of root
# NOTE(review): the character class [a-zA-Z_] stops at the first digit or
# hyphen, so a slice name containing one is only partially captured --
# confirm the naming rules for slices.
329 m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd'])
331 xid = bwlimit.get_xid(m.group(1))
335 name = bwlimit.get_slice(proc['xid'])
337 # Orphaned (not associated with a slice) class
338 name = "%d?" % proc['xid']
340 # Monitor only the specified slices
# An empty or None `names` disables the filter, i.e. all slices are kept.
341 if names and name not in names:
344 # Additional overhead calculations from slicestat
346 # Include 12 KiB of process overhead =
347 # 4 KiB top-level page table +
348 # 4 KiB kernel structure +
349 # 4 KiB basic page table
352 # Include additional page table overhead
# Adds 4 KiB for every full 4096 KiB of (vsize - 1). This relies on "/"
# being integer division on ints, as it is in Python 2.
353 if proc['vsize'] > 4096:
354 proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)
# One accumulator dict per context id, created on first sight of that xid.
# dict.has_key() exists only in Python 2.
356 if slices.has_key(proc['xid']):
357 slice = slices[proc['xid']]
359 slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'rss': 0}
361 slice['procs'].append(proc)
362 slice['vsize'] += proc['vsize']
363 slice['rss'] += proc['rss']
365 slices[proc['xid']] = slice
def memtotal(path = "/proc/meminfo"):
    """
    Returns total physical memory on the system in KiB.

    path names a file in /proc/meminfo format. Only its first line is
    inspected, which is expected to look like

        MemTotal:       255396 kB

    Returns the integer value from that line, or 0 when the first line is
    not a MemTotal entry.

    NOTE(review): the def line and the return statements were not legible
    in the reviewed text. The integer result follows the docstring; the 0
    fallback is an assumption -- confirm against the full file.
    """

    meminfo = open(path, "r")
    try:
        line = meminfo.readline()
    finally:
        # Release the descriptor even if the read itself fails.
        meminfo.close()

    if line[0:8] == "MemTotal":
        # MemTotal:       255396 kB
        (name, value, kb) = line.split()
        return int(value)

    return 0
def swap_used(path = "/proc/swaps"):
    """
    Returns swap utilization on the system as a whole percentage (0-100).

    path names a file in /proc/swaps format: one header line followed by
    one "filename type size used priority" line per swap area. Sizes and
    usage are summed over all areas. Malformed entries are skipped.
    Returns 0 when the file cannot be read or no swap space is listed.

    NOTE(review): the function name, the accumulator initialisation and
    the bodies of the exception handlers were not legible in the reviewed
    text and are reconstructed from the surrounding lines -- confirm
    against the caller.
    """

    total_swap = 0
    total_used = 0

    try:
        swaps = open(path, "r")
        try:
            # Eat the header line
            lines = swaps.readlines()[1:]
        finally:
            swaps.close()

        for line in lines:
            # /dev/mapper/planetlab-swap partition 1048568 3740 -1
            try:
                (filename, swap_type, size, used, priority) = line.strip().split()
                # Convert both numbers before accumulating so that a bad
                # entry never contributes only one of them.
                (size, used) = (int(size), int(used))
            except ValueError:
                # This handler used to name the non-existent "ValueEror",
                # so a malformed entry raised NameError instead of being
                # skipped.
                continue
            total_swap += size
            total_used += used
    except (IOError, KeyError):
        pass

    if total_swap == 0:
        # Nothing usable was read: report 0% rather than dividing by zero.
        return 0

    # Floor division keeps the result a whole percentage on any Python.
    return 100 * total_used // total_swap
def summary(names = None, total_rss = memtotal()):
    """
    Return a summary of memory usage by slice.

    Produces a plain-text table with one row per slice reported by
    slicestat(names), largest physical memory consumer first. Each row
    lists the slice name, its number of processes, and its physical
    memory both as a formatted size and as a percentage of total_rss
    (in KiB, like the per-slice rss figures).
    """

    # Largest consumer first; slices with equal rss keep their incoming order.
    by_rss = sorted(slicestat(names).values(),
                    key = lambda entry: entry['rss'],
                    reverse = True)

    header = "%-20s%10s%24s\n\n" % ("Slice", "Processes", "Memory Usage")

    rows = []
    for entry in by_rss:
        size = format_bytes(entry['rss'] * 1024, si = False)
        share = 100. * entry['rss'] / total_rss
        rows.append("%-20s%10d%16s (%4.1f%%)\n" %
                    (entry['name'], len(entry['procs']), size, share))

    return header + "".join(rows)
# Body of the daemon entry point: parse the command line into the module-level
# settings, load any state saved before a reboot, then poll swap utilization
# and react at the reset and reboot thresholds.
428 global debug, verbose, datafile
429 global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
# NOTE(review): three inconsistencies between the option tables and the
# handlers below:
#   * "p" has no trailing ":" in the short-option string, so "-p SECONDS"
#     is parsed as a flag with an empty value although the usage text
#     documents it as taking SECONDS;
#   * "--change-thresh" is handled below but is not listed in longopts, so
#     getopt rejects it before that branch can run;
#   * "min-thresh=" is listed in longopts but no handler for it is visible
#     here -- confirm one exists.
434 longopts = ["debug", "verbose", "file=", "slice=", "status", "help"]
435 longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="]
436 (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts)
437 except getopt.GetoptError, err:
438 print "Error: " + err.msg
442 for (opt, optval) in opts:
443 if opt == "-d" or opt == "--debug":
445 elif opt == "-v" or opt == "--verbose":
447 elif opt == "-f" or opt == "--file":
449 elif opt == "-s" or opt == "--slice":
451 elif opt == "-p" or opt == "--period":
453 elif opt == "--change-thresh":
454 change_thresh = int(optval)
455 elif opt == "--reset-thresh":
456 reset_thresh = int(optval)
457 elif opt == "--reboot-thresh":
458 reboot_thresh = int(optval)
# Extra system-slice patterns are appended to the module-level list; they
# are later applied with re.match(), i.e. as regexps anchored at the start
# of the slice name.
459 elif opt == "--system-slice":
460 system_slices.append(optval)
461 elif opt == "--status":
468 # Check if we are already running
475 # Redirect stdout and stderr to syslog
# From here on every print statement goes through the Logger object.
476 syslog.openlog("swapmon")
477 sys.stdout = sys.stderr = Logger()
479 # Get total physical memory
480 total_rss = memtotal()
# The data file is written just before a forced reboot (see the
# reboot_thresh branch below), so presumably finding one at start-up means
# the previous run ended in such a reboot -- confirm.
# NOTE(review): pickle.load() executes whatever the file describes; make
# sure datafile is writable only by trusted users.
483 f = open(datafile, "r+")
485 print "Loading %s" % datafile
486 (version, slices) = pickle.load(f)
488 # Check version of data file
489 if version != "$Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $":
490 print "Not using old version '%s' data file %s" % (version, datafile)
# NOTE(review): summary() is declared as summary(names = None, total_rss = ...),
# so the positional argument below binds total_rss to `names`, not to
# `total_rss`. Presumably summary(total_rss = total_rss) or
# summary(names, total_rss) was intended -- confirm.
493 params = {'hostname': socket.gethostname(),
494 'date': time.asctime(time.gmtime()) + " GMT",
495 'table': summary(total_rss)}
498 print rebooted_subject % params
499 print rebooted_body % params
501 slicemail(None, rebooted_subject % params, rebooted_body % params)
# Version tag stored alongside the pickled slice state; it must match the
# literal compared against when the file is loaded above.
506 version = "$Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $"
509 # Query process table every 30 seconds, or when a large change in
510 # swap utilization is detected.
515 # System slices that we have warned but could not reset
518 # Slices that were reset
# Advance the reset/probation timers of every slice reset so far.
524 for resetslice in resetlist.keys():
525 resetlist[resetslice].update()
527 if last_used is None:
531 print "%d%% swap consumed" % used
533 if used >= reboot_thresh:
534 # Dump slice state before rebooting
536 print "Saving %s" % datafile
537 f = open(datafile, "w")
538 pickle.dump((version, slices), f)
541 # Goodbye, cruel world
542 print "%d%% swap consumed, rebooting" % used
# sync first, then a forced reboot (-f).
544 bwlimit.run("/bin/sync; /sbin/reboot -f")
546 elif used >= reset_thresh:
548 print "Memory used = %s" %(used)
# Walk the slices from the largest physical memory consumer downwards.
550 slicelist = slices.values()
551 slicelist.sort(lambda a, b: b['rss'] - a['rss'])
552 for slice in slicelist:
553 percent = 100. * slice['rss'] / total_rss
# The list is sorted by rss, so once one slice is below rss_min all
# remaining slices are too.
555 if slice['rss'] < rss_min:
558 print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
561 format_bytes(slice['rss'] * 1024, si = False),
564 slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
566 table = "%5s %10s %10s %4s %4s %s\n\n" % ("PID", "VIRT", "RES", '%CPU', '%MEM', 'COMMAND')
567 for proc in slice['procs']:
568 table += "%5s %10s %10s %4.1f %4.1f %s\n" % \
570 format_bytes(proc['vsize'] * 1024, si = False),
571 format_bytes(proc['rss'] * 1024, si = False),
572 proc['pcpu'], proc['pmem'], proc['cmd'])
574 params = {'hostname': socket.gethostname(),
575 'date': time.asctime(time.gmtime()) + " GMT",
577 'slice': slice['name'],
578 'rss': format_bytes(slice['rss'] * 1024, si = False),
581 # Match slice name against system slice patterns
# The result is the list of successful matches, so it is truthy when at
# least one pattern matches the beginning of the slice name.
582 is_system_slice = filter(None, [re.match(pattern, slice['name']) for pattern in system_slices])
# Each system slice is warned about only once: its name is remembered in
# `warned`.
585 if slice['name'] not in warned:
586 warned.append(slice['name'])
588 print alarm_subject % params
589 print alarm_body % params
591 print "Warning slice " + slice['name']
592 slicemail(slice['name'], alarm_subject % params,
# One Reset record per slice name, created on first use; it carries the
# strike count and timers that decide between reset and kill.
596 if not resetlist.has_key(slice['name']):
597 resetlist[slice['name']] = Reset(slice['name'])
598 resetlist[slice['name']].reset(params)
# Refresh the process table after acting on a slice.
599 slices = slicestat(names)
# Re-scan the process table when the period timer has run out, or early when
# swap use grew by at least change_thresh since the last sample.
601 if timer <= 0 or used >= (last_used + change_thresh):
602 if used >= (last_used + change_thresh):
603 print "%d%% swap consumed, %d%% in last %d seconds" % \
604 (used, used - last_used, period - timer)
606 slices = slicestat(names)
609 # Keep track of large changes in swap utilization
617 if __name__ == '__main__':