#!/usr/bin/python # # Swap monitoring daemon. Every 30 seconds, checks process memory # usage. At 90% utilization, resets the slice that is consuming the # most physical memory. At 95% utilization, reboots the machine to # avoid a crash. # # Mark Huang # Andy Bavier # Faiyaz Ahmed # Copyright (C) 2004-2006 The Trustees of Princeton University # # $Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $ # import syslog import os import sys import getopt import re import pickle import socket import time # util-vserver/python/vserver.py allows us to control slices directly # from Python from vserver import VServer # bwlimit exports a few useful functions like run(), get_xid(), and get_slice() import bwlimit # Utility functions from pl_mom import * # Defaults debug = False verbose = 0 datafile = "/var/lib/misc/swapmon.dat" # Seconds between process analysis period = 30 # Minimum change in swap utilization over 30 seconds that will trigger # early process analysis. change_thresh = 5 # Swap utilization at which the largest consumer of physical memory is reset reset_thresh = 80 # Swap utilization at which the machine is rebooted reboot_thresh = 95 # Time to wait before checking slice again after reset reset_timeout = 15 # Number of strikes before killing (strike, strike, kill) kill_thresh = 2 # Time to wait before removing slice from kill queue (probation) kill_timeout = 120 # Don't email the same message more than once in the same emailtimeout interval email_timeout = 1800 # Physical size threshold to be considered a consumer. Rationale is if there are no procs # with a size at least as large as this, then there is a slow leaker; better to just reboot. rss_min = 150 * 1024 # System slices that should not be reset (regexps) system_slices = ['root', PLC_SLICE_PREFIX + '_'] # Message sent after a critical reboot rebooted_subject = "pl_mom rebooted %(hostname)s" rebooted_body = \ """ Sometime before %(date)s, swap space was nearly exhausted on %(hostname)s, so pl_mom rebooted it. Slices active prior to reboot are listed below. Memory usage statistics are not entirely accurate due to threading. %(table)s %(date)s %(hostname)s reboot """.lstrip() # Message sent after a hog is reset reset_subject = "pl_mom reset slice %(slice)s on %(hostname)s" reset_body = \ """ Sometime before %(date)s, swap space was nearly exhausted on %(hostname)s. Slice %(slice)s was reset since it was the largest consumer of physical memory at %(rss)s (%(percent)4.1f%%). Please reply to this message explaining the nature of your experiment, and what you are doing to address the problem. %(slice)s processes prior to reset: %(table)s %(date)s %(hostname)s reset %(slice)s """.lstrip() # Message sent to system slices that should not be reset alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s" alarm_body = \ """ Sometime before %(date)s, swap space was nearly exhausted on %(hostname)s. System slice %(slice)s was the largest consumer of physical memory at %(rss)s (%(percent)4.1f%%). It was not reset, but please verify its behavior. %(slice)s processes prior to alarm: %(table)s %(date)s %(hostname)s alarm %(slice)s """.lstrip() # Message sent after a slice has been killed kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s" kill_body = \ """ Sometime before %(date)s, swap space was nearly exhausted on %(hostname)s. Slice %(slice)s was killed since it was the largest consumer of physical memory at %(rss)s (%(percent)4.1f%%) after repeated restarts. Please reply to this message explaining the nature of your experiment, and what you are doing to address the problem. %(slice)s processes prior to reset: %(table)s %(date)s %(hostname)s reset %(slice)s """.lstrip() class Reset: """ Keeps track of state information for resets and kills resettimeleft - timeout before checking for next reset resetcount - number of strikes killtimeleft - time out before removing from kill queue {kill,reset}mail - Time of last email kill - State of kill. If slice is already being killed, wait before retry. """ def __init__(self,name): self.name = name self.resettimeleft = reset_timeout self.resetcount = 0 self.resetmail = 0 self.killtimeleft = kill_timeout self.killmail = 0 def __repr__(self): return self.name def update(self): # Count down for next check of reset slice. if self.resettimeleft > 0: self.resettimeleft -= 1 if debug and verbose: print "%s has %s seconds in probation" \ %(self.name, self.killtimeleft) if self.killtimeleft > 0: # Count down kill probation timer (killtimeleft) self.killtimeleft -= 1 if self.killtimeleft == 1: print "%s is out of probation" % self.name else: # Once out of probation period (killtimeleft), remove strikes self.resetcount = 0 # Check to see if a slice needs to be killed. If it has been killed more # than kill_thresh in the probation period (kill_timeout) send an email, kill the slice. def checkkill(self,params): if self.killtimeleft > 0 and self.resetcount >= kill_thresh: if debug: print kill_subject % params print kill_body % params try: pid = os.fork() if pid == 0: print "Slice %s is being killed." % self.name vserver = VServer(self.name) vserver.stop() os._exit(0) else: os.waitpid(pid,0) except Exception, err: print "Warning: Exception received while killing slice %s: %s" \ % self.name, err if (time.time() - self.killmail) > email_timeout: slicemail(self.name, kill_subject % params, kill_body % params) print "Sending KILL email for slice %s" % self.name self.killmail = time.time() return True return False # Reset slice after checking to see if slice is out of timeout. # Increment resetcount, check to see if larger than kill_thresh. def reset(self, params): # If its the first reset (came back after kill) # or if its been reset before # and we are out of the reset timeout. if self.resetcount == 0 or self.resettimeleft == 0: # Do we need to kill this slice? Check history first. if self.checkkill(params): return # Update counters self.resetcount += 1 self.killtimeleft = kill_timeout self.resettimeleft = reset_timeout print "%s has %s seconds to die and has been reset %s times" \ %(self.name, self.resettimeleft, self.resetcount) if debug: print reset_subject % params print reset_body % params try: pid = os.fork() if pid == 0: print "Resetting slice " + self.name vserver = VServer(self.name) vserver.stop() vserver.start(wait = False) os._exit(0) else: os.waitpid(pid,0) except Exception, err: print "Warning: Exception received while resetting slice %s:" \ % self.name, err if (time.time() - self.resetmail) > email_timeout: slicemail(self.name, reset_subject % params, reset_body % params) print "Sending Reset email for slice %s" % self.name self.resetmail = time.time() def usage(): print """ Usage: %s [OPTIONS]... Options: -d, --debug Enable debugging (default: %s) -v, --verbose Increase verbosity level (default: %d) -f, --file=FILE Data file (default: %s) -s, --slice=SLICE Constrain monitoring to these slices (default: all) -p, --period=SECONDS Seconds between normal process analysis (default: %s) --reset-thresh=PERCENT Swap utilization at which slice reset is attempted --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog --system-slice=SLICE System slice that should not be reset --status Print memory usage statistics and exit -h, --help This message """.lstrip() % (sys.argv[0], debug, verbose, datafile, format_period(period)) def slicestat(names = None): """ Get status of specified slices (if names is None or empty, all slices). vsize and rss are in KiB. Returns {xid: {'xid': slice_id, 'name': slice_name, 'procs': [{'pid': pid, 'xid': slice_id, 'user', username, 'cmd': command, 'vsize': virtual_kib, 'rss': physical_kib, 'pcpu': cpu_percent, 'pmem': mem_percent}] 'vsize': total_virtual_kib, 'rss': total_physical_kib}} """ # Mandatory fields. xid is a virtual field inserted by vps. Make # sure cmd is last so that it does not get truncated # automatically. fields = ['pid', 'xid', 'user', 'vsize', 'rss', 'pcpu', 'pmem', 'cmd'] # vps inserts xid after pid in the output, but ps doesn't know # what the field means. ps_fields = list(fields) ps_fields.remove('xid') slices = {} # Eat the header line. vps depends on the header to figure out # which column is the PID column, so we can't just tell ps not to # print it. for line in bwlimit.run("/usr/sbin/vps -e -o " + ",".join(ps_fields))[1:]: # Chomp newline line = line.strip() # Replace "0 MAIN" and "1 ALL_PROC" (the special monikers that # vps uses to denote the root context and the "all contexts" # context) with "0" so that we can just split() on whitespace. line = line.replace("0 MAIN", "0").replace("1 ALL_PROC", "0") # Represent process as a dict of fields values = line.split(None, len(fields) - 1) if len(values) != len(fields): continue proc = dict(zip(fields, values)) # Convert ints and floats for field in proc: try: proc[field] = int(proc[field]) except ValueError: try: proc[field] = float(proc[field]) except ValueError: pass # vps sometimes prints ERR instead of a context ID if it # cannot identify the context of an orphaned (usually dying) # process. Skip these processes. if type(proc['xid']) != int: continue # Assign (pl_)sshd processes to slice instead of root m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd']) if m is not None: xid = bwlimit.get_xid(m.group(1)) if xid is not None: proc['xid'] = xid name = bwlimit.get_slice(proc['xid']) if name is None: # Orphaned (not associated with a slice) class name = "%d?" % proc['xid'] # Monitor only the specified slices if names and name not in names: continue # Additional overhead calculations from slicestat # Include 12 KiB of process overhead = # 4 KiB top-level page table + # 4 KiB kernel structure + # 4 KiB basic page table proc['rss'] += 12 # Include additional page table overhead if proc['vsize'] > 4096: proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096) if slices.has_key(proc['xid']): slice = slices[proc['xid']] else: slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'rss': 0} slice['procs'].append(proc) slice['vsize'] += proc['vsize'] slice['rss'] += proc['rss'] slices[proc['xid']] = slice return slices def memtotal(): """ Returns total physical memory on the system in KiB. """ meminfo = open("/proc/meminfo", "r") line = meminfo.readline() meminfo.close() if line[0:8] == "MemTotal": # MemTotal: 255396 kB (name, value, kb) = line.split() return int(value) return 0 def swap_used(): """ Returns swap utilization on the system as a whole percentage (0-100). """ total_swap = 0 total_used = 0 try: swaps = open("/proc/swaps", "r") # Eat header line lines = swaps.readlines()[1:] swaps.close() for line in lines: # /dev/mapper/planetlab-swap partition 1048568 3740 -1 (filename, type, size, used, priority) = line.strip().split() try: total_swap += int(size) total_used += int(used) except ValueEror, err: pass except (IOError, KeyError), err: pass return 100 * total_used / total_swap def summary(names = None, total_rss = memtotal()): """ Return a summary of memory usage by slice. """ slicelist = slicestat(names).values() slicelist.sort(lambda a, b: b['rss'] - a['rss']) table = "%-20s%10s%24s\n\n" % ("Slice", "Processes", "Memory Usage") for slice in slicelist: table += "%-20s%10d%16s (%4.1f%%)\n" % \ (slice['name'], len(slice['procs']), format_bytes(slice['rss'] * 1024, si = False), 100. * slice['rss'] / total_rss) return table def main(): # Defaults global debug, verbose, datafile global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices # All slices names = [] try: longopts = ["debug", "verbose", "file=", "slice=", "status", "help"] longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="] (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts) except getopt.GetoptError, err: print "Error: " + err.msg usage() sys.exit(1) for (opt, optval) in opts: if opt == "-d" or opt == "--debug": debug = True elif opt == "-v" or opt == "--verbose": verbose += 1 elif opt == "-f" or opt == "--file": datafile = optval elif opt == "-s" or opt == "--slice": names.append(optval) elif opt == "-p" or opt == "--period": period = int(optval) elif opt == "--change-thresh": change_thresh = int(optval) elif opt == "--reset-thresh": reset_thresh = int(optval) elif opt == "--reboot-thresh": reboot_thresh = int(optval) elif opt == "--system-slice": system_slices.append(optval) elif opt == "--status": print summary(names) sys.exit(0) else: usage() sys.exit(0) # Check if we are already running writepid("swapmon") if not debug: daemonize() # Rewrite PID file writepid("swapmon") # Redirect stdout and stderr to syslog syslog.openlog("swapmon") sys.stdout = sys.stderr = Logger() # Get total physical memory total_rss = memtotal() try: f = open(datafile, "r+") if verbose: print "Loading %s" % datafile (version, slices) = pickle.load(f) f.close() # Check version of data file if version != "$Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $": print "Not using old version '%s' data file %s" % (version, datafile) raise Exception params = {'hostname': socket.gethostname(), 'date': time.asctime(time.gmtime()) + " GMT", 'table': summary(total_rss)} if debug: print rebooted_subject % params print rebooted_body % params else: slicemail(None, rebooted_subject % params, rebooted_body % params) # Delete data file os.unlink(datafile) except Exception: version = "$Id: swapmon.py,v 1.9 2006/07/19 19:40:55 faiyaza Exp $" slices = {} # Query process table every 30 seconds, or when a large change in # swap utilization is detected. timer = period last_used = None used = None # System slices that we have warned but could not reset warned = [] # Slices that were reset resetlist = {} while True: used = swap_used() for resetslice in resetlist.keys(): resetlist[resetslice].update() if last_used is None: last_used = used if verbose: print "%d%% swap consumed" % used if used >= reboot_thresh: # Dump slice state before rebooting if verbose: print "Saving %s" % datafile f = open(datafile, "w") pickle.dump((version, slices), f) f.close() # Goodbye, cruel world print "%d%% swap consumed, rebooting" % used if not debug: bwlimit.run("/bin/sync; /sbin/reboot -f") elif used >= reset_thresh: if debug: print "Memory used = %s" %(used) # Try and find a hog slicelist = slices.values() slicelist.sort(lambda a, b: b['rss'] - a['rss']) for slice in slicelist: percent = 100. * slice['rss'] / total_rss if slice['rss'] < rss_min: continue print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \ (used, slice['name'], format_bytes(slice['rss'] * 1024, si = False), percent) slice['procs'].sort(lambda a, b: b['rss'] - a['rss']) table = "%5s %10s %10s %4s %4s %s\n\n" % ("PID", "VIRT", "RES", '%CPU', '%MEM', 'COMMAND') for proc in slice['procs']: table += "%5s %10s %10s %4.1f %4.1f %s\n" % \ (proc['pid'], format_bytes(proc['vsize'] * 1024, si = False), format_bytes(proc['rss'] * 1024, si = False), proc['pcpu'], proc['pmem'], proc['cmd']) params = {'hostname': socket.gethostname(), 'date': time.asctime(time.gmtime()) + " GMT", 'table': table, 'slice': slice['name'], 'rss': format_bytes(slice['rss'] * 1024, si = False), 'percent': percent} # Match slice name against system slice patterns is_system_slice = filter(None, [re.match(pattern, slice['name']) for pattern in system_slices]) if is_system_slice: if slice['name'] not in warned: warned.append(slice['name']) if debug: print alarm_subject % params print alarm_body % params else: print "Warning slice " + slice['name'] slicemail(slice['name'], alarm_subject % params, alarm_body % params) else: # Reset slice if not resetlist.has_key(slice['name']): resetlist[slice['name']] = Reset(slice['name']) resetlist[slice['name']].reset(params) slices = slicestat(names) if timer <= 0 or used >= (last_used + change_thresh): if used >= (last_used + change_thresh): print "%d%% swap consumed, %d%% in last %d seconds" % \ (used, used - last_used, period - timer) # Get slice state slices = slicestat(names) # Reset timer timer = period # Keep track of large changes in swap utilization last_used = used timer -= 1 time.sleep(1) removepid("swapmon") if __name__ == '__main__': main()