#!/usr/bin/python
#
# Swap monitoring daemon. Once per second, samples swap utilization;
# every `period` seconds (default 30), or as soon as utilization jumps
# by `change_thresh` percent, re-reads per-slice process memory usage.
# At `reset_thresh` (default 80%) swap utilization, kills the
# non-system slices that are large consumers of physical memory. At
# `reboot_thresh` (default 95%) utilization, reboots the machine to
# avoid a crash.
#
# Mark Huang
# Andy Bavier
# Faiyaz Ahmed
# Copyright (C) 2004-2006 The Trustees of Princeton University
#

import syslog
import os
import sys
import getopt
import re
import pickle
import socket
import time

# bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
import plnode.bwlimit as bwlimit

# Utility functions (format_bytes, format_period, slicemail, writepid,
# removepid, daemonize, Logger, PLC_SLICE_PREFIX, ...)
from pl_mom import *

# Defaults
debug = False
verbose = 0
DATAFILE = "/var/lib/misc/swapmon.dat"
# xxx fixme - this is broken under git
VERSION = "$Id$"

# Seconds between process analysis
period = 30

# Minimum change in swap utilization over 30 seconds that will trigger
# early process analysis.
change_thresh = 5

# Swap utilization at which the largest consumers of physical memory are reset
reset_thresh = 80

# Swap utilization at which the machine is rebooted
reboot_thresh = 95

# Don't email the same message more than once in the same email_timeout
# interval (seconds)
email_timeout = 1800

# Physical size threshold (KiB) to be considered a consumer. Rationale is
# if there are no procs with a size at least as large as this, then there
# is a slow leaker; better to just reboot.
rss_min = 150 * 1024

# System slices that should not be reset (regexps)
system_slices = ['root', PLC_SLICE_PREFIX + '_']

# Message sent after a critical reboot
rebooted_subject = "pl_mom rebooted %(hostname)s"
rebooted_body = \
"""
Sometime before %(date)s, swap space was
nearly exhausted on %(hostname)s, so pl_mom rebooted it.

Slices active prior to reboot are listed below. Memory usage
statistics are not entirely accurate due to threading.

%(table)s

%(date)s %(hostname)s reboot
""".lstrip()

# Message sent to system slices that should not be reset
alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
alarm_body = \
"""
Sometime before %(date)s, swap space was
nearly exhausted on %(hostname)s.

System slice %(slice)s was the largest consumer of physical memory at
%(rss)s (%(percent)4.1f%%) (%(sz)s writable). It was not reset,
but please verify its behavior.

%(slice)s processes prior to alarm:

%(table)s

%(date)s %(hostname)s alarm %(slice)s
""".lstrip()

# Message sent after a slice has been killed
kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
kill_body = \
"""
Sometime before %(date)s, swap space was
nearly exhausted on %(hostname)s.

Slice %(slice)s was killed since it was the largest consumer of
physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable)
after repeated restarts.

Please reply to this message explaining the nature of your experiment,
and what you are doing to address the problem.

%(slice)s processes prior to reset:

%(table)s

%(date)s %(hostname)s reset %(slice)s
""".lstrip()


def killsliverprocs(xid):
    """
    Send SIGKILL to the processes in context `xid` by running vkill
    through bwlimit.run().
    """
    bwlimit.run("/usr/sbin/vkill -s 9 -c %s 0" % xid)


def usage():
    """
    Print the command line help, including the current defaults.
    """
    print("""
Usage: %s [OPTIONS]...

Options:
        -d, --debug             Enable debugging (default: %s)
        -v, --verbose           Increase verbosity level (default: %d)
        -f, --file=FILE         Data file (default: %s)
        -s, --slice=SLICE       Constrain monitoring to these slices (default: all)
        -p, --period=SECONDS    Seconds between normal process analysis (default: %s)
        --change-thresh=PERCENT Rise in swap utilization that triggers early process analysis
        --reset-thresh=PERCENT  Swap utilization at which slice reset is attempted
        --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted
        --min-thresh=KIB        Minimum physical memory usage (KiB) to be considered a hog
        --system-slice=SLICE    System slice that should not be reset
        --status                Print memory usage statistics and exit
        --memstatus             Print total memory, total swap, and swap used
        -h, --help              This message
""".lstrip() % (sys.argv[0], debug, verbose, DATAFILE, format_period(period)))


def _to_number(text):
    """
    Return `text` as an int if possible, else as a float, else unchanged.
    """
    try:
        return int(text)
    except ValueError:
        pass
    try:
        return float(text)
    except ValueError:
        return text


def slicestat(names = None):
    """
    Get status of specified slices (if names is None or empty, all
    slices). vsize, sz, and rss are in KiB. Returns

    {xid: {'xid': slice_id,
           'name': slice_name,
           'procs': [{'pid': pid,
                      'xid': slice_id,
                      'vsname': name_printed_by_vps,
                      'cmd': command,
                      'vsize': virtual_kib,
                      'sz': potential_kib,
                      'rss': physical_kib,
                      'pmem': mem_percent}],
           'vsize': total_virtual_kib,
           'sz': total_potential_kib,
           'rss': total_physical_kib}}
    """

    # Mandatory fields. xid is a virtual field inserted by vps. Make
    # sure cmd is last so that it does not get truncated
    # automatically.
    fields = ['pid', 'xid', 'vsname', 'vsize', 'sz', 'rss', 'pmem', 'cmd']

    # Only these columns are numeric. vsname and cmd must stay strings
    # even when they happen to look like numbers (an all-digit command
    # would otherwise break the regexp match below).
    numeric_fields = ['pid', 'xid', 'vsize', 'sz', 'rss', 'pmem']

    # vps inserts xid after pid in the output, but ps doesn't know
    # what the field means.
    ps_fields = list(fields)
    ps_fields.remove('xid')
    ps_fields.remove('vsname')

    slices = {}

    # Eat the header line. vps depends on the header to figure out
    # which column is the PID column, so we can't just tell ps not to
    # print it.
    for line in bwlimit.run("/usr/sbin/vps -e -o " + ":16,".join(ps_fields))[1:]:
        # Chomp newline
        line = line.strip()

        # Represent process as a dict of fields
        values = line.split(None, len(fields) - 1)
        if len(values) != len(fields):
            # Ignore spurious error messages from vps
            if "ERR" not in line:
                print("slicestat: failed to parse line: " + line)
            continue
        proc = dict(zip(fields, values))

        # Convert ints and floats
        for field in numeric_fields:
            proc[field] = _to_number(proc[field])

        # vps sometimes prints ERR or the name of the slice instead of
        # a context ID if it cannot identify the context of an orphaned
        # (usually dying) process. Skip these processes, and any whose
        # sizes are not integers since they are summed below.
        if not (isinstance(proc['xid'], int) and
                isinstance(proc['vsize'], int) and
                isinstance(proc['sz'], int) and
                isinstance(proc['rss'], int)):
            # Ignore spurious error messages from vps
            if "ERR" not in line:
                print("slicestat: failed to parse line: " + line)
            continue

        # Assign (pl_)sshd processes to slice instead of root.
        # Digits are accepted on the assumption that slice names may
        # contain them; previously such names were truncated at the
        # first digit before the lookup.
        m = re.search(r"sshd: ([a-zA-Z0-9_]+)", proc['cmd'])
        if m is not None:
            xid = bwlimit.get_xid(m.group(1))
            if xid is not None:
                proc['xid'] = xid

        name = bwlimit.get_slice(proc['xid'])
        if name is None:
            # Orphaned (not associated with a slice) context
            name = "%d?" % proc['xid']

        # Monitor only the specified slices
        if names and name not in names:
            continue

        # Additional overhead calculations from slicestat

        # Include 12 KiB of process overhead =
        # 4 KiB top-level page table +
        # 4 KiB kernel structure +
        # 4 KiB basic page table
        proc['rss'] += 12

        # Include additional page table overhead
        if proc['vsize'] > 4096:
            proc['rss'] += 4 * ((proc['vsize'] - 1) // 4096)

        if proc['xid'] in slices:
            entry = slices[proc['xid']]
        else:
            entry = {'xid': proc['xid'], 'name': name, 'procs': [],
                     'vsize': 0, 'sz': 0, 'rss': 0}

        entry['procs'].append(proc)
        entry['vsize'] += proc['vsize']
        entry['sz'] += proc['sz']
        entry['rss'] += proc['rss']

        slices[proc['xid']] = entry

    return slices


def memtotal():
    """
    Returns total physical and swap memory on the system in KiB, as a
    (mem, swap) tuple read from /proc/meminfo. A value that is not found
    is reported as 0.
    """
    mem = 0
    swap = 0
    meminfo = open("/proc/meminfo", "r")
    try:
        for line in meminfo.readlines():
            # Only "Name: value kB" lines are of interest
            columns = line.split()
            if len(columns) != 3:
                continue
            (name, value, kb) = columns
            if name == "MemTotal:":
                mem = int(value)
            elif name == "SwapTotal:":
                swap = int(value)
    finally:
        meminfo.close()
    return (mem, swap)


def swap_used():
    """
    Returns swap utilization on the system as a whole percentage (0-100),
    computed from /proc/swaps. Returns 0 if /proc/swaps cannot be read
    or no swap space is configured.
    """
    total_swap = 0
    total_used = 0
    try:
        swaps = open("/proc/swaps", "r")
        try:
            # Eat header line
            lines = swaps.readlines()[1:]
        finally:
            swaps.close()
    except IOError:
        lines = []

    for line in lines:
        # /dev/mapper/planetlab-swap  partition  1048568  3740  -1
        columns = line.split()
        if len(columns) != 5:
            continue
        try:
            size = int(columns[2])
            used = int(columns[3])
        except ValueError:
            continue
        total_swap += size
        total_used += used

    # Without any swap there is nothing to exhaust; avoid dividing by zero.
    if total_swap > 0:
        swapused = 100 * total_used // total_swap
    else:
        swapused = 0
    if debug:
        print("%s percent swap used" % swapused)
    return swapused


def summary(slices = None, total_mem = None, total_swap = None):
    """
    Return a summary of memory usage by slice, as a text table sorted
    by descending potential usage (sz).

    slices defaults to the current slicestat(); total_mem and total_swap
    (KiB) default to the values reported by memtotal().
    """
    if not slices:
        slices = slicestat()
    slicelist = sorted(slices.values(), key = lambda s: s['sz'], reverse = True)
    if total_mem is None or total_swap is None:
        (total_mem, total_swap) = memtotal()

    rows = ["%-20s%10s%24s%24s\n\n" %
            ("Slice", "Processes", "Memory Usage", "Potential Usage")]
    for entry in slicelist:
        rows.append("%-20s%10d%16s (%4.1f%%)%16s (%4.1f%%)\n" %
                    (entry['name'], len(entry['procs']),
                     format_bytes(entry['rss'] * 1024, si = False),
                     100. * entry['rss'] / total_mem,
                     format_bytes(entry['sz'] * 1024, si = False),
                     100. * entry['sz'] / (total_mem + total_swap)))
    return "".join(rows)


def formtable(slice, percent):
    """
    Makes pretty message to email with human readable ps values.

    slice is one value of the dictionary returned by slicestat();
    percent is its share of physical memory. Returns the dictionary
    used to fill in the alarm_* and kill_* message templates.
    """
    # The header only names columns that the rows actually contain
    # (CPU usage is not collected by slicestat).
    rows = ["%5s %10s %10s %10s %4s %s\n\n" %
            ("PID", "VIRT", "SZ", "RES", '%MEM', 'COMMAND')]
    for proc in slice['procs']:
        rows.append("%5s %10s %10s %10s %4.1f %s\n" %
                    (proc['pid'],
                     format_bytes(proc['vsize'] * 1024, si = False),
                     format_bytes(proc['sz'] * 1024, si = False),
                     format_bytes(proc['rss'] * 1024, si = False),
                     proc['pmem'],
                     proc['cmd']))

    prettytable = {'hostname': socket.gethostname(),
                   'date': time.asctime(time.gmtime()) + " GMT",
                   'table': "".join(rows),
                   'slice': slice['name'],
                   'rss': format_bytes(slice['rss'] * 1024, si = False),
                   'sz': format_bytes(slice['sz'] * 1024, si = False),
                   'percent': percent}
    return prettytable


def readdat():
    """
    Return dictionary of vps (slicestat) from datfile left behind by OOM
    before rebooting. If there is no usable file, just grab the latest
    dict (slicestat) and return that. If the dat file is found, it means
    we rebooted: send the reboot email, then delete the file.
    """
    try:
        # NOTE: pickle must only be fed trusted data; DATAFILE is
        # expected to be a local file written by writedat().
        f = open(DATAFILE, "rb")
        try:
            if verbose:
                print("Loading %s" % DATAFILE)
            (v, slices) = pickle.load(f)
        finally:
            f.close()

        # Check version of data file
        if v != VERSION:
            print("Not using old version '%s' data file %s" % (v, DATAFILE))
            raise Exception

        # summary() looks up the memory totals itself; the totals read
        # in main() are not visible from here.
        params = {'hostname': socket.gethostname(),
                  'date': time.asctime(time.gmtime()) + " GMT",
                  'table': summary(slices)}
        if debug:
            print(rebooted_subject % params)
            print(rebooted_body % params)
        else:
            slicemail(None, rebooted_subject % params, rebooted_body % params)

        # Delete data file
        os.unlink(DATAFILE)
    except Exception:
        # Best effort: a missing or unusable data file is not fatal.
        if verbose and os.path.exists(DATAFILE):
            print("Not using %s: %s" % (DATAFILE, sys.exc_info()[1]))
        slices = slicestat()

    return slices


def writedat(slices):
    """
    Write (slices) to pickled datfile, tagged with VERSION.
    """
    if verbose:
        print("Saving %s" % DATAFILE)
    f = open(DATAFILE, "wb")
    try:
        pickle.dump((VERSION, slices), f)
    finally:
        f.close()


def _int_option(opt, optval):
    """
    Return optval as an int, or print usage and exit(1) if it is not one.
    """
    try:
        return int(optval)
    except ValueError:
        print("Error: option %s requires an integer, not '%s'" % (opt, optval))
        usage()
        sys.exit(1)


def _reset_hogs(slices, used, total_mem, warned, emailed):
    """
    Handle swap utilization at or above reset_thresh: walk the slices
    from largest to smallest physical memory use, warn (once) about
    system slices and kill the processes of all other slices that use
    at least rss_min.

    warned is the list of system slice names already warned about;
    emailed maps slice name to the time of the last kill email. Both
    are updated in place.
    """
    # Puts largest on top.
    slicelist = sorted(slices.values(), key = lambda s: s['rss'], reverse = True)
    for hog in slicelist:
        percent = 100. * hog['rss'] / total_mem
        if hog['rss'] < rss_min:
            continue
        print("%d%% swap consumed, slice %s is using %s (%d%%) of memory" %
              (used,
               hog['name'],
               format_bytes(hog['rss'] * 1024, si = False),
               percent))
        hog['procs'].sort(key = lambda p: p['rss'], reverse = True)
        # Make a pretty table.
        params = formtable(hog, percent)
        # Match slice name against system slice patterns
        is_system_slice = [pattern for pattern in system_slices
                           if re.match(pattern, hog['name'])]

        if is_system_slice:
            # Do not reset system slices, just warn once
            if hog['name'] not in warned:
                warned.append(hog['name'])
                print("Warning slice " + hog['name'])
                if debug:
                    print(alarm_subject % params)
                    print(alarm_body % params)
                else:
                    slicemail(hog['name'], alarm_subject % params,
                              alarm_body % params)
        else:
            # Reset slice
            if debug:
                print(kill_subject % params)
                print(kill_body % params)
            else:
                # Don't email the same slice more than once per
                # email_timeout interval.
                now = time.time()
                last_mail = emailed.get(hog['name'])
                if last_mail is None or now - last_mail >= email_timeout:
                    slicemail(hog['name'], kill_subject % params,
                              kill_body % params)
                    emailed[hog['name']] = now
            # NOTE(review): every non-system slice at or above rss_min is
            # killed, not only the largest one -- confirm this is intended.
            print("Killing procs in %s" % hog['name'])
            killsliverprocs(hog['xid'])


def main():
    """
    Parse the command line, then run the monitoring loop.
    """
    # Defaults
    global debug, verbose, DATAFILE, VERSION
    global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
    # All slices
    names = []
    last_used = None
    used = None
    warned = []
    emailed = {}

    try:
        longopts = ["debug", "verbose", "file=", "slice=", "status", "memstatus", "help"]
        longopts += ["period=", "change-thresh=", "reset-thresh=",
                     "reboot-thresh=", "min-thresh=", "system-slice="]
        # -p takes an argument (SECONDS), hence "p:"
        (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:p:h", longopts)
    except getopt.GetoptError:
        # sys.exc_info() rather than binding the exception in the except
        # clause, so the syntax is accepted by old and new interpreters.
        print("Error: " + sys.exc_info()[1].msg)
        usage()
        sys.exit(1)

    for (opt, optval) in opts:
        if opt in ("-d", "--debug"):
            debug = True
        elif opt in ("-v", "--verbose"):
            verbose += 1
        elif opt in ("-f", "--file"):
            DATAFILE = optval
        elif opt in ("-s", "--slice"):
            names.append(optval)
        elif opt in ("-p", "--period"):
            period = _int_option(opt, optval)
        elif opt == "--change-thresh":
            change_thresh = _int_option(opt, optval)
        elif opt == "--reset-thresh":
            reset_thresh = _int_option(opt, optval)
        elif opt == "--reboot-thresh":
            reboot_thresh = _int_option(opt, optval)
        elif opt == "--min-thresh":
            rss_min = _int_option(opt, optval)
        elif opt == "--system-slice":
            system_slices.append(optval)
        elif opt == "--status":
            print(summary(slicestat(names)))
            sys.exit(0)
        elif opt == "--memstatus":
            (mem, swap) = memtotal()
            swap_pct = swap_used()
            print("memory total: %s" % mem)
            print("swap total: %s" % swap)
            print("swap used: %s" % swap_pct)
            sys.exit(0)
        else:
            usage()
            sys.exit(0)

    # Start the countdown only now, so that it honours --period
    timer = period

    # Check if we are already running
    writepid("swapmon")

    if not debug:
        daemonize()
        # Rewrite PID file
        writepid("swapmon")
        # Redirect stdout and stderr to syslog
        syslog.openlog("swapmon")
        sys.stdout = sys.stderr = Logger()

    # Get total memory
    (total_mem, total_swap) = memtotal()
    slices = readdat()

    # Query process table every `period` seconds, or when a large change
    # in swap utilization is detected.
    try:
        while True:
            used = swap_used()
            if last_used is None:
                last_used = used

            if used >= reboot_thresh:
                # Dump slice state before rebooting
                writedat(slices)
                # Goodbye, cruel world
                print("%d%% swap consumed, rebooting" % used)
                if not debug:
                    bwlimit.run("/bin/sync; /sbin/reboot -f")
            elif used >= reset_thresh:
                # Try and find a hog
                _reset_hogs(slices, used, total_mem, warned, emailed)

            # Wait period before recalculating swap. If in danger, recalc.
            if timer <= 0 or used >= (last_used + change_thresh):
                if used >= (last_used + change_thresh):
                    print("%d%% swap consumed, %d%% in last %d seconds" %
                          (used, used - last_used, period - timer))
                # Get slice state
                slices = slicestat(names)
                # Reset timer
                timer = period
                # Keep track of large changes in swap utilization
                last_used = used
            timer -= 1
            time.sleep(1)
    finally:
        # The loop only ends through an exception or interpreter exit;
        # make sure the PID file does not outlive the daemon.
        removepid("swapmon")


if __name__ == '__main__':
    main()