From 42dbe689a93025034f3e798174e54f3e2a0d3371 Mon Sep 17 00:00:00 2001 From: Mark Huang Date: Fri, 28 Apr 2006 19:26:59 +0000 Subject: [PATCH] - rewrite pl_mom.pl - now single-threaded, several other improvements - pick up support and slice e-mail addresses, and system slice prefix, from PLC configuration in /etc/planetlab - use vserver.py module to reset slices instead of exec()ing chcontext - use vps instead of slicestat; slicestat may not run on all nodes, in particular, does not run on private PlanetLab nodes. Calling vps is much faster than querying slicestat. - list slices active before reboot, in the reboot notification - list all slice processes running before reset, in the reset notification - previously, only the reboot threshold was checked every second. Now, both reboot and reset thresholds are checked every second. Since it is relatively expensive and can take a while to run under heavy load, vps is still run only every 30 seconds, or when a large change in swap utilization is detected. This cuts down on the time necessary to detect fast growing hogs, before critical utilization is reached. --- swapmon.py | 472 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 472 insertions(+) create mode 100755 swapmon.py diff --git a/swapmon.py b/swapmon.py new file mode 100755 index 0000000..7d579e4 --- /dev/null +++ b/swapmon.py @@ -0,0 +1,472 @@ +#!/usr/bin/python +# +# Swap monitoring daemon. Every 30 seconds, checks process memory +# usage. At 90% utilization, resets the slice that is consuming the +# most physical memory. At 95% utilization, reboots the machine to +# avoid a crash. +# +# Mark Huang +# Andy Bavier +# Copyright (C) 2004-2006 The Trustees of Princeton University +# +# $Id: BandwidthMonitor.py,v 1.1 2006/04/25 14:40:28 mlhuang Exp $ +# + +import syslog +import os +import sys +import getopt +import re +import pickle +import socket +import time + +import textwrap +wrap = textwrap.TextWrapper() + +# util-vserver/python/vserver.py allows us to control slices directly +# from Python +from vserver import VServer + +# bwlimit exports a few useful functions like run(), get_xid(), and get_slice() +import bwlimit + +# Utility functions +from pl_mom import * + +# Defaults +debug = False +verbose = 0 +datafile = "/var/lib/misc/swapmon.dat" + +# Seconds between process analysis +period = 30 + +# Minimum change in swap utilization over 30 seconds that will trigger +# early process analysis. +change_thresh = 5 + +# Swap utilization at which the largest consumer of physical memory is reset +reset_thresh = 85 + +# Swap utilization at which the machine is rebooted +reboot_thresh = 95 + +# Minimum physical memory utilization to be considered the largest consumer +min_thresh = 10 + +# System slices that should not be reset (regexps) +system_slices = ['root', PLC_SLICE_PREFIX + '_'] + +# Message sent after a critical reboot +rebooted_subject = "pl_mom rebooted %(hostname)s" +rebooted_body = \ +""" +Sometime before %(date)s, swap space was +nearly exhausted on %(hostname)s, so pl_mom rebooted it. + +Slices active prior to reboot are listed below. Memory usage +statistics are not entirely accurate due to threading. + +%(table)s + +%(date)s %(hostname)s reboot +""".lstrip() + +# Message sent after a hog is reset +reset_subject = "pl_mom reset slice %(slice)s on %(hostname)s" +reset_body = \ +""" +Sometime before %(date)s, swap space was +nearly exhausted on %(hostname)s. + +Slice %(slice)s was reset since it was the largest consumer of +physical memory at %(rss)s (%(percent)4.1f%%). + +Please reply to this message explaining the nature of your experiment, +and what you are doing to address the problem. + +%(slice)s processes prior to reset: + +%(table)s + +%(date)s %(hostname)s reset %(slice)s +""".lstrip() + +# Message sent to system slices that should not be reset +alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s" +alarm_body = \ +""" +Sometime before %(date)s, swap space was +nearly exhausted on %(hostname)s. + +System slice %(slice)s was the largest consumer of physical memory at +%(rss)s (%(percent)4.1f%%). It was not reset, but please verify its +behavior. + +%(slice)s processes prior to alarm: + +%(table)s + +%(date)s %(hostname)s alarm %(slice)s +""".lstrip() + +def usage(): + print """ +Usage: %s [OPTIONS]... + +Options: + -d, --debug Enable debugging (default: %s) + -v, --verbose Increase verbosity level (default: %d) + -f, --file=FILE Data file (default: %s) + -s, --slice=SLICE Constrain monitoring to these slices (default: all) + -p, --period=SECONDS Seconds between normal process analysis (default: %s) + --reset-thresh=PERCENT Swap utilization at which slice reset is attempted + --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted + --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog + --system-slice=SLICE System slice that should not be reset + -h, --help This message +""".lstrip() % (sys.argv[0], debug, verbose, datafile, format_period(period)) + +def slicestat(names = None): + """ + Get status of specified slices (if names is None or empty, all + slices). vsize and rss are in KiB. Returns + + {xid: {'xid': slice_id, + 'name': slice_name, + 'procs': [{'pid': pid, 'xid': slice_id, 'user', username, 'cmd': command, + 'vsize': virtual_kib, 'rss': physical_kib, + 'pcpu': cpu_percent, 'pmem': mem_percent}] + 'vsize': total_virtual_kib, + 'rss': total_physical_kib}} + """ + + # Mandatory fields. xid is a virtual field inserted by vps. Make + # sure cmd is last so that it does not get truncated + # automatically. + fields = ['pid', 'xid', 'user', 'vsize', 'rss', 'pcpu', 'pmem', 'cmd'] + + # vps inserts xid after pid in the output, but ps doesn't know + # what the field means. + ps_fields = list(fields) + ps_fields.remove('xid') + + slices = {} + + # Eat the header line. vps depends on the header to figure out + # which column is the PID column, so we can't just tell ps not to + # print it. + for line in bwlimit.run("/usr/sbin/vps -e -o " + ",".join(ps_fields))[1:]: + # Chomp newline + line = line.strip() + + # Replace "0 MAIN" and "1 ALL_PROC" (the special monikers that + # vps uses to denote the root context and the "all contexts" + # context) with "0" so that we can just split() on whitespace. + line = line.replace("0 MAIN", "0").replace("1 ALL_PROC", "0") + + # Represent process as a dict of fields + values = line.split(None, len(fields) - 1) + if len(values) != len(fields): + continue + proc = dict(zip(fields, values)) + + # Convert ints and floats + for field in proc: + try: + proc[field] = int(proc[field]) + except ValueError: + try: + proc[field] = float(proc[field]) + except ValueError: + pass + + # Assign (pl_)sshd processes to slice instead of root + m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd']) + if m is not None: + xid = bwlimit.get_xid(m.group(1)) + if xid is not None: + proc['xid'] = xid + + name = bwlimit.get_slice(proc['xid']) + if name is None: + # Orphaned (not associated with a slice) class + name = "%d?" % proc['xid'] + + # Monitor only the specified slices + if names and name not in names: + continue + + # Additional overhead calculations from slicestat + + # Include 12 KiB of process overhead = + # 4 KiB top-level page table + + # 4 KiB kernel structure + + # 4 KiB basic page table + proc['rss'] += 12 + + # Include additional page table overhead + if proc['vsize'] > 4096: + proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096) + + if slices.has_key(proc['xid']): + slice = slices[proc['xid']] + else: + slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'rss': 0} + + slice['procs'].append(proc) + slice['vsize'] += proc['vsize'] + slice['rss'] += proc['rss'] + + slices[proc['xid']] = slice + + return slices + +def memtotal(): + """ + Returns total physical memory on the system in KiB. + """ + + meminfo = open("/proc/meminfo", "r") + line = meminfo.readline() + meminfo.close() + if line[0:8] == "MemTotal": + # MemTotal: 255396 kB + (name, value, kb) = line.split() + return int(value) + + return 0 + +def swap_used(): + """ + Returns swap utilization on the system as a whole percentage (0-100). + """ + + total_swap = 0 + total_used = 0 + + try: + swaps = open("/proc/swaps", "r") + # Eat header line + lines = swaps.readlines()[1:] + swaps.close() + for line in lines: + # /dev/mapper/planetlab-swap partition 1048568 3740 -1 + (filename, type, size, used, priority) = line.strip().split() + try: + total_swap += int(size) + total_used += int(used) + except ValueEror, err: + pass + except (IOError, KeyError), err: + pass + + return 100 * total_used / total_swap + +def main(): + # Defaults + global debug, verbose, datafile + global period, change_thresh, reset_thresh, reboot_thresh, min_thresh, system_slices + # All slices + names = [] + + try: + longopts = ["debug", "verbose", "file=", "slice=", "help"] + longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="] + (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts) + except getopt.GetoptError, err: + print "Error: " + err.msg + usage() + sys.exit(1) + + for (opt, optval) in opts: + if opt == "-d" or opt == "--debug": + debug = True + elif opt == "-v" or opt == "--verbose": + verbose += 1 + elif opt == "-f" or opt == "--file": + datafile = optval + elif opt == "-s" or opt == "--slice": + names.append(optval) + elif opt == "-p" or opt == "--period": + period = int(optval) + elif opt == "--change-thresh": + change_thresh = int(optval) + elif opt == "--reset-thresh": + reset_thresh = int(optval) + elif opt == "--reboot-thresh": + reboot_thresh = int(optval) + elif opt == "--min-thresh": + min_thresh = int(optval) + elif opt == "--system-slice": + system_slices.append(optval) + else: + usage() + sys.exit(0) + + # Check if we are already running + writepid("swapmon") + + if not debug: + daemonize() + # Rewrite PID file + writepid("swapmon") + # Redirect stdout and stderr to syslog + syslog.openlog("swapmon") + sys.stdout = sys.stderr = Logger() + + # Get total physical memory + total_rss = memtotal() + + try: + f = open(datafile, "r+") + if verbose: + print "Loading %s" % datafile + (version, slices) = pickle.load(f) + f.close() + # Check version of data file + if version != "$Id: bwmon.py,v 1.1 2006/04/25 14:40:28 mlhuang Exp $": + print "Not using old version '%s' data file %s" % (version, datafile) + raise Exception + + # Send notification if we rebooted the node because of swap exhaustion + slicelist = slices.values() + slicelist.sort(lambda a, b: b['rss'] - a['rss']) + + table = "%-20s%10s%24s\n\n" % ("Slice", "Processes", "Memory Usage") + for slice in slicelist: + table += "%-20s%10d%16s (%4.1f%%)\n" % \ + (slice['name'], len(slice['procs']), + format_bytes(slice['rss'] * 1024, si = False), + 100. * slice['rss'] / total_rss) + + params = {'hostname': socket.gethostname(), + 'date': time.asctime(time.gmtime()) + " GMT", + 'table': table} + + if debug: + print rebooted_subject % params + print rebooted_body % params + else: + slicemail(None, rebooted_subject % params, rebooted_body % params) + + # Delete data file + os.unlink(datafile) + except Exception: + version = "$Id: bwmon.py,v 1.1 2006/04/25 14:40:28 mlhuang Exp $" + slices = {} + + # Query process table every 30 seconds, or when a large change in + # swap utilization is detected. + timer = period + last_used = None + used = None + + # System slices that we have warned but could not reset + warned = [] + + while True: + used = swap_used() + if last_used is None: + last_used = used + if verbose: + print "%d%% swap consumed" % used + + if used >= reboot_thresh: + # Dump slice state before rebooting + if verbose: + print "Saving %s" % datafile + f = open(datafile, "w") + pickle.dump((version, slices), f) + f.close() + + # Goodbye, cruel world + print "%d%% swap consumed, rebooting" % used + if not debug: + bwlimit.run("/bin/sync; /sbin/reboot -f") + + elif used >= reset_thresh: + # Try and find a hog + slicelist = slices.values() + slicelist.sort(lambda a, b: b['rss'] - a['rss']) + for slice in slicelist: + percent = 100. * slice['rss'] / total_rss + if percent < min_thresh: + continue + + print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \ + (used, + slice['name'], + format_bytes(slice['rss'] * 1024, si = False), + percent) + + slice['procs'].sort(lambda a, b: b['rss'] - a['rss']) + + table = "%5s %10s %10s %4s %4s %s\n\n" % ("PID", "VIRT", "RES", '%CPU', '%MEM', 'COMMAND') + for proc in slice['procs']: + table += "%5s %10s %10s %4.1f %4.1f %s\n" % \ + (proc['pid'], + format_bytes(proc['vsize'] * 1024, si = False), + format_bytes(proc['rss'] * 1024, si = False), + proc['pcpu'], proc['pmem'], proc['cmd']) + + params = {'hostname': socket.gethostname(), + 'date': time.asctime(time.gmtime()) + " GMT", + 'table': table, + 'slice': slice['name'], + 'rss': format_bytes(slice['rss'] * 1024, si = False), + 'percent': percent} + + # Match slice name against system slice patterns + is_system_slice = filter(None, [re.match(pattern, slice['name']) for pattern in system_slices]) + + if is_system_slice: + # Do not reset system slices, just warn once + if slice['name'] not in warned: + warned.append(slice['name']) + if debug: + print alarm_subject % params + print alarm_body % params + else: + print "Warning slice " + slice['name'] + slicemail(slice['name'], alarm_subject % params, alarm_body % params) + else: + # Otherwise, reset + if debug: + print reset_subject % params + print reset_body % params + else: + try: + pid = os.fork() + if pid == 0: + print "Resetting slice " + slice['name'] + vserver = VServer(slice['name']) + vserver.stop() + vserver.start(wait = False) + os._exit(0) + else: + os.waitpid(pid, 0) + except Exception, err: + print "Warning: Exception received while resetting slice %s:" % slice['name'], err + slicemail(slice['name'], reset_subject % params, reset_body % params) + break + + elif timer <= 0 or used >= (last_used + change_thresh): + if used >= (last_used + change_thresh): + print "%d%% swap consumed, %d%% in last %d seconds" % \ + (used, used - last_used, period - timer) + # Get slice state + slices = slicestat(names) + # Reset timer + timer = period + # Keep track of large changes in swap utilization + last_used = used + + timer -= 1 + time.sleep(1) + + removepid("swapmon") + +if __name__ == '__main__': + main() -- 2.43.0