-#!/usr/bin/python
-#
-# Swap monitoring daemon. Every 30 seconds, checks process memory
-# usage. At 90% utilization, resets the slice that is consuming the
-# most physical memory. At 95% utilization, reboots the machine to
-# avoid a crash.
-#
-# Mark Huang <mlhuang@cs.princeton.edu>
-# Andy Bavier <acb@cs.princeton.edu>
-# Copyright (C) 2004-2006 The Trustees of Princeton University
-#
-# $Id: swapmon.py,v 1.4 2006/05/02 17:23:14 mlhuang Exp $
-#
-
-import syslog
-import os
-import sys
-import getopt
-import re
-import pickle
-import socket
-import time
-
-# util-vserver/python/vserver.py allows us to control slices directly
-# from Python
-from vserver import VServer
-
-# bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
-import bwlimit
-
-# Utility functions
-from pl_mom import *
-
-# Defaults
-debug = False
-verbose = 0
-datafile = "/var/lib/misc/swapmon.dat"
-
-# Seconds between process analysis
-period = 30
-
-# Minimum change in swap utilization over 30 seconds that will trigger
-# early process analysis.
-change_thresh = 5
-
-# Swap utilization at which the largest consumer of physical memory is reset
-reset_thresh = 85
-
-# Swap utilization at which the machine is rebooted
-reboot_thresh = 95
-
-# Minimum physical memory utilization to be considered the largest consumer
-min_thresh = 10
-
-# System slices that should not be reset (regexps)
-system_slices = ['root', PLC_SLICE_PREFIX + '_']
-
-# Message sent after a critical reboot
-rebooted_subject = "pl_mom rebooted %(hostname)s"
-rebooted_body = \
-"""
-Sometime before %(date)s, swap space was
-nearly exhausted on %(hostname)s, so pl_mom rebooted it.
-
-Slices active prior to reboot are listed below. Memory usage
-statistics are not entirely accurate due to threading.
-
-%(table)s
-
-%(date)s %(hostname)s reboot
-""".lstrip()
-
-# Message sent after a hog is reset
-reset_subject = "pl_mom reset slice %(slice)s on %(hostname)s"
-reset_body = \
-"""
-Sometime before %(date)s, swap space was
-nearly exhausted on %(hostname)s.
-
-Slice %(slice)s was reset since it was the largest consumer of
-physical memory at %(rss)s (%(percent)4.1f%%).
-
-Please reply to this message explaining the nature of your experiment,
-and what you are doing to address the problem.
-
-%(slice)s processes prior to reset:
-
-%(table)s
-
-%(date)s %(hostname)s reset %(slice)s
-""".lstrip()
-
-# Message sent to system slices that should not be reset
-alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
-alarm_body = \
-"""
-Sometime before %(date)s, swap space was
-nearly exhausted on %(hostname)s.
-
-System slice %(slice)s was the largest consumer of physical memory at
-%(rss)s (%(percent)4.1f%%). It was not reset, but please verify its
-behavior.
-
-%(slice)s processes prior to alarm:
-
-%(table)s
-
-%(date)s %(hostname)s alarm %(slice)s
-""".lstrip()
-
-def usage():
- print """
-Usage: %s [OPTIONS]...
-
-Options:
- -d, --debug Enable debugging (default: %s)
- -v, --verbose Increase verbosity level (default: %d)
- -f, --file=FILE Data file (default: %s)
- -s, --slice=SLICE Constrain monitoring to these slices (default: all)
- -p, --period=SECONDS Seconds between normal process analysis (default: %s)
- --reset-thresh=PERCENT Swap utilization at which slice reset is attempted
- --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted
- --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog
- --system-slice=SLICE System slice that should not be reset
- --status Print memory usage statistics and exit
- -h, --help This message
-""".lstrip() % (sys.argv[0], debug, verbose, datafile, format_period(period))
-
-def slicestat(names = None):
- """
- Get status of specified slices (if names is None or empty, all
- slices). vsize and rss are in KiB. Returns
-
- {xid: {'xid': slice_id,
- 'name': slice_name,
- 'procs': [{'pid': pid, 'xid': slice_id, 'user', username, 'cmd': command,
- 'vsize': virtual_kib, 'rss': physical_kib,
- 'pcpu': cpu_percent, 'pmem': mem_percent}]
- 'vsize': total_virtual_kib,
- 'rss': total_physical_kib}}
- """
-
- # Mandatory fields. xid is a virtual field inserted by vps. Make
- # sure cmd is last so that it does not get truncated
- # automatically.
- fields = ['pid', 'xid', 'user', 'vsize', 'rss', 'pcpu', 'pmem', 'cmd']
-
- # vps inserts xid after pid in the output, but ps doesn't know
- # what the field means.
- ps_fields = list(fields)
- ps_fields.remove('xid')
-
- slices = {}
-
- # Eat the header line. vps depends on the header to figure out
- # which column is the PID column, so we can't just tell ps not to
- # print it.
- for line in bwlimit.run("/usr/sbin/vps -e -o " + ",".join(ps_fields))[1:]:
- # Chomp newline
- line = line.strip()
-
- # Replace "0 MAIN" and "1 ALL_PROC" (the special monikers that
- # vps uses to denote the root context and the "all contexts"
- # context) with "0" so that we can just split() on whitespace.
- line = line.replace("0 MAIN", "0").replace("1 ALL_PROC", "0")
-
- # Represent process as a dict of fields
- values = line.split(None, len(fields) - 1)
- if len(values) != len(fields):
- continue
- proc = dict(zip(fields, values))
-
- # Convert ints and floats
- for field in proc:
- try:
- proc[field] = int(proc[field])
- except ValueError:
- try:
- proc[field] = float(proc[field])
- except ValueError:
- pass
-
- # vps sometimes prints ERR instead of a context ID if it
- # cannot identify the context of an orphaned (usually dying)
- # process. Skip these processes.
- if type(proc['xid']) != int:
- continue
-
- # Assign (pl_)sshd processes to slice instead of root
- m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd'])
- if m is not None:
- xid = bwlimit.get_xid(m.group(1))
- if xid is not None:
- proc['xid'] = xid
-
- name = bwlimit.get_slice(proc['xid'])
- if name is None:
- # Orphaned (not associated with a slice) class
- name = "%d?" % proc['xid']
-
- # Monitor only the specified slices
- if names and name not in names:
- continue
-
- # Additional overhead calculations from slicestat
-
- # Include 12 KiB of process overhead =
- # 4 KiB top-level page table +
- # 4 KiB kernel structure +
- # 4 KiB basic page table
- proc['rss'] += 12
-
- # Include additional page table overhead
- if proc['vsize'] > 4096:
- proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)
-
- if slices.has_key(proc['xid']):
- slice = slices[proc['xid']]
- else:
- slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'rss': 0}
-
- slice['procs'].append(proc)
- slice['vsize'] += proc['vsize']
- slice['rss'] += proc['rss']
-
- slices[proc['xid']] = slice
-
- return slices
-
-def memtotal():
- """
- Returns total physical memory on the system in KiB.
- """
-
- meminfo = open("/proc/meminfo", "r")
- line = meminfo.readline()
- meminfo.close()
- if line[0:8] == "MemTotal":
- # MemTotal: 255396 kB
- (name, value, kb) = line.split()
- return int(value)
-
- return 0
-
-def swap_used():
- """
- Returns swap utilization on the system as a whole percentage (0-100).
- """
-
- total_swap = 0
- total_used = 0
-
- try:
- swaps = open("/proc/swaps", "r")
- # Eat header line
- lines = swaps.readlines()[1:]
- swaps.close()
- for line in lines:
- # /dev/mapper/planetlab-swap partition 1048568 3740 -1
- (filename, type, size, used, priority) = line.strip().split()
- try:
- total_swap += int(size)
- total_used += int(used)
- except ValueEror, err:
- pass
- except (IOError, KeyError), err:
- pass
-
- return 100 * total_used / total_swap
-
-def summary(names = None, total_rss = memtotal()):
- """
- Return a summary of memory usage by slice.
- """
- slicelist = slicestat(names).values()
- slicelist.sort(lambda a, b: b['rss'] - a['rss'])
-
- table = "%-20s%10s%24s\n\n" % ("Slice", "Processes", "Memory Usage")
- for slice in slicelist:
- table += "%-20s%10d%16s (%4.1f%%)\n" % \
- (slice['name'], len(slice['procs']),
- format_bytes(slice['rss'] * 1024, si = False),
- 100. * slice['rss'] / total_rss)
-
- return table
-
-def main():
- # Defaults
- global debug, verbose, datafile
- global period, change_thresh, reset_thresh, reboot_thresh, min_thresh, system_slices
- # All slices
- names = []
-
- try:
- longopts = ["debug", "verbose", "file=", "slice=", "status", "help"]
- longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="]
- (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts)
- except getopt.GetoptError, err:
- print "Error: " + err.msg
- usage()
- sys.exit(1)
-
- for (opt, optval) in opts:
- if opt == "-d" or opt == "--debug":
- debug = True
- elif opt == "-v" or opt == "--verbose":
- verbose += 1
- elif opt == "-f" or opt == "--file":
- datafile = optval
- elif opt == "-s" or opt == "--slice":
- names.append(optval)
- elif opt == "-p" or opt == "--period":
- period = int(optval)
- elif opt == "--change-thresh":
- change_thresh = int(optval)
- elif opt == "--reset-thresh":
- reset_thresh = int(optval)
- elif opt == "--reboot-thresh":
- reboot_thresh = int(optval)
- elif opt == "--min-thresh":
- min_thresh = int(optval)
- elif opt == "--system-slice":
- system_slices.append(optval)
- elif opt == "--status":
- print summary(names)
- sys.exit(0)
- else:
- usage()
- sys.exit(0)
-
- # Check if we are already running
- writepid("swapmon")
-
- if not debug:
- daemonize()
- # Rewrite PID file
- writepid("swapmon")
- # Redirect stdout and stderr to syslog
- syslog.openlog("swapmon")
- sys.stdout = sys.stderr = Logger()
-
- # Get total physical memory
- total_rss = memtotal()
-
- try:
- f = open(datafile, "r+")
- if verbose:
- print "Loading %s" % datafile
- (version, slices) = pickle.load(f)
- f.close()
- # Check version of data file
- if version != "$Id: swapmon.py,v 1.4 2006/05/02 17:23:14 mlhuang Exp $":
- print "Not using old version '%s' data file %s" % (version, datafile)
- raise Exception
-
- params = {'hostname': socket.gethostname(),
- 'date': time.asctime(time.gmtime()) + " GMT",
- 'table': summary(total_rss)}
-
- if debug:
- print rebooted_subject % params
- print rebooted_body % params
- else:
- slicemail(None, rebooted_subject % params, rebooted_body % params)
-
- # Delete data file
- os.unlink(datafile)
- except Exception:
- version = "$Id: swapmon.py,v 1.4 2006/05/02 17:23:14 mlhuang Exp $"
- slices = {}
-
- # Query process table every 30 seconds, or when a large change in
- # swap utilization is detected.
- timer = period
- last_used = None
- used = None
-
- # System slices that we have warned but could not reset
- warned = []
-
- while True:
- used = swap_used()
- if last_used is None:
- last_used = used
- if verbose:
- print "%d%% swap consumed" % used
-
- if used >= reboot_thresh:
- # Dump slice state before rebooting
- if verbose:
- print "Saving %s" % datafile
- f = open(datafile, "w")
- pickle.dump((version, slices), f)
- f.close()
-
- # Goodbye, cruel world
- print "%d%% swap consumed, rebooting" % used
- if not debug:
- bwlimit.run("/bin/sync; /sbin/reboot -f")
-
- elif used >= reset_thresh:
- # Try and find a hog
- slicelist = slices.values()
- slicelist.sort(lambda a, b: b['rss'] - a['rss'])
- for slice in slicelist:
- percent = 100. * slice['rss'] / total_rss
- if percent < min_thresh:
- continue
-
- print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \
- (used,
- slice['name'],
- format_bytes(slice['rss'] * 1024, si = False),
- percent)
-
- slice['procs'].sort(lambda a, b: b['rss'] - a['rss'])
-
- table = "%5s %10s %10s %4s %4s %s\n\n" % ("PID", "VIRT", "RES", '%CPU', '%MEM', 'COMMAND')
- for proc in slice['procs']:
- table += "%5s %10s %10s %4.1f %4.1f %s\n" % \
- (proc['pid'],
- format_bytes(proc['vsize'] * 1024, si = False),
- format_bytes(proc['rss'] * 1024, si = False),
- proc['pcpu'], proc['pmem'], proc['cmd'])
-
- params = {'hostname': socket.gethostname(),
- 'date': time.asctime(time.gmtime()) + " GMT",
- 'table': table,
- 'slice': slice['name'],
- 'rss': format_bytes(slice['rss'] * 1024, si = False),
- 'percent': percent}
-
- # Match slice name against system slice patterns
- is_system_slice = filter(None, [re.match(pattern, slice['name']) for pattern in system_slices])
-
- if is_system_slice:
- # Do not reset system slices, just warn once
- if slice['name'] not in warned:
- warned.append(slice['name'])
- if debug:
- print alarm_subject % params
- print alarm_body % params
- else:
- print "Warning slice " + slice['name']
- slicemail(slice['name'], alarm_subject % params, alarm_body % params)
- else:
- # Otherwise, reset
- if debug:
- print reset_subject % params
- print reset_body % params
- else:
- try:
- pid = os.fork()
- if pid == 0:
- print "Resetting slice " + slice['name']
- vserver = VServer(slice['name'])
- vserver.stop()
- vserver.start(wait = False)
- os._exit(0)
- else:
- os.waitpid(pid, 0)
- except Exception, err:
- print "Warning: Exception received while resetting slice %s:" % slice['name'], err
- slicemail(slice['name'], reset_subject % params, reset_body % params)
- break
-
- elif timer <= 0 or used >= (last_used + change_thresh):
- if used >= (last_used + change_thresh):
- print "%d%% swap consumed, %d%% in last %d seconds" % \
- (used, used - last_used, period - timer)
- # Get slice state
- slices = slicestat(names)
- # Reset timer
- timer = period
- # Keep track of large changes in swap utilization
- last_used = used
-
- timer -= 1
- time.sleep(1)
-
- removepid("swapmon")
-
-if __name__ == '__main__':
- main()