From c5fded1c7bc934ed22658472c44905b2d2ef7f35 Mon Sep 17 00:00:00 2001 From: Planet-Lab Support Date: Tue, 9 May 2006 03:24:00 +0000 Subject: [PATCH 1/1] This commit was manufactured by cvs2svn to create tag 'planetlab-3_3-rc1'. --- bwmon.py | 341 ------------------------------------ leak.c | 61 ------- pl_mom.cron | 11 -- pl_mom.py | 213 ----------------------- swapmon.init | 72 -------- swapmon.py | 484 --------------------------------------------------- 6 files changed, 1182 deletions(-) delete mode 100755 bwmon.py delete mode 100644 leak.c delete mode 100644 pl_mom.cron delete mode 100644 pl_mom.py delete mode 100755 swapmon.init delete mode 100755 swapmon.py diff --git a/bwmon.py b/bwmon.py deleted file mode 100755 index b6f5697..0000000 --- a/bwmon.py +++ /dev/null @@ -1,341 +0,0 @@ -#!/usr/bin/python -# -# Average bandwidth monitoring script. Run periodically via cron(8) to -# enforce a soft limit on daily bandwidth usage for each slice. If a -# slice is found to have exceeded its daily bandwidth usage when the -# script is run, its instantaneous rate will be capped at the desired -# average rate. Thus, in the worst case, a slice will only be able to -# send a little more than twice its average daily limit. -# -# Two separate limits are enforced, one for destinations exempt from -# the node bandwidth cap, and the other for all other destinations. -# -# Mark Huang -# Andy Bavier -# Copyright (C) 2004-2006 The Trustees of Princeton University -# -# $Id: bwmon.py,v 1.2 2006/04/28 20:25:19 mlhuang Exp $ -# - -import syslog -import os -import sys -import getopt -import time -import pickle - -import socket -import xmlrpclib -import bwlimit - -from sets import Set - -# Utility functions -from pl_mom import * - -# Constants -seconds_per_day = 24 * 60 * 60 -bits_per_byte = 8 - -# Defaults -debug = False -verbose = 0 -datafile = "/var/lib/misc/bwmon.dat" -nm = None - -default_maxrate = bwlimit.get_bwcap() - -default_maxexemptrate = bwlimit.bwmax - -# 500 Kbit or 5.4 GB per day -default_avgrate = 500000 - -# 1.5 Mbit or 16.4 GB per day -default_avgexemptrate = 1500000 - -# Average over 1 day -period = 1 * seconds_per_day - -# Message template -template = \ -""" -The slice %(slice)s has transmitted more than %(bytes)s from -%(hostname)s to %(class)s destinations -since %(since)s. - -Its maximum %(class)s burst rate will be capped at %(avgrate)s -until %(until)s. - -Please reduce the average %(class)s transmission rate -of the slice to %(avgrate)s, or %(limit)s per %(period)s. - -""".lstrip() - -footer = \ -""" -%(date)s %(hostname)s bwcap %(slice)s -""".lstrip() - -class Slice: - """ - Stores the last recorded bandwidth parameters of a slice. - - xid - slice context/VServer ID - name - slice name - time - beginning of recording period in UNIX seconds - bytes - low bandwidth bytes transmitted at the beginning of the recording period - exemptbytes - high bandwidth bytes transmitted at the beginning of the recording period - avgrate - average low bandwidth rate to enforce over the recording period - avgexemptrate - average high bandwidth rate to enforce over the recording period - """ - - def __init__(self, xid, name, maxrate, maxexemptrate, bytes, exemptbytes): - self.xid = xid - self.name = name - self.reset(maxrate, maxexemptrate, bytes, exemptbytes) - - def __repr__(self): - return self.name - - def reset(self, maxrate, maxexemptrate, bytes, exemptbytes): - """ - Begin a new recording period. Remove caps by restoring limits - to their default values. - """ - - # Reset baseline time - self.time = time.time() - - # Reset baseline byte coutns - self.bytes = bytes - self.exemptbytes = exemptbytes - - # Query Node Manager for max rate overrides - (new_maxrate, new_maxexemptrate) = nm.query(self.name, ['nm_net_max_rate', 'nm_net_max_exempt_rate']) - if new_maxrate is not None: - new_maxrate *= 1000 - else: - new_maxrate = default_maxrate - if new_maxexemptrate is not None: - new_maxexemptrate *= 1000 - else: - new_maxexemptrate = default_maxexemptrate - - if new_maxrate != maxrate or new_maxexemptrate != maxexemptrate: - print "%s reset to %s/%s" % \ - (self.name, - bwlimit.format_tc_rate(new_maxrate), - bwlimit.format_tc_rate(new_maxexemptrate)) - bwlimit.set(xid = self.xid, maxrate = new_maxrate, maxexemptrate = new_maxexemptrate) - - def update(self, maxrate, maxexemptrate, bytes, exemptbytes): - """ - Update byte counts and check if average rates have been - exceeded. In the worst case (instantaneous usage of the entire - average daily byte limit at the beginning of the recording - period), the slice will be immediately capped and will get to - send twice the average daily byte limit. In the common case, - it will get to send slightly more than the average daily byte - limit. - """ - - # Query Node Manager for max average rate overrides - (self.avgrate, self.avgexemptrate) = nm.query(self.name, ['nm_net_avg_rate', 'nm_net_avg_exempt_rate']) - if self.avgrate is None: - self.avgrate = default_avgrate - if self.avgexemptrate is None: - self.avgexemptrate = default_avgexemptrate - - # Prepare message parameters from the template - message = "" - params = {'slice': self.name, 'hostname': socket.gethostname(), - 'since': time.asctime(time.gmtime(self.time)) + " GMT", - 'until': time.asctime(time.gmtime(self.time + period)) + " GMT", - 'date': time.asctime(time.gmtime()) + " GMT", - 'period': format_period(period)} - - bytelimit = self.avgrate * period / bits_per_byte - if bytes >= (self.bytes + bytelimit) and \ - maxrate > self.avgrate: - new_maxrate = self.avgrate - else: - new_maxrate = maxrate - - # Format template parameters for low bandwidth message - params['class'] = "low bandwidth" - params['bytes'] = format_bytes(bytes - self.bytes) - params['maxrate'] = bwlimit.format_tc_rate(maxrate) - params['limit'] = format_bytes(bytelimit) - params['avgrate'] = bwlimit.format_tc_rate(self.avgrate) - - if verbose: - print "%(slice)s %(class)s " \ - "%(bytes)s/%(limit)s (%(maxrate)s/%(avgrate)s)" % \ - params - - # Cap low bandwidth burst rate - if new_maxrate != maxrate: - message += template % params - print "%(slice)s %(class)s capped at %(avgrate)s (%(bytes)s/%(limit)s)" % params - - exemptbytelimit = self.avgexemptrate * period / bits_per_byte - if exemptbytes >= (self.exemptbytes + exemptbytelimit) and \ - maxexemptrate > self.avgexemptrate: - new_maxexemptrate = self.avgexemptrate - else: - new_maxexemptrate = maxexemptrate - - # Format template parameters for high bandwidth message - params['class'] = "high bandwidth" - params['bytes'] = format_bytes(exemptbytes - self.exemptbytes) - params['maxrate'] = bwlimit.format_tc_rate(maxexemptrate) - params['limit'] = format_bytes(exemptbytelimit) - params['avgrate'] = bwlimit.format_tc_rate(self.avgexemptrate) - - if verbose: - print "%(slice)s %(class)s " \ - "%(bytes)s/%(limit)s (%(maxrate)s/%(avgrate)s)" % \ - params - - # Cap high bandwidth burst rate - if new_maxexemptrate != maxexemptrate: - message += template % params - print "%(slice)s %(class)s capped at %(avgrate)s (%(bytes)s/%(limit)s)" % params - - # Apply parameters - if new_maxrate != maxrate or new_maxexemptrate != maxexemptrate: - bwlimit.set(xid = self.xid, maxrate = new_maxrate, maxexemptrate = new_maxexemptrate) - - # Notify slice - if message: - subject = "pl_mom capped bandwidth of slice %(slice)s on %(hostname)s" % params - if debug: - print subject - print message + (footer % params) - else: - slicemail(self.name, subject, message + (footer % params)) - -def usage(): - print """ -Usage: %s [OPTIONS]... - -Options: - -d, --debug Enable debugging (default: %s) - -v, --verbose Increase verbosity level (default: %d) - -f, --file=FILE Data file (default: %s) - -s, --slice=SLICE Constrain monitoring to these slices (default: all) - -p, --period=SECONDS Interval in seconds over which to enforce average byte limits (default: %s) - -h, --help This message -""".lstrip() % (sys.argv[0], debug, verbose, datafile, format_period(period)) - -def main(): - # Defaults - global debug, verbose, datafile, period, nm - # All slices - names = [] - - try: - longopts = ["debug", "verbose", "file=", "slice=", "period=", "help"] - (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:p:h", longopts) - except getopt.GetoptError, err: - print "Error: " + err.msg - usage() - sys.exit(1) - - for (opt, optval) in opts: - if opt == "-d" or opt == "--debug": - debug = True - elif opt == "-v" or opt == "--verbose": - verbose += 1 - bwlimit.verbose = verbose - 1 - elif opt == "-f" or opt == "--file": - datafile = optval - elif opt == "-s" or opt == "--slice": - names.append(optval) - elif opt == "-p" or opt == "--period": - period = int(optval) - else: - usage() - sys.exit(0) - - # Check if we are already running - writepid("bwmon") - - if not debug: - # Redirect stdout and stderr to syslog - syslog.openlog("bwmon") - sys.stdout = sys.stderr = Logger() - - try: - f = open(datafile, "r+") - if verbose: - print "Loading %s" % datafile - (version, slices) = pickle.load(f) - f.close() - # Check version of data file - if version != "$Id: bwmon.py,v 1.2 2006/04/28 20:25:19 mlhuang Exp $": - print "Not using old version '%s' data file %s" % (version, datafile) - raise Exception - except Exception: - version = "$Id: bwmon.py,v 1.2 2006/04/28 20:25:19 mlhuang Exp $" - slices = {} - - # Get special slice IDs - root_xid = bwlimit.get_xid("root") - default_xid = bwlimit.get_xid("default") - - # Open connection to Node Manager - nm = NM() - - live = [] - for params in bwlimit.get(): - (xid, share, - minrate, maxrate, - minexemptrate, maxexemptrate, - bytes, exemptbytes) = params - live.append(xid) - - # Ignore root and default buckets - if xid == root_xid or xid == default_xid: - continue - - name = bwlimit.get_slice(xid) - if name is None: - # Orphaned (not associated with a slice) class - name = "%d?" % xid - - # Monitor only the specified slices - if names and name not in names: - continue - - if slices.has_key(xid): - slice = slices[xid] - if time.time() >= (slice.time + period) or \ - bytes < slice.bytes or exemptbytes < slice.exemptbytes: - # Reset to defaults every 24 hours or if it appears - # that the byte counters have overflowed (or, more - # likely, the node was restarted or the HTB buckets - # were re-initialized). - slice.reset(maxrate, maxexemptrate, bytes, exemptbytes) - else: - # Update byte counts - slice.update(maxrate, maxexemptrate, bytes, exemptbytes) - else: - # New slice, initialize state - slice = slices[xid] = Slice(xid, name, maxrate, maxexemptrate, bytes, exemptbytes) - - # Delete dead slices - dead = Set(slices.keys()) - Set(live) - for xid in dead: - del slices[xid] - - if verbose: - print "Saving %s" % datafile - f = open(datafile, "w") - pickle.dump((version, slices), f) - f.close() - - removepid("bwmon") - -if __name__ == '__main__': - main() diff --git a/leak.c b/leak.c deleted file mode 100644 index 271d41b..0000000 --- a/leak.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Stupid program that leaks memory at a configurable rate, to test swapmon - * - * Mark Huang - * Copyright (C) 2006 The Trustees of Princeton University - * - * $Id$ - */ - -#include -#include -#include -#include - -int -main(int argc, char *argv[]) -{ - int rate = 16; - int leaked; - - for (;;) { - int c, option_index = 0; - static struct option long_options[] = { - { "rate", required_argument, NULL, 'r' }, - { "help", no_argument, NULL, 'h' }, - { 0, 0, 0, 0 } - }; - - c = getopt_long(argc, argv, "r:h", long_options, &option_index); - if (c == -1) - break; - - switch (c) { - case 'r': - rate = atoi(optarg); - break; - case 'h': - default: - fprintf(stderr, "Usage: %s [OPTION]...\n", argv[0]); - fprintf(stderr, "\t-r, --rate=MiB/sec\tRate to leak memory in MiB/sec\n"); - return 0; - } - } - - leaked = 0; - for (;;) { - int i, bufsize = rate * 1024 * 1024; - char *buf = malloc(bufsize); - if (buf) { - /* Touch every page in the buffer */ - for (i = 0; i < bufsize; i += 4096) - buf[i] = 1; - leaked += rate; - printf("\r%d MiB", leaked); - fflush(stdout); - } - sleep(1); - } - - return 0; -} diff --git a/pl_mom.cron b/pl_mom.cron deleted file mode 100644 index eb47892..0000000 --- a/pl_mom.cron +++ /dev/null @@ -1,11 +0,0 @@ -# -# Runs once a day to "fix" nodes in various ways -# -# Mark Huang -# Copyright (C) 2005 The Trustees of Princeton University -# -# $Id: pl_mop.cron,v 1.1 2005/10/11 17:34:57 mlhuang Exp $ -# - -@M@ @H@ * * * root /usr/local/planetlab/bin/pl_mop.sh -*/15 * * * * root /usr/share/pl_mom/bwmon.py diff --git a/pl_mom.py b/pl_mom.py deleted file mode 100644 index d889e88..0000000 --- a/pl_mom.py +++ /dev/null @@ -1,213 +0,0 @@ -#!/usr/bin/python -# -# Utility functions -# -# Mark Huang -# Copyright (C) 2006 The Trustees of Princeton University -# -# $Id: pl_mom.py,v 1.2 2006/05/08 17:37:28 mlhuang Exp $ -# - -import os -import sys -import syslog -import socket -import xmlrpclib - -# /etc/planetlab/plc_config.py is a Python fragment maintained by -# PlanetLabConf that contains PLC configuration variables. -try: - sys.path.append("/etc/planetlab") - from plc_config import * -except: - print "Warning: Configuration file /etc/planetlab/plc_config.py not found" - PLC_NAME = "PlanetLab" - PLC_SLICE_PREFIX = "pl" - PLC_MAIL_SUPPORT_ADDRESS = "support@planet-lab.org" - PLC_MAIL_SLICE_ADDRESS = "SLICE@slices.planet-lab.org" - -def format_bytes(bytes, si = True): - """ - Formats bytes into a string - """ - if si: - kilo = 1000. - else: - # Officially, a kibibyte - kilo = 1024. - - if bytes >= (kilo * kilo * kilo): - return "%.1f GB" % (bytes / (kilo * kilo * kilo)) - elif bytes >= 1000000: - return "%.1f MB" % (bytes / (kilo * kilo)) - elif bytes >= 1000: - return "%.1f KB" % (bytes / kilo) - else: - return "%.0f bytes" % bytes - -def format_period(seconds): - """ - Formats a period in seconds into a string - """ - - if seconds == (24 * 60 * 60): - return "day" - elif seconds == (60 * 60): - return "hour" - elif seconds > (24 * 60 * 60): - return "%.1f days" % (seconds / 24. / 60. / 60.) - elif seconds > (60 * 60): - return "%.1f hours" % (seconds / 60. / 60.) - elif seconds > (60): - return "%.1f minutes" % (seconds / 60.) - else: - return "%.0f seconds" % seconds - -def writepid(prog): - """ - Check PID file. Exit if already running. Update PID file. - """ - - try: - pidfile = file("/var/run/%s.pid" % prog, "r") - pid = pidfile.readline().strip() - pidfile.close() - if os.path.isdir("/proc/" + pid): - print "Error: Another copy of %s is still running (%s)" % (prog, pid) - sys.exit(1) - except IOError: - pass - - pidfile = file("/var/run/%s.pid" % prog, "w") - pidfile.write(str(os.getpid())) - pidfile.close() - -def removepid(prog): - os.unlink("/var/run/%s.pid" % prog) - -def slicemail(slice, subject, body): - sendmail = os.popen("/usr/sbin/sendmail -t -f%s" % PLC_MAIL_SUPPORT_ADDRESS, "w") - - # PLC has a separate list for pl_mom messages - if PLC_MAIL_SUPPORT_ADDRESS == "support@planet-lab.org": - to = ["pl-mom@planet-lab.org"] - else: - to = [PLC_MAIL_SUPPORT_ADDRESS] - - if slice is not None and slice != "root": - to.append(PLC_MAIL_SLICE_ADDRESS.replace("SLICE", slice)) - - header = {'from': "%s Support <%s>" % (PLC_NAME, PLC_MAIL_SUPPORT_ADDRESS), - 'to': ", ".join(to), - 'version': sys.version.split(" ")[0], - 'subject': subject} - - # Write headers - sendmail.write( -""" -Content-type: text/plain -From: %(from)s -Reply-To: %(from)s -To: %(to)s -X-Mailer: Python/%(version)s -Subject: %(subject)s - -""".lstrip() % header) - - # Write body - sendmail.write(body) - - # Done - sendmail.close() - -class Logger: - """ - Simple file-like class for redirecting stdout and stderr to /var/log/messages - """ - - def write(self, text): - text = text.strip() - if text: - syslog.syslog(text) - -def daemonize(): - """ - Daemonize self. Detach from terminal, close all open files, and fork twice. - """ - - pid = os.fork() - if pid == 0: - # Detach from terminal - os.setsid() - # Orphan myself - pid = os.fork() - if pid == 0: - os.chdir("/") - else: - os._exit(0) - else: - os._exit(0) - - # Close all open file descriptors - import resource - maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1] - if (maxfd == resource.RLIM_INFINITY): - maxfd = 1024 - for fd in range(0, maxfd): - try: - os.close(fd) - except OSError: - pass - - # Redirect stdin to /dev/null - os.open("/dev/null", os.O_RDWR) - -class NM: - """ - Simple interface to local Node Manager API - """ - - def __init__(self, url = "http://localhost:812/", timeout = 10): - """ - Open a new connection to the local Node Manager - """ - - socket.setdefaulttimeout(timeout) - try: - self.nm = xmlrpclib.ServerProxy(url) - except Exception, err: - print "Warning: Exception received while opening connection to Node Manager:", err - self.nm = None - - def query(self, slice, attributes): - """ - Get values of various slice attributes from the local Node Manager - - slice - slice name - attributes - [attribute_name, (attribute_name, return_value_if_not_set), ...] - """ - - values = [None for attribute in attributes] - - if self.nm is not None: - try: - # Read rspec (the NM hash code for the slice) - rcap = open("/var/run/pl_nm/%s.vm_rcap" % slice, "r") - rspec = rcap.readline().strip() - rcap.close() - - for i, attribute in enumerate(attributes): - # NM interface allows you to pass in a tuple - # (attribute, default) instead of just an - # attribute name. default is returned if the - # attribute is not set. - if type(attribute) != tuple: - attribute = (attribute, 0) - (rc, (value,)) = self.nm.nm_inspect(rspec, [attribute]) - if rc == 0 and value != attribute[1]: - values[i] = value - except Exception, err: - print "Warning: Exception received while querying Node Manager:", err - self.nm = None - - return values diff --git a/swapmon.init b/swapmon.init deleted file mode 100755 index d99530d..0000000 --- a/swapmon.init +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash -# -# swapmon Swap monitoring daemon -# -# chkconfig: 2345 98 2 -# -# description: Resets memory hogs when swap is running low -# -# $Id: host.init,v 1.6 2006/04/20 09:01:00 thierry Exp $ -# - -PATH=/sbin:/bin:/usr/bin:/usr/sbin - -# Source function library. -. /etc/init.d/functions - -# Source configuration -if [ -f /etc/sysconfig/swapmon ] ; then - . /etc/sysconfig/swapmon -fi - -pidfile=/var/run/swapmon.pid -lockfile=/var/lock/subsys/swapmon -RETVAL=0 - -start () -{ - echo -n $"Starting swap monitor: " - daemon --check=swapmon /usr/share/pl_mom/swapmon.py $SWAPMON_OPTIONS - RETVAL=$? - echo - [ $RETVAL = 0 ] && touch ${lockfile} - return $RETVAL -} - -stop () -{ - echo -n $"Stopping swap monitor: " - killproc swapmon - RETVAL=$? - echo - [ $RETVAL = 0 ] && rm -f ${lockfile} ${pidfile} -} - - -case "$1" in - start) - start - ;; - stop) - stop - ;; - restart|reload) - stop - start - ;; - condrestart) - if [ -f ${lockfile} ] ; then - stop - start - fi - ;; - status) - status swapmon.py - ;; - *) - echo $"Usage: $0 {start|stop|restart|reload|condrestart|status}" - RETVAL=1 - ;; -esac - -exit $RETVAL diff --git a/swapmon.py b/swapmon.py deleted file mode 100755 index 3f6304e..0000000 --- a/swapmon.py +++ /dev/null @@ -1,484 +0,0 @@ -#!/usr/bin/python -# -# Swap monitoring daemon. Every 30 seconds, checks process memory -# usage. At 90% utilization, resets the slice that is consuming the -# most physical memory. At 95% utilization, reboots the machine to -# avoid a crash. -# -# Mark Huang -# Andy Bavier -# Copyright (C) 2004-2006 The Trustees of Princeton University -# -# $Id: swapmon.py,v 1.4 2006/05/02 17:23:14 mlhuang Exp $ -# - -import syslog -import os -import sys -import getopt -import re -import pickle -import socket -import time - -# util-vserver/python/vserver.py allows us to control slices directly -# from Python -from vserver import VServer - -# bwlimit exports a few useful functions like run(), get_xid(), and get_slice() -import bwlimit - -# Utility functions -from pl_mom import * - -# Defaults -debug = False -verbose = 0 -datafile = "/var/lib/misc/swapmon.dat" - -# Seconds between process analysis -period = 30 - -# Minimum change in swap utilization over 30 seconds that will trigger -# early process analysis. -change_thresh = 5 - -# Swap utilization at which the largest consumer of physical memory is reset -reset_thresh = 85 - -# Swap utilization at which the machine is rebooted -reboot_thresh = 95 - -# Minimum physical memory utilization to be considered the largest consumer -min_thresh = 10 - -# System slices that should not be reset (regexps) -system_slices = ['root', PLC_SLICE_PREFIX + '_'] - -# Message sent after a critical reboot -rebooted_subject = "pl_mom rebooted %(hostname)s" -rebooted_body = \ -""" -Sometime before %(date)s, swap space was -nearly exhausted on %(hostname)s, so pl_mom rebooted it. - -Slices active prior to reboot are listed below. Memory usage -statistics are not entirely accurate due to threading. - -%(table)s - -%(date)s %(hostname)s reboot -""".lstrip() - -# Message sent after a hog is reset -reset_subject = "pl_mom reset slice %(slice)s on %(hostname)s" -reset_body = \ -""" -Sometime before %(date)s, swap space was -nearly exhausted on %(hostname)s. - -Slice %(slice)s was reset since it was the largest consumer of -physical memory at %(rss)s (%(percent)4.1f%%). - -Please reply to this message explaining the nature of your experiment, -and what you are doing to address the problem. - -%(slice)s processes prior to reset: - -%(table)s - -%(date)s %(hostname)s reset %(slice)s -""".lstrip() - -# Message sent to system slices that should not be reset -alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s" -alarm_body = \ -""" -Sometime before %(date)s, swap space was -nearly exhausted on %(hostname)s. - -System slice %(slice)s was the largest consumer of physical memory at -%(rss)s (%(percent)4.1f%%). It was not reset, but please verify its -behavior. - -%(slice)s processes prior to alarm: - -%(table)s - -%(date)s %(hostname)s alarm %(slice)s -""".lstrip() - -def usage(): - print """ -Usage: %s [OPTIONS]... - -Options: - -d, --debug Enable debugging (default: %s) - -v, --verbose Increase verbosity level (default: %d) - -f, --file=FILE Data file (default: %s) - -s, --slice=SLICE Constrain monitoring to these slices (default: all) - -p, --period=SECONDS Seconds between normal process analysis (default: %s) - --reset-thresh=PERCENT Swap utilization at which slice reset is attempted - --reboot-thresh=PERCENT Swap utilization at which the machine is rebooted - --min-thresh=PERCENT Minimum physical memory utilization to be considered a hog - --system-slice=SLICE System slice that should not be reset - --status Print memory usage statistics and exit - -h, --help This message -""".lstrip() % (sys.argv[0], debug, verbose, datafile, format_period(period)) - -def slicestat(names = None): - """ - Get status of specified slices (if names is None or empty, all - slices). vsize and rss are in KiB. Returns - - {xid: {'xid': slice_id, - 'name': slice_name, - 'procs': [{'pid': pid, 'xid': slice_id, 'user', username, 'cmd': command, - 'vsize': virtual_kib, 'rss': physical_kib, - 'pcpu': cpu_percent, 'pmem': mem_percent}] - 'vsize': total_virtual_kib, - 'rss': total_physical_kib}} - """ - - # Mandatory fields. xid is a virtual field inserted by vps. Make - # sure cmd is last so that it does not get truncated - # automatically. - fields = ['pid', 'xid', 'user', 'vsize', 'rss', 'pcpu', 'pmem', 'cmd'] - - # vps inserts xid after pid in the output, but ps doesn't know - # what the field means. - ps_fields = list(fields) - ps_fields.remove('xid') - - slices = {} - - # Eat the header line. vps depends on the header to figure out - # which column is the PID column, so we can't just tell ps not to - # print it. - for line in bwlimit.run("/usr/sbin/vps -e -o " + ",".join(ps_fields))[1:]: - # Chomp newline - line = line.strip() - - # Replace "0 MAIN" and "1 ALL_PROC" (the special monikers that - # vps uses to denote the root context and the "all contexts" - # context) with "0" so that we can just split() on whitespace. - line = line.replace("0 MAIN", "0").replace("1 ALL_PROC", "0") - - # Represent process as a dict of fields - values = line.split(None, len(fields) - 1) - if len(values) != len(fields): - continue - proc = dict(zip(fields, values)) - - # Convert ints and floats - for field in proc: - try: - proc[field] = int(proc[field]) - except ValueError: - try: - proc[field] = float(proc[field]) - except ValueError: - pass - - # vps sometimes prints ERR instead of a context ID if it - # cannot identify the context of an orphaned (usually dying) - # process. Skip these processes. - if type(proc['xid']) != int: - continue - - # Assign (pl_)sshd processes to slice instead of root - m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd']) - if m is not None: - xid = bwlimit.get_xid(m.group(1)) - if xid is not None: - proc['xid'] = xid - - name = bwlimit.get_slice(proc['xid']) - if name is None: - # Orphaned (not associated with a slice) class - name = "%d?" % proc['xid'] - - # Monitor only the specified slices - if names and name not in names: - continue - - # Additional overhead calculations from slicestat - - # Include 12 KiB of process overhead = - # 4 KiB top-level page table + - # 4 KiB kernel structure + - # 4 KiB basic page table - proc['rss'] += 12 - - # Include additional page table overhead - if proc['vsize'] > 4096: - proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096) - - if slices.has_key(proc['xid']): - slice = slices[proc['xid']] - else: - slice = {'xid': proc['xid'], 'name': name, 'procs': [], 'vsize': 0, 'rss': 0} - - slice['procs'].append(proc) - slice['vsize'] += proc['vsize'] - slice['rss'] += proc['rss'] - - slices[proc['xid']] = slice - - return slices - -def memtotal(): - """ - Returns total physical memory on the system in KiB. - """ - - meminfo = open("/proc/meminfo", "r") - line = meminfo.readline() - meminfo.close() - if line[0:8] == "MemTotal": - # MemTotal: 255396 kB - (name, value, kb) = line.split() - return int(value) - - return 0 - -def swap_used(): - """ - Returns swap utilization on the system as a whole percentage (0-100). - """ - - total_swap = 0 - total_used = 0 - - try: - swaps = open("/proc/swaps", "r") - # Eat header line - lines = swaps.readlines()[1:] - swaps.close() - for line in lines: - # /dev/mapper/planetlab-swap partition 1048568 3740 -1 - (filename, type, size, used, priority) = line.strip().split() - try: - total_swap += int(size) - total_used += int(used) - except ValueEror, err: - pass - except (IOError, KeyError), err: - pass - - return 100 * total_used / total_swap - -def summary(names = None, total_rss = memtotal()): - """ - Return a summary of memory usage by slice. - """ - slicelist = slicestat(names).values() - slicelist.sort(lambda a, b: b['rss'] - a['rss']) - - table = "%-20s%10s%24s\n\n" % ("Slice", "Processes", "Memory Usage") - for slice in slicelist: - table += "%-20s%10d%16s (%4.1f%%)\n" % \ - (slice['name'], len(slice['procs']), - format_bytes(slice['rss'] * 1024, si = False), - 100. * slice['rss'] / total_rss) - - return table - -def main(): - # Defaults - global debug, verbose, datafile - global period, change_thresh, reset_thresh, reboot_thresh, min_thresh, system_slices - # All slices - names = [] - - try: - longopts = ["debug", "verbose", "file=", "slice=", "status", "help"] - longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="] - (opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts) - except getopt.GetoptError, err: - print "Error: " + err.msg - usage() - sys.exit(1) - - for (opt, optval) in opts: - if opt == "-d" or opt == "--debug": - debug = True - elif opt == "-v" or opt == "--verbose": - verbose += 1 - elif opt == "-f" or opt == "--file": - datafile = optval - elif opt == "-s" or opt == "--slice": - names.append(optval) - elif opt == "-p" or opt == "--period": - period = int(optval) - elif opt == "--change-thresh": - change_thresh = int(optval) - elif opt == "--reset-thresh": - reset_thresh = int(optval) - elif opt == "--reboot-thresh": - reboot_thresh = int(optval) - elif opt == "--min-thresh": - min_thresh = int(optval) - elif opt == "--system-slice": - system_slices.append(optval) - elif opt == "--status": - print summary(names) - sys.exit(0) - else: - usage() - sys.exit(0) - - # Check if we are already running - writepid("swapmon") - - if not debug: - daemonize() - # Rewrite PID file - writepid("swapmon") - # Redirect stdout and stderr to syslog - syslog.openlog("swapmon") - sys.stdout = sys.stderr = Logger() - - # Get total physical memory - total_rss = memtotal() - - try: - f = open(datafile, "r+") - if verbose: - print "Loading %s" % datafile - (version, slices) = pickle.load(f) - f.close() - # Check version of data file - if version != "$Id: swapmon.py,v 1.4 2006/05/02 17:23:14 mlhuang Exp $": - print "Not using old version '%s' data file %s" % (version, datafile) - raise Exception - - params = {'hostname': socket.gethostname(), - 'date': time.asctime(time.gmtime()) + " GMT", - 'table': summary(total_rss)} - - if debug: - print rebooted_subject % params - print rebooted_body % params - else: - slicemail(None, rebooted_subject % params, rebooted_body % params) - - # Delete data file - os.unlink(datafile) - except Exception: - version = "$Id: swapmon.py,v 1.4 2006/05/02 17:23:14 mlhuang Exp $" - slices = {} - - # Query process table every 30 seconds, or when a large change in - # swap utilization is detected. - timer = period - last_used = None - used = None - - # System slices that we have warned but could not reset - warned = [] - - while True: - used = swap_used() - if last_used is None: - last_used = used - if verbose: - print "%d%% swap consumed" % used - - if used >= reboot_thresh: - # Dump slice state before rebooting - if verbose: - print "Saving %s" % datafile - f = open(datafile, "w") - pickle.dump((version, slices), f) - f.close() - - # Goodbye, cruel world - print "%d%% swap consumed, rebooting" % used - if not debug: - bwlimit.run("/bin/sync; /sbin/reboot -f") - - elif used >= reset_thresh: - # Try and find a hog - slicelist = slices.values() - slicelist.sort(lambda a, b: b['rss'] - a['rss']) - for slice in slicelist: - percent = 100. * slice['rss'] / total_rss - if percent < min_thresh: - continue - - print "%d%% swap consumed, slice %s is using %s (%d%%) of memory" % \ - (used, - slice['name'], - format_bytes(slice['rss'] * 1024, si = False), - percent) - - slice['procs'].sort(lambda a, b: b['rss'] - a['rss']) - - table = "%5s %10s %10s %4s %4s %s\n\n" % ("PID", "VIRT", "RES", '%CPU', '%MEM', 'COMMAND') - for proc in slice['procs']: - table += "%5s %10s %10s %4.1f %4.1f %s\n" % \ - (proc['pid'], - format_bytes(proc['vsize'] * 1024, si = False), - format_bytes(proc['rss'] * 1024, si = False), - proc['pcpu'], proc['pmem'], proc['cmd']) - - params = {'hostname': socket.gethostname(), - 'date': time.asctime(time.gmtime()) + " GMT", - 'table': table, - 'slice': slice['name'], - 'rss': format_bytes(slice['rss'] * 1024, si = False), - 'percent': percent} - - # Match slice name against system slice patterns - is_system_slice = filter(None, [re.match(pattern, slice['name']) for pattern in system_slices]) - - if is_system_slice: - # Do not reset system slices, just warn once - if slice['name'] not in warned: - warned.append(slice['name']) - if debug: - print alarm_subject % params - print alarm_body % params - else: - print "Warning slice " + slice['name'] - slicemail(slice['name'], alarm_subject % params, alarm_body % params) - else: - # Otherwise, reset - if debug: - print reset_subject % params - print reset_body % params - else: - try: - pid = os.fork() - if pid == 0: - print "Resetting slice " + slice['name'] - vserver = VServer(slice['name']) - vserver.stop() - vserver.start(wait = False) - os._exit(0) - else: - os.waitpid(pid, 0) - except Exception, err: - print "Warning: Exception received while resetting slice %s:" % slice['name'], err - slicemail(slice['name'], reset_subject % params, reset_body % params) - break - - elif timer <= 0 or used >= (last_used + change_thresh): - if used >= (last_used + change_thresh): - print "%d%% swap consumed, %d%% in last %d seconds" % \ - (used, used - last_used, period - timer) - # Get slice state - slices = slicestat(names) - # Reset timer - timer = period - # Keep track of large changes in swap utilization - last_used = used - - timer -= 1 - time.sleep(1) - - removepid("swapmon") - -if __name__ == '__main__': - main() -- 2.43.0