import socket
import time
-# util-vserver/python/vserver.py allows us to control slices directly
-# from Python
-from vserver import VServer
-
# bwlimit exports a few useful functions like run(), get_xid(), and get_slice()
import bwlimit
# Swap utilization at which the machine is rebooted
reboot_thresh = 95
-# Time to wait before checking slice again after reset
-reset_timeout = 25
-
# Don't email the same message more than once in the same email_timeout interval
email_timeout = 1800
%(date)s %(hostname)s reboot
""".lstrip()
-# Message sent after a hog is reset
-reset_subject = "pl_mom reset slice %(slice)s on %(hostname)s"
-reset_body = \
-"""
-Sometime before %(date)s, swap space was
-nearly exhausted on %(hostname)s.
-
-Slice %(slice)s was reset since it was the largest consumer of
-physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable).
-
-Please reply to this message explaining the nature of your experiment,
-and what you are doing to address the problem.
-
-http://summer.cs.princeton.edu/status/tabulator.cgi?table=slices/table_%(slice)s
-
-%(slice)s processes prior to reset:
-
-%(table)s
-
-%(date)s %(hostname)s reset %(slice)s
-""".lstrip()
-
# Message sent to system slices that should not be reset
alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
alarm_body = \
%(date)s %(hostname)s reset %(slice)s
""".lstrip()
+def killsliverprocs(xid):  # Kill the processes running in the context whose ID is xid.
+ bwlimit.run("/usr/sbin/vkill -s 9 -c %s 0" % xid)  # vkill: signal 9, context xid, pid 0 (presumably "every process in the context" -- confirm against vkill docs)
-class Reset:
- """
- Keeps track of state information for resets and kills
-
- resettimeleft - timeout before checking for next reset
- resetcount - number of strikes
- killtimeleft - time out before removing from kill queue
- {kill,reset}mail - Time of last email
- kill - State of kill. If slice is already being killed, wait before retry.
- """
-
- def __init__(self,name):  # name: slice name; also what __repr__ returns
- self.name = name
- self.resetmail = 0  # time.time() of the last reset email; 0 means none sent yet
- self.killmail = 0  # NOTE(review): presumably the kill-email counterpart; not read in the code shown
-
- def __repr__(self):
- return self.name
-
- # Reset the slice (stop, then start its vserver) and mail the slice about it
- def reset(self, params):  # params: mapping used to fill reset_subject / reset_body
- if self.resetcount == 0 or self.resettimeleft == 0:  # NOTE(review): resetcount/resettimeleft are not assigned in the __init__ shown here
- print "%s has %s seconds to die and has been reset %s times" \
- %(self.name, self.resettimeleft, self.resetcount)
- if debug:
- print reset_subject % params
- print reset_body % params
- try:
- pid = os.fork()  # perform the stop/start in a child process
- if pid == 0:
- print "Resetting slice " + self.name
- vserver = VServer(self.name)
- vserver.stop()
- vserver.start()
- os._exit(0)  # child ends here and never returns into the caller's code
- else:
- os.waitpid(pid,0)  # parent blocks until the child has exited
- except Exception, err:
- print "Warning: Exception received while resetting slice %s:" \
- % self.name, err
- if (time.time() - self.resetmail) > email_timeout:  # at most one reset email per email_timeout seconds
- slicemail(self.name, reset_subject % params, reset_body % params)
- print "Sending Reset email for slice %s" % self.name
- self.resetmail = time.time()
-
def usage():
print """
Usage: %s [OPTIONS]...
--min-thresh=PERCENT Minimum physical memory utilization to be considered a hog
--system-slice=SLICE System slice that should not be reset
--status Print memory usage statistics and exit
+ --memstatus Print total memory, total swap, and swap used
-h, --help This message
""".lstrip() % (sys.argv[0], debug, verbose, DATAFILE, format_period(period))
# Mandatory fields. xid is a virtual field inserted by vps. Make
# sure cmd is last so that it does not get truncated
# automatically.
- fields = ['pid', 'xid', 'vsize', 'sz', 'rss', 'pmem', 'cmd']
+ fields = ['pid', 'xid', 'vsname', 'vsize', 'sz', 'rss', 'pmem', 'cmd']
# vps inserts xid and vsname after pid in the output, but ps doesn't
# know what those fields mean.
ps_fields = list(fields)
ps_fields.remove('xid')
+ ps_fields.remove('vsname')
slices = {}
# Chomp newline
line = line.strip()
- # Replace "0 MAIN" and "1 ALL_PROC" (the special monikers that
- # vps uses to denote the root context and the "all contexts"
- # context) with "0" so that we can just split() on whitespace.
- line = line.replace("0 MAIN", "0").replace("1 ALL_PROC", "0")
-
# Represent process as a dict of fields
values = line.split(None, len(fields) - 1)
if len(values) != len(fields):
+ print "slicestat: failed to parse line: " + line
continue
proc = dict(zip(fields, values))
except ValueError:
pass
- # vps sometimes prints ERR or the name of the slice
+ # vps sometimes prints ERR or the name of the slice
# instead of a context ID if it
# cannot identify the context of an orphaned (usually dying)
# process. Skip these processes.
if (type(proc['xid']) != int) or (type(proc['vsize']) !=int):
+ print "slicestat: failed to parse line: " + line
continue
# Assign (pl_)sshd processes to slice instead of root
slice['rss'] += proc['rss']
slices[proc['xid']] = slice
-
+
return slices
def memtotal():
global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
# All slices
names = []
+ timer = period
+ last_used = None
+ used = None
+ warned = []
+ emailed = {}
try:
- longopts = ["debug", "verbose", "file=", "slice=", "status", "help"]
+ longopts = ["debug", "verbose", "file=", "slice=", "status", "memstatus", "help"]
longopts += ["period=", "reset-thresh=", "reboot-thresh=", "min-thresh=", "system-slice="]
(opts, argv) = getopt.getopt(sys.argv[1:], "dvf:s:ph", longopts)
except getopt.GetoptError, err:
elif opt == "--status":
print summary(slicestat(names))
sys.exit(0)
+ elif opt == "--memstatus":
+ (mem, swap) = memtotal()
+ swap_pct = swap_used()
+ print "memory total:", mem
+ print "swap total:", swap
+ print "swap used:", swap_pct
+ sys.exit(0)
else:
usage()
sys.exit(0)
# Query process table every 30 seconds, or when a large change in
# swap utilization is detected.
- timer = period
- last_used = None
- used = None
while True:
used = swap_used()
if last_used is None: last_used = used
-
if used >= reboot_thresh:
# Dump slice state before rebooting
alarm_body % params)
else:
# Notify the slice (or just print the message in debug mode), then kill its processes
- if not debug: slicemail(self.name, reset_subject % params, reset_body % params)
+ if not debug:
+ if emailed.get(slice['name'], (time.time() + email_timeout + 1)) > (time.time() + email_timeout):
+ slicemail(slice['name'], kill_subject % params, kill_body % params)
+ emailed[slice['name']] = time.time()
+ else:
+ print kill_subject % params
+ print kill_body % params
+ print "Killing procs in %s" % slice['name']
+ killsliverprocs(slice['xid'])
# wait period before recalculating swap. If in danger, recalc.
if timer <= 0 or used >= (last_used + change_thresh):