# Swap utilization at which the machine is rebooted
reboot_thresh = 95
-# Time to wait before checking slice again after reset
-reset_timeout = 25
-
# Don't email the same message more than once in the same emailtimeout interval
email_timeout = 1800
%(date)s %(hostname)s reboot
""".lstrip()
-# Message sent after a hog is reset
-reset_subject = "pl_mom reset slice %(slice)s on %(hostname)s"
-reset_body = \
-"""
-Sometime before %(date)s, swap space was
-nearly exhausted on %(hostname)s.
-
-Slice %(slice)s was reset since it was the largest consumer of
-physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable).
-
-Please reply to this message explaining the nature of your experiment,
-and what you are doing to address the problem.
-
-http://summer.cs.princeton.edu/status/tabulator.cgi?table=slices/table_%(slice)s
-
-%(slice)s processes prior to reset:
-
-%(table)s
-
-%(date)s %(hostname)s reset %(slice)s
-""".lstrip()
-
# Message sent to system slices that should not be reset
alarm_subject = "pl_mom alarm slice %(slice)s on %(hostname)s"
alarm_body = \
%(date)s %(hostname)s reset %(slice)s
""".lstrip()
+# Send signal 9 to the processes of the sliver identified by xid by running
+# "/usr/sbin/vkill -s 9 -c <xid> 0" through bwlimit.run.
+# NOTE(review): presumably "-c <xid>" selects the vserver context and the
+# trailing pid 0 means "every process in that context" -- confirm against
+# the vkill documentation. The result of bwlimit.run is not checked here.
+def killsliverprocs(xid):
+ bwlimit.run("/usr/sbin/vkill -s 9 -c %s 0" % xid)
-class Reset:
- """
- Keeps track of state information for resets and kills
-
- resettimeleft - timeout before checking for next reset
- resetcount - number of strikes
- killtimeleft - time out before removing from kill queue
- {kill,reset}mail - Time of last email
- kill - State of kill. If slice is already being killed, wait before retry.
- """
-
- def __init__(self,name):
- self.name = name
- self.resetmail = 0
- self.killmail = 0
-
- def __repr__(self):
- return self.name
-
- # Reset slice
- def reset(self, params):
- if self.resetcount == 0 or self.resettimeleft == 0:
- print "%s has %s seconds to die and has been reset %s times" \
- %(self.name, self.resettimeleft, self.resetcount)
- if debug:
- print reset_subject % params
- print reset_body % params
- try:
- pid = os.fork()
- if pid == 0:
- print "Resetting slice " + self.name
- vserver = VServer(self.name)
- vserver.stop()
- vserver.start()
- os._exit(0)
- else:
- os.waitpid(pid,0)
- except Exception, err:
- print "Warning: Exception received while resetting slice %s:" \
- % self.name, err
- if (time.time() - self.resetmail) > email_timeout:
- slicemail(self.name, reset_subject % params, reset_body % params)
- print "Sending Reset email for slice %s" % self.name
- self.resetmail = time.time()
-
def usage():
print """
Usage: %s [OPTIONS]...
global period, change_thresh, reset_thresh, reboot_thresh, rss_min, system_slices
# All slices
names = []
+ timer = period
+ last_used = None
+ used = None
+ warned = []
+ emailed = {}
try:
longopts = ["debug", "verbose", "file=", "slice=", "status", "help"]
# Query process table every 30 seconds, or when a large change in
# swap utilization is detected.
- timer = period
- last_used = None
- used = None
while True:
used = swap_used()
alarm_body % params)
else:
# Reset slice
- if not debug: slicemail(self.name, reset_subject % params, reset_body % params)
+ # Outside debug mode, mail the slice (throttled through the `emailed`
+ # dict, which maps slice name -> time of the last mail); the kill
+ # subject and body are printed in the else branch instead. The sliver's
+ # processes are then killed.
+ # NOTE(review): the throttle test below looks inverted. A slice with no
+ # entry gets the default (now + email_timeout + 1) and is mailed, but once
+ # emailed[name] holds a past time.time() it can never exceed
+ # (now + email_timeout) for a positive email_timeout, so the slice would
+ # never be mailed again. If the intent is "at most one mail per
+ # email_timeout seconds", the test would be
+ # (time.time() - emailed.get(name, 0)) > email_timeout -- confirm.
+ if not debug:
+ if emailed.get(slice['name'], (time.time() + email_timeout + 1)) > (time.time() + email_timeout):
+ slicemail(slice['name'], kill_subject % params, kill_body % params)
+ emailed[slice['name']] = time.time()
+ else:
+ print kill_subject % params
+ print kill_body % params
+ print "Killing procs in %s" % slice['name']
+ killsliverprocs(slice['xid'])
# wait period before recalculating swap. If in danger, recalc.
if timer <= 0 or used >= (last_used + change_thresh):