+# Message sent after a slice has been killed.
+#
+# Both templates are filled in with the % operator from a mapping.
+# kill_subject needs the keys "slice" and "hostname"; kill_body additionally
+# needs "date", "rss", "percent" (a number, rendered with %4.1f), "sz" and
+# "table". The trailing .lstrip() removes the newline that follows the
+# opening triple quote, so the body starts directly with "Sometime".
+kill_subject = "pl_mom killed slice %(slice)s on %(hostname)s"
+kill_body = \
+"""
+Sometime before %(date)s, swap space was
+nearly exhausted on %(hostname)s.
+
+Slice %(slice)s was killed since it was the largest consumer of
+physical memory at %(rss)s (%(percent)4.1f%%) (%(sz)s writable)
+after repeated restarts.
+
+Please reply to this message explaining the nature of your experiment,
+and what you are doing to address the problem.
+
+%(slice)s processes prior to reset:
+
+%(table)s
+
+%(date)s %(hostname)s reset %(slice)s
+""".lstrip()
+
+
+
+class Reset:
+ """
+ Keeps track of state information for resets and kills
+
+ resettimeleft - timeout before checking for next reset
+ resetcount - number of strikes
+ killtimeleft - time out before removing from kill queue
+ {kill,reset}mail - Time of last email
+ kill - State of kill. If slice is already being killed, wait before retry.
+ """
+
+ def __init__(self,name):
+ self.name = name
+ self.resettimeleft = reset_timeout
+ self.resetcount = 0
+ self.resetmail = 0
+ self.killtimeleft = kill_timeout
+ self.killmail = 0
+
+ def __repr__(self):
+ return self.name
+
+ def update(self):
+ # Count down for next check of reset slice.
+ if self.resettimeleft > 0:
+ self.resettimeleft -= 1
+ if debug and verbose: print "%s has %s seconds in probation" \
+ %(self.name, self.killtimeleft)
+ if self.killtimeleft > 0:
+ # Count down kill probation timer (killtimeleft)
+ self.killtimeleft -= 1
+ if self.killtimeleft == 1:
+ print "%s is out of probation" % self.name
+ else:
+ # Once out of probation period (killtimeleft), remove strikes
+ self.resetcount = 0
+
+
+ # Check to see if a slice needs to be killed. If it has been killed more
+ # than kill_thresh in the probation period (kill_timeout) send an email, kill the slice.
+ def checkkill(self,params):
+ if self.killtimeleft > 0 and self.resetcount >= kill_thresh:
+ if debug:
+ print kill_subject % params
+ print kill_body % params
+ try:
+ pid = os.fork()
+ if pid == 0:
+ print "Slice %s is being killed." % self.name
+ vserver = VServer(self.name)
+ vserver.stop()
+ # ignore initscripts. Don't run anything at start.
+ vserver.INITSCRIPTS = []
+ vserver.start()
+ os._exit(0)
+ else:
+ os.waitpid(pid,0)
+ except Exception, err:
+ print "Warning: Exception received while killing slice %s: %s" \
+ % (self.name, err)
+ if (time.time() - self.killmail) > email_timeout:
+ slicemail(self.name, kill_subject % params, kill_body % params)
+ print "Sending KILL email for slice %s" % self.name
+ self.killmail = time.time()
+ return True
+ return False
+
+ # Reset slice after checking to see if slice is out of timeout.
+ # Increment resetcount, check to see if larger than kill_thresh.
+ def reset(self, params):
+ # If its the first reset (came back after kill)
+ # or if its been reset before
+ # and we are out of the reset timeout.
+ if self.resetcount == 0 or self.resettimeleft == 0:
+ # Do we need to kill this slice? Check history first.
+ if self.checkkill(params): return
+ # Update counters
+ self.resetcount += 1
+ self.killtimeleft = kill_timeout
+ self.resettimeleft = reset_timeout
+ print "%s has %s seconds to die and has been reset %s times" \
+ %(self.name, self.resettimeleft, self.resetcount)
+ if debug:
+ print reset_subject % params
+ print reset_body % params
+ try:
+ pid = os.fork()
+ if pid == 0:
+ print "Resetting slice " + self.name
+ vserver = VServer(self.name)
+ vserver.stop()
+ vserver.start()
+ os._exit(0)
+ else:
+ os.waitpid(pid,0)
+ except Exception, err:
+ print "Warning: Exception received while resetting slice %s:" \
+ % self.name, err
+ if (time.time() - self.resetmail) > email_timeout:
+ slicemail(self.name, reset_subject % params, reset_body % params)
+ print "Sending Reset email for slice %s" % self.name
+ self.resetmail = time.time()
+