# Time to wait before checking slice again after reset
reset_timeout = 25
-# Number of strikes before killing (strike, strike, kill)
-kill_thresh = 2
-
-# Time to wait before removing slice from kill queue (probation)
-kill_timeout = 120
-
# Don't email the same message more than once within the same email_timeout interval (seconds)
email_timeout = 1800
def __init__(self,name):
self.name = name
- self.resettimeleft = reset_timeout
- self.resetcount = 0
self.resetmail = 0
- self.killtimeleft = kill_timeout
self.killmail = 0
def __repr__(self):
return self.name
- def update(self):
- # Count down for next check of reset slice.
- if self.resettimeleft > 0:
- self.resettimeleft -= 1
- if debug and verbose: print "%s has %s seconds in probation" \
- %(self.name, self.killtimeleft)
- if self.killtimeleft > 0:
- # Count down kill probation timer (killtimeleft)
- self.killtimeleft -= 1
- if self.killtimeleft == 1:
- print "%s is out of probation" % self.name
- else:
- # Once out of probation period (killtimeleft), remove strikes
- self.resetcount = 0
-
-
- # Check to see if a slice needs to be killed. If it has been killed more
- # than kill_thresh in the probation period (kill_timeout) send an email, kill the slice.
- def checkkill(self,params):
- if self.killtimeleft > 0 and self.resetcount >= kill_thresh:
- if debug:
- print kill_subject % params
- print kill_body % params
- try:
- pid = os.fork()
- if pid == 0:
- print "Slice %s is being killed." % self.name
- vserver = VServer(self.name)
- vserver.stop()
- # ignore initscripts. Don't run anything at start.
- vserver.INITSCRIPTS = []
- vserver.start()
- os._exit(0)
- else:
- os.waitpid(pid,0)
- except Exception, err:
- print "Warning: Exception received while killing slice %s: %s" \
- % (self.name, err)
- if (time.time() - self.killmail) > email_timeout:
- slicemail(self.name, kill_subject % params, kill_body % params)
- print "Sending KILL email for slice %s" % self.name
- self.killmail = time.time()
- return True
- return False
-
- # Reset slice after checking to see if slice is out of timeout.
- # Increment resetcount, check to see if larger than kill_thresh.
+ # Reset slice
def reset(self, params):
- # If its the first reset (came back after kill)
- # or if its been reset before
- # and we are out of the reset timeout.
if self.resetcount == 0 or self.resettimeleft == 0:
- # Do we need to kill this slice? Check history first.
- if self.checkkill(params): return
- # Update counters
- self.resetcount += 1
- self.killtimeleft = kill_timeout
- self.resettimeleft = reset_timeout
print "%s has %s seconds to die and has been reset %s times" \
%(self.name, self.resettimeleft, self.resetcount)
if debug:
last_used = None
used = None
- # System slices that we have warned but could not reset
- warned = []
-
- # Slices that were reset
- resetlist = {}
-
while True:
used = swap_used()
if last_used is None: last_used = used
- # If we've reset you recently, update timers.
- for resetslice in resetlist.keys():
- resetlist[resetslice].update()
- # If you've been good, remove you from our list.
- if resetlist[resetslice].killtimeleft == 0 and \
- resetlist[resetslice].resettimeleft == 0:
- del resetlist[resetslice]
-
+
if used >= reboot_thresh:
# Dump slice state before rebooting
writedat(slices)
alarm_body % params)
else:
# Reset slice
- if not resetlist.has_key(slice['name']):
- resetlist[slice['name']] = Reset(slice['name'])
- resetlist[slice['name']].reset(params)
+ if not debug: slicemail(self.name, reset_subject % params, reset_body % params)
- # wait period vefore recalculating swap. If in danger, recalc.
+ # wait period before recalculating swap. If in danger, recalc.
if timer <= 0 or used >= (last_used + change_thresh):
if used >= (last_used + change_thresh):
print "%d%% swap consumed, %d%% in last %d seconds" % \