# Time to wait before checking slice again after reset
reset_timeout = 25
-# Number of strikes before killing (strike, strike, kill)
-kill_thresh = 2
-
-# Time to wait before removing slice from kill queue (probation)
-kill_timeout = 120
-
# Don't email the same message more than once in the same emailtimeout interval
email_timeout = 1800
def __init__(self,name):
self.name = name
- self.resettimeleft = reset_timeout
- self.resetcount = 0
self.resetmail = 0
- self.killtimeleft = kill_timeout
self.killmail = 0
def __repr__(self):
return self.name
- def update(self):
- # Count down for next check of reset slice.
- if self.resettimeleft > 0:
- self.resettimeleft -= 1
- if debug and verbose: print "%s has %s seconds in probation" \
- %(self.name, self.killtimeleft)
- if self.killtimeleft > 0:
- # Count down kill probation timer (killtimeleft)
- self.killtimeleft -= 1
- if self.killtimeleft == 1:
- print "%s is out of probation" % self.name
- else:
- # Once out of probation period (killtimeleft), remove strikes
- self.resetcount = 0
-
-
- # Check to see if a slice needs to be killed. If it has been killed more
- # than kill_thresh in the probation period (kill_timeout) send an email, kill the slice.
- def checkkill(self,params):
- if self.killtimeleft > 0 and self.resetcount >= kill_thresh:
- if debug:
- print kill_subject % params
- print kill_body % params
- try:
- pid = os.fork()
- if pid == 0:
- print "Slice %s is being killed." % self.name
- vserver = VServer(self.name)
- vserver.stop()
- os._exit(0)
- else:
- os.waitpid(pid,0)
- except Exception, err:
- print "Warning: Exception received while killing slice %s: %s" \
- % (self.name, err)
- if (time.time() - self.killmail) > email_timeout:
- slicemail(self.name, kill_subject % params, kill_body % params)
- print "Sending KILL email for slice %s" % self.name
- self.killmail = time.time()
- return True
- return False
-
- # Reset slice after checking to see if slice is out of timeout.
- # Increment resetcount, check to see if larger than kill_thresh.
+ # Reset slice
def reset(self, params):
- # If its the first reset (came back after kill)
- # or if its been reset before
- # and we are out of the reset timeout.
if self.resetcount == 0 or self.resettimeleft == 0:
- # Do we need to kill this slice? Check history first.
- if self.checkkill(params): return
- # Update counters
- self.resetcount += 1
- self.killtimeleft = kill_timeout
- self.resettimeleft = reset_timeout
print "%s has %s seconds to die and has been reset %s times" \
%(self.name, self.resettimeleft, self.resetcount)
if debug:
print "Resetting slice " + self.name
vserver = VServer(self.name)
vserver.stop()
- vserver.start(wait = False)
+ vserver.start()
os._exit(0)
else:
os.waitpid(pid,0)
"""
Get status of specified slices (if names is None or empty, all
slices). vsize, sz, and rss are in KiB. Returns
-
+ PID CONTEXT VSZ SZ RSS %MEM CMD
{xid: {'xid': slice_id,
'name': slice_name,
- 'procs': [{'pid': pid, 'xid': slice_id, 'user', username, 'cmd': command,
+ 'procs': [{'pid': pid, 'xid': slice_id, 'cmd': command,
'vsize': virtual_kib, 'sz': potential_kib, 'rss': physical_kib,
'pcpu': cpu_percent, 'pmem': mem_percent}]
'vsize': total_virtual_kib,
# Mandatory fields. xid is a virtual field inserted by vps. Make
# sure cmd is last so that it does not get truncated
# automatically.
- fields = ['pid', 'xid', 'user', 'vsize', 'sz', 'rss', 'pcpu', 'pmem', 'cmd']
+ fields = ['pid', 'xid', 'vsize', 'sz', 'rss', 'pmem', 'cmd']
# vps inserts xid after pid in the output, but ps doesn't know
# what the field means.
# Eat the header line. vps depends on the header to figure out
# which column is the PID column, so we can't just tell ps not to
# print it.
- for line in bwlimit.run("/usr/sbin/vps -e -o " + ",".join(ps_fields))[1:]:
+ for line in bwlimit.run("/usr/sbin/vps -e -o " + ":16,".join(ps_fields))[1:]:
# Chomp newline
line = line.strip()
except ValueError:
pass
- # vps sometimes prints ERR instead of a context ID if it
+ # vps sometimes prints ERR or the name of the slice
+ # instead of a context ID if it
# cannot identify the context of an orphaned (usually dying)
# process. Skip these processes.
- if type(proc['xid']) != int:
+ if (type(proc['xid']) != int) or (type(proc['vsize']) !=int):
continue
# Assign (pl_)sshd processes to slice instead of root
m = re.search(r"sshd: ([a-zA-Z_]+)", proc['cmd'])
+
if m is not None:
xid = bwlimit.get_xid(m.group(1))
if xid is not None:
proc['rss'] += 12
# Include additional page table overhead
- if proc['vsize'] > 4096:
- proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)
+ try:
+ if proc['vsize'] > 4096:
+ proc['rss'] += 4 * ((proc['vsize'] - 1) / 4096)
+ except: pass
if slices.has_key(proc['xid']):
slice = slices[proc['xid']]
slice['rss'] += proc['rss']
slices[proc['xid']] = slice
-
+
return slices
def memtotal():
table = "%5s %10s %10s %10s %4s %4s %s\n\n" % \
("PID", "VIRT", "SZ", "RES", '%CPU', '%MEM', 'COMMAND')
for proc in slice['procs']:
- table += "%5s %10s %10s %10s %4.1f %4.1f %s\n" % \
+ table += "%5s %10s %10s %10s %4.1f %s\n" % \
(proc['pid'],
format_bytes(proc['vsize'] * 1024, si = False),
format_bytes(proc['sz'] * 1024, si = False),
format_bytes(proc['rss'] * 1024, si = False),
- proc['pcpu'],
proc['pmem'],
proc['cmd'])
last_used = None
used = None
- # System slices that we have warned but could not reset
- warned = []
-
- # Slices that were reset
- resetlist = {}
-
while True:
used = swap_used()
if last_used is None: last_used = used
- # If we've reset you recently, update timers.
- for resetslice in resetlist.keys():
- resetlist[resetslice].update()
- # If you've been good, remove you from our list.
- if resetlist[resetslice].killtimeleft == 0 and \
- resetlist[resetslice].resettimeleft == 0:
- del resetlist[resetslice]
-
- if verbose: print "%d%% swap consumed" % used
-
+
if used >= reboot_thresh:
# Dump slice state before rebooting
writedat(slices)
alarm_body % params)
else:
# Reset slice
- if not resetlist.has_key(slice['name']):
- resetlist[slice['name']] = Reset(slice['name'])
- resetlist[slice['name']].reset(params)
+ if not debug: slicemail(self.name, reset_subject % params, reset_body % params)
- # wait period vefore recalculating swap. If in danger, recalc.
+ # wait period before recalculating swap. If in danger, recalc.
if timer <= 0 or used >= (last_used + change_thresh):
if used >= (last_used + change_thresh):
print "%d%% swap consumed, %d%% in last %d seconds" % \