From: Planet-Lab Support <support@planet-lab.org>
Date: Mon, 1 May 2006 14:16:35 +0000 (+0000)
Subject: This commit was manufactured by cvs2svn to create branch
X-Git-Tag: planetlab-3_3-branch-point~1
X-Git-Url: http://git.onelab.eu/?p=util-vserver.git;a=commitdiff_plain;h=55b0a09abca499f96094b45be79692ae540a5d3c

This commit was manufactured by cvs2svn to create branch
'planetlab-3_3-branch'.
---

diff --git a/.cvsignore b/.cvsignore
new file mode 100644
index 0000000..93b0d06
--- /dev/null
+++ b/.cvsignore
@@ -0,0 +1,14 @@
+.X_usr_local_etc-up-to-date
+.deps
+.fixups
+.pathconfig.h.pathsubst.stamp
+FEATURES.txt
+Makefile
+autom4te.cache
+config.cache
+config.h
+config.log
+config.status
+libtool
+pathconfig.h
+stamp-h1
diff --git a/contrib/.cvsignore b/contrib/.cvsignore
new file mode 100644
index 0000000..55b480f
--- /dev/null
+++ b/contrib/.cvsignore
@@ -0,0 +1,2 @@
+.manifest.dat.pathsubst.stamp
+manifest.dat
diff --git a/lib/.cvsignore b/lib/.cvsignore
new file mode 100644
index 0000000..6a9f2c5
--- /dev/null
+++ b/lib/.cvsignore
@@ -0,0 +1,6 @@
+.deps
+.dirstamp
+.libs
+*.lo
+libvserver.la
+util-vserver.pc
diff --git a/lib/cflags-v13.c b/lib/cflags-v13.c
index 246d1d4..0af2d04 100644
--- a/lib/cflags-v13.c
+++ b/lib/cflags-v13.c
@@ -42,6 +42,7 @@ static struct Mapping_uint64 const VALUES[] = {
   DECL("sched_hard",    VC_VXF_SCHED_HARD),
   DECL("sched_prio",    VC_VXF_SCHED_PRIO),
   DECL("sched_pause",   VC_VXF_SCHED_PAUSE),
+  DECL("sched_share",   VC_VXF_SCHED_SHARE),
 
   DECL("virt_mem",      VC_VXF_VIRT_MEM),
   DECL("virt_uptime",   VC_VXF_VIRT_UPTIME),
diff --git a/lib/planetlab.c b/lib/planetlab.c
index 034c249..4d85fb9 100644
--- a/lib/planetlab.c
+++ b/lib/planetlab.c
@@ -33,6 +33,7 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include <errno.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <unistd.h>
 #include <sys/resource.h>
 
@@ -43,18 +44,20 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "vserver.h"
 
 static int
-create_context(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec)
+create_context(xid_t ctx, uint32_t flags, uint64_t bcaps)
 {
   struct vc_ctx_caps  vc_caps;
-  struct vc_ctx_flags  vc_flags;
-  struct vc_set_sched  vc_sched;
-  struct vc_rlimit  vc_rlimit;
 
-  /* create context info */
+  /*
+   * Create context info - this sets the STATE_SETUP and STATE_INIT flags.
+   * Don't ever clear the STATE_INIT flag, that makes us the init task.
+   *
+   * XXX - the kernel code allows initial flags to be passed as an arg.
+   */
   if (vc_ctx_create(ctx) == VC_NOCTX)
     return -1;
 
-  /* set capabilities - these don't take effect until SETUP flags is unset */
+  /* set capabilities - these don't take effect until SETUP flag is unset */
   vc_caps.bcaps = bcaps;
   vc_caps.bmask = ~0ULL;  /* currently unused */
   vc_caps.ccaps = 0;      /* don't want any of these */
@@ -62,53 +65,44 @@ create_context(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec)
   if (vc_set_ccaps(ctx, &vc_caps))
     return -1;
 
-  /* ignore all flags except SETUP and scheduler flags */
-  vc_flags.mask = VC_VXF_STATE_SETUP | VC_VXF_SCHED_FLAGS;
-  /* don't let user change scheduler flags */
-  vc_flags.flagword = flags & ~VC_VXF_SCHED_FLAGS;  /* SETUP not set */
+  /* set default scheduling parameters */
+  pl_setsched(ctx, 1, 0);
 
-  /* set scheduler parameters */
-  vc_flags.flagword |= rspec->cpu_sched_flags;
-  vc_sched.set_mask = (VC_VXSM_FILL_RATE | VC_VXSM_INTERVAL | VC_VXSM_TOKENS |
-		       VC_VXSM_TOKENS_MIN | VC_VXSM_TOKENS_MAX);
-  vc_sched.fill_rate = rspec->cpu_share;  /* tokens accumulated per interval */
-  vc_sched.interval = 1000;  /* milliseconds */
-  vc_sched.tokens = 100;     /* initial allocation of tokens */
-  vc_sched.tokens_min = 50;  /* need this many tokens to run */
-  vc_sched.tokens_max = 100;  /* max accumulated number of tokens */
-  if (vc_set_sched(ctx, &vc_sched))
-    return -1;
-
-  /* set resource limits */
-  vc_rlimit.min = VC_LIM_KEEP;
-  vc_rlimit.soft = VC_LIM_KEEP;
-  vc_rlimit.hard = rspec->mem_limit;
-  if (vc_set_rlimit(ctx, RLIMIT_RSS, &vc_rlimit))
-    return -1;
+  return 0;
+}
 
-  /* assume min and soft unchanged by set_rlimit */
-  vc_rlimit.hard = rspec->task_limit;
-  if (vc_set_rlimit(ctx, RLIMIT_NPROC, &vc_rlimit))
-    return -1;
+int
+pl_setup_done(xid_t ctx)
+{
+  struct vc_ctx_flags  vc_flags;
 
-  /* set flags, unset SETUP flag - this allows other processes to migrate */
+  /* unset SETUP flag - this allows other processes to migrate */
+  vc_flags.mask = VC_VXF_STATE_SETUP;
+  vc_flags.flagword = 0;
   if (vc_set_cflags(ctx, &vc_flags))
     return -1;
 
   return 0;
 }
 
+#define RETRY_LIMIT  10
+
 int
-pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec)
+pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps)
 {
+  int  retry_count = 0;
+
   for (;;)
     {
       struct vc_ctx_flags  vc_flags;
 
       if (vc_get_cflags(ctx, &vc_flags))
 	{
+	  if (errno != ESRCH)
+	    return -1;
+
 	  /* context doesn't exist - create it */
-	  if (create_context(ctx, flags, bcaps, rspec))
+	  if (create_context(ctx, flags, bcaps))
 	    {
 	      if (errno == EEXIST)
 		/* another process beat us in a race */
@@ -120,13 +114,18 @@ pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec)
 	    }
 
 	  /* created context and migrated to it i.e., we're done */
-	  break;
+	  return 1;
 	}
 
       /* check the SETUP flag */
       if (vc_flags.flagword & VC_VXF_STATE_SETUP)
 	{
 	  /* context is still being setup - wait a while then retry */
+	  if (retry_count++ >= RETRY_LIMIT)
+	    {
+	      errno = EBUSY;
+	      return -1;
+	    }
 	  sleep(1);
 	  continue;
 	}
@@ -141,3 +140,46 @@ pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec)
 
   return 0;
 }
+
+/* it's okay for a syscall to fail because the context doesn't exist */
+#define VC_SYSCALL(x)				\
+do						\
+{						\
+  if (x)					\
+    return errno == ESRCH ? 0 : -1;		\
+}						\
+while (0)
+
+int
+pl_setsched(xid_t ctx, uint32_t cpu_share, uint32_t cpu_sched_flags)
+{
+  struct vc_set_sched  vc_sched;
+  struct vc_ctx_flags  vc_flags;
+  uint32_t  new_flags;
+
+  vc_sched.set_mask = (VC_VXSM_FILL_RATE | VC_VXSM_INTERVAL | VC_VXSM_TOKENS |
+		       VC_VXSM_TOKENS_MIN | VC_VXSM_TOKENS_MAX);
+  vc_sched.fill_rate = cpu_share;  /* tokens accumulated per interval */
+  vc_sched.interval = 1000;  /* milliseconds */
+  vc_sched.tokens = 100;     /* initial allocation of tokens */
+  vc_sched.tokens_min = 50;  /* need this many tokens to run */
+  vc_sched.tokens_max = 100;  /* max accumulated number of tokens */
+
+  VC_SYSCALL(vc_set_sched(ctx, &vc_sched));
+
+  /* get current flag values */
+  VC_SYSCALL(vc_get_cflags(ctx, &vc_flags));
+
+  /* guaranteed CPU corresponds to SCHED_SHARE flag being cleared */
+  new_flags = (cpu_sched_flags & VS_SCHED_CPU_GUARANTEED
+	       ? 0
+	       : VC_VXF_SCHED_SHARE);
+  if ((vc_flags.flagword & VC_VXF_SCHED_SHARE) != new_flags)
+    {
+      vc_flags.mask = VC_VXF_SCHED_FLAGS;
+      vc_flags.flagword = new_flags | VC_VXF_SCHED_HARD;
+      VC_SYSCALL(vc_set_cflags(ctx, &vc_flags));
+    }
+
+  return 0;
+}
diff --git a/lib/planetlab.h b/lib/planetlab.h
index 34a9b91..e4d6ae4 100644
--- a/lib/planetlab.h
+++ b/lib/planetlab.h
@@ -34,23 +34,18 @@ POSSIBILITY OF SUCH DAMAGE.
 #ifndef _LIB_PLANETLAB_H_
 #define _LIB_PLANETLAB_H_
 
-/*
- * context create
- */
-typedef struct {
-  uint32_t  cpu_share;
-  uint32_t  cpu_sched_flags;
-  uint64_t  mem_limit;
-  uint64_t  task_limit;
-} rspec_t;
-
-#define VC_VXF_SCHED_SHARE  0x00000800
 #define VC_VXF_SCHED_FLAGS  (VC_VXF_SCHED_HARD | VC_VXF_SCHED_SHARE)
 
 int
-pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec);
+pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps);
+
+int
+pl_setup_done(xid_t ctx);
 
 int
 pl_setsched(xid_t ctx, uint32_t cpu_share, uint32_t cpu_sched_flags);
 
+/* scheduler flags */
+#define VS_SCHED_CPU_GUARANTEED  1
+
 #endif
diff --git a/lib/vserver.h b/lib/vserver.h
index 0b1aa8c..f082f0d 100644
--- a/lib/vserver.h
+++ b/lib/vserver.h
@@ -169,6 +169,7 @@
 #define	VC_VXF_SCHED_HARD		0x00000100ull
 #define	VC_VXF_SCHED_PRIO		0x00000200ull
 #define	VC_VXF_SCHED_PAUSE		0x00000400ull
+#define	VC_VXF_SCHED_SHARE		0x00000800ull
 
 #define VC_VXF_VIRT_MEM			0x00010000ull
 #define VC_VXF_VIRT_UPTIME		0x00020000ull
diff --git a/python/Makefile b/python/Makefile
index 7ae69d7..5007d34 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -1,53 +1,36 @@
 #
 # GNUMakefile for util-vserver Python bindings
 #
-# It's too hard to integrate distutils into the autoconf/libtool
-# framework, so run this Makefile separately from and after the normal
-# util-vserver build.
-#
 # Steve Muir <smuir@cs.princeton.edu>
 # Mark Huang <mlhuang@cs.princeton.edu>
 # Copyright (C) 2005 The Trustees of Princeton University
 #
-# $Id: Makefile,v 1.7 2005/08/26 04:00:44 mlhuang Exp $
+# $Id: Makefile,v 1.11 2006/03/01 22:03:38 mlhuang Exp $
 #
 
-ALL := vserverimpl.so vduimpl.so util_vserver_vars.py
+INCLUDES := -I.. -I../lib
+LIBS = -L../lib -lvserver
+
+PY_MODS := vserver.py cpulimit.py bwlimit.py
+PY_EXT_MODS := vduimpl.so vserverimpl.so
+
+LT_LINK = ../libtool --tag=CC --mode=link
+
+
 
-# need command substitution
-SHELL := /bin/bash
+all: py-build
 
-pythonlibdir := @libdir@/python@PYTHON_VERSION@/site-packages
+# XXX - compatibility with util-vserver specfile
+INSTALL_ROOT ?= $(DESTDIR)
 
-all: $(ALL)
+install: py-install
 
-%.o: %.c
-        # builds object and incompletely linked library
-	python setup.py build_ext
-        # copy to current directory
-	cp -a build/temp.*/*.o .
+clean: py-clean
 
-vserverimpl.so vduimpl.so: %.so: %.o
-        # relink the object against libvserver with libtool
-	../libtool --tag=CC --mode=link $(CC) -shared -o $@ $< ../lib/libvserver.la
 
-util_vserver_vars.py: ../scripts/util-vserver-vars
-        # python does not export variables beginning with underscore
-	(. $< ; \
-	while read var ; do eval echo $$var=\$${$$var} ; done < \
-	<(sed -ne "s/\([^=]*\)=.*/\1/p" $<) \
-	| sed -e "s/^_*//" -e "s/\([^=]*\)=\(.*\)/\1='\2'/") > $@
 
-install: $(ALL)
-        # install relinked libraries and byte-compiled scripts
-	python setup.py install --root="$(DESTDIR)"
-        # reinstall libraries with libtool so that the final path
-        # to libvserver is resolved
-	for so in $(filter %.so, $(ALL)) ; do \
-	  ../libtool --tag=CC --mode=install install "$$so" `find "$(DESTDIR)" -name "$$so"` ; \
-	done
+UTIL_PYTHON ?= $(wildcard ../../util-python*)
 
-clean:
-	rm -rf $(ALL) *.o build
+include $(UTIL_PYTHON)/pybuild.mk
 
 .PHONY: all install clean
diff --git a/python/bwlimit b/python/bwlimit
new file mode 100755
index 0000000..0630377
--- /dev/null
+++ b/python/bwlimit
@@ -0,0 +1,6 @@
+#!/usr/bin/python
+
+import bwlimit
+
+if __name__ == '__main__':
+    bwlimit.main()
diff --git a/python/bwlimit.py b/python/bwlimit.py
index b408caa..fbe825f 100644
--- a/python/bwlimit.py
+++ b/python/bwlimit.py
@@ -1,158 +1,588 @@
-#!/bin/env python2 -u
-
-# Based on code written by: Andy Bavier, acb@cs.princeton.edu
+#!/usr/bin/python
 # 
-# Bandwidth limit script to run on PlanetLab nodes.  The intent is to use
-# the Hierarchical Token Bucket queueing discipline of 'tc' to (1) cap 
-# the output bandwidth of the node at a specified rate (e.g., 5Mbps) and 
-# (2) to allow all vservers to fairly share this rate.  For instance,
-# if there are N vservers, then each should get at least 5/N Mbps of 
-# bandwidth.
+# Bandwidth limit module for PlanetLab nodes. The intent is to use the
+# Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow
+# slices to fairly share access to available node bandwidth. We
+# currently define three classes of "available node bandwidth":
+#
+# 1. Available hardware bandwidth (bwmax): The maximum rate of the
+# hardware.
+#
+# 2. Available capped bandwidth (bwcap): The maximum rate allowed to
+# non-exempt destinations. By default, equal to bwmax, but may be
+# lowered by PIs.
+#
+# 3. Available uncapped ("exempt") bandwidth: The difference between
+# bwmax and what is currently being used of bwcap, or the maximum rate
+# allowed to destinations exempt from caps (e.g., Internet2).
+#
+# All three classes of bandwidth are fairly shared according to the
+# notion of "shares". For instance, if the node is capped at 5 Mbps,
+# there are N slices, and each slice has 1 share, then each slice
+# should get at least 5/N Mbps of bandwidth. How HTB is implemented
+# makes this statement a little too simplistic. What it really means
+# is that during any single time period, only a certain number of
+# bytes can be sent onto the wire. Each slice is guaranteed that at
+# least some small number of its bytes will be sent. Whatever is left
+# over from the budget is split in proportion to the number of shares
+# each slice has.
+#
+# Even if the node is not capped at a particular limit (bwcap ==
+# bwmax), this module enforces fair share access to bwmax. Also, if
+# the node is capped at a particular limit, rules may optionally be
+# defined that classify certain packets into the "exempt" class. This
+# class receives whatever bandwidth is leftover between bwcap and
+# bwmax; slices fairly share this bandwidth as well.
+#
+# The root context is exempt from sharing and can send as much as it
+# needs to.
 #
 # Some relevant URLs:
-#   http://lartc.org/howto               for how to use tc
-#   http://luxik.cdi.cz/~devik/qos/htb/  for info on htb
+#
+# 1. http://lartc.org/howto               for how to use tc
+# 2. http://luxik.cdi.cz/~devik/qos/htb/  for info on HTB
+#
+# Andy Bavier <acb@cs.princeton.edu>
+# Mark Huang <mlhuang@cs.princeton.edu>
+# Copyright (C) 2006 The Trustees of Princeton University
+#
+# $Id: bwlimit.py,v 1.10 2006/03/14 22:57:50 smuir Exp $
+#
 
-import sys, os, re, string
+import sys, os, re, getopt
+from sets import Set
+import pwd
 
-# Global variables
-TC="/sbin/tc"                 # Where the modified tc program lives
-OPS = ["change","add"]  # Sequence of TC ops we'll try
 
-# Support to run system commands
-import runcmd
-def run(cmd):
-    try:
-        runcmd.run(cmd)
-        ret = True
-    except runcmd.Error, ex:
-        ret = False
+# Where the tc binary lives
+TC = "/sbin/tc"
 
-    return ret
+# Default interface
+dev = "eth0"
+
+# Verbosity level
+verbose = 0
+
+# bwmin should be small enough that it can be considered negligibly
+# slow compared to the hardware. 8 bits/second appears to be the
+# smallest value supported by tc.
+bwmin = 8
+
+# bwmax should be large enough that it can be considered at least as
+# fast as the hardware.
+bwmax = 1000*1000*1000
+
+# quantum is the maximum number of bytes that can be borrowed by a
+# share (or slice, if each slice gets 1 share) in one time period
+# (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth
+# above their guarantees, and each is attempting to borrow up to the
+# node bandwidth cap, quantums control how the excess bandwidth is
+# distributed. Slices with 2 shares will borrow twice the amount in
+# one time period as slices with 1 share, so averaged over time, they
+# will get twice as much of the excess bandwidth. The value should be
+# as small as possible and at least 1 MTU. By default, it would be
+# calculated as bwmin/10, but since we use such a small value for
+# bwmin, it's better to just set it to a value safely above 1 Ethernet
+# MTU.
+quantum = 1600
+
+# cburst is the maximum number of bytes that can be burst onto the
+# wire in one time period (with HZ=1000, 1 ms). If multiple slices
+# have data queued for transmission, cbursts control how long each
+# slice can have the wire for. If not specified, it is set to the
+# smallest possible value that would enable the slice's "ceil" rate
+# (usually the node bandwidth cap) to be reached if a slice was able
+# to borrow enough bandwidth to do so. For now, it's unclear how or if
+# to relate this to the notion of shares, so just let tc set the
+# default.
+cburst = None
+
+# There is another parameter that controls how bandwidth is allocated
+# between slices on nodes that is outside the scope of HTB. We enforce
+# a 16 GByte/day total limit on each slice, which works out to about
+# 1.5mbit. If a slice exceeds this byte limit before the day finishes,
+# it is capped at (i.e., its "ceil" rate is set to) the smaller of the
+# node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
+# rule and executes this script to override "ceil".
+
+# We support multiple bandwidth limits, by reserving the top nibble of
+# the minor classid to be the "subclassid". Theoretically, we could
+# support up to 15 subclasses, but for now, we only define two: the
+# "default" subclass 1:10 that is capped at the node bandwidth cap (in
+# the example below, 5mbit) and the "exempt" subclass 1:20 that is capped
+# at bwmax (i.e., not capped). The 1:1 parent class exists only to
+# make the borrowing model work. All bandwidth above minimum
+# guarantees is fairly shared (in this example, slice 2 is guaranteed
+# at least 1mbit in addition to fair access to the rest), subject to
+# the restrictions of the class hierarchy: namely, that the total
+# bandwidth to non-exempt destinations should not exceed the node
+# bandwidth cap.
+#
+#                         1:
+#                         |
+#                    1:1 (1gbit)
+#           ______________|_____________
+#          |                            |
+#   1:10 (8bit, 5mbit)           1:20 (8bit, 1gbit)
+#          |                            |
+# 1:1000 (8bit, 5mbit),        1:2000 (8bit, 1gbit),
+# 1:1001 (8bit, 5mbit),        1:2001 (8bit, 1gbit),
+# 1:1002 (1mbit, 5mbit),       1:2002 (1mbit, 1gbit),
+# ...                          ...
+# 1:1FFF (8bit, 5mbit)         1:2FFF (8bit, 1gbit)
+#
+default_minor = 0x1000
+exempt_minor = 0x2000
+
+# root_xid is for the root context. The root context is exempt from
+# fair sharing in both the default and exempt subclasses. The root
+# context gets 5 shares by default.
+root_xid = 0x0000
+root_share = 5
+
+# default_xid is for unclassifiable packets. Packets should not be
+# classified here very often. They can be if a slice's HTB classes are
+# deleted before its processes are. Each slice gets 1 share by
+# default.
+default_xid = 0x0FFF
+default_share = 1
+
+# See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
+# warned that older versions of tc interpret "kbps", "mbps", "mbit",
+# and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and
+# "kibit" and that if an older version is installed, all rates will
+# be off by a small fraction.
+suffixes = {
+    "":         1,
+    "bit":	1,
+    "kibit":	1024,
+    "kbit":	1000,
+    "mibit":	1024*1024,
+    "mbit":	1000000,
+    "gibit":	1024*1024*1024,
+    "gbit":	1000000000,
+    "tibit":	1024*1024*1024*1024,
+    "tbit":	1000000000000,
+    "bps":	8,
+    "kibps":	8*1024,
+    "kbps":	8000,
+    "mibps":	8*1024*1024,
+    "mbps":	8000000,
+    "gibps":	8*1024*1024*1024,
+    "gbps":	8000000000,
+    "tibps":	8*1024*1024*1024*1024,
+    "tbps":	8000000000000
+}
 
-def get_defaults(cap_file="/etc/planetlab/bwcap", default_cap="10mbit"):
-    # The maximum output bandwidth, read in from cap_file (if it
-    # exists). If cap_file does not exist, use default_cap for
-    # bandwidth cap.  See also the 'cburst' parameter below.
-    cap=default_cap
+
+# Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second
+def get_tc_rate(s):
+    if type(s) == int:
+        return s
+    m = re.match(r"([0-9.]+)(\D*)", s)
+    if m is None:
+        return -1
+    suffix = m.group(2).lower()
+    if suffixes.has_key(suffix):
+        return int(float(m.group(1)) * suffixes[suffix])
+    else:
+        return -1
+
+
+# Formats a rate in bits/second as a tc rate string
+def format_tc_rate(rate):
+    if rate >= 1000000:
+        return "%.0fmbit" % (rate / 1000000.)
+    elif rate >= 1000:
+        return "%.0fkbit" % (rate / 1000.)
+    else:
+        return "%.0fbit" % rate
+
+
+# Parse /etc/planetlab/bwcap (or equivalent)
+def read_bwcap(bwcap_file):
+    bwcap = bwmax
     try:
-        os.stat(cap_file)
-        fp = open(cap_file)
-        lines = fp.readlines()
-        fp.close()
+        fp = open(bwcap_file, "r")
+        line = fp.readline().strip()
+        if line:
+            bwcap = get_tc_rate(line)
+    except:
+        pass
+    if bwcap == -1:
+        bwcap = bwmax
+    return bwcap
+
+
+# Get current (live) value of bwcap
+def get_bwcap(dev = dev):
+
+    state = tc("-d class show dev %s" % dev)
+    base_re = re.compile(r"class htb 1:10 parent 1:1 .*ceil ([^ ]+) .*")
+    base_classes = filter(None, map(base_re.match, state))
+    if not base_classes:
+        return -1
+    if len(base_classes) > 1:
+        raise Exception, "unable to get current bwcap"
+    return get_tc_rate(base_classes[0].group(1))
+
+
+# Get slice xid (500) from slice name ("500" or "princeton_mlh") or
+# slice name ("princeton_mlh") from slice xid (500).
+def get_slice(xid_or_name):
+
+    if xid_or_name == root_xid:
+        return "root"
+    if xid_or_name == default_xid:
+        return "default"
+    if isinstance(xid_or_name, (int, long)):
         try:
-            cap=string.strip(lines[0])
-        except ValueError, ex:
+            return pwd.getpwuid(xid_or_name).pw_name
+        except KeyError:
             pass
-    except OSError, ex:
+    else:
+        try:
+            try:
+                return int(xid_or_name)
+            except ValueError:
+                pass
+            return pwd.getpwnam(xid_or_name).pw_uid
+        except KeyError:
+            pass
+
+    return None
+
+
+# Shortcut for running a command
+def run(cmd, input = None):
+    try:
+        if verbose:
+            sys.stderr.write("Executing: " + cmd + "\n")
+        if input is None:
+            fileobj = os.popen(cmd, "r")
+            output = fileobj.readlines()
+        else:
+            fileobj = os.popen(cmd, "w")
+            fileobj.write(input)
+            output = None
+        if fileobj.close() is None:
+            return output
+    except Exception, e:
         pass
+    return None
+
+
+# Shortcut for running a tc command
+def tc(cmd):
+    return run(TC + " " + cmd)
+
+
+# (Re)initialize the bandwidth limits on this node
+def init(dev, bwcap):
+
+    # load the module used to manage exempt classes
+    run("/sbin/modprobe ip_set_iphash")
+
+    # Delete root qdisc 1: if it exists. This will also automatically
+    # delete any child classes.
+    for line in tc("qdisc show dev %s" % dev):
+        # Search for the root qdisc 1:
+        m = re.match(r"qdisc htb 1:", line)
+        if m is not None:
+            tc("qdisc del dev %s root handle 1:" % dev)
+            break
+
+    # Initialize HTB. The "default" clause specifies that if a packet
+    # fails classification, it should go into the class with handle
+    # 1FFF.
+    tc("qdisc add dev %s root handle 1: htb default %x" % \
+       (dev, default_minor | default_xid))
 
-    # How many bytes a single token bucket is allowed to send at once.
-    # Small values (i.e., 3080 = two maximum-sized Ethernet packets)
-    # provide better fine-grained fairness.  At high rates (e.g.,
-    # cap=100mbit) this needs to be raised to allow full throughput.
-    cburst=30800
-
-    # The 'share' and 'quantum' parameters both influence the actual throughput
-    # seen by a particular vserver:
-
-    # 'share' is the rate at which tokens fill the bucket, and so is
-    # the minimum bandwidth given to the task.  I think this just
-    # needs to be set to some small value that is the same for all
-    # vservers.  With the current value and a 5mbit cap, we can
-    # support 5000 vservers (5mbit/1kbit = 5000).  With values lower
-    # than 10kbit, the HTB output (from tc -s -d class dev eth0) looks
-    # strange... this needs to be looked into further.
-    share="1kbit"
-
-    # 'quantum' influences how excess bandwidth (i.e., above the
-    # 'share') is distributed to vservers.  Apparently, vservers can
-    # send additional packets in proportion to their quantums (and not
-    # their shares, as one might expect).  See:
-    #   http://luxik.cdi.cz/~devik/qos/htb/manual/userg.htm#sharing
-    #   The above link states that 'quantum' is automatically
-    #   calculated for shares above 120kbit.  Otherwise it should be
-    #   set to a small value but at least one MTU, so I set it to one
-    #   MTU.  All vservers are assigned the same quantum and so they
-    #   should share equally.
-    quantum=1540
-
-    return cap, cburst, share, quantum
-
-
-def init(eth):
-    global TC, OPS
-
-    cap, cburst, share, quantum = get_defaults()
-    if cap == "-1": return
-
-    # Install HTB on $ETH.  Specifies that all packets not matching a
-    # filter rule go to class with handle 9999.  If we don't supply a
-    # default class, it sounds like non-matching packets can be sent
-    # at an unlimited rate.
-    for op in OPS:
-        cmd = "%s qdisc %s dev %s root handle 1: htb default 9999" % (TC,op,eth)
-        if run(cmd): break
-
-    # Add a root class with bwcap capped rate
-    for op in OPS:
-        cmd = "%s class %s dev %s parent 1: classid 1:1 htb rate %s quantum %d" % \
-              (TC, op, eth, cap, quantum)
-        if run(cmd): break
-
-    # Set up the default class.  Packets will fail to match a filter rule
-    # and end up here if they are sent by a process with UID < 500.
-    for op in OPS:
-        cmd = "%s class %s dev %s parent 1:1 classid 1:9999 htb rate %s ceil %s quantum %d cburst %d" % \
-              (TC, op, eth, share, cap, quantum, cburst)
-        if run(cmd): break
-
-    # The next command appears to throttle back processes that are
-    # sending faster than the token bucket can support, rather than
-    # just dropping their packets.
-    for op in OPS:
-        cmd = "%s qdisc %s dev %s parent 1:9999 handle 9999 pfifo" % \
-              (TC, op, eth)
-        if run(cmd): break
-
-def on(xid, eth, bwlimit, cap, minrate, maxrate):
-    global TC, OPS
-
-    default_cap, default_cburst, default_share, default_quantum = get_defaults()
-    quantum = bwlimit * default_quantum
-
-    # Set up the per-vserver token bucket
-    for op in OPS:
-        cmd = "%s class %s dev %s parent 1:1 classid 1:%d htb rate %s ceil %s quantum %d cburst %d" % \
-              (TC, op, eth, xid, minrate, cap, quantum, default_cburst)
-        if run(cmd): break
-
-    # The next command appears to throttle back processes that are
-    # sending faster than the token bucket can support, rather than
-    # just dropping their packets.
-    for op in OPS:
-        cmd = "%s qdisc %s dev %s parent 1:%d handle %d pfifo" % \
-              (TC, op, eth, xid, xid)
-        if run(cmd): break
-
-    # Matches packets sent by a vserver to the appropriate token bucket.
-    # The raw socket module marks each packet with its vserver id.
-    # See: http://lartc.org/howto/lartc.qdisc.filters.html for more
-    # info on the filter command.
-    cmd = "%s filter del dev %s protocol ip prio %d" % (TC, eth, xid)
-    run(cmd)
-    cmd = "%s filter add dev %s prio %d parent 1:0 protocol ip handle %d fw flowid 1:%d" % \
-          (TC, eth, xid, xid, xid)
-    run(cmd)
-
-def off(xid, eth):
-    cmd = "%s filter del dev %s protocol ip prio %d" % (TC, eth, xid)
-    run(cmd)
-
-    cmd = "%s qdisc del dev %s parent 1:%d" % (TC, eth, xid)
-    run(cmd)
-
-    cmd = "%s class del dev %s classid 1:%d" % (TC, eth, xid)
-    run(cmd)
+    # Set up a parent class from which all subclasses borrow.
+    tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
+       (dev, bwmax))
 
+    # Set up a subclass that represents the node bandwidth cap. We
+    # allow each slice to borrow up to this rate, so it is also
+    # usually the "ceil" rate for each slice.
+    tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \
+       (dev, bwmin, bwcap))
+
+    # Set up a subclass that represents "exemption" from the node
+    # bandwidth cap. Once the node bandwidth cap is reached, bandwidth
+    # to exempt destinations can still be fairly shared up to bwmax.
+    tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \
+       (dev, bwmin, bwmax))
+
+    # Set up the root class (and tell VNET what it is). Packets sent
+    # by root end up here and are capped at the node bandwidth
+    # cap.
+    on(root_xid, dev, share = root_share)
+    file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid))
+
+    # Set up the default class. Packets that fail classification end
+    # up here.
+    on(default_xid, dev, share = default_share)
+
+
+# Get the bandwidth limits for a particular slice xid as a tuple (xid,
+# share, minrate, maxrate), or all classes as a list of tuples.
+def get(xid = None, dev = dev):
+    if xid is None:
+        ret = []
+    else:
+        ret = None
+
+    # class htb 1:1002 parent 1:10 leaf 81b3: prio 1 rate 8bit ceil 5000Kbit burst 1600b cburst 4Kb
+    for line in tc("-d class show dev %s" % dev):
+        # Search for child classes of 1:10
+        m = re.match(r"class htb 1:([0-9a-f]+) parent 1:10", line)
+        if m is None:
+            continue
+
+        # If we are looking for a particular class
+        classid = int(m.group(1), 16) & default_xid
+        if xid is not None and xid != classid:
+            continue
+
+        # Parse share
+        share = 1
+        m = re.search(r"quantum (\d+)", line)
+        if m is not None:
+            share = int(m.group(1)) / quantum
+
+        # Parse minrate
+        minrate = bwmin
+        m = re.search(r"rate (\w+)", line)
+        if m is not None:
+            minrate = get_tc_rate(m.group(1))
+
+        # Parse maxrate 
+        maxrate = bwmax
+        m = re.search(r"ceil (\w+)", line)
+        if m is not None:
+            maxrate = get_tc_rate(m.group(1))
+
+        if xid is None:
+            # Return a list of parameters
+            ret.append((classid, share, minrate, maxrate))
+        else:
+            # Return the parameters for this class
+            ret = (classid, share, minrate, maxrate)
+            break
+
+    return ret
+
+
+# Apply specified bandwidth limit to the specified slice xid
+def on(xid, dev = dev, share = None, minrate = None, maxrate = None):
+    # Get defaults from current state if available
+    cap = get(xid, dev)
+    if cap is not None:
+        if share is None:
+            share = cap[1]
+        if minrate is None:
+            minrate = cap[2]
+        if maxrate is None:
+            maxrate = cap[3]
+
+    # Figure out what the current node bandwidth cap is
+    bwcap = bwmax
+    for line in tc("-d class show dev %s" % dev):
+        # Search for 1:10
+        m = re.match(r"class htb 1:10.*ceil (\w+)", line)
+        if m is not None:
+            bwcap = get_tc_rate(m.group(1))
+            break
+
+    # Set defaults
+    if share is None:
+        share = default_share
+    if minrate is None:
+        minrate = bwmin
+    else:
+        minrate = get_tc_rate(minrate)
+    if maxrate is None:
+        maxrate = bwcap
+    else:
+        maxrate = get_tc_rate(maxrate)
+
+    # Sanity checks
+    if maxrate > bwcap:
+        maxrate = bwcap
+    if minrate > maxrate:
+        minrate = maxrate
+
+    # Set up subclasses for the slice
+    tc("class replace dev %s parent 1:10 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
+       (dev, default_minor | xid, minrate, maxrate, share * quantum))
+
+    tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
+       (dev, exempt_minor | xid, minrate, bwmax, share * quantum))
+
+    # Attach a FIFO to each subclass, which helps to throttle back
+    # processes that are sending faster than the token buckets can
+    # support.
+    tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
+       (dev, default_minor | xid, default_minor | xid))
+
+    tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
+       (dev, exempt_minor | xid, exempt_minor | xid))
+
+
+# Remove class associated with specified slice xid. If further packets
+# are seen from this slice, they will be classified into the default
+# class 1:1FFF.
+def off(xid, dev = dev):
+    cap = get(xid, dev)
+    if cap is not None:
+        tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
+        tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
+
+
+def exempt_init(group_name, node_ips):
+
+    # Clean up
+    iptables = "/sbin/iptables -t vnet %s POSTROUTING" 
+    run(iptables % "-F")
+    run("/sbin/ipset -X " + group_name)
+
+    # Create a hashed IP set of all of these destinations
+    lines = ["-N %s iphash" % group_name]
+    add_cmd = "-A %s " % group_name
+    lines += [(add_cmd + ip) for ip in node_ips]
+    lines += ["COMMIT"]
+    restore = "\n".join(lines) + "\n"
+    run("/sbin/ipset -R", restore)
+
+    # Add rule to match on destination IP set
+    run((iptables + " -m set --set %s dst -j CLASSIFY --set-class 1:%x") %
+        ("-A", group_name, exempt_minor))
+
+
+def usage():
+    bwcap_description = format_tc_rate(get_bwcap())
+        
+    print """
+Usage:
+
+%s [OPTION]... [COMMAND] [ARGUMENT]...
+
+Options:
+	-d device	Network interface (default: %s)
+        -r rate         Node bandwidth cap (default: %s)
+        -q quantum      Share multiplier (default: %d bytes)
+        -h              This message
+
+Commands:
+        init
+                (Re)initialize bandwidth caps.
+        on slice [share] [minrate] [maxrate]
+                Set bandwidth cap for the specified slice
+        off slice
+                Remove bandwidth caps for the specified slice
+        get
+                Get all bandwidth caps
+        get slice
+                Get bandwidth caps for the specified slice
+        getcap slice
+                Get maxrate for the specified slice
+        setcap slice maxrate
+                Set maxrate for the specified slice
+""" % (sys.argv[0], dev, bwcap_description, quantum)
+    sys.exit(1)
     
+
+def main():
+    global dev, quantum, verbose
+
+    # Defaults
+    bwcap = get_bwcap()
+
+    (opts, argv) = getopt.getopt(sys.argv[1:], "f:d:r:g:q:vh")
+    for (opt, optval) in opts:
+        if opt == '-d':
+            dev = optval
+        elif opt == '-r':
+            bwcap = get_tc_rate(optval)
+        elif opt == '-q':
+            quantum = int(optval)
+        elif opt == '-v':
+            verbose += 1
+        elif opt == '-h':
+            usage()
+
+    if len(argv):
+        if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1):
+            # (Re)initialize
+            init(dev, get_tc_rate(bwcap))
+
+        elif argv[0] == "get" or argv[0] == "show":
+            # Show
+            if len(argv) >= 2:
+                # Show a particular slice (prints nothing if it has no cap)
+                xid = get_slice(argv[1])
+                if xid is None:
+                    sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
+                    usage()
+                caps = [cap for cap in [get(xid, dev)] if cap is not None]
+            else:
+                # Show all slices
+                caps = get(None, dev)
+
+            for (xid, share, minrate, maxrate) in caps:
+                slice = get_slice(xid)
+                if slice is None:
+                    # Orphaned (not associated with a slice) class
+                    slice = "%d?" % xid
+                print "%s %d %s %s" % \
+                      (slice, share, format_tc_rate(minrate), format_tc_rate(maxrate))
+
+        elif len(argv) >= 2:
+            # slice, ...
+            xid = get_slice(argv[1])
+            if xid is None:
+                sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
+                usage()
+
+            if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace":
+                # Enable cap
+                args = []
+                if len(argv) >= 3:
+                    # ... share, minrate, maxrate
+                    casts = [int, get_tc_rate, get_tc_rate]
+                    for i, arg in enumerate(argv[2:]):
+                        if i >= len(casts):
+                            break
+                        args.append(casts[i](arg))
+                on(xid, dev, *args)
+
+            elif argv[0] == "off" or argv[0] == "del":
+                # Disable cap
+                off(xid, dev)
+
+            # Backward compatibility with old resman script
+            elif argv[0] == "getcap":
+                # Get maxrate
+                cap = get(xid, dev)
+                if cap is not None:
+                    (xid, share, minrate, maxrate) = cap
+                    print format_tc_rate(maxrate)
+
+            # Backward compatibility with old resman script
+            elif argv[0] == "setcap":
+                if len(argv) >= 3:
+                    # Set maxrate
+                    on(xid, dev, maxrate = get_tc_rate(argv[2]))
+                else:
+                    usage()
+
+            else:
+                usage()
+
+        else:
+            usage()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/setup.py b/python/setup.py
deleted file mode 100644
index 8491c58..0000000
--- a/python/setup.py
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/python
-#
-# Python distutils script for util-vserver Python bindings
-#
-# Steve Muir <smuir@cs.princeton.edu>
-# Mark Huang <mlhuang@cs.princeton.edu>
-#
-# Copyright (C) 2005 The Trustees of Princeton University
-#
-# $Id$
-#
-
-from distutils.core import setup, Extension
-
-extension_args = {}
-extension_args['extra_compile_args'] = ['-Wall']
-extension_args['include_dirs'] = ['..', '../lib']
-# Link against libvserver with libtool later
-#extension_args['library_dirs'] = ['../lib']
-#extension_args['libraries'] = ['vserver']
-
-modules = ['util_vserver_vars', 'vserver', 'cpulimit', 'bwlimit']
-extensions = [Extension('vduimpl', ['vduimpl.c'], **extension_args),
-              Extension('vserverimpl', ['vserverimpl.c'], **extension_args)]
-
-setup(py_modules = modules, ext_modules = extensions)
diff --git a/python/vduimpl.c b/python/vduimpl.c
index fb219ff..2191826 100644
--- a/python/vduimpl.c
+++ b/python/vduimpl.c
@@ -549,6 +549,7 @@ do_vdu(PyObject *self, PyObject *args)
 	cwd_fd = open(".", O_RDONLY);
 	res = vdu_onedir(&tbl, &s, path);
 	fchdir(cwd_fd);
+	close(cwd_fd);
 
 	/* deallocate whatever has been added to tbl */
 	Dispose(&tbl);
diff --git a/python/vserver.py b/python/vserver.py
index 264aee2..f455e60 100644
--- a/python/vserver.py
+++ b/python/vserver.py
@@ -4,16 +4,21 @@ import errno
 import fcntl
 import os
 import re
+import pwd
+import signal
 import sys
 import time
 import traceback
 
 import mountimpl
-import passfdimpl
+import runcmd
 import utmp
 import vserverimpl, vduimpl
 import cpulimit, bwlimit
 
+from vserverimpl import VS_SCHED_CPU_GUARANTEED as SCHED_CPU_GUARANTEED
+from vserverimpl import DLIMIT_INF
+
 
 
 #
@@ -29,32 +34,36 @@ FLAGS_ULIMIT = 64
 FLAGS_NAMESPACE = 128
 
 
-              
+
+class NoSuchVServer(Exception): pass
+
+
+
 class VServer:
 
     INITSCRIPTS = [('/etc/rc.vinit', 'start'),
                    ('/etc/rc.d/rc', '%(runlevel)d')]
 
-    def __init__(self, name, vm_running = False, resources = {}):
+    def __init__(self, name, vm_id = None, vm_running = False):
 
         self.name = name
         self.config_file = "/etc/vservers/%s.conf" % name
         self.dir = "%s/%s" % (vserverimpl.VSERVER_BASEDIR, name)
         if not (os.path.isdir(self.dir) and
                 os.access(self.dir, os.R_OK | os.W_OK | os.X_OK)):
-            raise Exception, "no such vserver: " + name
-        self.config = self.__read_config_file("/etc/vservers.conf")
-        self.config.update(self.__read_config_file(self.config_file))
-        self.flags = 0
-        flags = self.config["S_FLAGS"].split(" ")
-        if "lock" in flags:
-            self.flags |= FLAGS_LOCK
-        if "nproc" in flags:
-            self.flags |= FLAGS_NPROC
+            raise NoSuchVServer, "no such vserver: " + name
+        self.config = {}
+        for config_file in ["/etc/vservers.conf", self.config_file]:
+            try:
+                self.config.update(self.__read_config_file(config_file))
+            except IOError, ex:
+                if ex.errno != errno.ENOENT:
+                    raise
         self.remove_caps = ~vserverimpl.CAP_SAFE;
-        self.ctx = int(self.config["S_CONTEXT"])
+        if vm_id == None:
+            vm_id = int(self.config['S_CONTEXT'])
+        self.ctx = vm_id
         self.vm_running = vm_running
-        self.resources = resources
 
     config_var_re = re.compile(r"^ *([A-Z_]+)=(.*)\n?$", re.MULTILINE)
 
@@ -111,10 +120,30 @@ class VServer:
         os.chroot(self.dir)
         os.chdir("/")
 
+    def chroot_call(self, fn, *args):
+
+        cwd_fd = os.open(".", os.O_RDONLY)
+        try:
+            root_fd = os.open("/", os.O_RDONLY)
+            try:
+                self.__do_chroot()
+                result = fn(*args)
+            finally:
+                os.fchdir(root_fd)
+                os.chroot(".")
+                os.fchdir(cwd_fd)
+                os.close(root_fd)
+        finally:
+            os.close(cwd_fd)
+        return result
+
     def set_disklimit(self, block_limit):
 
         # block_limit is in kB
-        over_limit = False
+        if block_limit == 0:
+            vserverimpl.unsetdlimit(self.dir, self.ctx)
+            return
+
         if self.vm_running:
             block_usage = vserverimpl.DLIMIT_KEEP
             inode_usage = vserverimpl.DLIMIT_KEEP
@@ -122,8 +151,6 @@ class VServer:
             # init_disk_info() must have been called to get usage values
             block_usage = self.disk_blocks
             inode_usage = self.disk_inodes
-            if block_limit < block_usage:
-                over_limit = True
 
         vserverimpl.setdlimit(self.dir,
                               self.ctx,
@@ -133,31 +160,38 @@ class VServer:
                               vserverimpl.DLIMIT_INF,  # inode limit
                               2)   # %age reserved for root
 
-        if over_limit:
-            raise Exception, ("%s disk usage (%u blocks) > limit (%u)" %
-                              (self.name, block_usage, block_limit))
-
     def get_disklimit(self):
 
         try:
-            blocksused, blocktotal, inodesused, inodestotal, reserved = \
-                        vserverimpl.getdlimit(self.dir, self.ctx)
+            (self.disk_blocks, block_limit, self.disk_inodes, inode_limit,
+             reserved) = vserverimpl.getdlimit(self.dir, self.ctx)
         except OSError, ex:
-            if ex.errno == errno.ESRCH:
-                # get here if no vserver disk limit has been set for xid
-                # set blockused to -1 to indicate no limit
-                blocktotal = -1
+            if ex.errno != errno.ESRCH:
+                raise
+            # get here if no vserver disk limit has been set for xid
+            block_limit = -1
 
-        return blocktotal
+        return block_limit
 
-    def set_sched(self, cpu_share):
+    def set_sched_config(self, cpu_share, sched_flags):
+
+        """ Write current CPU scheduler parameters to the vserver
+        configuration file. This method does not modify the kernel CPU
+        scheduling parameters for this context. """
 
         if cpu_share == int(self.config.get("CPULIMIT", -1)):
             return
-
-        self.__update_config_file(self.config_file, { "CPULIMIT": cpu_share })
+        cpu_guaranteed = sched_flags & SCHED_CPU_GUARANTEED
+        cpu_config = { "CPULIMIT": cpu_share, "CPUGUARANTEED": cpu_guaranteed }
+        self.update_resources(cpu_config)
         if self.vm_running:
-            vserverimpl.setsched(self.ctx, cpu_share, True)
+            self.set_sched(cpu_share, sched_flags)
+
+    def set_sched(self, cpu_share, sched_flags = 0):
+
+        """ Update kernel CPU scheduling parameters for this context. """
+
+        vserverimpl.setsched(self.ctx, cpu_share, sched_flags)
 
     def get_sched(self):
         # have no way of querying scheduler right now on a per vserver basis
@@ -179,86 +213,35 @@ class VServer:
         ret = vserverimpl.getrlimit(self.ctx,6)
         return ret
 
-    def set_bwlimit(self, eth, limit, cap, minrate, maxrate):
-        if cap == "-1":
-            bwlimit.off(self.ctx,eth)
-        else:
-            bwlimit.on(self.ctx, eth, limit, cap, minrate, maxrate)
-
-    def get_bwlimit(self, eth):
-        # not implemented yet
-        bwlimit = -1
-        cap = "unknown"
-        minrate = "unknown"
-        maxrate = "unknown"
-        return (bwlimit, cap, minrate, maxrate)
-        
-    def open(self, filename, mode = "r", bufsize = -1):
+    def set_bwlimit(self, maxrate, minrate = 1, share = None, dev = "eth0"):
 
-        (sendsock, recvsock) = passfdimpl.socketpair()
-        child_pid = os.fork()
-        if child_pid == 0:
-            try:
-                # child process
-                self.__do_chroot()
-                f = open(filename, mode)
-                passfdimpl.sendmsg(f.fileno(), sendsock)
-                os._exit(0)
-            except EnvironmentError, ex:
-                (result, errmsg) = (ex.errno, ex.strerror)
-            except Exception, ex:
-                (result, errmsg) = (255, str(ex))
-            os.write(sendsock, errmsg)
-            os._exit(result)
+        if maxrate != 0:
+            bwlimit.on(self.ctx, dev, share, minrate, maxrate)
+        else:
+            bwlimit.off(self.ctx, dev)
 
-        # parent process
+    def get_bwlimit(self, dev = "eth0"):
 
-        # XXX - need this since a lambda can't raise an exception
-        def __throw(ex):
-            raise ex
+        result = bwlimit.get(self.ctx, dev)
+        # bwlimit.get yields (ctx, share, minrate, maxrate); drop the leading ctx
+        if result:
+            result = result[1:]
+        return result
 
-        os.close(sendsock)
-        throw = lambda : __throw(Exception(errmsg))
-        while True:
-            try:
-                (pid, status) = os.waitpid(child_pid, 0)
-                if os.WIFEXITED(status):
-                    result = os.WEXITSTATUS(status)
-                    if result != 255:
-                        errmsg = os.strerror(result)
-                        throw = lambda : __throw(IOError(result, errmsg))
-                    else:
-                        errmsg = "unexpected exception in child"
-                else:
-                    result = -1
-                    errmsg = "child killed"
-                break
-            except OSError, ex:
-                if ex.errno != errno.EINTR:
-                    os.close(recvsock)
-                    raise ex
-        fcntl.fcntl(recvsock, fcntl.F_SETFL, os.O_NONBLOCK)
-        try:
-            (fd, errmsg) = passfdimpl.recvmsg(recvsock)
-        except OSError, ex:
-            if ex.errno != errno.EAGAIN:
-                throw = lambda : __throw(ex)
-            fd = 0
-        os.close(recvsock)
-        if not fd:
-            throw()
+    def open(self, filename, mode = "r", bufsize = -1):
 
-        return os.fdopen(fd, mode, bufsize)
+        return self.chroot_call(open, filename, mode, bufsize)
 
     def __do_chcontext(self, state_file):
 
-        vserverimpl.chcontext(self.ctx, self.resources)
+        if state_file:
+            print >>state_file, "S_CONTEXT=%u" % self.ctx
+            print >>state_file, "S_PROFILE="
+            state_file.close()
 
-        if not state_file:
-            return
-        print >>state_file, "S_CONTEXT=%d" % self.ctx
-        print >>state_file, "S_PROFILE=%s" % self.config.get("S_PROFILE", "")
-        state_file.close()
+        if vserverimpl.chcontext(self.ctx):
+            self.set_resources()
+            vserverimpl.setup_done(self.ctx)
 
     def __prep(self, runlevel, log):
 
@@ -327,7 +310,6 @@ class VServer:
                 self.__do_chroot()
                 log = open("/var/log/boot.log", "w", 0)
                 os.dup2(1, 2)
-                # XXX - close all other fds
 
                 print >>log, ("%s: starting the virtual server %s" %
                               (time.asctime(time.gmtime()), self.name))
@@ -379,6 +361,13 @@ class VServer:
         # parent process
         return child_pid
 
+    def set_resources(self):
+
+        """ Called when vserver context is entered for first time,
+        should be overridden by subclass. """
+
+        pass
+
     def update_resources(self, resources):
 
         self.config.update(resources)
@@ -391,3 +380,20 @@ class VServer:
         (self.disk_inodes, self.disk_blocks, size) = vduimpl.vdu(self.dir)
 
         return size
+
+    def stop(self, signal = signal.SIGKILL):
+
+        vserverimpl.killall(self.ctx, signal)
+        self.vm_running = False
+
+
+
+def create(vm_name, static = False, ctor = VServer):
+
+    options = []
+    if static:
+        options += ['--static']
+    runcmd.run('vuseradd', options + [vm_name])
+    vm_id = pwd.getpwnam(vm_name)[2]
+
+    return ctor(vm_name, vm_id)
diff --git a/python/vserverimpl.c b/python/vserverimpl.c
index 3a93224..d5f018d 100644
--- a/python/vserverimpl.c
+++ b/python/vserverimpl.c
@@ -44,29 +44,40 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "vserver.h"
 #include "vserver-internal.h"
 
+#define NONE  ({ Py_INCREF(Py_None); Py_None; })
+
 /*
  * context create
  */
 static PyObject *
 vserver_chcontext(PyObject *self, PyObject *args)
 {
+  int  result;
   xid_t  ctx;
   uint32_t  flags = 0;
   uint32_t  bcaps = ~vc_get_insecurebcaps();
-  rspec_t  rspec = { 32, VC_VXF_SCHED_FLAGS, -1, -1 };
-  PyObject  *resources;
-  PyObject  *cpu_share;
 
-  if (!PyArg_ParseTuple(args, "IO!|K", &ctx, &PyDict_Type, &resources, &flags))
+  if (!PyArg_ParseTuple(args, "I|I", &ctx, &flags))  /* "I": flags is 32 bits wide */
+    return NULL;
+
+  if ((result = pl_chcontext(ctx, flags, bcaps)) < 0)
+    return PyErr_SetFromErrno(PyExc_OSError);
+
+  return PyBool_FromLong(result);
+}
+
+static PyObject *
+vserver_setup_done(PyObject *self, PyObject *args)
+{
+  xid_t  ctx;
+
+  if (!PyArg_ParseTuple(args, "I", &ctx))
     return NULL;
-  if ((cpu_share = PyMapping_GetItemString(resources, "nm_cpu_share")) &&
-      (cpu_share = PyNumber_Int(cpu_share)))
-    rspec.cpu_share = PyInt_AsLong(cpu_share);
 
-  if (pl_chcontext(ctx, flags, bcaps, &rspec))
-    PyErr_SetFromErrno(PyExc_OSError);
+  if (pl_setup_done(ctx) < 0)
+    return PyErr_SetFromErrno(PyExc_OSError);
 
-  return Py_None;
+  return NONE;
 }
 
 static PyObject *
@@ -83,7 +94,6 @@ vserver_set_rlimit(PyObject *self, PyObject *args) {
 	if (!PyArg_ParseTuple(args, "IiL", &xid, &resource, &limits.hard))
 		return NULL;
 
-	ret = Py_None;
 	if (vc_set_rlimit(xid, resource, &limits)) 
 		ret = PyErr_SetFromErrno(PyExc_OSError);
 	else if (vc_get_rlimit(xid, resource, &limits)==-1)
@@ -108,7 +118,6 @@ vserver_get_rlimit(PyObject *self, PyObject *args) {
 	if (!PyArg_ParseTuple(args, "Ii", &xid, &resource))
 		return NULL;
 
-	ret = Py_None;
 	if (vc_get_rlimit(xid, resource, &limits)==-1)
 		ret = PyErr_SetFromErrno(PyExc_OSError);
 	else
@@ -117,54 +126,27 @@ vserver_get_rlimit(PyObject *self, PyObject *args) {
 	return ret;
 }
 
-#if 0
 /*
  * setsched
  */
 static PyObject *
 vserver_setsched(PyObject *self, PyObject *args)
 {
-  xid_t  xid;
-  struct vc_set_sched sched;
-  struct vc_ctx_flags flags;
-  unsigned cpuguaranteed = 0;
-
-  sched.set_mask = (VC_VXSM_FILL_RATE | 
-		    VC_VXSM_INTERVAL | 
-		    VC_VXSM_TOKENS_MIN | 
-		    VC_VXSM_TOKENS_MAX);
-
-  if (!PyArg_ParseTuple(args, "I|I|I|I|I|I|I", &xid, 
-			&sched.fill_rate,
-			&sched.interval,
-			&sched.tokens,
-			&sched.tokens_min,
-			&sched.tokens_max,
-			&cpuguaranteed))
-    return NULL;
-
-  flags.flagword = VC_VXF_SCHED_HARD;
-  flags.mask |= VC_VXF_SCHED_HARD;
-#define VC_VXF_SCHED_SHARE       0x00000800ull
-  if (cpuguaranteed==0) {
-	  flags.flagword |= VC_VXF_SCHED_SHARE;
-	  flags.mask |= VC_VXF_SCHED_SHARE;
-  }
+  xid_t  ctx;
+  uint32_t  cpu_share;
+  uint32_t  cpu_sched_flags = VC_VXF_SCHED_FLAGS;
 
-  if (vc_set_cflags(xid, &flags) == -1)
-	  return PyErr_SetFromErrno(PyExc_OSError);
+  if (!PyArg_ParseTuple(args, "II|I", &ctx, &cpu_share, &cpu_sched_flags))
+    return NULL;
 
-  if (vc_set_sched(xid, &sched) == -1)
-	  return PyErr_SetFromErrno(PyExc_OSError);
+  /* ESRCH indicates that there are no processes in the context */
+  if (pl_setsched(ctx, cpu_share, cpu_sched_flags) &&
+      errno != ESRCH)
+    return PyErr_SetFromErrno(PyExc_OSError);
 
-  return Py_None;
+  return NONE;
 }
 
-/*
- * setsched
- */
-#endif
-
 static PyObject *
 vserver_get_dlimit(PyObject *self, PyObject *args)
 {
@@ -225,24 +207,63 @@ vserver_set_dlimit(PyObject *self, PyObject *args)
             vserver(VCMD_set_dlimit, xid, &data))
           return PyErr_SetFromErrno(PyExc_OSError);
 
-	return Py_None;	
+	return NONE;	
+}
+
+static PyObject *
+vserver_unset_dlimit(PyObject *self, PyObject *args)
+{
+  char  *path;
+  unsigned  xid;
+  struct vcmd_ctx_dlimit_base_v0  init;
+
+  if (!PyArg_ParseTuple(args, "si", &path, &xid))
+    return NULL;
+
+  memset(&init, 0, sizeof(init));
+  init.name = path;
+  init.flags = 0;
+
+  if (vserver(VCMD_rem_dlimit, xid, &init) && errno != ESRCH)
+    return PyErr_SetFromErrno(PyExc_OSError);
+
+  return NONE;	
+}
+
+static PyObject *
+vserver_killall(PyObject *self, PyObject *args)
+{
+  xid_t  ctx;
+  int  sig;
+
+  if (!PyArg_ParseTuple(args, "Ii", &ctx, &sig))
+    return NULL;
+
+  if (vc_ctx_kill(ctx, 0, sig) && errno != ESRCH)
+    return PyErr_SetFromErrno(PyExc_OSError);
+
+  return NONE;
 }
 
 static PyMethodDef  methods[] = {
   { "chcontext", vserver_chcontext, METH_VARARGS,
     "chcontext to vserver with provided flags" },
-#if 0
+  { "setup_done", vserver_setup_done, METH_VARARGS,
+    "Release vserver setup lock" },
   { "setsched", vserver_setsched, METH_VARARGS,
     "Change vserver scheduling attributes for given vserver context" },
-#endif
   { "setdlimit", vserver_set_dlimit, METH_VARARGS,
     "Set disk limits for given vserver context" },
+  { "unsetdlimit", vserver_unset_dlimit, METH_VARARGS,
+    "Remove disk limits for given vserver context" },
   { "getdlimit", vserver_get_dlimit, METH_VARARGS,
     "Get disk limits for given vserver context" },
   { "setrlimit", vserver_set_rlimit, METH_VARARGS,
     "Set resource limits for given resource of a vserver context" },
   { "getrlimit", vserver_get_rlimit, METH_VARARGS,
     "Get resource limits for given resource of a vserver context" },
+  { "killall", vserver_killall, METH_VARARGS,
+    "Send signal to all processes in vserver context" },
   { NULL, NULL, 0, NULL }
 };
 
@@ -262,4 +283,9 @@ initvserverimpl(void)
   /* export limit-related constants */
   PyModule_AddIntConstant(mod, "DLIMIT_KEEP", (int)CDLIM_KEEP);
   PyModule_AddIntConstant(mod, "DLIMIT_INF", (int)CDLIM_INFINITY);
+
+  /* scheduler flags */
+  PyModule_AddIntConstant(mod,
+			  "VS_SCHED_CPU_GUARANTEED",
+			  VS_SCHED_CPU_GUARANTEED);
 }
diff --git a/scripts/vuseradd b/scripts/vuseradd
index 8023aa6..3a531f3 100755
--- a/scripts/vuseradd
+++ b/scripts/vuseradd
@@ -4,7 +4,7 @@
 #
 # Copyright (c) 2004  The Trustees of Princeton University (Trustees).
 #
-# $Id: vuseradd,v 1.23 2005/08/21 21:41:03 mlhuang Exp $
+# $Id: vuseradd,v 1.24 2005/09/02 20:00:39 mlhuang Exp $
 #
 
 : ${UTIL_VSERVER_VARS:=/usr/lib/util-vserver/util-vserver-vars}
@@ -93,8 +93,3 @@ if [ -d "$__DEFAULT_VSERVERDIR/$NAME" ] ; then
 	echo "$NAME	ALL=(ALL)	ALL" >> "$__DEFAULT_VSERVERDIR/$NAME/etc/sudoers"
     fi
 fi
-
-# turn resource management on for vserver $NAME
-chkconfig resman && service resman start $NAME
-# XXX - resman doesn't print a trailing newline
-echo
diff --git a/src/vsh.c b/src/vsh.c
index 435ea05..7d28bf4 100644
--- a/src/vsh.c
+++ b/src/vsh.c
@@ -281,7 +281,7 @@ static int sandbox_processes(xid_t ctx, char *context)
 		exit(1);
 	}
 #else
-        rspec_t  rspec;
+	int  ctx_is_new;
 	unsigned long long cpu = VC_LIM_KEEP;
 	unsigned long long mem = VC_LIM_KEEP;
 	unsigned long long task = VC_LIM_KEEP;
@@ -294,18 +294,48 @@ static int sandbox_processes(xid_t ctx, char *context)
 		 {0,0}};
 
 	get_limits(context,list);
+
+	/* check whether the slice has been disabled */
+	if (!cpu)
+	  {
+	    fprintf(stderr, "*** this slice has been suspended ***\n");
+	    exit(0);
+	  }
+
 	(void) (sandbox_chroot(ctx));
 
-        rspec.cpu_share = cpu;
-        rspec.cpu_sched_flags = (VC_VXF_SCHED_HARD |
-                                 (cpuguaranteed ? 0 : VC_VXF_SCHED_SHARE));
-        rspec.mem_limit = mem;
-        rspec.task_limit = task;
-        if (pl_chcontext(ctx, 0, ~vc_get_insecurebcaps(), &rspec))
+        if ((ctx_is_new = pl_chcontext(ctx, 0, ~vc_get_insecurebcaps())) < 0)
           {
             PERROR("pl_chcontext(%u)", ctx);
             exit(1);
           }
+	if (ctx_is_new)
+	  {
+	    /* pl_chcontext() reported a new context: apply limits and scheduling */
+	    struct vc_rlimit limits;
+
+	    limits.min = VC_LIM_KEEP;
+	    limits.soft = VC_LIM_KEEP;
+	    limits.hard = mem;
+	    if (vc_set_rlimit(ctx, RLIMIT_RSS, &limits))
+	      {
+		PERROR("pl_setrlimit(%u, RLIMIT_RSS)", ctx);
+		exit(1);
+	      }
+	    limits.hard = task;
+	    if (vc_set_rlimit(ctx, RLIMIT_NPROC, &limits))
+	      {
+		PERROR("pl_setrlimit(%u, RLIMIT_NPROC)", ctx);
+		exit(1);
+	      }
+	    cpuguaranteed &= VS_SCHED_CPU_GUARANTEED;
+	    if (pl_setsched(ctx, cpu, cpuguaranteed) < 0)
+	      {
+		PERROR("pl_setsched(%u)", ctx);
+		exit(1);
+	      }
+	    pl_setup_done(ctx);
+	  }
 #endif
 	return 0;
 }
diff --git a/util-vserver.spec b/util-vserver.spec
index c7fc376..8eb5aa0 100644
--- a/util-vserver.spec
+++ b/util-vserver.spec
@@ -17,11 +17,14 @@
 
 %define name util-vserver
 %define version 0.30.208
-%define release 3.planetlab%{?date:.%{date}}
+%define release 10%{?pldistro:.%{pldistro}}%{?date:.%{date}}
 
 %define _without_dietlibc 1
 %define _without_xalan 1
 
+# don't build debuginfo RPM
+%define debug_package %{nil}
+
 Vendor: PlanetLab
 Packager: PlanetLab Central <support@planet-lab.org>
 Distribution: PlanetLab 3.0
@@ -381,6 +384,30 @@ done
 
 
 %changelog
+* Fri Feb 17 2006 Steve Muir <smuir@cs.princeton.edu>
+- add support for setting guaranteed CPU share flag in rspec
+
+* Fri Jan 13 2006 Steve Muir <smuir@cs.princeton.edu>
+- fix bug in python/vserverimpl.c where attempting to adjust CPU share
+  for a context that didn't exist would cause an error (it should be a
+  safe no-op)
+
+* Fri Dec  2 2005 Steve Muir <smuir@cs.princeton.edu>
+- fix bugs in python/vserverimpl.c where exceptions were not raised when
+  they should be and thus occurred later at unexpected times
+- add support for stopping a vserver
+
+* Wed Nov  9 2005 Steve Muir <smuir@cs.princeton.edu>
+- add support for removing resource limits e.g., when a slice is deleted
+
+* Mon Nov  7 2005 Steve Muir <smuir@cs.princeton.edu>
+- fix file descriptor leak in vduimpl
+- clean up handling of network parameters
+- don't rely upon /etc/vservers/foo.conf to initialise vserver object
+
+* Wed Nov  2 2005 Steve Muir <smuir@cs.princeton.edu>
+- fix Python modules to handle scheduling parameters correctly
+
 * Fri Oct 28 2005 Steve Muir <smuir@cs.princeton.edu>
 - raise exception about being over disk limit after setting usage values
 
diff --git a/util-vserver.spec.in b/util-vserver.spec.in
index 2f47ef8..9919150 100644
--- a/util-vserver.spec.in
+++ b/util-vserver.spec.in
@@ -17,11 +17,14 @@
 
 %define name @PACKAGE@
 %define version @VERSION@
-%define release 1.planetlab%{?date:.%{date}}
+%define release 10%{?pldistro:.%{pldistro}}%{?date:.%{date}}
 
 %define _without_dietlibc 1
 %define _without_xalan 1
 
+# don't build debuginfo RPM
+%define debug_package %{nil}
+
 Vendor: PlanetLab
 Packager: PlanetLab Central <support@planet-lab.org>
 Distribution: PlanetLab 3.0
@@ -381,6 +384,37 @@ done
 
 
 %changelog
+* Fri Feb 17 2006 Steve Muir <smuir@cs.princeton.edu>
+- add support for setting guaranteed CPU share flag in rspec
+
+* Fri Jan 13 2006 Steve Muir <smuir@cs.princeton.edu>
+- fix bug in python/vserverimpl.c where attempting to adjust CPU share
+  for a context that didn't exist would cause an error (it should be a
+  safe no-op)
+
+* Fri Dec  2 2005 Steve Muir <smuir@cs.princeton.edu>
+- fix bugs in python/vserverimpl.c where exceptions were not raised when
+  they should be and thus occurred later at unexpected times
+- add support for stopping a vserver
+
+* Wed Nov  9 2005 Steve Muir <smuir@cs.princeton.edu>
+- add support for removing resource limits e.g., when a slice is deleted
+
+* Mon Nov  7 2005 Steve Muir <smuir@cs.princeton.edu>
+- fix file descriptor leak in vduimpl
+- clean up handling of network parameters
+- don't rely upon /etc/vservers/foo.conf to initialise vserver object
+
+* Wed Nov  2 2005 Steve Muir <smuir@cs.princeton.edu>
+- fix Python modules to handle scheduling parameters correctly
+
+* Fri Oct 28 2005 Steve Muir <smuir@cs.princeton.edu>
+- raise exception about being over disk limit after setting usage values
+
+* Fri Oct  7 2005 Steve Muir <smuir@cs.princeton.edu>
+- create common function to be used for entering a vserver and applying
+  resource limits
+
 * Thu Aug 21 2005 Mark Huang <mlhuang@cs.princeton.edu>
 - restore build of python modules