'planetlab-3_3-branch'.
--- /dev/null
+.X_usr_local_etc-up-to-date
+.deps
+.fixups
+.pathconfig.h.pathsubst.stamp
+FEATURES.txt
+Makefile
+autom4te.cache
+config.cache
+config.h
+config.log
+config.status
+libtool
+pathconfig.h
+stamp-h1
--- /dev/null
+.manifest.dat.pathsubst.stamp
+manifest.dat
--- /dev/null
+.deps
+.dirstamp
+.libs
+*.lo
+libvserver.la
+util-vserver.pc
DECL("sched_hard", VC_VXF_SCHED_HARD),
DECL("sched_prio", VC_VXF_SCHED_PRIO),
DECL("sched_pause", VC_VXF_SCHED_PAUSE),
+ DECL("sched_share", VC_VXF_SCHED_SHARE),
DECL("virt_mem", VC_VXF_VIRT_MEM),
DECL("virt_uptime", VC_VXF_VIRT_UPTIME),
#include <errno.h>
#include <stdint.h>
+#include <stdio.h>
#include <unistd.h>
#include <sys/resource.h>
#include "vserver.h"
static int
-create_context(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec)
+create_context(xid_t ctx, uint32_t flags, uint64_t bcaps)
{
struct vc_ctx_caps vc_caps;
- struct vc_ctx_flags vc_flags;
- struct vc_set_sched vc_sched;
- struct vc_rlimit vc_rlimit;
- /* create context info */
+ /*
+ * Create context info - this sets the STATE_SETUP and STATE_INIT flags.
+ * Don't ever clear the STATE_INIT flag; that makes us the init task.
+ *
+ * XXX - the kernel code allows initial flags to be passed as an arg.
+ */
if (vc_ctx_create(ctx) == VC_NOCTX)
return -1;
- /* set capabilities - these don't take effect until SETUP flags is unset */
+ /* set capabilities - these don't take effect until SETUP flag is unset */
vc_caps.bcaps = bcaps;
vc_caps.bmask = ~0ULL; /* currently unused */
vc_caps.ccaps = 0; /* don't want any of these */
if (vc_set_ccaps(ctx, &vc_caps))
return -1;
- /* ignore all flags except SETUP and scheduler flags */
- vc_flags.mask = VC_VXF_STATE_SETUP | VC_VXF_SCHED_FLAGS;
- /* don't let user change scheduler flags */
- vc_flags.flagword = flags & ~VC_VXF_SCHED_FLAGS; /* SETUP not set */
+ /* set default scheduling parameters */
+ pl_setsched(ctx, 1, 0);
- /* set scheduler parameters */
- vc_flags.flagword |= rspec->cpu_sched_flags;
- vc_sched.set_mask = (VC_VXSM_FILL_RATE | VC_VXSM_INTERVAL | VC_VXSM_TOKENS |
- VC_VXSM_TOKENS_MIN | VC_VXSM_TOKENS_MAX);
- vc_sched.fill_rate = rspec->cpu_share; /* tokens accumulated per interval */
- vc_sched.interval = 1000; /* milliseconds */
- vc_sched.tokens = 100; /* initial allocation of tokens */
- vc_sched.tokens_min = 50; /* need this many tokens to run */
- vc_sched.tokens_max = 100; /* max accumulated number of tokens */
- if (vc_set_sched(ctx, &vc_sched))
- return -1;
-
- /* set resource limits */
- vc_rlimit.min = VC_LIM_KEEP;
- vc_rlimit.soft = VC_LIM_KEEP;
- vc_rlimit.hard = rspec->mem_limit;
- if (vc_set_rlimit(ctx, RLIMIT_RSS, &vc_rlimit))
- return -1;
+ return 0;
+}
- /* assume min and soft unchanged by set_rlimit */
- vc_rlimit.hard = rspec->task_limit;
- if (vc_set_rlimit(ctx, RLIMIT_NPROC, &vc_rlimit))
- return -1;
+int
+pl_setup_done(xid_t ctx)
+{
+ struct vc_ctx_flags vc_flags;
- /* set flags, unset SETUP flag - this allows other processes to migrate */
+ /* unset SETUP flag - this allows other processes to migrate */
+ vc_flags.mask = VC_VXF_STATE_SETUP;
+ vc_flags.flagword = 0;
if (vc_set_cflags(ctx, &vc_flags))
return -1;
return 0;
}
+#define RETRY_LIMIT 10
+
int
-pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec)
+pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps)
{
+ int retry_count = 0;
+
for (;;)
{
struct vc_ctx_flags vc_flags;
if (vc_get_cflags(ctx, &vc_flags))
{
+ if (errno != ESRCH)
+ return -1;
+
/* context doesn't exist - create it */
- if (create_context(ctx, flags, bcaps, rspec))
+ if (create_context(ctx, flags, bcaps))
{
if (errno == EEXIST)
/* another process beat us in a race */
}
/* created context and migrated to it i.e., we're done */
- break;
+ return 1;
}
/* check the SETUP flag */
if (vc_flags.flagword & VC_VXF_STATE_SETUP)
{
/* context is still being setup - wait a while then retry */
+ if (retry_count++ >= RETRY_LIMIT)
+ {
+ errno = EBUSY;
+ return -1;
+ }
sleep(1);
continue;
}
return 0;
}
+
+/*
+ * it's okay for a syscall to fail because the context doesn't exist
+ *
+ * NOTE: this macro returns from the *enclosing* function on failure:
+ * 0 if the call failed with ESRCH (context gone), -1 otherwise.
+ */
+#define VC_SYSCALL(x)	\
+do			\
+{			\
+  if (x)		\
+    return errno == ESRCH ? 0 : -1;	\
+}			\
+while (0)
+
+/*
+ * Set the CPU scheduler parameters for context <ctx>.  <cpu_share> is
+ * the token fill rate (tokens added per 1000 ms interval).  If
+ * <cpu_sched_flags> has VS_SCHED_CPU_GUARANTEED set, the SCHED_SHARE
+ * flag is cleared so the share acts as a guarantee; otherwise
+ * SCHED_SHARE is set.  Returns 0 on success (also when the context
+ * has disappeared - see VC_SYSCALL), -1 on any other error.
+ */
+int
+pl_setsched(xid_t ctx, uint32_t cpu_share, uint32_t cpu_sched_flags)
+{
+  struct vc_set_sched vc_sched;
+  struct vc_ctx_flags vc_flags;
+  uint32_t new_flags;
+
+  vc_sched.set_mask = (VC_VXSM_FILL_RATE | VC_VXSM_INTERVAL | VC_VXSM_TOKENS |
+                       VC_VXSM_TOKENS_MIN | VC_VXSM_TOKENS_MAX);
+  vc_sched.fill_rate = cpu_share;  /* tokens accumulated per interval */
+  vc_sched.interval = 1000;  /* milliseconds */
+  vc_sched.tokens = 100;  /* initial allocation of tokens */
+  vc_sched.tokens_min = 50;  /* need this many tokens to run */
+  vc_sched.tokens_max = 100;  /* max accumulated number of tokens */
+
+  VC_SYSCALL(vc_set_sched(ctx, &vc_sched));
+
+  /* get current flag values */
+  VC_SYSCALL(vc_get_cflags(ctx, &vc_flags));
+
+  /* guaranteed CPU corresponds to SCHED_SHARE flag being cleared */
+  new_flags = (cpu_sched_flags & VS_SCHED_CPU_GUARANTEED
+               ? 0
+               : VC_VXF_SCHED_SHARE);
+  if ((vc_flags.flagword & VC_VXF_SCHED_SHARE) != new_flags)
+    {
+      /* touch only the scheduler flags; SCHED_HARD is always kept set */
+      vc_flags.mask = VC_VXF_SCHED_FLAGS;
+      vc_flags.flagword = new_flags | VC_VXF_SCHED_HARD;
+      VC_SYSCALL(vc_set_cflags(ctx, &vc_flags));
+    }
+
+  return 0;
+}
#ifndef _LIB_PLANETLAB_H_
#define _LIB_PLANETLAB_H_
-/*
- * context create
- */
-typedef struct {
- uint32_t cpu_share;
- uint32_t cpu_sched_flags;
- uint64_t mem_limit;
- uint64_t task_limit;
-} rspec_t;
-
-#define VC_VXF_SCHED_SHARE 0x00000800
#define VC_VXF_SCHED_FLAGS (VC_VXF_SCHED_HARD | VC_VXF_SCHED_SHARE)
int
-pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec);
+pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps);
+
+int
+pl_setup_done(xid_t ctx);
int
pl_setsched(xid_t ctx, uint32_t cpu_share, uint32_t cpu_sched_flags);
+/* scheduler flags */
+#define VS_SCHED_CPU_GUARANTEED 1
+
#endif
#define VC_VXF_SCHED_HARD 0x00000100ull
#define VC_VXF_SCHED_PRIO 0x00000200ull
#define VC_VXF_SCHED_PAUSE 0x00000400ull
+#define VC_VXF_SCHED_SHARE 0x00000800ull
#define VC_VXF_VIRT_MEM 0x00010000ull
#define VC_VXF_VIRT_UPTIME 0x00020000ull
#
# GNUMakefile for util-vserver Python bindings
#
-# It's too hard to integrate distutils into the autoconf/libtool
-# framework, so run this Makefile separately from and after the normal
-# util-vserver build.
-#
# Steve Muir <smuir@cs.princeton.edu>
# Mark Huang <mlhuang@cs.princeton.edu>
# Copyright (C) 2005 The Trustees of Princeton University
#
-# $Id: Makefile,v 1.7 2005/08/26 04:00:44 mlhuang Exp $
+# $Id: Makefile,v 1.11 2006/03/01 22:03:38 mlhuang Exp $
#
-ALL := vserverimpl.so vduimpl.so util_vserver_vars.py
+INCLUDES := -I.. -I../lib
+LIBS = -L../lib -lvserver
+
+PY_MODS := vserver.py cpulimit.py bwlimit.py
+PY_EXT_MODS := vduimpl.so vserverimpl.so
+
+LT_LINK = ../libtool --tag=CC --mode=link
+
+
-# need command substitution
-SHELL := /bin/bash
+all: py-build
-pythonlibdir := @libdir@/python@PYTHON_VERSION@/site-packages
+# XXX - compatibility with util-vserver specfile
+INSTALL_ROOT ?= $(DESTDIR)
-all: $(ALL)
+install: py-install
-%.o: %.c
- # builds object and incompletely linked library
- python setup.py build_ext
- # copy to current directory
- cp -a build/temp.*/*.o .
+clean: py-clean
-vserverimpl.so vduimpl.so: %.so: %.o
- # relink the object against libvserver with libtool
- ../libtool --tag=CC --mode=link $(CC) -shared -o $@ $< ../lib/libvserver.la
-util_vserver_vars.py: ../scripts/util-vserver-vars
- # python does not export variables beginning with underscore
- (. $< ; \
- while read var ; do eval echo $$var=\$${$$var} ; done < \
- <(sed -ne "s/\([^=]*\)=.*/\1/p" $<) \
- | sed -e "s/^_*//" -e "s/\([^=]*\)=\(.*\)/\1='\2'/") > $@
-install: $(ALL)
- # install relinked libraries and byte-compiled scripts
- python setup.py install --root="$(DESTDIR)"
- # reinstall libraries with libtool so that the final path
- # to libvserver is resolved
- for so in $(filter %.so, $(ALL)) ; do \
- ../libtool --tag=CC --mode=install install "$$so" `find "$(DESTDIR)" -name "$$so"` ; \
- done
+UTIL_PYTHON ?= $(wildcard ../../util-python*)
-clean:
- rm -rf $(ALL) *.o build
+include $(UTIL_PYTHON)/pybuild.mk
.PHONY: all install clean
--- /dev/null
+#!/usr/bin/python
+
+import bwlimit
+
+if __name__ == '__main__':
+ bwlimit.main()
-#!/bin/env python2 -u
-
-# Based on code written by: Andy Bavier, acb@cs.princeton.edu
+#!/usr/bin/python
#
-# Bandwidth limit script to run on PlanetLab nodes. The intent is to use
-# the Hierarchical Token Bucket queueing discipline of 'tc' to (1) cap
-# the output bandwidth of the node at a specified rate (e.g., 5Mbps) and
-# (2) to allow all vservers to fairly share this rate. For instance,
-# if there are N vservers, then each should get at least 5/N Mbps of
-# bandwidth.
+# Bandwidth limit module for PlanetLab nodes. The intent is to use the
+# Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow
+# slices to fairly share access to available node bandwidth. We
+# currently define three classes of "available node bandwidth":
+#
+# 1. Available hardware bandwidth (bwmax): The maximum rate of the
+# hardware.
+#
+# 2. Available capped bandwidth (bwcap): The maximum rate allowed to
+# non-exempt destinations. By default, equal to bwmax, but may be
+# lowered by PIs.
+#
+# 3. Available uncapped ("exempt") bandwidth: The difference between
+# bwmax and what is currently being used of bwcap, or the maximum rate
+# allowed to destinations exempt from caps (e.g., Internet2).
+#
+# All three classes of bandwidth are fairly shared according to the
+# notion of "shares". For instance, if the node is capped at 5 Mbps,
+# there are N slices, and each slice has 1 share, then each slice
+# should get at least 5/N Mbps of bandwidth. How HTB is implemented
+# makes this statement a little too simplistic. What it really means
+# is that during any single time period, only a certain number of
+# bytes can be sent onto the wire. Each slice is guaranteed that at
+# least some small number of its bytes will be sent. Whatever is left
+# over from the budget, is split in proportion to the number of shares
+# each slice has.
+#
+# Even if the node is not capped at a particular limit (bwcap ==
+# bwmax), this module enforces fair share access to bwmax. Also, if
+# the node is capped at a particular limit, rules may optionally be
+# defined that classify certain packets into the "exempt" class. This
+# class receives whatever bandwidth is leftover between bwcap and
+# bwmax; slices fairly share this bandwidth as well.
+#
+# The root context is exempt from sharing and can send as much as it
+# needs to.
#
# Some relevant URLs:
-# http://lartc.org/howto for how to use tc
-# http://luxik.cdi.cz/~devik/qos/htb/ for info on htb
+#
+# 1. http://lartc.org/howto for how to use tc
+# 2. http://luxik.cdi.cz/~devik/qos/htb/ for info on HTB
+#
+# Andy Bavier <acb@cs.princeton.edu>
+# Mark Huang <mlhuang@cs.princeton.edu>
+# Copyright (C) 2006 The Trustees of Princeton University
+#
+# $Id: bwlimit.py,v 1.10 2006/03/14 22:57:50 smuir Exp $
+#
-import sys, os, re, string
+import sys, os, re, getopt
+from sets import Set
+import pwd
-# Global variables
-TC="/sbin/tc" # Where the modified tc program lives
-OPS = ["change","add"] # Sequence of TC ops we'll try
-# Support to run system commands
-import runcmd
-def run(cmd):
- try:
- runcmd.run(cmd)
- ret = True
- except runcmd.Error, ex:
- ret = False
+# Where the tc binary lives
+TC = "/sbin/tc"
- return ret
+# Default interface
+dev = "eth0"
+
+# Verbosity level
+verbose = 0
+
+# bwmin should be small enough that it can be considered negligibly
+# slow compared to the hardware. 8 bits/second appears to be the
+# smallest value supported by tc.
+bwmin = 8
+
+# bwmax should be large enough that it can be considered at least as
+# fast as the hardware.
+bwmax = 1000*1000*1000
+
+# quantum is the maximum number of bytes that can be borrowed by a
+# share (or slice, if each slice gets 1 share) in one time period
+# (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth
+# above their guarantees, and each is attempting to borrow up to the
+# node bandwidth cap, quantums control how the excess bandwidth is
+# distributed. Slices with 2 shares will borrow twice the amount in
+# one time period as slices with 1 share, so averaged over time, they
+# will get twice as much of the excess bandwidth. The value should be
+# as small as possible and at least 1 MTU. By default, it would be
+# calculated as bwmin/10, but since we use such a small value for
+# bwmin, it's better to just set it to a value safely above 1 Ethernet
+# MTU.
+quantum = 1600
+
+# cburst is the maximum number of bytes that can be burst onto the
+# wire in one time period (with HZ=1000, 1 ms). If multiple slices
+# have data queued for transmission, cbursts control how long each
+# slice can have the wire for. If not specified, it is set to the
+# smallest possible value that would enable the slice's "ceil" rate
+# (usually the node bandwidth cap), to be reached if a slice was able
+# to borrow enough bandwidth to do so. For now, it's unclear how or if
+# to relate this to the notion of shares, so just let tc set the
+# default.
+cburst = None
+
+# There is another parameter that controls how bandwidth is allocated
+# between slices on nodes that is outside the scope of HTB. We enforce
+# a 16 GByte/day total limit on each slice, which works out to about
+# 1.5mbit. If a slice exceeds this byte limit before the day finishes,
+# it is capped at (i.e., its "ceil" rate is set to) the smaller of the
+# node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
+# rule and executes this script to override "ceil".
+
+# We support multiple bandwidth limits, by reserving the top nibble of
+# the minor classid to be the "subclassid". Theoretically, we could
+# support up to 15 subclasses, but for now, we only define two: the
+# "default" subclass 1:10 that is capped at the node bandwidth cap (in
+# this example, 5mbit) and the "exempt" subclass 1:20 that is capped
+# at bwmax (i.e., not capped). The 1:1 parent class exists only to
+# make the borrowing model work. All bandwidth above minimum
+# guarantees is fairly shared (in this example, slice 2 is guaranteed
+# at least 1mbit in addition to fair access to the rest), subject to
+# the restrictions of the class hierarchy: namely, that the total
+# bandwidth to non-exempt destinations should not exceed the node
+# bandwidth cap.
+#
+# 1:
+# |
+# 1:1 (1gbit)
+# ______________|_____________
+# | |
+# 1:10 (8bit, 5mbit) 1:20 (8bit, 1gbit)
+# | |
+# 1:1000 (8bit, 5mbit), 1:2000 (8bit, 1gbit),
+# 1:1001 (8bit, 5mbit), 1:2001 (8bit, 1gbit),
+# 1:1002 (1mbit, 5mbit), 1:2002 (1mbit, 1gbit),
+# ... ...
+# 1:1FFF (8bit, 5mbit) 1:2FFF (8bit, 1gbit)
+#
+default_minor = 0x1000
+exempt_minor = 0x2000
+
+# root_xid is for the root context. The root context is exempt from
+# fair sharing in both the default and exempt subclasses. The root
+# context gets 5 shares by default.
+root_xid = 0x0000
+root_share = 5
+
+# default_xid is for unclassifiable packets. Packets should not be
+# classified here very often. They can be if a slice's HTB classes are
+# deleted before its processes are. Each slice gets 1 share by
+# default.
+default_xid = 0x0FFF
+default_share = 1
+
+# See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
+# warned that older versions of tc interpret "kbps", "mbps", "mbit",
+# and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and
+# "kibit" and that if an older version is installed, all rates will
+# be off by a small fraction.
+suffixes = {
+ "": 1,
+ "bit": 1,
+ "kibit": 1024,
+ "kbit": 1000,
+ "mibit": 1024*1024,
+ "mbit": 1000000,
+ "gibit": 1024*1024*1024,
+ "gbit": 1000000000,
+ "tibit": 1024*1024*1024*1024,
+ "tbit": 1000000000000,
+ "bps": 8,
+ "kibps": 8*1024,
+ "kbps": 8000,
+ "mibps": 8*1024*1024,
+ "mbps": 8000000,
+ "gibps": 8*1024*1024*1024,
+ "gbps": 8000000000,
+ "tibps": 8*1024*1024*1024*1024,
+ "tbps": 8000000000000
+}
-def get_defaults(cap_file="/etc/planetlab/bwcap", default_cap="10mbit"):
- # The maximum output bandwidth, read in from cap_file (if it
- # exists). If cap_file does not exist, use default_cap for
- # bandwidth cap. See also the 'cburst' parameter below.
- cap=default_cap
+
+# Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second
+def get_tc_rate(s):
+ if type(s) == int:
+ return s
+ m = re.match(r"([0-9.]+)(\D*)", s)
+ if m is None:
+ return -1
+ suffix = m.group(2).lower()
+ if suffixes.has_key(suffix):
+ return int(float(m.group(1)) * suffixes[suffix])
+ else:
+ return -1
+
+
+# Prints a tc rate string
+def format_tc_rate(rate):
+ if rate >= 1000000:
+ return "%.0fmbit" % (rate / 1000000.)
+ elif rate >= 1000:
+ return "%.0fkbit" % (rate / 1000.)
+ else:
+ return "%.0fbit" % rate
+
+
+# Parse /etc/planetlab/bwcap (or equivalent)
+def read_bwcap(bwcap_file):
+ bwcap = bwmax
try:
- os.stat(cap_file)
- fp = open(cap_file)
- lines = fp.readlines()
- fp.close()
+ fp = open(bwcap_file, "r")
+ line = fp.readline().strip()
+ if line:
+ bwcap = get_tc_rate(line)
+ except:
+ pass
+ if bwcap == -1:
+ bwcap = bwmax
+ return bwcap
+
+
+# Get current (live) value of bwcap
+def get_bwcap(dev = dev):
+
+ state = tc("-d class show dev %s" % dev)
+ base_re = re.compile(r"class htb 1:10 parent 1:1 .*ceil ([^ ]+) .*")
+ base_classes = filter(None, map(base_re.match, state))
+ if not base_classes:
+ return -1
+ if len(base_classes) > 1:
+ raise Exception, "unable to get current bwcap"
+ return get_tc_rate(base_classes[0].group(1))
+
+
+# Map a slice name or numeric string ("princeton_mlh" or "500") to a
+# slice xid (500), or a slice xid (500) to a slice name ("princeton_mlh").
+def get_slice(xid_or_name):
+
+ if xid_or_name == root_xid:
+ return "root"
+ if xid_or_name == default_xid:
+ return "default"
+ if isinstance(xid_or_name, (int, long)):
try:
- cap=string.strip(lines[0])
- except ValueError, ex:
+ return pwd.getpwuid(xid_or_name).pw_name
+ except KeyError:
pass
- except OSError, ex:
+ else:
+ try:
+ try:
+ return int(xid_or_name)
+ except ValueError:
+ pass
+ return pwd.getpwnam(xid_or_name).pw_uid
+ except KeyError:
+ pass
+
+ return None
+
+
+# Shortcut for running a command.  If input is None, runs cmd in read
+# mode and returns its output as a list of lines on success.  If input
+# is given, writes it to cmd's stdin and returns None.  Returns None on
+# any failure (non-zero exit status or an exception).
+def run(cmd, input = None):
+    try:
+        if verbose:
+            sys.stderr.write("Executing: " + cmd + "\n")
+        if input is None:
+            fileobj = os.popen(cmd, "r")
+            output = fileobj.readlines()
+        else:
+            fileobj = os.popen(cmd, "w")
+            fileobj.write(input)
+            output = None
+        # os.popen().close() returns None only on a clean (zero) exit
+        if fileobj.close() is None:
+            return output
+    except Exception, e:
+        pass
+    return None
+
+
+# Shortcut for running a tc command
+def tc(cmd):
+ return run(TC + " " + cmd)
+
+
+# (Re)initialize the bandwidth limits on this node
+def init(dev, bwcap):
+
+ # load the module used to manage exempt classes
+ run("/sbin/modprobe ip_set_iphash")
+
+ # Delete root qdisc 1: if it exists. This will also automatically
+ # delete any child classes.
+ for line in tc("qdisc show dev %s" % dev):
+ # Search for the root qdisc 1:
+ m = re.match(r"qdisc htb 1:", line)
+ if m is not None:
+ tc("qdisc del dev %s root handle 1:" % dev)
+ break
+
+ # Initialize HTB. The "default" clause specifies that if a packet
+ # fails classification, it should go into the class with handle
+ # 1FFF.
+ tc("qdisc add dev %s root handle 1: htb default %x" % \
+ (dev, default_minor | default_xid))
- # How many bytes a single token bucket is allowed to send at once.
- # Small values (i.e., 3080 = two maximum-sized Ethernet packets)
- # provide better fine-grained fairness. At high rates (e.g.,
- # cap=100mbit) this needs to be raised to allow full throughput.
- cburst=30800
-
- # The 'share' and 'quantum' parameters both influence the actual throughput
- # seen by a particular vserver:
-
- # 'share' is the rate at which tokens fill the bucket, and so is
- # the minimum bandwidth given to the task. I think this just
- # needs to be set to some small value that is the same for all
- # vservers. With the current value and a 5mbit cap, we can
- # support 5000 vservers (5mbit/1kbit = 5000). With values lower
- # than 10kbit, the HTB output (from tc -s -d class dev eth0) looks
- # strange... this needs to be looked into further.
- share="1kbit"
-
- # 'quantum' influences how excess bandwidth (i.e., above the
- # 'share') is distributed to vservers. Apparently, vservers can
- # send additional packets in proportion to their quantums (and not
- # their shares, as one might expect). See:
- # http://luxik.cdi.cz/~devik/qos/htb/manual/userg.htm#sharing
- # The above link states that 'quantum' is automatically
- # calculated for shares above 120kbit. Otherwise it should be
- # set to a small value but at least one MTU, so I set it to one
- # MTU. All vservers are assigned the same quantum and so they
- # should share equally.
- quantum=1540
-
- return cap, cburst, share, quantum
-
-
-def init(eth):
- global TC, OPS
-
- cap, cburst, share, quantum = get_defaults()
- if cap == "-1": return
-
- # Install HTB on $ETH. Specifies that all packets not matching a
- # filter rule go to class with handle 9999. If we don't supply a
- # default class, it sounds like non-matching packets can be sent
- # at an unlimited rate.
- for op in OPS:
- cmd = "%s qdisc %s dev %s root handle 1: htb default 9999" % (TC,op,eth)
- if run(cmd): break
-
- # Add a root class with bwcap capped rate
- for op in OPS:
- cmd = "%s class %s dev %s parent 1: classid 1:1 htb rate %s quantum %d" % \
- (TC, op, eth, cap, quantum)
- if run(cmd): break
-
- # Set up the default class. Packets will fail to match a filter rule
- # and end up here if they are sent by a process with UID < 500.
- for op in OPS:
- cmd = "%s class %s dev %s parent 1:1 classid 1:9999 htb rate %s ceil %s quantum %d cburst %d" % \
- (TC, op, eth, share, cap, quantum, cburst)
- if run(cmd): break
-
- # The next command appears to throttle back processes that are
- # sending faster than the token bucket can support, rather than
- # just dropping their packets.
- for op in OPS:
- cmd = "%s qdisc %s dev %s parent 1:9999 handle 9999 pfifo" % \
- (TC, op, eth)
- if run(cmd): break
-
-def on(xid, eth, bwlimit, cap, minrate, maxrate):
- global TC, OPS
-
- default_cap, default_cburst, default_share, default_quantum = get_defaults()
- quantum = bwlimit * default_quantum
-
- # Set up the per-vserver token bucket
- for op in OPS:
- cmd = "%s class %s dev %s parent 1:1 classid 1:%d htb rate %s ceil %s quantum %d cburst %d" % \
- (TC, op, eth, xid, minrate, cap, quantum, default_cburst)
- if run(cmd): break
-
- # The next command appears to throttle back processes that are
- # sending faster than the token bucket can support, rather than
- # just dropping their packets.
- for op in OPS:
- cmd = "%s qdisc %s dev %s parent 1:%d handle %d pfifo" % \
- (TC, op, eth, xid, xid)
- if run(cmd): break
-
- # Matches packets sent by a vserver to the appropriate token bucket.
- # The raw socket module marks each packet with its vserver id.
- # See: http://lartc.org/howto/lartc.qdisc.filters.html for more
- # info on the filter command.
- cmd = "%s filter del dev %s protocol ip prio %d" % (TC, eth, xid)
- run(cmd)
- cmd = "%s filter add dev %s prio %d parent 1:0 protocol ip handle %d fw flowid 1:%d" % \
- (TC, eth, xid, xid, xid)
- run(cmd)
-
-def off(xid, eth):
- cmd = "%s filter del dev %s protocol ip prio %d" % (TC, eth, xid)
- run(cmd)
-
- cmd = "%s qdisc del dev %s parent 1:%d" % (TC, eth, xid)
- run(cmd)
-
- cmd = "%s class del dev %s classid 1:%d" % (TC, eth, xid)
- run(cmd)
+ # Set up a parent class from which all subclasses borrow.
+ tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
+ (dev, bwmax))
+ # Set up a subclass that represents the node bandwidth cap. We
+ # allow each slice to borrow up to this rate, so it is also
+ # usually the "ceil" rate for each slice.
+ tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \
+ (dev, bwmin, bwcap))
+
+ # Set up a subclass that represents "exemption" from the node
+ # bandwidth cap. Once the node bandwidth cap is reached, bandwidth
+ # to exempt destinations can still be fairly shared up to bwmax.
+ tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \
+ (dev, bwmin, bwmax))
+
+ # Set up the root class (and tell VNET what it is). Packets sent
+ # by root end up here and are capped at the node bandwidth
+ # cap.
+ on(root_xid, dev, share = root_share)
+ file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid))
+
+ # Set up the default class. Packets that fail classification end
+ # up here.
+ on(default_xid, dev, share = default_share)
+
+
+# Get the bandwidth limits for a particular slice xid as a tuple (xid,
+# share, minrate, maxrate), or all classes as a list of tuples.
+def get(xid = None, dev = dev):
+ if xid is None:
+ ret = []
+ else:
+ ret = None
+
+ # class htb 1:1002 parent 1:10 leaf 81b3: prio 1 rate 8bit ceil 5000Kbit burst 1600b cburst 4Kb
+ for line in tc("-d class show dev %s" % dev):
+ # Search for child classes of 1:10
+ m = re.match(r"class htb 1:([0-9a-f]+) parent 1:10", line)
+ if m is None:
+ continue
+
+ # If we are looking for a particular class
+ classid = int(m.group(1), 16) & default_xid
+ if xid is not None and xid != classid:
+ continue
+
+ # Parse share
+ share = 1
+ m = re.search(r"quantum (\d+)", line)
+ if m is not None:
+ share = int(m.group(1)) / quantum
+
+ # Parse minrate
+ minrate = bwmin
+ m = re.search(r"rate (\w+)", line)
+ if m is not None:
+ minrate = get_tc_rate(m.group(1))
+
+ # Parse maxrate
+ maxrate = bwmax
+ m = re.search(r"ceil (\w+)", line)
+ if m is not None:
+ maxrate = get_tc_rate(m.group(1))
+
+ if xid is None:
+ # Return a list of parameters
+ ret.append((classid, share, minrate, maxrate))
+ else:
+ # Return the parameters for this class
+ ret = (classid, share, minrate, maxrate)
+ break
+
+ return ret
+
+
+# Apply specified bandwidth limit to the specified slice xid
+def on(xid, dev = dev, share = None, minrate = None, maxrate = None):
+ # Get defaults from current state if available
+ cap = get(xid, dev)
+ if cap is not None:
+ if share is None:
+ share = cap[1]
+ if minrate is None:
+ minrate = cap[2]
+ if maxrate is None:
+ maxrate = cap[3]
+
+ # Figure out what the current node bandwidth cap is
+ bwcap = bwmax
+ for line in tc("-d class show dev %s" % dev):
+ # Search for 1:10
+ m = re.match(r"class htb 1:10.*ceil (\w+)", line)
+ if m is not None:
+ bwcap = get_tc_rate(m.group(1))
+ break
+
+ # Set defaults
+ if share is None:
+ share = default_share
+ if minrate is None:
+ minrate = bwmin
+ else:
+ minrate = get_tc_rate(minrate)
+ if maxrate is None:
+ maxrate = bwcap
+ else:
+ maxrate = get_tc_rate(maxrate)
+
+ # Sanity checks
+ if maxrate > bwcap:
+ maxrate = bwcap
+ if minrate > maxrate:
+ minrate = maxrate
+
+ # Set up subclasses for the slice
+ tc("class replace dev %s parent 1:10 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
+ (dev, default_minor | xid, minrate, maxrate, share * quantum))
+
+ tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
+ (dev, exempt_minor | xid, minrate, bwmax, share * quantum))
+
+ # Attach a FIFO to each subclass, which helps to throttle back
+ # processes that are sending faster than the token buckets can
+ # support.
+ tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
+ (dev, default_minor | xid, default_minor | xid))
+
+ tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
+ (dev, exempt_minor | xid, exempt_minor | xid))
+
+
+# Remove class associated with specified slice xid. If further packets
+# are seen from this slice, they will be classified into the default
+# class 1:1FFF.
+def off(xid, dev = dev):
+ cap = get(xid, dev)
+ if cap is not None:
+ tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
+ tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
+
+
+def exempt_init(group_name, node_ips):
+
+ # Clean up
+ iptables = "/sbin/iptables -t vnet %s POSTROUTING"
+ run(iptables % "-F")
+ run("/sbin/ipset -X " + group_name)
+
+ # Create a hashed IP set of all of these destinations
+ lines = ["-N %s iphash" % group_name]
+ add_cmd = "-A %s " % group_name
+ lines += [(add_cmd + ip) for ip in node_ips]
+ lines += ["COMMIT"]
+ restore = "\n".join(lines) + "\n"
+ run("/sbin/ipset -R", restore)
+
+ # Add rule to match on destination IP set
+ run((iptables + " -m set --set %s dst -j CLASSIFY --set-class 1:%x") %
+ ("-A", group_name, exempt_minor))
+
+
+def usage():
+ bwcap_description = format_tc_rate(get_bwcap())
+
+ print """
+Usage:
+
+%s [OPTION]... [COMMAND] [ARGUMENT]...
+
+Options:
+ -d device Network interface (default: %s)
+ -r rate Node bandwidth cap (default: %s)
+ -q quantum Share multiplier (default: %d bytes)
+ -h This message
+
+Commands:
+ init
+ (Re)initialize bandwidth caps.
+ on slice [share] [minrate] [maxrate]
+ Set bandwidth cap for the specified slice
+ off slice
+ Remove bandwidth caps for the specified slice
+ get
+ Get all bandwidth caps
+ get slice
+ Get bandwidth caps for the specified slice
+ getcap slice
+ Get maxrate for the specified slice
+ setcap slice maxrate
+ Set maxrate for the specified slice
+""" % (sys.argv[0], dev, bwcap_description, quantum)
+ sys.exit(1)
+
+def main():
+ global dev, quantum, verbose
+
+ # Defaults
+ bwcap = get_bwcap()
+
+ (opts, argv) = getopt.getopt(sys.argv[1:], "f:d:r:g:q:vh")
+ for (opt, optval) in opts:
+ if opt == '-d':
+ dev = optval
+ elif opt == '-r':
+ bwcap = get_tc_rate(optval)
+ elif opt == '-q':
+ quantum = int(optval)
+ elif opt == '-v':
+ verbose += 1
+ elif opt == '-h':
+ usage()
+
+ if len(argv):
+ if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1):
+ # (Re)initialize
+ init(dev, get_tc_rate(bwcap))
+
+ elif argv[0] == "get" or argv[0] == "show":
+ # Show
+ if len(argv) >= 2:
+ # Show a particular slice
+ xid = get_slice(argv[1])
+ if xid is None:
+ sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
+ usage()
+ caps = [get(xid, dev)]
+ else:
+ # Show all slices
+ caps = get(None, dev)
+
+ for (xid, share, minrate, maxrate) in caps:
+ slice = get_slice(xid)
+ if slice is None:
+ # Orphaned (not associated with a slice) class
+ slice = "%d?" % xid
+ print "%s %d %s %s" % \
+ (slice, share, format_tc_rate(minrate), format_tc_rate(maxrate))
+
+ elif len(argv) >= 2:
+ # slice, ...
+ xid = get_slice(argv[1])
+ if xid is None:
+ sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
+ usage()
+
+ if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace":
+ # Enable cap
+ args = []
+ if len(argv) >= 3:
+ # ... share, minrate, maxrate
+ casts = [int, get_tc_rate, get_tc_rate]
+ for i, arg in enumerate(argv[2:]):
+ if i >= len(casts):
+ break
+ args.append(casts[i](arg))
+ on(xid, dev, *args)
+
+ elif argv[0] == "off" or argv[0] == "del":
+ # Disable cap
+ off(xid, dev)
+
+ # Backward compatibility with old resman script
+ elif argv[0] == "getcap":
+ # Get maxrate
+ cap = get(xid, dev)
+ if cap is not None:
+ (xid, share, minrate, maxrate) = cap
+ print format_tc_rate(maxrate)
+
+ # Backward compatibility with old resman script
+ elif argv[0] == "setcap":
+ if len(argv) >= 3:
+ # Set maxrate
+ on(xid, dev, maxrate = get_tc_rate(argv[2]))
+ else:
+ usage()
+
+ else:
+ usage()
+
+ else:
+ usage()
+
+
+if __name__ == '__main__':
+ main()
+++ /dev/null
-#!/usr/bin/python
-#
-# Python distutils script for util-vserver Python bindings
-#
-# Steve Muir <smuir@cs.princeton.edu>
-# Mark Huang <mlhuang@cs.princeton.edu>
-#
-# Copyright (C) 2005 The Trustees of Princeton University
-#
-# $Id$
-#
-
-from distutils.core import setup, Extension
-
-extension_args = {}
-extension_args['extra_compile_args'] = ['-Wall']
-extension_args['include_dirs'] = ['..', '../lib']
-# Link against libvserver with libtool later
-#extension_args['library_dirs'] = ['../lib']
-#extension_args['libraries'] = ['vserver']
-
-modules = ['util_vserver_vars', 'vserver', 'cpulimit', 'bwlimit']
-extensions = [Extension('vduimpl', ['vduimpl.c'], **extension_args),
- Extension('vserverimpl', ['vserverimpl.c'], **extension_args)]
-
-setup(py_modules = modules, ext_modules = extensions)
cwd_fd = open(".", O_RDONLY);
res = vdu_onedir(&tbl, &s, path);
fchdir(cwd_fd);
+ close(cwd_fd);
/* deallocate whatever has been added to tbl */
Dispose(&tbl);
import fcntl
import os
import re
+import pwd
+import signal
import sys
import time
import traceback
import mountimpl
-import passfdimpl
+import runcmd
import utmp
import vserverimpl, vduimpl
import cpulimit, bwlimit
+from vserverimpl import VS_SCHED_CPU_GUARANTEED as SCHED_CPU_GUARANTEED
+from vserverimpl import DLIMIT_INF
+
#
FLAGS_NAMESPACE = 128
-
+
+class NoSuchVServer(Exception): pass
+
+
+
class VServer:
INITSCRIPTS = [('/etc/rc.vinit', 'start'),
('/etc/rc.d/rc', '%(runlevel)d')]
- def __init__(self, name, vm_running = False, resources = {}):
+ def __init__(self, name, vm_id = None, vm_running = False):
self.name = name
self.config_file = "/etc/vservers/%s.conf" % name
self.dir = "%s/%s" % (vserverimpl.VSERVER_BASEDIR, name)
if not (os.path.isdir(self.dir) and
os.access(self.dir, os.R_OK | os.W_OK | os.X_OK)):
- raise Exception, "no such vserver: " + name
- self.config = self.__read_config_file("/etc/vservers.conf")
- self.config.update(self.__read_config_file(self.config_file))
- self.flags = 0
- flags = self.config["S_FLAGS"].split(" ")
- if "lock" in flags:
- self.flags |= FLAGS_LOCK
- if "nproc" in flags:
- self.flags |= FLAGS_NPROC
+ raise NoSuchVServer, "no such vserver: " + name
+ self.config = {}
+ for config_file in ["/etc/vservers.conf", self.config_file]:
+ try:
+ self.config.update(self.__read_config_file(config_file))
+ except IOError, ex:
+ if ex.errno != errno.ENOENT:
+ raise
self.remove_caps = ~vserverimpl.CAP_SAFE;
- self.ctx = int(self.config["S_CONTEXT"])
+ if vm_id == None:
+ vm_id = int(self.config['S_CONTEXT'])
+ self.ctx = vm_id
self.vm_running = vm_running
- self.resources = resources
config_var_re = re.compile(r"^ *([A-Z_]+)=(.*)\n?$", re.MULTILINE)
os.chroot(self.dir)
os.chdir("/")
+ def chroot_call(self, fn, *args):
+
+ cwd_fd = os.open(".", os.O_RDONLY)
+ try:
+ root_fd = os.open("/", os.O_RDONLY)
+ try:
+ self.__do_chroot()
+ result = fn(*args)
+ finally:
+ os.fchdir(root_fd)
+ os.chroot(".")
+ os.fchdir(cwd_fd)
+ os.close(root_fd)
+ finally:
+ os.close(cwd_fd)
+ return result
+
def set_disklimit(self, block_limit):
# block_limit is in kB
- over_limit = False
+ if block_limit == 0:
+ vserverimpl.unsetdlimit(self.dir, self.ctx)
+ return
+
if self.vm_running:
block_usage = vserverimpl.DLIMIT_KEEP
inode_usage = vserverimpl.DLIMIT_KEEP
# init_disk_info() must have been called to get usage values
block_usage = self.disk_blocks
inode_usage = self.disk_inodes
- if block_limit < block_usage:
- over_limit = True
vserverimpl.setdlimit(self.dir,
self.ctx,
vserverimpl.DLIMIT_INF, # inode limit
2) # %age reserved for root
- if over_limit:
- raise Exception, ("%s disk usage (%u blocks) > limit (%u)" %
- (self.name, block_usage, block_limit))
-
def get_disklimit(self):
try:
- blocksused, blocktotal, inodesused, inodestotal, reserved = \
- vserverimpl.getdlimit(self.dir, self.ctx)
+ (self.disk_blocks, block_limit, self.disk_inodes, inode_limit,
+ reserved) = vserverimpl.getdlimit(self.dir, self.ctx)
except OSError, ex:
- if ex.errno == errno.ESRCH:
- # get here if no vserver disk limit has been set for xid
- # set blockused to -1 to indicate no limit
- blocktotal = -1
+ if ex.errno != errno.ESRCH:
+ raise
+ # get here if no vserver disk limit has been set for xid
+ block_limit = -1
- return blocktotal
+ return block_limit
- def set_sched(self, cpu_share):
+ def set_sched_config(self, cpu_share, sched_flags):
+
+ """ Write current CPU scheduler parameters to the vserver
+ configuration file. This method does not modify the kernel CPU
+ scheduling parameters for this context. """
if cpu_share == int(self.config.get("CPULIMIT", -1)):
return
-
- self.__update_config_file(self.config_file, { "CPULIMIT": cpu_share })
+ cpu_guaranteed = sched_flags & SCHED_CPU_GUARANTEED
+ cpu_config = { "CPULIMIT": cpu_share, "CPUGUARANTEED": cpu_guaranteed }
+ self.update_resources(cpu_config)
if self.vm_running:
- vserverimpl.setsched(self.ctx, cpu_share, True)
+ self.set_sched(cpu_share, sched_flags)
+
+ def set_sched(self, cpu_share, sched_flags = 0):
+
+ """ Update kernel CPU scheduling parameters for this context. """
+
+ vserverimpl.setsched(self.ctx, cpu_share, sched_flags)
def get_sched(self):
# have no way of querying scheduler right now on a per vserver basis
ret = vserverimpl.getrlimit(self.ctx,6)
return ret
- def set_bwlimit(self, eth, limit, cap, minrate, maxrate):
- if cap == "-1":
- bwlimit.off(self.ctx,eth)
- else:
- bwlimit.on(self.ctx, eth, limit, cap, minrate, maxrate)
-
- def get_bwlimit(self, eth):
- # not implemented yet
- bwlimit = -1
- cap = "unknown"
- minrate = "unknown"
- maxrate = "unknown"
- return (bwlimit, cap, minrate, maxrate)
-
- def open(self, filename, mode = "r", bufsize = -1):
+ def set_bwlimit(self, maxrate, minrate = 1, share = None, dev = "eth0"):
- (sendsock, recvsock) = passfdimpl.socketpair()
- child_pid = os.fork()
- if child_pid == 0:
- try:
- # child process
- self.__do_chroot()
- f = open(filename, mode)
- passfdimpl.sendmsg(f.fileno(), sendsock)
- os._exit(0)
- except EnvironmentError, ex:
- (result, errmsg) = (ex.errno, ex.strerror)
- except Exception, ex:
- (result, errmsg) = (255, str(ex))
- os.write(sendsock, errmsg)
- os._exit(result)
+ if maxrate != 0:
+ bwlimit.on(self.ctx, dev, share, minrate, maxrate)
+ else:
+ bwlimit.off(self.ctx, dev)
- # parent process
+ def get_bwlimit(self, dev = "eth0"):
- # XXX - need this since a lambda can't raise an exception
- def __throw(ex):
- raise ex
+ result = bwlimit.get(self.ctx)
+ # result of bwlimit.get is (ctx, share, minrate, maxrate)
+ if result:
+ result = result[1:]
+ return result
- os.close(sendsock)
- throw = lambda : __throw(Exception(errmsg))
- while True:
- try:
- (pid, status) = os.waitpid(child_pid, 0)
- if os.WIFEXITED(status):
- result = os.WEXITSTATUS(status)
- if result != 255:
- errmsg = os.strerror(result)
- throw = lambda : __throw(IOError(result, errmsg))
- else:
- errmsg = "unexpected exception in child"
- else:
- result = -1
- errmsg = "child killed"
- break
- except OSError, ex:
- if ex.errno != errno.EINTR:
- os.close(recvsock)
- raise ex
- fcntl.fcntl(recvsock, fcntl.F_SETFL, os.O_NONBLOCK)
- try:
- (fd, errmsg) = passfdimpl.recvmsg(recvsock)
- except OSError, ex:
- if ex.errno != errno.EAGAIN:
- throw = lambda : __throw(ex)
- fd = 0
- os.close(recvsock)
- if not fd:
- throw()
+ def open(self, filename, mode = "r", bufsize = -1):
- return os.fdopen(fd, mode, bufsize)
+ return self.chroot_call(open, filename, mode, bufsize)
def __do_chcontext(self, state_file):
- vserverimpl.chcontext(self.ctx, self.resources)
+ if state_file:
+ print >>state_file, "S_CONTEXT=%u" % self.ctx
+ print >>state_file, "S_PROFILE="
+ state_file.close()
- if not state_file:
- return
- print >>state_file, "S_CONTEXT=%d" % self.ctx
- print >>state_file, "S_PROFILE=%s" % self.config.get("S_PROFILE", "")
- state_file.close()
+ if vserverimpl.chcontext(self.ctx):
+ self.set_resources()
+ vserverimpl.setup_done(self.ctx)
def __prep(self, runlevel, log):
self.__do_chroot()
log = open("/var/log/boot.log", "w", 0)
os.dup2(1, 2)
- # XXX - close all other fds
print >>log, ("%s: starting the virtual server %s" %
(time.asctime(time.gmtime()), self.name))
# parent process
return child_pid
+ def set_resources(self):
+
+ """ Called when vserver context is entered for first time,
+ should be overridden by subclass. """
+
+ pass
+
def update_resources(self, resources):
self.config.update(resources)
(self.disk_inodes, self.disk_blocks, size) = vduimpl.vdu(self.dir)
return size
+
+ def stop(self, signal = signal.SIGKILL):
+
+ vserverimpl.killall(self.ctx, signal)
+ self.vm_running = False
+
+
+
+def create(vm_name, static = False, ctor = VServer):
+
+ options = []
+ if static:
+ options += ['--static']
+ runcmd.run('vuseradd', options + [vm_name])
+ vm_id = pwd.getpwnam(vm_name)[2]
+
+ return ctor(vm_name, vm_id)
#include "vserver.h"
#include "vserver-internal.h"
+#define NONE ({ Py_INCREF(Py_None); Py_None; })
+
/*
* context create
*/
static PyObject *
vserver_chcontext(PyObject *self, PyObject *args)
{
+ int result;
xid_t ctx;
uint32_t flags = 0;
uint32_t bcaps = ~vc_get_insecurebcaps();
- rspec_t rspec = { 32, VC_VXF_SCHED_FLAGS, -1, -1 };
- PyObject *resources;
- PyObject *cpu_share;
- if (!PyArg_ParseTuple(args, "IO!|K", &ctx, &PyDict_Type, &resources, &flags))
+ if (!PyArg_ParseTuple(args, "I|K", &ctx, &flags))
+ return NULL;
+
+ if ((result = pl_chcontext(ctx, flags, bcaps)) < 0)
+ return PyErr_SetFromErrno(PyExc_OSError);
+
+ return PyBool_FromLong(result);
+}
+
+static PyObject *
+vserver_setup_done(PyObject *self, PyObject *args)
+{
+ xid_t ctx;
+
+ if (!PyArg_ParseTuple(args, "I", &ctx))
return NULL;
- if ((cpu_share = PyMapping_GetItemString(resources, "nm_cpu_share")) &&
- (cpu_share = PyNumber_Int(cpu_share)))
- rspec.cpu_share = PyInt_AsLong(cpu_share);
- if (pl_chcontext(ctx, flags, bcaps, &rspec))
- PyErr_SetFromErrno(PyExc_OSError);
+ if (pl_setup_done(ctx) < 0)
+ return PyErr_SetFromErrno(PyExc_OSError);
- return Py_None;
+ return NONE;
}
static PyObject *
if (!PyArg_ParseTuple(args, "IiL", &xid, &resource, &limits.hard))
return NULL;
- ret = Py_None;
if (vc_set_rlimit(xid, resource, &limits))
ret = PyErr_SetFromErrno(PyExc_OSError);
else if (vc_get_rlimit(xid, resource, &limits)==-1)
if (!PyArg_ParseTuple(args, "Ii", &xid, &resource))
return NULL;
- ret = Py_None;
if (vc_get_rlimit(xid, resource, &limits)==-1)
ret = PyErr_SetFromErrno(PyExc_OSError);
else
return ret;
}
-#if 0
/*
* setsched
*/
static PyObject *
vserver_setsched(PyObject *self, PyObject *args)
{
- xid_t xid;
- struct vc_set_sched sched;
- struct vc_ctx_flags flags;
- unsigned cpuguaranteed = 0;
-
- sched.set_mask = (VC_VXSM_FILL_RATE |
- VC_VXSM_INTERVAL |
- VC_VXSM_TOKENS_MIN |
- VC_VXSM_TOKENS_MAX);
-
- if (!PyArg_ParseTuple(args, "I|I|I|I|I|I|I", &xid,
- &sched.fill_rate,
- &sched.interval,
- &sched.tokens,
- &sched.tokens_min,
- &sched.tokens_max,
- &cpuguaranteed))
- return NULL;
-
- flags.flagword = VC_VXF_SCHED_HARD;
- flags.mask |= VC_VXF_SCHED_HARD;
-#define VC_VXF_SCHED_SHARE 0x00000800ull
- if (cpuguaranteed==0) {
- flags.flagword |= VC_VXF_SCHED_SHARE;
- flags.mask |= VC_VXF_SCHED_SHARE;
- }
+ xid_t ctx;
+ uint32_t cpu_share;
+ uint32_t cpu_sched_flags = VC_VXF_SCHED_FLAGS;
- if (vc_set_cflags(xid, &flags) == -1)
- return PyErr_SetFromErrno(PyExc_OSError);
+ if (!PyArg_ParseTuple(args, "II|I", &ctx, &cpu_share, &cpu_sched_flags))
+ return NULL;
- if (vc_set_sched(xid, &sched) == -1)
- return PyErr_SetFromErrno(PyExc_OSError);
+ /* ESRCH indicates that there are no processes in the context */
+ if (pl_setsched(ctx, cpu_share, cpu_sched_flags) &&
+ errno != ESRCH)
+ return PyErr_SetFromErrno(PyExc_OSError);
- return Py_None;
+ return NONE;
}
-/*
- * setsched
- */
-#endif
-
static PyObject *
vserver_get_dlimit(PyObject *self, PyObject *args)
{
vserver(VCMD_set_dlimit, xid, &data))
return PyErr_SetFromErrno(PyExc_OSError);
- return Py_None;
+ return NONE;
+}
+
+static PyObject *
+vserver_unset_dlimit(PyObject *self, PyObject *args)
+{
+ char *path;
+ unsigned xid;
+ struct vcmd_ctx_dlimit_base_v0 init;
+
+ if (!PyArg_ParseTuple(args, "si", &path, &xid))
+ return NULL;
+
+ memset(&init, 0, sizeof(init));
+ init.name = path;
+ init.flags = 0;
+
+ if (vserver(VCMD_rem_dlimit, xid, &init) && errno != ESRCH)
+ return PyErr_SetFromErrno(PyExc_OSError);
+
+ return NONE;
+}
+
+static PyObject *
+vserver_killall(PyObject *self, PyObject *args)
+{
+ xid_t ctx;
+ int sig;
+
+ if (!PyArg_ParseTuple(args, "Ii", &ctx, &sig))
+ return NULL;
+
+ if (vc_ctx_kill(ctx, 0, sig) && errno != ESRCH)
+ return PyErr_SetFromErrno(PyExc_OSError);
+
+ return NONE;
}
static PyMethodDef methods[] = {
{ "chcontext", vserver_chcontext, METH_VARARGS,
"chcontext to vserver with provided flags" },
-#if 0
+ { "setup_done", vserver_setup_done, METH_VARARGS,
+ "Release vserver setup lock" },
{ "setsched", vserver_setsched, METH_VARARGS,
"Change vserver scheduling attributes for given vserver context" },
-#endif
{ "setdlimit", vserver_set_dlimit, METH_VARARGS,
"Set disk limits for given vserver context" },
+ { "unsetdlimit", vserver_unset_dlimit, METH_VARARGS,
+ "Remove disk limits for given vserver context" },
{ "getdlimit", vserver_get_dlimit, METH_VARARGS,
"Get disk limits for given vserver context" },
{ "setrlimit", vserver_set_rlimit, METH_VARARGS,
"Set resource limits for given resource of a vserver context" },
{ "getrlimit", vserver_get_rlimit, METH_VARARGS,
"Get resource limits for given resource of a vserver context" },
+ { "killall", vserver_killall, METH_VARARGS,
+ "Send signal to all processes in vserver context" },
{ NULL, NULL, 0, NULL }
};
/* export limit-related constants */
PyModule_AddIntConstant(mod, "DLIMIT_KEEP", (int)CDLIM_KEEP);
PyModule_AddIntConstant(mod, "DLIMIT_INF", (int)CDLIM_INFINITY);
+
+ /* scheduler flags */
+ PyModule_AddIntConstant(mod,
+ "VS_SCHED_CPU_GUARANTEED",
+ VS_SCHED_CPU_GUARANTEED);
}
#
# Copyright (c) 2004 The Trustees of Princeton University (Trustees).
#
-# $Id: vuseradd,v 1.23 2005/08/21 21:41:03 mlhuang Exp $
+# $Id: vuseradd,v 1.24 2005/09/02 20:00:39 mlhuang Exp $
#
: ${UTIL_VSERVER_VARS:=/usr/lib/util-vserver/util-vserver-vars}
echo "$NAME ALL=(ALL) ALL" >> "$__DEFAULT_VSERVERDIR/$NAME/etc/sudoers"
fi
fi
-
-# turn resource management on for vserver $NAME
-chkconfig resman && service resman start $NAME
-# XXX - resman doesn't print a trailing newline
-echo
exit(1);
}
#else
- rspec_t rspec;
+ int ctx_is_new;
unsigned long long cpu = VC_LIM_KEEP;
unsigned long long mem = VC_LIM_KEEP;
unsigned long long task = VC_LIM_KEEP;
{0,0}};
get_limits(context,list);
+
+ /* check whether the slice has been disabled */
+ if (!cpu)
+ {
+ fprintf(stderr, "*** this slice has been suspended ***\n");
+ exit(0);
+ }
+
(void) (sandbox_chroot(ctx));
- rspec.cpu_share = cpu;
- rspec.cpu_sched_flags = (VC_VXF_SCHED_HARD |
- (cpuguaranteed ? 0 : VC_VXF_SCHED_SHARE));
- rspec.mem_limit = mem;
- rspec.task_limit = task;
- if (pl_chcontext(ctx, 0, ~vc_get_insecurebcaps(), &rspec))
+ if ((ctx_is_new = pl_chcontext(ctx, 0, ~vc_get_insecurebcaps())) < 0)
{
PERROR("pl_chcontext(%u)", ctx);
exit(1);
}
+ if (ctx_is_new)
+ {
+ /* set resources */
+ struct vc_rlimit limits;
+
+ limits.min = VC_LIM_KEEP;
+ limits.soft = VC_LIM_KEEP;
+ limits.hard = mem;
+ if (vc_set_rlimit(ctx, RLIMIT_RSS, &limits))
+ {
+ PERROR("pl_setrlimit(%u, RLIMIT_RSS)", ctx);
+ exit(1);
+ }
+ limits.hard = task;
+ if (vc_set_rlimit(ctx, RLIMIT_NPROC, &limits))
+ {
+ PERROR("pl_setrlimit(%u, RLIMIT_NPROC)", ctx);
+ exit(1);
+ }
+ cpuguaranteed &= VS_SCHED_CPU_GUARANTEED;
+ if (pl_setsched(ctx, cpu, cpuguaranteed) < 0)
+ {
+ PERROR("pl_setsched(%u)", ctx);
+ exit(1);
+ }
+ pl_setup_done(ctx);
+ }
#endif
return 0;
}
%define name util-vserver
%define version 0.30.208
-%define release 3.planetlab%{?date:.%{date}}
+%define release 10%{?pldistro:.%{pldistro}}%{?date:.%{date}}
%define _without_dietlibc 1
%define _without_xalan 1
+# don't build debuginfo RPM
+%define debug_package %{nil}
+
Vendor: PlanetLab
Packager: PlanetLab Central <support@planet-lab.org>
Distribution: PlanetLab 3.0
%changelog
+* Fri Feb 17 2006 Steve Muir <smuir@cs.princeton.edu>
+- add support for setting guaranteed CPU share flag in rspec
+
+* Fri Jan 13 2006 Steve Muir <smuir@cs.princeton.edu>
+- fix bug in python/vserverimpl.c where attempting to adjust CPU share
+ for a context that didn't exist would cause an error (it should be a
+ safe no-op)
+
+* Fri Dec 2 2005 Steve Muir <smuir@cs.princeton.edu>
+- fix bugs in python/vserverimpl.c where exceptions were not raised when
+ they should be and thus occurred later at unexpected times
+- add support for stopping a vserver
+
+* Wed Nov 9 2005 Steve Muir <smuir@cs.princeton.edu>
+- add support for removing resource limits e.g., when a slice is deleted
+
+* Mon Nov 7 2005 Steve Muir <smuir@cs.princeton.edu>
+- fix file descriptor leak in vduimpl
+- clean up handling of network parameters
+- don't rely upon /etc/vservers/foo.conf to initialise vserver object
+
+* Wed Nov 2 2005 Steve Muir <smuir@cs.princeton.edu>
+- fix Python modules to handle scheduling parameters correctly
+
* Fri Oct 28 2005 Steve Muir <smuir@cs.princeton.edu>
- raise exception about being over disk limit after setting usage values
%define name @PACKAGE@
%define version @VERSION@
-%define release 1.planetlab%{?date:.%{date}}
+%define release 10%{?pldistro:.%{pldistro}}%{?date:.%{date}}
%define _without_dietlibc 1
%define _without_xalan 1
+# don't build debuginfo RPM
+%define debug_package %{nil}
+
Vendor: PlanetLab
Packager: PlanetLab Central <support@planet-lab.org>
Distribution: PlanetLab 3.0
%changelog
+* Fri Feb 17 2006 Steve Muir <smuir@cs.princeton.edu>
+- add support for setting guaranteed CPU share flag in rspec
+
+* Fri Jan 13 2006 Steve Muir <smuir@cs.princeton.edu>
+- fix bug in python/vserverimpl.c where attempting to adjust CPU share
+ for a context that didn't exist would cause an error (it should be a
+ safe no-op)
+
+* Fri Dec 2 2005 Steve Muir <smuir@cs.princeton.edu>
+- fix bugs in python/vserverimpl.c where exceptions were not raised when
+ they should be and thus occurred later at unexpected times
+- add support for stopping a vserver
+
+* Wed Nov 9 2005 Steve Muir <smuir@cs.princeton.edu>
+- add support for removing resource limits e.g., when a slice is deleted
+
+* Mon Nov 7 2005 Steve Muir <smuir@cs.princeton.edu>
+- fix file descriptor leak in vduimpl
+- clean up handling of network parameters
+- don't rely upon /etc/vservers/foo.conf to initialise vserver object
+
+* Wed Nov 2 2005 Steve Muir <smuir@cs.princeton.edu>
+- fix Python modules to handle scheduling parameters correctly
+
+* Fri Oct 28 2005 Steve Muir <smuir@cs.princeton.edu>
+- raise exception about being over disk limit after setting usage values
+
+* Fri Oct 7 2005 Steve Muir <smuir@cs.princeton.edu>
+- create common function to be used for entering a vserver and applying
+ resource limits
+
* Thu Aug 21 2005 Mark Huang <mlhuang@cs.princeton.edu>
- restore build of python modules