From: Planet-Lab Support Date: Mon, 1 May 2006 14:16:35 +0000 (+0000) Subject: This commit was manufactured by cvs2svn to create branch X-Git-Tag: planetlab-3_3-branch-point~1 X-Git-Url: http://git.onelab.eu/?p=util-vserver.git;a=commitdiff_plain;h=55b0a09abca499f96094b45be79692ae540a5d3c This commit was manufactured by cvs2svn to create branch 'planetlab-3_3-branch'. --- diff --git a/.cvsignore b/.cvsignore new file mode 100644 index 0000000..93b0d06 --- /dev/null +++ b/.cvsignore @@ -0,0 +1,14 @@ +.X_usr_local_etc-up-to-date +.deps +.fixups +.pathconfig.h.pathsubst.stamp +FEATURES.txt +Makefile +autom4te.cache +config.cache +config.h +config.log +config.status +libtool +pathconfig.h +stamp-h1 diff --git a/contrib/.cvsignore b/contrib/.cvsignore new file mode 100644 index 0000000..55b480f --- /dev/null +++ b/contrib/.cvsignore @@ -0,0 +1,2 @@ +.manifest.dat.pathsubst.stamp +manifest.dat diff --git a/lib/.cvsignore b/lib/.cvsignore new file mode 100644 index 0000000..6a9f2c5 --- /dev/null +++ b/lib/.cvsignore @@ -0,0 +1,6 @@ +.deps +.dirstamp +.libs +*.lo +libvserver.la +util-vserver.pc diff --git a/lib/cflags-v13.c b/lib/cflags-v13.c index 246d1d4..0af2d04 100644 --- a/lib/cflags-v13.c +++ b/lib/cflags-v13.c @@ -42,6 +42,7 @@ static struct Mapping_uint64 const VALUES[] = { DECL("sched_hard", VC_VXF_SCHED_HARD), DECL("sched_prio", VC_VXF_SCHED_PRIO), DECL("sched_pause", VC_VXF_SCHED_PAUSE), + DECL("sched_share", VC_VXF_SCHED_SHARE), DECL("virt_mem", VC_VXF_VIRT_MEM), DECL("virt_uptime", VC_VXF_VIRT_UPTIME), diff --git a/lib/planetlab.c b/lib/planetlab.c index 034c249..4d85fb9 100644 --- a/lib/planetlab.c +++ b/lib/planetlab.c @@ -33,6 +33,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include @@ -43,18 +44,20 @@ POSSIBILITY OF SUCH DAMAGE. #include "vserver.h" static int -create_context(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec) +create_context(xid_t ctx, uint32_t flags, uint64_t bcaps) { struct vc_ctx_caps vc_caps; - struct vc_ctx_flags vc_flags; - struct vc_set_sched vc_sched; - struct vc_rlimit vc_rlimit; - /* create context info */ + /* + * Create context info - this sets the STATE_SETUP and STATE_INIT flags. + * Don't ever clear the STATE_INIT flag, that makes us the init task. + * + * XXX - the kernel code allows initial flags to be passed as an arg. + */ if (vc_ctx_create(ctx) == VC_NOCTX) return -1; - /* set capabilities - these don't take effect until SETUP flags is unset */ + /* set capabilities - these don't take effect until SETUP flag is unset */ vc_caps.bcaps = bcaps; vc_caps.bmask = ~0ULL; /* currently unused */ vc_caps.ccaps = 0; /* don't want any of these */ @@ -62,53 +65,44 @@ create_context(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec) if (vc_set_ccaps(ctx, &vc_caps)) return -1; - /* ignore all flags except SETUP and scheduler flags */ - vc_flags.mask = VC_VXF_STATE_SETUP | VC_VXF_SCHED_FLAGS; - /* don't let user change scheduler flags */ - vc_flags.flagword = flags & ~VC_VXF_SCHED_FLAGS; /* SETUP not set */ + /* set default scheduling parameters */ + pl_setsched(ctx, 1, 0); - /* set scheduler parameters */ - vc_flags.flagword |= rspec->cpu_sched_flags; - vc_sched.set_mask = (VC_VXSM_FILL_RATE | VC_VXSM_INTERVAL | VC_VXSM_TOKENS | - VC_VXSM_TOKENS_MIN | VC_VXSM_TOKENS_MAX); - vc_sched.fill_rate = rspec->cpu_share; /* tokens accumulated per interval */ - vc_sched.interval = 1000; /* milliseconds */ - vc_sched.tokens = 100; /* initial allocation of tokens */ - vc_sched.tokens_min = 50; /* need this many tokens to run */ - vc_sched.tokens_max = 100; /* max accumulated number of tokens */ - if (vc_set_sched(ctx, &vc_sched)) - return -1; - - /* set resource limits */ - vc_rlimit.min = VC_LIM_KEEP; - vc_rlimit.soft = VC_LIM_KEEP; - vc_rlimit.hard = rspec->mem_limit; - if (vc_set_rlimit(ctx, RLIMIT_RSS, &vc_rlimit)) - return -1; + return 0; +} - /* assume min and soft unchanged by set_rlimit */ - vc_rlimit.hard = rspec->task_limit; - if (vc_set_rlimit(ctx, RLIMIT_NPROC, &vc_rlimit)) - return -1; +int +pl_setup_done(xid_t ctx) +{ + struct vc_ctx_flags vc_flags; - /* set flags, unset SETUP flag - this allows other processes to migrate */ + /* unset SETUP flag - this allows other processes to migrate */ + vc_flags.mask = VC_VXF_STATE_SETUP; + vc_flags.flagword = 0; if (vc_set_cflags(ctx, &vc_flags)) return -1; return 0; } +#define RETRY_LIMIT 10 + int -pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec) +pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps) { + int retry_count = 0; + for (;;) { struct vc_ctx_flags vc_flags; if (vc_get_cflags(ctx, &vc_flags)) { + if (errno != ESRCH) + return -1; + /* context doesn't exist - create it */ - if (create_context(ctx, flags, bcaps, rspec)) + if (create_context(ctx, flags, bcaps)) { if (errno == EEXIST) /* another process beat us in a race */ @@ -120,13 +114,18 @@ pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec) } /* created context and migrated to it i.e., we're done */ - break; + return 1; } /* check the SETUP flag */ if (vc_flags.flagword & VC_VXF_STATE_SETUP) { /* context is still being setup - wait a while then retry */ + if (retry_count++ >= RETRY_LIMIT) + { + errno = EBUSY; + return -1; + } sleep(1); continue; } @@ -141,3 +140,46 @@ pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec) return 0; } + +/* it's okay for a syscall to fail because the context doesn't exist */ +#define VC_SYSCALL(x) \ +do \ +{ \ + if (x) \ + return errno == ESRCH ? 0 : -1; \ +} \ +while (0) + +int +pl_setsched(xid_t ctx, uint32_t cpu_share, uint32_t cpu_sched_flags) +{ + struct vc_set_sched vc_sched; + struct vc_ctx_flags vc_flags; + uint32_t new_flags; + + vc_sched.set_mask = (VC_VXSM_FILL_RATE | VC_VXSM_INTERVAL | VC_VXSM_TOKENS | + VC_VXSM_TOKENS_MIN | VC_VXSM_TOKENS_MAX); + vc_sched.fill_rate = cpu_share; /* tokens accumulated per interval */ + vc_sched.interval = 1000; /* milliseconds */ + vc_sched.tokens = 100; /* initial allocation of tokens */ + vc_sched.tokens_min = 50; /* need this many tokens to run */ + vc_sched.tokens_max = 100; /* max accumulated number of tokens */ + + VC_SYSCALL(vc_set_sched(ctx, &vc_sched)); + + /* get current flag values */ + VC_SYSCALL(vc_get_cflags(ctx, &vc_flags)); + + /* guaranteed CPU corresponds to SCHED_SHARE flag being cleared */ + new_flags = (cpu_sched_flags & VS_SCHED_CPU_GUARANTEED + ? 0 + : VC_VXF_SCHED_SHARE); + if ((vc_flags.flagword & VC_VXF_SCHED_SHARE) != new_flags) + { + vc_flags.mask = VC_VXF_SCHED_FLAGS; + vc_flags.flagword = new_flags | VC_VXF_SCHED_HARD; + VC_SYSCALL(vc_set_cflags(ctx, &vc_flags)); + } + + return 0; +} diff --git a/lib/planetlab.h b/lib/planetlab.h index 34a9b91..e4d6ae4 100644 --- a/lib/planetlab.h +++ b/lib/planetlab.h @@ -34,23 +34,18 @@ POSSIBILITY OF SUCH DAMAGE. #ifndef _LIB_PLANETLAB_H_ #define _LIB_PLANETLAB_H_ -/* - * context create - */ -typedef struct { - uint32_t cpu_share; - uint32_t cpu_sched_flags; - uint64_t mem_limit; - uint64_t task_limit; -} rspec_t; - -#define VC_VXF_SCHED_SHARE 0x00000800 #define VC_VXF_SCHED_FLAGS (VC_VXF_SCHED_HARD | VC_VXF_SCHED_SHARE) int -pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps, const rspec_t *rspec); +pl_chcontext(xid_t ctx, uint32_t flags, uint64_t bcaps); + +int +pl_setup_done(xid_t ctx); int pl_setsched(xid_t ctx, uint32_t cpu_share, uint32_t cpu_sched_flags); +/* scheduler flags */ +#define VS_SCHED_CPU_GUARANTEED 1 + #endif diff --git a/lib/vserver.h b/lib/vserver.h index 0b1aa8c..f082f0d 100644 --- a/lib/vserver.h +++ b/lib/vserver.h @@ -169,6 +169,7 @@ #define VC_VXF_SCHED_HARD 0x00000100ull #define VC_VXF_SCHED_PRIO 0x00000200ull #define VC_VXF_SCHED_PAUSE 0x00000400ull +#define VC_VXF_SCHED_SHARE 0x00000800ull #define VC_VXF_VIRT_MEM 0x00010000ull #define VC_VXF_VIRT_UPTIME 0x00020000ull diff --git a/python/Makefile b/python/Makefile index 7ae69d7..5007d34 100644 --- a/python/Makefile +++ b/python/Makefile @@ -1,53 +1,36 @@ # # GNUMakefile for util-vserver Python bindings # -# It's too hard to integrate distutils into the autoconf/libtool -# framework, so run this Makefile separately from and after the normal -# util-vserver build. -# # Steve Muir # Mark Huang # Copyright (C) 2005 The Trustees of Princeton University # -# $Id: Makefile,v 1.7 2005/08/26 04:00:44 mlhuang Exp $ +# $Id: Makefile,v 1.11 2006/03/01 22:03:38 mlhuang Exp $ # -ALL := vserverimpl.so vduimpl.so util_vserver_vars.py +INCLUDES := -I.. -I../lib +LIBS = -L../lib -lvserver + +PY_MODS := vserver.py cpulimit.py bwlimit.py +PY_EXT_MODS := vduimpl.so vserverimpl.so + +LT_LINK = ../libtool --tag=CC --mode=link + + -# need command substitution -SHELL := /bin/bash +all: py-build -pythonlibdir := @libdir@/python@PYTHON_VERSION@/site-packages +# XXX - compatibility with util-vserver specfile +INSTALL_ROOT ?= $(DESTDIR) -all: $(ALL) +install: py-install -%.o: %.c - # builds object and incompletely linked library - python setup.py build_ext - # copy to current directory - cp -a build/temp.*/*.o . +clean: py-clean -vserverimpl.so vduimpl.so: %.so: %.o - # relink the object against libvserver with libtool - ../libtool --tag=CC --mode=link $(CC) -shared -o $@ $< ../lib/libvserver.la -util_vserver_vars.py: ../scripts/util-vserver-vars - # python does not export variables beginning with underscore - (. $< ; \ - while read var ; do eval echo $$var=\$${$$var} ; done < \ - <(sed -ne "s/\([^=]*\)=.*/\1/p" $<) \ - | sed -e "s/^_*//" -e "s/\([^=]*\)=\(.*\)/\1='\2'/") > $@ -install: $(ALL) - # install relinked libraries and byte-compiled scripts - python setup.py install --root="$(DESTDIR)" - # reinstall libraries with libtool so that the final path - # to libvserver is resolved - for so in $(filter %.so, $(ALL)) ; do \ - ../libtool --tag=CC --mode=install install "$$so" `find "$(DESTDIR)" -name "$$so"` ; \ - done +UTIL_PYTHON ?= $(wildcard ../../util-python*) -clean: - rm -rf $(ALL) *.o build +include $(UTIL_PYTHON)/pybuild.mk .PHONY: all install clean diff --git a/python/bwlimit b/python/bwlimit new file mode 100755 index 0000000..0630377 --- /dev/null +++ b/python/bwlimit @@ -0,0 +1,6 @@ +#!/usr/bin/python + +import bwlimit + +if __name__ == '__main__': + bwlimit.main() diff --git a/python/bwlimit.py b/python/bwlimit.py index b408caa..fbe825f 100644 --- a/python/bwlimit.py +++ b/python/bwlimit.py @@ -1,158 +1,588 @@ -#!/bin/env python2 -u - -# Based on code written by: Andy Bavier, acb@cs.princeton.edu +#!/usr/bin/python # -# Bandwidth limit script to run on PlanetLab nodes. The intent is to use -# the Hierarchical Token Bucket queueing discipline of 'tc' to (1) cap -# the output bandwidth of the node at a specified rate (e.g., 5Mbps) and -# (2) to allow all vservers to fairly share this rate. For instance, -# if there are N vservers, then each should get at least 5/N Mbps of -# bandwidth. +# Bandwidth limit module for PlanetLab nodes. The intent is to use the +# Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow +# slices to fairly share access to available node bandwidth. We +# currently define three classes of "available node bandwidth": +# +# 1. Available hardware bandwidth (bwmax): The maximum rate of the +# hardware. +# +# 2. Available capped bandwidth (bwcap): The maximum rate allowed to +# non-exempt destinations. By default, equal to bwmax, but may be +# lowered by PIs. +# +# 3. Available uncapped ("exempt") bandwidth: The difference between +# bwmax and what is currently being used of bwcap, or the maximum rate +# allowed to destinations exempt from caps (e.g., Internet2). +# +# All three classes of bandwidth are fairly shared according to the +# notion of "shares". For instance, if the node is capped at 5 Mbps, +# there are N slices, and each slice has 1 share, then each slice +# should get at least 5/N Mbps of bandwidth. How HTB is implemented +# makes this statement a little too simplistic. What it really means +# is that during any single time period, only a certain number of +# bytes can be sent onto the wire. Each slice is guaranteed that at +# least some small number of its bytes will be sent. Whatever is left +# over from the budget, is split in proportion to the number of shares +# each slice has. +# +# Even if the node is not capped at a particular limit (bwcap == +# bwmax), this module enforces fair share access to bwmax. Also, if +# the node is capped at a particular limit, rules may optionally be +# defined that classify certain packets into the "exempt" class. This +# class receives whatever bandwidth is leftover between bwcap and +# bwmax; slices fairly share this bandwidth as well. +# +# The root context is exempt from sharing and can send as much as it +# needs to. # # Some relevant URLs: -# http://lartc.org/howto for how to use tc -# http://luxik.cdi.cz/~devik/qos/htb/ for info on htb +# +# 1. http://lartc.org/howto for how to use tc +# 2. http://luxik.cdi.cz/~devik/qos/htb/ for info on HTB +# +# Andy Bavier +# Mark Huang +# Copyright (C) 2006 The Trustees of Princeton University +# +# $Id: bwlimit.py,v 1.10 2006/03/14 22:57:50 smuir Exp $ +# -import sys, os, re, string +import sys, os, re, getopt +from sets import Set +import pwd -# Global variables -TC="/sbin/tc" # Where the modified tc program lives -OPS = ["change","add"] # Sequence of TC ops we'll try -# Support to run system commands -import runcmd -def run(cmd): - try: - runcmd.run(cmd) - ret = True - except runcmd.Error, ex: - ret = False +# Where the tc binary lives +TC = "/sbin/tc" - return ret +# Default interface +dev = "eth0" + +# Verbosity level +verbose = 0 + +# bwmin should be small enough that it can be considered negligibly +# slow compared to the hardware. 8 bits/second appears to be the +# smallest value supported by tc. +bwmin = 8 + +# bwmax should be large enough that it can be considered at least as +# fast as the hardware. +bwmax = 1000*1000*1000 + +# quantum is the maximum number of bytes that can be borrowed by a +# share (or slice, if each slice gets 1 share) in one time period +# (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth +# above their guarantees, and each is attempting to borrow up to the +# node bandwidth cap, quantums control how the excess bandwidth is +# distributed. Slices with 2 shares will borrow twice the amount in +# one time period as slices with 1 share, so averaged over time, they +# will get twice as much of the excess bandwidth. The value should be +# as small as possible and at least 1 MTU. By default, it would be +# calculated as bwmin/10, but since we use such small a value for +# bwmin, it's better to just set it to a value safely above 1 Ethernet +# MTU. +quantum = 1600 + +# cburst is the maximum number of bytes that can be burst onto the +# wire in one time period (with HZ=1000, 1 ms). If multiple slices +# have data queued for transmission, cbursts control how long each +# slice can have the wire for. If not specified, it is set to the +# smallest possible value that would enable the slice's "ceil" rate +# (usually the node bandwidth cap), to be reached if a slice was able +# to borrow enough bandwidth to do so. For now, it's unclear how or if +# to relate this to the notion of shares, so just let tc set the +# default. +cburst = None + +# There is another parameter that controls how bandwidth is allocated +# between slices on nodes that is outside the scope of HTB. We enforce +# a 16 GByte/day total limit on each slice, which works out to about +# 1.5mbit. If a slice exceeds this byte limit before the day finishes, +# it is capped at (i.e., its "ceil" rate is set to) the smaller of the +# node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this +# rule and executes this script to override "ceil". + +# We support multiple bandwidth limits, by reserving the top nibble of +# the minor classid to be the "subclassid". Theoretically, we could +# support up to 15 subclasses, but for now, we only define two: the +# "default" subclass 1:10 that is capped at the node bandwidth cap (in +# this example, 5mbit) and the "exempt" subclass 1:20 that is capped +# at bwmax (i.e., not capped). The 1:1 parent class exists only to +# make the borrowing model work. All bandwidth above minimum +# guarantees is fairly shared (in this example, slice 2 is guaranteed +# at least 1mbit in addition to fair access to the rest), subject to +# the restrictions of the class hierarchy: namely, that the total +# bandwidth to non-exempt destinations should not exceed the node +# bandwidth cap. +# +# 1: +# | +# 1:1 (1gbit) +# ______________|_____________ +# | | +# 1:10 (8bit, 5mbit) 1:20 (8bit, 1gbit) +# | | +# 1:1000 (8bit, 5mbit), 1:2000 (8bit, 1gbit), +# 1:1001 (8bit, 5mbit), 1:2001 (8bit, 1gbit), +# 1:1002 (1mbit, 5mbit), 1:2002 (1mbit, 1gbit), +# ... ... +# 1:1FFF (8bit, 5mbit) 1:2FFF (8bit, 1gbit) +# +default_minor = 0x1000 +exempt_minor = 0x2000 + +# root_xid is for the root context. The root context is exempt from +# fair sharing in both the default and exempt subclasses. The root +# context gets 5 shares by default. +root_xid = 0x0000 +root_share = 5 + +# default_xid is for unclassifiable packets. Packets should not be +# classified here very often. They can be if a slice's HTB classes are +# deleted before its processes are. Each slice gets 1 share by +# default. +default_xid = 0x0FFF +default_share = 1 + +# See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be +# warned that older versions of tc interpret "kbps", "mbps", "mbit", +# and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and +# "kibit" and that if an older version is installed, all rates will +# be off by a small fraction. +suffixes = { + "": 1, + "bit": 1, + "kibit": 1024, + "kbit": 1000, + "mibit": 1024*1024, + "mbit": 1000000, + "gibit": 1024*1024*1024, + "gbit": 1000000000, + "tibit": 1024*1024*1024*1024, + "tbit": 1000000000000, + "bps": 8, + "kibps": 8*1024, + "kbps": 8000, + "mibps": 8*1024*1024, + "mbps": 8000000, + "gibps": 8*1024*1024*1024, + "gbps": 8000000000, + "tibps": 8*1024*1024*1024*1024, + "tbps": 8000000000000 +} -def get_defaults(cap_file="/etc/planetlab/bwcap", default_cap="10mbit"): - # The maximum output bandwidth, read in from cap_file (if it - # exists). If cap_file does not exist, use default_cap for - # bandwidth cap. See also the 'cburst' parameter below. - cap=default_cap + +# Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second +def get_tc_rate(s): + if type(s) == int: + return s + m = re.match(r"([0-9.]+)(\D*)", s) + if m is None: + return -1 + suffix = m.group(2).lower() + if suffixes.has_key(suffix): + return int(float(m.group(1)) * suffixes[suffix]) + else: + return -1 + + +# Prints a tc rate string +def format_tc_rate(rate): + if rate >= 1000000: + return "%.0fmbit" % (rate / 1000000.) + elif rate >= 1000: + return "%.0fkbit" % (rate / 1000.) + else: + return "%.0fbit" % rate + + +# Parse /etc/planetlab/bwcap (or equivalent) +def read_bwcap(bwcap_file): + bwcap = bwmax try: - os.stat(cap_file) - fp = open(cap_file) - lines = fp.readlines() - fp.close() + fp = open(bwcap_file, "r") + line = fp.readline().strip() + if line: + bwcap = get_tc_rate(line) + except: + pass + if bwcap == -1: + bwcap = bwmax + return bwcap + + +# Get current (live) value of bwcap +def get_bwcap(dev = dev): + + state = tc("-d class show dev %s" % dev) + base_re = re.compile(r"class htb 1:10 parent 1:1 .*ceil ([^ ]+) .*") + base_classes = filter(None, map(base_re.match, state)) + if not base_classes: + return -1 + if len(base_classes) > 1: + raise Exception, "unable to get current bwcap" + return get_tc_rate(base_classes[0].group(1)) + + +# Get slice xid (500) from slice name ("500" or "princeton_mlh") or +# slice name ("princeton_mlh") from slice xid (500). +def get_slice(xid_or_name): + + if xid_or_name == root_xid: + return "root" + if xid_or_name == default_xid: + return "default" + if isinstance(xid_or_name, (int, long)): try: - cap=string.strip(lines[0]) - except ValueError, ex: + return pwd.getpwuid(xid_or_name).pw_name + except KeyError: pass - except OSError, ex: + else: + try: + try: + return int(xid_or_name) + except ValueError: + pass + return pwd.getpwnam(xid_or_name).pw_uid + except KeyError: + pass + + return None + + +# Shortcut for running a command +def run(cmd, input = None): + try: + if verbose: + sys.stderr.write("Executing: " + cmd + "\n") + if input is None: + fileobj = os.popen(cmd, "r") + output = fileobj.readlines() + else: + fileobj = os.popen(cmd, "w") + fileobj.write(input) + output = None + if fileobj.close() is None: + return output + except Exception, e: pass + return None + + +# Shortcut for running a tc command +def tc(cmd): + return run(TC + " " + cmd) + + +# (Re)initialize the bandwidth limits on this node +def init(dev, bwcap): + + # load the module used to manage exempt classes + run("/sbin/modprobe ip_set_iphash") + + # Delete root qdisc 1: if it exists. This will also automatically + # delete any child classes. + for line in tc("qdisc show dev %s" % dev): + # Search for the root qdisc 1: + m = re.match(r"qdisc htb 1:", line) + if m is not None: + tc("qdisc del dev %s root handle 1:" % dev) + break + + # Initialize HTB. The "default" clause specifies that if a packet + # fails classification, it should go into the class with handle + # 1FFF. + tc("qdisc add dev %s root handle 1: htb default %x" % \ + (dev, default_minor | default_xid)) - # How many bytes a single token bucket is allowed to send at once. - # Small values (i.e., 3080 = two maximum-sized Ethernet packets) - # provide better fine-grained fairness. At high rates (e.g., - # cap=100mbit) this needs to be raised to allow full throughput. - cburst=30800 - - # The 'share' and 'quantum' parameters both influence the actual throughput - # seen by a particular vserver: - - # 'share' is the rate at which tokens fill the bucket, and so is - # the minimum bandwidth given to the task. I think this just - # needs to be set to some small value that is the same for all - # vservers. With the current value and a 5mbit cap, we can - # support 5000 vservers (5mbit/1kbit = 5000). With values lower - # than 10kbit, the HTB output (from tc -s -d class dev eth0) looks - # strange... this needs to be looked into further. - share="1kbit" - - # 'quantum' influences how excess bandwidth (i.e., above the - # 'share') is distributed to vservers. Apparently, vservers can - # send additional packets in proportion to their quantums (and not - # their shares, as one might expect). See: - # http://luxik.cdi.cz/~devik/qos/htb/manual/userg.htm#sharing - # The above link states that 'quantum' is automatically - # calculated for shares above 120kbit. Otherwise it should be - # set to a small value but at least one MTU, so I set it to one - # MTU. All vservers are assigned the same quantum and so they - # should share equally. - quantum=1540 - - return cap, cburst, share, quantum - - -def init(eth): - global TC, OPS - - cap, cburst, share, quantum = get_defaults() - if cap == "-1": return - - # Install HTB on $ETH. Specifies that all packets not matching a - # filter rule go to class with handle 9999. If we don't supply a - # default class, it sounds like non-matching packets can be sent - # at an unlimited rate. - for op in OPS: - cmd = "%s qdisc %s dev %s root handle 1: htb default 9999" % (TC,op,eth) - if run(cmd): break - - # Add a root class with bwcap capped rate - for op in OPS: - cmd = "%s class %s dev %s parent 1: classid 1:1 htb rate %s quantum %d" % \ - (TC, op, eth, cap, quantum) - if run(cmd): break - - # Set up the default class. Packets will fail to match a filter rule - # and end up here if they are sent by a process with UID < 500. - for op in OPS: - cmd = "%s class %s dev %s parent 1:1 classid 1:9999 htb rate %s ceil %s quantum %d cburst %d" % \ - (TC, op, eth, share, cap, quantum, cburst) - if run(cmd): break - - # The next command appears to throttle back processes that are - # sending faster than the token bucket can support, rather than - # just dropping their packets. - for op in OPS: - cmd = "%s qdisc %s dev %s parent 1:9999 handle 9999 pfifo" % \ - (TC, op, eth) - if run(cmd): break - -def on(xid, eth, bwlimit, cap, minrate, maxrate): - global TC, OPS - - default_cap, default_cburst, default_share, default_quantum = get_defaults() - quantum = bwlimit * default_quantum - - # Set up the per-vserver token bucket - for op in OPS: - cmd = "%s class %s dev %s parent 1:1 classid 1:%d htb rate %s ceil %s quantum %d cburst %d" % \ - (TC, op, eth, xid, minrate, cap, quantum, default_cburst) - if run(cmd): break - - # The next command appears to throttle back processes that are - # sending faster than the token bucket can support, rather than - # just dropping their packets. - for op in OPS: - cmd = "%s qdisc %s dev %s parent 1:%d handle %d pfifo" % \ - (TC, op, eth, xid, xid) - if run(cmd): break - - # Matches packets sent by a vserver to the appropriate token bucket. - # The raw socket module marks each packet with its vserver id. - # See: http://lartc.org/howto/lartc.qdisc.filters.html for more - # info on the filter command. - cmd = "%s filter del dev %s protocol ip prio %d" % (TC, eth, xid) - run(cmd) - cmd = "%s filter add dev %s prio %d parent 1:0 protocol ip handle %d fw flowid 1:%d" % \ - (TC, eth, xid, xid, xid) - run(cmd) - -def off(xid, eth): - cmd = "%s filter del dev %s protocol ip prio %d" % (TC, eth, xid) - run(cmd) - - cmd = "%s qdisc del dev %s parent 1:%d" % (TC, eth, xid) - run(cmd) - - cmd = "%s class del dev %s classid 1:%d" % (TC, eth, xid) - run(cmd) + # Set up a parent class from which all subclasses borrow. + tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \ + (dev, bwmax)) + # Set up a subclass that represents the node bandwidth cap. We + # allow each slice to borrow up to this rate, so it is also + # usually the "ceil" rate for each slice. + tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \ + (dev, bwmin, bwcap)) + + # Set up a subclass that represents "exemption" from the node + # bandwidth cap. Once the node bandwidth cap is reached, bandwidth + # to exempt destinations can still be fairly shared up to bwmax. + tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \ + (dev, bwmin, bwmax)) + + # Set up the root class (and tell VNET what it is). Packets sent + # by root end up here and are capped at the node bandwidth + # cap. + on(root_xid, dev, share = root_share) + file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid)) + + # Set up the default class. Packets that fail classification end + # up here. + on(default_xid, dev, share = default_share) + + +# Get the bandwidth limits for a particular slice xid as a tuple (xid, +# share, minrate, maxrate), or all classes as a list of tuples. +def get(xid = None, dev = dev): + if xid is None: + ret = [] + else: + ret = None + + # class htb 1:1002 parent 1:10 leaf 81b3: prio 1 rate 8bit ceil 5000Kbit burst 1600b cburst 4Kb + for line in tc("-d class show dev %s" % dev): + # Search for child classes of 1:10 + m = re.match(r"class htb 1:([0-9a-f]+) parent 1:10", line) + if m is None: + continue + + # If we are looking for a particular class + classid = int(m.group(1), 16) & default_xid + if xid is not None and xid != classid: + continue + + # Parse share + share = 1 + m = re.search(r"quantum (\d+)", line) + if m is not None: + share = int(m.group(1)) / quantum + + # Parse minrate + minrate = bwmin + m = re.search(r"rate (\w+)", line) + if m is not None: + minrate = get_tc_rate(m.group(1)) + + # Parse maxrate + maxrate = bwmax + m = re.search(r"ceil (\w+)", line) + if m is not None: + maxrate = get_tc_rate(m.group(1)) + + if xid is None: + # Return a list of parameters + ret.append((classid, share, minrate, maxrate)) + else: + # Return the parameters for this class + ret = (classid, share, minrate, maxrate) + break + + return ret + + +# Apply specified bandwidth limit to the specified slice xid +def on(xid, dev = dev, share = None, minrate = None, maxrate = None): + # Get defaults from current state if available + cap = get(xid, dev) + if cap is not None: + if share is None: + share = cap[1] + if minrate is None: + minrate = cap[2] + if maxrate is None: + maxrate = cap[3] + + # Figure out what the current node bandwidth cap is + bwcap = bwmax + for line in tc("-d class show dev %s" % dev): + # Search for 1:10 + m = re.match(r"class htb 1:10.*ceil (\w+)", line) + if m is not None: + bwcap = get_tc_rate(m.group(1)) + break + + # Set defaults + if share is None: + share = default_share + if minrate is None: + minrate = bwmin + else: + minrate = get_tc_rate(minrate) + if maxrate is None: + maxrate = bwcap + else: + maxrate = get_tc_rate(maxrate) + + # Sanity checks + if maxrate > bwcap: + maxrate = bwcap + if minrate > maxrate: + minrate = maxrate + + # Set up subclasses for the slice + tc("class replace dev %s parent 1:10 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \ + (dev, default_minor | xid, minrate, maxrate, share * quantum)) + + tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \ + (dev, exempt_minor | xid, minrate, bwmax, share * quantum)) + + # Attach a FIFO to each subclass, which helps to throttle back + # processes that are sending faster than the token buckets can + # support. + tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \ + (dev, default_minor | xid, default_minor | xid)) + + tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \ + (dev, exempt_minor | xid, exempt_minor | xid)) + + +# Remove class associated with specified slice xid. If further packets +# are seen from this slice, they will be classified into the default +# class 1:1FFF. +def off(xid, dev = dev): + cap = get(xid, dev) + if cap is not None: + tc("class del dev %s classid 1:%x" % (dev, default_minor | xid)) + tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid)) + + +def exempt_init(group_name, node_ips): + + # Clean up + iptables = "/sbin/iptables -t vnet %s POSTROUTING" + run(iptables % "-F") + run("/sbin/ipset -X " + group_name) + + # Create a hashed IP set of all of these destinations + lines = ["-N %s iphash" % group_name] + add_cmd = "-A %s " % group_name + lines += [(add_cmd + ip) for ip in node_ips] + lines += ["COMMIT"] + restore = "\n".join(lines) + "\n" + run("/sbin/ipset -R", restore) + + # Add rule to match on destination IP set + run((iptables + " -m set --set %s dst -j CLASSIFY --set-class 1:%x") % + ("-A", group_name, exempt_minor)) + + +def usage(): + bwcap_description = format_tc_rate(get_bwcap()) + + print """ +Usage: + +%s [OPTION]... [COMMAND] [ARGUMENT]... + +Options: + -d device Network interface (default: %s) + -r rate Node bandwidth cap (default: %s) + -q quantum Share multiplier (default: %d bytes) + -h This message + +Commands: + init + (Re)initialize bandwidth caps. + on slice [share] [minrate] [maxrate] + Set bandwidth cap for the specified slice + off slice + Remove bandwidth caps for the specified slice + get + Get all bandwidth caps + get slice + Get bandwidth caps for the specified slice + getcap slice + Get maxrate for the specified slice + setcap slice maxrate + Set maxrate for the specified slice +""" % (sys.argv[0], dev, bwcap_description, quantum) + sys.exit(1) + +def main(): + global dev, quantum, verbose + + # Defaults + bwcap = get_bwcap() + + (opts, argv) = getopt.getopt(sys.argv[1:], "f:d:r:g:q:vh") + for (opt, optval) in opts: + if opt == '-d': + dev = optval + elif opt == '-r': + bwcap = get_tc_rate(optval) + elif opt == '-q': + quantum = int(optval) + elif opt == '-v': + verbose += 1 + elif opt == '-h': + usage() + + if len(argv): + if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1): + # (Re)initialize + init(dev, get_tc_rate(bwcap)) + + elif argv[0] == "get" or argv[0] == "show": + # Show + if len(argv) >= 2: + # Show a particular slice + xid = get_slice(argv[1]) + if xid is None: + sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1]) + usage() + caps = [get(xid, dev)] + else: + # Show all slices + caps = get(None, dev) + + for (xid, share, minrate, maxrate) in caps: + slice = get_slice(xid) + if slice is None: + # Orphaned (not associated with a slice) class + slice = "%d?" % xid + print "%s %d %s %s" % \ + (slice, share, format_tc_rate(minrate), format_tc_rate(maxrate)) + + elif len(argv) >= 2: + # slice, ... + xid = get_slice(argv[1]) + if xid is None: + sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1]) + usage() + + if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace": + # Enable cap + args = [] + if len(argv) >= 3: + # ... share, minrate, maxrate + casts = [int, get_tc_rate, get_tc_rate] + for i, arg in enumerate(argv[2:]): + if i >= len(casts): + break + args.append(casts[i](arg)) + on(xid, dev, *args) + + elif argv[0] == "off" or argv[0] == "del": + # Disable cap + off(xid, dev) + + # Backward compatibility with old resman script + elif argv[0] == "getcap": + # Get maxrate + cap = get(xid, dev) + if cap is not None: + (xid, share, minrate, maxrate) = cap + print format_tc_rate(maxrate) + + # Backward compatibility with old resman script + elif argv[0] == "setcap": + if len(argv) >= 3: + # Set maxrate + on(xid, dev, maxrate = get_tc_rate(argv[2])) + else: + usage() + + else: + usage() + + else: + usage() + + +if __name__ == '__main__': + main() diff --git a/python/setup.py b/python/setup.py deleted file mode 100644 index 8491c58..0000000 --- a/python/setup.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/python -# -# Python distutils script for util-vserver Python bindings -# -# Steve Muir -# Mark Huang -# -# Copyright (C) 2005 The Trustees of Princeton University -# -# $Id$ -# - -from distutils.core import setup, Extension - -extension_args = {} -extension_args['extra_compile_args'] = ['-Wall'] -extension_args['include_dirs'] = ['..', '../lib'] -# Link against libvserver with libtool later -#extension_args['library_dirs'] = ['../lib'] -#extension_args['libraries'] = ['vserver'] - -modules = ['util_vserver_vars', 'vserver', 'cpulimit', 'bwlimit'] -extensions = [Extension('vduimpl', ['vduimpl.c'], **extension_args), - Extension('vserverimpl', ['vserverimpl.c'], **extension_args)] - -setup(py_modules = modules, ext_modules = extensions) diff --git a/python/vduimpl.c b/python/vduimpl.c index fb219ff..2191826 100644 --- a/python/vduimpl.c +++ b/python/vduimpl.c @@ -549,6 +549,7 @@ do_vdu(PyObject *self, PyObject *args) cwd_fd = open(".", O_RDONLY); res = vdu_onedir(&tbl, &s, path); fchdir(cwd_fd); + close(cwd_fd); /* deallocate whatever has been added to tbl */ Dispose(&tbl); diff --git a/python/vserver.py b/python/vserver.py index 264aee2..f455e60 100644 --- a/python/vserver.py +++ b/python/vserver.py @@ -4,16 +4,21 @@ import errno import fcntl import os import re +import pwd +import signal import sys import time import traceback import mountimpl -import passfdimpl +import runcmd import utmp import vserverimpl, vduimpl import cpulimit, bwlimit +from vserverimpl import VS_SCHED_CPU_GUARANTEED as SCHED_CPU_GUARANTEED +from vserverimpl import DLIMIT_INF + # @@ -29,32 +34,36 @@ FLAGS_ULIMIT = 64 FLAGS_NAMESPACE = 128 - + +class NoSuchVServer(Exception): pass + + + class VServer: INITSCRIPTS = [('/etc/rc.vinit', 'start'), ('/etc/rc.d/rc', '%(runlevel)d')] - def __init__(self, name, vm_running = False, resources = {}): + def __init__(self, name, vm_id = None, vm_running = False): self.name = name self.config_file = "/etc/vservers/%s.conf" % name self.dir = "%s/%s" % (vserverimpl.VSERVER_BASEDIR, name) if not (os.path.isdir(self.dir) and os.access(self.dir, os.R_OK | os.W_OK | os.X_OK)): - raise Exception, "no such vserver: " + name - self.config = self.__read_config_file("/etc/vservers.conf") - self.config.update(self.__read_config_file(self.config_file)) - self.flags = 0 - flags = self.config["S_FLAGS"].split(" ") - if "lock" in flags: - self.flags |= FLAGS_LOCK - if "nproc" in flags: - self.flags |= FLAGS_NPROC + raise NoSuchVServer, "no such vserver: " + name + self.config = {} + for config_file in ["/etc/vservers.conf", self.config_file]: + try: + self.config.update(self.__read_config_file(config_file)) + except IOError, ex: + if ex.errno != errno.ENOENT: + raise self.remove_caps = ~vserverimpl.CAP_SAFE; - self.ctx = int(self.config["S_CONTEXT"]) + if vm_id == None: + vm_id = int(self.config['S_CONTEXT']) + self.ctx = vm_id self.vm_running = vm_running - self.resources = resources config_var_re = re.compile(r"^ *([A-Z_]+)=(.*)\n?$", re.MULTILINE) @@ -111,10 +120,30 @@ class VServer: os.chroot(self.dir) os.chdir("/") + def chroot_call(self, fn, *args): + + cwd_fd = os.open(".", os.O_RDONLY) + try: + root_fd = os.open("/", os.O_RDONLY) + try: + self.__do_chroot() + result = fn(*args) + finally: + os.fchdir(root_fd) + os.chroot(".") + os.fchdir(cwd_fd) + os.close(root_fd) + finally: + os.close(cwd_fd) + return result + def set_disklimit(self, block_limit): # block_limit is in kB - over_limit = False + if block_limit == 0: + vserverimpl.unsetdlimit(self.dir, self.ctx) + return + if self.vm_running: block_usage = vserverimpl.DLIMIT_KEEP inode_usage = vserverimpl.DLIMIT_KEEP @@ -122,8 +151,6 @@ class VServer: # init_disk_info() must have been called to get usage values block_usage = self.disk_blocks inode_usage = self.disk_inodes - if block_limit < block_usage: - over_limit = True vserverimpl.setdlimit(self.dir, self.ctx, @@ -133,31 +160,38 @@ class VServer: vserverimpl.DLIMIT_INF, # inode limit 2) # %age reserved for root - if over_limit: - raise Exception, ("%s disk usage (%u blocks) > limit (%u)" % - (self.name, block_usage, block_limit)) - def get_disklimit(self): try: - blocksused, blocktotal, inodesused, inodestotal, reserved = \ - vserverimpl.getdlimit(self.dir, self.ctx) + (self.disk_blocks, block_limit, self.disk_inodes, inode_limit, + reserved) = vserverimpl.getdlimit(self.dir, self.ctx) except OSError, ex: - if ex.errno == errno.ESRCH: - # get here if no vserver disk limit has been set for xid - # set blockused to -1 to indicate no limit - blocktotal = -1 + if ex.errno != errno.ESRCH: + raise + # get here if no vserver disk limit has been set for xid + block_limit = -1 - return blocktotal + return block_limit - def set_sched(self, cpu_share): + def set_sched_config(self, cpu_share, sched_flags): + + """ Write current CPU scheduler parameters to the vserver + configuration file. This method does not modify the kernel CPU + scheduling parameters for this context. """ if cpu_share == int(self.config.get("CPULIMIT", -1)): return - - self.__update_config_file(self.config_file, { "CPULIMIT": cpu_share }) + cpu_guaranteed = sched_flags & SCHED_CPU_GUARANTEED + cpu_config = { "CPULIMIT": cpu_share, "CPUGUARANTEED": cpu_guaranteed } + self.update_resources(cpu_config) if self.vm_running: - vserverimpl.setsched(self.ctx, cpu_share, True) + self.set_sched(cpu_share, sched_flags) + + def set_sched(self, cpu_share, sched_flags = 0): + + """ Update kernel CPU scheduling parameters for this context. """ + + vserverimpl.setsched(self.ctx, cpu_share, sched_flags) def get_sched(self): # have no way of querying scheduler right now on a per vserver basis @@ -179,86 +213,35 @@ class VServer: ret = vserverimpl.getrlimit(self.ctx,6) return ret - def set_bwlimit(self, eth, limit, cap, minrate, maxrate): - if cap == "-1": - bwlimit.off(self.ctx,eth) - else: - bwlimit.on(self.ctx, eth, limit, cap, minrate, maxrate) - - def get_bwlimit(self, eth): - # not implemented yet - bwlimit = -1 - cap = "unknown" - minrate = "unknown" - maxrate = "unknown" - return (bwlimit, cap, minrate, maxrate) - - def open(self, filename, mode = "r", bufsize = -1): + def set_bwlimit(self, maxrate, minrate = 1, share = None, dev = "eth0"): - (sendsock, recvsock) = passfdimpl.socketpair() - child_pid = os.fork() - if child_pid == 0: - try: - # child process - self.__do_chroot() - f = open(filename, mode) - passfdimpl.sendmsg(f.fileno(), sendsock) - os._exit(0) - except EnvironmentError, ex: - (result, errmsg) = (ex.errno, ex.strerror) - except Exception, ex: - (result, errmsg) = (255, str(ex)) - os.write(sendsock, errmsg) - os._exit(result) + if maxrate != 0: + bwlimit.on(self.ctx, dev, share, minrate, maxrate) + else: + bwlimit.off(self.ctx, dev) - # parent process + def get_bwlimit(self, dev = "eth0"): - # XXX - need this since a lambda can't raise an exception - def __throw(ex): - raise ex + result = bwlimit.get(self.ctx) + # result of bwlimit.get is (ctx, share, minrate, maxrate) + if result: + result = result[1:] + return result - os.close(sendsock) - throw = lambda : __throw(Exception(errmsg)) - while True: - try: - (pid, status) = os.waitpid(child_pid, 0) - if os.WIFEXITED(status): - result = os.WEXITSTATUS(status) - if result != 255: - errmsg = os.strerror(result) - throw = lambda : __throw(IOError(result, errmsg)) - else: - errmsg = "unexpected exception in child" - else: - result = -1 - errmsg = "child killed" - break - except OSError, ex: - if ex.errno != errno.EINTR: - os.close(recvsock) - raise ex - fcntl.fcntl(recvsock, fcntl.F_SETFL, os.O_NONBLOCK) - try: - (fd, errmsg) = passfdimpl.recvmsg(recvsock) - except OSError, ex: - if ex.errno != errno.EAGAIN: - throw = lambda : __throw(ex) - fd = 0 - os.close(recvsock) - if not fd: - throw() + def open(self, filename, mode = "r", bufsize = -1): - return os.fdopen(fd, mode, bufsize) + return self.chroot_call(open, filename, mode, bufsize) def __do_chcontext(self, state_file): - vserverimpl.chcontext(self.ctx, self.resources) + if state_file: + print >>state_file, "S_CONTEXT=%u" % self.ctx + print >>state_file, "S_PROFILE=" + state_file.close() - if not state_file: - return - print >>state_file, "S_CONTEXT=%d" % self.ctx - print >>state_file, "S_PROFILE=%s" % self.config.get("S_PROFILE", "") - state_file.close() + if vserverimpl.chcontext(self.ctx): + self.set_resources() + vserverimpl.setup_done(self.ctx) def __prep(self, runlevel, log): @@ -327,7 +310,6 @@ class VServer: self.__do_chroot() log = open("/var/log/boot.log", "w", 0) os.dup2(1, 2) - # XXX - close all other fds print >>log, ("%s: starting the virtual server %s" % (time.asctime(time.gmtime()), self.name)) @@ -379,6 +361,13 @@ class VServer: # parent process return child_pid + def set_resources(self): + + """ Called when vserver context is entered for first time, + should be overridden by subclass. """ + + pass + def update_resources(self, resources): self.config.update(resources) @@ -391,3 +380,20 @@ class VServer: (self.disk_inodes, self.disk_blocks, size) = vduimpl.vdu(self.dir) return size + + def stop(self, signal = signal.SIGKILL): + + vserverimpl.killall(self.ctx, signal) + self.vm_running = False + + + +def create(vm_name, static = False, ctor = VServer): + + options = [] + if static: + options += ['--static'] + runcmd.run('vuseradd', options + [vm_name]) + vm_id = pwd.getpwnam(vm_name)[2] + + return ctor(vm_name, vm_id) diff --git a/python/vserverimpl.c b/python/vserverimpl.c index 3a93224..d5f018d 100644 --- a/python/vserverimpl.c +++ b/python/vserverimpl.c @@ -44,29 +44,40 @@ POSSIBILITY OF SUCH DAMAGE. #include "vserver.h" #include "vserver-internal.h" +#define NONE ({ Py_INCREF(Py_None); Py_None; }) + /* * context create */ static PyObject * vserver_chcontext(PyObject *self, PyObject *args) { + int result; xid_t ctx; uint32_t flags = 0; uint32_t bcaps = ~vc_get_insecurebcaps(); - rspec_t rspec = { 32, VC_VXF_SCHED_FLAGS, -1, -1 }; - PyObject *resources; - PyObject *cpu_share; - if (!PyArg_ParseTuple(args, "IO!|K", &ctx, &PyDict_Type, &resources, &flags)) + if (!PyArg_ParseTuple(args, "I|K", &ctx, &flags)) + return NULL; + + if ((result = pl_chcontext(ctx, flags, bcaps)) < 0) + return PyErr_SetFromErrno(PyExc_OSError); + + return PyBool_FromLong(result); +} + +static PyObject * +vserver_setup_done(PyObject *self, PyObject *args) +{ + xid_t ctx; + + if (!PyArg_ParseTuple(args, "I", &ctx)) return NULL; - if ((cpu_share = PyMapping_GetItemString(resources, "nm_cpu_share")) && - (cpu_share = PyNumber_Int(cpu_share))) - rspec.cpu_share = PyInt_AsLong(cpu_share); - if (pl_chcontext(ctx, flags, bcaps, &rspec)) - PyErr_SetFromErrno(PyExc_OSError); + if (pl_setup_done(ctx) < 0) + return PyErr_SetFromErrno(PyExc_OSError); - return Py_None; + return NONE; } static PyObject * @@ -83,7 +94,6 @@ vserver_set_rlimit(PyObject *self, PyObject *args) { if (!PyArg_ParseTuple(args, "IiL", &xid, &resource, &limits.hard)) return NULL; - ret = Py_None; if (vc_set_rlimit(xid, resource, &limits)) ret = PyErr_SetFromErrno(PyExc_OSError); else if (vc_get_rlimit(xid, resource, &limits)==-1) @@ -108,7 +118,6 @@ vserver_get_rlimit(PyObject *self, PyObject *args) { if (!PyArg_ParseTuple(args, "Ii", &xid, &resource)) return NULL; - ret = Py_None; if (vc_get_rlimit(xid, resource, &limits)==-1) ret = PyErr_SetFromErrno(PyExc_OSError); else @@ -117,54 +126,27 @@ vserver_get_rlimit(PyObject *self, PyObject *args) { return ret; } -#if 0 /* * setsched */ static PyObject * vserver_setsched(PyObject *self, PyObject *args) { - xid_t xid; - struct vc_set_sched sched; - struct vc_ctx_flags flags; - unsigned cpuguaranteed = 0; - - sched.set_mask = (VC_VXSM_FILL_RATE | - VC_VXSM_INTERVAL | - VC_VXSM_TOKENS_MIN | - VC_VXSM_TOKENS_MAX); - - if (!PyArg_ParseTuple(args, "I|I|I|I|I|I|I", &xid, - &sched.fill_rate, - &sched.interval, - &sched.tokens, - &sched.tokens_min, - &sched.tokens_max, - &cpuguaranteed)) - return NULL; - - flags.flagword = VC_VXF_SCHED_HARD; - flags.mask |= VC_VXF_SCHED_HARD; -#define VC_VXF_SCHED_SHARE 0x00000800ull - if (cpuguaranteed==0) { - flags.flagword |= VC_VXF_SCHED_SHARE; - flags.mask |= VC_VXF_SCHED_SHARE; - } + xid_t ctx; + uint32_t cpu_share; + uint32_t cpu_sched_flags = VC_VXF_SCHED_FLAGS; - if (vc_set_cflags(xid, &flags) == -1) - return PyErr_SetFromErrno(PyExc_OSError); + if (!PyArg_ParseTuple(args, "II|I", &ctx, &cpu_share, &cpu_sched_flags)) + return NULL; - if (vc_set_sched(xid, &sched) == -1) - return PyErr_SetFromErrno(PyExc_OSError); + /* ESRCH indicates that there are no processes in the context */ + if (pl_setsched(ctx, cpu_share, cpu_sched_flags) && + errno != ESRCH) + return PyErr_SetFromErrno(PyExc_OSError); - return Py_None; + return NONE; } -/* - * setsched - */ -#endif - static PyObject * vserver_get_dlimit(PyObject *self, PyObject *args) { @@ -225,24 +207,63 @@ vserver_set_dlimit(PyObject *self, PyObject *args) vserver(VCMD_set_dlimit, xid, &data)) return PyErr_SetFromErrno(PyExc_OSError); - return Py_None; + return NONE; +} + +static PyObject * +vserver_unset_dlimit(PyObject *self, PyObject *args) +{ + char *path; + unsigned xid; + struct vcmd_ctx_dlimit_base_v0 init; + + if (!PyArg_ParseTuple(args, "si", &path, &xid)) + return NULL; + + memset(&init, 0, sizeof(init)); + init.name = path; + init.flags = 0; + + if (vserver(VCMD_rem_dlimit, xid, &init) && errno != ESRCH) + return PyErr_SetFromErrno(PyExc_OSError); + + return NONE; +} + +static PyObject * +vserver_killall(PyObject *self, PyObject *args) +{ + xid_t ctx; + int sig; + + if (!PyArg_ParseTuple(args, "Ii", &ctx, &sig)) + return NULL; + + if (vc_ctx_kill(ctx, 0, sig) && errno != ESRCH) + return PyErr_SetFromErrno(PyExc_OSError); + + return NONE; } static PyMethodDef methods[] = { { "chcontext", vserver_chcontext, METH_VARARGS, "chcontext to vserver with provided flags" }, -#if 0 + { "setup_done", vserver_setup_done, METH_VARARGS, + "Release vserver setup lock" }, { "setsched", vserver_setsched, METH_VARARGS, "Change vserver scheduling attributes for given vserver context" }, -#endif { "setdlimit", vserver_set_dlimit, METH_VARARGS, "Set disk limits for given vserver context" }, + { "unsetdlimit", vserver_unset_dlimit, METH_VARARGS, + "Remove disk limits for given vserver context" }, { "getdlimit", vserver_get_dlimit, METH_VARARGS, "Get disk limits for given vserver context" }, { "setrlimit", vserver_set_rlimit, METH_VARARGS, "Set resource limits for given resource of a vserver context" }, { "getrlimit", vserver_get_rlimit, METH_VARARGS, "Get resource limits for given resource of a vserver context" }, + { "killall", vserver_killall, METH_VARARGS, + "Send signal to all processes in vserver context" }, { NULL, NULL, 0, NULL } }; @@ -262,4 +283,9 @@ initvserverimpl(void) /* export limit-related constants */ PyModule_AddIntConstant(mod, "DLIMIT_KEEP", (int)CDLIM_KEEP); PyModule_AddIntConstant(mod, "DLIMIT_INF", (int)CDLIM_INFINITY); + + /* scheduler flags */ + PyModule_AddIntConstant(mod, + "VS_SCHED_CPU_GUARANTEED", + VS_SCHED_CPU_GUARANTEED); } diff --git a/scripts/vuseradd b/scripts/vuseradd index 8023aa6..3a531f3 100755 --- a/scripts/vuseradd +++ b/scripts/vuseradd @@ -4,7 +4,7 @@ # # Copyright (c) 2004 The Trustees of Princeton University (Trustees). # -# $Id: vuseradd,v 1.23 2005/08/21 21:41:03 mlhuang Exp $ +# $Id: vuseradd,v 1.24 2005/09/02 20:00:39 mlhuang Exp $ # : ${UTIL_VSERVER_VARS:=/usr/lib/util-vserver/util-vserver-vars} @@ -93,8 +93,3 @@ if [ -d "$__DEFAULT_VSERVERDIR/$NAME" ] ; then echo "$NAME ALL=(ALL) ALL" >> "$__DEFAULT_VSERVERDIR/$NAME/etc/sudoers" fi fi - -# turn resource management on for vserver $NAME -chkconfig resman && service resman start $NAME -# XXX - resman doesn't print a trailing newline -echo diff --git a/src/vsh.c b/src/vsh.c index 435ea05..7d28bf4 100644 --- a/src/vsh.c +++ b/src/vsh.c @@ -281,7 +281,7 @@ static int sandbox_processes(xid_t ctx, char *context) exit(1); } #else - rspec_t rspec; + int ctx_is_new; unsigned long long cpu = VC_LIM_KEEP; unsigned long long mem = VC_LIM_KEEP; unsigned long long task = VC_LIM_KEEP; @@ -294,18 +294,48 @@ static int sandbox_processes(xid_t ctx, char *context) {0,0}}; get_limits(context,list); + + /* check whether the slice has been disabled */ + if (!cpu) + { + fprintf(stderr, "*** this slice has been suspended ***\n"); + exit(0); + } + (void) (sandbox_chroot(ctx)); - rspec.cpu_share = cpu; - rspec.cpu_sched_flags = (VC_VXF_SCHED_HARD | - (cpuguaranteed ? 0 : VC_VXF_SCHED_SHARE)); - rspec.mem_limit = mem; - rspec.task_limit = task; - if (pl_chcontext(ctx, 0, ~vc_get_insecurebcaps(), &rspec)) + if ((ctx_is_new = pl_chcontext(ctx, 0, ~vc_get_insecurebcaps())) < 0) { PERROR("pl_chcontext(%u)", ctx); exit(1); } + if (ctx_is_new) + { + /* set resources */ + struct vc_rlimit limits; + + limits.min = VC_LIM_KEEP; + limits.soft = VC_LIM_KEEP; + limits.hard = mem; + if (vc_set_rlimit(ctx, RLIMIT_RSS, &limits)) + { + PERROR("pl_setrlimit(%u, RLIMIT_RSS)", ctx); + exit(1); + } + limits.hard = task; + if (vc_set_rlimit(ctx, RLIMIT_NPROC, &limits)) + { + PERROR("pl_setrlimit(%u, RLIMIT_NPROC)", ctx); + exit(1); + } + cpuguaranteed &= VS_SCHED_CPU_GUARANTEED; + if (pl_setsched(ctx, cpu, cpuguaranteed) < 0) + { + PERROR("pl_setsched(&u)", ctx); + exit(1); + } + pl_setup_done(ctx); + } #endif return 0; } diff --git a/util-vserver.spec b/util-vserver.spec index c7fc376..8eb5aa0 100644 --- a/util-vserver.spec +++ b/util-vserver.spec @@ -17,11 +17,14 @@ %define name util-vserver %define version 0.30.208 -%define release 3.planetlab%{?date:.%{date}} +%define release 10%{?pldistro:.%{pldistro}}%{?date:.%{date}} %define _without_dietlibc 1 %define _without_xalan 1 +# don't build debuginfo RPM +%define debug_package %{nil} + Vendor: PlanetLab Packager: PlanetLab Central Distribution: PlanetLab 3.0 @@ -381,6 +384,30 @@ done %changelog +* Fri Feb 17 2006 Steve Muir +- add support for setting guaranteed CPU share flag in rspec + +* Fri Jan 13 2006 Steve Muir +- fix bug in python/vserverimpl.c where attempting to adjust CPU share + for a context that didn't exist would cause an error (it should be a + safe no-op) + +* Fri Dec 2 2005 Steve Muir +- fix bugs in python/vserverimpl.c where exceptions were not raised when + they should be and thus occured later at unexpected times +- add support for stopping a vserver + +* Wed Nov 9 2005 Steve Muir +- add support for removing resource limits e.g., when a slice is deleted + +* Mon Nov 7 2005 Steve Muir +- fix file descriptor leak in vduimpl +- clean up handling of network parameters +- don't rely upon /etc/vservers/foo.conf to initialise vserver object + +* Wed Nov 2 2005 Steve Muir +- fix Python modules to handling scheduling parameters correctly + * Fri Oct 28 2005 Steve Muir - raise exception about being over disk limit after setting usage values diff --git a/util-vserver.spec.in b/util-vserver.spec.in index 2f47ef8..9919150 100644 --- a/util-vserver.spec.in +++ b/util-vserver.spec.in @@ -17,11 +17,14 @@ %define name @PACKAGE@ %define version @VERSION@ -%define release 1.planetlab%{?date:.%{date}} +%define release 10%{?pldistro:.%{pldistro}}%{?date:.%{date}} %define _without_dietlibc 1 %define _without_xalan 1 +# don't build debuginfo RPM +%define debug_package %{nil} + Vendor: PlanetLab Packager: PlanetLab Central Distribution: PlanetLab 3.0 @@ -381,6 +384,37 @@ done %changelog +* Fri Feb 17 2006 Steve Muir +- add support for setting guaranteed CPU share flag in rspec + +* Fri Jan 13 2006 Steve Muir +- fix bug in python/vserverimpl.c where attempting to adjust CPU share + for a context that didn't exist would cause an error (it should be a + safe no-op) + +* Fri Dec 2 2005 Steve Muir +- fix bugs in python/vserverimpl.c where exceptions were not raised when + they should be and thus occured later at unexpected times +- add support for stopping a vserver + +* Wed Nov 9 2005 Steve Muir +- add support for removing resource limits e.g., when a slice is deleted + +* Mon Nov 7 2005 Steve Muir +- fix file descriptor leak in vduimpl +- clean up handling of network parameters +- don't rely upon /etc/vservers/foo.conf to initialise vserver object + +* Wed Nov 2 2005 Steve Muir +- fix Python modules to handling scheduling parameters correctly + +* Fri Oct 28 2005 Steve Muir +- raise exception about being over disk limit after setting usage values + +* Fri Oct 7 2005 Steve Muir +- create common function to be used for entering a vserver and applying + resource limits + * Thu Aug 21 2005 Mark Huang - restore build of python modules