From 73929cd988b29cc673ff12945fc921957bd3ce78 Mon Sep 17 00:00:00 2001 From: Daniel Hokka Zakrisson Date: Fri, 30 Nov 2007 17:54:32 +0000 Subject: [PATCH] Separate the PlanetLab modifications from util-vserver --- AUTHORS | 0 ChangeLog | 0 Makefile.am | 70 ++++ NEWS | 0 README | 0 configure.ac | 196 ++++++++++ man/vsh.8 | 48 +++ python/bwlimit | 6 + python/bwlimit.py | 724 ++++++++++++++++++++++++++++++++++++ python/disklimit | 54 +++ python/vserver.py | 521 ++++++++++++++++++++++++++ python/vserverimpl.c | 746 ++++++++++++++++++++++++++++++++++++++ scripts/vcached | 106 ++++++ scripts/vcached.cron | 10 + scripts/vcached.logrotate | 10 + scripts/vuseradd | 166 +++++++++ scripts/vuserdel | 57 +++ src/planetlab.c | 362 ++++++++++++++++++ src/planetlab.h | 88 +++++ src/vip6-autod.c | 623 +++++++++++++++++++++++++++++++ src/vsh.c | 343 ++++++++++++++++++ sysv/vip6-autod | 59 +++ util-vserver-pl.spec.in | 93 +++++ 23 files changed, 4282 insertions(+) create mode 100644 AUTHORS create mode 100644 ChangeLog create mode 100644 Makefile.am create mode 100644 NEWS create mode 100644 README create mode 100644 configure.ac create mode 100644 man/vsh.8 create mode 100755 python/bwlimit create mode 100644 python/bwlimit.py create mode 100755 python/disklimit create mode 100644 python/vserver.py create mode 100644 python/vserverimpl.c create mode 100755 scripts/vcached create mode 100644 scripts/vcached.cron create mode 100644 scripts/vcached.logrotate create mode 100755 scripts/vuseradd create mode 100755 scripts/vuserdel create mode 100644 src/planetlab.c create mode 100644 src/planetlab.h create mode 100644 src/vip6-autod.c create mode 100644 src/vsh.c create mode 100755 sysv/vip6-autod create mode 100644 util-vserver-pl.spec.in diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..e69de29 diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..e69de29 diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..32d0193 --- /dev/null 
+++ b/Makefile.am @@ -0,0 +1,70 @@ +## $Id$ + +# Copyright (C) 2003,2004,2005,2006 Enrico Scholz +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# + +python_vserverimpl_la_SOURCES = python/vserverimpl.c src/planetlab.c +python_vserverimpl_la_LIBADD = -lvserver +python_vserverimpl_la_CFLAGS = -Wno-redundant-decls -I$(top_srcdir)/src $(PYTHON_INCLUDES) +python_vserverimpl_la_LDFLAGS = -module -avoid-version + +src_vip6_autod_SOURCES = src/vip6-autod.c +src_vip6_autod_LDADD = -lvserver -lnl + +src_vsh_SOURCES = src/vsh.c src/planetlab.c +src_vsh_LDADD = -lvserver + +noinst_HEADERS = src/planetlab.h + +man_MANS = man/vsh.8 + +pyexec_LTLIBRARIES = python/vserverimpl.la + +pyexec_DATA = python/vserver.py \ + python/bwlimit.py \ + python/cpulimit.py + +sbin_SCRIPTS = python/bwlimit \ + python/disklimit \ + scripts/vcached \ + scripts/vuseradd \ + scripts/vuserdel + +sbin_PROGRAMS = src/vip6-autod \ + src/vsh + +sysv_SCRIPTS = sysv/vip6-autod + +crondir = $(sysconfdir)/cron.d +cron_DATA = scripts/vcached.cron + +logrotatedir = $(sysconfdir)/logrotate.d +logrotate_DATA = scripts/vcached.logrotate + +install-data-hook: install-fix-script-paths + +fix_SCRPTS = $(addprefix $(sbindir)/, $(notdir $(sbin_SCRIPTS))) \ + $(addprefix $(sysvdir)/, $(notdir $(sysv_SCRIPTS))) + +install-fix-script-paths: + test "$(UV_PKGLIBDIR)" = 
"/usr/lib/util-vserver" || \ + for i in $(fix_SCRPTS); do \ + f="$(DESTDIR)$$i"; \ + $(SED) -e 's!/usr/lib/util-vserver!$(UV_PKGLIBDIR)!g' "$$f" > "$$f.tmp"; \ + cmp -s "$$f.tmp" "$$f" || cat "$$f.tmp" > "$$f"; \ + rm -f "$$f.tmp"; \ + done diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..e69de29 diff --git a/README b/README new file mode 100644 index 0000000..e69de29 diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..2284cfe --- /dev/null +++ b/configure.ac @@ -0,0 +1,196 @@ +dnl $Id: configure.ac 2604 2007-09-02 20:03:17Z dhozac $ + +dnl Copyright (C) 2002,2003,2004 Enrico Scholz +dnl +dnl This program is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU General Public License as published by +dnl the Free Software Foundation; either version 2, or (at your option) +dnl any later version. +dnl +dnl This program is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +dnl GNU General Public License for more details. +dnl +dnl You should have received a copy of the GNU General Public License +dnl along with this program; if not, write to the Free Software +dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +dnl +dnl +dnl As a special exception to the GNU General Public License, if you +dnl distribute this file as part of a program that contains a configuration +dnl script generated by Autoconf, you may include it under the same +dnl distribution terms that you use for the rest of that program. +dnl + +AC_PREREQ(2.57) +AC_INIT(util-vserver-pl, 0.1, support@planet-lab.org) +AC_CONFIG_SRCDIR([python/vserverimpl.c]) +AC_CONFIG_HEADER([config.h]) + +AM_INIT_AUTOMAKE([1.8.3 gnu dist-bzip2 subdir-objects]) +AM_MAINTAINER_MODE + +AC_CANONICAL_BUILD +AC_CANONICAL_HOST + +# Checks for programs. 
+AC_PROG_CC +AC_PROG_INSTALL +AC_PROG_LN_S +AM_PROG_CC_C_O +AC_DISABLE_STATIC +AC_PROG_LIBTOOL +AM_PATH_PYTHON(2.3) + +AC_ARG_VAR(CC, [The C compiler]) + + +AC_CHECK_HEADERS([asm/types.h stdint.h], [ : ], + [ ensc_have_vserver=no ]) +AC_CHECK_TYPES([xid_t], [ : ], [AC_DEFINE_UNQUOTED([xid_t], [uint32_t], + [Define this to an unsigned integer type])], + [AC_INCLUDES_DEFAULT() +#ifdef HAVE_ASM_TYPES_H +# include <asm/types.h> +#endif]) +AC_CHECK_TYPES([nid_t], [ : ], [AC_DEFINE_UNQUOTED([nid_t], [uint32_t], + [Define this to an unsigned integer type])], + [AC_INCLUDES_DEFAULT() +#ifdef HAVE_ASM_TYPES_H +# include <asm/types.h> +#endif]) +AC_CHECK_TYPES([tag_t], [ : ], [AC_DEFINE_UNQUOTED([tag_t], [uint32_t], + [Define this to an unsigned integer type])], + [AC_INCLUDES_DEFAULT() +#ifdef HAVE_ASM_TYPES_H +# include <asm/types.h> +#endif]) + +AC_CHECK_HEADER([vserver.h], [ : ], + [ AC_MSG_ERROR([No vserver.h found!]) ]) + +AC_CHECK_LIB(vserver, vc_ctx_create, [ : ], + [ AC_MSG_ERROR([No libvserver found!]) ]) + +dnl {check for libnl +dnl + +ensc_have_libnl=yes +LIBNL_ROOT=`readlink -f ../libnl*/` +if test -d "$LIBNL_ROOT"; then + CFLAGS="$CFLAGS -I ${LIBNL_ROOT}/include" + LDFLAGS="$LDFLAGS -L${LIBNL_ROOT}/lib" +fi + +if test x"$ensc_have_libnl" = xyes; then + AC_CHECK_HEADERS([asm/types.h stdint.h], [ : ], + [ ensc_have_libnl=no ]) + AC_CHECK_TYPES([__s64], [ : ], [AC_DEFINE_UNQUOTED([__s64], [int64_t], + [Define this to a signed 64-bit integer type])], + [AC_INCLUDES_DEFAULT() +#ifdef HAVE_ASM_TYPES_H +# include <asm/types.h> +#endif]) + AC_CHECK_TYPES([__u64], [ : ], [AC_DEFINE_UNQUOTED([__u64], [uint64_t], + [Define this to an unsigned 64-bit integer type])], + [AC_INCLUDES_DEFAULT() +#ifdef HAVE_ASM_TYPES_H +# include <asm/types.h> +#endif]) +fi + +if test x"$ensc_have_libnl" = xyes; then + AC_CHECK_HEADERS([netlink/netlink.h netlink/route/addr.h], [ : ], + [ ensc_have_libnl=no ], [AC_INCLUDES_DEFAULT() +#ifdef HAVE_ASM_TYPES_H +# include <asm/types.h> +#endif] + ) +fi + +if test x"$ensc_have_libnl" = xyes; then + AC_CHECK_LIB(nl, nlmsg_get_src,
[ : ], + [ ensc_have_libnl=no ]) +fi + +if test x"$ensc_have_libnl" != xyes; then + AC_MSG_WARN([ +**** +**** 'libnl' could not be found; +**** this will disable the build of 'vip6-autod' +****]) +fi + +AM_CONDITIONAL(ENSC_HAVE_LIBNL, test x"$ensc_have_libnl" = xyes) + +dnl +dnl libnl stuff ends here} +dnl +dnl ####################### + +dnl +dnl Get python includes +dnl + +AC_MSG_CHECKING([for python includes]) +PYTHON_INCLUDES=`python-config --includes 2>/dev/null` +if test x"$PYTHON_INCLUDES" = x; then + python_inc_plat=`$PYTHON -c "from distutils.sysconfig import get_python_inc; print get_python_inc(1)"` + python_inc=`$PYTHON -c "from distutils.sysconfig import get_python_inc; print get_python_inc()"` + if test x"$python_inc_plat" != x; then + PYTHON_INCLUDES="$PYTHON_INCLUDES -I$python_inc_plat" + fi + if test x"$python_inc" != x; then + PYTHON_INCLUDES="$PYTHON_INCLUDES -I$python_inc" + fi +fi +AC_MSG_RESULT([$PYTHON_INCLUDES]) +AC_SUBST(PYTHON_INCLUDES) + + +dnl +dnl Figure out util-vserver directories +dnl + +old_PATH="$PATH" +PATH="$PATH:/sbin:/usr/sbin:/usr/local/sbin" + +AC_MSG_CHECKING([for vserver-Rootdir]) +DEFAULT_VSERVERDIR=`vserver-info 2>/dev/null | awk '$1 == "vserver-Rootdir:" { print $2 }'` +if test x"$DEFAULT_VSERVERDIR" = x; then + AC_MSG_ERROR([No vserver-Rootdir could be found!]) +fi +AC_MSG_RESULT([$DEFAULT_VSERVERDIR]) +AC_DEFINE_UNQUOTED([DEFAULT_VSERVERDIR], ["$DEFAULT_VSERVERDIR"], + [Define this to the path where your guests live]) + +AC_MSG_CHECKING([for util-vserver-vars]) +UV_PREFIX=`vserver-info 2>/dev/null | awk '$1 == "prefix:" { print $2 }'` +UV_PKGLIBDIR=`echo "$UV_PREFIX"/lib*/util-vserver` +if test x"$UV_PREFIX" = x -o ! -d "$UV_PKGLIBDIR" -o ! 
-r "$UV_PKGLIBDIR/util-vserver-vars"; then + AC_MSG_ERROR([No util-vserver-vars could be found in $UV_PKGLIBDIR]) +fi +AC_MSG_RESULT([$UV_PKGLIBDIR]) +AC_SUBST(UV_PKGLIBDIR) + + +PATH="$old_PATH" + + +AC_DEFINE_UNQUOTED([LOCALSTATEDIR], ["$localstatedir"], + [Define this to the local state directory]) +AC_ARG_WITH([initrddir], [AC_HELP_STRING([--with-initrddir ], + [use as directory for SysV init-files (default: $sysconfdir/init.d)])], + [case "$withval" in + yes|no) AC_MSG_ERROR(['$withval' is not a valid value for '--with-initrddir']);; + *) sysvdir="$withval";; + esac], + [sysvdir="${sysconfdir}/init.d"]) +AC_SUBST(sysvdir) + + +AC_DEFINE(_FILE_OFFSET_BITS, [64], [Use 64bit interface for filesystem operations]) + +AC_CONFIG_FILES([util-vserver-pl.spec Makefile]) +AC_OUTPUT diff --git a/man/vsh.8 b/man/vsh.8 new file mode 100644 index 0000000..85f914c --- /dev/null +++ b/man/vsh.8 @@ -0,0 +1,48 @@ +.de Sh \" Subsection +.br +.if t .Sp +.ne 5 +.PP +\fB\\$1\fR +.PP +.. +.de Sp \" Vertical space (when we can't use .PP) +.if t .sp .5v +.if n .sp +.. +.de Ip \" List item +.br +.ie \\n(.$>=3 .ne \\$3 +.el .ne 3 +.IP "\\$1" \\$2 +.. +.TH "VSH" 8 "2004-07-29" "PlanetLab specific Vserver shell" "vsh" + +.SH NAME +vsh \- Safely trampolines a slice user from global vserver context to +the correspondingly named local vserver context\&. + +.SH "SYNOPSIS" + +.PP +\fBvsh\fR [COMMAND] \fR + +.TP +[COMMAND] +when specified, the command to run, otherwise vsh will just run the +user's shell as a login shell + +.SH "SUMMARY" + +.PP +vsh is used as the login shell for slice users in the global vserver's +/etc/passwd\&. It is invoked either by sshd when a slice user logs on +to a PlanetLab node or as root using the su command\&. When this +occurs, vsh switches vserver context and sets the uid/gid of the slice +user, as specified in the slice's vserver /etc/passwd\&.
+ +.PP +vsh assumes that the same account (by name) used to ssh/su into the +vserver also exists in the vserver specific /etc/passwd file\&. This +is the only reason that for now it is deemed to be PlanetLab +specific\&. diff --git a/python/bwlimit b/python/bwlimit new file mode 100755 index 0000000..0630377 --- /dev/null +++ b/python/bwlimit @@ -0,0 +1,6 @@ +#!/usr/bin/python + +import bwlimit + +if __name__ == '__main__': + bwlimit.main() diff --git a/python/bwlimit.py b/python/bwlimit.py new file mode 100644 index 0000000..8221e7b --- /dev/null +++ b/python/bwlimit.py @@ -0,0 +1,724 @@ +#!/usr/bin/python +# +# Bandwidth limit module for PlanetLab nodes. The intent is to use the +# Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow +# slices to fairly share access to available node bandwidth. We +# currently define three classes of "available node bandwidth": +# +# 1. Available hardware bandwidth (bwmax): The maximum rate of the +# hardware. +# +# 2. Available capped bandwidth (bwcap): The maximum rate allowed to +# non-exempt destinations. By default, equal to bwmax, but may be +# lowered by PIs. +# +# 3. Available uncapped ("exempt") bandwidth: The difference between +# bwmax and what is currently being used of bwcap, or the maximum rate +# allowed to destinations exempt from caps (e.g., Internet2). +# +# All three classes of bandwidth are fairly shared according to the +# notion of "shares". For instance, if the node is capped at 5 Mbps, +# there are N slices, and each slice has 1 share, then each slice +# should get at least 5/N Mbps of bandwidth. How HTB is implemented +# makes this statement a little too simplistic. What it really means +# is that during any single time period, only a certain number of +# bytes can be sent onto the wire. Each slice is guaranteed that at +# least some small number of its bytes will be sent. Whatever is left +# over from the budget, is split in proportion to the number of shares +# each slice has. 
+# +# Even if the node is not capped at a particular limit (bwcap == +# bwmax), this module enforces fair share access to bwmax. Also, if +# the node is capped at a particular limit, rules may optionally be +# defined that classify certain packets into the "exempt" class. This +# class receives whatever bandwidth is leftover between bwcap and +# bwmax; slices fairly share this bandwidth as well. +# +# The root context is exempt from sharing and can send as much as it +# needs to. +# +# Some relevant URLs: +# +# 1. http://lartc.org/howto for how to use tc +# 2. http://luxik.cdi.cz/~devik/qos/htb/ for info on HTB +# +# Andy Bavier +# Mark Huang +# Copyright (C) 2006 The Trustees of Princeton University +# +# $Id: bwlimit.py,v 1.15 2007/02/07 04:21:11 mlhuang Exp $ +# + +import sys, os, re, getopt +from sets import Set +import pwd + + +# Where the tc binary lives +TC = "/sbin/tc" + +# Default interface +dev = "eth0" + +# Verbosity level +verbose = 0 + +# bwmin should be small enough that it can be considered negligibly +# slow compared to the hardware. 8 bits/second appears to be the +# smallest value supported by tc. +bwmin = 8 + +# bwmax should be large enough that it can be considered at least as +# fast as the hardware. +bwmax = 1000*1000*1000 + +# quantum is the maximum number of bytes that can be borrowed by a +# share (or slice, if each slice gets 1 share) in one time period +# (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth +# above their guarantees, and each is attempting to borrow up to the +# node bandwidth cap, quantums control how the excess bandwidth is +# distributed. Slices with 2 shares will borrow twice the amount in +# one time period as slices with 1 share, so averaged over time, they +# will get twice as much of the excess bandwidth. The value should be +# as small as possible and at least 1 MTU. 
By default, it would be +# calculated as bwmin/10, but since we use such small a value for +# bwmin, it's better to just set it to a value safely above 1 Ethernet +# MTU. +quantum = 1600 + +# cburst is the maximum number of bytes that can be burst onto the +# wire in one time period (with HZ=1000, 1 ms). If multiple slices +# have data queued for transmission, cbursts control how long each +# slice can have the wire for. If not specified, it is set to the +# smallest possible value that would enable the slice's "ceil" rate +# (usually the node bandwidth cap), to be reached if a slice was able +# to borrow enough bandwidth to do so. For now, it's unclear how or if +# to relate this to the notion of shares, so just let tc set the +# default. +cburst = None + +# There is another parameter that controls how bandwidth is allocated +# between slices on nodes that is outside the scope of HTB. We enforce +# a 16 GByte/day total limit on each slice, which works out to about +# 1.5mbit. If a slice exceeds this byte limit before the day finishes, +# it is capped at (i.e., its "ceil" rate is set to) the smaller of the +# node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this +# rule and executes this script to override "ceil". + +# We support multiple bandwidth limits, by reserving the top nibble of +# the minor classid to be the "subclassid". Theoretically, we could +# support up to 15 subclasses, but for now, we only define two: the +# "default" subclass 1:10 that is capped at the node bandwidth cap (in +# this example, 5mbit) and the "exempt" subclass 1:20 that is capped +# at bwmax (i.e., not capped). The 1:1 parent class exists only to +# make the borrowing model work. 
All bandwidth above minimum +# guarantees is fairly shared (in this example, slice 2 is guaranteed +# at least 1mbit in addition to fair access to the rest), subject to +# the restrictions of the class hierarchy: namely, that the total +# bandwidth to non-exempt destinations should not exceed the node +# bandwidth cap. +# +# 1: +# | +# 1:1 (1gbit) +# ______________|_____________ +# | | +# 1:10 (8bit, 5mbit) 1:20 (8bit, 1gbit) +# | | +# 1:1000 (8bit, 5mbit), 1:2000 (8bit, 1gbit), +# 1:1001 (8bit, 5mbit), 1:2001 (8bit, 1gbit), +# 1:1002 (1mbit, 5mbit), 1:2002 (1mbit, 1gbit), +# ... ... +# 1:1FFF (8bit, 5mbit) 1:2FFF (8bit, 1gbit) +# +default_minor = 0x1000 +exempt_minor = 0x2000 + +# root_xid is for the root context. The root context is exempt from +# fair sharing in both the default and exempt subclasses. The root +# context gets 5 shares by default. +root_xid = 0x0000 +root_share = 5 + +# default_xid is for unclassifiable packets. Packets should not be +# classified here very often. They can be if a slice's HTB classes are +# deleted before its processes are. Each slice gets 1 share by +# default. +default_xid = 0x0FFF +default_share = 1 + +# See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be +# warned that older versions of tc interpret "kbps", "mbps", "mbit", +# and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and +# "kibit" and that if an older version is installed, all rates will +# be off by a small fraction. 
+suffixes = { + "": 1, + "bit": 1, + "kibit": 1024, + "kbit": 1000, + "mibit": 1024*1024, + "mbit": 1000000, + "gibit": 1024*1024*1024, + "gbit": 1000000000, + "tibit": 1024*1024*1024*1024, + "tbit": 1000000000000, + "bps": 8, + "kibps": 8*1024, + "kbps": 8000, + "mibps": 8*1024*1024, + "mbps": 8000000, + "gibps": 8*1024*1024*1024, + "gbps": 8000000000, + "tibps": 8*1024*1024*1024*1024, + "tbps": 8000000000000 +} + + +def get_tc_rate(s): + """ + Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second + """ + + if type(s) == int: + return s + m = re.match(r"([0-9.]+)(\D*)", s) + if m is None: + return -1 + suffix = m.group(2).lower() + if suffixes.has_key(suffix): + return int(float(m.group(1)) * suffixes[suffix]) + else: + return -1 + + +def format_tc_rate(rate): + """ + Formats a bits/second rate into a tc rate string + """ + + if rate >= 1000000000 and (rate % 1000000000) == 0: + return "%.0fgbit" % (rate / 1000000000.) + elif rate >= 1000000 and (rate % 1000000) == 0: + return "%.0fmbit" % (rate / 1000000.) + elif rate >= 1000: + return "%.0fkbit" % (rate / 1000.) 
+ else: + return "%.0fbit" % rate + + +# Parse /etc/planetlab/bwcap (or equivalent) +def read_bwcap(bwcap_file): + bwcap = bwmax + try: + fp = open(bwcap_file, "r") + line = fp.readline().strip() + if line: + bwcap = get_tc_rate(line) + except: + pass + if bwcap == -1: + bwcap = bwmax + return bwcap + + +def get_bwcap(dev = dev): + """ + Get the current (live) value of the node bandwidth cap + """ + + state = tc("-d class show dev %s" % dev) + base_re = re.compile(r"class htb 1:10 parent 1:1 .*ceil ([^ ]+) .*") + base_classes = filter(None, map(base_re.match, state)) + if not base_classes: + return -1 + if len(base_classes) > 1: + raise Exception, "unable to get current bwcap" + return get_tc_rate(base_classes[0].group(1)) + + +def get_slice(xid): + """ + Get slice name ("princeton_mlh") from slice xid (500) + """ + + if xid == root_xid: + return "root" + if xid == default_xid: + return "default" + try: + return pwd.getpwuid(xid).pw_name + except KeyError: + pass + + return None + +def get_xid(slice): + """ + Get slice xid ("princeton_mlh") from slice name ("500" or "princeton_mlh") + """ + + if slice == "root": + return root_xid + if slice == "default": + return default_xid + try: + try: + return int(slice) + except ValueError: + pass + return pwd.getpwnam(slice).pw_uid + except KeyError: + pass + + return None + +def run(cmd, input = None): + """ + Shortcut for running a shell command + """ + + try: + if verbose: + sys.stderr.write("Executing: " + cmd + "\n") + if input is None: + fileobj = os.popen(cmd, "r") + output = fileobj.readlines() + else: + fileobj = os.popen(cmd, "w") + fileobj.write(input) + output = None + if fileobj.close() is None: + return output + except Exception, e: + pass + return None + + +def tc(cmd): + """ + Shortcut for running a tc command + """ + + return run(TC + " " + cmd) + + +def init(dev = dev, bwcap = bwmax): + """ + (Re)initialize the bandwidth limits on this node + """ + + # Load the module used to manage exempt classes + 
run("/sbin/modprobe ip_set_iphash") + + # Save current settings + paramslist = get(None, dev) + + # Delete root qdisc 1: if it exists. This will also automatically + # delete any child classes. + for line in tc("qdisc show dev %s" % dev): + # Search for the root qdisc 1: + m = re.match(r"qdisc htb 1:", line) + if m is not None: + tc("qdisc del dev %s root handle 1:" % dev) + break + + # Initialize HTB. The "default" clause specifies that if a packet + # fails classification, it should go into the class with handle + # 1FFF. + tc("qdisc add dev %s root handle 1: htb default %x" % \ + (dev, default_minor | default_xid)) + + # Set up a parent class from which all subclasses borrow. + tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \ + (dev, bwmax)) + + # Set up a subclass that represents the node bandwidth cap. We + # allow each slice to borrow up to this rate, so it is also + # usually the "ceil" rate for each slice. + tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \ + (dev, bwmin, bwcap)) + + # Set up a subclass that represents "exemption" from the node + # bandwidth cap. Once the node bandwidth cap is reached, bandwidth + # to exempt destinations can still be fairly shared up to bwmax. + tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \ + (dev, bwmin, bwmax)) + + # Set up the root class (and tell VNET what it is). Packets sent + # by root end up here and are capped at the node bandwidth + # cap. + #on(root_xid, dev, share = root_share) + #try: + # file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid)) + #except: + # pass + + # Set up the default class. Packets that fail classification end + # up here. 
+ on(default_xid, dev, share = default_share) + + # Restore old settings + for (xid, share, + minrate, maxrate, + minexemptrate, maxexemptrate, + bytes, exemptbytes) in paramslist: + if xid not in (root_xid, default_xid): + on(xid, dev, share, minrate, maxrate, minexemptrate, maxexemptrate) + + +def get(xid = None, dev = dev): + """ + Get the bandwidth limits and current byte totals for a + particular slice xid as a tuple (xid, share, minrate, maxrate, + minexemptrate, maxexemptrate, bytes, exemptbytes), or all classes + as a list of such tuples. + """ + + if xid is None: + ret = [] + else: + ret = None + + rates = {} + rate = None + + # ... + # class htb 1:1000 parent 1:10 leaf 1000: prio 0 quantum 8000 rate 8bit ceil 10000Kbit ... + # Sent 6851486 bytes 49244 pkt (dropped 0, overlimits 0 requeues 0) + # ... + # class htb 1:2000 parent 1:20 leaf 2000: prio 0 quantum 8000 rate 8bit ceil 1000Mbit ... + # Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) + # ... + for line in tc("-s -d class show dev %s" % dev): + # Rate parameter line + params = re.match(r"class htb 1:([0-9a-f]+) parent 1:(10|20)", line) + # Statistics line + stats = re.match(r".* Sent ([0-9]+) bytes", line) + # Another class + ignore = re.match(r"class htb", line) + + if params is not None: + # Which class + if params.group(2) == "10": + min = 'min' + max = 'max' + bytes = 'bytes' + else: + min = 'minexempt' + max = 'maxexempt' + bytes = 'exemptbytes' + + # Slice ID + id = int(params.group(1), 16) & 0x0FFF; + + if rates.has_key(id): + rate = rates[id] + else: + rate = {'id': id} + + # Parse share + rate['share'] = 1 + m = re.search(r"quantum (\d+)", line) + if m is not None: + rate['share'] = int(m.group(1)) / quantum + + # Parse minrate + rate[min] = bwmin + m = re.search(r"rate (\w+)", line) + if m is not None: + rate[min] = get_tc_rate(m.group(1)) + + # Parse maxrate + rate[max] = bwmax + m = re.search(r"ceil (\w+)", line) + if m is not None: + rate[max] = get_tc_rate(m.group(1)) + + # 
Which statistics to parse + rate['stats'] = bytes + + rates[id] = rate + + elif stats is not None: + if rate is not None: + rate[rate['stats']] = int(stats.group(1)) + + elif ignore is not None: + rate = None + + # Keep parsing until we get everything + if rate is not None and \ + rate.has_key('min') and rate.has_key('minexempt') and \ + rate.has_key('max') and rate.has_key('maxexempt') and \ + rate.has_key('bytes') and rate.has_key('exemptbytes'): + params = (rate['id'], rate['share'], + rate['min'], rate['max'], + rate['minexempt'], rate['maxexempt'], + rate['bytes'], rate['exemptbytes']) + if xid is None: + # Return a list of parameters + ret.append(params) + rate = None + elif xid == rate['id']: + # Return the parameters for this class + ret = params + break + + return ret + + +def on(xid, dev = dev, share = None, minrate = None, maxrate = None, minexemptrate = None, maxexemptrate = None): + """ + Apply specified bandwidth limit to the specified slice xid + """ + + # Get defaults from current state if available + cap = get(xid, dev) + if cap is not None: + if share is None: + share = cap[1] + if minrate is None: + minrate = cap[2] + if maxrate is None: + maxrate = cap[3] + if minexemptrate is None: + minexemptrate = cap[4] + if maxexemptrate is None: + maxexemptrate = cap[5] + + # Figure out what the current node bandwidth cap is + bwcap = get_bwcap() + + # Set defaults + if share is None: + share = default_share + if minrate is None: + minrate = bwmin + else: + minrate = get_tc_rate(minrate) + if maxrate is None: + maxrate = bwcap + else: + maxrate = get_tc_rate(maxrate) + if minexemptrate is None: + minexemptrate = minrate + else: + minexemptrate = get_tc_rate(minexemptrate) + if maxexemptrate is None: + maxexemptrate = bwmax + else: + maxexemptrate = get_tc_rate(maxexemptrate) + + # Sanity checks + if maxrate < bwmin: + maxrate = bwmin + if maxrate > bwcap: + maxrate = bwcap + if minrate < bwmin: + minrate = bwmin + if minrate > maxrate: + minrate = maxrate 
+ if maxexemptrate < bwmin: + maxexemptrate = bwmin + if maxexemptrate > bwmax: + maxexemptrate = bwmax + if minexemptrate < bwmin: + minexemptrate = bwmin + if minexemptrate > maxexemptrate: + minexemptrate = maxexemptrate + + # Set up subclasses for the slice + tc("class replace dev %s parent 1:10 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \ + (dev, default_minor | xid, minrate, maxrate, share * quantum)) + + tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \ + (dev, exempt_minor | xid, minexemptrate, maxexemptrate, share * quantum)) + + # Attach a FIFO to each subclass, which helps to throttle back + # processes that are sending faster than the token buckets can + # support. + tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \ + (dev, default_minor | xid, default_minor | xid)) + + tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \ + (dev, exempt_minor | xid, exempt_minor | xid)) + + +def set(xid, share = None, minrate = None, maxrate = None, minexemptrate = None, maxexemptrate = None): + on(xid = xid, share = share, + minrate = minrate, maxrate = maxrate, + minexemptrate = minexemptrate, maxexemptrate = maxexemptrate) + + +# Remove class associated with specified slice xid. If further packets +# are seen from this slice, they will be classified into the default +# class 1:1FFF. +def off(xid, dev = dev): + """ + Remove class associated with specified slice xid. If further + packets are seen from this slice, they will be classified into the + default class 1:1FFF. + """ + + cap = get(xid, dev) + if cap is not None: + tc("class del dev %s classid 1:%x" % (dev, default_minor | xid)) + tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid)) + + +def exempt_init(group_name, node_ips): + """ + Initialize the list of destinations exempt from the node bandwidth + (burst) cap. 
+ """ + + # Clean up (the netfilter table name is lowercase "mangle") + iptables = "/sbin/iptables -t mangle %s POSTROUTING" + run(iptables % "-F") + run("/sbin/ipset -X " + group_name) + + # Create a hashed IP set of all of these destinations + lines = ["-N %s iphash" % group_name] + add_cmd = "-A %s " % group_name + lines += [(add_cmd + ip) for ip in node_ips] + lines += ["COMMIT"] + restore = "\n".join(lines) + "\n" + run("/sbin/ipset -R", restore) + + # Add rule to match on destination IP set + run((iptables + " -m set --set %s dst -j CLASSIFY --set-class 1:%x") % + ("-A", group_name, exempt_minor)) + + +def usage(): + bwcap_description = format_tc_rate(get_bwcap()) + + print """ +Usage: + +%s [OPTION]... [COMMAND] [ARGUMENT]... + +Options: + -d device Network interface (default: %s) + -r rate Node bandwidth cap (default: %s) + -q quantum Share multiplier (default: %d bytes) + -n Print rates in numeric bits per second + -v Enable verbose debug messages + -h This message + +Commands: + init + (Re)initialize all bandwidth parameters + on slice [share|-] [minrate|-] [maxrate|-] [minexemptrate|-] [maxexemptrate|-] + Set bandwidth parameter(s) for the specified slice + off slice + Remove all bandwidth parameters for the specified slice + get + Get all bandwidth parameters for all slices + get slice + Get bandwidth parameters for the specified slice +""" % (sys.argv[0], dev, bwcap_description, quantum) + sys.exit(1) + + +def main(): + global dev, quantum, verbose + + # Defaults + numeric = False + bwcap = get_bwcap() + + (opts, argv) = getopt.getopt(sys.argv[1:], "d:nr:q:vh") + for (opt, optval) in opts: + if opt == '-d': + dev = optval + elif opt == '-n': + numeric = True + elif opt == '-r': + bwcap = get_tc_rate(optval) + elif opt == '-q': + quantum = int(optval) + elif opt == '-v': + verbose += 1 + elif opt == '-h': + usage() + + if len(argv): + if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1): + # (Re)initialize + init(dev, get_tc_rate(bwcap)) + + elif argv[0] == "get" or argv[0] == "show": + # Show
+ if len(argv) >= 2: + # Show a particular slice + xid = get_xid(argv[1]) + if xid is None: + sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1]) + usage() + params = get(xid, dev) + if params is None: + paramslist = [] + else: + paramslist = [params] + else: + # Show all slices + paramslist = get(None, dev) + + for (xid, share, + minrate, maxrate, + minexemptrate, maxexemptrate, + bytes, exemptbytes) in paramslist: + slice = get_slice(xid) + if slice is None: + # Orphaned (not associated with a slice) class + slice = "%d?" % xid + if numeric: + print "%s %d %d %d %d %d %d %d" % \ + (slice, share, + minrate, maxrate, + minexemptrate, maxexemptrate, + bytes, exemptbytes) + else: + print "%s %d %s %s %s %s %d %d" % \ + (slice, share, + format_tc_rate(minrate), format_tc_rate(maxrate), + format_tc_rate(minexemptrate), format_tc_rate(maxexemptrate), + bytes, exemptbytes) + + elif len(argv) >= 2: + # slice, ... + xid = get_xid(argv[1]) + if xid is None: + sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1]) + usage() + + if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace" or argv[0] == "set": + # Enable cap + args = [] + if len(argv) >= 3: + # ... share, minrate, maxrate, minexemptrate, maxexemptrate + casts = [int, get_tc_rate, get_tc_rate, get_tc_rate, get_tc_rate] + for i, arg in enumerate(argv[2:]): + if i >= len(casts): + break + if arg == "-": + args.append(None) + else: + args.append(casts[i](arg)) + on(xid, dev, *args) + + elif argv[0] == "off" or argv[0] == "del": + # Disable cap + off(xid, dev) + + else: + usage() + + else: + usage() + + +if __name__ == '__main__': + main() diff --git a/python/disklimit b/python/disklimit new file mode 100755 index 0000000..f8c9787 --- /dev/null +++ b/python/disklimit @@ -0,0 +1,54 @@ +#!/bin/env python + +import vserver, sys, os, re, getopt + +def usage(): + print """ +Usage: + +%s [OPTION]... [COMMAND] [ARGUMENT]... 
def main():
    """
    Command-line entry point for disklimit.

    Dispatches "get slice" and "set slice space" to the matching handler,
    prints whatever the handler returns (if anything) and exits with
    status 0.  A missing or unknown command, or too few arguments for the
    command, shows the usage text instead; usage() exits with status 1.
    """
    # command name -> (handler, number of arguments the handler indexes)
    commands = {"get": (get, 1), "set": (set, 2)}

    argv = sys.argv[1:]
    if not argv or argv[0] not in commands:
        # usage() takes no arguments, so it must not go through the same
        # call path as the handlers.
        usage()

    (func, nargs) = commands[argv[0]]
    args = argv[1:]
    if len(args) < nargs:
        # Without this check the handlers fail with IndexError.
        usage()

    result = func(args)
    if result is not None:
        print(result)
    sys.exit(0)
class VServerConfig:
    """
    File-backed key/value store for one vserver configuration directory.

    Each option is a file below `directory`: the option name is the path
    relative to that directory and the value is the file content with
    trailing whitespace stripped.  Once cache_it() has produced a non-empty
    cache, get() is answered from memory and update()/unset() do nothing.
    """

    # Spelled with int(..., 8) so the familiar octal modes stay readable
    # without depending on one particular octal-literal syntax.
    _UMASK = int("022", 8)
    _DIR_MODE = int("755", 8)

    def __init__(self, name, directory):
        """Raise NoSuchVServer unless `directory` is a directory we can
        read, write and search."""
        self.name = name
        self.dir = directory
        self.cache = None
        if not (os.path.isdir(self.dir) and
                os.access(self.dir, os.R_OK | os.W_OK | os.X_OK)):
            raise NoSuchVServer("%s does not exist" % self.dir)

    def get(self, option, default = None):
        """
        Return the value of `option`.

        If the option cannot be read, return `default` when it is not None,
        otherwise raise KeyError.
        """
        try:
            if self.cache:
                return self.cache[option]
            f = open(os.path.join(self.dir, option), "r")
            try:
                return f.read().rstrip()
            finally:
                f.close()
        except (IOError, OSError, KeyError):
            if default is not None:
                return default
            raise KeyError("Key %s is not set for %s" % (option, self.name))

    def update(self, option, value):
        """
        Write `value` to the option file, creating parent directories.

        A list is written one element per line.  Errors from opening or
        writing the file propagate; the process umask is restored and the
        file is closed either way.
        """
        if self.cache:
            return

        filename = os.path.join(self.dir, option)
        old_umask = os.umask(self._UMASK)
        try:
            try:
                os.makedirs(os.path.dirname(filename), self._DIR_MODE)
            except OSError:
                # Usually "already exists"; a genuine failure shows up at
                # open() just below.
                pass
            f = open(filename, 'w')
            try:
                if isinstance(value, list):
                    f.write("%s\n" % "\n".join(value))
                else:
                    f.write("%s\n" % value)
            finally:
                f.close()
        finally:
            os.umask(old_umask)

    def unset(self, option):
        """
        Remove the option file and prune parent directories left empty.

        Returns True if the file was removed, False if it could not be, and
        None when a cache is active (nothing is touched in that case).
        """
        if self.cache:
            return

        filename = os.path.join(self.dir, option)
        try:
            os.unlink(filename)
        except OSError:
            return False
        try:
            os.removedirs(os.path.dirname(filename))
        except OSError:
            # Parent still has other entries; nothing to prune.
            pass
        return True

    def cache_it(self):
        """
        Load every readable regular file below the directory into
        self.cache, keyed by its path relative to the directory.  Symbolic
        links are skipped.
        """
        self.cache = {}
        prefix = os.path.join(self.dir, '')
        for (dirname, subdirs, fnames) in os.walk(self.dir):
            # The list is only read, never modified, while iterating; removing
            # entries mid-loop would make the loop skip the following entry.
            for name in fnames:
                full_name = os.path.join(dirname, name)
                if os.path.islink(full_name):
                    continue
                if os.path.isfile(full_name) and os.access(full_name, os.R_OK):
                    f = open(full_name, "r")
                    try:
                        self.cache[full_name.replace(prefix, '', 1)] = f.read().rstrip()
                    finally:
                        f.close()
('/etc/rc.d/rc', '%(runlevel)d')] + + def __init__(self, name, vm_id = None, vm_running = None): + + self.name = name + self.rlimits_changed = False + self.dir = "%s/%s" % (vserverimpl.VSERVER_BASEDIR, name) + if not (os.path.isdir(self.dir) and + os.access(self.dir, os.R_OK | os.W_OK | os.X_OK)): + raise NoSuchVServer, "no such vserver: " + name + self.config = VServerConfig(name, "/etc/vservers/%s" % name) + self.remove_caps = ~vserverimpl.CAP_SAFE; + if vm_id == None: + vm_id = int(self.config.get('context')) + self.ctx = vm_id + if vm_running == None: + vm_running = self.is_running() + self.vm_running = vm_running + + def have_limits_changed(self): + return self.rlimits_changed + + def set_rlimit_limit(self,type,hard,soft,minimum): + """Generic set resource limit function for vserver""" + global RLIMITS + changed = False + try: + old_hard, old_soft, old_minimum = self.get_rlimit_limit(type) + if old_hard != VC_LIM_KEEP and old_hard <> hard: changed = True + if old_soft != VC_LIM_KEEP and old_soft <> soft: changed = True + if old_minimum != VC_LIM_KEEP and old_minimum <> minimum: changed = True + self.rlimits_changed = self.rlimits_changed or changed + except OSError, e: + if self.is_running(): print "Unexpected error with getrlimit for running context %d" % self.ctx + + resource_type = RLIMITS[type] + try: + ret = vserverimpl.setrlimit(self.ctx,resource_type,hard,soft,minimum) + except OSError, e: + if self.is_running(): print "Unexpected error with setrlimit for running context %d" % self.ctx + + def set_rlimit_config(self,type,hard,soft,minimum): + """Generic set resource limit function for vserver""" + if hard <> VC_LIM_KEEP: + self.config.update('rlimits/%s.hard' % type.lower(), hard) + if soft <> VC_LIM_KEEP: + self.config.update('rlimits/%s.soft' % type.lower(), soft) + if minimum <> VC_LIM_KEEP: + self.config.update('rlimits/%s.min' % type.lower(), minimum) + self.set_rlimit_limit(type,hard,soft,minimum) + + def get_rlimit_limit(self,type): + """Generic 
get resource configuration function for vserver""" + global RLIMITS + resource_type = RLIMITS[type] + try: + ret = vserverimpl.getrlimit(self.ctx,resource_type) + except OSError, e: + print "Unexpected error with getrlimit for context %d" % self.ctx + ret = self.get_rlimit_config(type) + return ret + + def get_rlimit_config(self,type): + """Generic get resource configuration function for vserver""" + hard = int(self.config.get("rlimits/%s.hard"%type.lower(),VC_LIM_KEEP)) + soft = int(self.config.get("rlimits/%s.soft"%type.lower(),VC_LIM_KEEP)) + minimum = int(self.config.get("rlimits/%s.min"%type.lower(),VC_LIM_KEEP)) + return (hard,soft,minimum) + + def set_capabilities(self, capabilities): + return vserverimpl.setbcaps(self.ctx, vserverimpl.text2bcaps(capabilities)) + + def set_capabilities_config(self, capabilities): + self.config.update('bcapabilities', capabilities) + self.set_capabilities(capabilities) + + def get_capabilities(self): + return vserverimpl.bcaps2text(vserverimpl.getbcaps(self.ctx)) + + def get_capabilities_config(self): + return self.config.get('bcapabilities', '') + + def set_ipaddresses(self, addresses): + vserverimpl.netremove(self.ctx, "all") + for a in addresses.split(","): + vserverimpl.netadd(self.ctx, a) + + def set_ipaddresses_config(self, addresses): + i = 0 + for a in addresses.split(","): + self.config.update("interfaces/%d/ip" % i, a) + i += 1 + while self.config.unset("interfaces/%d/ip" % i): + i += 1 + self.set_ipaddresses(addresses) + + def get_ipaddresses_config(self): + i = 0 + ret = [] + while True: + r = self.config.get("interfaces/%d/ip" % i, '') + if r == '': + break + ret += [r] + i += 1 + return ",".join(ret) + + def get_ipaddresses(self): + # No clean way to do this right now. 
+ return None + + def __do_chroot(self): + self.config.cache_it() + os.chroot(self.dir) + os.chdir("/") + + def chroot_call(self, fn, *args): + + cwd_fd = os.open(".", os.O_RDONLY) + try: + root_fd = os.open("/", os.O_RDONLY) + try: + self.__do_chroot() + result = fn(*args) + finally: + os.fchdir(root_fd) + os.chroot(".") + os.fchdir(cwd_fd) + os.close(root_fd) + finally: + os.close(cwd_fd) + return result + + def set_disklimit(self, block_limit): + # block_limit is in kB + if block_limit == 0: + try: + vserverimpl.unsetdlimit(self.dir, self.ctx) + except OSError, e: + print "Unexpected error with unsetdlimit for context %d" % self.ctx + return + + if self.vm_running: + block_usage = vserverimpl.DLIMIT_KEEP + inode_usage = vserverimpl.DLIMIT_KEEP + else: + # init_disk_info() must have been called to get usage values + block_usage = self.disk_blocks + inode_usage = self.disk_inodes + + + try: + vserverimpl.setdlimit(self.dir, + self.ctx, + block_usage, + block_limit, + inode_usage, + vserverimpl.DLIMIT_INF, # inode limit + 2) # %age reserved for root + except OSError, e: + print "Unexpected error with setdlimit for context %d" % self.ctx + + + self.config.update('dlimits/0/space_total', block_limit) + + def is_running(self): + return vserverimpl.isrunning(self.ctx) + + def get_disklimit(self): + + try: + (self.disk_blocks, block_limit, self.disk_inodes, inode_limit, + reserved) = vserverimpl.getdlimit(self.dir, self.ctx) + except OSError, ex: + if ex.errno != errno.ESRCH: + raise + # get here if no vserver disk limit has been set for xid + block_limit = -1 + + return block_limit + + def set_sched_config(self, cpu_share, sched_flags): + + """ Write current CPU scheduler parameters to the vserver + configuration file. This method does not modify the kernel CPU + scheduling parameters for this context. 
""" + + if sched_flags & SCHED_CPU_GUARANTEED: + cpu_guaranteed = cpu_share + else: + cpu_guaranteed = 0 + self.config.update('sched/fill-rate2', cpu_share) + self.config.update('sched/fill-rate', cpu_guaranteed) + + if self.vm_running: + self.set_sched(cpu_share, sched_flags) + + def set_sched(self, cpu_share, sched_flags = 0): + """ Update kernel CPU scheduling parameters for this context. """ + vserverimpl.setsched(self.ctx, cpu_share, sched_flags) + + def get_sched(self): + # have no way of querying scheduler right now on a per vserver basis + return (-1, False) + + def set_bwlimit(self, minrate = bwlimit.bwmin, maxrate = None, + exempt_min = None, exempt_max = None, + share = None, dev = "eth0"): + + if minrate is None: + bwlimit.off(self.ctx, dev) + else: + bwlimit.on(self.ctx, dev, share, + minrate, maxrate, exempt_min, exempt_max) + + def get_bwlimit(self, dev = "eth0"): + + result = bwlimit.get(self.ctx) + # result of bwlimit.get is (ctx, share, minrate, maxrate) + if result: + result = result[1:] + return result + + def open(self, filename, mode = "r", bufsize = -1): + + return self.chroot_call(open, filename, mode, bufsize) + + def __do_chcontext(self, state_file): + + if state_file: + print >>state_file, "%u" % self.ctx + state_file.close() + + if vserverimpl.chcontext(self.ctx, vserverimpl.text2bcaps(self.get_capabilities_config())): + self.set_resources() + vserverimpl.setup_done(self.ctx) + + def __prep(self, runlevel, log): + + """ Perform all the crap that the vserver script does before + actually executing the startup scripts. 
""" + + # remove /var/run and /var/lock/subsys files + # but don't remove utmp from the top-level /var/run + RUNDIR = "/var/run" + LOCKDIR = "/var/lock/subsys" + filter_fn = lambda fs: filter(lambda f: f != 'utmp', fs) + garbage = reduce((lambda (out, ff), (dir, subdirs, files): + (out + map((dir + "/").__add__, ff(files)), + lambda fs: fs)), + list(os.walk(RUNDIR)), + ([], filter_fn))[0] + garbage += filter(os.path.isfile, map((LOCKDIR + "/").__add__, + os.listdir(LOCKDIR))) + if False: + for f in garbage: + os.unlink(f) + + # set the initial runlevel + f = open(RUNDIR + "/utmp", "w") + vserverimpl.setrunlevel(f, runlevel) + f.close() + + # mount /proc and /dev/pts + self.__do_mount("none", self.dir, "/proc", "proc") + # XXX - magic mount options + self.__do_mount("none", self.dir, "/dev/pts", "devpts", 0, "gid=5,mode=0620") + + def __do_mount(self, *mount_args): + + try: + vserverimpl.mount(*mount_args) + except OSError, ex: + if ex.errno == errno.EBUSY: + # assume already mounted + return + raise ex + + def enter(self): + self.__do_chroot() + self.__do_chcontext(None) + + def start(self, wait, runlevel = 3): + self.vm_running = True + self.rlimits_changed = False + + child_pid = os.fork() + if child_pid == 0: + # child process + try: + # get a new session + os.setsid() + + # open state file to record vserver info + state_file = open("/var/run/vservers/%s" % self.name, "w") + + # use /dev/null for stdin, /var/log/boot.log for stdout/err + fd = os.open("/dev/null", os.O_RDONLY) + if fd != 0: + os.dup2(fd, 0) + os.close(fd) + self.__do_chroot() + log = open("/var/log/boot.log", "w", 0) + if log.fileno() != 1: + os.dup2(log.fileno(), 1) + os.dup2(1, 2) + + print >>log, ("%s: starting the virtual server %s" % + (time.asctime(time.gmtime()), self.name)) + + # perform pre-init cleanup + self.__prep(runlevel, log) + + # execute each init script in turn + # XXX - we don't support all scripts that vserver script does + self.__do_chcontext(state_file) + for cmd in 
self.INITSCRIPTS: + try: + # enter vserver context + arg_subst = { 'runlevel': runlevel } + cmd_args = [cmd[0]] + map(lambda x: x % arg_subst, + cmd[1:]) + print >>log, "executing '%s'" % " ".join(cmd_args) + os.spawnvp(os.P_NOWAIT,cmd[0],cmd_args) + except: + traceback.print_exc() + os._exit(1) + + # we get here due to an exception in the top-level child process + except Exception, ex: + traceback.print_exc() + os._exit(0) + + # parent process + return child_pid + + def set_resources(self): + + """ Called when vserver context is entered for first time, + should be overridden by subclass. """ + + pass + + def init_disk_info(self): + cmd = "/usr/sbin/vdu --script --space --inodes --blocksize 1024 --xid %d %s" % (self.ctx, self.dir) + p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + close_fds=True) + p.stdin.close() + line = p.stdout.readline() + if not line: + sys.stderr.write(p.stderr.read()) + p.stdout.close() + p.stderr.close() + ret = p.wait() + + (space, inodes) = line.split() + self.disk_inodes = int(inodes) + self.disk_blocks = int(space) + #(self.disk_inodes, self.disk_blocks) = vduimpl.vdu(self.dir) + + return self.disk_blocks * 1024 + + def stop(self, signal = signal.SIGKILL): + vserverimpl.killall(self.ctx, signal) + self.vm_running = False + self.rlimits_changed = False + + + +def create(vm_name, static = False, ctor = VServer): + + options = ['vuseradd'] + if static: + options += ['--static'] + ret = os.spawnvp(os.P_WAIT, 'vuseradd', options + [vm_name]) + if not os.WIFEXITED(ret) or os.WEXITSTATUS(ret) != 0: + out = "system command ('%s') " % options + if os.WIFEXITED(ret): + out += "failed, rc = %d" % os.WEXITSTATUS(ret) + else: + out += "killed by signal %d" % os.WTERMSIG(ret) + raise SystemError, out + vm_id = pwd.getpwnam(vm_name)[2] + + return ctor(vm_name, vm_id) diff --git a/python/vserverimpl.c b/python/vserverimpl.c new file mode 100644 index 0000000..71f0820 --- /dev/null +++ 
b/python/vserverimpl.c @@ -0,0 +1,746 @@ +/* Copyright 2005 Princeton University + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials provided +with the distribution. + +* Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived +from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PRINCETON +UNIVERSITY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "config.h" +#include "vserver.h" +#include "planetlab.h" + +static inline PyObject *inc_and_ret_none(void) +{ + Py_INCREF(Py_None); + return Py_None; +} + +#define NONE inc_and_ret_none() + +/* + * context create + */ +static PyObject * +vserver_chcontext(PyObject *self, PyObject *args) +{ + int ctx_is_new; + xid_t ctx; + uint_least64_t bcaps = 0; + + if (!PyArg_ParseTuple(args, "I|K", &ctx, &bcaps)) + return NULL; + bcaps |= ~(vc_get_insecurebcaps() | (1 << VC_CAP_NET_BIND_SERVICE)); + + if ((ctx_is_new = pl_chcontext(ctx, bcaps, 0)) < 0) + return PyErr_SetFromErrno(PyExc_OSError); + + return PyBool_FromLong(ctx_is_new); +} + +static PyObject * +vserver_setup_done(PyObject *self, PyObject *args) +{ + xid_t ctx; + + if (!PyArg_ParseTuple(args, "I", &ctx)) + return NULL; + + if (pl_setup_done(ctx) < 0) + return PyErr_SetFromErrno(PyExc_OSError); + + return NONE; +} + +static PyObject * +vserver_isrunning(PyObject *self, PyObject *args) +{ + xid_t ctx; + PyObject *ret; + struct stat statbuf; + char fname[64]; + + if (!PyArg_ParseTuple(args, "I", &ctx)) + return NULL; + + sprintf(fname,"/proc/virtual/%d", ctx); + + if(stat(&fname[0],&statbuf)==0) + ret = PyBool_FromLong(1); + else + ret = PyBool_FromLong(0); + + return ret; +} + +static PyObject * +__vserver_get_rlimit(xid_t xid, int resource) { + struct vc_rlimit limits; + PyObject *ret; + + errno = 0; + if (vc_get_rlimit(xid, resource, &limits)==-1) + ret = PyErr_SetFromErrno(PyExc_OSError); + else + ret = Py_BuildValue("LLL",limits.hard, limits.soft, limits.min); + + return ret; +} + +static PyObject * +vserver_get_rlimit(PyObject *self, PyObject *args) { + xid_t xid; + int resource; + PyObject *ret; + + if (!PyArg_ParseTuple(args, "Ii", &xid, &resource)) + ret = NULL; + else + ret = __vserver_get_rlimit(xid, resource); + + return ret; +} + +static 
/*
 * getdlimit(path, xid) ->
 *     (space_used, space_total, inodes_used, inodes_total, reserved)
 *
 * Raises OSError (from errno) when vc_get_dlimit() fails.
 */
static PyObject *
vserver_get_dlimit(PyObject *self, PyObject *args)
{
  struct vc_ctx_dlimit limits;
  unsigned ctx;
  char *dir;

  if (!PyArg_ParseTuple(args, "si", &dir, &ctx))
    return NULL;

  memset(&limits, 0, sizeof(limits));
  if (vc_get_dlimit(dir, ctx, 0, &limits) < 0)
    return PyErr_SetFromErrno(PyExc_OSError);

  return Py_BuildValue("(i,i,i,i,i)",
                       limits.space_used,
                       limits.space_total,
                       limits.inodes_used,
                       limits.inodes_total,
                       limits.reserved);
}
path; + unsigned xid; + struct vc_ctx_dlimit data; + + memset(&data,0,sizeof(data)); + if (!PyArg_ParseTuple(args, "siiiiii", &path, + &xid, + &data.space_used, + &data.space_total, + &data.inodes_used, + &data.inodes_total, + &data.reserved)) + return NULL; + + if ((vc_add_dlimit(path, xid, 0) && errno != EEXIST) || + vc_set_dlimit(path, xid, 0, &data)) + return PyErr_SetFromErrno(PyExc_OSError); + + return NONE; +} + +static PyObject * +vserver_unset_dlimit(PyObject *self, PyObject *args) +{ + char *path; + unsigned xid; + + if (!PyArg_ParseTuple(args, "si", &path, &xid)) + return NULL; + + if (vc_rem_dlimit(path, xid, 0) && errno != ESRCH) + return PyErr_SetFromErrno(PyExc_OSError); + + return NONE; +} + +static PyObject * +vserver_killall(PyObject *self, PyObject *args) +{ + xid_t ctx; + int sig; + struct vc_ctx_flags cflags = { + .flagword = 0, + .mask = VC_VXF_PERSISTENT + }; + struct vc_net_flags nflags = { + .flagword = 0, + .mask = VC_NXF_PERSISTENT + }; + + if (!PyArg_ParseTuple(args, "Ii", &ctx, &sig)) + return NULL; + + if (vc_ctx_kill(ctx, 0, sig) && errno != ESRCH) + return PyErr_SetFromErrno(PyExc_OSError); + + if (vc_set_cflags(ctx, &cflags) && errno != ESRCH) + return PyErr_SetFromErrno(PyExc_OSError); + + if (vc_set_nflags(ctx, &nflags) && errno != ESRCH) + return PyErr_SetFromErrno(PyExc_OSError); + + return NONE; +} + +static PyObject * +vserver_set_bcaps(PyObject *self, PyObject *args) +{ + xid_t ctx; + struct vc_ctx_caps caps; + + if (!PyArg_ParseTuple(args, "IK", &ctx, &caps.bcaps)) + return NULL; + + caps.bmask = vc_get_insecurebcaps(); + caps.cmask = caps.ccaps = 0; + if (vc_set_ccaps(ctx, &caps) == -1 && errno != ESRCH) + return PyErr_SetFromErrno(PyExc_OSError); + + return NONE; +} + +static PyObject * +vserver_text2bcaps(PyObject *self, PyObject *args) +{ + struct vc_ctx_caps caps = { .bcaps = 0 }; + const char *list; + int len; + struct vc_err_listparser err; + + if (!PyArg_ParseTuple(args, "s#", &list, &len)) + return NULL; + + 
vc_list2bcap(list, len, &err, &caps); + + return Py_BuildValue("K", caps.bcaps); +} + +static PyObject * +vserver_get_bcaps(PyObject *self, PyObject *args) +{ + xid_t ctx; + struct vc_ctx_caps caps; + + if (!PyArg_ParseTuple(args, "I", &ctx)) + return NULL; + + if (vc_get_ccaps(ctx, &caps) == -1) { + if (errno != -ESRCH) + return PyErr_SetFromErrno(PyExc_OSError); + else + caps.bcaps = 0; + } + + return Py_BuildValue("K", caps.bcaps & vc_get_insecurebcaps()); +} + +static PyObject * +vserver_bcaps2text(PyObject *self, PyObject *args) +{ + struct vc_ctx_caps caps = { .bcaps = 0 }; + PyObject *list; + const char *cap; + + if (!PyArg_ParseTuple(args, "K", &caps.bcaps)) + return NULL; + + list = PyString_FromString(""); + + while ((cap = vc_lobcap2text(&caps.bcaps)) != NULL) { + if (list == NULL) + break; + PyString_ConcatAndDel(&list, PyString_FromFormat( + (PyString_Size(list) > 0 ? ",CAP_%s" : "CAP_%s" ), + cap)); + } + + return list; +} + +static inline int +convert_address(const char *str, struct vc_net_addr *addr) +{ + void *dst; + if (inet_pton(AF_INET6, str, addr->vna_v6_ip.s6_addr) > 0) { + addr->vna_type = VC_NXA_TYPE_IPV6; + return 0; + } + else if (inet_pton(AF_INET, str, &addr->vna_v4_ip.s_addr) > 0) { + addr->vna_type = VC_NXA_TYPE_IPV4; + return 0; + } + return -1; +} + +static int +mask_to_prefix(void *data, int limit) +{ + uint8_t *mask = data; + int prefix; + for (prefix = 0; prefix < limit && mask[prefix >> 3] & (1 << (prefix & 0x07)); prefix++) + ; + return prefix; +} + +static int +get_mask(struct vc_net_addr *addr) +{ + struct ifaddrs *head, *ifa; + int ret = 0; + int family, offset, len; + void *ip; + + switch (addr->vna_type) { + case VC_NXA_TYPE_IPV4: + family = AF_INET; + offset = offsetof(struct sockaddr_in, sin_addr.s_addr); + ip = &addr->vna_v4_ip.s_addr; + len = 4; + addr->vna_v4_mask.s_addr = htonl(0xffffff00); + addr->vna_prefix = 24; + break; + case VC_NXA_TYPE_IPV6: + family = AF_INET6; + offset = offsetof(struct sockaddr_in6, 
sin6_addr.s6_addr); + ip = addr->vna_v6_ip.s6_addr; + len = 16; + addr->vna_v6_mask.s6_addr32[9] = addr->vna_v6_mask.s6_addr32[1] = 0xffffffff; + addr->vna_v6_mask.s6_addr32[2] = addr->vna_v6_mask.s6_addr32[3] = 0x00000000; + addr->vna_prefix = 64; + break; + default: + errno = -EINVAL; + return -1; + } + + if (getifaddrs(&head) == -1) + return -1; + for (ifa = head; ifa; ifa = ifa->ifa_next) { + if (ifa->ifa_addr->sa_family == family && + memcmp((char *) ifa->ifa_addr + offset, ip, len) == 0) { + switch (addr->vna_type) { + case VC_NXA_TYPE_IPV4: + memcpy(&addr->vna_v4_mask.s_addr, ifa->ifa_netmask + offset, len); + addr->vna_prefix = mask_to_prefix(&addr->vna_v4_mask.s_addr, 32); + break; + case VC_NXA_TYPE_IPV6: + memcpy(addr->vna_v6_mask.s6_addr, ifa->ifa_netmask + offset, len); + addr->vna_prefix = mask_to_prefix(addr->vna_v6_mask.s6_addr, 128); + break; + } + ret = 1; + break; + } + } + freeifaddrs(head); + return ret; +} + +/* XXX These two functions are really similar */ +static PyObject * +vserver_net_add(PyObject *self, PyObject *args) +{ + struct vc_net_addr addr; + nid_t nid; + const char *ip; + + if (!PyArg_ParseTuple(args, "Is", &nid, &ip)) + return NULL; + + if (convert_address(ip, &addr) == -1) + return PyErr_Format(PyExc_ValueError, "%s is not a valid IP address", ip); + + switch (get_mask(&addr)) { + case -1: + return PyErr_SetFromErrno(PyExc_OSError); + case 0: + /* XXX error here? 
*/ + break; + } + addr.vna_type |= VC_NXA_TYPE_ADDR; + + if (vc_net_add(nid, &addr) == -1 && errno != ESRCH) + return PyErr_SetFromErrno(PyExc_OSError); + + return NONE; +} + +static PyObject * +vserver_net_remove(PyObject *self, PyObject *args) +{ + struct vc_net_addr addr; + nid_t nid; + const char *ip; + + if (!PyArg_ParseTuple(args, "Is", &nid, &ip)) + return NULL; + + if (strcmp(ip, "all") == 0) + addr.vna_type = VC_NXA_TYPE_ANY; + else if (strcmp(ip, "all4") == 0) + addr.vna_type = VC_NXA_TYPE_IPV6 | VC_NXA_TYPE_ANY; + else if (strcmp(ip, "all6") == 0) + addr.vna_type = VC_NXA_TYPE_IPV6 | VC_NXA_TYPE_ANY; + else { + if (convert_address(ip, &addr) == -1) + return PyErr_Format(PyExc_ValueError, "%s is not a valid IP address", ip); + addr.vna_type |= VC_NXA_TYPE_ADDR; + } + + switch (get_mask(&addr)) { + case -1: + return PyErr_SetFromErrno(PyExc_OSError); + } + + if (vc_net_remove(nid, &addr) == -1 && errno != ESRCH) + return PyErr_SetFromErrno(PyExc_OSError); + + return NONE; +} + +struct secure_dirs { + int host_fd; + int cwd_fd; + int guest_fd; + int target_fd; +}; + +static inline int +fchroot(int fd) +{ + if (fchdir(fd) == -1 || chroot(".") == -1) + return -1; + return 0; +} + +static inline int +restore_dirs(struct secure_dirs *dirs) +{ + if (dirs->host_fd != -1) { + if (fchroot(dirs->host_fd) == -1) + return -1; + if (close(dirs->host_fd) == -1) + return -1; + } + if (dirs->guest_fd != -1) { + if (close(dirs->guest_fd) == -1) + return -1; + } + if (dirs->target_fd != -1) { + if (close(dirs->target_fd) == -1) + return -1; + } + if (dirs->cwd_fd != -1) { + if (fchdir(dirs->cwd_fd) == -1) + return -1; + if (close(dirs->cwd_fd) == -1) + return -1; + } + return 0; +} + +static inline int +secure_chdir(struct secure_dirs *dirs, const char *guest, const char *target) +{ + dirs->host_fd = dirs->cwd_fd = dirs->guest_fd = dirs->target_fd = -1; + + dirs->host_fd = open("/", O_RDONLY|O_DIRECTORY); + if (dirs->host_fd == -1) + return -1; + + dirs->cwd_fd = open(".", 
O_RDONLY|O_DIRECTORY); + if (dirs->cwd_fd == -1) + return -1; + + dirs->guest_fd = open(guest, O_RDONLY|O_DIRECTORY); + if (dirs->guest_fd == -1) + return -1; + if (fchroot(dirs->guest_fd) == -1) + return -1; + + dirs->target_fd = open(target, O_RDONLY|O_DIRECTORY); + if (dirs->target_fd == -1) + return -1; + + if (fchroot(dirs->host_fd) == -1 || close(dirs->host_fd) == -1) + return -1; + dirs->host_fd = -1; + if (close(dirs->guest_fd) == -1) + return -1; + dirs->guest_fd = -1; + + if (fchdir(dirs->target_fd) == -1 || close(dirs->target_fd) == -1) + return -1; + + return 0; +} + +static PyObject * +vserver_mount(PyObject *self, PyObject *args) +{ + const char *guest, *target, *source, *type, *data = NULL; + unsigned long flags = 0; + struct secure_dirs dirs; + + if (!PyArg_ParseTuple(args, "ssss|ks", &source, &guest, &target, &type, + &flags, &data)) + return NULL; + + if (secure_chdir(&dirs, guest, target) == -1) + goto out; + if (mount(source, ".", type, flags, data) == -1) + goto out; + restore_dirs(&dirs); + + return NONE; + +out: + restore_dirs(&dirs); + return PyErr_SetFromErrno(PyExc_OSError); +} + +static PyObject * +vserver_umount(PyObject *self, PyObject *args) +{ + const char *guest, *target; + int flags = 0; + char *path; + PyObject *ret; + + if (!PyArg_ParseTuple(args, "ss|i", &guest, &target, &flags)) + return NULL; + + path = calloc(strlen(guest) + strlen(target) + 2, sizeof(char)); + sprintf(path, "%s/%s", guest, target); + if (umount2(path, flags) == -1) + ret = PyErr_SetFromErrno(PyExc_OSError); + else + ret = NONE; + free(path); + + return ret; +} + +static PyObject * +vserver_set_runlevel(PyObject *self, PyObject *args) +{ + const char *file; + int runlevel; + struct utmp ut; + + if (!PyArg_ParseTuple(args, "si", &file, &runlevel)) + return NULL; + + utmpname(file); + setutent(); + memset(&ut, 0, sizeof(ut)); + ut.ut_type = RUN_LVL; + ut.ut_pid = ('#' << 8) + runlevel + '0'; + pututline(&ut); + endutent(); + + return NONE; +} + +static 
PyMethodDef methods[] = {
+  { "chcontext", vserver_chcontext, METH_VARARGS,
+    "chcontext to vserver with provided flags" },
+  { "setup_done", vserver_setup_done, METH_VARARGS,
+    "Release vserver setup lock" },
+  { "setsched", vserver_setsched, METH_VARARGS,
+    "Change vserver scheduling attributes for given vserver context" },
+  { "setdlimit", vserver_set_dlimit, METH_VARARGS,
+    "Set disk limits for given vserver context" },
+  { "unsetdlimit", vserver_unset_dlimit, METH_VARARGS,
+    "Remove disk limits for given vserver context" },
+  { "getdlimit", vserver_get_dlimit, METH_VARARGS,
+    "Get disk limits for given vserver context" },
+  { "setrlimit", vserver_set_rlimit, METH_VARARGS,
+    "Set resource limits for given resource of a vserver context" },
+  { "getrlimit", vserver_get_rlimit, METH_VARARGS,
+    "Get resource limits for given resource of a vserver context" },
+  { "killall", vserver_killall, METH_VARARGS,
+    "Send signal to all processes in vserver context" },
+  { "isrunning", vserver_isrunning, METH_VARARGS,
+    "Check if vserver is running"},
+  { "setbcaps", vserver_set_bcaps, METH_VARARGS,
+    "Set POSIX capabilities of a vserver context" },
+  { "getbcaps", vserver_get_bcaps, METH_VARARGS,
+    "Get POSIX capabilities of a vserver context" },
+  { "text2bcaps", vserver_text2bcaps, METH_VARARGS,
+    "Translate a string of capabilities to a bitmap" },
+  { "bcaps2text", vserver_bcaps2text, METH_VARARGS,
+    "Translate a capability-bitmap into a string" },
+  { "netadd", vserver_net_add, METH_VARARGS,
+    "Assign an IP address to a context" },
+  { "netremove", vserver_net_remove, METH_VARARGS,
+    "Remove IP address(es) from a context" },
+  { "mount", vserver_mount, METH_VARARGS,
+    "Perform the mount() system call" },
+  { "umount", vserver_umount, METH_VARARGS,
+    "Perform the umount2() system call" },
+  { "setrunlevel", vserver_set_runlevel, METH_VARARGS,
+    "Set the runlevel in utmp" },
+  { NULL, NULL, 0, NULL }  /* sentinel terminating the table */
+};
+
+PyMODINIT_FUNC
+initvserverimpl(void)  /* module init: registers methods[] and exports the constants below */
+{
+  PyObject *mod = Py_InitModule("vserverimpl", methods);
+
+  if (mod == NULL)  /* module creation failed: nothing to attach constants to */
+    return;
+  /* export the set of 'safe' capabilities */
+  PyModule_AddIntConstant(mod, "CAP_SAFE", ~vc_get_insecurebcaps());
+
+  /* export the default vserver directory */
+  PyModule_AddStringConstant(mod, "VSERVER_BASEDIR", DEFAULT_VSERVERDIR);
+
+  /* export limit-related constants */
+  PyModule_AddIntConstant(mod, "DLIMIT_KEEP", (int)VC_CDLIM_KEEP);
+  PyModule_AddIntConstant(mod, "DLIMIT_INF", (int)VC_CDLIM_INFINITY);
+  PyModule_AddIntConstant(mod, "VC_LIM_KEEP", (int)VC_LIM_KEEP);
+
+  PyModule_AddIntConstant(mod, "RLIMIT_CPU", (int)RLIMIT_CPU);
+  PyModule_AddIntConstant(mod, "RLIMIT_RSS", (int)RLIMIT_RSS);
+  PyModule_AddIntConstant(mod, "RLIMIT_NPROC", (int)RLIMIT_NPROC);
+  PyModule_AddIntConstant(mod, "RLIMIT_NOFILE", (int)RLIMIT_NOFILE);
+  PyModule_AddIntConstant(mod, "RLIMIT_MEMLOCK", (int)RLIMIT_MEMLOCK);
+  PyModule_AddIntConstant(mod, "RLIMIT_AS", (int)RLIMIT_AS);
+  PyModule_AddIntConstant(mod, "RLIMIT_LOCKS", (int)RLIMIT_LOCKS);
+
+  PyModule_AddIntConstant(mod, "RLIMIT_SIGPENDING", (int)RLIMIT_SIGPENDING);
+  PyModule_AddIntConstant(mod, "RLIMIT_MSGQUEUE", (int)RLIMIT_MSGQUEUE);
+
+  PyModule_AddIntConstant(mod, "VLIMIT_NSOCK", (int)VC_VLIMIT_NSOCK);
+  PyModule_AddIntConstant(mod, "VLIMIT_OPENFD", (int)VC_VLIMIT_OPENFD);
+  PyModule_AddIntConstant(mod, "VLIMIT_ANON", (int)VC_VLIMIT_ANON);
+  PyModule_AddIntConstant(mod, "VLIMIT_SHMEM", (int)VC_VLIMIT_SHMEM);
+
+  /* scheduler flags */
+  PyModule_AddIntConstant(mod,
+                          "VS_SCHED_CPU_GUARANTEED",
+                          VS_SCHED_CPU_GUARANTEED);
+}
diff --git a/scripts/vcached b/scripts/vcached
new file mode 100755
index 0000000..0e09c7e
--- /dev/null
+++ b/scripts/vcached
@@ -0,0 +1,106 @@
+#!/bin/bash
+#
+# vcached: VServer cache allocator
+#
+# Description: A script that preallocates vservers and stores them in
+# a cache. Preallocated vservers from the cache may be then used to
+# instantiate real vservers. Requires that /var/run/vcached.pid does
+# not exist on startup.
Should run periodically as a cron job.
+#
+# Based on work by:
+#
+# Brent Chun - bnc@intel-research.net
+# Tristan Koo - tristan.koo@intel-research.net
+# William Wung - wungism@uclink.berkeley.edu
+#
+# Mark Huang
+# Copyright (c) 2004-2005 The Trustees of Princeton University
+#
+# $Id: vcached,v 1.14 2007/07/05 19:05:14 dhozac Exp $
+#
+
+PATH=/sbin:/usr/sbin:$PATH
+
+# number of images to keep cached
+slots=32
+
+# PID file
+pidfile=/var/run/vcached.pid
+
+# log file
+logfile=/var/log/vcached.log
+
+# debug
+debug=0
+
+usage()
+{
+    echo "usage: vcached [OPTION...]"
+    echo " -s [slots] number of images to keep cached"
+    echo " -p [pidfile] PID file"
+    echo " -l [logfile] log file"
+    echo " -d debug"
+    exit 1
+}
+
+# parse options
+while getopts 's:p:l:dh' OPT ; do
+    case "$OPT" in
+        s) slots=$OPTARG ;;
+        p) pidfile=$OPTARG ;;
+        l) logfile=$OPTARG ;;
+        d) debug=1 ;;
+        h|*) usage ;;
+    esac
+done
+
+# append output to log file (quoted: -l/-p values may contain spaces)
+exec 1>>"$logfile"
+exec 2>>"$logfile"
+
+# check if we are already running
+if [ -f "$pidfile" ] && kill -0 "$(cat "$pidfile")" >/dev/null 2>&1 ; then
+    echo "vcached($(cat "$pidfile")) already running"
+    exit 1
+fi
+echo $$ > "$pidfile"
+
+# clean up lock file before exiting (single quotes: expand $pidfile when the trap runs)
+trap 'rm -f "$pidfile"' EXIT
+
+: ${UTIL_VSERVER_VARS:=/usr/lib/util-vserver/util-vserver-vars}
+test -e "$UTIL_VSERVER_VARS" || {
+    echo "Can not find util-vserver installation; aborting..."
+    exit 1
+}
+. "$UTIL_VSERVER_VARS"
+
+# make sure barrier bit is set on /vservers to prevent chroot() escapes
+setattr --barrier "$__DEFAULT_VSERVERDIR"
+
+# take out the trash
+#rm -rf "$__DEFAULT_VSERVERDIR/.vtmp"
+
+mkdir -p "$__DEFAULT_VSERVERDIR/.vcache"
+mkdir -p "$__DEFAULT_VSERVERDIR/.vtmp"
+
+[ $debug -ne 0 ] && echo "$(date) Checking the cache"
+for i in $(seq 0 $(($slots - 1))) ; do
+    if [ ! -d "$__DEFAULT_VSERVERDIR/.vcache/v$i" ] ; then
+        echo "$(date) Caching v$i"
+        # build image in .vtmp; never clone if mktemp failed ("$TMP"/ would be "/")
+        TMP=$(mktemp -d "$__DEFAULT_VSERVERDIR/.vtmp/v$i.XXXXXX") &&
+            "$_VCLONE" "$__DEFAULT_VSERVERDIR/.vref/default/" "$TMP"/
+        RETVAL=$?
+        # move it to .vcache when complete
+        if [ $RETVAL -eq 0 ] ; then
+            mv "$TMP" "$__DEFAULT_VSERVERDIR/.vcache/v$i"
+            echo "$(date) v$i ready"
+        else
+            echo "$(date) Error $RETVAL building v$i"
+            rm -rf "$TMP"
+        fi
+    fi
+done
+
+exit 0
diff --git a/scripts/vcached.cron b/scripts/vcached.cron
new file mode 100644
index 0000000..3a88e9d
--- /dev/null
+++ b/scripts/vcached.cron
@@ -0,0 +1,10 @@
+#
+# vcached: VServer cache allocator
+#
+# Mark Huang
+# Copyright (c) 2004-2005 The Trustees of Princeton University
+#
+# $Id: vcached.cron,v 1.2 2005/09/01 18:52:53 mlhuang Exp $
+#
+
+*/15 * * * * root /usr/sbin/vcached
diff --git a/scripts/vcached.logrotate b/scripts/vcached.logrotate
new file mode 100644
index 0000000..203baba
--- /dev/null
+++ b/scripts/vcached.logrotate
@@ -0,0 +1,10 @@
+/var/log/vcached.log {
+    compress
+    daily
+    notifempty
+    rotate 5
+    missingok
+    postrotate
+        kill -HUP `cat /var/run/vcached.pid 2>/dev/null` 2>/dev/null || true
+    endscript
+}
diff --git a/scripts/vuseradd b/scripts/vuseradd
new file mode 100755
index 0000000..aa1210b
--- /dev/null
+++ b/scripts/vuseradd
@@ -0,0 +1,172 @@
+#!/bin/bash
+#
+# useradd(8) wrapper for vservers
+#
+# Mark Huang
+# Copyright (C) 2004-2006 The Trustees of Princeton University
+#
+# $Id: vuseradd,v 1.28 2007/07/05 19:05:14 dhozac Exp $
+#
+
+: ${UTIL_VSERVER_VARS:=/usr/lib/util-vserver/util-vserver-vars}
+test -e "$UTIL_VSERVER_VARS" || {
+    echo "Can not find util-vserver installation; aborting..."
+    exit 1
+}
+. "$UTIL_VSERVER_VARS"
+
+shopt -s nullglob
+
+# Defaults
+TYPE="default"
+
+usage()
+{
+    TYPES=
+    pushd "$__DEFAULT_VSERVERDIR/.vref" >/dev/null
+    for ref in * ; do
+        if [ -z "$TYPES" ] ; then
+            TYPES=$ref
+        else
+            TYPES="$TYPES, $ref"
+        fi
+    done
+    popd >/dev/null
+
+    echo "Usage: vuseradd [OPTION]... [NAME]"
+    echo " -t Reference image type ($TYPES)"
+    exit 1
+}
+
+# Get options
+while getopts "t:" opt ; do
+    case $opt in
+        t)
+            TYPE="$OPTARG"
+            ;;
+        *)
+            usage
+            ;;
+    esac
+done
+shift $(($OPTIND - 1))
+
+# Get slice name
+[ -z "$1" ] && usage
+NAME=$1
+
+# Add slices group to /etc/group if not already present
+groupadd slices 2>/dev/null || :
+
+# Add slice name to /etc/passwd
+useradd -g slices -s /bin/vsh "$NAME" -p '*'
+
+USERID=`id -u "$NAME"`
+GROUPID=`id -g "$NAME"`
+GROUPNAME=`id -gn "$NAME"`
+
+# Create /etc/vservers configuration files
+if [ ! -d "$__CONFDIR/$NAME" ] ; then
+    # Move away the guest contents for now
+    if [ -d "$__DEFAULT_VSERVERDIR/$NAME" ] ; then
+        mkdir -p "$__DEFAULT_VSERVERDIR/.vtmp"
+        TMP=$(mktemp -d "$__DEFAULT_VSERVERDIR/.vtmp/$NAME.XXXXXX") || exit 1
+        mv "$__DEFAULT_VSERVERDIR/$NAME" "$TMP"
+        HAS_VSERVERDIR=1
+    else
+        HAS_VSERVERDIR=0
+    fi
+
+    $_VSERVER $NAME build -m skeleton --context $USERID \
+        --interface nodev:0.0.0.0/0 \
+        --flags persistent,~info_init,sched_hard
+    RETVAL=$?
+    DIR=$__CONFDIR/$NAME
+    if [ $RETVAL -ne 0 ] ; then
+        echo "Error $RETVAL building $DIR"
+        rm -rf "$DIR" "$__DEFAULT_VSERVERDIR/$NAME"
+        # put a pre-existing guest back, then stop instead of writing a partial config
+        if [ "$HAS_VSERVERDIR" = 1 ] ; then
+            mv "$TMP/$NAME" "$__DEFAULT_VSERVERDIR/$NAME"
+            rm -rf "$TMP"
+        fi
+        exit $RETVAL
+    fi
+    mkdir -p $DIR/apps/init $DIR/rlimits $DIR/sched $DIR/dlimits/0
+    echo default > $DIR/apps/init/mark
+    echo 1000 > $DIR/rlimits/nproc
+
+    # Set persistent for the network context
+    echo persistent > $DIR/nflags
+
+    # Set up the scheduler
+    echo 1000 > $DIR/sched/interval
+    echo 1000 > $DIR/sched/interval2
+    echo 0 > $DIR/sched/fill-rate
+    echo 32 > $DIR/sched/fill-rate2
+    touch $DIR/sched/idle-time
+    echo 100 > $DIR/sched/tokens
+    echo 50 > $DIR/sched/tokens-min
+    echo 100 > $DIR/sched/tokens-max
+
+    # Set up disk limits (unlimited)
+    echo `$_READLINK $DIR/vdir` > $DIR/dlimits/0/directory
+    echo 2 > $DIR/dlimits/0/reserved
+    echo -1 > $DIR/dlimits/0/inodes_total
+    echo -1 > $DIR/dlimits/0/space_total
+
+    # Remove the basically empty guest directory
+    rm -rf "$__DEFAULT_VSERVERDIR/$NAME"
+    # Move the guest back
+    if [ "$HAS_VSERVERDIR" = 1 ] ; then
+        mv "$TMP/$NAME" "$__DEFAULT_VSERVERDIR/$NAME"
+        rm -rf "$TMP"
+    fi
+fi
+
+if [ ! -d "$__DEFAULT_VSERVERDIR/$NAME" ] ; then
+    # Check the cache
+    if [ "$TYPE" = "default" ] ; then
+        for i in "$__DEFAULT_VSERVERDIR/.vcache/"* ; do
+            [ -d "$i" ] && mv "$i" "$__DEFAULT_VSERVERDIR/$NAME" && break
+        done
+    fi
+
+    # Build slice from reference image
+    if [ ! -d "$__DEFAULT_VSERVERDIR/$NAME" ] ; then
+        REF="$__DEFAULT_VSERVERDIR/.vref/$TYPE"
+
+        # Build in temporary directory; a failed mktemp takes the error path below
+        mkdir -p "$__DEFAULT_VSERVERDIR/.vtmp"
+        TMP=$(mktemp -d "$__DEFAULT_VSERVERDIR/.vtmp/$NAME.XXXXXX") &&
+            "$_VCLONE" "$REF"/ "$TMP"/
+        RETVAL=$?
+
+        # Move it to its permanent location when complete
+        if [ $RETVAL -eq 0 ] ; then
+            mv "$TMP" "$__DEFAULT_VSERVERDIR/$NAME"
+        else
+            echo "Error $RETVAL building $__DEFAULT_VSERVERDIR/$NAME"
+            rm -rf "$TMP" "$__CONFDIR/$NAME" "$__PKGSTATEDIR/$NAME.ctx"
+            userdel -r "$NAME"
+            exit $RETVAL
+        fi
+    fi
+fi
+
+if [ -d "$__DEFAULT_VSERVERDIR/$NAME" ] ; then
+    # Fix permissions
+    chmod 755 "$__DEFAULT_VSERVERDIR/$NAME"
+
+    # Add user in vserver
+    $_VSERVER ----insecure $NAME suexec root sh -c \
+        "groupadd -g $GROUPID $GROUPNAME ; useradd -u $USERID -g $GROUPID -p '' $NAME"
+
+    # Add an unrestricted entry to /etc/sudoers file
+    if [ -f "$__DEFAULT_VSERVERDIR/$NAME/etc/sudoers" ] && \
+        ! grep -q "^$NAME" "$__DEFAULT_VSERVERDIR/$NAME/etc/sudoers" ; then
+        echo "$NAME ALL=(ALL) ALL" >> "$__DEFAULT_VSERVERDIR/$NAME/etc/sudoers"
+    fi
+fi
+
+exit 0
diff --git a/scripts/vuserdel b/scripts/vuserdel
new file mode 100755
index 0000000..778a0fa
--- /dev/null
+++ b/scripts/vuserdel
@@ -0,0 +1,60 @@
+#!/bin/bash
+#
+# userdel(8) wrapper for vservers
+#
+# Copyright (c) 2004 The Trustees of Princeton University (Trustees).
+#
+# $Id: vuserdel,v 1.11 2007/06/29 14:13:01 dhozac Exp $
+#
+
+: ${UTIL_VSERVER_VARS:=/usr/lib/util-vserver/util-vserver-vars}
+test -e "$UTIL_VSERVER_VARS" || {
+    echo "Can not find util-vserver installation; aborting..."
+    exit 1
+}
+. "$UTIL_VSERVER_VARS"
+
+usage()
+{
+    echo "usage: $0 name"
+    exit 1
+}
+
+# Consume the optional --static flag first, then insist on a slice name:
+# with an empty NAME the rm -rf/mv calls below would act on the parent dirs.
+[ "$1" == "--static" ] && { STATIC=yes; shift; }
+[ -z "$1" ] && usage
+NAME=$1
+
+# read config file to get context ID
+CTX=`cat "$__CONFDIR/$NAME/context"`
+
+# don't bother stopping gracefully, just kill all the processes
+chcontext --silent --secure --ctx $CTX $__LEGACYDIR/vserverkillall
+
+# unmount any directories in vserver that are mount points
+for d in `sed -ne "s%^[^ ]* \($__DEFAULT_VSERVERDIR/$NAME/[^ ]*\) .*%\1%p" /proc/mounts`
+do
+    # use echo -e to turn escaped whitespace back into regular chars
+    # be careful about embedded backquotes here (i think we're safe)
+    dir=`echo -e "$d"`
+    echo "unmounting $dir"
+    umount -l "$dir"
+done
+
+# delete user
+[ -z "$STATIC" ] && userdel -r "$NAME"
+
+# remove vserver configuration directory (same location the context ID was read from)
+rm -rf "$__CONFDIR/$NAME"
+
+# remove vserver profile (same path vuseradd cleans up)
+rm -f "$__PKGSTATEDIR/$NAME.ctx"
+
+# destroy vserver
+if [ -d "$__DEFAULT_VSERVERDIR/$NAME" ] ; then
+    mkdir -p "$__DEFAULT_VSERVERDIR/.vtmp"
+    TMP=$(mktemp -d "$__DEFAULT_VSERVERDIR/.vtmp/$NAME.XXXXXX") || exit 1
+    mv "$__DEFAULT_VSERVERDIR/$NAME" "$TMP"
+    rm -rf "$TMP"
+fi
diff --git a/src/planetlab.c b/src/planetlab.c
new file mode 100644
index 0000000..293d0a7
--- /dev/null
+++ b/src/planetlab.c
@@ -0,0 +1,362 @@
+/* Copyright 2005 Princeton University
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PRINCETON
+UNIVERSITY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+#include <stdint.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/resource.h>
+
+#include "vserver.h"
+#include "planetlab.h"
+
+static int
+create_context(xid_t ctx, uint64_t bcaps)  /* 0 on success, -1 on failure; exits if pl_setsched fails */
+{
+  struct vc_ctx_caps vc_caps;
+  struct vc_net_flags vc_nf;
+
+  /* Create network context */
+  if (vc_net_create(ctx) == VC_NOCTX) {
+    if (errno == EEXIST)
+      goto process;
+    return -1;
+  }
+
+  /* Make the network context persistent */
+  vc_nf.mask = vc_nf.flagword = VC_NXF_PERSISTENT;
+  if (vc_set_nflags(ctx, &vc_nf))
+    return -1;
+
+process:
+  /*
+   * Create context info - this sets the STATE_SETUP and STATE_INIT flags.
+   */
+  if (vc_ctx_create(ctx, 0) == VC_NOCTX)
+    return -1;
+
+  /* Set capabilities - these don't take effect until SETUP flag is unset */
+  vc_caps.bcaps = bcaps;
+  vc_caps.bmask = ~0ULL;  /* currently unused */
+  vc_caps.ccaps = 0;      /* don't want any of these */
+  vc_caps.cmask = ~0ULL;
+  if (vc_set_ccaps(ctx, &vc_caps))
+    return -1;
+
+  if (pl_setsched(ctx, 1, 0) < 0) {
+    PERROR("pl_setsched(%u)", ctx);
+    exit(1);
+  }
+
+  return 0;
+}
+
+int
+pl_setup_done(xid_t ctx)  /* 0 on success, -1 if vc_set_cflags fails */
+{
+  struct vc_ctx_flags vc_flags;
+
+  /* unset SETUP flag - this allows other processes to migrate */
+  /* set the PERSISTENT flag - so the context doesn't vanish */
+  /* Don't clear the STATE_INIT flag, as that would make us the init task. */
+  vc_flags.mask = VC_VXF_STATE_SETUP|VC_VXF_PERSISTENT;
+  vc_flags.flagword = VC_VXF_PERSISTENT;
+  if (vc_set_cflags(ctx, &vc_flags))
+    return -1;
+
+  return 0;
+}
+
+#define RETRY_LIMIT 10  /* one-second polls of the SETUP flag before giving up with EBUSY */
+
+int
+pl_chcontext(xid_t ctx, uint64_t bcaps, const struct sliver_resources *slr)  /* 1 = created, 0 = migrated, -1 = error */
+{
+  int retry_count = 0;
+  int net_migrated = 0;
+
+  pl_set_ulimits(slr);
+
+  for (;;)
+    {
+      struct vc_ctx_flags vc_flags;
+
+      if (vc_get_cflags(ctx, &vc_flags))
+        {
+          if (errno != ESRCH)
+            return -1;
+
+          /* context doesn't exist - create it */
+          if (create_context(ctx, bcaps))
+            {
+              if (errno == EEXIST)
+                /* another process beat us in a race */
+                goto migrate;
+              if (errno == EBUSY)
+                /* another process is creating - poll the SETUP flag */
+                continue;
+              return -1;
+            }
+
+          /* created context and migrated to it i.e., we're done */
+          return 1;
+        }
+
+      /* check the SETUP flag */
+      if (vc_flags.flagword & VC_VXF_STATE_SETUP)
+        {
+          /* context is still being setup - wait a while then retry */
+          if (retry_count++ >= RETRY_LIMIT)
+            {
+              errno = EBUSY;
+              return -1;
+            }
+          sleep(1);
+          continue;
+        }
+
+      /* context has been setup */
+    migrate:
+      if (net_migrated || !vc_net_migrate(ctx))
+        {
+          if (!vc_ctx_migrate(ctx, 0))
+            break;  /* done */
+          net_migrated = 1;
+        }
+
+      /* context disappeared - retry */
+    }
+
+  return 0;
+}
+
+/* it's okay for a syscall to fail because the context doesn't exist */
+#define VC_SYSCALL(x) \
+do \
+{ \
+  if (x) \
+    return errno == ESRCH ? 0 : -1; \
+} \
+while (0)
+
+int
+pl_setsched(xid_t ctx, uint32_t cpu_share, uint32_t cpu_sched_flags)  /* 0 on success or missing context, -1 on error */
+{
+  struct vc_set_sched vc_sched;
+  struct vc_ctx_flags vc_flags;
+  uint32_t new_flags;
+
+  vc_sched.set_mask = (VC_VXSM_FILL_RATE | VC_VXSM_INTERVAL | VC_VXSM_TOKENS |
+                       VC_VXSM_TOKENS_MIN | VC_VXSM_TOKENS_MAX | VC_VXSM_MSEC |
+                       VC_VXSM_FILL_RATE2 | VC_VXSM_INTERVAL2 | VC_VXSM_FORCE |
+                       VC_VXSM_IDLE_TIME);
+  vc_sched.fill_rate = 0;
+  vc_sched.fill_rate2 = cpu_share; /* tokens accumulated per interval */
+  vc_sched.interval = vc_sched.interval2 = 1000; /* milliseconds */
+  vc_sched.tokens = 100; /* initial allocation of tokens */
+  vc_sched.tokens_min = 50; /* need this many tokens to run */
+  vc_sched.tokens_max = 100; /* max accumulated number of tokens */
+
+  if (cpu_share == (uint32_t)VC_LIM_KEEP)
+    vc_sched.set_mask &= ~(VC_VXSM_FILL_RATE|VC_VXSM_FILL_RATE2);
+
+  /* guaranteed CPU corresponds to SCHED_SHARE flag being cleared */
+  if (cpu_sched_flags & VS_SCHED_CPU_GUARANTEED) {
+    new_flags = 0;
+    vc_sched.fill_rate = vc_sched.fill_rate2;
+  }
+  else
+    new_flags = VC_VXF_SCHED_SHARE;
+
+  VC_SYSCALL(vc_set_sched(ctx, &vc_sched));
+
+  vc_flags.mask = VC_VXF_SCHED_FLAGS;
+  vc_flags.flagword = new_flags | VC_VXF_SCHED_HARD;
+  VC_SYSCALL(vc_set_cflags(ctx, &vc_flags));
+
+  return 0;
+}
+
+struct pl_resources {
+  char *name;                 /* file name relative to the slice's config directory */
+  unsigned long long *limit;  /* where the parsed value is stored */
+};
+
+#define WHITESPACE(buffer,index,len) \
+  while(isspace((int)buffer[index])) \
+    if (index < len) index++; else goto out;
+
+#define VSERVERCONF "/etc/vservers/"
+void
+pl_get_limits(const char *context, struct sliver_resources *slr)  /* load limits from VSERVERCONF<context>/ into *slr */
+{
+  FILE *fb;
+  int cwd;
+  size_t len = strlen(VSERVERCONF) + strlen(context) + NULLBYTE_SIZE;
+  char *conf = (char *)malloc(len + strlen("rlimits/openfd.hard"));
+  struct pl_resources *r;
+  struct pl_resources sliver_list[] = {
+    {"sched/fill-rate2", &slr->vs_cpu},
+
+    {"rlimits/nproc.hard", &slr->vs_nproc.hard},
+    {"rlimits/nproc.soft", &slr->vs_nproc.soft},
+    {"rlimits/nproc.min", &slr->vs_nproc.min},
+
+    {"rlimits/rss.hard", &slr->vs_rss.hard},
+    {"rlimits/rss.soft", &slr->vs_rss.soft},
+    {"rlimits/rss.min", &slr->vs_rss.min},
+
+    {"rlimits/as.hard", &slr->vs_as.hard},
+    {"rlimits/as.soft", &slr->vs_as.soft},
+    {"rlimits/as.min", &slr->vs_as.min},
+
+    {"rlimits/openfd.hard", &slr->vs_openfd.hard},
+    {"rlimits/openfd.soft", &slr->vs_openfd.soft},
+    {"rlimits/openfd.min", &slr->vs_openfd.min},
+
+    {0,0}
+  };
+  if (conf != NULL)  /* malloc failure is reported at the chdir check below */
+    sprintf(conf, "%s%s", VSERVERCONF, context);
+
+  slr->vs_rss.hard = VC_LIM_KEEP;
+  slr->vs_rss.soft = VC_LIM_KEEP;
+  slr->vs_rss.min = VC_LIM_KEEP;
+
+  slr->vs_as.hard = VC_LIM_KEEP;
+  slr->vs_as.soft = VC_LIM_KEEP;
+  slr->vs_as.min = VC_LIM_KEEP;
+
+  slr->vs_nproc.hard = VC_LIM_KEEP;
+  slr->vs_nproc.soft = VC_LIM_KEEP;
+  slr->vs_nproc.min = VC_LIM_KEEP;
+
+  slr->vs_openfd.hard = VC_LIM_KEEP;
+  slr->vs_openfd.soft = VC_LIM_KEEP;
+  slr->vs_openfd.min = VC_LIM_KEEP;
+
+  cwd = open(".", O_RDONLY);  /* remembered so the working directory can be restored */
+  if (cwd == -1) {
+    perror("cannot get a handle on .");
+    goto out;
+  }
+  if (conf == NULL || chdir(conf) == -1) {
+    fprintf(stderr, "cannot chdir to ");
+    perror(conf);
+    goto out_fd;
+  }
+
+  for (r = &sliver_list[0]; r->name; r++) {
+    char buf[1000];
+    fb = fopen(r->name, "r");
+    if (fb == NULL)
+      continue;  /* a missing file leaves the default in place */
+    if (fgets(buf, sizeof(buf), fb) != NULL && isdigit(*buf))
+      *r->limit = strtoull(buf, NULL, 10);  /* target is unsigned long long; atoi() would truncate to int */
+    fclose(fb);
+  }
+
+  fchdir(cwd);
+out_fd:
+  close(cwd);
+out:
+  free(conf);
+}
+
+int
+adjust_lim(const struct vc_rlimit *vcr, struct rlimit *lim)  /* returns 1 if *lim was changed, else 0 */
+{
+  int adjusted = 0;
+  if (vcr->min != VC_LIM_KEEP) {
+    if (vcr->min > lim->rlim_cur) {
+      lim->rlim_cur = vcr->min;
+      adjusted = 1;
+    }
+    if (vcr->min > lim->rlim_max) {
+      lim->rlim_max = vcr->min;
+      adjusted = 1;
+    }
+  }
+
+  if (vcr->soft != VC_LIM_KEEP) {
+    switch (vcr->min != VC_LIM_KEEP) {
+    case 1:
+      if (vcr->soft < vcr->min)
+        break;
+    case 0:  /* deliberate fall-through from case 1 when soft >= min */
+      lim->rlim_cur = vcr->soft;
+      adjusted = 1;
+    }
+  }
+
+  if (vcr->hard != VC_LIM_KEEP &&
+      (vcr->min == VC_LIM_KEEP || vcr->hard >= vcr->min)) {
+    /* the hard limit is the ceiling (rlim_max); it used to be written
+     * into rlim_cur, which left rlim_max untouched */
+    lim->rlim_max = vcr->hard;
+    /* setrlimit() rejects rlim_cur > rlim_max with EINVAL, so clamp */
+    if (lim->rlim_cur > lim->rlim_max)
+      lim->rlim_cur = lim->rlim_max;
+    adjusted = 1;
+  }
+  return adjusted;
+}
+
+static inline void
+set_one_ulimit(int resource, const struct vc_rlimit *limit)  /* best effort: failures are ignored */
+{
+  struct rlimit lim;
+  if (getrlimit(resource, &lim) == 0 &&
+      adjust_lim(limit, &lim))
+    setrlimit(resource, &lim);
+}
+
+void
+pl_set_ulimits(const struct sliver_resources *slr)  /* no-op when slr is NULL */
+{
+  if (!slr)
+    return;
+
+  set_one_ulimit(RLIMIT_RSS, &slr->vs_rss);
+  set_one_ulimit(RLIMIT_AS, &slr->vs_as);
+  set_one_ulimit(RLIMIT_NPROC, &slr->vs_nproc);
+  set_one_ulimit(RLIMIT_NOFILE, &slr->vs_openfd);
+}
diff --git a/src/planetlab.h b/src/planetlab.h
new file mode 100644
index 0000000..78a6a9c
--- /dev/null
+++ b/src/planetlab.h
@@ -0,0 +1,88 @@
+/* Copyright 2005 Princeton University
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL PRINCETON
+UNIVERSITY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _LIB_PLANETLAB_H_
+#define _LIB_PLANETLAB_H_
+
+#ifndef VC_VXF_SCHED_SHARE
+# define VC_VXF_SCHED_SHARE 0x00000800ull
+#endif
+
+#define VC_VXF_SCHED_FLAGS (VC_VXF_SCHED_HARD | VC_VXF_SCHED_SHARE)
+
+struct sliver_resources {
+  unsigned long long vs_cpu;
+  struct vc_rlimit vs_rss;
+  struct vc_rlimit vs_as;
+  struct vc_rlimit vs_nproc;
+  struct vc_rlimit vs_openfd;
+};
+
+int adjust_lim(const struct vc_rlimit *vcr, struct rlimit *lim);
+
+int
+pl_chcontext(xid_t ctx, uint64_t bcaps, const struct sliver_resources *slr);
+
+int
+pl_setup_done(xid_t ctx);
+
+int
+pl_setsched(xid_t ctx, uint32_t cpu_share, uint32_t cpu_sched_flags);
+
+/* scheduler flags */
+#define VS_SCHED_CPU_GUARANTEED 1
+
+/* Null byte made explicit */
+#define NULLBYTE_SIZE 1
+
+void pl_get_limits(const char *, struct sliver_resources *);
+void pl_set_ulimits(const struct sliver_resources *);
+
+static inline int
+_PERROR(const char *format, const char *file, int line, int _errno, ...)  /* prints "file:line: msg[: strerror (n)]", returns _errno */
+{
+  va_list ap;
+
+  va_start(ap, _errno);
+  fprintf(stderr, "%s:%d: ", file, line);
+  vfprintf(stderr, format, ap);
+  va_end(ap);  /* every va_start needs a matching va_end */
+  if (_errno)
+    fprintf(stderr, ": %s (%d)", strerror(_errno), _errno);
+  fputs("\n", stderr);
+  fflush(stderr);
+  return _errno;
+}
+
+#define PERROR(format, args...) _PERROR(format, __FILE__, __LINE__, errno, ## args)
+#endif
diff --git a/src/vip6-autod.c b/src/vip6-autod.c
new file mode 100644
index 0000000..feade59
--- /dev/null
+++ b/src/vip6-autod.c
@@ -0,0 +1,659 @@
+/*
+ * $Id$
+ * Copyright (c) 2007 The Trustees of Princeton University
+ * Author: Daniel Hokka Zakrisson
+ *
+ * Licensed under the terms of the GNU General Public License
+ * version 2 or later.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#define HAS_ADDRESS 0x01
+#define HAS_PREFIX 0x02
+
+struct nid_list {
+	nid_t nid;
+	struct nid_list *next;
+};
+struct prefix {
+	uint32_t mask;
+	int ifindex;
+	struct {
+		struct in6_addr addr;
+		int prefix_len;
+		time_t valid_until;
+	} prefix;
+	struct {
+		struct in6_addr addr;
+		int prefix_len;
+		time_t valid_until;
+	} address;
+};
+struct nid_prefix_map {
+	struct {
+		struct nid_prefix_map *prev;
+		struct nid_prefix_map *next;
+	} n;
+	struct {
+		struct nid_prefix_map *prev;
+		struct nid_prefix_map *next;
+	} p;
+	struct prefix *prefix;
+	nid_t nid;
+};
+
+struct nl_handle *handle;
+
+/* from linux/include/net/ipv6.h */
+static inline int ipv6_prefix_equal(struct in6_addr *prefix,
+				    struct in6_addr *addr, int prefixlen)
+{
+	uint32_t *a1 = prefix->s6_addr32, *a2 = addr->s6_addr32;
+	unsigned pdw, pbi;
+
+	/* check complete u32 in prefix */
+	pdw = prefixlen >> 5;
+	if (pdw && memcmp(a1, a2, pdw << 2))
+		return 0;
+
+	/* check incomplete u32 in prefix */
+	pbi = prefixlen & 0x1f;
+	if (pbi && ((a1[pdw] ^ a2[pdw]) & htonl((0xffffffff) << (32 - pbi))))
+		return 0;
+
+	return 1;
+}
+
+/* Add address/prefix to interface ifindex; 0 on success or EEXIST, -1 otherwise. */
+static int add_address_to_interface(int ifindex, struct in6_addr *address,
+				    int prefix)
+{
+	int err = -1;
+	struct rtnl_addr *rta;
+	struct nl_addr *nl;
+
+	nl = nl_addr_build(AF_INET6, address, sizeof(struct in6_addr));
+	if (!nl)
+		return -1;
+	rta = rtnl_addr_alloc();
+	if (!rta) {
+		nl_addr_destroy(nl);
+		return -1;
+	}
+
+	rtnl_addr_set_family(rta, AF_INET6);
+	rtnl_addr_set_ifindex(rta, ifindex);
+	rtnl_addr_set_local(rta, nl);
+	rtnl_addr_set_prefixlen(rta, prefix);
+
+	if (rtnl_addr_add(handle, rta, NLM_F_REPLACE) != -1 || errno == EEXIST)
+		err = 0;
+
+	rtnl_addr_free(rta);
+	nl_addr_destroy(nl);
+	return err;
+}
+
+/* Delete the address derived from entry (prefix address plus nid bits) from its interface. */
+static inline int remove_address_from_interface(struct nid_prefix_map *entry)
+{
+	struct rtnl_addr *rta;
+	struct nl_addr *nl;
+	struct in6_addr a;
+	int ret;
+
+	memcpy(&a, &entry->prefix->address.addr, sizeof(a));
+	if (entry->nid != 0) {
+		a.s6_addr[11] = (entry->nid & 0x7f80) >> 7;
+		a.s6_addr[12] = (entry->nid & 0x7f) << 1;
+	}
+
+	nl = nl_addr_build(AF_INET6, &a, sizeof(a));
+	if (!nl)
+		return -1;
+	rta = rtnl_addr_alloc();
+	if (!rta) {
+		nl_addr_destroy(nl);	/* do not leak nl on this path */
+		return -1;
+	}
+
+	rtnl_addr_set_family(rta, AF_INET6);
+	rtnl_addr_set_ifindex(rta, entry->prefix->ifindex);
+	rtnl_addr_set_local(rta, nl);
+	rtnl_addr_set_prefixlen(rta, entry->prefix->address.prefix_len);
+
+	ret = rtnl_addr_delete(handle, rta, 0);
+
+	rtnl_addr_free(rta);
+	nl_addr_destroy(nl);
+
+	return ret;
+}
+
+/* Link new into both lists of map; returns 0 if an equal (prefix, nid) entry exists, 1 otherwise. */
+static int add_to_map(struct nid_prefix_map *map, struct nid_prefix_map *new)
+{
+	struct nid_prefix_map *i;
+#define PUT_IT_IN_PLACE(node, member, om) \
+	/* find the correct location in the list (i is NULL for an empty list) */ \
+	for (i = map->node.next; i && i->node.next && i->member < \
+	     new->member; i = i->node.next) \
+		; \
+	if (i && i->member == new->member && i->om == new->om) \
+		return 0; \
+	/* first in the list */ \
+	if (!i || !i->node.prev) { \
+		new->node.prev = NULL; \
+		new->node.next = i; \
+		map->node.next = new; \
+		if (i) \
+			i->node.prev = new; \
+	} \
+	/* last in the list */ \
+	else if (i->node.next == NULL) { \
+		new->node.prev = i; \
+		new->node.next = NULL; \
+		i->node.next = new; \
+	} \
+	/* somewhere in the middle */ \
+	else { \
+		new->node.prev = i->node.prev; \
+		new->node.next = i; \
+		i->node.prev->node.next = new; \
+		i->node.prev = new; \
+	}
+	PUT_IT_IN_PLACE(p, prefix, nid)
+	PUT_IT_IN_PLACE(n, nid, prefix)
+	return 1;
+}
+
+static inline void remove_from_map(struct nid_prefix_map *map,
+				   struct nid_prefix_map *entry)
+{
+	/* nid-ordered list: splice the neighbours so none keeps a pointer to entry */
+	if (entry->n.prev)
+		entry->n.prev->n.next = entry->n.next;
+	else if (map->n.next == entry)
+		map->n.next = entry->n.next;
+	if (entry->n.next)
+		entry->n.next->n.prev = entry->n.prev;
+	if (map->n.prev == entry)
+		map->n.prev = entry->n.prev;
+	/* prefix-ordered list, same procedure */
+	if (entry->p.prev)
+		entry->p.prev->p.next = entry->p.next;
+	else if (map->p.next == entry)
+		map->p.next = entry->p.next;
+	if (entry->p.next)
+		entry->p.next->p.prev = entry->p.prev;
+	if (map->p.prev == entry)
+		map->p.prev = entry->p.prev;
+}
+
+static inline void remove_from_map_and_free(struct nid_prefix_map *map,
+					    struct nid_prefix_map *entry)
+{
+	remove_from_map(map, entry);
+	free(entry);
+}
+
+static int add_nid_to_map(struct nid_prefix_map *map, struct prefix *prefix,
+			  nid_t nid)
+{
+	struct nid_prefix_map *new = calloc(1, sizeof(struct nid_prefix_map));
+	int ret;
+
+	if (!new)
+		return -1;
+
+	new->prefix = prefix;
+	new->nid = nid;
+	ret = add_to_map(map, new);
+
+	if (ret == 0)
+		free(new);
+
+	return ret;
+}
+
+static int add_prefix_to_map(struct nid_prefix_map *map, struct prefix *prefix)
+{
+	return add_nid_to_map(map, prefix, 0);
+}
+
+/* Remove first and the run of p-list entries after it that share its prefix. */
+static void cleanup_prefix(struct nid_prefix_map *map,
+			   struct nid_prefix_map *first)
+{
+	struct nid_prefix_map *i, *p = NULL;
+	struct prefix *target = first ? first->prefix : NULL;	/* first is freed inside the loop */
+
+	for (i = first; i && i->prefix == target; i = i->p.next) {
+		if (p)
+			remove_from_map_and_free(map, p);
+
+		/* ignore errors */
+		remove_address_from_interface(i);
+
+		p = i;
+	}
+	if (p)
+		remove_from_map_and_free(map, p);
+}
+
+/* Sorted insert of nid; returns 1 if added, 0 if already present, -1 on allocation failure. */
+static inline int add_nid_to_list(struct nid_list **head, nid_t nid)
+{
+	struct nid_list *i, *new;
+
+	for (i = *head; i && i->next && i->next->nid < nid; i = i->next)
+		;
+	/* check if this nid is first in the list */
+	if (i && i->nid == nid)
+		return 0;
+	/* check if it's already in the list */
+	if (i && i->next && i->next->nid == nid)
+		return 0;
+
+	/* add it */
+	new = calloc(1, sizeof(struct nid_list));
+	if (!new)
+		return -1;
+	new->nid = nid;
+
+	/* empty list, or this is the lowest nid: becomes the new head */
+	if (i == *head && (!i || nid < i->nid)) {
+		*head = new;
+		new->next = i;
+	}
+	/* in the middle/at the end */
+	else if (i) {
+		new->next = i->next;
+		i->next = new;
+	}
+	/* there was no list */
+	else
+		*head = new;
+
+	return 1;
+}
+
+static inline void free_nid_list(struct nid_list *head)
+{
+	struct nid_list *p;
+	for (p = NULL; head; head = head->next) {
+		if (p)
+			free(p);
+		p = head;
+	}
+	if (p)
+		free(p);
+}
+
+/* Remove every map entry (and its interface address) belonging to nid. */
+static inline void cleanup_nid(struct nid_prefix_map *map,
+			       nid_t nid)
+{
+	struct nid_prefix_map *i, *p = NULL;
+	for (i = map->n.next; i && i->nid < nid; i = i->n.next)
+		;
+	/* this nid doesn't have any entries in the map */
+	if (!i || i->nid != nid)
+		return;
+	for (; i && i->nid == nid; i = i->n.next) {
+		if (p)
+			remove_from_map_and_free(map, p);
+		remove_address_from_interface(i);
+		p = i;
+	}
+	if (p)
+		remove_from_map_and_free(map, p);
+}
+
+/* Free the previous nid list, cleaning up every nid that is absent from current. */
+static inline void cleanup_nids(struct nid_prefix_map *map,
+				struct nid_list *previous,
+				struct nid_list *current)
+{
+	struct nid_list *p, *pprev = NULL, *c;
+	for (p = previous, c = current; p; pprev = p, p = p->next) {
+		if (pprev)
+			free(pprev);
+		while (c && c->nid < p->nid)
+			c = c->next;
+		if (c && c->nid == p->nid)
+			continue;
+		/* this context has disappeared */
+		cleanup_nid(map, p->nid);
+	}
+	if (pprev)
+		free(pprev);
+}
+
+/* Scan /proc/virtnet and (re)assign a per-nid IPv6 address for each complete prefix in map. */
+static void do_slices_autoconf(struct nid_prefix_map *map)
+{
+	DIR *dp;
+	struct dirent *de;
+	struct vc_net_addr addr;
+	struct nid_prefix_map *i;
+	struct nid_list *current = NULL, *n;
+	static struct nid_list *previous = NULL;
+
+	if ((dp = opendir("/proc/virtnet")) == NULL)
+		return;
+	while ((de = readdir(dp)) != NULL) {
+		nid_t nid;
+
+		if (!isdigit(de->d_name[0]))
+			continue;
+
+		nid = strtoul(de->d_name, NULL, 10);
+		addr.vna_type = VC_NXA_TYPE_IPV6 | VC_NXA_TYPE_ANY;
+		if (vc_net_remove(nid, &addr) == -1) {
+			syslog(LOG_ERR, "vc_net_remove(%u): %s", nid, strerror(errno));
+			continue;
+		}
+
+		add_nid_to_list(&current, nid);
+	}
+	closedir(dp);
+
+	for (n = current; n; n = n->next) {
+		for (i = map->p.next; i && i->nid == 0;) {
+			/* expired */
+			if (i->prefix->mask & HAS_PREFIX && i->prefix->prefix.valid_until < time(NULL)) {
+				struct nid_prefix_map *tmp;
+				char buf[64];
+
+				inet_ntop(AF_INET6, &i->prefix->address.addr, buf, sizeof(buf));
+				syslog(LOG_NOTICE, "Address %s timed out", buf);
+
+				/* cleanup_prefix() also frees the following entries that share
+				 * this prefix, so resume after that run rather than at i->p.next */
+				for (tmp = i->p.next; tmp && tmp->prefix == i->prefix; tmp = tmp->p.next)
+					;
+
+				cleanup_prefix(map, i);
+
+				i = tmp;
+				continue;
+			}
+			if (i->prefix->mask != (HAS_ADDRESS|HAS_PREFIX))
+				goto next;
+
+			addr.vna_type = VC_NXA_TYPE_IPV6 | VC_NXA_TYPE_ADDR;
+			memcpy(&addr.vna_v6_ip, &i->prefix->address.addr, sizeof(struct in6_addr));
+			addr.vna_prefix = i->prefix->prefix.prefix_len;
+			if (addr.vna_prefix == 64) {
+				addr.vna_v6_mask.s6_addr32[0] = addr.vna_v6_mask.s6_addr32[1] = 0xffffffff;
+				addr.vna_v6_mask.s6_addr32[2] = addr.vna_v6_mask.s6_addr32[3] = 0;
+			}
+			addr.vna_v6_ip.s6_addr[11] = (n->nid & 0x7f80) >> 7;
+			addr.vna_v6_ip.s6_addr[12] = (n->nid & 0x007f) << 1;
+			if (vc_net_add(n->nid, &addr) == -1) {
+				syslog(LOG_ERR, "vc_net_add(%u): %s", n->nid, strerror(errno));
+				goto next;
+			}
+			if (add_address_to_interface(i->prefix->ifindex, &addr.vna_v6_ip, addr.vna_prefix) == -1) {
+				syslog(LOG_ERR, "add_address_to_interface: %s", strerror(errno));
+				goto next;
+			}
+			if (add_nid_to_map(map, i->prefix, n->nid) == -1) {
+				syslog(LOG_ERR, "add_nid_to_map: %s", strerror(errno));
+				goto next;
+			}
+next:
+			i = i->p.next;
+		}
+	}
+
+	cleanup_nids(map, previous, current);
+	previous = current;
+}
+
+/* XXX These two functions are very similar */
+static int add_prefix(struct nid_prefix_map *map, struct prefixmsg *msg,
+		      struct in6_addr *prefix, struct prefix_cacheinfo *cache)
+{
+	struct nid_prefix_map *i = map;
+	struct prefix *new;
+
+	if (!msg || !prefix || !cache)
+		return -1;
+	/* XXX IF_PREFIX_AUTOCONF == 0x02 */
+	if (!(msg->prefix_flags & 0x02))
+		return -1;
+
+	do {
+		if (i->p.next != NULL)
+			i = i->p.next;
+		if (i->prefix && (ipv6_prefix_equal(prefix, &i->prefix->prefix.addr, msg->prefix_len) ||
+		    ipv6_prefix_equal(prefix, &i->prefix->address.addr, msg->prefix_len))) {
+			i->prefix->mask |= HAS_PREFIX;
+			i->prefix->ifindex = msg->prefix_ifindex;
+			memcpy(&i->prefix->prefix.addr, prefix, sizeof(*prefix));
+			i->prefix->prefix.prefix_len = msg->prefix_len;
+			i->prefix->prefix.valid_until = time(NULL) + cache->preferred_time;
+			return 0;
+		}
+	} while (i->p.next && i->nid == 0);
+
+	/* not yet in the map */
+	new = calloc(1, sizeof(*new));
+	if (!new)
+		return -1;
+	new->mask = HAS_PREFIX;
+	new->ifindex = msg->prefix_ifindex;	/* same field the update path above records */
+	memcpy(&new->prefix.addr, prefix, sizeof(*prefix));
+	new->prefix.prefix_len = msg->prefix_len;
+	new->prefix.valid_until = time(NULL) + cache->preferred_time;
+	if (add_prefix_to_map(map, new) == -1) {
+		free(new);
+		return -1;
+	}
+
+	return 1;
+}
+
+static inline int add_address(struct nid_prefix_map *map, struct ifaddrmsg *msg,
+			      struct in6_addr *address, struct ifa_cacheinfo *cache)
+{
+	struct nid_prefix_map *i = map;
+	struct prefix *new;
+
+	if (!msg || !address || !cache)
+		return -1;
+
+	if (address->s6_addr[11] != 0xFF || address->s6_addr[12] != 0xFE)
+		return -1;
+
+	do {
+		if (i->p.next != NULL)
+			i = i->p.next;
+		if (i->prefix && (ipv6_prefix_equal(address, &i->prefix->prefix.addr, msg->ifa_prefixlen) ||
+		    ipv6_prefix_equal(address, &i->prefix->address.addr, 128))) {
+			i->prefix->mask |= HAS_ADDRESS;
+			memcpy(&i->prefix->address.addr, address, sizeof(*address));
+			i->prefix->address.prefix_len = msg->ifa_prefixlen;
+			i->prefix->address.valid_until = time(NULL) + cache->ifa_prefered;
+			return 0;
+		}
+	} while (i->p.next && i->nid == 0);
+
+	new = calloc(1, sizeof(*new));
+	if (!new)
+		return -1;
+	new->mask = HAS_ADDRESS;
+	memcpy(&new->address.addr, address, sizeof(*address));
+	new->address.prefix_len = msg->ifa_prefixlen;
+	new->address.valid_until = time(NULL) + cache->ifa_prefered;
+	if (add_prefix_to_map(map, new) == -1) {
+		free(new);
+		return -1;
+	}
+
+	return 1;
+}
+
+static struct nla_policy addr_policy[IFA_MAX+1] = {
+	[IFA_ADDRESS] = { .minlen = sizeof(struct in6_addr) },
+	[IFA_LABEL] = { .type = NLA_STRING,
+			.maxlen = IFNAMSIZ },
+	[IFA_CACHEINFO] = { .minlen = sizeof(struct ifa_cacheinfo) },
+};
+static struct nla_policy prefix_policy[PREFIX_MAX+1] = {
+	[PREFIX_ADDRESS] = { .minlen = sizeof(struct in6_addr) },
+	[PREFIX_CACHEINFO] = { .minlen = sizeof(struct prefix_cacheinfo) },
+};
+/* Netlink callback: record a prefix or address announcement, then reconfigure the slices. */
+int handle_valid_msg(struct nl_msg *msg, void *arg)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(msg);
+	int ret = -1;
+	char *payload;
+	struct sockaddr_nl *source = nlmsg_get_src(msg);
+
+	payload = nlmsg_data(nlh);
+	if (source->nl_groups == RTMGRP_IPV6_PREFIX) {
+		struct prefixmsg *prefixmsg;
+		struct in6_addr *prefix = NULL;
+		struct prefix_cacheinfo *cacheinfo = NULL;
+		struct nlattr *tb[PREFIX_MAX+1];
+
+		if (nlmsg_parse(nlh, sizeof(struct prefixmsg), tb, PREFIX_MAX, prefix_policy) < 0) {
+			syslog(LOG_ERR, "Failed to parse prefixmsg");
+			return -1;
+		}
+
+		prefixmsg = (struct prefixmsg *) payload;
+		if (tb[PREFIX_ADDRESS])
+			prefix = nl_data_get(nla_get_data(tb[PREFIX_ADDRESS]));
+		if (tb[PREFIX_CACHEINFO])
+			cacheinfo = nl_data_get(nla_get_data(tb[PREFIX_CACHEINFO]));
+		ret = add_prefix(arg, prefixmsg, prefix, cacheinfo);
+	}
+	else if (source->nl_groups == RTMGRP_IPV6_IFADDR) {
+		struct ifaddrmsg *ifaddrmsg;
+		struct in6_addr *address = NULL;
+		struct ifa_cacheinfo *cacheinfo = NULL;
+		struct nlattr *tb[IFA_MAX+1];
+
+		if (nlmsg_parse(nlh, sizeof(struct ifaddrmsg), tb, IFA_MAX, addr_policy) < 0) {
+			syslog(LOG_ERR, "Failed to parse ifaddrmsg");
+			return -1;
+		}
+
+		ifaddrmsg = (struct ifaddrmsg *) payload;
+		if (tb[IFA_ADDRESS])
+			address = nl_data_get(nla_get_data(tb[IFA_ADDRESS]));
+		if (tb[IFA_CACHEINFO])
+			cacheinfo = nl_data_get(nla_get_data(tb[IFA_CACHEINFO]));
+		ret = add_address(arg, ifaddrmsg, address, cacheinfo);
+	}
+	if (ret >= 0)
+		do_slices_autoconf(arg);
+
+	return 0;
+}
+
+int handle_error_msg(struct sockaddr_nl *source, struct nlmsgerr *err,
+		     void *arg)
+{
+	syslog(LOG_ERR, "%s", strerror(-err->error));	/* nlmsgerr.error holds a negative errno */
+	return 0;
+}
+
+int handle_no_op(struct nl_msg *msg, void *arg)
+{
+	return 0;
+}
+
+/* only for access in the signal handler */
+struct nid_prefix_map map = {
+	.n = {
+		.next =
NULL, + .prev = NULL, + }, + .p = { + .next = NULL, + .prev = NULL, + }, +}; +void signal_handler(int signal) +{ + switch (signal) { + case SIGUSR1: + do_slices_autoconf(&map); + break; + } +} + +static int write_pidfile(const char *filename) +{ + FILE *fp; + fp = fopen(filename, "w"); + if (!fp) + return -1; + fprintf(fp, "%d\n", getpid()); + fclose(fp); + return 0; +} + +int main(int argc, char *argv[]) +{ + struct nl_cb *cbs; + + openlog("vip6-autod", LOG_PERROR, LOG_DAEMON); + + handle = nl_handle_alloc_nondefault(NL_CB_VERBOSE); + cbs = nl_handle_get_cb(handle); + nl_cb_set(cbs, NL_CB_VALID, NL_CB_CUSTOM, handle_valid_msg, &map); + nl_cb_set(cbs, NL_CB_SEQ_CHECK, NL_CB_CUSTOM, handle_no_op, NULL); + nl_cb_err(cbs, NL_CB_CUSTOM, handle_error_msg, &map); + nl_disable_sequence_check(handle); + + nl_join_groups(handle, RTMGRP_IPV6_PREFIX|RTMGRP_IPV6_IFADDR); + if (nl_connect(handle, NETLINK_ROUTE) == -1) { + syslog(LOG_CRIT, "nl_connect: %s", strerror(errno)); + exit(1); + } + + if (daemon(0, 0) == -1) + return -1; + + write_pidfile(LOCALSTATEDIR "/run/vip6-autod.pid"); + + signal(SIGUSR1, signal_handler); + + while (nl_recvmsgs(handle, cbs) > 0); + + nl_close(handle); + closelog(); + return 0; +} diff --git a/src/vsh.c b/src/vsh.c new file mode 100644 index 0000000..ffbe027 --- /dev/null +++ b/src/vsh.c @@ -0,0 +1,343 @@ +/* + * Marc E. Fiuczynski + * + * Copyright (c) 2004 The Trustees of Princeton University (Trustees). + * + * vsh is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * vsh is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + * License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Poptop; see the file COPYING. If not, write to the Free + * Software Foundation, 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//-------------------------------------------------------------------- +#include +#include "planetlab.h" + +/* Change to root:root (before entering new context). setgid(0) is done before setuid(0); returns 0 on success and -1, after PERROR, if either call fails. */ +static int setuidgid_root() +{ + if (setgid(0) < 0) { + PERROR("setgid(0)"); + return -1; + } + if (setuid(0) < 0) { + PERROR("setuid(0)"); + return -1; + } + return 0; +} +/* Allocate *root and fill it with "base/pwd->pw_name"; the process exits with status 1 if malloc fails. The caller owns the returned buffer. */ +static void compute_new_root(char *base, char **root, const struct passwd *pwd) +{ + int root_len; + + root_len = + strlen(base) + strlen("/") + + strlen(pwd->pw_name) + NULLBYTE_SIZE; + (*root) = (char *)malloc(root_len); + if ((*root) == NULL) { + PERROR("malloc(%d)", root_len); + exit(1); + } + + sprintf((*root), "%s/%s", base, pwd->pw_name); + (*root)[root_len - 1] = '\0'; +} +/* chroot() into DEFAULT_VSERVERDIR/pw_name and chdir to "/". Exits with status 1 on failure, otherwise returns 0. The path allocated by compute_new_root() is not freed here. */ +static int sandbox_chroot(const struct passwd *pwd) +{ + char *sandbox_root = NULL; + + compute_new_root(DEFAULT_VSERVERDIR,&sandbox_root, pwd); + if (chroot(sandbox_root) < 0) { + PERROR("chroot(%s)", sandbox_root); + exit(1); + } + if (chdir("/") < 0) { + PERROR("chdir(/)"); + exit(1); + } + return 0; +} +/* Move the process into context ctx. With CONFIG_VSERVER_LEGACY this calls vc_new_s_context() twice; otherwise it reads the slice limits with pl_get_limits(), exits 0 if vs_cpu is zero, chroots via sandbox_chroot() and calls pl_chcontext(), exiting 1 on error or when pl_chcontext() returns non-zero. Returns 0 whenever it returns. */ +static int sandbox_processes(xid_t ctx, const char *context, const struct passwd *pwd) +{ +#ifdef CONFIG_VSERVER_LEGACY + int flags; + + flags = 0; + flags |= 1; /* VX_INFO_LOCK -- cannot request a new vx_id */ + /* flags |= 4; VX_INFO_NPROC -- limit number of procs in a context */ + + (void) vc_new_s_context(ctx, 0, flags); + + /* use legacy dirty hack for capremove */ + if (vc_new_s_context(VC_SAMECTX, vc_get_insecurebcaps(), flags) == VC_NOCTX) { + PERROR("vc_new_s_context(%u, 0x%16llx, 0x%08x)", + 
VC_SAMECTX, vc_get_insecurebcaps(), flags); + exit(1); + } +#else + int ctx_is_new; + struct sliver_resources slr; + char hostname[HOST_NAME_MAX+1]; + pl_get_limits(context,&slr); + + if (gethostname(hostname, sizeof hostname) == -1) + { + PERROR("gethostname(...)"); + exit(1); + } + + /* check whether the slice has been suspended; note that this path exits with status 0 */ + if (slr.vs_cpu==0) + { + fprintf(stderr, "*** %s: %s has zero cpu resources and presumably it has been disabled/suspended ***\n", hostname, context); + exit(0); + } + + (void) (sandbox_chroot(pwd)); + + if ((ctx_is_new = pl_chcontext(ctx, ~vc_get_insecurebcaps(),&slr)) < 0) + { + PERROR("pl_chcontext(%u)", ctx); + exit(1); + } + if (ctx_is_new) + { + fprintf(stderr, " *** %s: %s has not been started yet, please check back later ***\n", hostname, context); + exit(1); + } +#endif + return 0; +} + +/* Switch to pwd's gid and then uid, chdir to pw_dir, and export HOME, LOGNAME, MAIL, SHELL and USER with putenv(). Any failure exits with status 1. The strings handed to putenv() are malloc'ed and not freed; envp is filled in but not read in this function. */ +void runas_slice_user(struct passwd *pwd) +{ + char *username = pwd->pw_name; + char *home_env, *logname_env, *mail_env, *shell_env, *user_env; + int home_len, logname_len, mail_len, shell_len, user_len; + static char *envp[10]; + + if (setgid(pwd->pw_gid) < 0) { + PERROR("setgid(%d)", pwd->pw_gid); + exit(1); + } + + if (setuid(pwd->pw_uid) < 0) { + PERROR("setuid(%d)", pwd->pw_uid); + exit(1); + } + + if (chdir(pwd->pw_dir) < 0) { + PERROR("chdir(%s)", pwd->pw_dir); + exit(1); + } + + home_len = strlen("HOME=") + strlen(pwd->pw_dir) + NULLBYTE_SIZE; + logname_len = strlen("LOGNAME=") + strlen(username) + NULLBYTE_SIZE; + mail_len = strlen("MAIL=/var/spool/mail/") + strlen(username) + + NULLBYTE_SIZE; + shell_len = strlen("SHELL=") + strlen(pwd->pw_shell) + NULLBYTE_SIZE; + user_len = strlen("USER=") + strlen(username) + NULLBYTE_SIZE; + + home_env = (char *)malloc(home_len); + logname_env = (char *)malloc(logname_len); + mail_env = (char *)malloc(mail_len); + shell_env = (char *)malloc(shell_len); + user_env = (char *)malloc(user_len); + + if ((home_env == NULL) || + (logname_env == NULL) || + (mail_env == NULL) || + (shell_env == NULL) 
|| + (user_env == NULL)) { + PERROR("malloc"); + exit(1); + } + + sprintf(home_env, "HOME=%s", pwd->pw_dir); + sprintf(logname_env, "LOGNAME=%s", username); + sprintf(mail_env, "MAIL=/var/spool/mail/%s", username); + sprintf(shell_env, "SHELL=%s", pwd->pw_shell); + sprintf(user_env, "USER=%s", username); + + home_env[home_len - 1] = '\0'; + logname_env[logname_len - 1] = '\0'; + mail_env[mail_len - 1] = '\0'; + shell_env[shell_len - 1] = '\0'; + user_env[user_len - 1] = '\0'; + + envp[0] = home_env; + envp[1] = logname_env; + envp[2] = mail_env; + envp[3] = shell_env; + envp[4] = user_env; + envp[5] = 0; + + if ((putenv(home_env) < 0) || + (putenv(logname_env) < 0) || + (putenv(mail_env) < 0) || + (putenv(shell_env) < 0) || + (putenv(user_env) < 0)) { + PERROR("vserver: putenv error "); + exit(1); + } +} +/* Become root, chroot early on CONFIG_VSERVER_LEGACY builds, then call sandbox_processes() with pwd->pw_uid as the xid and pwd->pw_name as the context name. Exits with status 2 on failure. */ +void slice_enter(struct passwd *pwd) +{ + if (setuidgid_root() < 0) { /* For chroot, new_s_context */ + fprintf(stderr, "vsh: Could not become root, check that SUID flag is set on binary\n"); + exit(2); + } + +#ifdef CONFIG_VSERVER_LEGACY + (void) (sandbox_chroot(pwd)); +#endif + + if (sandbox_processes((xid_t) pwd->pw_uid, pwd->pw_name, pwd) < 0) { + fprintf(stderr, "vsh: Could not change context to %d\n", pwd->pw_uid); + exit(2); + } +} + +//-------------------------------------------------------------------- + +#define DEFAULT_SHELL "/bin/sh" + +/* Exit statuses for programs like 'env' that exec other programs. + EXIT_FAILURE might not be 1, so use EXIT_FAIL in such programs. 
*/ +enum +{ + EXIT_CANNOT_INVOKE = 126, + EXIT_ENOENT = 127 +}; + +/* + * vsh entry point. Looks up the invoking user, enters the vserver context + * named after that user (slice_enter), re-reads the passwd entry inside the + * chroot, switches to that user (runas_slice_user) and prepares argv for the + * user's shell. A leading '-' in argv[0] marks a login-shell invocation. + */ +int main(int argc, char **argv) +{ + struct passwd pwdd, *result, *prechroot, *postchroot = &pwdd; + char *context, *username, *shell, *pwdBuffer; + long pwdBuffer_len; + uid_t uid; + int index, i; + + if (argv[0][0]=='-') + index = 1; + else + index = 0; + + uid = getuid(); + if ((prechroot = getpwuid(uid)) == NULL) { + PERROR("getpwuid(%d)", uid); + exit(1); + } + + /* keep a private copy: prechroot is the result of getpwuid() and is not owned by us */ + context = (char*)strdup(prechroot->pw_name); + if (!context) { + PERROR("strdup"); + exit(2); + } + + /* enter vserver "context" */ + slice_enter(prechroot); + + /* Get the /etc/passwd entry for this user, this time inside + * the chroot. + */ + username = context; + + pwdBuffer_len = sysconf(_SC_GETPW_R_SIZE_MAX); + if (pwdBuffer_len == -1) { + PERROR("sysconf(_SC_GETPW_R_SIZE_MAX)"); + exit(1); + } + pwdBuffer = (char*)malloc(pwdBuffer_len); + if (pwdBuffer == NULL) { + /* pwdBuffer_len is a long, so it needs %ld rather than %d */ + PERROR("malloc(%ld)", pwdBuffer_len); + exit(1); + } + + errno = 0; + if ((getpwnam_r(username,postchroot,pwdBuffer,pwdBuffer_len, &result) != 0) || + (errno != 0) || result != postchroot) { + PERROR("getpwnam_r(%s)", username); + exit(1); + } + + /* Make sure pw->pw_shell is non-NULL. This has to happen before + runas_slice_user(), which takes strlen() of pw_shell and exports + it as SHELL. + */ + if (postchroot->pw_shell == NULL || postchroot->pw_shell[0] == '\0') { + postchroot->pw_shell = (char *) DEFAULT_SHELL; + } + + /* Now run as username in this context. Note that for PlanetLab's + vserver configuration the context name also happens to be the + "default" username within the vserver context. + */ + runas_slice_user(postchroot); + + shell = (char *)strdup(postchroot->pw_shell); + if (!shell) { + PERROR("strdup"); + exit(2); + } + + /* Check whether 'su' or 'sshd' invoked us as a login shell or + not; did this above when testing argv[0]=='-'. 
+ */ + argv[0] = shell; + if (index == 1) { + char **args; + args = (char**)malloc(sizeof(char*)*(argc+2)); + if (!args) { + /* sizeof yields a size_t; cast so the argument matches %lu */ + PERROR("malloc(%lu)", (unsigned long)(sizeof(char*)*(argc+2))); + exit(1); + } + args[0] = argv[0]; + args[1] = "-l"; + for(i=1;i&2 + exit 1 +} +. "$UTIL_VSERVER_VARS" + +LOCKFILE=vip6-autod +. "$_LIB_VSERVER_INIT_FUNCTIONS" + +prog="vip6-autod" + +function start() +{ + _beginResult $"Starting $prog" + $__SBINDIR/$prog + _endResult $? + local retval=$? + test "$retval" -ne 0 || touch "$lockfile" + return $retval +} + +function stop() +{ + _beginResult $"Stopping $prog" + kill `cat $LOCALSTATEDIR/run/vip6-autod.pid` &>/dev/null + _endResult $? + local retval=$? + $_RM -f "$lockfile" + return $retval +} + +function restart() +{ + stop + start +} + +case "$1" in + start|stop|restart) $1;; + reload) ;; + condrestart) + test -f $lockfile && restart || : + ;; + status) + status $prog + ;; + *) + echo "Usage: $0 {start|stop|reload|restart|condrestart|status}" + exit 2 + ;; +esac diff --git a/util-vserver-pl.spec.in b/util-vserver-pl.spec.in new file mode 100644 index 0000000..ee196cd --- /dev/null +++ b/util-vserver-pl.spec.in @@ -0,0 +1,93 @@ +%define name util-vserver-pl +%define version 0.1 +%define release 1%{?pldistro:.%{pldistro}}%{?date:.%{date}} + +%define python_sitearch %( python -c "from distutils.sysconfig import get_python_lib; print get_python_lib(1)" ) + +Summary: PlanetLab extensions to util-vserver +Name: %{name} +Version: %{version} +Release: %{release} +License: GPL +Group: System Environment/Base +Source0: %{name}-%{version}.tar.bz2 +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root +Requires: util-vserver util-vserver-core util-vserver-build util-vserver-sysv +Obsoletes: util-vserver-py32 resman util-vserver-python +BuildRequires: util-vserver-core util-vserver-devel +BuildRequires: autoconf automake libtool + +%description +This package contains all PlanetLab extensions to util-vserver. 
+ +%prep +%setup -q +autoreconf -fi + + +%build +%configure +make + + +%install +rm -fr %{buildroot} +make DESTDIR=%{buildroot} install + +# nothing in "make install" creates /bin, so create it before linking +mkdir -p %{buildroot}/bin +ln -s ..%{_sbindir}/vsh %{buildroot}/bin/vsh + +rm -f %{buildroot}%{python_sitearch}/vserverimpl.a +rm -f %{buildroot}%{python_sitearch}/vserverimpl.la + +# Generate file list for python package +find "%{buildroot}" -name '*.py' | { while read FILE; do + f="${FILE#%{buildroot}}" + echo "${f}" + # need to touch these files, as they are not produced on FC4 or below + touch "${FILE}c" + touch "${FILE}o" + echo %%ghost "${f}c" + echo %%ghost "${f}o" +done } > %name-python.list + + +%post +# add /bin/vsh to list of secure shells +if [ ! -f /etc/shells ] || ! grep -q '^/bin/vsh$' /etc/shells ; then + echo /bin/vsh >> /etc/shells +fi + + +%postun +# 0 = erase, 1 = upgrade +if [ "$1" = 0 ] ; then + perl -i -n -e 'next if /^\/bin\/vsh$/; print' /etc/shells +fi + + +%clean +rm -fr %{buildroot} + + +%files -f %name-python.list +%defattr(-,root,root,-) +%{_sbindir}/bwlimit +%{_sbindir}/disklimit +%{_sbindir}/vuseradd +%{_sbindir}/vuserdel + +# the extension module itself; the generated list above only covers *.py +%{python_sitearch}/vserverimpl.so + +%{_sbindir}/vsh +/bin/vsh +%{_mandir}/man8/vsh.8* + +%{_sbindir}/vip6-autod +%{_sysconfdir}/init.d/vip6-autod + +%{_sbindir}/vcached +%{_sysconfdir}/cron.d/vcached.cron +%{_sysconfdir}/logrotate.d/vcached.logrotate + + +%changelog +* Fri Nov 30 2007 Daniel Hokka Zakrisson - 0.1-1 +- Initial release -- 2.43.0