#!/usr/bin/python
#
-# Bandwidth limit script to run on PlanetLab nodes. The intent is to use
-# the Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to:
+# Bandwidth limit module for PlanetLab nodes. The intent is to use the
+# Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow
+# slices to fairly share access to available node bandwidth. We
+# currently define three classes of "available node bandwidth":
#
-# 1. Cap the total output bandwidth of the node at a specified rate
-# (e.g., 5 Mbps).
+# 1. Available hardware bandwidth (bwmax): The maximum rate of the
+# hardware.
#
-# 2. Allow slices to fairly share this rate. Some slices have more
-# shares than other.
+# 2. Available capped bandwidth (bwcap): The maximum rate allowed to
+# non-exempt destinations. By default, equal to bwmax, but may be
+# lowered by PIs.
#
-# For instance, if the node is capped at 5 Mbps, there are N slices,
-# and each slice has 1 share, then each slice should get at least 5/N
-# Mbps of bandwidth. How HTB is implemented makes this statement a
-# little too simplistic. What it really means is that during any
-# single time period, only a certain number of bytes can be sent onto
-# the wire. Each slice is guaranteed that at least some small number
-# of its bytes will be sent. Whatever is left over from the budget, is
-# split in proportion to the number of shares each slice has.
+# 3. Available uncapped ("exempt") bandwidth: The difference between
+# bwmax and the portion of bwcap currently in use, or the maximum rate
+# allowed to destinations exempt from caps (e.g., Internet2).
#
-# The root context is exempt from this sharing and can send as much as
-# it needs to.
+# All three classes of bandwidth are fairly shared according to the
+# notion of "shares". For instance, if the node is capped at 5 Mbps,
+# there are N slices, and each slice has 1 share, then each slice
+# should get at least 5/N Mbps of bandwidth. How HTB is implemented
+# makes this statement a little too simplistic. What it really means
+# is that during any single time period, only a certain number of
+# bytes can be sent onto the wire. Each slice is guaranteed that at
+# least some small number of its bytes will be sent. Whatever is left
+# over from the budget is split in proportion to the number of shares
+# each slice has.
+#
+# Even if the node is not capped at a particular limit (bwcap ==
+# bwmax), this module enforces fair share access to bwmax. Also, if
+# the node is capped at a particular limit, rules may optionally be
+# defined that classify certain packets into the "exempt" class. This
+# class receives whatever bandwidth is left over between bwcap and
+# bwmax; slices fairly share this bandwidth as well.
+#
+# The root context is exempt from sharing and can send as much as it
+# needs to.
#
# Some relevant URLs:
#
# Mark Huang <mlhuang@cs.princeton.edu>
# Copyright (C) 2006 The Trustees of Princeton University
#
-# $Id$
+# $Id: bwlimit.py,v 1.9 2006/03/01 22:37:24 mlhuang Exp $
#
import sys, os, re, getopt
+from sets import Set
+import pwd
-# Where the tc binary lives.
+# Where the tc binary lives
TC = "/sbin/tc"
-# Default interface.
-DEV = "eth0"
+# Default interface
+dev = "eth0"
-# Verbosity level.
+# Verbosity level
verbose = 0
-# guarantee is the minimum rate in bits per second that each slice is
-# guaranteed. The value of this parameter is fairly meaningless, since
-# it is unlikely that every slice will try to transmit full blast
-# simultaneously. It just needs to be small enough so that the total
-# of all outstanding guarantees is less than or equal to the node
-# bandwidth cap (see below). A node with a 500kbit cap (the minimum
-# recommended) can support up to 500kbit/1000 = 500 slices.
-guarantee = 1000
+# bwmin should be small enough that it can be considered negligibly
+# slow compared to the hardware. 8 bits/second appears to be the
+# smallest value supported by tc.
+bwmin = 8
+
+# bwmax should be large enough that it can be considered at least as
+# fast as the hardware.
+bwmax = 1000*1000*1000
# quantum is the maximum number of bytes that can be borrowed by a
# share (or slice, if each slice gets 1 share) in one time period
# one time period as slices with 1 share, so averaged over time, they
# will get twice as much of the excess bandwidth. The value should be
# as small as possible and at least 1 MTU. By default, it would be
-# calculated as guarantee/10, but since we use such small guarantees,
-# it's better to just set it to a value safely above 1 Ethernet MTU.
+# calculated as bwmin/10, but since we use such a small value for
+# bwmin, it's better to just set it to a value safely above 1 Ethernet
+# MTU.
quantum = 1600
# cburst is the maximum number of bytes that can be burst onto the
# to borrow enough bandwidth to do so. For now, it's unclear how or if
# to relate this to the notion of shares, so just let tc set the
# default.
-
-# bwcap is the node bandwidth cap in tc format (see below for
-# supported suffixes), read in from /etc/planetlab/bwcap. We allow
-# each slice to borrow up to this rate, so it is also usually the
-# "ceil" rate for each slice. -1 means disabled.
-bwcap_file = "/etc/planetlab/bwcap"
-bwcap = -1
+cburst = None
# There is another parameter that controls how bandwidth is allocated
# between slices on nodes that is outside the scope of HTB. We enforce
# node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
# rule and executes this script to override "ceil".
-# root_minor is the special class for the root context. The root
-# context is exempt from minrate and fair sharing.
-root_minor = 0x2
-
-# default_minor is the special default class for unclassifiable
-# packets. Packets should not be classified here very often. They can
-# be if a slice's HTB class is deleted before its processes are.
-default_minor = 0xFFFF
+# We support multiple bandwidth limits, by reserving the top nibble of
+# the minor classid to be the "subclassid". Theoretically, we could
+# support up to 15 subclasses, but for now, we only define two: the
+# "default" subclass 1:10 that is capped at the node bandwidth cap (in
+# this example, 5mbit) and the "exempt" subclass 1:20 that is capped
+# at bwmax (i.e., not capped). The 1:1 parent class exists only to
+# make the borrowing model work. All bandwidth above minimum
+# guarantees is fairly shared (in this example, slice 2 is guaranteed
+# at least 1mbit in addition to fair access to the rest), subject to
+# the restrictions of the class hierarchy: namely, that the total
+# bandwidth to non-exempt destinations should not exceed the node
+# bandwidth cap.
+#
+# 1:
+# |
+# 1:1 (1gbit)
+# ______________|_____________
+# | |
+# 1:10 (8bit, 5mbit) 1:20 (8bit, 1gbit)
+# | |
+# 1:1000 (8bit, 5mbit), 1:2000 (8bit, 1gbit),
+# 1:1001 (8bit, 5mbit), 1:2001 (8bit, 1gbit),
+# 1:1002 (1mbit, 5mbit), 1:2002 (1mbit, 1gbit),
+# ... ...
+# 1:1FFF (8bit, 5mbit) 1:2FFF (8bit, 1gbit)
+#
+default_minor = 0x1000
+exempt_minor = 0x2000
+
+# root_xid is for the root context. The root context is exempt from
+# fair sharing in both the default and exempt subclasses. The root
+# context gets 5 shares by default.
+root_xid = 0x0000
+root_share = 5
+
+# default_xid is for unclassifiable packets. Packets should not be
+# classified here very often. They can be if a slice's HTB classes are
+# deleted before its processes are. Each slice gets 1 share by
+# default.
+default_xid = 0x0FFF
+default_share = 1
# See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
# warned that older versions of tc interpret "kbps", "mbps", "mbit",
}
-# Parses a tc rate string (e.g., 1.5mbit) into bits/second
+# Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second
def get_tc_rate(s):
+ if type(s) == int:
+ return s
m = re.match(r"([0-9.]+)(\D*)", s)
if m is None:
return -1
return "%.0fbit" % rate
-# Parse /etc/planetlab/bwcap. XXX Should get this from the API
-# instead.
-def parse_bwcap():
- global bwcap
-
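+# Parse /etc/planetlab/bwcap (or equivalent) and return the node
+# bandwidth cap in bits/second. Returns bwmax if the file cannot be
+# read or its contents cannot be parsed.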
+def read_bwcap(bwcap_file):
+ bwcap = bwmax
try:
fp = open(bwcap_file, "r")
line = fp.readline().strip()
bwcap = get_tc_rate(line)
except:
pass
+ if bwcap == -1:
+ bwcap = bwmax
+ return bwcap
-# Before doing anything else, parse the node bandwidth cap file
-parse_bwcap()
+# Get the current (live) value of bwcap by reading the ceil of class
+# 1:10 from tc. Returns -1 if no such class is found.
+def get_bwcap(dev = dev):
+
+ state = tc("-d class show dev %s" % dev)
+ base_re = re.compile(r"class htb 1:10 parent 1:1 .*ceil ([^ ]+) .*")
+ base_classes = filter(None, map(base_re.match, state))
+ if not base_classes:
+ return -1
+ if len(base_classes) > 1:
+ raise Exception, "unable to get current bwcap"
+ return get_tc_rate(base_classes[0].group(1))
# Get slice xid (500) from slice name ("500" or "princeton_mlh") or
# slice name ("princeton_mlh") from slice xid (500).
def get_slice(xid_or_name):
- labels = ['account', 'password', 'uid', 'gid', 'gecos', 'directory', 'shell']
- for line in file("/etc/passwd"):
- # Comment
- if line.strip() == '' or line[0] in '#':
- continue
- # princeton_mlh:x:...
- fields = line.strip().split(':')
- if len(fields) < len(labels):
- continue
- # {'account': 'princeton_mlh', 'password': 'x', ...}
- pw = dict(zip(labels, fields))
- if xid_or_name == default_minor:
- # Convert 0xffff into "default"
- return "default"
- elif xid_or_name == root_minor:
- # Convert 0x2 into "root"
- return "root"
- elif xid_or_name == int(pw['uid']):
- # Convert xid into name
- return pw['account']
- elif pw['uid'] == xid_or_name or pw['account'] == xid_or_name:
- # Convert name into xid
- return int(pw['uid'])
+ if xid_or_name == root_xid:
+ return "root"
+ if xid_or_name == default_xid:
+ return "default"
+ if isinstance(xid_or_name, (int, long)):
+ try:
+ return pwd.getpwuid(xid_or_name).pw_name
+ except KeyError:
+ pass
+ else:
+ try:
+ try:
+ return int(xid_or_name)
+ except ValueError:
+ pass
+ return pwd.getpwnam(xid_or_name).pw_uid
+ except KeyError:
+ pass
return None
-# Shortcut for running a tc command
-def tc(cmd):
+# Shortcut for running a command
+def run(cmd, input = None):
try:
if verbose:
- sys.stderr.write("Executing: " + TC + " " + cmd + "\n")
- fileobj = os.popen(TC + " " + cmd, "r")
- output = fileobj.readlines()
+ sys.stderr.write("Executing: " + cmd + "\n")
+ if input is None:
+ fileobj = os.popen(cmd, "r")
+ output = fileobj.readlines()
+ else:
+ fileobj = os.popen(cmd, "w")
+ fileobj.write(input)
+ output = None
if fileobj.close() is None:
return output
except Exception, e:
return None
+# Shortcut for running a tc command
+def tc(cmd):
+ return run(TC + " " + cmd)
+
+
# (Re)initialize the bandwidth limits on this node
-def init(dev = DEV):
- # Save current state (if any)
- caps = get(dev = DEV)
+def init(dev, bwcap):
# Delete root qdisc 1: if it exists. This will also automatically
# delete any child classes.
tc("qdisc del dev %s root handle 1:" % dev)
break
- # Nothing to do
- if bwcap == -1:
- return
-
# Initialize HTB. The "default" clause specifies that if a packet
# fails classification, it should go into the class with handle
- # FFFF.
- tc("qdisc add dev %s root handle 1: htb default FFFF" % dev)
+ # 1FFF.
+ tc("qdisc add dev %s root handle 1: htb default %x" % \
+ (dev, default_minor | default_xid))
- # Set up the parent class that represents the node bandwidth
- # cap; in other words, the class from which all others borrow.
+ # Set up a parent class from which all subclasses borrow.
tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
- (dev, bwcap))
+ (dev, bwmax))
+
+ # Set up a subclass that represents the node bandwidth cap. We
+ # allow each slice to borrow up to this rate, so it is also
+ # usually the "ceil" rate for each slice.
+ tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \
+ (dev, bwmin, bwcap))
+
+ # Set up a subclass that represents "exemption" from the node
+ # bandwidth cap. Once the node bandwidth cap is reached, bandwidth
+ # to exempt destinations can still be fairly shared up to bwmax.
+ tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \
+ (dev, bwmin, bwmax))
# Set up the root class (and tell VNET what it is). Packets sent
# by root end up here and are capped at the node bandwidth
# cap.
- on(root_minor, dev, minrate = bwcap, maxrate = bwcap)
- file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | root_minor))
+ on(root_xid, dev, share = root_share)
+ file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid))
# Set up the default class. Packets that fail classification end
# up here.
- on(default_minor, dev, maxrate = bwcap)
-
- # Reapply bandwidth caps. If the node bandwidth cap is now lower
- # than it was before, "ceil" for each class will be lowered. XXX
- # If the node bandwidth cap is now higher than it was before,
- # "ceil" for each class should be raised, but we have no idea
- # whether the lower cap was put on by pl_mom or by an admin, so it
- # is left as it was before, at least until pl_mom gets around to
- # resetting each slice's cap at the beginning of the next
- # day. What *should* happen is that Node Manager should control
- # both the application of the node bandwidth cap and the
- # application of the per-slice bandwidth caps, and there should be
- # only one external caller of this script (pl_mom). Even then,
- # pl_mom should probably be merged into Node Manager at some
- # point.
- for (xid, share, minrate, maxrate) in caps:
- if xid != root_minor and xid != default_minor:
- on(xid, dev, share = share, minrate = minrate, maxrate = maxrate)
+ on(default_xid, dev, share = default_share)
# Get the bandwidth limits for a particular slice xid as a tuple (xid,
# share, minrate, maxrate), or all classes as a list of tuples.
-def get(xid = None, dev = DEV):
+def get(xid = None, dev = dev):
if xid is None:
ret = []
else:
ret = None
- # class htb 1:2 parent 1:1 leaf 2: prio 0 rate 10Mbit ceil 10Mbit burst 14704b cburst 14704b
+ # class htb 1:1002 parent 1:10 leaf 81b3: prio 1 rate 8bit ceil 5000Kbit burst 1600b cburst 4Kb
for line in tc("-d class show dev %s" % dev):
- # Search for child classes of 1:1
- m = re.match(r"class htb 1:([0-9a-f]+) parent 1:1", line)
+ # Search for child classes of 1:10
+ m = re.match(r"class htb 1:([0-9a-f]+) parent 1:10", line)
if m is None:
continue
# If we are looking for a particular class
- classid = int(m.group(1), 16)
+ classid = int(m.group(1), 16) & default_xid
if xid is not None and xid != classid:
continue
share = int(m.group(1)) / quantum
# Parse minrate
- minrate = guarantee
+ minrate = bwmin
m = re.search(r"rate (\w+)", line)
if m is not None:
minrate = get_tc_rate(m.group(1))
# Parse maxrate
- maxrate = bwcap
+ maxrate = bwmax
m = re.search(r"ceil (\w+)", line)
if m is not None:
maxrate = get_tc_rate(m.group(1))
# Apply specified bandwidth limit to the specified slice xid
-def on(xid, dev = DEV, share = None, minrate = None, maxrate = None):
+def on(xid, dev = dev, share = None, minrate = None, maxrate = None):
# Get defaults from current state if available
cap = get(xid, dev)
if cap is not None:
if maxrate is None:
maxrate = cap[3]
+ # Figure out what the current node bandwidth cap is
+ bwcap = bwmax
+ for line in tc("-d class show dev %s" % dev):
+ # Search for 1:10
+ m = re.match(r"class htb 1:10.*ceil (\w+)", line)
+ if m is not None:
+ bwcap = get_tc_rate(m.group(1))
+ break
+
# Set defaults
if share is None:
- share = 1
+ share = default_share
if minrate is None:
- minrate = guarantee
+ minrate = bwmin
+ else:
+ minrate = get_tc_rate(minrate)
if maxrate is None:
maxrate = bwcap
+ else:
+ maxrate = get_tc_rate(maxrate)
+
+ # Sanity checks
+ if maxrate > bwcap:
+ maxrate = bwcap
+ if minrate > maxrate:
+ minrate = maxrate
- # Allow slices to burst up to the node bandwidth cap by default.
- maxrate = min(maxrate, bwcap)
+ # Set up subclasses for the slice
+ tc("class replace dev %s parent 1:10 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
+ (dev, default_minor | xid, minrate, maxrate, share * quantum))
- # Set up a class for the slice.
- tc("class replace dev %s parent 1:1 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
- (dev, xid, minrate, maxrate, share * quantum))
+ tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
+ (dev, exempt_minor | xid, minrate, bwmax, share * quantum))
- # Attach a FIFO to the class, which helps to throttle back
- # processes that are sending faster than the token bucket can
+ # Attach a FIFO to each subclass, which helps to throttle back
+ # processes that are sending faster than the token buckets can
# support.
tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
- (dev, xid, xid))
+ (dev, default_minor | xid, default_minor | xid))
+
+ tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
+ (dev, exempt_minor | xid, exempt_minor | xid))
# Remove class associated with specified slice xid. If further packets
# are seen from this slice, they will be classified into the default
-# class 1:FFFF.
-def off(xid, dev = DEV):
- tc("class del dev %s classid 1:%x" % (dev, xid))
+# class 1:1FFF.
+def off(xid, dev = dev):
+ cap = get(xid, dev)
+ if cap is not None:
+ tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
+ tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
+
+
+def exempt_init(group_name, node_ips):
+
+ # Clean up
+ iptables = "/sbin/iptables -t vnet %s POSTROUTING"
+ run(iptables % "-F")
+ run("/sbin/ipset -X " + group_name)
+
+ # Create a hashed IP set of all of these destinations
+ run("/sbin/modprobe ip_set_iphash")
+ lines = ["-N %s iphash" % group_name]
+ add_cmd = "-A %s " % group_name
+ lines += [(add_cmd + ip) for ip in node_ips]
+ lines += ["COMMIT"]
+ restore = "\n".join(lines) + "\n"
+ run("/sbin/ipset -R", restore)
+
+ # Add rule to match on destination IP set
+ run((iptables + " -m set --set %s dst -j CLASSIFY --set-class 1:%x") %
+ ("-A", group_name, exempt_minor))
def usage():
- if bwcap == -1:
- bwcap_description = "disabled"
- else:
- bwcap_description = "%d bits/second" % bwcap
+ bwcap_description = format_tc_rate(get_bwcap())
print """
Usage:
%s [OPTION]... [COMMAND] [ARGUMENT]...
-Options (override configuration file):
- -f file Configuration file (default: %s)
+Options:
-d device Network interface (default: %s)
-r rate Node bandwidth cap (default: %s)
- -g guarantee Default minimum slice rate (default: %d bits/second)
-q quantum Share multiplier (default: %d bytes)
-h This message
Commands:
init
- (Re)load configuration and (re)initialize bandwidth caps.
- on
- Same as init
+ (Re)initialize bandwidth caps.
on slice [share] [minrate] [maxrate]
Set bandwidth cap for the specified slice
- off
- Remove all bandwidth caps
off slice
Remove bandwidth caps for the specified slice
get
Get maxrate for the specified slice
setcap slice maxrate
Set maxrate for the specified slice
-""" % (sys.argv[0], bwcap_file, DEV, bwcap_description, guarantee, quantum)
+""" % (sys.argv[0], dev, bwcap_description, quantum)
sys.exit(1)
def main():
- global DEV, bwcap_file, bwcap, guarantee, quantum, verbose
+ global dev, quantum, verbose
+
+ # Defaults
+ bwcap = get_bwcap()
(opts, argv) = getopt.getopt(sys.argv[1:], "f:d:r:g:q:vh")
for (opt, optval) in opts:
- if opt == '-f':
- bwcap_file = optval
- parse_bwcap()
- elif opt == '-d':
- DEV = optval
+ if opt == '-d':
+ dev = optval
elif opt == '-r':
bwcap = get_tc_rate(optval)
- elif opt == '-g':
- guarantee = get_tc_rate(optval)
elif opt == '-q':
quantum = int(optval)
elif opt == '-v':
if len(argv):
if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1):
# (Re)initialize
- init(DEV)
-
- elif argv[0] == "off" and len(argv) == 1:
- # Disable all caps
- bwcap = -1
- init(DEV)
- sys.stderr.write("Warning: all configured bandwidth limits have been removed\n")
+ init(dev, get_tc_rate(bwcap))
elif argv[0] == "get" or argv[0] == "show":
# Show
if xid is None:
sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
usage()
- caps = [get(xid, DEV)]
+ caps = [get(xid, dev)]
else:
# Show all slices
- caps = get(None, DEV)
+ caps = get(None, dev)
for (xid, share, minrate, maxrate) in caps:
slice = get_slice(xid)
if slice is None:
# Orphaned (not associated with a slice) class
slice = "%d?" % xid
- print "%s: share %d minrate %s maxrate %s" % \
+ print "%s %d %s %s" % \
(slice, share, format_tc_rate(minrate), format_tc_rate(maxrate))
elif len(argv) >= 2:
if i >= len(casts):
break
args.append(casts[i](arg))
- on(xid, DEV, *args)
+ on(xid, dev, *args)
elif argv[0] == "off" or argv[0] == "del":
# Disable cap
- off(xid, DEV)
+ off(xid, dev)
# Backward compatibility with old resman script
elif argv[0] == "getcap":
# Get maxrate
- cap = get(xid, DEV)
+ cap = get(xid, dev)
if cap is not None:
(xid, share, minrate, maxrate) = cap
print format_tc_rate(maxrate)
elif argv[0] == "setcap":
if len(argv) >= 3:
# Set maxrate
- on(xid, DEV, maxrate = get_tc_rate(argv[2]))
+ on(xid, dev, maxrate = get_tc_rate(argv[2]))
else:
usage()