# Mark Huang <mlhuang@cs.princeton.edu>
# Copyright (C) 2006 The Trustees of Princeton University
#
-# $Id: bwlimit.py,v 1.4 2006/02/22 23:46:51 mlhuang Exp $
+# $Id: bwlimit.py,v 1.7 2006/03/01 18:54:38 mlhuang Exp $
#
import sys, os, re, getopt
+from sets import Set
+import plcapi
-# Where the tc binary lives.
+# Where the tc binary lives
TC = "/sbin/tc"
-# Default interface.
+# Default interface
dev = "eth0"
# For backward compatibility, if bwcap is not specified, attempt to
# get it from here.
bwcap_file = "/etc/planetlab/bwcap"
-# Verbosity level.
+# Verbosity level
verbose = 0
-# guarantee is the minimum rate in bits per second that each slice is
-# guaranteed. The value of this parameter is fairly meaningless, since
-# it is unlikely that every slice will try to transmit full blast
-# simultaneously. It just needs to be small enough so that the total
-# of all outstanding guarantees is less than or equal to the node
-# bandwidth cap (see below). A node with a 500kbit cap (the minimum
-# recommended) can support up to 500kbit/1kbit = 500 slices.
-guarantee = 1000
+# bwmin should be small enough that it can be considered negligibly
+# slow compared to the hardware. 8 bits/second appears to be the
+# smallest value supported by tc.
+bwmin = 8
+
+# bwmax should be large enough that it can be considered at least as
+# fast as the hardware.
+bwmax = 1000*1000*1000
# quantum is the maximum number of bytes that can be borrowed by a
# share (or slice, if each slice gets 1 share) in one time period
# one time period as slices with 1 share, so averaged over time, they
# will get twice as much of the excess bandwidth. The value should be
# as small as possible and at least 1 MTU. By default, it would be
-# calculated as guarantee/10, but since we use such small guarantees,
-# it's better to just set it to a value safely above 1 Ethernet MTU.
+# calculated as bwmin/10, but since we use such a small value for
+# bwmin, it's better to just set it to a value safely above 1 Ethernet
+# MTU.
quantum = 1600
# cburst is the maximum number of bytes that can be burst onto the
# to borrow enough bandwidth to do so. For now, it's unclear how or if
# to relate this to the notion of shares, so just let tc set the
# default.
-
-# bwmax should just be large enough that it can be considered
-# "unlimited".
-bwmax = 1000*1000*1000
+cburst = None
# There is another parameter that controls how bandwidth is allocated
# between slices on nodes that is outside the scope of HTB. We enforce
# node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
# rule and executes this script to override "ceil".
-# We can support multiple bandwidth limits, by reserving the top
-# nibble of the minor classid to be the "subclassid". Theoretically,
-# we could support up to 15 subclasses, but for now, we only define
-# two: the "default" subclass 1:1 that is capped at the node bandwidth
-# cap (in this example, 5mbit) and the "exempt" subclass 1:2 that is
-# capped at the hardware speed (in this example, 1gbit). The "exempt"
-# subclass is entitled to whatever bandwidth is leftover after the
-# node bandwidth cap is reached, and is fairly shared amongst non-root
-# slices.
-#
-# 1:
-# 1:1 (5mbit, 5mbit) 1:2 (1gbit, 1gbit)
+# We support multiple bandwidth limits by reserving the top nibble of
+# the minor classid to be the "subclassid". Theoretically, we could
+# support up to 15 subclasses, but for now, we only define two: the
+# "default" subclass 1:10 that is capped at the node bandwidth cap (in
+# this example, 5mbit) and the "exempt" subclass 1:20 that is capped
+# at bwmax (i.e., not capped). The 1:1 parent class exists only to
+# make the borrowing model work. All bandwidth above minimum
+# guarantees is fairly shared (in this example, slice 2 is guaranteed
+# at least 1mbit in addition to fair access to the rest), subject to
+# the restrictions of the class hierarchy: namely, that the total
+# bandwidth to non-exempt destinations should not exceed the node
+# bandwidth cap.
#
-# 1:1000 (1, 5mbit, 5mbit) 1:2000 (1gbit, 1gbit)
-# 1:1001 (1, 1kbit, 5mbit) 1:2001 (1kbit, 1gbit)
-# 1:1002 (1, 1kbit, 5mbit) 1:2002 (1kbit, 1gbit)
-# ... ...
-# 1:1FFF (1, 1kbit, 5mbit) 1:2FFF (1kbit, 1gbit)
+# 1:
+# |
+# 1:1 (1gbit)
+# ______________|_____________
+# | |
+# 1:10 (8bit, 5mbit) 1:20 (8bit, 1gbit)
+# | |
+# 1:1000 (8bit, 5mbit), 1:2000 (8bit, 1gbit),
+# 1:1001 (8bit, 5mbit), 1:2001 (8bit, 1gbit),
+# 1:1002 (1mbit, 5mbit), 1:2002 (1mbit, 1gbit),
+# ... ...
+# 1:1FFF (8bit, 5mbit) 1:2FFF (8bit, 1gbit)
#
default_minor = 0x1000
exempt_minor = 0x2000
# root_xid is for the root context. The root context is exempt from
-# fair sharing in both the default and exempt subclasses..
+# fair sharing in both the default and exempt subclasses. The root
+# context gets 5 shares by default.
root_xid = 0x0000
+root_share = 5
# default_xid is for unclassifiable packets. Packets should not be
# classified here very often. They can be if a slice's HTB classes are
-# deleted before its processes are.
+# deleted before its processes are. Each slice gets 1 share by
+# default.
default_xid = 0x0FFF
+default_share = 1
# See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
# warned that older versions of tc interpret "kbps", "mbps", "mbit",
return None
-# Shortcut for running a tc command
-def tc(cmd):
+# Shortcut for running a command
+def run(cmd, input = None):
try:
if verbose:
- sys.stderr.write("Executing: " + TC + " " + cmd + "\n")
- fileobj = os.popen(TC + " " + cmd, "r")
- output = fileobj.readlines()
+ sys.stderr.write("Executing: " + cmd + "\n")
+ if input is None:
+ fileobj = os.popen(cmd, "r")
+ output = fileobj.readlines()
+ else:
+ fileobj = os.popen(cmd, "w")
+ fileobj.write(input)
+ output = None
if fileobj.close() is None:
return output
except Exception, e:
return None
+# Shortcut for running a tc command
+def tc(cmd):
+ return run(TC + " " + cmd)
+
+
# (Re)initialize the bandwidth limits on this node
def init(dev = dev, bwcap = None):
if bwcap is None:
# Allow bwcap to be specified as a tc rate string
bwcap = get_tc_rate(bwcap)
- # Save current state (if any)
- caps = get(dev = dev)
-
# Delete root qdisc 1: if it exists. This will also automatically
# delete any child classes.
for line in tc("qdisc show dev %s" % dev):
tc("qdisc add dev %s root handle 1: htb default %x" % \
(dev, default_minor | default_xid))
+ # Set up a parent class from which all subclasses borrow.
+ tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
+ (dev, bwmax))
+
# Set up a subclass that represents the node bandwidth cap. We
# allow each slice to borrow up to this rate, so it is also
# usually the "ceil" rate for each slice.
- tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
- (dev, bwcap))
+ tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \
+ (dev, bwmin, bwcap))
# Set up a subclass that represents "exemption" from the node
- # bandwidth cap. It gets whatever bandwidth is leftover after
- # applying the node bandwidth cap to non-exempt packets.
- tc("class add dev %s parent 1: classid 1:2 htb rate %dbit" % \
- (dev, bwmax))
+ # bandwidth cap. Once the node bandwidth cap is reached, bandwidth
+ # to exempt destinations can still be fairly shared up to bwmax.
+ tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \
+ (dev, bwmin, bwmax))
# Set up the root class (and tell VNET what it is). Packets sent
# by root end up here and are capped at the node bandwidth
# cap.
- on(root_xid, dev, minrate = bwmax, maxrate = bwmax)
+ on(root_xid, dev, share = root_share)
file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid))
# Set up the default class. Packets that fail classification end
# up here.
- on(default_xid, dev, maxrate = bwcap)
+ on(default_xid, dev, share = default_share)
- # Reapply bandwidth caps. If the node bandwidth cap is now lower
- # than it was before, "ceil" for each class will be lowered. If
- # the node bandwidth cap is now higher than it was before, "ceil"
- # for each class should be reapplied.
- for (xid, share, minrate, maxrate) in caps:
- if xid != 0 and xid != default_xid:
- on(xid, dev, share = share, minrate = minrate, maxrate = maxrate)
+ # Set up exemptions.
+ exempt_init()
# Get the bandwidth limits for a particular slice xid as a tuple (xid,
else:
ret = None
- # class htb 1:1002 parent 1:1 leaf 1002: prio 0 rate 10Mbit ceil 10Mbit burst 14704b cburst 14704b
+ # class htb 1:1002 parent 1:10 leaf 81b3: prio 1 rate 8bit ceil 5000Kbit burst 1600b cburst 4Kb
for line in tc("-d class show dev %s" % dev):
- # Search for child classes of 1:1
- m = re.match(r"class htb 1:([0-9a-f]+) parent 1:1", line)
+ # Search for child classes of 1:10
+ m = re.match(r"class htb 1:([0-9a-f]+) parent 1:10", line)
if m is None:
continue
share = int(m.group(1)) / quantum
# Parse minrate
- minrate = guarantee
+ minrate = bwmin
m = re.search(r"rate (\w+)", line)
if m is not None:
minrate = get_tc_rate(m.group(1))
# Figure out what the current node bandwidth cap is
bwcap = bwmax
for line in tc("-d class show dev %s" % dev):
- # Search for 1:1
- m = re.match(r"class htb 1:1 root .*ceil (\w+)", line)
+ # Search for 1:10
+ m = re.match(r"class htb 1:10.*ceil (\w+)", line)
if m is not None:
bwcap = get_tc_rate(m.group(1))
break
# Set defaults
if share is None:
- share = 1
+ share = default_share
if minrate is None:
- minrate = guarantee
+ minrate = bwmin
else:
minrate = get_tc_rate(minrate)
if maxrate is None:
else:
maxrate = get_tc_rate(maxrate)
+ # Sanity checks
+ if maxrate > bwcap:
+ maxrate = bwcap
if minrate > maxrate:
minrate = maxrate
- # Set up subclasses for the slice.
- tc("class replace dev %s parent 1:1 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
- (dev, default_minor | xid, min(minrate, bwcap), min(maxrate, bwcap), share * quantum))
+ # Set up subclasses for the slice
+ tc("class replace dev %s parent 1:10 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
+ (dev, default_minor | xid, minrate, maxrate, share * quantum))
- tc("class replace dev %s parent 1:2 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
- (dev, exempt_minor | xid, min(minrate, bwmax), bwmax, share * quantum))
+ tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
+ (dev, exempt_minor | xid, minrate, bwmax, share * quantum))
# Attach a FIFO to each subclass, which helps to throttle back
# processes that are sending faster than the token buckets can
# are seen from this slice, they will be classified into the default
# class 1:1FFF.
def off(xid, dev = dev):
- tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
- tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
+ cap = get(xid, dev)
+ if cap is not None:
+ tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
+ tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
+
+
+def exempt_init():
+ # Who are we?
+ try:
+ node_id = int(file('/etc/planetlab/node_id').readline().strip())
+ except:
+ return False
+
+ api = plcapi.PLCAPI()
+
+ # All nodes that have access to Internet2
+ node_ids = []
+ for node_group in api.AnonAdmGetNodeGroups(api.auth):
+ if node_group['name'] == "Internet2":
+ node_ids += api.AnonAdmGetNodeGroupNodes(api.auth, node_group['nodegroup_id'])
+
+ # Remove duplicates
+ node_ids = list(Set(node_ids))
+
+ # Continue only if we ourselves have access to Internet2
+ if node_id not in node_ids:
+ return True
+
+ # Exempt the following destinations from the node bandwidth cap
+ node_ips = [node['ip'] for node in api.AnonAdmGetNodes(api.auth, node_ids, ['ip'])]
+
+ # Clean up
+ run("/sbin/iptables -t vnet -F POSTROUTING")
+ run("/sbin/ipset -X Internet2")
+
+ # Create a hashed IP set of all of these destinations
+ run("/sbin/modprobe ip_set_iphash")
+ lines = ["-N Internet2 iphash"]
+ lines += ["-A Internet2 " + ip for ip in node_ips]
+ lines += ["COMMIT"]
+ restore = "\n".join(lines) + "\n"
+ run("/sbin/ipset -R", restore)
+
+ # Add rule to match on destination IP set
+ run("/sbin/iptables -t vnet -A POSTROUTING -m set --set Internet2 dst -j CLASSIFY --set-class 1:%x" %
+ exempt_minor)
def usage():
Options:
-d device Network interface (default: %s)
-r rate Node bandwidth cap (default: %s)
- -g guarantee Default minimum slice rate (default: %s bits/second)
-q quantum Share multiplier (default: %d bytes)
-h This message
Get maxrate for the specified slice
setcap slice maxrate
Set maxrate for the specified slice
-""" % (sys.argv[0], dev, bwcap_description, guarantee, quantum)
+""" % (sys.argv[0], dev, bwcap_description, quantum)
sys.exit(1)
def main():
- global dev, guarantee, quantum, verbose
+ global dev, quantum, verbose
# Defaults
bwcap = get_bwcap()
dev = optval
elif opt == '-r':
bwcap = get_tc_rate(optval)
- elif opt == '-g':
- guarantee = get_tc_rate(optval)
elif opt == '-q':
quantum = int(optval)
elif opt == '-v':
if slice is None:
# Orphaned (not associated with a slice) class
slice = "%d?" % xid
- print "%s: share %d minrate %s maxrate %s" % \
+ print "%s %d %s %s" % \
(slice, share, format_tc_rate(minrate), format_tc_rate(maxrate))
elif len(argv) >= 2: