From: Mark Huang Date: Mon, 27 Feb 2006 01:58:09 +0000 (+0000) Subject: - rewrite to support a fairly shared "unlimited" bandwidth class X-Git-Tag: after-util-vserver-0_30_208-revert~47 X-Git-Url: http://git.onelab.eu/?p=util-vserver.git;a=commitdiff_plain;h=d00807c4ad78dec0938b0cb802514acc151cf33c - rewrite to support a fairly shared "unlimited" bandwidth class --- diff --git a/python/bwlimit.py b/python/bwlimit.py index 2043f7b..a5de20e 100644 --- a/python/bwlimit.py +++ b/python/bwlimit.py @@ -1,25 +1,41 @@ #!/usr/bin/python # -# Bandwidth limit script to run on PlanetLab nodes. The intent is to use -# the Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to: +# Bandwidth limit module for PlanetLab nodes. The intent is to use the +# Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow +# slices to fairly share access to available node bandwidth. We +# currently define three classes of "available node bandwidth": # -# 1. Cap the total output bandwidth of the node at a specified rate -# (e.g., 5 Mbps). +# 1. Available hardware bandwidth (bwmax): The maximum rate of the +# hardware. # -# 2. Allow slices to fairly share this rate. Some slices have more -# shares than other. +# 2. Available capped bandwidth (bwcap): The maximum rate allowed to +# non-exempt destinations. By default, equal to bwmax, but may be +# lowered by PIs. # -# For instance, if the node is capped at 5 Mbps, there are N slices, -# and each slice has 1 share, then each slice should get at least 5/N -# Mbps of bandwidth. How HTB is implemented makes this statement a -# little too simplistic. What it really means is that during any -# single time period, only a certain number of bytes can be sent onto -# the wire. Each slice is guaranteed that at least some small number -# of its bytes will be sent. Whatever is left over from the budget, is -# split in proportion to the number of shares each slice has. +# 3. Available uncapped ("exempt") bandwidth: The difference between +# bwmax and what is currently being used of bwcap, or the maximum rate +# allowed to destinations exempt from caps (e.g., Internet2). # -# The root context is exempt from this sharing and can send as much as -# it needs to. +# All three classes of bandwidth are fairly shared according to the +# notion of "shares". For instance, if the node is capped at 5 Mbps, +# there are N slices, and each slice has 1 share, then each slice +# should get at least 5/N Mbps of bandwidth. How HTB is implemented +# makes this statement a little too simplistic. What it really means +# is that during any single time period, only a certain number of +# bytes can be sent onto the wire. Each slice is guaranteed that at +# least some small number of its bytes will be sent. Whatever is left +# over from the budget, is split in proportion to the number of shares +# each slice has. +# +# Even if the node is not capped at a particular limit (bwcap == +# bwmax), this module enforces fair share access to bwmax. Also, if +# the node is capped at a particular limit, rules may optionally be +# defined that classify certain packets into the "exempt" class. This +# class receives whatever bandwidth is leftover between bwcap and +# bwmax; slices fairly share this bandwidth as well. +# +# The root context is exempt from sharing and can send as much as it +# needs to. # # Some relevant URLs: # @@ -30,7 +46,7 @@ # Mark Huang # Copyright (C) 2006 The Trustees of Princeton University # -# $Id$ +# $Id: bwlimit.py,v 1.4 2006/02/22 23:46:51 mlhuang Exp $ # import sys, os, re, getopt @@ -40,7 +56,11 @@ import sys, os, re, getopt TC = "/sbin/tc" # Default interface. -DEV = "eth0" +dev = "eth0" + +# For backward compatibility, if bwcap is not specified, attempt to +# get it from here. +bwcap_file = "/etc/planetlab/bwcap" # Verbosity level. verbose = 0 @@ -51,7 +71,7 @@ verbose = 0 # simultaneously. It just needs to be small enough so that the total # of all outstanding guarantees is less than or equal to the node # bandwidth cap (see below). A node with a 500kbit cap (the minimum -# recommended) can support up to 500kbit/1000 = 500 slices. +# recommended) can support up to 500kbit/1kbit = 500 slices. guarantee = 1000 # quantum is the maximum number of bytes that can be borrowed by a @@ -77,12 +97,9 @@ quantum = 1600 # to relate this to the notion of shares, so just let tc set the # default. -# bwcap is the node bandwidth cap in tc format (see below for -# supported suffixes), read in from /etc/planetlab/bwcap. We allow -# each slice to borrow up to this rate, so it is also usually the -# "ceil" rate for each slice. -1 means disabled. -bwcap_file = "/etc/planetlab/bwcap" -bwcap = -1 +# bwmax should just be large enough that it can be considered +# "unlimited". +bwmax = 1000*1000*1000 # There is another parameter that controls how bandwidth is allocated # between slices on nodes that is outside the scope of HTB. We enforce @@ -92,14 +109,36 @@ bwcap = -1 # node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this # rule and executes this script to override "ceil". -# root_minor is the special class for the root context. The root -# context is exempt from minrate and fair sharing. -root_minor = 0x2 +# We can support multiple bandwidth limits, by reserving the top +# nibble of the minor classid to be the "subclassid". Theoretically, +# we could support up to 15 subclasses, but for now, we only define +# two: the "default" subclass 1:1 that is capped at the node bandwidth +# cap (in this example, 5mbit) and the "exempt" subclass 1:2 that is +# capped at the hardware speed (in this example, 1gbit). The "exempt" +# subclass is entitled to whatever bandwidth is leftover after the +# node bandwidth cap is reached, and is fairly shared amongst non-root +# slices. +# +# 1: +# 1:1 (5mbit, 5mbit) 1:2 (1gbit, 1gbit) +# +# 1:1000 (1, 5mbit, 5mbit) 1:2000 (1gbit, 1gbit) +# 1:1001 (1, 1kbit, 5mbit) 1:2001 (1kbit, 1gbit) +# 1:1002 (1, 1kbit, 5mbit) 1:2002 (1kbit, 1gbit) +# ... ... +# 1:1FFF (1, 1kbit, 5mbit) 1:2FFF (1kbit, 1gbit) +# +default_minor = 0x1000 +exempt_minor = 0x2000 + +# root_xid is for the root context. The root context is exempt from +# fair sharing in both the default and exempt subclasses.. +root_xid = 0x0000 -# default_minor is the special default class for unclassifiable -# packets. Packets should not be classified here very often. They can -# be if a slice's HTB class is deleted before its processes are. -default_minor = 0xFFFF +# default_xid is for unclassifiable packets. Packets should not be +# classified here very often. They can be if a slice's HTB classes are +# deleted before its processes are. +default_xid = 0x0FFF # See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be # warned that older versions of tc interpret "kbps", "mbps", "mbit", @@ -129,8 +168,10 @@ suffixes = { } -# Parses a tc rate string (e.g., 1.5mbit) into bits/second +# Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second def get_tc_rate(s): + if type(s) == int: + return s m = re.match(r"([0-9.]+)(\D*)", s) if m is None: return -1 @@ -153,9 +194,8 @@ def format_tc_rate(rate): # Parse /etc/planetlab/bwcap. XXX Should get this from the API # instead. -def parse_bwcap(): - global bwcap - +def get_bwcap(): + bwcap = bwmax try: fp = open(bwcap_file, "r") line = fp.readline().strip() @@ -163,10 +203,9 @@ def parse_bwcap(): bwcap = get_tc_rate(line) except: pass - - -# Before doing anything else, parse the node bandwidth cap file -parse_bwcap() + if bwcap == -1: + bwcap = bwmax + return bwcap # Get slice xid (500) from slice name ("500" or "princeton_mlh") or @@ -184,12 +223,10 @@ def get_slice(xid_or_name): continue # {'account': 'princeton_mlh', 'password': 'x', ...} pw = dict(zip(labels, fields)) - if xid_or_name == default_minor: - # Convert 0xffff into "default" - return "default" - elif xid_or_name == root_minor: - # Convert 0x2 into "root" + if xid_or_name == root_xid: return "root" + if xid_or_name == default_xid: + return "default" elif xid_or_name == int(pw['uid']): # Convert xid into name return pw['account'] @@ -215,9 +252,17 @@ def tc(cmd): # (Re)initialize the bandwidth limits on this node -def init(dev = DEV): +def init(dev = dev, bwcap = None): + if bwcap is None: + # For backward compatibility, if bwcap is not specified, + # attempt to get it from /etc/planetlab/bwcap. + bwcap = get_bwcap() + else: + # Allow bwcap to be specified as a tc rate string + bwcap = get_tc_rate(bwcap) + # Save current state (if any) - caps = get(dev = DEV) + caps = get(dev = dev) # Delete root qdisc 1: if it exists. This will also automatically # delete any child classes. @@ -228,57 +273,52 @@ def init(dev = DEV): tc("qdisc del dev %s root handle 1:" % dev) break - # Nothing to do - if bwcap == -1: - return - # Initialize HTB. The "default" clause specifies that if a packet # fails classification, it should go into the class with handle - # FFFF. - tc("qdisc add dev %s root handle 1: htb default FFFF" % dev) + # 1FFF. + tc("qdisc add dev %s root handle 1: htb default %x" % \ + (dev, default_minor | default_xid)) - # Set up the parent class that represents the node bandwidth - # cap; in other words, the class from which all others borrow. + # Set up a subclass that represents the node bandwidth cap. We + # allow each slice to borrow up to this rate, so it is also + # usually the "ceil" rate for each slice. tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \ (dev, bwcap)) + # Set up a subclass that represents "exemption" from the node + # bandwidth cap. It gets whatever bandwidth is leftover after + # applying the node bandwidth cap to non-exempt packets. + tc("class add dev %s parent 1: classid 1:2 htb rate %dbit" % \ + (dev, bwmax)) + # Set up the root class (and tell VNET what it is). Packets sent # by root end up here and are capped at the node bandwidth # cap. - on(root_minor, dev, minrate = bwcap, maxrate = bwcap) - file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | root_minor)) + on(root_xid, dev, minrate = bwmax, maxrate = bwmax) + file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid)) # Set up the default class. Packets that fail classification end # up here. - on(default_minor, dev, maxrate = bwcap) + on(default_xid, dev, maxrate = bwcap) # Reapply bandwidth caps. If the node bandwidth cap is now lower - # than it was before, "ceil" for each class will be lowered. XXX - # If the node bandwidth cap is now higher than it was before, - # "ceil" for each class should be raised, but we have no idea - # whether the lower cap was put on by pl_mom or by an admin, so it - # is left as it was before, at least until pl_mom gets around to - # resetting each slice's cap at the beginning of the next - # day. What *should* happen is that Node Manager should control - # both the application of the node bandwidth cap and the - # application of the per-slice bandwidth caps, and there should be - # only one external caller of this script (pl_mom). Even then, - # pl_mom should probably be merged into Node Manager at some - # point. + # than it was before, "ceil" for each class will be lowered. If + # the node bandwidth cap is now higher than it was before, "ceil" + # for each class should be reapplied. for (xid, share, minrate, maxrate) in caps: - if xid != root_minor and xid != default_minor: + if xid != 0 and xid != default_xid: on(xid, dev, share = share, minrate = minrate, maxrate = maxrate) # Get the bandwidth limits for a particular slice xid as a tuple (xid, # share, minrate, maxrate), or all classes as a list of tuples. -def get(xid = None, dev = DEV): +def get(xid = None, dev = dev): if xid is None: ret = [] else: ret = None - # class htb 1:2 parent 1:1 leaf 2: prio 0 rate 10Mbit ceil 10Mbit burst 14704b cburst 14704b + # class htb 1:1002 parent 1:1 leaf 1002: prio 0 rate 10Mbit ceil 10Mbit burst 14704b cburst 14704b for line in tc("-d class show dev %s" % dev): # Search for child classes of 1:1 m = re.match(r"class htb 1:([0-9a-f]+) parent 1:1", line) @@ -286,7 +326,7 @@ def get(xid = None, dev = DEV): continue # If we are looking for a particular class - classid = int(m.group(1), 16) + classid = int(m.group(1), 16) & default_xid if xid is not None and xid != classid: continue @@ -303,7 +343,7 @@ def get(xid = None, dev = DEV): minrate = get_tc_rate(m.group(1)) # Parse maxrate - maxrate = bwcap + maxrate = bwmax m = re.search(r"ceil (\w+)", line) if m is not None: maxrate = get_tc_rate(m.group(1)) @@ -320,7 +360,7 @@ def get(xid = None, dev = DEV): # Apply specified bandwidth limit to the specified slice xid -def on(xid, dev = DEV, share = None, minrate = None, maxrate = None): +def on(xid, dev = dev, share = None, minrate = None, maxrate = None): # Get defaults from current state if available cap = get(xid, dev) if cap is not None: @@ -331,63 +371,75 @@ def on(xid, dev = DEV, share = None, minrate = None, maxrate = None): if maxrate is None: maxrate = cap[3] + # Figure out what the current node bandwidth cap is + bwcap = bwmax + for line in tc("-d class show dev %s" % dev): + # Search for 1:1 + m = re.match(r"class htb 1:1 root .*ceil (\w+)", line) + if m is not None: + bwcap = get_tc_rate(m.group(1)) + break + # Set defaults if share is None: share = 1 if minrate is None: minrate = guarantee + else: + minrate = get_tc_rate(minrate) if maxrate is None: maxrate = bwcap + else: + maxrate = get_tc_rate(maxrate) - # Allow slices to burst up to the node bandwidth cap by default. - maxrate = min(maxrate, bwcap) + if minrate > maxrate: + minrate = maxrate - # Set up a class for the slice. + # Set up subclasses for the slice. tc("class replace dev %s parent 1:1 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \ - (dev, xid, minrate, maxrate, share * quantum)) + (dev, default_minor | xid, min(minrate, bwcap), min(maxrate, bwcap), share * quantum)) + + tc("class replace dev %s parent 1:2 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \ + (dev, exempt_minor | xid, min(minrate, bwmax), bwmax, share * quantum)) - # Attach a FIFO to the class, which helps to throttle back - # processes that are sending faster than the token bucket can + # Attach a FIFO to each subclass, which helps to throttle back + # processes that are sending faster than the token buckets can # support. tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \ - (dev, xid, xid)) + (dev, default_minor | xid, default_minor | xid)) + + tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \ + (dev, exempt_minor | xid, exempt_minor | xid)) # Remove class associated with specified slice xid. If further packets # are seen from this slice, they will be classified into the default -# class 1:FFFF. -def off(xid, dev = DEV): - tc("class del dev %s classid 1:%x" % (dev, xid)) +# class 1:1FFF. +def off(xid, dev = dev): + tc("class del dev %s classid 1:%x" % (dev, default_minor | xid)) + tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid)) def usage(): - if bwcap == -1: - bwcap_description = "disabled" - else: - bwcap_description = "%d bits/second" % bwcap + bwcap_description = format_tc_rate(bwmax) print """ Usage: %s [OPTION]... [COMMAND] [ARGUMENT]... -Options (override configuration file): - -f file Configuration file (default: %s) +Options: -d device Network interface (default: %s) -r rate Node bandwidth cap (default: %s) - -g guarantee Default minimum slice rate (default: %d bits/second) + -g guarantee Default minimum slice rate (default: %s bits/second) -q quantum Share multiplier (default: %d bytes) -h This message Commands: init - (Re)load configuration and (re)initialize bandwidth caps. - on - Same as init + (Re)initialize bandwidth caps. on slice [share] [minrate] [maxrate] Set bandwidth cap for the specified slice - off - Remove all bandwidth caps off slice Remove bandwidth caps for the specified slice get @@ -398,20 +450,20 @@ Commands: Get maxrate for the specified slice setcap slice maxrate Set maxrate for the specified slice -""" % (sys.argv[0], bwcap_file, DEV, bwcap_description, guarantee, quantum) +""" % (sys.argv[0], dev, bwcap_description, guarantee, quantum) sys.exit(1) def main(): - global DEV, bwcap_file, bwcap, guarantee, quantum, verbose + global dev, guarantee, quantum, verbose + + # Defaults + bwcap = get_bwcap() (opts, argv) = getopt.getopt(sys.argv[1:], "f:d:r:g:q:vh") for (opt, optval) in opts: - if opt == '-f': - bwcap_file = optval - parse_bwcap() - elif opt == '-d': - DEV = optval + if opt == '-d': + dev = optval elif opt == '-r': bwcap = get_tc_rate(optval) elif opt == '-g': @@ -426,13 +478,7 @@ def main(): if len(argv): if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1): # (Re)initialize - init(DEV) - - elif argv[0] == "off" and len(argv) == 1: - # Disable all caps - bwcap = -1 - init(DEV) - sys.stderr.write("Warning: all configured bandwidth limits have been removed\n") + init(dev, bwcap) elif argv[0] == "get" or argv[0] == "show": # Show @@ -442,10 +488,10 @@ def main(): if xid is None: sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1]) usage() - caps = [get(xid, DEV)] + caps = [get(xid, dev)] else: # Show all slices - caps = get(None, DEV) + caps = get(None, dev) for (xid, share, minrate, maxrate) in caps: slice = get_slice(xid) @@ -472,16 +518,16 @@ def main(): if i >= len(casts): break args.append(casts[i](arg)) - on(xid, DEV, *args) + on(xid, dev, *args) elif argv[0] == "off" or argv[0] == "del": # Disable cap - off(xid, DEV) + off(xid, dev) # Backward compatibility with old resman script elif argv[0] == "getcap": # Get maxrate - cap = get(xid, DEV) + cap = get(xid, dev) if cap is not None: (xid, share, minrate, maxrate) = cap print format_tc_rate(maxrate) @@ -490,7 +536,7 @@ def main(): elif argv[0] == "setcap": if len(argv) >= 3: # Set maxrate - on(xid, DEV, maxrate = get_tc_rate(argv[2])) + on(xid, dev, maxrate = get_tc_rate(argv[2])) else: usage()