3 # Bandwidth limit module for PlanetLab nodes. The intent is to use the
4 # Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow
5 # slices to fairly share access to available node bandwidth. We
6 # currently define three classes of "available node bandwidth":
8 # 1. Available hardware bandwidth (bwmax): The maximum rate of the
11 # 2. Available capped bandwidth (bwcap): The maximum rate allowed to
12 # non-exempt destinations. By default, equal to bwmax, but may be
15 # 3. Available uncapped ("exempt") bandwidth: The difference between
16 # bwmax and what is currently being used of bwcap, or the maximum rate
17 # allowed to destinations exempt from caps (e.g., Internet2).
19 # All three classes of bandwidth are fairly shared according to the
20 # notion of "shares". For instance, if the node is capped at 5 Mbps,
21 # there are N slices, and each slice has 1 share, then each slice
22 # should get at least 5/N Mbps of bandwidth. How HTB is implemented
23 # makes this statement a little too simplistic. What it really means
24 # is that during any single time period, only a certain number of
25 # bytes can be sent onto the wire. Each slice is guaranteed that at
26 # least some small number of its bytes will be sent. Whatever is left
27 # over from the budget, is split in proportion to the number of shares
30 # Even if the node is not capped at a particular limit (bwcap ==
31 # bwmax), this module enforces fair share access to bwmax. Also, if
32 # the node is capped at a particular limit, rules may optionally be
33 # defined that classify certain packets into the "exempt" class. This
34 # class receives whatever bandwidth is leftover between bwcap and
35 # bwmax; slices fairly share this bandwidth as well.
37 # The root context is exempt from sharing and can send as much as it
42 # 1. http://lartc.org/howto for how to use tc
43 # 2. http://luxik.cdi.cz/~devik/qos/htb/ for info on HTB
45 # Andy Bavier <acb@cs.princeton.edu>
46 # Mark Huang <mlhuang@cs.princeton.edu>
47 # Copyright (C) 2006 The Trustees of Princeton University
49 # $Id: bwlimit.py,v 1.5 2006/02/27 01:58:09 mlhuang Exp $
52 import sys, os, re, getopt
55 # Where the tc binary lives
61 # For backward compatibility, if bwcap is not specified, attempt to
63 bwcap_file = "/etc/planetlab/bwcap"
68 # bwmin should be small enough that it can be considered negligibly
69 # slow compared to the hardware. 8 bits/second appears to be the
70 # smallest value supported by tc.
73 # bwmax should be large enough that it can be considered at least as
74 # fast as the hardware.
75 bwmax = 1000*1000*1000
77 # quantum is the maximum number of bytes that can be borrowed by a
78 # share (or slice, if each slice gets 1 share) in one time period
79 # (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth
80 # above their guarantees, and each is attempting to borrow up to the
81 # node bandwidth cap, quantums control how the excess bandwidth is
82 # distributed. Slices with 2 shares will borrow twice the amount in
83 # one time period as slices with 1 share, so averaged over time, they
84 # will get twice as much of the excess bandwidth. The value should be
85 # as small as possible and at least 1 MTU. By default, it would be
86 # calculated as bwmin/10, but since we use such small a value for
87 # bwmin, it's better to just set it to a value safely above 1 Ethernet
91 # cburst is the maximum number of bytes that can be burst onto the
92 # wire in one time period (with HZ=1000, 1 ms). If multiple slices
93 # have data queued for transmission, cbursts control how long each
94 # slice can have the wire for. If not specified, it is set to the
95 # smallest possible value that would enable the slice's "ceil" rate
96 # (usually the node bandwidth cap), to be reached if a slice was able
97 # to borrow enough bandwidth to do so. For now, it's unclear how or if
98 # to relate this to the notion of shares, so just let tc set the
102 # There is another parameter that controls how bandwidth is allocated
103 # between slices on nodes that is outside the scope of HTB. We enforce
104 # a 16 GByte/day total limit on each slice, which works out to about
105 # 1.5mbit. If a slice exceeds this byte limit before the day finishes,
106 # it is capped at (i.e., its "ceil" rate is set to) the smaller of the
107 # node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
108 # rule and executes this script to override "ceil".
110 # We support multiple bandwidth limits, by reserving the top nibble of
111 # the minor classid to be the "subclassid". Theoretically, we could
112 # support up to 15 subclasses, but for now, we only define two: the
113 # "default" subclass 1:10 that is capped at the node bandwidth cap (in
114 # this example, 5mbit) and the "exempt" subclass 1:20 that is capped
115 # at bwmax (i.e., not capped). The 1:1 parent class exists only to
116 # make the borrowing model work. All bandwidth is fairly shared,
117 # subject to the restrictions of the class hierarchy: namely, that the
118 # total bandwidth to non-exempt destinations should not exceed the
119 # node bandwidth cap. The root slice has a higher priority (0) than
120 # the others (1) and can thus request all of the bandwidth of that
126 # ______________|______________
128 # 1:10 (8bit, 5mbit) 1:20 (8bit, 1gbit)
130 # 1:1000 (8bit, 5mbit, 0), 1:2000 (8bit, 1gbit, 0),
131 # 1:1001 (8bit, 5mbit, 1), 1:2001 (8bit, 1gbit, 1),
132 # 1:1002 (8bit, 5mbit, 1), 1:2002 (8bit, 1gbit, 1),
134 # 1:1FFF (8bit, 5mbit, 1) 1:2FFF (8bit, 1gbit, 1)
136 default_minor = 0x1000
137 exempt_minor = 0x2000
139 # root_xid is for the root context. The root context is exempt from
140 # fair sharing in both the default and exempt subclasses..
143 # default_xid is for unclassifiable packets. Packets should not be
144 # classified here very often. They can be if a slice's HTB classes are
145 # deleted before its processes are.
148 # See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
149 # warned that older versions of tc interpret "kbps", "mbps", "mbit",
150 # and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and
151 # "kibit" and that if an older version is installed, all rates will
152 # be off by a small fraction.
160 "gibit": 1024*1024*1024,
162 "tibit": 1024*1024*1024*1024,
163 "tbit": 1000000000000,
167 "mibps": 8*1024*1024,
169 "gibps": 8*1024*1024*1024,
171 "tibps": 8*1024*1024*1024*1024,
172 "tbps": 8000000000000
176 # Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second
180 m = re.match(r"([0-9.]+)(\D*)", s)
183 suffix = m.group(2).lower()
184 if suffixes.has_key(suffix):
185 return int(float(m.group(1)) * suffixes[suffix])
190 # Prints a tc rate string
191 def format_tc_rate(rate):
193 return "%.0fmbit" % (rate / 1000000.)
195 return "%.0fkbit" % (rate / 1000.)
197 return "%.0fbit" % rate
200 # Parse /etc/planetlab/bwcap. XXX Should get this from the API
205 fp = open(bwcap_file, "r")
206 line = fp.readline().strip()
208 bwcap = get_tc_rate(line)
216 # Get slice xid (500) from slice name ("500" or "princeton_mlh") or
217 # slice name ("princeton_mlh") from slice xid (500).
218 def get_slice(xid_or_name):
219 labels = ['account', 'password', 'uid', 'gid', 'gecos', 'directory', 'shell']
221 for line in file("/etc/passwd"):
223 if line.strip() == '' or line[0] in '#':
225 # princeton_mlh:x:...
226 fields = line.strip().split(':')
227 if len(fields) < len(labels):
229 # {'account': 'princeton_mlh', 'password': 'x', ...}
230 pw = dict(zip(labels, fields))
231 if xid_or_name == root_xid:
233 if xid_or_name == default_xid:
235 elif xid_or_name == int(pw['uid']):
236 # Convert xid into name
238 elif pw['uid'] == xid_or_name or pw['account'] == xid_or_name:
239 # Convert name into xid
240 return int(pw['uid'])
245 # Shortcut for running a tc command
249 sys.stderr.write("Executing: " + TC + " " + cmd + "\n")
250 fileobj = os.popen(TC + " " + cmd, "r")
251 output = fileobj.readlines()
252 if fileobj.close() is None:
259 # (Re)initialize the bandwidth limits on this node
260 def init(dev = dev, bwcap = None):
262 # For backward compatibility, if bwcap is not specified,
263 # attempt to get it from /etc/planetlab/bwcap.
266 # Allow bwcap to be specified as a tc rate string
267 bwcap = get_tc_rate(bwcap)
269 # Save current state (if any)
270 caps = get(dev = dev)
272 # Delete root qdisc 1: if it exists. This will also automatically
273 # delete any child classes.
274 for line in tc("qdisc show dev %s" % dev):
275 # Search for the root qdisc 1:
276 m = re.match(r"qdisc htb 1:", line)
278 tc("qdisc del dev %s root handle 1:" % dev)
281 # Initialize HTB. The "default" clause specifies that if a packet
282 # fails classification, it should go into the class with handle
284 tc("qdisc add dev %s root handle 1: htb default %x" % \
285 (dev, default_minor | default_xid))
287 # Set up a parent class from which all subclasses borrow.
288 tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
291 # Set up a subclass that represents the node bandwidth cap. We
292 # allow each slice to borrow up to this rate, so it is also
293 # usually the "ceil" rate for each slice.
294 tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \
297 # Set up a subclass that represents "exemption" from the node
298 # bandwidth cap. Once the node bandwidth cap is reached, bandwidth
299 # to exempt destinations can still be fairly shared up to bwmax.
300 tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \
303 # Set up the root class (and tell VNET what it is). Packets sent
304 # by root end up here and are capped at the node bandwidth
306 on(root_xid, dev, prio = 0)
307 file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid))
309 # Set up the default class. Packets that fail classification end
313 # Reapply bandwidth caps. If the node bandwidth cap is now lower
314 # than it was before, "ceil" for each class will be lowered. If
315 # the node bandwidth cap is now higher than it was before, "ceil"
316 # for each class should be reapplied.
317 for (xid, share, minrate, maxrate) in caps:
318 if xid != root_xid and xid != default_xid:
319 on(xid, dev, share = share, minrate = minrate, maxrate = maxrate)
322 # Get the bandwidth limits for a particular slice xid as a tuple (xid,
323 # share, minrate, maxrate), or all classes as a list of tuples.
324 def get(xid = None, dev = dev):
330 # class htb 1:1002 parent 1:10 leaf 81b3: prio 1 rate 8bit ceil 5000Kbit burst 1600b cburst 4Kb
331 for line in tc("-d class show dev %s" % dev):
332 # Search for child classes of 1:10
333 m = re.match(r"class htb 1:([0-9a-f]+) parent 1:10", line)
337 # If we are looking for a particular class
338 classid = int(m.group(1), 16) & default_xid
339 if xid is not None and xid != classid:
344 m = re.search(r"quantum (\d+)", line)
346 share = int(m.group(1)) / quantum
350 m = re.search(r"rate (\w+)", line)
352 minrate = get_tc_rate(m.group(1))
356 m = re.search(r"ceil (\w+)", line)
358 maxrate = get_tc_rate(m.group(1))
361 # Return a list of parameters
362 ret.append((classid, share, minrate, maxrate))
364 # Return the parameters for this class
365 ret = (classid, share, minrate, maxrate)
371 # Apply specified bandwidth limit to the specified slice xid
372 def on(xid, dev = dev, share = None, minrate = None, maxrate = None, prio = 1):
373 # Get defaults from current state if available
383 # Figure out what the current node bandwidth cap is
385 for line in tc("-d class show dev %s" % dev):
387 m = re.match(r"class htb 1:10.*ceil (\w+)", line)
389 bwcap = get_tc_rate(m.group(1))
398 minrate = get_tc_rate(minrate)
402 maxrate = get_tc_rate(maxrate)
408 if minrate > maxrate:
411 # Set up subclasses for the slice
412 tc("class replace dev %s parent 1:10 classid 1:%x htb rate %dbit ceil %dbit quantum %d prio %d" % \
413 (dev, default_minor | xid, minrate, maxrate, share * quantum, prio))
415 tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d prio %d" % \
416 (dev, exempt_minor | xid, minrate, bwmax, share * quantum, prio))
418 # Attach a FIFO to each subclass, which helps to throttle back
419 # processes that are sending faster than the token buckets can
421 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
422 (dev, default_minor | xid, default_minor | xid))
424 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
425 (dev, exempt_minor | xid, exempt_minor | xid))
428 # Remove class associated with specified slice xid. If further packets
429 # are seen from this slice, they will be classified into the default
431 def off(xid, dev = dev):
432 tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
433 tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
437 bwcap_description = format_tc_rate(bwmax)
442 %s [OPTION]... [COMMAND] [ARGUMENT]...
445 -d device Network interface (default: %s)
446 -r rate Node bandwidth cap (default: %s)
447 -q quantum Share multiplier (default: %d bytes)
452 (Re)initialize bandwidth caps.
453 on slice [share] [minrate] [maxrate]
454 Set bandwidth cap for the specified slice
456 Remove bandwidth caps for the specified slice
458 Get all bandwidth caps
460 Get bandwidth caps for the specified slice
462 Get maxrate for the specified slice
464 Set maxrate for the specified slice
465 """ % (sys.argv[0], dev, bwcap_description, quantum)
470 global dev, quantum, verbose
475 (opts, argv) = getopt.getopt(sys.argv[1:], "f:d:r:g:q:vh")
476 for (opt, optval) in opts:
480 bwcap = get_tc_rate(optval)
482 quantum = int(optval)
489 if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1):
493 elif argv[0] == "get" or argv[0] == "show":
496 # Show a particular slice
497 xid = get_slice(argv[1])
499 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
501 caps = [get(xid, dev)]
504 caps = get(None, dev)
506 for (xid, share, minrate, maxrate) in caps:
507 slice = get_slice(xid)
509 # Orphaned (not associated with a slice) class
511 print "%s: share %d minrate %s maxrate %s" % \
512 (slice, share, format_tc_rate(minrate), format_tc_rate(maxrate))
516 xid = get_slice(argv[1])
518 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
521 if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace":
525 # ... share, minrate, maxrate
526 casts = [int, get_tc_rate, get_tc_rate]
527 for i, arg in enumerate(argv[2:]):
530 args.append(casts[i](arg))
533 elif argv[0] == "off" or argv[0] == "del":
537 # Backward compatibility with old resman script
538 elif argv[0] == "getcap":
542 (xid, share, minrate, maxrate) = cap
543 print format_tc_rate(maxrate)
545 # Backward compatibility with old resman script
546 elif argv[0] == "setcap":
549 on(xid, dev, maxrate = get_tc_rate(argv[2]))
560 if __name__ == '__main__':