3 # Bandwidth limit module for PlanetLab nodes. The intent is to use the
4 # Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow
5 # slices to fairly share access to available node bandwidth. We
6 # currently define three classes of "available node bandwidth":
8 # 1. Available hardware bandwidth (bwmax): The maximum rate of the
11 # 2. Available capped bandwidth (bwcap): The maximum rate allowed to
12 # non-exempt destinations. By default, equal to bwmax, but may be
15 # 3. Available uncapped ("exempt") bandwidth: The difference between
16 # bwmax and what is currently being used of bwcap, or the maximum rate
17 # allowed to destinations exempt from caps (e.g., Internet2).
19 # All three classes of bandwidth are fairly shared according to the
20 # notion of "shares". For instance, if the node is capped at 5 Mbps,
21 # there are N slices, and each slice has 1 share, then each slice
22 # should get at least 5/N Mbps of bandwidth. How HTB is implemented
23 # makes this statement a little too simplistic. What it really means
24 # is that during any single time period, only a certain number of
25 # bytes can be sent onto the wire. Each slice is guaranteed that at
26 # least some small number of its bytes will be sent. Whatever is left
27 # over from the budget, is split in proportion to the number of shares
30 # Even if the node is not capped at a particular limit (bwcap ==
31 # bwmax), this module enforces fair share access to bwmax. Also, if
32 # the node is capped at a particular limit, rules may optionally be
33 # defined that classify certain packets into the "exempt" class. This
34 # class receives whatever bandwidth is leftover between bwcap and
35 # bwmax; slices fairly share this bandwidth as well.
37 # The root context is exempt from sharing and can send as much as it
42 # 1. http://lartc.org/howto for how to use tc
43 # 2. http://luxik.cdi.cz/~devik/qos/htb/ for info on HTB
45 # Andy Bavier <acb@cs.princeton.edu>
46 # Mark Huang <mlhuang@cs.princeton.edu>
47 # Copyright (C) 2006 The Trustees of Princeton University
49 # $Id: bwlimit.py,v 1.4 2006/02/22 23:46:51 mlhuang Exp $
52 import sys, os, re, getopt
55 # Where the tc binary lives.
61 # For backward compatibility, if bwcap is not specified, attempt to
63 bwcap_file = "/etc/planetlab/bwcap"
68 # guarantee is the minimum rate in bits per second that each slice is
69 # guaranteed. The value of this parameter is fairly meaningless, since
70 # it is unlikely that every slice will try to transmit full blast
71 # simultaneously. It just needs to be small enough so that the total
72 # of all outstanding guarantees is less than or equal to the node
73 # bandwidth cap (see below). A node with a 500kbit cap (the minimum
74 # recommended) can support up to 500kbit/1kbit = 500 slices.
77 # quantum is the maximum number of bytes that can be borrowed by a
78 # share (or slice, if each slice gets 1 share) in one time period
79 # (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth
80 # above their guarantees, and each is attempting to borrow up to the
81 # node bandwidth cap, quantums control how the excess bandwidth is
82 # distributed. Slices with 2 shares will borrow twice the amount in
83 # one time period as slices with 1 share, so averaged over time, they
84 # will get twice as much of the excess bandwidth. The value should be
85 # as small as possible and at least 1 MTU. By default, it would be
86 # calculated as guarantee/10, but since we use such small guarantees,
87 # it's better to just set it to a value safely above 1 Ethernet MTU.
90 # cburst is the maximum number of bytes that can be burst onto the
91 # wire in one time period (with HZ=1000, 1 ms). If multiple slices
92 # have data queued for transmission, cbursts control how long each
93 # slice can have the wire for. If not specified, it is set to the
94 # smallest possible value that would enable the slice's "ceil" rate
95 # (usually the node bandwidth cap), to be reached if a slice was able
96 # to borrow enough bandwidth to do so. For now, it's unclear how or if
97 # to relate this to the notion of shares, so just let tc set the
100 # bwmax should just be large enough that it can be considered
102 bwmax = 1000*1000*1000
104 # There is another parameter that controls how bandwidth is allocated
105 # between slices on nodes that is outside the scope of HTB. We enforce
106 # a 16 GByte/day total limit on each slice, which works out to about
107 # 1.5mbit. If a slice exceeds this byte limit before the day finishes,
108 # it is capped at (i.e., its "ceil" rate is set to) the smaller of the
109 # node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
110 # rule and executes this script to override "ceil".
112 # We can support multiple bandwidth limits, by reserving the top
113 # nibble of the minor classid to be the "subclassid". Theoretically,
114 # we could support up to 15 subclasses, but for now, we only define
115 # two: the "default" subclass 1:1 that is capped at the node bandwidth
116 # cap (in this example, 5mbit) and the "exempt" subclass 1:2 that is
117 # capped at the hardware speed (in this example, 1gbit). The "exempt"
118 # subclass is entitled to whatever bandwidth is leftover after the
119 # node bandwidth cap is reached, and is fairly shared amongst non-root
123 # 1:1 (5mbit, 5mbit) 1:2 (1gbit, 1gbit)
125 # 1:1000 (1, 5mbit, 5mbit) 1:2000 (1gbit, 1gbit)
126 # 1:1001 (1, 1kbit, 5mbit) 1:2001 (1kbit, 1gbit)
127 # 1:1002 (1, 1kbit, 5mbit) 1:2002 (1kbit, 1gbit)
129 # 1:1FFF (1, 1kbit, 5mbit) 1:2FFF (1kbit, 1gbit)
131 default_minor = 0x1000
132 exempt_minor = 0x2000
134 # root_xid is for the root context. The root context is exempt from
135 # fair sharing in both the default and exempt subclasses..
138 # default_xid is for unclassifiable packets. Packets should not be
139 # classified here very often. They can be if a slice's HTB classes are
140 # deleted before its processes are.
143 # See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
144 # warned that older versions of tc interpret "kbps", "mbps", "mbit",
145 # and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and
146 # "kibit" and that if an older version is installed, all rates will
147 # be off by a small fraction.
155 "gibit": 1024*1024*1024,
157 "tibit": 1024*1024*1024*1024,
158 "tbit": 1000000000000,
162 "mibps": 8*1024*1024,
164 "gibps": 8*1024*1024*1024,
166 "tibps": 8*1024*1024*1024*1024,
167 "tbps": 8000000000000
171 # Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second
175 m = re.match(r"([0-9.]+)(\D*)", s)
178 suffix = m.group(2).lower()
179 if suffixes.has_key(suffix):
180 return int(float(m.group(1)) * suffixes[suffix])
185 # Prints a tc rate string
186 def format_tc_rate(rate):
188 return "%.0fmbit" % (rate / 1000000.)
190 return "%.0fkbit" % (rate / 1000.)
192 return "%.0fbit" % rate
195 # Parse /etc/planetlab/bwcap. XXX Should get this from the API
200 fp = open(bwcap_file, "r")
201 line = fp.readline().strip()
203 bwcap = get_tc_rate(line)
211 # Get slice xid (500) from slice name ("500" or "princeton_mlh") or
212 # slice name ("princeton_mlh") from slice xid (500).
213 def get_slice(xid_or_name):
214 labels = ['account', 'password', 'uid', 'gid', 'gecos', 'directory', 'shell']
216 for line in file("/etc/passwd"):
218 if line.strip() == '' or line[0] in '#':
220 # princeton_mlh:x:...
221 fields = line.strip().split(':')
222 if len(fields) < len(labels):
224 # {'account': 'princeton_mlh', 'password': 'x', ...}
225 pw = dict(zip(labels, fields))
226 if xid_or_name == root_xid:
228 if xid_or_name == default_xid:
230 elif xid_or_name == int(pw['uid']):
231 # Convert xid into name
233 elif pw['uid'] == xid_or_name or pw['account'] == xid_or_name:
234 # Convert name into xid
235 return int(pw['uid'])
240 # Shortcut for running a tc command
244 sys.stderr.write("Executing: " + TC + " " + cmd + "\n")
245 fileobj = os.popen(TC + " " + cmd, "r")
246 output = fileobj.readlines()
247 if fileobj.close() is None:
254 # (Re)initialize the bandwidth limits on this node
255 def init(dev = dev, bwcap = None):
257 # For backward compatibility, if bwcap is not specified,
258 # attempt to get it from /etc/planetlab/bwcap.
261 # Allow bwcap to be specified as a tc rate string
262 bwcap = get_tc_rate(bwcap)
264 # Save current state (if any)
265 caps = get(dev = dev)
267 # Delete root qdisc 1: if it exists. This will also automatically
268 # delete any child classes.
269 for line in tc("qdisc show dev %s" % dev):
270 # Search for the root qdisc 1:
271 m = re.match(r"qdisc htb 1:", line)
273 tc("qdisc del dev %s root handle 1:" % dev)
276 # Initialize HTB. The "default" clause specifies that if a packet
277 # fails classification, it should go into the class with handle
279 tc("qdisc add dev %s root handle 1: htb default %x" % \
280 (dev, default_minor | default_xid))
282 # Set up a subclass that represents the node bandwidth cap. We
283 # allow each slice to borrow up to this rate, so it is also
284 # usually the "ceil" rate for each slice.
285 tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
288 # Set up a subclass that represents "exemption" from the node
289 # bandwidth cap. It gets whatever bandwidth is leftover after
290 # applying the node bandwidth cap to non-exempt packets.
291 tc("class add dev %s parent 1: classid 1:2 htb rate %dbit" % \
294 # Set up the root class (and tell VNET what it is). Packets sent
295 # by root end up here and are capped at the node bandwidth
297 on(root_xid, dev, minrate = bwmax, maxrate = bwmax)
298 file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid))
300 # Set up the default class. Packets that fail classification end
302 on(default_xid, dev, maxrate = bwcap)
304 # Reapply bandwidth caps. If the node bandwidth cap is now lower
305 # than it was before, "ceil" for each class will be lowered. If
306 # the node bandwidth cap is now higher than it was before, "ceil"
307 # for each class should be reapplied.
308 for (xid, share, minrate, maxrate) in caps:
309 if xid != 0 and xid != default_xid:
310 on(xid, dev, share = share, minrate = minrate, maxrate = maxrate)
313 # Get the bandwidth limits for a particular slice xid as a tuple (xid,
314 # share, minrate, maxrate), or all classes as a list of tuples.
315 def get(xid = None, dev = dev):
321 # class htb 1:1002 parent 1:1 leaf 1002: prio 0 rate 10Mbit ceil 10Mbit burst 14704b cburst 14704b
322 for line in tc("-d class show dev %s" % dev):
323 # Search for child classes of 1:1
324 m = re.match(r"class htb 1:([0-9a-f]+) parent 1:1", line)
328 # If we are looking for a particular class
329 classid = int(m.group(1), 16) & default_xid
330 if xid is not None and xid != classid:
335 m = re.search(r"quantum (\d+)", line)
337 share = int(m.group(1)) / quantum
341 m = re.search(r"rate (\w+)", line)
343 minrate = get_tc_rate(m.group(1))
347 m = re.search(r"ceil (\w+)", line)
349 maxrate = get_tc_rate(m.group(1))
352 # Return a list of parameters
353 ret.append((classid, share, minrate, maxrate))
355 # Return the parameters for this class
356 ret = (classid, share, minrate, maxrate)
362 # Apply specified bandwidth limit to the specified slice xid
363 def on(xid, dev = dev, share = None, minrate = None, maxrate = None):
364 # Get defaults from current state if available
374 # Figure out what the current node bandwidth cap is
376 for line in tc("-d class show dev %s" % dev):
378 m = re.match(r"class htb 1:1 root .*ceil (\w+)", line)
380 bwcap = get_tc_rate(m.group(1))
389 minrate = get_tc_rate(minrate)
393 maxrate = get_tc_rate(maxrate)
395 if minrate > maxrate:
398 # Set up subclasses for the slice.
399 tc("class replace dev %s parent 1:1 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
400 (dev, default_minor | xid, min(minrate, bwcap), min(maxrate, bwcap), share * quantum))
402 tc("class replace dev %s parent 1:2 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
403 (dev, exempt_minor | xid, min(minrate, bwmax), bwmax, share * quantum))
405 # Attach a FIFO to each subclass, which helps to throttle back
406 # processes that are sending faster than the token buckets can
408 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
409 (dev, default_minor | xid, default_minor | xid))
411 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
412 (dev, exempt_minor | xid, exempt_minor | xid))
415 # Remove class associated with specified slice xid. If further packets
416 # are seen from this slice, they will be classified into the default
418 def off(xid, dev = dev):
419 tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
420 tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
424 bwcap_description = format_tc_rate(bwmax)
429 %s [OPTION]... [COMMAND] [ARGUMENT]...
432 -d device Network interface (default: %s)
433 -r rate Node bandwidth cap (default: %s)
434 -g guarantee Default minimum slice rate (default: %s bits/second)
435 -q quantum Share multiplier (default: %d bytes)
440 (Re)initialize bandwidth caps.
441 on slice [share] [minrate] [maxrate]
442 Set bandwidth cap for the specified slice
444 Remove bandwidth caps for the specified slice
446 Get all bandwidth caps
448 Get bandwidth caps for the specified slice
450 Get maxrate for the specified slice
452 Set maxrate for the specified slice
453 """ % (sys.argv[0], dev, bwcap_description, guarantee, quantum)
458 global dev, guarantee, quantum, verbose
463 (opts, argv) = getopt.getopt(sys.argv[1:], "f:d:r:g:q:vh")
464 for (opt, optval) in opts:
468 bwcap = get_tc_rate(optval)
470 guarantee = get_tc_rate(optval)
472 quantum = int(optval)
479 if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1):
483 elif argv[0] == "get" or argv[0] == "show":
486 # Show a particular slice
487 xid = get_slice(argv[1])
489 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
491 caps = [get(xid, dev)]
494 caps = get(None, dev)
496 for (xid, share, minrate, maxrate) in caps:
497 slice = get_slice(xid)
499 # Orphaned (not associated with a slice) class
501 print "%s: share %d minrate %s maxrate %s" % \
502 (slice, share, format_tc_rate(minrate), format_tc_rate(maxrate))
506 xid = get_slice(argv[1])
508 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
511 if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace":
515 # ... share, minrate, maxrate
516 casts = [int, get_tc_rate, get_tc_rate]
517 for i, arg in enumerate(argv[2:]):
520 args.append(casts[i](arg))
523 elif argv[0] == "off" or argv[0] == "del":
527 # Backward compatibility with old resman script
528 elif argv[0] == "getcap":
532 (xid, share, minrate, maxrate) = cap
533 print format_tc_rate(maxrate)
535 # Backward compatibility with old resman script
536 elif argv[0] == "setcap":
539 on(xid, dev, maxrate = get_tc_rate(argv[2]))
550 if __name__ == '__main__':