3 # Bandwidth limit module for PlanetLab nodes. The intent is to use the
4 # Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow
5 # slices to fairly share access to available node bandwidth. We
6 # currently define three classes of "available node bandwidth":
8 # 1. Available hardware bandwidth (bwmax): The maximum rate of the
11 # 2. Available capped bandwidth (bwcap): The maximum rate allowed to
12 # non-exempt destinations. By default, equal to bwmax, but may be
15 # 3. Available uncapped ("exempt") bandwidth: The difference between
16 # bwmax and what is currently being used of bwcap, or the maximum rate
17 # allowed to destinations exempt from caps (e.g., Internet2).
19 # All three classes of bandwidth are fairly shared according to the
20 # notion of "shares". For instance, if the node is capped at 5 Mbps,
21 # there are N slices, and each slice has 1 share, then each slice
22 # should get at least 5/N Mbps of bandwidth. How HTB is implemented
23 # makes this statement a little too simplistic. What it really means
24 # is that during any single time period, only a certain number of
25 # bytes can be sent onto the wire. Each slice is guaranteed that at
26 # least some small number of its bytes will be sent. Whatever is left
27 # over from the budget, is split in proportion to the number of shares
30 # Even if the node is not capped at a particular limit (bwcap ==
31 # bwmax), this module enforces fair share access to bwmax. Also, if
32 # the node is capped at a particular limit, rules may optionally be
33 # defined that classify certain packets into the "exempt" class. This
34 # class receives whatever bandwidth is leftover between bwcap and
35 # bwmax; slices fairly share this bandwidth as well.
37 # The root context is exempt from sharing and can send as much as it
42 # 1. http://lartc.org/howto for how to use tc
43 # 2. http://luxik.cdi.cz/~devik/qos/htb/ for info on HTB
45 # Andy Bavier <acb@cs.princeton.edu>
46 # Mark Huang <mlhuang@cs.princeton.edu>
47 # Copyright (C) 2006 The Trustees of Princeton University
49 # $Id: bwlimit.py,v 1.10 2006/03/14 22:57:50 smuir Exp $
52 import sys, os, re, getopt
57 # Where the tc binary lives
66 # bwmin should be small enough that it can be considered negligibly
67 # slow compared to the hardware. 8 bits/second appears to be the
68 # smallest value supported by tc.
71 # bwmax should be large enough that it can be considered at least as
72 # fast as the hardware.
73 bwmax = 1000*1000*1000
75 # quantum is the maximum number of bytes that can be borrowed by a
76 # share (or slice, if each slice gets 1 share) in one time period
77 # (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth
78 # above their guarantees, and each is attempting to borrow up to the
79 # node bandwidth cap, quantums control how the excess bandwidth is
80 # distributed. Slices with 2 shares will borrow twice the amount in
81 # one time period as slices with 1 share, so averaged over time, they
82 # will get twice as much of the excess bandwidth. The value should be
83 # as small as possible and at least 1 MTU. By default, it would be
84 # calculated as bwmin/10, but since we use such small a value for
85 # bwmin, it's better to just set it to a value safely above 1 Ethernet
89 # cburst is the maximum number of bytes that can be burst onto the
90 # wire in one time period (with HZ=1000, 1 ms). If multiple slices
91 # have data queued for transmission, cbursts control how long each
92 # slice can have the wire for. If not specified, it is set to the
93 # smallest possible value that would enable the slice's "ceil" rate
94 # (usually the node bandwidth cap), to be reached if a slice was able
95 # to borrow enough bandwidth to do so. For now, it's unclear how or if
96 # to relate this to the notion of shares, so just let tc set the
100 # There is another parameter that controls how bandwidth is allocated
101 # between slices on nodes that is outside the scope of HTB. We enforce
102 # a 16 GByte/day total limit on each slice, which works out to about
103 # 1.5mbit. If a slice exceeds this byte limit before the day finishes,
104 # it is capped at (i.e., its "ceil" rate is set to) the smaller of the
105 # node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
106 # rule and executes this script to override "ceil".
108 # We support multiple bandwidth limits, by reserving the top nibble of
109 # the minor classid to be the "subclassid". Theoretically, we could
110 # support up to 15 subclasses, but for now, we only define two: the
111 # "default" subclass 1:10 that is capped at the node bandwidth cap (in
112 # this example, 5mbit) and the "exempt" subclass 1:20 that is capped
113 # at bwmax (i.e., not capped). The 1:1 parent class exists only to
114 # make the borrowing model work. All bandwidth above minimum
115 # guarantees is fairly shared (in this example, slice 2 is guaranteed
116 # at least 1mbit in addition to fair access to the rest), subject to
117 # the restrictions of the class hierarchy: namely, that the total
118 # bandwidth to non-exempt destinations should not exceed the node
124 # ______________|_____________
126 # 1:10 (8bit, 5mbit) 1:20 (8bit, 1gbit)
128 # 1:1000 (8bit, 5mbit), 1:2000 (8bit, 1gbit),
129 # 1:1001 (8bit, 5mbit), 1:2001 (8bit, 1gbit),
130 # 1:1002 (1mbit, 5mbit), 1:2002 (1mbit, 1gbit),
132 # 1:1FFF (8bit, 5mbit) 1:2FFF (8bit, 1gbit)
134 default_minor = 0x1000
135 exempt_minor = 0x2000
137 # root_xid is for the root context. The root context is exempt from
138 # fair sharing in both the default and exempt subclasses. The root
139 # context gets 5 shares by default.
143 # default_xid is for unclassifiable packets. Packets should not be
144 # classified here very often. They can be if a slice's HTB classes are
145 # deleted before its processes are. Each slice gets 1 share by
150 # See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
151 # warned that older versions of tc interpret "kbps", "mbps", "mbit",
152 # and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and
153 # "kibit" and that if an older version is installed, all rates will
154 # be off by a small fraction.
162 "gibit": 1024*1024*1024,
164 "tibit": 1024*1024*1024*1024,
165 "tbit": 1000000000000,
169 "mibps": 8*1024*1024,
171 "gibps": 8*1024*1024*1024,
173 "tibps": 8*1024*1024*1024*1024,
174 "tbps": 8000000000000
178 # Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second
182 m = re.match(r"([0-9.]+)(\D*)", s)
185 suffix = m.group(2).lower()
186 if suffixes.has_key(suffix):
187 return int(float(m.group(1)) * suffixes[suffix])
192 # Prints a tc rate string
193 def format_tc_rate(rate):
195 return "%.0fmbit" % (rate / 1000000.)
197 return "%.0fkbit" % (rate / 1000.)
199 return "%.0fbit" % rate
202 # Parse /etc/planetlab/bwcap (or equivalent)
203 def read_bwcap(bwcap_file):
206 fp = open(bwcap_file, "r")
207 line = fp.readline().strip()
209 bwcap = get_tc_rate(line)
217 # Get current (live) value of bwcap
218 def get_bwcap(dev = dev):
220 state = tc("-d class show dev %s" % dev)
221 base_re = re.compile(r"class htb 1:10 parent 1:1 .*ceil ([^ ]+) .*")
222 base_classes = filter(None, map(base_re.match, state))
225 if len(base_classes) > 1:
226 raise Exception, "unable to get current bwcap"
227 return get_tc_rate(base_classes[0].group(1))
230 # Get slice xid (500) from slice name ("500" or "princeton_mlh") or
231 # slice name ("princeton_mlh") from slice xid (500).
232 def get_slice(xid_or_name):
234 if xid_or_name == root_xid:
236 if xid_or_name == default_xid:
238 if isinstance(xid_or_name, (int, long)):
240 return pwd.getpwuid(xid_or_name).pw_name
246 return int(xid_or_name)
249 return pwd.getpwnam(xid_or_name).pw_uid
256 # Shortcut for running a command
257 def run(cmd, input = None):
260 sys.stderr.write("Executing: " + cmd + "\n")
262 fileobj = os.popen(cmd, "r")
263 output = fileobj.readlines()
265 fileobj = os.popen(cmd, "w")
268 if fileobj.close() is None:
275 # Shortcut for running a tc command
277 return run(TC + " " + cmd)
280 # (Re)initialize the bandwidth limits on this node
281 def init(dev, bwcap):
283 # load the module used to manage exempt classes
284 run("/sbin/modprobe ip_set_iphash")
286 # Delete root qdisc 1: if it exists. This will also automatically
287 # delete any child classes.
288 for line in tc("qdisc show dev %s" % dev):
289 # Search for the root qdisc 1:
290 m = re.match(r"qdisc htb 1:", line)
292 tc("qdisc del dev %s root handle 1:" % dev)
295 # Initialize HTB. The "default" clause specifies that if a packet
296 # fails classification, it should go into the class with handle
298 tc("qdisc add dev %s root handle 1: htb default %x" % \
299 (dev, default_minor | default_xid))
301 # Set up a parent class from which all subclasses borrow.
302 tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
305 # Set up a subclass that represents the node bandwidth cap. We
306 # allow each slice to borrow up to this rate, so it is also
307 # usually the "ceil" rate for each slice.
308 tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \
311 # Set up a subclass that represents "exemption" from the node
312 # bandwidth cap. Once the node bandwidth cap is reached, bandwidth
313 # to exempt destinations can still be fairly shared up to bwmax.
314 tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \
317 # Set up the root class (and tell VNET what it is). Packets sent
318 # by root end up here and are capped at the node bandwidth
320 on(root_xid, dev, share = root_share)
321 file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid))
323 # Set up the default class. Packets that fail classification end
325 on(default_xid, dev, share = default_share)
328 # Get the bandwidth limits for a particular slice xid as a tuple (xid,
329 # share, minrate, maxrate), or all classes as a list of tuples.
330 def get(xid = None, dev = dev):
336 # class htb 1:1002 parent 1:10 leaf 81b3: prio 1 rate 8bit ceil 5000Kbit burst 1600b cburst 4Kb
337 for line in tc("-d class show dev %s" % dev):
338 # Search for child classes of 1:10
339 m = re.match(r"class htb 1:([0-9a-f]+) parent 1:10", line)
343 # If we are looking for a particular class
344 classid = int(m.group(1), 16) & default_xid
345 if xid is not None and xid != classid:
350 m = re.search(r"quantum (\d+)", line)
352 share = int(m.group(1)) / quantum
356 m = re.search(r"rate (\w+)", line)
358 minrate = get_tc_rate(m.group(1))
362 m = re.search(r"ceil (\w+)", line)
364 maxrate = get_tc_rate(m.group(1))
367 # Return a list of parameters
368 ret.append((classid, share, minrate, maxrate))
370 # Return the parameters for this class
371 ret = (classid, share, minrate, maxrate)
377 # Apply specified bandwidth limit to the specified slice xid
378 def on(xid, dev = dev, share = None, minrate = None, maxrate = None):
379 # Get defaults from current state if available
389 # Figure out what the current node bandwidth cap is
391 for line in tc("-d class show dev %s" % dev):
393 m = re.match(r"class htb 1:10.*ceil (\w+)", line)
395 bwcap = get_tc_rate(m.group(1))
400 share = default_share
404 minrate = get_tc_rate(minrate)
408 maxrate = get_tc_rate(maxrate)
413 if minrate > maxrate:
416 # Set up subclasses for the slice
417 tc("class replace dev %s parent 1:10 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
418 (dev, default_minor | xid, minrate, maxrate, share * quantum))
420 tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
421 (dev, exempt_minor | xid, minrate, bwmax, share * quantum))
423 # Attach a FIFO to each subclass, which helps to throttle back
424 # processes that are sending faster than the token buckets can
426 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
427 (dev, default_minor | xid, default_minor | xid))
429 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
430 (dev, exempt_minor | xid, exempt_minor | xid))
433 # Remove class associated with specified slice xid. If further packets
434 # are seen from this slice, they will be classified into the default
436 def off(xid, dev = dev):
439 tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
440 tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
443 def exempt_init(group_name, node_ips):
446 iptables = "/sbin/iptables -t vnet %s POSTROUTING"
448 run("/sbin/ipset -X " + group_name)
450 # Create a hashed IP set of all of these destinations
451 lines = ["-N %s iphash" % group_name]
452 add_cmd = "-A %s " % group_name
453 lines += [(add_cmd + ip) for ip in node_ips]
455 restore = "\n".join(lines) + "\n"
456 run("/sbin/ipset -R", restore)
458 # Add rule to match on destination IP set
459 run((iptables + " -m set --set %s dst -j CLASSIFY --set-class 1:%x") %
460 ("-A", group_name, exempt_minor))
464 bwcap_description = format_tc_rate(get_bwcap())
469 %s [OPTION]... [COMMAND] [ARGUMENT]...
472 -d device Network interface (default: %s)
473 -r rate Node bandwidth cap (default: %s)
474 -q quantum Share multiplier (default: %d bytes)
479 (Re)initialize bandwidth caps.
480 on slice [share] [minrate] [maxrate]
481 Set bandwidth cap for the specified slice
483 Remove bandwidth caps for the specified slice
485 Get all bandwidth caps
487 Get bandwidth caps for the specified slice
489 Get maxrate for the specified slice
491 Set maxrate for the specified slice
492 """ % (sys.argv[0], dev, bwcap_description, quantum)
497 global dev, quantum, verbose
502 (opts, argv) = getopt.getopt(sys.argv[1:], "f:d:r:g:q:vh")
503 for (opt, optval) in opts:
507 bwcap = get_tc_rate(optval)
509 quantum = int(optval)
516 if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1):
518 init(dev, get_tc_rate(bwcap))
520 elif argv[0] == "get" or argv[0] == "show":
523 # Show a particular slice
524 xid = get_slice(argv[1])
526 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
528 caps = [get(xid, dev)]
531 caps = get(None, dev)
533 for (xid, share, minrate, maxrate) in caps:
534 slice = get_slice(xid)
536 # Orphaned (not associated with a slice) class
538 print "%s %d %s %s" % \
539 (slice, share, format_tc_rate(minrate), format_tc_rate(maxrate))
543 xid = get_slice(argv[1])
545 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
548 if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace":
552 # ... share, minrate, maxrate
553 casts = [int, get_tc_rate, get_tc_rate]
554 for i, arg in enumerate(argv[2:]):
557 args.append(casts[i](arg))
560 elif argv[0] == "off" or argv[0] == "del":
564 # Backward compatibility with old resman script
565 elif argv[0] == "getcap":
569 (xid, share, minrate, maxrate) = cap
570 print format_tc_rate(maxrate)
572 # Backward compatibility with old resman script
573 elif argv[0] == "setcap":
576 on(xid, dev, maxrate = get_tc_rate(argv[2]))
587 if __name__ == '__main__':