3 # Bandwidth limit module for PlanetLab nodes. The intent is to use the
4 # Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow
5 # slices to fairly share access to available node bandwidth. We
6 # currently define three classes of "available node bandwidth":
8 # 1. Available hardware bandwidth (bwmax): The maximum rate of the
11 # 2. Available capped bandwidth (bwcap): The maximum rate allowed to
12 # non-exempt destinations. By default, equal to bwmax, but may be
15 # 3. Available uncapped ("exempt") bandwidth: The difference between
16 # bwmax and what is currently being used of bwcap, or the maximum rate
17 # allowed to destinations exempt from caps (e.g., Internet2).
19 # All three classes of bandwidth are fairly shared according to the
20 # notion of "shares". For instance, if the node is capped at 5 Mbps,
21 # there are N slices, and each slice has 1 share, then each slice
22 # should get at least 5/N Mbps of bandwidth. How HTB is implemented
23 # makes this statement a little too simplistic. What it really means
24 # is that during any single time period, only a certain number of
25 # bytes can be sent onto the wire. Each slice is guaranteed that at
26 # least some small number of its bytes will be sent. Whatever is left
27 # over from the budget, is split in proportion to the number of shares
30 # Even if the node is not capped at a particular limit (bwcap ==
31 # bwmax), this module enforces fair share access to bwmax. Also, if
32 # the node is capped at a particular limit, rules may optionally be
33 # defined that classify certain packets into the "exempt" class. This
34 # class receives whatever bandwidth is leftover between bwcap and
35 # bwmax; slices fairly share this bandwidth as well.
37 # The root context is exempt from sharing and can send as much as it
42 # 1. http://lartc.org/howto for how to use tc
43 # 2. http://luxik.cdi.cz/~devik/qos/htb/ for info on HTB
45 # Andy Bavier <acb@cs.princeton.edu>
46 # Mark Huang <mlhuang@cs.princeton.edu>
47 # Copyright (C) 2006 The Trustees of Princeton University
49 # $Id: bwlimit.py,v 1.8 2006/03/01 22:02:52 mlhuang Exp $
52 import sys, os, re, getopt
57 # Where the tc binary lives
63 # For backward compatibility, if bwcap is not specified, attempt to
65 bwcap_file = "/etc/planetlab/bwcap"
70 # bwmin should be small enough that it can be considered negligibly
71 # slow compared to the hardware. 8 bits/second appears to be the
72 # smallest value supported by tc.
75 # bwmax should be large enough that it can be considered at least as
76 # fast as the hardware.
77 bwmax = 1000*1000*1000
79 # quantum is the maximum number of bytes that can be borrowed by a
80 # share (or slice, if each slice gets 1 share) in one time period
81 # (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth
82 # above their guarantees, and each is attempting to borrow up to the
83 # node bandwidth cap, quantums control how the excess bandwidth is
84 # distributed. Slices with 2 shares will borrow twice the amount in
85 # one time period as slices with 1 share, so averaged over time, they
86 # will get twice as much of the excess bandwidth. The value should be
87 # as small as possible and at least 1 MTU. By default, it would be
88 # calculated as bwmin/10, but since we use such small a value for
89 # bwmin, it's better to just set it to a value safely above 1 Ethernet
93 # cburst is the maximum number of bytes that can be burst onto the
94 # wire in one time period (with HZ=1000, 1 ms). If multiple slices
95 # have data queued for transmission, cbursts control how long each
96 # slice can have the wire for. If not specified, it is set to the
97 # smallest possible value that would enable the slice's "ceil" rate
98 # (usually the node bandwidth cap), to be reached if a slice was able
99 # to borrow enough bandwidth to do so. For now, it's unclear how or if
100 # to relate this to the notion of shares, so just let tc set the
104 # There is another parameter that controls how bandwidth is allocated
105 # between slices on nodes that is outside the scope of HTB. We enforce
106 # a 16 GByte/day total limit on each slice, which works out to about
107 # 1.5mbit. If a slice exceeds this byte limit before the day finishes,
108 # it is capped at (i.e., its "ceil" rate is set to) the smaller of the
109 # node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
110 # rule and executes this script to override "ceil".
112 # We support multiple bandwidth limits, by reserving the top nibble of
113 # the minor classid to be the "subclassid". Theoretically, we could
114 # support up to 15 subclasses, but for now, we only define two: the
115 # "default" subclass 1:10 that is capped at the node bandwidth cap (in
116 # this example, 5mbit) and the "exempt" subclass 1:20 that is capped
117 # at bwmax (i.e., not capped). The 1:1 parent class exists only to
118 # make the borrowing model work. All bandwidth above minimum
119 # guarantees is fairly shared (in this example, slice 2 is guaranteed
120 # at least 1mbit in addition to fair access to the rest), subject to
121 # the restrictions of the class hierarchy: namely, that the total
122 # bandwidth to non-exempt destinations should not exceed the node
128 # ______________|_____________
130 # 1:10 (8bit, 5mbit) 1:20 (8bit, 1gbit)
132 # 1:1000 (8bit, 5mbit), 1:2000 (8bit, 1gbit),
133 # 1:1001 (8bit, 5mbit), 1:2001 (8bit, 1gbit),
134 # 1:1002 (1mbit, 5mbit), 1:2002 (1mbit, 1gbit),
136 # 1:1FFF (8bit, 5mbit) 1:2FFF (8bit, 1gbit)
138 default_minor = 0x1000
139 exempt_minor = 0x2000
141 # root_xid is for the root context. The root context is exempt from
142 # fair sharing in both the default and exempt subclasses. The root
143 # context gets 5 shares by default.
147 # default_xid is for unclassifiable packets. Packets should not be
148 # classified here very often. They can be if a slice's HTB classes are
149 # deleted before its processes are. Each slice gets 1 share by
154 # See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
155 # warned that older versions of tc interpret "kbps", "mbps", "mbit",
156 # and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and
157 # "kibit" and that if an older version is installed, all rates will
158 # be off by a small fraction.
166 "gibit": 1024*1024*1024,
168 "tibit": 1024*1024*1024*1024,
169 "tbit": 1000000000000,
173 "mibps": 8*1024*1024,
175 "gibps": 8*1024*1024*1024,
177 "tibps": 8*1024*1024*1024*1024,
178 "tbps": 8000000000000
182 # Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second
186 m = re.match(r"([0-9.]+)(\D*)", s)
189 suffix = m.group(2).lower()
190 if suffixes.has_key(suffix):
191 return int(float(m.group(1)) * suffixes[suffix])
196 # Prints a tc rate string
197 def format_tc_rate(rate):
199 return "%.0fmbit" % (rate / 1000000.)
201 return "%.0fkbit" % (rate / 1000.)
203 return "%.0fbit" % rate
206 # Parse /etc/planetlab/bwcap. XXX Should get this from the API
211 fp = open(bwcap_file, "r")
212 line = fp.readline().strip()
214 bwcap = get_tc_rate(line)
222 # Get slice xid (500) from slice name ("500" or "princeton_mlh") or
223 # slice name ("princeton_mlh") from slice xid (500).
224 def get_slice(xid_or_name):
225 labels = ['account', 'password', 'uid', 'gid', 'gecos', 'directory', 'shell']
227 for line in file("/etc/passwd"):
229 if line.strip() == '' or line[0] in '#':
231 # princeton_mlh:x:...
232 fields = line.strip().split(':')
233 if len(fields) < len(labels):
235 # {'account': 'princeton_mlh', 'password': 'x', ...}
236 pw = dict(zip(labels, fields))
237 if xid_or_name == root_xid:
239 if xid_or_name == default_xid:
241 elif xid_or_name == int(pw['uid']):
242 # Convert xid into name
244 elif pw['uid'] == xid_or_name or pw['account'] == xid_or_name:
245 # Convert name into xid
246 return int(pw['uid'])
251 # Shortcut for running a command
252 def run(cmd, input = None):
255 sys.stderr.write("Executing: " + cmd + "\n")
257 fileobj = os.popen(cmd, "r")
258 output = fileobj.readlines()
260 fileobj = os.popen(cmd, "w")
263 if fileobj.close() is None:
270 # Shortcut for running a tc command
272 return run(TC + " " + cmd)
275 # (Re)initialize the bandwidth limits on this node
276 def init(dev = dev, bwcap = None):
278 # For backward compatibility, if bwcap is not specified,
279 # attempt to get it from /etc/planetlab/bwcap.
282 # Allow bwcap to be specified as a tc rate string
283 bwcap = get_tc_rate(bwcap)
285 # Delete root qdisc 1: if it exists. This will also automatically
286 # delete any child classes.
287 for line in tc("qdisc show dev %s" % dev):
288 # Search for the root qdisc 1:
289 m = re.match(r"qdisc htb 1:", line)
291 tc("qdisc del dev %s root handle 1:" % dev)
294 # Initialize HTB. The "default" clause specifies that if a packet
295 # fails classification, it should go into the class with handle
297 tc("qdisc add dev %s root handle 1: htb default %x" % \
298 (dev, default_minor | default_xid))
300 # Set up a parent class from which all subclasses borrow.
301 tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
304 # Set up a subclass that represents the node bandwidth cap. We
305 # allow each slice to borrow up to this rate, so it is also
306 # usually the "ceil" rate for each slice.
307 tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \
310 # Set up a subclass that represents "exemption" from the node
311 # bandwidth cap. Once the node bandwidth cap is reached, bandwidth
312 # to exempt destinations can still be fairly shared up to bwmax.
313 tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \
316 # Set up the root class (and tell VNET what it is). Packets sent
317 # by root end up here and are capped at the node bandwidth
319 on(root_xid, dev, share = root_share)
320 file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid))
322 # Set up the default class. Packets that fail classification end
324 on(default_xid, dev, share = default_share)
330 # Get the bandwidth limits for a particular slice xid as a tuple (xid,
331 # share, minrate, maxrate), or all classes as a list of tuples.
332 def get(xid = None, dev = dev):
338 # class htb 1:1002 parent 1:10 leaf 81b3: prio 1 rate 8bit ceil 5000Kbit burst 1600b cburst 4Kb
339 for line in tc("-d class show dev %s" % dev):
340 # Search for child classes of 1:10
341 m = re.match(r"class htb 1:([0-9a-f]+) parent 1:10", line)
345 # If we are looking for a particular class
346 classid = int(m.group(1), 16) & default_xid
347 if xid is not None and xid != classid:
352 m = re.search(r"quantum (\d+)", line)
354 share = int(m.group(1)) / quantum
358 m = re.search(r"rate (\w+)", line)
360 minrate = get_tc_rate(m.group(1))
364 m = re.search(r"ceil (\w+)", line)
366 maxrate = get_tc_rate(m.group(1))
369 # Return a list of parameters
370 ret.append((classid, share, minrate, maxrate))
372 # Return the parameters for this class
373 ret = (classid, share, minrate, maxrate)
379 # Apply specified bandwidth limit to the specified slice xid
380 def on(xid, dev = dev, share = None, minrate = None, maxrate = None):
381 # Get defaults from current state if available
391 # Figure out what the current node bandwidth cap is
393 for line in tc("-d class show dev %s" % dev):
395 m = re.match(r"class htb 1:10.*ceil (\w+)", line)
397 bwcap = get_tc_rate(m.group(1))
402 share = default_share
406 minrate = get_tc_rate(minrate)
410 maxrate = get_tc_rate(maxrate)
415 if minrate > maxrate:
418 # Set up subclasses for the slice
419 tc("class replace dev %s parent 1:10 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
420 (dev, default_minor | xid, minrate, maxrate, share * quantum))
422 tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
423 (dev, exempt_minor | xid, minrate, bwmax, share * quantum))
425 # Attach a FIFO to each subclass, which helps to throttle back
426 # processes that are sending faster than the token buckets can
428 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
429 (dev, default_minor | xid, default_minor | xid))
431 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
432 (dev, exempt_minor | xid, exempt_minor | xid))
435 # Remove class associated with specified slice xid. If further packets
436 # are seen from this slice, they will be classified into the default
438 def off(xid, dev = dev):
441 tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
442 tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
448 node_id = int(file('/etc/planetlab/node_id').readline().strip())
452 api = plcapi.PLCAPI()
454 # All nodes that have access to Internet2
456 for node_group in api.AnonAdmGetNodeGroups(api.auth):
457 if node_group['name'] == "Internet2":
458 node_ids += api.AnonAdmGetNodeGroupNodes(api.auth, node_group['nodegroup_id'])
461 node_ids = list(Set(node_ids))
463 # Continue only if we ourselves have access to Internet2
464 if node_id not in node_ids:
467 # Exempt the following destinations from the node bandwidth cap
468 node_ips = [node['ip'] for node in api.AnonAdmGetNodes(api.auth, node_ids, ['ip'])]
471 run("/sbin/iptables -t vnet -F POSTROUTING")
472 run("/sbin/ipset -X Internet2")
474 # Create a hashed IP set of all of these destinations
475 run("/sbin/modprobe ip_set_iphash")
476 lines = ["-N Internet2 iphash"]
477 lines += ["-A Internet2 " + ip for ip in node_ips]
479 restore = "\n".join(lines) + "\n"
480 run("/sbin/ipset -R", restore)
482 # Add rule to match on destination IP set
483 run("/sbin/iptables -t vnet -A POSTROUTING -m set --set Internet2 dst -j CLASSIFY --set-class 1:%x" %
488 bwcap_description = format_tc_rate(get_bwcap())
493 %s [OPTION]... [COMMAND] [ARGUMENT]...
496 -d device Network interface (default: %s)
497 -r rate Node bandwidth cap (default: %s)
498 -q quantum Share multiplier (default: %d bytes)
503 (Re)initialize bandwidth caps.
504 on slice [share] [minrate] [maxrate]
505 Set bandwidth cap for the specified slice
507 Remove bandwidth caps for the specified slice
509 Get all bandwidth caps
511 Get bandwidth caps for the specified slice
513 Get maxrate for the specified slice
515 Set maxrate for the specified slice
516 """ % (sys.argv[0], dev, bwcap_description, quantum)
521 global dev, quantum, verbose
526 (opts, argv) = getopt.getopt(sys.argv[1:], "f:d:r:g:q:vh")
527 for (opt, optval) in opts:
531 bwcap = get_tc_rate(optval)
533 quantum = int(optval)
540 if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1):
544 elif argv[0] == "get" or argv[0] == "show":
547 # Show a particular slice
548 xid = get_slice(argv[1])
550 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
552 caps = [get(xid, dev)]
555 caps = get(None, dev)
557 for (xid, share, minrate, maxrate) in caps:
558 slice = get_slice(xid)
560 # Orphaned (not associated with a slice) class
562 print "%s %d %s %s" % \
563 (slice, share, format_tc_rate(minrate), format_tc_rate(maxrate))
567 xid = get_slice(argv[1])
569 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
572 if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace":
576 # ... share, minrate, maxrate
577 casts = [int, get_tc_rate, get_tc_rate]
578 for i, arg in enumerate(argv[2:]):
581 args.append(casts[i](arg))
584 elif argv[0] == "off" or argv[0] == "del":
588 # Backward compatibility with old resman script
589 elif argv[0] == "getcap":
593 (xid, share, minrate, maxrate) = cap
594 print format_tc_rate(maxrate)
596 # Backward compatibility with old resman script
597 elif argv[0] == "setcap":
600 on(xid, dev, maxrate = get_tc_rate(argv[2]))
611 if __name__ == '__main__':