3 # Bandwidth limit module for PlanetLab nodes. The intent is to use the
4 # Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow
5 # slices to fairly share access to available node bandwidth. We
6 # currently define three classes of "available node bandwidth":
8 # 1. Available hardware bandwidth (bwmax): The maximum rate of the
11 # 2. Available capped bandwidth (bwcap): The maximum rate allowed to
12 # non-exempt destinations. By default, equal to bwmax, but may be
15 # 3. Available uncapped ("exempt") bandwidth: The difference between
16 # bwmax and what is currently being used of bwcap, or the maximum rate
17 # allowed to destinations exempt from caps (e.g., Internet2).
19 # All three classes of bandwidth are fairly shared according to the
20 # notion of "shares". For instance, if the node is capped at 5 Mbps,
21 # there are N slices, and each slice has 1 share, then each slice
22 # should get at least 5/N Mbps of bandwidth. How HTB is implemented
23 # makes this statement a little too simplistic. What it really means
24 # is that during any single time period, only a certain number of
25 # bytes can be sent onto the wire. Each slice is guaranteed that at
26 # least some small number of its bytes will be sent. Whatever is left
27 # over from the budget, is split in proportion to the number of shares
30 # Even if the node is not capped at a particular limit (bwcap ==
31 # bwmax), this module enforces fair share access to bwmax. Also, if
32 # the node is capped at a particular limit, rules may optionally be
33 # defined that classify certain packets into the "exempt" class. This
34 # class receives whatever bandwidth is leftover between bwcap and
35 # bwmax; slices fairly share this bandwidth as well.
37 # The root context is exempt from sharing and can send as much as it
42 # 1. http://lartc.org/howto for how to use tc
43 # 2. http://luxik.cdi.cz/~devik/qos/htb/ for info on HTB
45 # Andy Bavier <acb@cs.princeton.edu>
46 # Mark Huang <mlhuang@cs.princeton.edu>
47 # Copyright (C) 2006 The Trustees of Princeton University
49 # $Id: bwlimit.py,v 1.15 2007/02/07 04:21:11 mlhuang Exp $
52 import sys, os, re, getopt
57 # Where the tc binary lives
66 # bwmin should be small enough that it can be considered negligibly
67 # slow compared to the hardware. 8 bits/second appears to be the
68 # smallest value supported by tc.
71 # bwmax should be large enough that it can be considered at least as
72 # fast as the hardware.
73 bwmax = 1000*1000*1000
75 # quantum is the maximum number of bytes that can be borrowed by a
76 # share (or slice, if each slice gets 1 share) in one time period
77 # (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth
78 # above their guarantees, and each is attempting to borrow up to the
79 # node bandwidth cap, quantums control how the excess bandwidth is
80 # distributed. Slices with 2 shares will borrow twice the amount in
81 # one time period as slices with 1 share, so averaged over time, they
82 # will get twice as much of the excess bandwidth. The value should be
83 # as small as possible and at least 1 MTU. By default, it would be
84 # calculated as bwmin/10, but since we use such small a value for
85 # bwmin, it's better to just set it to a value safely above 1 Ethernet
89 # cburst is the maximum number of bytes that can be burst onto the
90 # wire in one time period (with HZ=1000, 1 ms). If multiple slices
91 # have data queued for transmission, cbursts control how long each
92 # slice can have the wire for. If not specified, it is set to the
93 # smallest possible value that would enable the slice's "ceil" rate
94 # (usually the node bandwidth cap), to be reached if a slice was able
95 # to borrow enough bandwidth to do so. For now, it's unclear how or if
96 # to relate this to the notion of shares, so just let tc set the
100 # There is another parameter that controls how bandwidth is allocated
101 # between slices on nodes that is outside the scope of HTB. We enforce
102 # a 16 GByte/day total limit on each slice, which works out to about
103 # 1.5mbit. If a slice exceeds this byte limit before the day finishes,
104 # it is capped at (i.e., its "ceil" rate is set to) the smaller of the
105 # node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
106 # rule and executes this script to override "ceil".
108 # We support multiple bandwidth limits, by reserving the top nibble of
109 # the minor classid to be the "subclassid". Theoretically, we could
110 # support up to 15 subclasses, but for now, we only define two: the
111 # "default" subclass 1:10 that is capped at the node bandwidth cap (in
112 # this example, 5mbit) and the "exempt" subclass 1:20 that is capped
113 # at bwmax (i.e., not capped). The 1:1 parent class exists only to
114 # make the borrowing model work. All bandwidth above minimum
115 # guarantees is fairly shared (in this example, slice 2 is guaranteed
116 # at least 1mbit in addition to fair access to the rest), subject to
117 # the restrictions of the class hierarchy: namely, that the total
118 # bandwidth to non-exempt destinations should not exceed the node
124 # ______________|_____________
126 # 1:10 (8bit, 5mbit) 1:20 (8bit, 1gbit)
128 # 1:100 (8bit, 5mbit) |
130 # 1:1000 (8bit, 5mbit), 1:2000 (8bit, 1gbit),
131 # 1:1001 (8bit, 5mbit), 1:2001 (8bit, 1gbit),
132 # 1:1002 (1mbit, 5mbit), 1:2002 (1mbit, 1gbit),
134 # 1:1FFF (8bit, 5mbit) 1:2FFF (8bit, 1gbit)
136 default_minor = 0x1000
137 exempt_minor = 0x2000
139 # root_xid is for the root context. The root context is exempt from
140 # fair sharing in both the default and exempt subclasses. The root
141 # context gets 5 shares by default.
145 # default_xid is for unclassifiable packets. Packets should not be
146 # classified here very often. They can be if a slice's HTB classes are
147 # deleted before its processes are. Each slice gets 1 share by
152 # See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
153 # warned that older versions of tc interpret "kbps", "mbps", "mbit",
154 # and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and
155 # "kibit" and that if an older version is installed, all rates will
156 # be off by a small fraction.
164 "gibit": 1024*1024*1024,
166 "tibit": 1024*1024*1024*1024,
167 "tbit": 1000000000000,
171 "mibps": 8*1024*1024,
173 "gibps": 8*1024*1024*1024,
175 "tibps": 8*1024*1024*1024*1024,
176 "tbps": 8000000000000
182 Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second
187 m = re.match(r"([0-9.]+)(\D*)", s)
190 suffix = m.group(2).lower()
191 if suffixes.has_key(suffix):
192 return int(float(m.group(1)) * suffixes[suffix])
196 def format_bytes(bytes, si = True):
198 Formats bytes into a string
203 # Officially, a kibibyte
206 if bytes >= (kilo * kilo * kilo):
207 return "%.1f GB" % (bytes / (kilo * kilo * kilo))
208 elif bytes >= 1000000:
209 return "%.1f MB" % (bytes / (kilo * kilo))
211 return "%.1f KB" % (bytes / kilo)
213 return "%.0f bytes" % bytes
215 def format_tc_rate(rate):
217 Formats a bits/second rate into a tc rate string
220 if rate >= 1000000000 and (rate % 1000000000) == 0:
221 return "%.0fgbit" % (rate / 1000000000.)
222 elif rate >= 1000000 and (rate % 1000000) == 0:
223 return "%.0fmbit" % (rate / 1000000.)
225 return "%.0fkbit" % (rate / 1000.)
227 return "%.0fbit" % rate
230 # Parse /etc/planetlab/bwcap (or equivalent)
231 def read_bwcap(bwcap_file):
234 fp = open(bwcap_file, "r")
235 line = fp.readline().strip()
237 bwcap = get_tc_rate(line)
245 def get_bwcap(dev = dev):
247 Get the current (live) value of the node bandwidth cap
250 state = tc("-d class show dev %s" % dev)
251 base_re = re.compile(r"class htb 1:10 parent 1:1 .*ceil ([^ ]+) .*")
252 base_classes = filter(None, map(base_re.match, state))
255 if len(base_classes) > 1:
256 raise Exception, "unable to get current bwcap"
257 return get_tc_rate(base_classes[0].group(1))
262 Get slice name ("princeton_mlh") from slice xid (500)
267 if xid == default_xid:
270 return pwd.getpwuid(xid).pw_name
278 Get slice xid ("princeton_mlh") from slice name ("500" or "princeton_mlh")
283 if slice == "default":
290 return pwd.getpwnam(slice).pw_uid
296 def run(cmd, input = None):
298 Shortcut for running a shell command
303 sys.stderr.write("Executing: " + cmd + "\n")
305 fileobj = os.popen(cmd, "r")
306 output = fileobj.readlines()
308 fileobj = os.popen(cmd, "w")
311 if fileobj.close() is None:
320 Shortcut for running a tc command
323 return run(TC + " " + cmd)
328 Turn off all queing. Stops all slice HTBS and reverts to pfifo_fast (the default).
332 tc("qdisc del dev %s root" % dev)
336 def init(dev = dev, bwcap = bwmax):
338 (Re)initialize the bandwidth limits on this node
341 # Load the module used to manage exempt classes
342 run("/sbin/modprobe ip_set_iphash")
344 # Save current settings
345 paramslist = get(None, dev)
347 # Delete root qdisc 1: if it exists. This will also automatically
348 # delete any child classes.
349 for line in tc("qdisc show dev %s" % dev):
350 # Search for the root qdisc 1:
351 m = re.match(r"qdisc htb 1:", line)
353 tc("qdisc del dev %s root handle 1:" % dev)
356 # Initialize HTB. The "default" clause specifies that if a packet
357 # fails classification, it should go into the class with handle
359 tc("qdisc add dev %s root handle 1: htb default %x" % \
360 (dev, default_minor | default_xid))
362 # Set up a parent class from which all subclasses borrow.
363 tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
366 # Set up a subclass that represents the node bandwidth cap. We
367 # allow each slice to borrow up to this rate, so it is also
368 # usually the "ceil" rate for each slice.
369 tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \
372 # Set up a subclass for DRL(Distributed Rate Limiting).
373 # DRL will directly modify that subclass implementing the site limits.
374 tc("class add dev %s parent 1:10 classid 1:100 htb rate %dbit ceil %dbit" % \
378 # Set up a subclass that represents "exemption" from the node
379 # bandwidth cap. Once the node bandwidth cap is reached, bandwidth
380 # to exempt destinations can still be fairly shared up to bwmax.
381 tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \
384 # Set up the root class (and tell VNET what it is). Packets sent
385 # by root end up here and are capped at the node bandwidth
387 #on(root_xid, dev, share = root_share)
389 # file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid))
393 # Set up the default class. Packets that fail classification end
395 on(default_xid, dev, share = default_share)
397 # Restore old settings
400 minexemptrate, maxexemptrate,
401 bytes, exemptbytes) in paramslist:
402 if xid not in (root_xid, default_xid):
403 on(xid, dev, share, minrate, maxrate, minexemptrate, maxexemptrate)
406 def get(xid = None, dev = dev):
408 Get the bandwidth limits and current byte totals for a
409 particular slice xid as a tuple (xid, share, minrate, maxrate,
410 minexemptrate, maxexemptrate, bytes, exemptbytes), or all classes
411 as a list of such tuples.
423 # class htb 1:1000 parent 1:10 leaf 1000: prio 0 quantum 8000 rate 8bit ceil 10000Kbit ...
424 # Sent 6851486 bytes 49244 pkt (dropped 0, overlimits 0 requeues 0)
426 # class htb 1:2000 parent 1:20 leaf 2000: prio 0 quantum 8000 rate 8bit ceil 1000Mbit ...
427 # Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
429 for line in tc("-s -d class show dev %s" % dev):
430 # Rate parameter line
431 params = re.match(r"class htb 1:([0-9a-f]+) parent 1:(10|20)", line)
433 stats = re.match(r".* Sent ([0-9]+) bytes", line)
435 ignore = re.match(r"class htb", line)
437 if params is not None:
439 if params.group(2) == "10":
446 bytes = 'exemptbytes'
449 id = int(params.group(1), 16) & 0x0FFF;
451 if rates.has_key(id):
458 m = re.search(r"quantum (\d+)", line)
460 rate['share'] = int(m.group(1)) / quantum
464 m = re.search(r"rate (\w+)", line)
466 rate[min] = get_tc_rate(m.group(1))
470 m = re.search(r"ceil (\w+)", line)
472 rate[max] = get_tc_rate(m.group(1))
474 # Which statistics to parse
475 rate['stats'] = bytes
479 elif stats is not None:
481 rate[rate['stats']] = int(stats.group(1))
483 elif ignore is not None:
486 # Keep parsing until we get everything
487 if rate is not None and \
488 rate.has_key('min') and rate.has_key('minexempt') and \
489 rate.has_key('max') and rate.has_key('maxexempt') and \
490 rate.has_key('bytes') and rate.has_key('exemptbytes'):
491 params = (rate['id'], rate['share'],
492 rate['min'], rate['max'],
493 rate['minexempt'], rate['maxexempt'],
494 rate['bytes'], rate['exemptbytes'])
496 # Return a list of parameters
499 elif xid == rate['id']:
500 # Return the parameters for this class
507 def on(xid, dev = dev, share = None, minrate = None, maxrate = None, minexemptrate = None, maxexemptrate = None):
509 Apply specified bandwidth limit to the specified slice xid
512 # Get defaults from current state if available
521 if minexemptrate is None:
522 minexemptrate = cap[4]
523 if maxexemptrate is None:
524 maxexemptrate = cap[5]
526 # Figure out what the current node bandwidth cap is
531 share = default_share
535 minrate = get_tc_rate(minrate)
539 maxrate = get_tc_rate(maxrate)
540 if minexemptrate is None:
541 minexemptrate = minrate
543 minexemptrate = get_tc_rate(minexemptrate)
544 if maxexemptrate is None:
545 maxexemptrate = bwmax
547 maxexemptrate = get_tc_rate(maxexemptrate)
556 if minrate > maxrate:
558 if maxexemptrate < bwmin:
559 maxexemptrate = bwmin
560 if maxexemptrate > bwmax:
561 maxexemptrate = bwmax
562 if minexemptrate < bwmin:
563 minexemptrate = bwmin
564 if minexemptrate > maxexemptrate:
565 minexemptrate = maxexemptrate
567 # Set up subclasses for the slice
568 tc("class replace dev %s parent 1:100 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
569 (dev, default_minor | xid, minrate, maxrate, share * quantum))
571 tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
572 (dev, exempt_minor | xid, minexemptrate, maxexemptrate, share * quantum))
574 # Attach a FIFO to each subclass, which helps to throttle back
575 # processes that are sending faster than the token buckets can
577 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
578 (dev, default_minor | xid, default_minor | xid))
580 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
581 (dev, exempt_minor | xid, exempt_minor | xid))
584 def set(xid, share = None, minrate = None, maxrate = None, minexemptrate = None, maxexemptrate = None, dev = dev ):
585 on(xid = xid, dev = dev, share = share,
586 minrate = minrate, maxrate = maxrate,
587 minexemptrate = minexemptrate, maxexemptrate = maxexemptrate)
590 # Remove class associated with specified slice xid. If further packets
591 # are seen from this slice, they will be classified into the default
593 def off(xid, dev = dev):
595 Remove class associated with specified slice xid. If further
596 packets are seen from this slice, they will be classified into the
597 default class 1:1FFF.
602 tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
603 tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
606 def exempt_init(group_name, node_ips):
608 Initialize the list of destinations exempt from the node bandwidth
612 # Check of set exists
613 set = run("/sbin/ipset -S " + group_name)
615 # Create a hashed IP set of all of these destinations
616 lines = ["-N %s iphash" % group_name]
617 add_cmd = "-A %s " % group_name
618 lines += [(add_cmd + ip) for ip in node_ips]
620 restore = "\n".join(lines) + "\n"
621 run("/sbin/ipset -R", restore)
623 # Check all hosts and add missing.
624 for nodeip in node_ips:
625 if not run("/sbin/ipset -T %s %s" % (group_name, nodeip)):
626 run("/sbin/ipset -A %s %s" % (group_name, nodeip))
630 bwcap_description = format_tc_rate(get_bwcap())
635 %s [OPTION]... [COMMAND] [ARGUMENT]...
638 -d device Network interface (default: %s)
639 -r rate Node bandwidth cap (default: %s)
640 -q quantum Share multiplier (default: %d bytes)
641 -n Print rates in numeric bits per second
642 -v Enable verbose debug messages
647 (Re)initialize all bandwidth parameters
648 on slice [share|-] [minrate|-] [maxrate|-] [minexemptrate|-] [maxexemptrate|-]
649 Set bandwidth parameter(s) for the specified slice
651 Remove all bandwidth parameters for the specified slice
653 Get all bandwidth parameters for all slices
655 Get bandwidth parameters for the specified slice
656 """ % (sys.argv[0], dev, bwcap_description, quantum)
661 global dev, quantum, verbose
667 (opts, argv) = getopt.getopt(sys.argv[1:], "d:nr:q:vh")
668 for (opt, optval) in opts:
674 bwcap = get_tc_rate(optval)
676 quantum = int(optval)
686 if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1):
688 init(dev, get_tc_rate(bwcap))
690 elif argv[0] == "get" or argv[0] == "show":
693 # Show a particular slice
694 xid = get_xid(argv[1])
696 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
698 params = get(xid, dev)
702 paramslist = [params]
705 paramslist = get(None, dev)
709 minexemptrate, maxexemptrate,
710 bytes, exemptbytes) in paramslist:
711 slice = get_slice(xid)
713 # Orphaned (not associated with a slice) class
716 print "%s %d %d %d %d %d %d %d" % \
719 minexemptrate, maxexemptrate,
722 print "%s %d %s %s %s %s %s %s" % \
724 format_tc_rate(minrate), format_tc_rate(maxrate),
725 format_tc_rate(minexemptrate), format_tc_rate(maxexemptrate),
726 format_bytes(bytes), format_bytes(exemptbytes))
730 xid = get_xid(argv[1])
732 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
735 if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace" or argv[0] == "set":
739 # ... share, minrate, maxrate, minexemptrate, maxexemptrate
740 casts = [int, get_tc_rate, get_tc_rate, get_tc_rate, get_tc_rate]
741 for i, arg in enumerate(argv[2:]):
747 args.append(casts[i](arg))
750 elif argv[0] == "off" or argv[0] == "del":
761 if __name__ == '__main__':