3 # Bandwidth limit module for PlanetLab nodes. The intent is to use the
4 # Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow
5 # slices to fairly share access to available node bandwidth. We
6 # currently define three classes of "available node bandwidth":
8 # 1. Available hardware bandwidth (bwmax): The maximum rate of the
11 # 2. Available capped bandwidth (bwcap): The maximum rate allowed to
12 # non-exempt destinations. By default, equal to bwmax, but may be
15 # 3. Available uncapped ("exempt") bandwidth: The difference between
16 # bwmax and what is currently being used of bwcap, or the maximum rate
17 # allowed to destinations exempt from caps (e.g., Internet2).
19 # All three classes of bandwidth are fairly shared according to the
20 # notion of "shares". For instance, if the node is capped at 5 Mbps,
21 # there are N slices, and each slice has 1 share, then each slice
22 # should get at least 5/N Mbps of bandwidth. How HTB is implemented
23 # makes this statement a little too simplistic. What it really means
24 # is that during any single time period, only a certain number of
25 # bytes can be sent onto the wire. Each slice is guaranteed that at
26 # least some small number of its bytes will be sent. Whatever is left
27 # over from the budget is split in proportion to the number of shares
30 # Even if the node is not capped at a particular limit (bwcap ==
31 # bwmax), this module enforces fair share access to bwmax. Also, if
32 # the node is capped at a particular limit, rules may optionally be
33 # defined that classify certain packets into the "exempt" class. This
34 # class receives whatever bandwidth is leftover between bwcap and
35 # bwmax; slices fairly share this bandwidth as well.
37 # The root context is exempt from sharing and can send as much as it
42 # 1. http://lartc.org/howto for how to use tc
43 # 2. http://luxik.cdi.cz/~devik/qos/htb/ for info on HTB
45 # Andy Bavier <acb@cs.princeton.edu>
46 # Mark Huang <mlhuang@cs.princeton.edu>
47 # Copyright (C) 2006 The Trustees of Princeton University
49 # $Id: bwlimit.py,v 1.15 2007/02/07 04:21:11 mlhuang Exp $
52 import sys, os, re, getopt
56 # Where the tc binary lives
59 # Where the ebtables binary lives
60 EBTABLES = "/sbin/ebtables"
68 # bwmin should be small enough that it can be considered negligibly
69 # slow compared to the hardware. 8 bits/second appears to be the
70 # smallest value supported by tc.
73 # bwmax should be large enough that it can be considered at least as
74 # fast as the hardware.
75 bwmax = 1000*1000*1000
77 # quantum is the maximum number of bytes that can be borrowed by a
78 # share (or slice, if each slice gets 1 share) in one time period
79 # (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth
80 # above their guarantees, and each is attempting to borrow up to the
81 # node bandwidth cap, quantums control how the excess bandwidth is
82 # distributed. Slices with 2 shares will borrow twice the amount in
83 # one time period as slices with 1 share, so averaged over time, they
84 # will get twice as much of the excess bandwidth. The value should be
85 # as small as possible and at least 1 MTU. By default, it would be
86 # calculated as bwmin/10, but since we use such a small value for
87 # bwmin, it's better to just set it to a value safely above 1 Ethernet
91 # cburst is the maximum number of bytes that can be burst onto the
92 # wire in one time period (with HZ=1000, 1 ms). If multiple slices
93 # have data queued for transmission, cbursts control how long each
94 # slice can have the wire for. If not specified, it is set to the
95 # smallest possible value that would enable the slice's "ceil" rate
96 # (usually the node bandwidth cap), to be reached if a slice was able
97 # to borrow enough bandwidth to do so. For now, it's unclear how or if
98 # to relate this to the notion of shares, so just let tc set the
102 # There is another parameter that controls how bandwidth is allocated
103 # between slices on nodes that is outside the scope of HTB. We enforce
104 # a 16 GByte/day total limit on each slice, which works out to about
105 # 1.5mbit. If a slice exceeds this byte limit before the day finishes,
106 # it is capped at (i.e., its "ceil" rate is set to) the smaller of the
107 # node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
108 # rule and executes this script to override "ceil".
110 # We support multiple bandwidth limits, by reserving the top nibble of
111 # the minor classid to be the "subclassid". Theoretically, we could
112 # support up to 15 subclasses, but for now, we only define two: the
113 # "default" subclass 1:10 that is capped at the node bandwidth cap (in
114 # this example, 5mbit) and the "exempt" subclass 1:20 that is capped
115 # at bwmax (i.e., not capped). The 1:1 parent class exists only to
116 # make the borrowing model work. All bandwidth above minimum
117 # guarantees is fairly shared (in this example, slice 2 is guaranteed
118 # at least 1mbit in addition to fair access to the rest), subject to
119 # the restrictions of the class hierarchy: namely, that the total
120 # bandwidth to non-exempt destinations should not exceed the node
126 # ______________|_____________
128 # 1:10 (8bit, 5mbit) 1:20 (8bit, 1gbit)
130 # 1:100 (8bit, 5mbit) |
132 # 1:1000 (8bit, 5mbit), 1:2000 (8bit, 1gbit),
133 # 1:1001 (8bit, 5mbit), 1:2001 (8bit, 1gbit),
134 # 1:1002 (1mbit, 5mbit), 1:2002 (1mbit, 1gbit),
136 # 1:1FFF (8bit, 5mbit) 1:2FFF (8bit, 1gbit)
138 default_minor = 0x1000
139 exempt_minor = 0x2000
141 # root_xid is for the root context. The root context is exempt from
142 # fair sharing in both the default and exempt subclasses. The root
143 # context gets 5 shares by default.
147 # default_xid is for unclassifiable packets. Packets should not be
148 # classified here very often. They can be if a slice's HTB classes are
149 # deleted before its processes are. Each slice gets 1 share by
154 # See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
155 # warned that older versions of tc interpret "kbps", "mbps", "mbit",
156 # and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and
157 # "kibit" and that if an older version is installed, all rates will
158 # be off by a small fraction.
166 "gibit": 1024*1024*1024,
168 "tibit": 1024*1024*1024*1024,
169 "tbit": 1000000000000,
173 "mibps": 8*1024*1024,
175 "gibps": 8*1024*1024*1024,
177 "tibps": 8*1024*1024*1024*1024,
178 "tbps": 8000000000000
184 Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second
189 m = re.match(r"([0-9.]+)(\D*)", s)
192 suffix = m.group(2).lower()
193 if suffixes.has_key(suffix):
194 return int(float(m.group(1)) * suffixes[suffix])
198 def format_bytes(bytes, si = True):
200 Formats bytes into a string
205 # Officially, a kibibyte
208 if bytes >= (kilo * kilo * kilo):
209 return "%.1f GB" % (bytes / (kilo * kilo * kilo))
210 elif bytes >= 1000000:
211 return "%.1f MB" % (bytes / (kilo * kilo))
213 return "%.1f KB" % (bytes / kilo)
215 return "%.0f bytes" % bytes
217 def format_tc_rate(rate):
219 Formats a bits/second rate into a tc rate string
222 if rate >= 1000000000 and (rate % 1000000000) == 0:
223 return "%.0fgbit" % (rate / 1000000000.)
224 elif rate >= 1000000 and (rate % 1000000) == 0:
225 return "%.0fmbit" % (rate / 1000000.)
227 return "%.0fkbit" % (rate / 1000.)
229 return "%.0fbit" % rate
232 # Parse /etc/planetlab/bwcap (or equivalent)
233 def read_bwcap(bwcap_file):
236 fp = open(bwcap_file, "r")
237 line = fp.readline().strip()
239 bwcap = get_tc_rate(line)
247 def get_bwcap(dev = dev):
249 Get the current (live) value of the node bandwidth cap
252 state = tc("-d class show dev %s" % dev)
253 base_re = re.compile(r"class htb 1:10 parent 1:1 .*ceil ([^ ]+) .*")
254 base_classes = filter(None, map(base_re.match, state))
257 if len(base_classes) > 1:
258 raise Exception, "unable to get current bwcap"
259 return get_tc_rate(base_classes[0].group(1))
264 Get slice name ("princeton_mlh") from slice xid (500)
269 if xid == default_xid:
272 return pwd.getpwuid(xid).pw_name
280 Get slice xid ("500") from slice name ("princeton_mlh")
285 if slice == "default":
292 return pwd.getpwnam(slice).pw_uid
298 def run(cmd, input = None):
300 Shortcut for running a shell command
305 sys.stderr.write("Executing: " + cmd + "\n")
307 fileobj = os.popen(cmd, "r")
308 output = fileobj.readlines()
310 fileobj = os.popen(cmd, "w")
313 if fileobj.close() is None:
322 Shortcut for running a tc command
325 return run(TC + " " + cmd)
329 Shortcut for running an ebtables command
332 return run(EBTABLES + " " + cmd)
337 Turn off all queuing. Stops all slice HTBs and reverts to pfifo_fast (the default).
341 tc("qdisc del dev %s root" % dev)
345 def init(dev = dev, bwcap = bwmax):
347 (Re)initialize the bandwidth limits on this node
350 # Load the module used to manage exempt classes
351 #run("/sbin/modprobe ip_set_iphash")
352 # Test the new module included in kernel 3 series
353 run("/sbin/modprobe ip_set_hash_ip")
355 # Save current settings
356 paramslist = get(None, dev)
358 # Delete root qdisc 1: if it exists. This will also automatically
359 # delete any child classes.
360 for line in tc("qdisc show dev %s" % dev):
361 # Search for the root qdisc 1:
362 m = re.match(r"qdisc htb 1:", line)
364 tc("qdisc del dev %s root handle 1:" % dev)
367 # Initialize HTB. The "default" clause specifies that if a packet
368 # fails classification, it should go into the class with handle
370 tc("qdisc add dev %s root handle 1: htb default %x" % \
371 (dev, default_minor | default_xid))
373 # Set up a parent class from which all subclasses borrow.
374 tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
377 # Set up a subclass that represents the node bandwidth cap. We
378 # allow each slice to borrow up to this rate, so it is also
379 # usually the "ceil" rate for each slice.
380 tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \
383 # Set up a subclass for DRL(Distributed Rate Limiting).
384 # DRL will directly modify that subclass implementing the site limits.
385 tc("class add dev %s parent 1:10 classid 1:100 htb rate %dbit ceil %dbit" % \
389 # Set up a subclass that represents "exemption" from the node
390 # bandwidth cap. Once the node bandwidth cap is reached, bandwidth
391 # to exempt destinations can still be fairly shared up to bwmax.
392 tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \
395 # Set up the root class (and tell VNET what it is). Packets sent
396 # by root end up here and are capped at the node bandwidth
398 #on(root_xid, dev, share = root_share)
400 # file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid))
404 # Set up the default class. Packets that fail classification end
406 on(default_xid, dev, share = default_share)
408 # Restore old settings
411 minexemptrate, maxexemptrate,
412 bytes, exemptbytes) in paramslist:
413 if xid not in (root_xid, default_xid):
414 on(xid, dev, share, minrate, maxrate, minexemptrate, maxexemptrate)
417 def get(xid = None, dev = dev):
419 Get the bandwidth limits and current byte totals for a
420 particular slice xid as a tuple (xid, share, minrate, maxrate,
421 minexemptrate, maxexemptrate, bytes, exemptbytes), or all classes
422 as a list of such tuples.
434 # class htb 1:1000 parent 1:10 leaf 1000: prio 0 quantum 8000 rate 8bit ceil 10000Kbit ...
435 # Sent 6851486 bytes 49244 pkt (dropped 0, overlimits 0 requeues 0)
437 # class htb 1:2000 parent 1:20 leaf 2000: prio 0 quantum 8000 rate 8bit ceil 1000Mbit ...
438 # Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
440 for line in tc("-s -d class show dev %s" % dev):
441 # Rate parameter line
442 params = re.match(r"class htb 1:([0-9a-f]+) parent 1:(10|20)", line)
444 stats = re.match(r".* Sent ([0-9]+) bytes", line)
446 ignore = re.match(r"class htb", line)
448 if params is not None:
450 if params.group(2) == "10":
457 bytes = 'exemptbytes'
460 id = int(params.group(1), 16) & 0x0FFF;
462 if rates.has_key(id):
469 m = re.search(r"quantum (\d+)", line)
471 rate['share'] = int(m.group(1)) / quantum
475 m = re.search(r"rate (\w+)", line)
477 rate[min] = get_tc_rate(m.group(1))
481 m = re.search(r"ceil (\w+)", line)
483 rate[max] = get_tc_rate(m.group(1))
485 # Which statistics to parse
486 rate['stats'] = bytes
490 elif stats is not None:
492 rate[rate['stats']] = int(stats.group(1))
494 elif ignore is not None:
497 # Keep parsing until we get everything
498 if rate is not None and \
499 rate.has_key('min') and rate.has_key('minexempt') and \
500 rate.has_key('max') and rate.has_key('maxexempt') and \
501 rate.has_key('bytes') and rate.has_key('exemptbytes'):
502 params = (rate['id'], rate['share'],
503 rate['min'], rate['max'],
504 rate['minexempt'], rate['maxexempt'],
505 rate['bytes'], rate['exemptbytes'])
507 # Return a list of parameters
510 elif xid == rate['id']:
511 # Return the parameters for this class
518 def on(xid, dev = dev, share = None, minrate = None, maxrate = None, minexemptrate = None, maxexemptrate = None):
520 Apply specified bandwidth limit to the specified slice xid
523 # Get defaults from current state if available
532 if minexemptrate is None:
533 minexemptrate = cap[4]
534 if maxexemptrate is None:
535 maxexemptrate = cap[5]
537 # Figure out what the current node bandwidth cap is
538 bwcap = get_bwcap(dev)
542 share = default_share
546 minrate = get_tc_rate(minrate)
550 maxrate = get_tc_rate(maxrate)
551 if minexemptrate is None:
552 minexemptrate = minrate
554 minexemptrate = get_tc_rate(minexemptrate)
555 if maxexemptrate is None:
556 maxexemptrate = bwmax
558 maxexemptrate = get_tc_rate(maxexemptrate)
567 if minrate > maxrate:
569 if maxexemptrate < bwmin:
570 maxexemptrate = bwmin
571 if maxexemptrate > bwmax:
572 maxexemptrate = bwmax
573 if minexemptrate < bwmin:
574 minexemptrate = bwmin
575 if minexemptrate > maxexemptrate:
576 minexemptrate = maxexemptrate
578 # Set up subclasses for the slice
579 tc("class replace dev %s parent 1:100 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
580 (dev, default_minor | xid, minrate, maxrate, share * quantum))
582 tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
583 (dev, exempt_minor | xid, minexemptrate, maxexemptrate, share * quantum))
585 # Attach a FIFO to each subclass, which helps to throttle back
586 # processes that are sending faster than the token buckets can
588 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
589 (dev, default_minor | xid, default_minor | xid))
591 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
592 (dev, exempt_minor | xid, exempt_minor | xid))
594 # Set up a filter rule on the root class so each packet originated by a
595 # container interface is classified to its corresponding class.
596 # The handle number is a mark created by ebtables with the xid
597 tc("filter replace dev %s parent 1:1 protocol ip prio 1 handle %d fw flowid 1:%x" % \
598 (dev, default_minor | xid, default_minor | xid))
600 # Create the ebtables rule to mark the packets going out from the virtual
601 # interface to the actual device so the filter can match against the mark.
602 # We remove and re-add the rule because this method is called each time the
603 # bandwidth limit is changed
604 ebtables("-D INPUT -i veth%d -j mark --set-mark %d" % \
605 (xid, default_minor | xid))
606 ebtables("-A INPUT -i veth%d -j mark --set-mark %d" % \
607 (xid, default_minor | xid))
def set(xid, share = None, minrate = None, maxrate = None, minexemptrate = None, maxexemptrate = None, dev = dev ):
    """
    Set the bandwidth limits of the specified slice xid. Thin wrapper
    around on() that accepts the same limits but takes dev as its last
    parameter; every argument is forwarded unchanged and nothing is
    returned.
    """

    # Collect the optional limits once, then hand everything to on().
    limits = dict(share = share,
                  minrate = minrate, maxrate = maxrate,
                  minexemptrate = minexemptrate, maxexemptrate = maxexemptrate)
    on(xid, dev, **limits)
616 # Remove class associated with specified slice xid. If further packets
617 # are seen from this slice, they will be classified into the default
619 def off(xid, dev = dev):
621 Remove class associated with specified slice xid. If further
622 packets are seen from this slice, they will be classified into the
623 default class 1:1FFF.
628 tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
629 tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
632 def exempt_init(group_name, node_ips):
634 Initialize the list of destinations exempt from the node bandwidth
638 # Check if set exists
639 set = run("/sbin/ipset -S " + group_name)
641 # Create a hashed IP set of all of these destinations
642 lines = ["-N %s iphash" % group_name]
643 add_cmd = "-A %s " % group_name
644 lines += [(add_cmd + ip) for ip in node_ips]
646 restore = "\n".join(lines) + "\n"
647 run("/sbin/ipset -R", restore)
649 # Check all hosts and add missing.
650 for nodeip in node_ips:
651 if not run("/sbin/ipset -T %s %s" % (group_name, nodeip)):
652 run("/sbin/ipset -A %s %s" % (group_name, nodeip))
656 bwcap_description = format_tc_rate(get_bwcap())
661 %s [OPTION]... [COMMAND] [ARGUMENT]...
664 -d device Network interface (default: %s)
665 -r rate Node bandwidth cap (default: %s)
666 -q quantum Share multiplier (default: %d bytes)
667 -n Print rates in numeric bits per second
668 -v Enable verbose debug messages
673 (Re)initialize all bandwidth parameters
674 on slice [share|-] [minrate|-] [maxrate|-] [minexemptrate|-] [maxexemptrate|-]
675 Set bandwidth parameter(s) for the specified slice
677 Remove all bandwidth parameters for the specified slice
679 Get all bandwidth parameters for all slices
681 Get bandwidth parameters for the specified slice
682 """ % (sys.argv[0], dev, bwcap_description, quantum)
687 global dev, quantum, verbose
693 (opts, argv) = getopt.getopt(sys.argv[1:], "d:nr:q:vh")
694 for (opt, optval) in opts:
700 bwcap = get_tc_rate(optval)
702 quantum = int(optval)
709 bwcap = get_bwcap(dev)
715 if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1):
717 init(dev, get_tc_rate(bwcap))
719 elif argv[0] == "get" or argv[0] == "show":
722 # Show a particular slice
723 xid = get_xid(argv[1])
725 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
727 params = get(xid, dev)
731 paramslist = [params]
734 paramslist = get(None, dev)
738 minexemptrate, maxexemptrate,
739 bytes, exemptbytes) in paramslist:
740 slice = get_slice(xid)
742 # Orphaned (not associated with a slice) class
745 print "%s %d %d %d %d %d %d %d" % \
748 minexemptrate, maxexemptrate,
751 print "%s %d %s %s %s %s %s %s" % \
753 format_tc_rate(minrate), format_tc_rate(maxrate),
754 format_tc_rate(minexemptrate), format_tc_rate(maxexemptrate),
755 format_bytes(bytes), format_bytes(exemptbytes))
759 xid = get_xid(argv[1])
761 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
764 if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace" or argv[0] == "set":
768 # ... share, minrate, maxrate, minexemptrate, maxexemptrate
769 casts = [int, get_tc_rate, get_tc_rate, get_tc_rate, get_tc_rate]
770 for i, arg in enumerate(argv[2:]):
776 args.append(casts[i](arg))
779 elif argv[0] == "off" or argv[0] == "del":
790 if __name__ == '__main__':