4 # this file historically came with util-vserver-pl which is not available
6 # it is thus intended to become packaged separately at some point
8 # It was renamed into bwlimitlxc so that this branch can be tested on
9 # vserver nodes as well
13 # Bandwidth limit module for PlanetLab nodes. The intent is to use the
14 # Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow
15 # slices to fairly share access to available node bandwidth. We
16 # currently define three classes of "available node bandwidth":
18 # 1. Available hardware bandwidth (bwmax): The maximum rate of the
21 # 2. Available capped bandwidth (bwcap): The maximum rate allowed to
22 # non-exempt destinations. By default, equal to bwmax, but may be
25 # 3. Available uncapped ("exempt") bandwidth: The difference between
26 # bwmax and what is currently being used of bwcap, or the maximum rate
27 # allowed to destinations exempt from caps (e.g., Internet2).
29 # All three classes of bandwidth are fairly shared according to the
30 # notion of "shares". For instance, if the node is capped at 5 Mbps,
31 # there are N slices, and each slice has 1 share, then each slice
32 # should get at least 5/N Mbps of bandwidth. How HTB is implemented
33 # makes this statement a little too simplistic. What it really means
34 # is that during any single time period, only a certain number of
35 # bytes can be sent onto the wire. Each slice is guaranteed that at
36 # least some small number of its bytes will be sent. Whatever is left
37 # over from the budget, is split in proportion to the number of shares
40 # Even if the node is not capped at a particular limit (bwcap ==
41 # bwmax), this module enforces fair share access to bwmax. Also, if
42 # the node is capped at a particular limit, rules may optionally be
43 # defined that classify certain packets into the "exempt" class. This
44 # class receives whatever bandwidth is leftover between bwcap and
45 # bwmax; slices fairly share this bandwidth as well.
47 # The root context is exempt from sharing and can send as much as it
52 # 1. http://lartc.org/howto for how to use tc
53 # 2. http://luxik.cdi.cz/~devik/qos/htb/ for info on HTB
55 # Andy Bavier <acb@cs.princeton.edu>
56 # Mark Huang <mlhuang@cs.princeton.edu>
57 # Copyright (C) 2006 The Trustees of Princeton University
60 import sys, os, re, getopt
64 # Where the tc binary lives
67 # Where the ebtables binary lives
68 EBTABLES = "/sbin/ebtables"
76 # bwmin should be small enough that it can be considered negligibly
77 # slow compared to the hardware. 8 bits/second appears to be the
78 # smallest value supported by tc.
81 # bwmax should be large enough that it can be considered at least as
82 # fast as the hardware.
83 bwmax = 1000*1000*1000
85 # quantum is the maximum number of bytes that can be borrowed by a
86 # share (or slice, if each slice gets 1 share) in one time period
87 # (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth
88 # above their guarantees, and each is attempting to borrow up to the
89 # node bandwidth cap, quantums control how the excess bandwidth is
90 # distributed. Slices with 2 shares will borrow twice the amount in
91 # one time period as slices with 1 share, so averaged over time, they
92 # will get twice as much of the excess bandwidth. The value should be
93 # as small as possible and at least 1 MTU. By default, it would be
94 # calculated as bwmin/10, but since we use such a small value for
95 # bwmin, it's better to just set it to a value safely above 1 Ethernet
99 # cburst is the maximum number of bytes that can be burst onto the
100 # wire in one time period (with HZ=1000, 1 ms). If multiple slices
101 # have data queued for transmission, cbursts control how long each
102 # slice can have the wire for. If not specified, it is set to the
103 # smallest possible value that would enable the slice's "ceil" rate
104 # (usually the node bandwidth cap), to be reached if a slice was able
105 # to borrow enough bandwidth to do so. For now, it's unclear how or if
106 # to relate this to the notion of shares, so just let tc set the
110 # There is another parameter that controls how bandwidth is allocated
111 # between slices on nodes that is outside the scope of HTB. We enforce
112 # a 16 GByte/day total limit on each slice, which works out to about
113 # 1.5mbit. If a slice exceeds this byte limit before the day finishes,
114 # it is capped at (i.e., its "ceil" rate is set to) the smaller of the
115 # node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
116 # rule and executes this script to override "ceil".
118 # We support multiple bandwidth limits, by reserving the top nibble of
119 # the minor classid to be the "subclassid". Theoretically, we could
120 # support up to 15 subclasses, but for now, we only define two: the
121 # "default" subclass 1:10 that is capped at the node bandwidth cap (in
122 # this example, 5mbit) and the "exempt" subclass 1:20 that is capped
123 # at bwmax (i.e., not capped). The 1:1 parent class exists only to
124 # make the borrowing model work. All bandwidth above minimum
125 # guarantees is fairly shared (in this example, slice 2 is guaranteed
126 # at least 1mbit in addition to fair access to the rest), subject to
127 # the restrictions of the class hierarchy: namely, that the total
128 # bandwidth to non-exempt destinations should not exceed the node
134 # ______________|_____________
136 # 1:10 (8bit, 5mbit) 1:20 (8bit, 1gbit)
138 # 1:100 (8bit, 5mbit) |
140 # 1:1000 (8bit, 5mbit), 1:2000 (8bit, 1gbit),
141 # 1:1001 (8bit, 5mbit), 1:2001 (8bit, 1gbit),
142 # 1:1002 (1mbit, 5mbit), 1:2002 (1mbit, 1gbit),
144 # 1:1FFF (8bit, 5mbit) 1:2FFF (8bit, 1gbit)
146 default_minor = 0x1000
147 exempt_minor = 0x2000
149 # root_xid is for the root context. The root context is exempt from
150 # fair sharing in both the default and exempt subclasses. The root
151 # context gets 5 shares by default.
155 # default_xid is for unclassifiable packets. Packets should not be
156 # classified here very often. They can be if a slice's HTB classes are
157 # deleted before its processes are. Each slice gets 1 share by
162 # See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
163 # warned that older versions of tc interpret "kbps", "mbps", "mbit",
164 # and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and
165 # "kibit" and that if an older version is installed, all rates will
166 # be off by a small fraction.
174 "gibit": 1024*1024*1024,
176 "tibit": 1024*1024*1024*1024,
177 "tbit": 1000000000000,
181 "mibps": 8*1024*1024,
183 "gibps": 8*1024*1024*1024,
185 "tibps": 8*1024*1024*1024*1024,
186 "tbps": 8000000000000
192 Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second
197 m = re.match(r"([0-9.]+)(\D*)", s)
200 suffix = m.group(2).lower()
201 if suffixes.has_key(suffix):
202 return int(float(m.group(1)) * suffixes[suffix])
def format_bytes(bytes, si = True):
    """
    Formats a byte count into a human-readable string

    bytes -- number of bytes to format (int or float)
    si    -- if True (the default), units are powers of 1000;
             otherwise units are powers of 1024

    Returns a string with one decimal place and a unit suffix
    ("1.5 KB", "5.0 MB", "2.0 GB"), or a whole number of bytes
    ("999 bytes") when the count is below one kilo unit.
    """

    if si:
        kilo = 1000.
    else:
        # Officially, a kibibyte
        kilo = 1024.

    # Every threshold is derived from kilo so that the unit is picked
    # on the same scale as the divisor. With fixed decimal thresholds,
    # a count such as 1000000 with si = False would be shown as
    # "1.0 MB" although it is smaller than 1024 * 1024 bytes.
    mega = kilo * kilo
    giga = mega * kilo

    if bytes >= giga:
        return "%.1f GB" % (bytes / giga)
    elif bytes >= mega:
        return "%.1f MB" % (bytes / mega)
    elif bytes >= kilo:
        return "%.1f KB" % (bytes / kilo)
    else:
        return "%.0f bytes" % bytes
def format_tc_rate(rate):
    """
    Formats a bits/second rate into a tc rate string

    rate -- rate in bits/second

    Returns the rate expressed in the largest decimal unit (gbit,
    mbit or kbit) that divides it exactly, e.g. "1gbit", "5mbit" or
    "1500kbit". A rate that no unit divides exactly is returned in
    plain bits/second ("1500bit"), so a whole-number rate is never
    rounded to a different value by the formatting.
    """

    for (suffix, scale) in (("gbit", 1000000000),
                            ("mbit", 1000000),
                            ("kbit", 1000)):
        # Only use a unit when the rate is an exact multiple of it;
        # "%.0f" would otherwise round, e.g. 1500 to "2kbit".
        if rate >= scale and (rate % scale) == 0:
            return "%.0f%s" % (rate / float(scale), suffix)

    return "%.0fbit" % rate
240 # Parse /etc/planetlab/bwcap (or equivalent)
241 def read_bwcap(bwcap_file):
244 fp = open(bwcap_file, "r")
245 line = fp.readline().strip()
247 bwcap = get_tc_rate(line)
255 def get_bwcap(dev = dev):
257 Get the current (live) value of the node bandwidth cap
260 state = tc("-d class show dev %s" % dev)
261 base_re = re.compile(r"class htb 1:10 parent 1:1 .*ceil ([^ ]+) .*")
262 base_classes = filter(None, map(base_re.match, state))
265 if len(base_classes) > 1:
266 raise Exception, "unable to get current bwcap"
267 return get_tc_rate(base_classes[0].group(1))
272 Get slice name ("princeton_mlh") from slice xid (500)
277 if xid == default_xid:
280 return pwd.getpwuid(xid).pw_name
288 Get slice xid ("500") from slice name ("princeton_mlh")
293 if slice == "default":
300 return pwd.getpwnam(slice).pw_uid
306 def run(cmd, input = None):
308 Shortcut for running a shell command
313 sys.stderr.write("Executing: " + cmd + "\n")
315 fileobj = os.popen(cmd, "r")
316 output = fileobj.readlines()
318 fileobj = os.popen(cmd, "w")
321 if fileobj.close() is None:
330 Shortcut for running a tc command
333 return run(TC + " " + cmd)
337 Shortcut for running an ebtables command
340 return run(EBTABLES + " " + cmd)
345 Turn off all queueing. Stops all slice HTBs and reverts to pfifo_fast (the default).
349 tc("qdisc del dev %s root" % dev)
353 def init(dev = dev, bwcap = bwmax):
355 (Re)initialize the bandwidth limits on this node
358 # Load the module used to manage exempt classes
359 #run("/sbin/modprobe ip_set_iphash")
360 # Test the new module included in kernel 3 series
361 run("/sbin/modprobe ip_set_hash_ip")
363 # Save current settings
364 paramslist = get(None, dev)
366 # Delete root qdisc 1: if it exists. This will also automatically
367 # delete any child classes.
368 for line in tc("qdisc show dev %s" % dev):
369 # Search for the root qdisc 1:
370 m = re.match(r"qdisc htb 1:", line)
372 tc("qdisc del dev %s root handle 1:" % dev)
375 # Initialize HTB. The "default" clause specifies that if a packet
376 # fails classification, it should go into the class with handle
378 tc("qdisc add dev %s root handle 1: htb default %x" % \
379 (dev, default_minor | default_xid))
381 # Set up a parent class from which all subclasses borrow.
382 tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
385 # Set up a subclass that represents the node bandwidth cap. We
386 # allow each slice to borrow up to this rate, so it is also
387 # usually the "ceil" rate for each slice.
388 tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \
391 # Set up a subclass for DRL (Distributed Rate Limiting).
392 # DRL will directly modify that subclass implementing the site limits.
393 tc("class add dev %s parent 1:10 classid 1:100 htb rate %dbit ceil %dbit" % \
397 # Set up a subclass that represents "exemption" from the node
398 # bandwidth cap. Once the node bandwidth cap is reached, bandwidth
399 # to exempt destinations can still be fairly shared up to bwmax.
400 tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \
403 # Set up the root class (and tell VNET what it is). Packets sent
404 # by root end up here and are capped at the node bandwidth
406 #on(root_xid, dev, share = root_share)
408 # file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid))
412 # Set up the default class. Packets that fail classification end
414 on(default_xid, dev, share = default_share)
416 # Restore old settings
419 minexemptrate, maxexemptrate,
420 bytes, exemptbytes) in paramslist:
421 if xid not in (root_xid, default_xid):
422 on(xid, dev, share, minrate, maxrate, minexemptrate, maxexemptrate)
425 def get(xid = None, dev = dev):
427 Get the bandwidth limits and current byte totals for a
428 particular slice xid as a tuple (xid, share, minrate, maxrate,
429 minexemptrate, maxexemptrate, bytes, exemptbytes), or all classes
430 as a list of such tuples.
442 # class htb 1:1000 parent 1:10 leaf 1000: prio 0 quantum 8000 rate 8bit ceil 10000Kbit ...
443 # Sent 6851486 bytes 49244 pkt (dropped 0, overlimits 0 requeues 0)
445 # class htb 1:2000 parent 1:20 leaf 2000: prio 0 quantum 8000 rate 8bit ceil 1000Mbit ...
446 # Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
448 for line in tc("-s -d class show dev %s" % dev):
449 # Rate parameter line
450 params = re.match(r"class htb 1:([0-9a-f]+) parent 1:(10|20)", line)
452 stats = re.match(r".* Sent ([0-9]+) bytes", line)
454 ignore = re.match(r"class htb", line)
456 if params is not None:
458 if params.group(2) == "10":
465 bytes = 'exemptbytes'
468 id = int(params.group(1), 16) & 0x0FFF;
470 if rates.has_key(id):
477 m = re.search(r"quantum (\d+)", line)
479 rate['share'] = int(m.group(1)) / quantum
483 m = re.search(r"rate (\w+)", line)
485 rate[min] = get_tc_rate(m.group(1))
489 m = re.search(r"ceil (\w+)", line)
491 rate[max] = get_tc_rate(m.group(1))
493 # Which statistics to parse
494 rate['stats'] = bytes
498 elif stats is not None:
500 rate[rate['stats']] = int(stats.group(1))
502 elif ignore is not None:
505 # Keep parsing until we get everything
506 if rate is not None and \
507 rate.has_key('min') and rate.has_key('minexempt') and \
508 rate.has_key('max') and rate.has_key('maxexempt') and \
509 rate.has_key('bytes') and rate.has_key('exemptbytes'):
510 params = (rate['id'], rate['share'],
511 rate['min'], rate['max'],
512 rate['minexempt'], rate['maxexempt'],
513 rate['bytes'], rate['exemptbytes'])
515 # Return a list of parameters
518 elif xid == rate['id']:
519 # Return the parameters for this class
526 def on(xid, dev = dev, share = None, minrate = None, maxrate = None, minexemptrate = None, maxexemptrate = None):
528 Apply specified bandwidth limit to the specified slice xid
531 # Get defaults from current state if available
540 if minexemptrate is None:
541 minexemptrate = cap[4]
542 if maxexemptrate is None:
543 maxexemptrate = cap[5]
545 # Figure out what the current node bandwidth cap is
546 bwcap = get_bwcap(dev)
550 share = default_share
554 minrate = get_tc_rate(minrate)
558 maxrate = get_tc_rate(maxrate)
559 if minexemptrate is None:
560 minexemptrate = minrate
562 minexemptrate = get_tc_rate(minexemptrate)
563 if maxexemptrate is None:
564 maxexemptrate = bwmax
566 maxexemptrate = get_tc_rate(maxexemptrate)
575 if minrate > maxrate:
577 if maxexemptrate < bwmin:
578 maxexemptrate = bwmin
579 if maxexemptrate > bwmax:
580 maxexemptrate = bwmax
581 if minexemptrate < bwmin:
582 minexemptrate = bwmin
583 if minexemptrate > maxexemptrate:
584 minexemptrate = maxexemptrate
586 # Set up subclasses for the slice
587 tc("class replace dev %s parent 1:100 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
588 (dev, default_minor | xid, minrate, maxrate, share * quantum))
590 tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
591 (dev, exempt_minor | xid, minexemptrate, maxexemptrate, share * quantum))
593 # Attach a FIFO to each subclass, which helps to throttle back
594 # processes that are sending faster than the token buckets can
596 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
597 (dev, default_minor | xid, default_minor | xid))
599 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
600 (dev, exempt_minor | xid, exempt_minor | xid))
602 # Set up a filter rule on the root class so each packet originated by a
603 # container interface is classified to its corresponding class
604 # The handle number is a mark created by ebtables with the xid
605 tc("filter replace dev %s parent 1: protocol ip prio 1 handle %d fw flowid 1:%x" % \
606 (dev, xid, default_minor | xid))
def set(xid, share = None, minrate = None, maxrate = None, minexemptrate = None, maxexemptrate = None, dev = dev ):
    """
    Apply the given bandwidth limits to the specified slice xid.

    Thin wrapper around on() that takes the device as its last
    argument instead of its second; every argument is forwarded to
    on() unchanged.
    """

    on(xid, dev, share, minrate, maxrate, minexemptrate, maxexemptrate)
614 # Remove class associated with specified slice xid. If further packets
615 # are seen from this slice, they will be classified into the default
617 def off(xid, dev = dev):
619 Remove class associated with specified slice xid. If further
620 packets are seen from this slice, they will be classified into the
621 default class 1:1FFF.
626 tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
627 tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
630 def exempt_init(group_name, node_ips):
632 Initialize the list of destinations exempt from the node bandwidth
636 # Check if set exists
637 set = run("/sbin/ipset -S " + group_name)
639 # Create a hashed IP set of all of these destinations
640 lines = ["-N %s iphash" % group_name]
641 add_cmd = "-A %s " % group_name
642 lines += [(add_cmd + ip) for ip in node_ips]
644 restore = "\n".join(lines) + "\n"
645 run("/sbin/ipset -R", restore)
647 # Check all hosts and add missing.
648 for nodeip in node_ips:
649 if not run("/sbin/ipset -T %s %s" % (group_name, nodeip)):
650 run("/sbin/ipset -A %s %s" % (group_name, nodeip))
654 bwcap_description = format_tc_rate(get_bwcap())
659 %s [OPTION]... [COMMAND] [ARGUMENT]...
662 -d device Network interface (default: %s)
663 -r rate Node bandwidth cap (default: %s)
664 -q quantum Share multiplier (default: %d bytes)
665 -n Print rates in numeric bits per second
666 -v Enable verbose debug messages
671 (Re)initialize all bandwidth parameters
672 on slice [share|-] [minrate|-] [maxrate|-] [minexemptrate|-] [maxexemptrate|-]
673 Set bandwidth parameter(s) for the specified slice
675 Remove all bandwidth parameters for the specified slice
677 Get all bandwidth parameters for all slices
679 Get bandwidth parameters for the specified slice
680 """ % (sys.argv[0], dev, bwcap_description, quantum)
685 global dev, quantum, verbose
691 (opts, argv) = getopt.getopt(sys.argv[1:], "d:nr:q:vh")
692 for (opt, optval) in opts:
698 bwcap = get_tc_rate(optval)
700 quantum = int(optval)
707 bwcap = get_bwcap(dev)
713 if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1):
715 init(dev, get_tc_rate(bwcap))
717 elif argv[0] == "get" or argv[0] == "show":
720 # Show a particular slice
721 xid = get_xid(argv[1])
723 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
725 params = get(xid, dev)
729 paramslist = [params]
732 paramslist = get(None, dev)
736 minexemptrate, maxexemptrate,
737 bytes, exemptbytes) in paramslist:
738 slice = get_slice(xid)
740 # Orphaned (not associated with a slice) class
743 print "%s %d %d %d %d %d %d %d" % \
746 minexemptrate, maxexemptrate,
749 print "%s %d %s %s %s %s %s %s" % \
751 format_tc_rate(minrate), format_tc_rate(maxrate),
752 format_tc_rate(minexemptrate), format_tc_rate(maxexemptrate),
753 format_bytes(bytes), format_bytes(exemptbytes))
757 xid = get_xid(argv[1])
759 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
762 if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace" or argv[0] == "set":
766 # ... share, minrate, maxrate, minexemptrate, maxexemptrate
767 casts = [int, get_tc_rate, get_tc_rate, get_tc_rate, get_tc_rate]
768 for i, arg in enumerate(argv[2:]):
774 args.append(casts[i](arg))
777 elif argv[0] == "off" or argv[0] == "del":
788 if __name__ == '__main__':