3 # This file is under git as plnode-utils/bwlimit_lxc.py
5 # Bandwidth limit module for PlanetLab nodes. The intent is to use the
6 # Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow
7 # slices to fairly share access to available node bandwidth. We
8 # currently define three classes of "available node bandwidth":
10 # 1. Available hardware bandwidth (bwmax): The maximum rate of the
13 # 2. Available capped bandwidth (bwcap): The maximum rate allowed to
14 # non-exempt destinations. By default, equal to bwmax, but may be
17 # 3. Available uncapped ("exempt") bandwidth: The difference between
18 # bwmax and what is currently being used of bwcap, or the maximum rate
19 # allowed to destinations exempt from caps (e.g., Internet2).
21 # All three classes of bandwidth are fairly shared according to the
22 # notion of "shares". For instance, if the node is capped at 5 Mbps,
23 # there are N slices, and each slice has 1 share, then each slice
24 # should get at least 5/N Mbps of bandwidth. How HTB is implemented
25 # makes this statement a little too simplistic. What it really means
26 # is that during any single time period, only a certain number of
27 # bytes can be sent onto the wire. Each slice is guaranteed that at
28 # least some small number of its bytes will be sent. Whatever is left
29 # over from the budget is split in proportion to the number of shares
32 # Even if the node is not capped at a particular limit (bwcap ==
33 # bwmax), this module enforces fair share access to bwmax. Also, if
34 # the node is capped at a particular limit, rules may optionally be
35 # defined that classify certain packets into the "exempt" class. This
36 # class receives whatever bandwidth is leftover between bwcap and
37 # bwmax; slices fairly share this bandwidth as well.
39 # The root context is exempt from sharing and can send as much as it
44 # 1. http://lartc.org/howto for how to use tc
45 # 2. http://luxik.cdi.cz/~devik/qos/htb/ for info on HTB
47 # Andy Bavier <acb@cs.princeton.edu>
48 # Mark Huang <mlhuang@cs.princeton.edu>
49 # Copyright (C) 2006 The Trustees of Princeton University
52 import sys, os, re, getopt
56 # Where the tc binary lives
59 # Where the ebtables binary lives
60 EBTABLES = "/sbin/ebtables"
68 # bwmin should be small enough that it can be considered negligibly
69 # slow compared to the hardware. 8 bits/second appears to be the
70 # smallest value supported by tc.
73 # bwmax should be large enough that it can be considered at least as
74 # fast as the hardware.
75 bwmax = 1000*1000*1000
77 # quantum is the maximum number of bytes that can be borrowed by a
78 # share (or slice, if each slice gets 1 share) in one time period
79 # (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth
80 # above their guarantees, and each is attempting to borrow up to the
81 # node bandwidth cap, quantums control how the excess bandwidth is
82 # distributed. Slices with 2 shares will borrow twice the amount in
83 # one time period as slices with 1 share, so averaged over time, they
84 # will get twice as much of the excess bandwidth. The value should be
85 # as small as possible and at least 1 MTU. By default, it would be
86 # calculated as bwmin/10, but since we use such a small value for
87 # bwmin, it's better to just set it to a value safely above 1 Ethernet
91 # cburst is the maximum number of bytes that can be burst onto the
92 # wire in one time period (with HZ=1000, 1 ms). If multiple slices
93 # have data queued for transmission, cbursts control how long each
94 # slice can have the wire for. If not specified, it is set to the
95 # smallest possible value that would enable the slice's "ceil" rate
96 # (usually the node bandwidth cap), to be reached if a slice was able
97 # to borrow enough bandwidth to do so. For now, it's unclear how or if
98 # to relate this to the notion of shares, so just let tc set the
102 # There is another parameter that controls how bandwidth is allocated
103 # between slices on nodes that is outside the scope of HTB. We enforce
104 # a 16 GByte/day total limit on each slice, which works out to about
105 # 1.5mbit. If a slice exceeds this byte limit before the day finishes,
106 # it is capped at (i.e., its "ceil" rate is set to) the smaller of the
107 # node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
108 # rule and executes this script to override "ceil".
110 # We support multiple bandwidth limits, by reserving the top nibble of
111 # the minor classid to be the "subclassid". Theoretically, we could
112 # support up to 15 subclasses, but for now, we only define two: the
113 # "default" subclass 1:10 that is capped at the node bandwidth cap (in
114 # this example, 5mbit) and the "exempt" subclass 1:20 that is capped
115 # at bwmax (i.e., not capped). The 1:1 parent class exists only to
116 # make the borrowing model work. All bandwidth above minimum
117 # guarantees is fairly shared (in this example, slice 2 is guaranteed
118 # at least 1mbit in addition to fair access to the rest), subject to
119 # the restrictions of the class hierarchy: namely, that the total
120 # bandwidth to non-exempt destinations should not exceed the node
126 # ______________|_____________
128 # 1:10 (8bit, 5mbit) 1:20 (8bit, 1gbit)
130 # 1:100 (8bit, 5mbit) |
132 # 1:1000 (8bit, 5mbit), 1:2000 (8bit, 1gbit),
133 # 1:1001 (8bit, 5mbit), 1:2001 (8bit, 1gbit),
134 # 1:1002 (1mbit, 5mbit), 1:2002 (1mbit, 1gbit),
136 # 1:1FFF (8bit, 5mbit) 1:2FFF (8bit, 1gbit)
138 default_minor = 0x1000
139 exempt_minor = 0x2000
141 # root_xid is for the root context. The root context is exempt from
142 # fair sharing in both the default and exempt subclasses. The root
143 # context gets 5 shares by default.
147 # default_xid is for unclassifiable packets. Packets should not be
148 # classified here very often. They can be if a slice's HTB classes are
149 # deleted before its processes are. Each slice gets 1 share by
154 # See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
155 # warned that older versions of tc interpret "kbps", "mbps", "mbit",
156 # and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and
157 # "kibit" and that if an older version is installed, all rates will
158 # be off by a small fraction.
166 "gibit": 1024*1024*1024,
168 "tibit": 1024*1024*1024*1024,
169 "tbit": 1000000000000,
173 "mibps": 8*1024*1024,
175 "gibps": 8*1024*1024*1024,
177 "tibps": 8*1024*1024*1024*1024,
178 "tbps": 8000000000000
184 Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second
187 if isinstance(s, int):
189 if isinstance(s, float):
191 m = re.match(r"([0-9.]+)(\D*)", s)
194 suffix = m.group(2).lower()
195 if suffix in suffixes:
196 return int(float(m.group(1)) * suffixes[suffix])
200 def format_bytes(bytes, si = True):
202 Formats bytes into a string
207 # Officially, a kibibyte
210 if bytes >= (kilo * kilo * kilo):
211 return "%.1f GB" % (bytes / (kilo * kilo * kilo))
212 elif bytes >= 1000000:
213 return "%.1f MB" % (bytes / (kilo * kilo))
215 return "%.1f KB" % (bytes / kilo)
217 return "%.0f bytes" % bytes
219 def format_tc_rate(rate):
221 Formats a bits/second rate into a tc rate string
224 if rate >= 1000000000 and (rate % 1000000000) == 0:
225 return "%.0fgbit" % (rate / 1000000000.)
226 elif rate >= 1000000 and (rate % 1000000) == 0:
227 return "%.0fmbit" % (rate / 1000000.)
229 return "%.0fkbit" % (rate / 1000.)
231 return "%.0fbit" % rate
234 # Parse /etc/planetlab/bwcap (or equivalent)
235 def read_bwcap(bwcap_file):
238 fp = open(bwcap_file, "r")
239 line = fp.readline().strip()
241 bwcap = get_tc_rate(line)
249 def get_bwcap(dev = dev):
251 Get the current (live) value of the node bandwidth cap
254 state = tc("-d class show dev %s" % dev)
255 base_re = re.compile(r"class htb 1:10 parent 1:1 .*ceil ([^ ]+) .*")
256 base_classes = [_f for _f in map(base_re.match, state) if _f]
259 if len(base_classes) > 1:
260 raise Exception("unable to get current bwcap")
261 return get_tc_rate(base_classes[0].group(1))
266 Get slice name ("princeton_mlh") from slice xid (500)
271 if xid == default_xid:
274 return pwd.getpwuid(xid).pw_name
282 Get slice xid ("500") from slice name ("princeton_mlh")
287 if slice == "default":
294 return pwd.getpwnam(slice).pw_uid
300 def run(cmd, input = None):
302 Shortcut for running a shell command
307 sys.stderr.write("Executing: " + cmd + "\n")
309 fileobj = os.popen(cmd, "r")
310 output = fileobj.readlines()
312 fileobj = os.popen(cmd, "w")
315 if fileobj.close() is None:
317 except Exception as e:
324 Shortcut for running a tc command
327 return run(TC + " " + cmd)
332 Shortcut for running an ebtables command
335 return run(EBTABLES + " " + cmd)
340 Turn off all queuing. Stops all slice HTBs and reverts to pfifo_fast (the default).
344 tc("qdisc del dev %s root" % dev)
348 def init(dev = dev, bwcap = bwmax):
350 (Re)initialize the bandwidth limits on this node
353 # Load the module used to manage exempt classes
354 #run("/sbin/modprobe ip_set_iphash")
355 # Test the new module included in kernel 3 series
356 run("/sbin/modprobe ip_set_hash_ip")
358 # Save current settings
359 paramslist = get(None, dev)
361 # Delete root qdisc 1: if it exists. This will also automatically
362 # delete any child classes.
363 for line in tc("qdisc show dev %s" % dev):
364 # Search for the root qdisc 1:
365 m = re.match(r"qdisc htb 1:", line)
367 tc("qdisc del dev %s root handle 1:" % dev)
370 # Initialize HTB. The "default" clause specifies that if a packet
371 # fails classification, it should go into the class with handle
373 tc("qdisc add dev %s root handle 1: htb default %x" % \
374 (dev, default_minor | default_xid))
376 # Set up a parent class from which all subclasses borrow.
377 tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
380 # Set up a subclass that represents the node bandwidth cap. We
381 # allow each slice to borrow up to this rate, so it is also
382 # usually the "ceil" rate for each slice.
383 tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \
386 # Set up a subclass for DRL (Distributed Rate Limiting).
387 # DRL will directly modify that subclass implementing the site limits.
388 tc("class add dev %s parent 1:10 classid 1:100 htb rate %dbit ceil %dbit" % \
392 # Set up a subclass that represents "exemption" from the node
393 # bandwidth cap. Once the node bandwidth cap is reached, bandwidth
394 # to exempt destinations can still be fairly shared up to bwmax.
395 tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \
398 # Set up the root class (and tell VNET what it is). Packets sent
399 # by root end up here and are capped at the node bandwidth
401 #on(root_xid, dev, share = root_share)
403 # file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid))
407 # Set up the default class. Packets that fail classification end
409 on(default_xid, dev, share = default_share)
411 # Restore old settings
414 minexemptrate, maxexemptrate,
415 bytes, exemptbytes) in paramslist:
416 if xid not in (root_xid, default_xid):
417 on(xid, dev, share, minrate, maxrate, minexemptrate, maxexemptrate)
420 def get(xid = None, dev = dev):
422 Get the bandwidth limits and current byte totals for a
423 particular slice xid as a tuple (xid, share, minrate, maxrate,
424 minexemptrate, maxexemptrate, bytes, exemptbytes), or all classes
425 as a list of such tuples.
437 # class htb 1:1000 parent 1:10 leaf 1000: prio 0 quantum 8000 rate 8bit ceil 10000Kbit ...
438 # Sent 6851486 bytes 49244 pkt (dropped 0, overlimits 0 requeues 0)
440 # class htb 1:2000 parent 1:20 leaf 2000: prio 0 quantum 8000 rate 8bit ceil 1000Mbit ...
441 # Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
443 for line in tc("-s -d class show dev %s" % dev):
444 # Rate parameter line
445 params = re.match(r"class htb 1:([0-9a-f]+) parent 1:(10|20)", line)
447 stats = re.match(r".* Sent ([0-9]+) bytes", line)
449 ignore = re.match(r"class htb", line)
451 if params is not None:
453 if params.group(2) == "10":
460 bytes = 'exemptbytes'
463 id = int(params.group(1), 16) & 0x0FFF;
472 m = re.search(r"quantum (\d+)", line)
474 rate['share'] = int(m.group(1)) / quantum
478 m = re.search(r"rate (\w+)", line)
480 rate[min] = get_tc_rate(m.group(1))
484 m = re.search(r"ceil (\w+)", line)
486 rate[max] = get_tc_rate(m.group(1))
488 # Which statistics to parse
489 rate['stats'] = bytes
493 elif stats is not None:
495 rate[rate['stats']] = int(stats.group(1))
497 elif ignore is not None:
500 # Keep parsing until we get everything
501 if rate is not None and \
502 'min' in rate and 'minexempt' in rate and \
503 'max' in rate and 'maxexempt' in rate and \
504 'bytes' in rate and 'exemptbytes' in rate:
505 params = (rate['id'], rate['share'],
506 rate['min'], rate['max'],
507 rate['minexempt'], rate['maxexempt'],
508 rate['bytes'], rate['exemptbytes'])
510 # Return a list of parameters
513 elif xid == rate['id']:
514 # Return the parameters for this class
521 def on(xid, dev=dev, share=None,
522 minrate=None, maxrate=None,
523 minexemptrate=None, maxexemptrate=None):
525 Apply specified bandwidth limit to the specified slice xid
528 # Get defaults from current state if available
537 if minexemptrate is None:
538 minexemptrate = cap[4]
539 if maxexemptrate is None:
540 maxexemptrate = cap[5]
542 # Figure out what the current node bandwidth cap is
543 bwcap = get_bwcap(dev)
547 share = default_share
551 minrate = get_tc_rate(minrate)
555 maxrate = get_tc_rate(maxrate)
556 if minexemptrate is None:
557 minexemptrate = minrate
559 minexemptrate = get_tc_rate(minexemptrate)
560 if maxexemptrate is None:
561 maxexemptrate = bwmax
563 maxexemptrate = get_tc_rate(maxexemptrate)
572 if minrate > maxrate:
574 if maxexemptrate < bwmin:
575 maxexemptrate = bwmin
576 if maxexemptrate > bwmax:
577 maxexemptrate = bwmax
578 if minexemptrate < bwmin:
579 minexemptrate = bwmin
580 if minexemptrate > maxexemptrate:
581 minexemptrate = maxexemptrate
583 # Set up subclasses for the slice
584 tc("class replace dev %s parent 1:100 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
585 (dev, default_minor | xid, minrate, maxrate, share * quantum))
587 tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
588 (dev, exempt_minor | xid, minexemptrate, maxexemptrate, share * quantum))
590 # Attach a FIFO to each subclass, which helps to throttle back
591 # processes that are sending faster than the token buckets can
593 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
594 (dev, default_minor | xid, default_minor | xid))
596 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
597 (dev, exempt_minor | xid, exempt_minor | xid))
599 # Set up a filter rule on the root class so that each packet originated by a
600 # container interface is classified into its corresponding class.
601 # The handle number is a mark created by ebtables with the xid
602 tc("filter replace dev %s parent 1: protocol ip prio 1 handle %d fw flowid 1:%x" % \
603 (dev, xid, default_minor | xid))
def set(xid, share=None,
        minrate=None, maxrate=None,
        minexemptrate=None, maxexemptrate=None, dev=dev):
    """
    Alias for on() that accepts the device as the last argument.

    Every argument is forwarded to on() by keyword, so values left
    as None are passed through unchanged. Nothing is returned.
    """

    # Collect the optional limits once so the forwarding call stays short.
    limits = {
        "share": share,
        "minrate": minrate,
        "maxrate": maxrate,
        "minexemptrate": minexemptrate,
        "maxexemptrate": maxexemptrate,
    }
    on(xid=xid, dev=dev, **limits)
614 # Remove class associated with specified slice xid. If further packets
615 # are seen from this slice, they will be classified into the default
617 def off(xid, dev=dev):
619 Remove class associated with specified slice xid. If further
620 packets are seen from this slice, they will be classified into the
621 default class 1:1FFF.
626 tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
627 tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
630 def exempt_init(group_name, node_ips):
632 Initialize the list of destinations exempt from the node bandwidth
636 # Check if set exists
637 set = run("/sbin/ipset -S " + group_name)
639 # Create a hashed IP set of all of these destinations
640 lines = ["-N %s iphash" % group_name]
641 add_cmd = "-A %s " % group_name
642 lines += [(add_cmd + ip) for ip in node_ips]
644 restore = "\n".join(lines) + "\n"
645 run("/sbin/ipset -R", restore)
647 # Check all hosts and add missing.
648 for nodeip in node_ips:
649 if not run("/sbin/ipset -T %s %s" % (group_name, nodeip)):
650 run("/sbin/ipset -A %s %s" % (group_name, nodeip))
654 bwcap_description = format_tc_rate(get_bwcap())
659 %s [OPTION]... [COMMAND] [ARGUMENT]...
662 -d device Network interface (default: %s)
663 -r rate Node bandwidth cap (default: %s)
664 -q quantum Share multiplier (default: %d bytes)
665 -n Print rates in numeric bits per second
666 -v Enable verbose debug messages
671 (Re)initialize all bandwidth parameters
672 on slice [share|-] [minrate|-] [maxrate|-] [minexemptrate|-] [maxexemptrate|-]
673 Set bandwidth parameter(s) for the specified slice
675 Remove all bandwidth parameters for the specified slice
677 Get all bandwidth parameters for all slices
679 Get bandwidth parameters for the specified slice
680 """ % (sys.argv[0], dev, bwcap_description, quantum))
685 global dev, quantum, verbose
691 (opts, argv) = getopt.getopt(sys.argv[1:], "d:nr:q:vh")
692 for (opt, optval) in opts:
698 bwcap = get_tc_rate(optval)
700 quantum = int(optval)
707 bwcap = get_bwcap(dev)
713 if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1):
715 init(dev, get_tc_rate(bwcap))
717 elif argv[0] == "get" or argv[0] == "show":
720 # Show a particular slice
721 xid = get_xid(argv[1])
723 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
725 params = get(xid, dev)
729 paramslist = [params]
732 paramslist = get(None, dev)
736 minexemptrate, maxexemptrate,
737 bytes, exemptbytes) in paramslist:
738 slice = get_slice(xid)
740 # Orphaned (not associated with a slice) class
743 print("%s %d %d %d %d %d %d %d" % \
746 minexemptrate, maxexemptrate,
749 print("%s %d %s %s %s %s %s %s" % \
751 format_tc_rate(minrate), format_tc_rate(maxrate),
752 format_tc_rate(minexemptrate), format_tc_rate(maxexemptrate),
753 format_bytes(bytes), format_bytes(exemptbytes)))
757 xid = get_xid(argv[1])
759 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
762 if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace" or argv[0] == "set":
766 # ... share, minrate, maxrate, minexemptrate, maxexemptrate
767 casts = [int, get_tc_rate, get_tc_rate, get_tc_rate, get_tc_rate]
768 for i, arg in enumerate(argv[2:]):
774 args.append(casts[i](arg))
777 elif argv[0] == "off" or argv[0] == "del":
788 if __name__ == '__main__':