3 # Bandwidth limit module for PlanetLab nodes. The intent is to use the
4 # Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to allow
5 # slices to fairly share access to available node bandwidth. We
6 # currently define three classes of "available node bandwidth":
8 # 1. Available hardware bandwidth (bwmax): The maximum rate of the
11 # 2. Available capped bandwidth (bwcap): The maximum rate allowed to
12 # non-exempt destinations. By default, equal to bwmax, but may be
15 # 3. Available uncapped ("exempt") bandwidth: The difference between
16 # bwmax and what is currently being used of bwcap, or the maximum rate
17 # allowed to destinations exempt from caps (e.g., Internet2).
19 # All three classes of bandwidth are fairly shared according to the
20 # notion of "shares". For instance, if the node is capped at 5 Mbps,
21 # there are N slices, and each slice has 1 share, then each slice
22 # should get at least 5/N Mbps of bandwidth. How HTB is implemented
23 # makes this statement a little too simplistic. What it really means
24 # is that during any single time period, only a certain number of
25 # bytes can be sent onto the wire. Each slice is guaranteed that at
26 # least some small number of its bytes will be sent. Whatever is left
27 # over from the budget, is split in proportion to the number of shares
30 # Even if the node is not capped at a particular limit (bwcap ==
31 # bwmax), this module enforces fair share access to bwmax. Also, if
32 # the node is capped at a particular limit, rules may optionally be
33 # defined that classify certain packets into the "exempt" class. This
34 # class receives whatever bandwidth is leftover between bwcap and
35 # bwmax; slices fairly share this bandwidth as well.
37 # The root context is exempt from sharing and can send as much as it
42 # 1. http://lartc.org/howto for how to use tc
43 # 2. http://luxik.cdi.cz/~devik/qos/htb/ for info on HTB
45 # Andy Bavier <acb@cs.princeton.edu>
46 # Mark Huang <mlhuang@cs.princeton.edu>
47 # Copyright (C) 2006 The Trustees of Princeton University
49 # $Id: bwlimit.py,v 1.15 2007/02/07 04:21:11 mlhuang Exp $
52 import sys, os, re, getopt
56 # Where the tc binary lives
65 # bwmin should be small enough that it can be considered negligibly
66 # slow compared to the hardware. 8 bits/second appears to be the
67 # smallest value supported by tc.
70 # bwmax should be large enough that it can be considered at least as
71 # fast as the hardware.
72 bwmax = 1000*1000*1000
74 # quantum is the maximum number of bytes that can be borrowed by a
75 # share (or slice, if each slice gets 1 share) in one time period
76 # (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth
77 # above their guarantees, and each is attempting to borrow up to the
78 # node bandwidth cap, quantums control how the excess bandwidth is
79 # distributed. Slices with 2 shares will borrow twice the amount in
80 # one time period as slices with 1 share, so averaged over time, they
81 # will get twice as much of the excess bandwidth. The value should be
82 # as small as possible and at least 1 MTU. By default, it would be
83 # calculated as bwmin/10, but since we use such small a value for
84 # bwmin, it's better to just set it to a value safely above 1 Ethernet
88 # cburst is the maximum number of bytes that can be burst onto the
89 # wire in one time period (with HZ=1000, 1 ms). If multiple slices
90 # have data queued for transmission, cbursts control how long each
91 # slice can have the wire for. If not specified, it is set to the
92 # smallest possible value that would enable the slice's "ceil" rate
93 # (usually the node bandwidth cap), to be reached if a slice was able
94 # to borrow enough bandwidth to do so. For now, it's unclear how or if
95 # to relate this to the notion of shares, so just let tc set the
99 # There is another parameter that controls how bandwidth is allocated
100 # between slices on nodes that is outside the scope of HTB. We enforce
101 # a 16 GByte/day total limit on each slice, which works out to about
102 # 1.5mbit. If a slice exceeds this byte limit before the day finishes,
103 # it is capped at (i.e., its "ceil" rate is set to) the smaller of the
104 # node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
105 # rule and executes this script to override "ceil".
107 # We support multiple bandwidth limits, by reserving the top nibble of
108 # the minor classid to be the "subclassid". Theoretically, we could
109 # support up to 15 subclasses, but for now, we only define two: the
110 # "default" subclass 1:10 that is capped at the node bandwidth cap (in
111 # this example, 5mbit) and the "exempt" subclass 1:20 that is capped
112 # at bwmax (i.e., not capped). The 1:1 parent class exists only to
113 # make the borrowing model work. All bandwidth above minimum
114 # guarantees is fairly shared (in this example, slice 2 is guaranteed
115 # at least 1mbit in addition to fair access to the rest), subject to
116 # the restrictions of the class hierarchy: namely, that the total
117 # bandwidth to non-exempt destinations should not exceed the node
123 # ______________|_____________
125 # 1:10 (8bit, 5mbit) 1:20 (8bit, 1gbit)
127 # 1:100 (8bit, 5mbit) |
129 # 1:1000 (8bit, 5mbit), 1:2000 (8bit, 1gbit),
130 # 1:1001 (8bit, 5mbit), 1:2001 (8bit, 1gbit),
131 # 1:1002 (1mbit, 5mbit), 1:2002 (1mbit, 1gbit),
133 # 1:1FFF (8bit, 5mbit) 1:2FFF (8bit, 1gbit)
135 default_minor = 0x1000
136 exempt_minor = 0x2000
138 # root_xid is for the root context. The root context is exempt from
139 # fair sharing in both the default and exempt subclasses. The root
140 # context gets 5 shares by default.
144 # default_xid is for unclassifiable packets. Packets should not be
145 # classified here very often. They can be if a slice's HTB classes are
146 # deleted before its processes are. Each slice gets 1 share by
151 # See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
152 # warned that older versions of tc interpret "kbps", "mbps", "mbit",
153 # and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and
154 # "kibit" and that if an older version is installed, all rates will
155 # be off by a small fraction.
163 "gibit": 1024*1024*1024,
165 "tibit": 1024*1024*1024*1024,
166 "tbit": 1000000000000,
170 "mibps": 8*1024*1024,
172 "gibps": 8*1024*1024*1024,
174 "tibps": 8*1024*1024*1024*1024,
175 "tbps": 8000000000000
181 Parses an integer or a tc rate string (e.g., 1.5mbit) into bits/second
186 m = re.match(r"([0-9.]+)(\D*)", s)
189 suffix = m.group(2).lower()
190 if suffixes.has_key(suffix):
191 return int(float(m.group(1)) * suffixes[suffix])
195 def format_bytes(bytes, si = True):
197 Formats bytes into a string
202 # Officially, a kibibyte
205 if bytes >= (kilo * kilo * kilo):
206 return "%.1f GB" % (bytes / (kilo * kilo * kilo))
207 elif bytes >= 1000000:
208 return "%.1f MB" % (bytes / (kilo * kilo))
210 return "%.1f KB" % (bytes / kilo)
212 return "%.0f bytes" % bytes
214 def format_tc_rate(rate):
216 Formats a bits/second rate into a tc rate string
219 if rate >= 1000000000 and (rate % 1000000000) == 0:
220 return "%.0fgbit" % (rate / 1000000000.)
221 elif rate >= 1000000 and (rate % 1000000) == 0:
222 return "%.0fmbit" % (rate / 1000000.)
224 return "%.0fkbit" % (rate / 1000.)
226 return "%.0fbit" % rate
229 # Parse /etc/planetlab/bwcap (or equivalent)
230 def read_bwcap(bwcap_file):
233 fp = open(bwcap_file, "r")
234 line = fp.readline().strip()
236 bwcap = get_tc_rate(line)
244 def get_bwcap(dev = dev):
246 Get the current (live) value of the node bandwidth cap
249 state = tc("-d class show dev %s" % dev)
250 base_re = re.compile(r"class htb 1:10 parent 1:1 .*ceil ([^ ]+) .*")
251 base_classes = filter(None, map(base_re.match, state))
254 if len(base_classes) > 1:
255 raise Exception, "unable to get current bwcap"
256 return get_tc_rate(base_classes[0].group(1))
261 Get slice name ("princeton_mlh") from slice xid (500)
266 if xid == default_xid:
269 return pwd.getpwuid(xid).pw_name
277 Get slice xid ("princeton_mlh") from slice name ("500" or "princeton_mlh")
282 if slice == "default":
289 return pwd.getpwnam(slice).pw_uid
295 def run(cmd, input = None):
297 Shortcut for running a shell command
302 sys.stderr.write("Executing: " + cmd + "\n")
304 fileobj = os.popen(cmd, "r")
305 output = fileobj.readlines()
307 fileobj = os.popen(cmd, "w")
310 if fileobj.close() is None:
319 Shortcut for running a tc command
322 return run(TC + " " + cmd)
327 Turn off all queing. Stops all slice HTBS and reverts to pfifo_fast (the default).
331 tc("qdisc del dev %s root" % dev)
335 def init(dev = dev, bwcap = bwmax):
337 (Re)initialize the bandwidth limits on this node
340 # Load the module used to manage exempt classes
341 run("/sbin/modprobe ip_set_iphash")
343 # Save current settings
344 paramslist = get(None, dev)
346 # Delete root qdisc 1: if it exists. This will also automatically
347 # delete any child classes.
348 for line in tc("qdisc show dev %s" % dev):
349 # Search for the root qdisc 1:
350 m = re.match(r"qdisc htb 1:", line)
352 tc("qdisc del dev %s root handle 1:" % dev)
355 # Initialize HTB. The "default" clause specifies that if a packet
356 # fails classification, it should go into the class with handle
358 tc("qdisc add dev %s root handle 1: htb default %x" % \
359 (dev, default_minor | default_xid))
361 # Set up a parent class from which all subclasses borrow.
362 tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
365 # Set up a subclass that represents the node bandwidth cap. We
366 # allow each slice to borrow up to this rate, so it is also
367 # usually the "ceil" rate for each slice.
368 tc("class add dev %s parent 1:1 classid 1:10 htb rate %dbit ceil %dbit" % \
371 # Set up a subclass for DRL(Distributed Rate Limiting).
372 # DRL will directly modify that subclass implementing the site limits.
373 tc("class add dev %s parent 1:10 classid 1:100 htb rate %dbit ceil %dbit" % \
377 # Set up a subclass that represents "exemption" from the node
378 # bandwidth cap. Once the node bandwidth cap is reached, bandwidth
379 # to exempt destinations can still be fairly shared up to bwmax.
380 tc("class add dev %s parent 1:1 classid 1:20 htb rate %dbit ceil %dbit" % \
383 # Set up the root class (and tell VNET what it is). Packets sent
384 # by root end up here and are capped at the node bandwidth
386 #on(root_xid, dev, share = root_share)
388 # file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | default_minor | root_xid))
392 # Set up the default class. Packets that fail classification end
394 on(default_xid, dev, share = default_share)
396 # Restore old settings
399 minexemptrate, maxexemptrate,
400 bytes, exemptbytes) in paramslist:
401 if xid not in (root_xid, default_xid):
402 on(xid, dev, share, minrate, maxrate, minexemptrate, maxexemptrate)
405 def get(xid = None, dev = dev):
407 Get the bandwidth limits and current byte totals for a
408 particular slice xid as a tuple (xid, share, minrate, maxrate,
409 minexemptrate, maxexemptrate, bytes, exemptbytes), or all classes
410 as a list of such tuples.
422 # class htb 1:1000 parent 1:10 leaf 1000: prio 0 quantum 8000 rate 8bit ceil 10000Kbit ...
423 # Sent 6851486 bytes 49244 pkt (dropped 0, overlimits 0 requeues 0)
425 # class htb 1:2000 parent 1:20 leaf 2000: prio 0 quantum 8000 rate 8bit ceil 1000Mbit ...
426 # Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
428 for line in tc("-s -d class show dev %s" % dev):
429 # Rate parameter line
430 params = re.match(r"class htb 1:([0-9a-f]+) parent 1:(10|20)", line)
432 stats = re.match(r".* Sent ([0-9]+) bytes", line)
434 ignore = re.match(r"class htb", line)
436 if params is not None:
438 if params.group(2) == "10":
445 bytes = 'exemptbytes'
448 id = int(params.group(1), 16) & 0x0FFF;
450 if rates.has_key(id):
457 m = re.search(r"quantum (\d+)", line)
459 rate['share'] = int(m.group(1)) / quantum
463 m = re.search(r"rate (\w+)", line)
465 rate[min] = get_tc_rate(m.group(1))
469 m = re.search(r"ceil (\w+)", line)
471 rate[max] = get_tc_rate(m.group(1))
473 # Which statistics to parse
474 rate['stats'] = bytes
478 elif stats is not None:
480 rate[rate['stats']] = int(stats.group(1))
482 elif ignore is not None:
485 # Keep parsing until we get everything
486 if rate is not None and \
487 rate.has_key('min') and rate.has_key('minexempt') and \
488 rate.has_key('max') and rate.has_key('maxexempt') and \
489 rate.has_key('bytes') and rate.has_key('exemptbytes'):
490 params = (rate['id'], rate['share'],
491 rate['min'], rate['max'],
492 rate['minexempt'], rate['maxexempt'],
493 rate['bytes'], rate['exemptbytes'])
495 # Return a list of parameters
498 elif xid == rate['id']:
499 # Return the parameters for this class
506 def on(xid, dev = dev, share = None, minrate = None, maxrate = None, minexemptrate = None, maxexemptrate = None):
508 Apply specified bandwidth limit to the specified slice xid
511 # Get defaults from current state if available
520 if minexemptrate is None:
521 minexemptrate = cap[4]
522 if maxexemptrate is None:
523 maxexemptrate = cap[5]
525 # Figure out what the current node bandwidth cap is
530 share = default_share
534 minrate = get_tc_rate(minrate)
538 maxrate = get_tc_rate(maxrate)
539 if minexemptrate is None:
540 minexemptrate = minrate
542 minexemptrate = get_tc_rate(minexemptrate)
543 if maxexemptrate is None:
544 maxexemptrate = bwmax
546 maxexemptrate = get_tc_rate(maxexemptrate)
555 if minrate > maxrate:
557 if maxexemptrate < bwmin:
558 maxexemptrate = bwmin
559 if maxexemptrate > bwmax:
560 maxexemptrate = bwmax
561 if minexemptrate < bwmin:
562 minexemptrate = bwmin
563 if minexemptrate > maxexemptrate:
564 minexemptrate = maxexemptrate
566 # Set up subclasses for the slice
567 tc("class replace dev %s parent 1:100 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
568 (dev, default_minor | xid, minrate, maxrate, share * quantum))
570 tc("class replace dev %s parent 1:20 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
571 (dev, exempt_minor | xid, minexemptrate, maxexemptrate, share * quantum))
573 # Attach a FIFO to each subclass, which helps to throttle back
574 # processes that are sending faster than the token buckets can
576 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
577 (dev, default_minor | xid, default_minor | xid))
579 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
580 (dev, exempt_minor | xid, exempt_minor | xid))
583 def set(xid, share = None, minrate = None, maxrate = None, minexemptrate = None, maxexemptrate = None, dev = dev ):
584 on(xid = xid, dev = dev, share = share,
585 minrate = minrate, maxrate = maxrate,
586 minexemptrate = minexemptrate, maxexemptrate = maxexemptrate)
589 # Remove class associated with specified slice xid. If further packets
590 # are seen from this slice, they will be classified into the default
592 def off(xid, dev = dev):
594 Remove class associated with specified slice xid. If further
595 packets are seen from this slice, they will be classified into the
596 default class 1:1FFF.
601 tc("class del dev %s classid 1:%x" % (dev, default_minor | xid))
602 tc("class del dev %s classid 1:%x" % (dev, exempt_minor | xid))
605 def exempt_init(group_name, node_ips):
607 Initialize the list of destinations exempt from the node bandwidth
611 # Check of set exists
612 set = run("/sbin/ipset -S " + group_name)
614 # Create a hashed IP set of all of these destinations
615 lines = ["-N %s iphash" % group_name]
616 add_cmd = "-A %s " % group_name
617 lines += [(add_cmd + ip) for ip in node_ips]
619 restore = "\n".join(lines) + "\n"
620 run("/sbin/ipset -R", restore)
622 # Check all hosts and add missing.
623 for nodeip in node_ips:
624 if not run("/sbin/ipset -T %s %s" % (group_name, nodeip)):
625 run("/sbin/ipset -A %s %s" % (group_name, nodeip))
629 bwcap_description = format_tc_rate(get_bwcap())
634 %s [OPTION]... [COMMAND] [ARGUMENT]...
637 -d device Network interface (default: %s)
638 -r rate Node bandwidth cap (default: %s)
639 -q quantum Share multiplier (default: %d bytes)
640 -n Print rates in numeric bits per second
641 -v Enable verbose debug messages
646 (Re)initialize all bandwidth parameters
647 on slice [share|-] [minrate|-] [maxrate|-] [minexemptrate|-] [maxexemptrate|-]
648 Set bandwidth parameter(s) for the specified slice
650 Remove all bandwidth parameters for the specified slice
652 Get all bandwidth parameters for all slices
654 Get bandwidth parameters for the specified slice
655 """ % (sys.argv[0], dev, bwcap_description, quantum)
660 global dev, quantum, verbose
666 (opts, argv) = getopt.getopt(sys.argv[1:], "d:nr:q:vh")
667 for (opt, optval) in opts:
673 bwcap = get_tc_rate(optval)
675 quantum = int(optval)
685 if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1):
687 init(dev, get_tc_rate(bwcap))
689 elif argv[0] == "get" or argv[0] == "show":
692 # Show a particular slice
693 xid = get_xid(argv[1])
695 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
697 params = get(xid, dev)
701 paramslist = [params]
704 paramslist = get(None, dev)
708 minexemptrate, maxexemptrate,
709 bytes, exemptbytes) in paramslist:
710 slice = get_slice(xid)
712 # Orphaned (not associated with a slice) class
715 print "%s %d %d %d %d %d %d %d" % \
718 minexemptrate, maxexemptrate,
721 print "%s %d %s %s %s %s %s %s" % \
723 format_tc_rate(minrate), format_tc_rate(maxrate),
724 format_tc_rate(minexemptrate), format_tc_rate(maxexemptrate),
725 format_bytes(bytes), format_bytes(exemptbytes))
729 xid = get_xid(argv[1])
731 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
734 if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace" or argv[0] == "set":
738 # ... share, minrate, maxrate, minexemptrate, maxexemptrate
739 casts = [int, get_tc_rate, get_tc_rate, get_tc_rate, get_tc_rate]
740 for i, arg in enumerate(argv[2:]):
746 args.append(casts[i](arg))
749 elif argv[0] == "off" or argv[0] == "del":
760 if __name__ == '__main__':