3 # Bandwidth limit script to run on PlanetLab nodes. The intent is to use
4 # the Hierarchical Token Bucket (HTB) queueing discipline (qdisc) to:
6 # 1. Cap the total output bandwidth of the node at a specified rate
9 # 2. Allow slices to fairly share this rate. Some slices have more shares than others.
12 # For instance, if the node is capped at 5 Mbps, there are N slices,
13 # and each slice has 1 share, then each slice should get at least 5/N
14 # Mbps of bandwidth. How HTB is implemented makes this statement a
15 # little too simplistic. What it really means is that during any
16 # single time period, only a certain number of bytes can be sent onto
17 # the wire. Each slice is guaranteed that at least some small number
18 # of its bytes will be sent. Whatever is left over from the budget is
19 # split in proportion to the number of shares each slice has.
21 # The root context is exempt from this sharing and can send as much as it needs, up to the node bandwidth cap.
26 # 1. http://lartc.org/howto for how to use tc
27 # 2. http://luxik.cdi.cz/~devik/qos/htb/ for info on HTB
29 # Andy Bavier <acb@cs.princeton.edu>
30 # Mark Huang <mlhuang@cs.princeton.edu>
31 # Copyright (C) 2006 The Trustees of Princeton University
36 import sys, os, re, getopt
39 # Where the tc binary lives.
48 # guarantee is the minimum rate in bits per second that each slice is
49 # guaranteed. The value of this parameter is fairly meaningless, since
50 # it is unlikely that every slice will try to transmit full blast
51 # simultaneously. It just needs to be small enough so that the total
52 # of all outstanding guarantees is less than or equal to the node
53 # bandwidth cap (see below). A node with a 500kbit cap (the minimum
54 # recommended) can support up to 500kbit/1000 = 500 slices.
57 # quantum is the maximum number of bytes that can be borrowed by a
58 # share (or slice, if each slice gets 1 share) in one time period
59 # (with HZ=1000, 1 ms). If multiple slices are competing for bandwidth
60 # above their guarantees, and each is attempting to borrow up to the
61 # node bandwidth cap, quantums control how the excess bandwidth is
62 # distributed. Slices with 2 shares will borrow twice the amount in
63 # one time period as slices with 1 share, so averaged over time, they
64 # will get twice as much of the excess bandwidth. The value should be
65 # as small as possible and at least 1 MTU. By default, it would be
66 # calculated as guarantee/10, but since we use such small guarantees,
67 # it's better to just set it to a value safely above 1 Ethernet MTU.
70 # cburst is the maximum number of bytes that can be burst onto the
71 # wire in one time period (with HZ=1000, 1 ms). If multiple slices
72 # have data queued for transmission, cbursts control how long each
73 # slice can have the wire for. If not specified, it is set to the
74 # smallest possible value that would enable the slice's "ceil" rate
75 # (usually the node bandwidth cap) to be reached if a slice were able
76 # to borrow enough bandwidth to do so. For now, it's unclear how or if
77 # to relate this to the notion of shares, so just let tc set the default.
80 # bwcap is the node bandwidth cap in tc format (see below for
81 # supported suffixes), read in from /etc/planetlab/bwcap. We allow
82 # each slice to borrow up to this rate, so it is also usually the
83 # "ceil" rate for each slice. -1 means disabled.
84 bwcap_file = "/etc/planetlab/bwcap"
87 # There is another parameter that controls how bandwidth is allocated
88 # between slices on nodes that is outside the scope of HTB. We enforce
89 # a 16 GByte/day total limit on each slice, which works out to about
90 # 1.5mbit. If a slice exceeds this byte limit before the day finishes,
91 # it is capped at (i.e., its "ceil" rate is set to) the smaller of the
92 # node bandwidth cap or 1.5mbit. pl_mom is in charge of enforcing this
93 # rule and executes this script to override "ceil".
95 # root_minor is the special class for the root context. The root
96 # context is exempt from minrate and fair sharing.
99 # default_minor is the special default class for unclassifiable
100 # packets. Packets should not be classified here very often. They can
101 # be if a slice's HTB class is deleted before its processes are.
102 default_minor = 0xFFFF
104 # See tc_util.c and http://physics.nist.gov/cuu/Units/binary.html. Be
105 # warned that older versions of tc interpret "kbps", "mbps", "mbit",
106 # and "kbit" to mean (in this system) "kibps", "mibps", "mibit", and
107 # "kibit" and that if an older version is installed, all rates will
108 # be off by a small fraction.
116 "gibit": 1024*1024*1024,
118 "tibit": 1024*1024*1024*1024,
119 "tbit": 1000000000000,
123 "mibps": 8*1024*1024,
125 "gibps": 8*1024*1024*1024,
127 "tibps": 8*1024*1024*1024*1024,
128 "tbps": 8000000000000
132 # Parses a tc rate string (e.g., 1.5mbit) into bits/second
134 m = re.match(r"([0-9.]+)(\D*)", s)
137 suffix = m.group(2).lower()
138 if suffixes.has_key(suffix):
139 return int(float(m.group(1)) * suffixes[suffix])
# Formats a rate in bits/second as a tc rate string
def format_tc_rate(rate):
    """
    Return rate (a number of bits per second) as a tc rate string.

    The largest decimal unit that does not exceed the rate is used:
    "mbit" from 1000000 bits/second, "kbit" from 1000 bits/second, and
    plain "bit" below that. The value is rounded to a whole number of
    the chosen unit, so the result is meant for display and may not
    round-trip exactly.
    """

    # Each return must be guarded by its unit threshold; without the
    # guards only the first return would ever be reached.
    if rate >= 1000000:
        return "%.0fmbit" % (rate / 1000000.)
    elif rate >= 1000:
        return "%.0fkbit" % (rate / 1000.)
    else:
        return "%.0fbit" % rate
154 # Parse /etc/planetlab/bwcap. XXX Should get this from the API
160 fp = open(bwcap_file, "r")
161 line = fp.readline().strip()
163 bwcap = get_tc_rate(line)
168 # Before doing anything else, parse the node bandwidth cap file
# Get slice xid (500) from slice name ("500" or "princeton_mlh") or
# slice name ("princeton_mlh") from slice xid (500).
def get_slice(xid_or_name):
    """
    Translate between slice xids and slice names using /etc/passwd.

    xid_or_name may be an integer xid, a uid given as a string ("500"),
    or an account name ("princeton_mlh").

    Returns "default" for default_minor and "root" for root_minor; the
    account name when given an integer xid that appears as a uid in
    /etc/passwd; the integer xid when given a matching uid string or
    account name; and None when nothing matches.
    """

    # The special classes do not depend on any passwd entry, so resolve
    # them once up front instead of re-testing them for every line.
    if xid_or_name == default_minor:
        # Convert 0xffff into "default"
        return "default"
    if xid_or_name == root_minor:
        # Convert 0x2 into "root"
        return "root"

    labels = ['account', 'password', 'uid', 'gid', 'gecos', 'directory', 'shell']

    # open() rather than file(): it exists in every Python version, and
    # the explicit close releases the handle on every exit path.
    passwd = open("/etc/passwd")
    try:
        for line in passwd:
            # Skip blank lines and comments
            entry = line.strip()
            if entry == '' or entry.startswith('#'):
                continue

            # princeton_mlh:x:...
            fields = entry.split(':')
            if len(fields) < len(labels):
                # Too few fields to be a passwd entry
                continue

            # {'account': 'princeton_mlh', 'password': 'x', ...}
            pw = dict(zip(labels, fields))
            try:
                uid = int(pw['uid'])
            except ValueError:
                # Non-numeric uid field: skip the entry rather than abort
                continue

            if xid_or_name == uid:
                # Convert xid into name
                return pw['account']
            elif pw['uid'] == xid_or_name or pw['account'] == xid_or_name:
                # Convert name into xid
                return uid
    finally:
        passwd.close()

    # No matching entry
    return None
203 # Shortcut for running a tc command
207 sys.stderr.write("Executing: " + TC + " " + cmd + "\n")
208 fileobj = os.popen(TC + " " + cmd, "r")
209 output = fileobj.readlines()
210 if fileobj.close() is None:
217 # (Re)initialize the bandwidth limits on this node
219 # Save current state (if any)
220 caps = get(dev = DEV)
222 # Delete root qdisc 1: if it exists. This will also automatically
223 # delete any child classes.
224 for line in tc("qdisc show dev %s" % dev):
225 # Search for the root qdisc 1:
226 m = re.match(r"qdisc htb 1:", line)
228 tc("qdisc del dev %s root handle 1:" % dev)
235 # Initialize HTB. The "default" clause specifies that if a packet
236 # fails classification, it should go into the class with handle FFFF.
238 tc("qdisc add dev %s root handle 1: htb default FFFF" % dev)
240 # Set up the parent class that represents the node bandwidth
241 # cap; in other words, the class from which all others borrow.
242 tc("class add dev %s parent 1: classid 1:1 htb rate %dbit" % \
245 # Set up the root class (and tell VNET what it is). Packets sent
246 # by root end up here and are capped at the node bandwidth cap.
248 on(root_minor, dev, minrate = bwcap, maxrate = bwcap)
249 file("/proc/sys/vnet/root_class", "w").write("%d" % ((1 << 16) | root_minor))
251 # Set up the default class. Packets that fail classification end up here.
253 on(default_minor, dev, maxrate = bwcap)
255 # Reapply bandwidth caps. If the node bandwidth cap is now lower
256 # than it was before, "ceil" for each class will be lowered. XXX
257 # If the node bandwidth cap is now higher than it was before,
258 # "ceil" for each class should be raised, but we have no idea
259 # whether the lower cap was put on by pl_mom or by an admin, so it
260 # is left as it was before, at least until pl_mom gets around to
261 # resetting each slice's cap at the beginning of the next
262 # day. What *should* happen is that Node Manager should control
263 # both the application of the node bandwidth cap and the
264 # application of the per-slice bandwidth caps, and there should be
265 # only one external caller of this script (pl_mom). Even then,
266 # pl_mom should probably be merged into Node Manager at some point.
268 for (xid, share, minrate, maxrate) in caps:
269 if xid != root_minor and xid != default_minor:
270 on(xid, dev, share = share, minrate = minrate, maxrate = maxrate)
273 # Get the bandwidth limits for a particular slice xid as a tuple (xid,
274 # share, minrate, maxrate), or all classes as a list of tuples.
275 def get(xid = None, dev = DEV):
281 # class htb 1:2 parent 1:1 leaf 2: prio 0 rate 10Mbit ceil 10Mbit burst 14704b cburst 14704b
282 for line in tc("-d class show dev %s" % dev):
283 # Search for child classes of 1:1
284 m = re.match(r"class htb 1:([0-9a-f]+) parent 1:1", line)
288 # If we are looking for a particular class
289 classid = int(m.group(1), 16)
290 if xid is not None and xid != classid:
295 m = re.search(r"quantum (\d+)", line)
297 share = int(m.group(1)) / quantum
301 m = re.search(r"rate (\w+)", line)
303 minrate = get_tc_rate(m.group(1))
307 m = re.search(r"ceil (\w+)", line)
309 maxrate = get_tc_rate(m.group(1))
312 # Return a list of parameters
313 ret.append((classid, share, minrate, maxrate))
315 # Return the parameters for this class
316 ret = (classid, share, minrate, maxrate)
322 # Apply specified bandwidth limit to the specified slice xid
323 def on(xid, dev = DEV, share = None, minrate = None, maxrate = None):
324 # Get defaults from current state if available
342 # Allow slices to burst up to the node bandwidth cap by default.
343 maxrate = min(maxrate, bwcap)
345 # Set up a class for the slice.
346 tc("class replace dev %s parent 1:1 classid 1:%x htb rate %dbit ceil %dbit quantum %d" % \
347 (dev, xid, minrate, maxrate, share * quantum))
349 # Attach a FIFO to the class, which helps to throttle back
350 # processes that are sending faster than the token bucket can
352 tc("qdisc replace dev %s parent 1:%x handle %x pfifo" % \
# Delete the HTB class that belongs to the given slice xid on the given
# device. According to the original note, any packets seen from the
# slice afterwards are classified into the default class.
def off(xid, dev = DEV):
    # Class ids are formatted in hex under the 1: root handle
    delete_cmd = "class del dev %s classid 1:%x" % (dev, xid)
    tc(delete_cmd)
365 bwcap_description = "disabled"
367 bwcap_description = "%d bits/second" % bwcap
372 %s [OPTION]... [COMMAND] [ARGUMENT]...
374 Options (override configuration file):
375 -f file Configuration file (default: %s)
376 -d device Network interface (default: %s)
377 -r rate Node bandwidth cap (default: %s)
378 -g guarantee Default minimum slice rate (default: %d bits/second)
379 -q quantum Share multiplier (default: %d bytes)
384 (Re)load configuration and (re)initialize bandwidth caps.
387 on slice [share] [minrate] [maxrate]
388 Set bandwidth cap for the specified slice
390 Remove all bandwidth caps
392 Remove bandwidth caps for the specified slice
394 Get all bandwidth caps
396 Get bandwidth caps for the specified slice
398 Get maxrate for the specified slice
400 Set maxrate for the specified slice
401 """ % (sys.argv[0], bwcap_file, DEV, bwcap_description, guarantee, quantum)
406 global DEV, bwcap_file, bwcap, guarantee, quantum, verbose
408 (opts, argv) = getopt.getopt(sys.argv[1:], "f:d:r:g:q:vh")
409 for (opt, optval) in opts:
416 bwcap = get_tc_rate(optval)
418 guarantee = get_tc_rate(optval)
420 quantum = int(optval)
427 if argv[0] == "init" or (argv[0] == "on" and len(argv) == 1):
431 elif argv[0] == "off" and len(argv) == 1:
435 sys.stderr.write("Warning: all configured bandwidth limits have been removed\n")
437 elif argv[0] == "get" or argv[0] == "show":
440 # Show a particular slice
441 xid = get_slice(argv[1])
443 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
445 caps = [get(xid, DEV)]
448 caps = get(None, DEV)
450 for (xid, share, minrate, maxrate) in caps:
451 slice = get_slice(xid)
453 # Orphaned (not associated with a slice) class
455 print "%s: share %d minrate %s maxrate %s" % \
456 (slice, share, format_tc_rate(minrate), format_tc_rate(maxrate))
460 xid = get_slice(argv[1])
462 sys.stderr.write("Error: Invalid slice name or context '%s'\n" % argv[1])
465 if argv[0] == "on" or argv[0] == "add" or argv[0] == "replace":
469 # ... share, minrate, maxrate
470 casts = [int, get_tc_rate, get_tc_rate]
471 for i, arg in enumerate(argv[2:]):
474 args.append(casts[i](arg))
477 elif argv[0] == "off" or argv[0] == "del":
481 # Backward compatibility with old resman script
482 elif argv[0] == "getcap":
486 (xid, share, minrate, maxrate) = cap
487 print format_tc_rate(maxrate)
489 # Backward compatibility with old resman script
490 elif argv[0] == "setcap":
493 on(xid, DEV, maxrate = get_tc_rate(argv[2]))
504 if __name__ == '__main__':