From f29465c44ae1678ffd8c40057af0900dc40acd12 Mon Sep 17 00:00:00 2001 From: Mark Huang Date: Wed, 22 Feb 2006 21:38:27 +0000 Subject: [PATCH] - fc4 additions --- ip.8 | 1809 ++++++++++++++++++++++++++++++++++++++++++++++ tc-cbq-details.8 | 425 +++++++++++ tc-cbq.8 | 353 +++++++++ tc-htb.8 | 150 ++++ tc-pbfifo.8 | 72 ++ tc-pfifo_fast.8 | 59 ++ tc-prio.8 | 187 +++++ tc-red.8 | 131 ++++ tc-sfq.8 | 107 +++ tc-tbf.8 | 138 ++++ tc.8 | 348 +++++++++ 11 files changed, 3779 insertions(+) create mode 100644 ip.8 create mode 100644 tc-cbq-details.8 create mode 100644 tc-cbq.8 create mode 100644 tc-htb.8 create mode 100644 tc-pbfifo.8 create mode 100644 tc-pfifo_fast.8 create mode 100644 tc-prio.8 create mode 100644 tc-red.8 create mode 100644 tc-sfq.8 create mode 100644 tc-tbf.8 create mode 100644 tc.8 diff --git a/ip.8 b/ip.8 new file mode 100644 index 0000000..50e4419 --- /dev/null +++ b/ip.8 @@ -0,0 +1,1809 @@ +.TH IP 8 "17 January 2002" "iproute2" "Linux" +.SH NAME +ip \- show / manipulate routing, devices, policy routing and tunnels +.SH SYNOPSIS + +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ] " OBJECT " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OBJECT " := { " +.BR link " | " addr " | " route " | " rule " | " neigh " | " tunnel " | "\ +maddr " | " mroute " | " monitor " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] | +\fB\-s\fR[\fItatistics\fR] | +\fB\-r\fR[\fIesolve\fR] | +\fB\-f\fR[\fIamily\fR] { +.BR inet " | " inet6 " | " ipx " | " dnet " | " link " } | " +\fB\-o\fR[\fIneline\fR] } + +.ti -8 +.BI "ip link set " DEVICE +.RB "{ " up " | " down " | " arp " { " on " | " off " } |" +.br +.BR promisc " { " on " | " off " } |" +.br +.BR allmulti " { " on " | " off " } |" +.br +.BR dynamic " { " on " | " off " } |" +.br +.BR multicast " { " on " | " off " } |" +.br +.B txqueuelen +.IR PACKETS " |" +.br +.B name +.IR NEWNAME " |" +.br +.B address +.IR LLADDR " |" +.B broadcast +.IR LLADDR " |" +.br +.B mtu +.IR MTU " }" + +.ti -8 +.B ip link show +.RI "[ " DEVICE " ]" + +.ti -8 +.BR "ip addr" " { " add " | " del " } " +.IB IFADDR " dev " STRING + +.ti -8 +.BR "ip addr" " { " show " | " flush " } [ " dev +.IR STRING " ] [ " +.B scope +.IR SCOPE-ID " ] [ " +.B to +.IR PREFIX " ] [ " FLAG-LIST " ] [ " +.B label +.IR PATTERN " ]" + +.ti -8 +.IR IFADDR " := " PREFIX " | " ADDR +.B peer +.IR PREFIX " [ " +.B broadcast +.IR ADDR " ] [ " +.B anycast +.IR ADDR " ] [ " +.B label +.IR STRING " ] [ " +.B scope +.IR SCOPE-ID " ]" + +.ti -8 +.IR SCOPE-ID " := " +.RB "[ " host " | " link " | " global " | " +.IR NUMBER " ]" + +.ti -8 +.IR FLAG-LIST " := [ " FLAG-LIST " ] " FLAG + +.ti -8 +.IR FLAG " := " +.RB "[ " permanent " | " dynamic " | " secondary " | " primary " | "\ +tentative " | " deprecated " ]" + +.ti -8 +.BR "ip route" " { " +.BR list " | " flush " } " +.I SELECTOR + +.ti -8 +.B ip route get +.IR ADDRESS " [ " +.BI from " ADDRESS " iif " STRING" +.RB " ] [ " oif +.IR STRING " ] [ " +.B tos +.IR TOS " ]" + +.ti -8 +.BR "ip route" " { " add " | " del " | " change " | " append " | "\ +replace " | " monitor " } " +.I ROUTE + +.ti -8 +.IR SELECTOR " := " +.RB "[ " root +.IR PREFIX " ] [ " +.B match +.IR PREFIX " ] [ " +.B exact +.IR PREFIX " ] [ " +.B table +.IR TABLE_ID " ] [ " +.B proto +.IR RTPROTO " ] [ " +.B type +.IR TYPE " ] [ " +.B scope +.IR SCOPE " ]" + +.ti -8 +.IR ROUTE " := " NODE_SPEC " [ " INFO_SPEC " ]" + +.ti -8 +.IR NODE_SPEC " := [ " TYPE " ] " PREFIX " [" +.B tos +.IR TOS " ] [ " +.B table +.IR TABLE_ID " ] [ " +.B proto +.IR RTPROTO " ] [ " +.B scope +.IR SCOPE " ] [ " +.B metric +.IR METRIC " ]" + +.ti -8 +.IR INFO_SPEC " := " "NH OPTIONS FLAGS" " [" +.B nexthop +.IR NH " ] ..." + +.ti -8 +.IR NH " := [ " +.B via +.IR ADDRESS " ] [ " +.B dev +.IR STRING " ] [ " +.B weight +.IR NUMBER " ] " NHFLAGS + +.ti -8 +.IR OPTIONS " := " FLAGS " [ " +.B mtu +.IR NUMBER " ] [ " +.B advmss +.IR NUMBER " ] [ " +.B rtt +.IR NUMBER " ] [ " +.B rttvar +.IR NUMBER " ] [ " +.B window +.IR NUMBER " ] [ " +.B cwnd +.IR NUMBER " ] [ " +.B ssthresh +.IR REALM " ] [ " +.B realms +.IR REALM " ]" + +.ti -8 +.IR TYPE " := [ " +.BR unicast " | " local " | " broadcast " | " multicast " | "\ +throw " | " unreachable " | " prohibit " | " blackhole " | " nat " ]" + +.ti -8 +.IR TABLE_ID " := [ " +.BR local "| " main " | " default " | " all " |" +.IR NUMBER " ]" + +.ti -8 +.IR SCOPE " := [ " +.BR host " | " link " | " global " |" +.IR NUMBER " ]" + +.ti -8 +.IR FLAGS " := [ " +.BR equalize " ]" + +.ti -8 +.IR NHFLAGS " := [ " +.BR onlink " | " pervasive " ]" + +.ti -8 +.IR RTPROTO " := [ " +.BR kernel " | " boot " | " static " |" +.IR NUMBER " ]" + +.ti -8 +.B ip rule +.RB " [ " list " | " add " | " del " ]" +.I SELECTOR ACTION + +.ti -8 +.IR SELECTOR " := [ " +.B from +.IR PREFIX " ] [ " +.B to +.IR PREFIX " ] [ " +.B tos +.IR TOS " ] [ " +.B fwmark +.IR FWMARK " ] [ " +.B dev +.IR STRING " ] [ " +.B pref +.IR NUMBER " ]" + +.ti -8 +.IR ACTION " := [ " +.B table +.IR TABLE_ID " ] [ " +.B nat +.IR ADDRESS " ] [ " +.BR prohibit " | " reject " | " unreachable " ] [ " realms +.RI "[" SRCREALM "/]" DSTREALM " ]" + +.ti -8 +.IR TABLE_ID " := [ " +.BR local " | " main " | " default " |" +.IR NUMBER " ]" + +.ti -8 +.BR "ip neigh" " { " add " | " del " | " change " | " replace " } { " +.IR ADDR " [ " +.B lladdr +.IR LLADDR " ] [ " +.BR nud " { " permanent " | " noarp " | " stale " | " reachable " } ] | " proxy +.IR ADDR " } [ " +.B dev +.IR DEV " ]" + +.ti -8 +.BR "ip neigh" " { " show " | " flush " } [ " to +.IR PREFIX " ] [ " +.B dev +.IR DEV " ] [ " +.B nud +.IR STATE " ]" + +.ti -8 +.BR "ip tunnel" " { " add " | " change " | " del " | " show " }" +.RI "[ " NAME " ]" +.br +.RB "[ " mode " { " ipip " | " gre " | " sit " } ]" +.br +.RB "[ " remote +.IR ADDR " ] [ " +.B local +.IR ADDR " ]" +.br +.RB "[ [" i "|" o "]" seq " ] [ [" i "|" o "]" key +.IR KEY " ] [ " +.RB "[" i "|" o "]" csum " ] ]" +.br +.RB "[ " ttl +.IR TTL " ] [ " +.B tos +.IR TOS " ] [ " +.RB "[" no "]" pmtudisc " ]" +.br +.RB "[ " dev +.IR PHYS_DEV " ]" + +.ti -8 +.IR ADDR " := { " IP_ADDRESS " |" +.BR any " }" + +.ti -8 +.IR TOS " := { " NUMBER " |" +.BR inherit " }" + +.ti -8 +.IR TTL " := { " 1 ".." 255 " | " +.BR inherit " }" + +.ti -8 +.IR KEY " := { " DOTTED_QUAD " | " NUMBER " }" + +.ti -8 +.BR "ip maddr" " [ " add " | " del " ]" +.IB MULTIADDR " dev " STRING + +.ti -8 +.BR "ip maddr show" " [ " dev +.IR STRING " ]" + +.ti -8 +.BR "ip mroute show" " [" +.IR PREFIX " ] [ " +.B from +.IR PREFIX " ] [ " +.B iif +.IR DEVICE " ]" + +.ti -8 +.BR "ip monitor" " [ " all " |" +.IR LISTofOBJECTS " ]" +.in -8 +.ad b + +.SH OPTIONS + +.TP +.BR "\-V" , " -Version" +print the version of the +.B ip +utility and exit. + +.TP +.BR "\-s" , " \-stats", " \-statistics" +output more information. If the option +appears twice or more, the amount of information increases. +As a rule, the information is statistics or some time values. + +.TP +.BR "\-f" , " \-family" +followed by protocol family identifier: +.BR "inet" , " inet6" +or +.B link +,enforce the protocol family to use. If the option is not present, +the protocol family is guessed from other arguments. If the rest +of the command line does not give enough information to guess the +family, +.B ip +falls back to the default one, usually +.B inet +or +.BR "any" . +.B link +is a special family identifier meaning that no networking protocol +is involved. + +.TP +.B \-4 +shortcut for +.BR "-family inet" . + +.TP +.B \-6 +shortcut for +.BR "\-family inet6" . + +.TP +.B \-0 +shortcut for +.BR "\-family link" . + +.TP +.BR "\-o" , " \-oneline" +output each record on a single line, replacing line feeds +with the +.B '\' +character. This is convenient when you want to count records +with +.BR wc (1) + or to +.BR grep (1) +the output. + +.TP +.BR "\-r" , " \-resolve" +use the system's name resolver to print DNS names instead of +host addresses. + +.SH IP - COMMAND SYNTAX + +.SS +.I OBJECT + +.TP +.B link +- network device. + +.TP +.B address +- protocol (IP or IPv6) address on a device. +.TP +.B neighbour +- ARP or NDISC cache entry. + +.TP +.B route +- routing table entry. + +.TP +.B rule +- rule in routing policy database. + +.TP +.B maddress +- multicast address. + +.TP +.B mroute +- multicast routing cache entry. + +.TP +.B tunnel +- tunnel over IP. + +.PP +The names of all objects may be written in full or +abbreviated form, f.e. +.B address +is abbreviated as +.B addr +or just +.B a. + +.SS +.I COMMAND + +Specifies the action to perform on the object. +The set of possible actions depends on the object type. +As a rule, it is possible to +.BR "add" , " delete" +and +.B show +(or +.B list +) objects, but some objects do not allow all of these operations +or have some additional commands. The +.B help +command is available for all objects. It prints +out a list of available commands and argument syntax conventions. +.sp +If no command is given, some default command is assumed. +Usually it is +.B list +or, if the objects of this class cannot be listed, +.BR "help" . + +.SH ip link - network device configuration + +.B link +is a network device and the corresponding commands +display and change the state of devices. + +.SS ip link set - change device attributes + +.TP +.BI dev " NAME " (default) +.I NAME +specifies network device to operate on. + +.TP +.BR up " and " down +change the state of the device to +.B UP +or +.BR "DOWN" . + +.TP +.BR "arp on " or " arp off" +change the +.B NOARP +flag on the device. + +.TP +.BR "multicast on " or " multicast off" +change the +.B MULTICAST +flag on the device. + +.TP +.BR "dynamic on " or " dynamic off" +change the +.B DYNAMIC +flag on the device. + +.TP +.BI name " NAME" +change the name of the device. This operation is not +recommended if the device is running or has some addresses +already configured. + +.TP +.BI txqueuelen " NUMBER" +.TP +.BI txqlen " NUMBER" +change the transmit queue length of the device. + +.TP +.BI mtu " NUMBER" +change the +.I MTU +of the device. + +.TP +.BI address " LLADDRESS" +change the station address of the interface. + +.TP +.BI broadcast " LLADDRESS" +.TP +.BI brd " LLADDRESS" +.TP +.BI peer " LLADDRESS" +change the link layer broadcast address or the peer address when +the interface is +.IR "POINTOPOINT" . + +.PP +.B Warning: +If multiple parameter changes are requested, +.B ip +aborts immediately after any of the changes have failed. +This is the only case when +.B ip +can move the system to an unpredictable state. The solution +is to avoid changing several parameters with one +.B ip link set +call. + +.SS ip link show - display device attributes + +.TP +.BI dev " NAME " (default) +.I NAME +specifies the network device to show. +If this argument is omitted all devices are listed. + +.TP +.B up +only display running interfaces. + +.SH ip address - protocol address management. + +The +.B address +is a protocol (IP or IPv6) address attached +to a network device. Each device must have at least one address +to use the corresponding protocol. It is possible to have several +different addresses attached to one device. These addresses are not +discriminated, so that the term +.B alias +is not quite appropriate for them and we do not use it in this document. +.sp +The +.B ip addr +command displays addresses and their properties, adds new addresses +and deletes old ones. + +.SS ip address add - add new protocol address. + +.TP +.BI dev " NAME" +the name of the device to add the address to. + +.TP +.BI local " ADDRESS " (default) +the address of the interface. The format of the address depends +on the protocol. It is a dotted quad for IP and a sequence of +hexadecimal halfwords separated by colons for IPv6. The +.I ADDRESS +may be followed by a slash and a decimal number which encodes +the network prefix length. + +.TP +.BI peer " ADDRESS" +the address of the remote endpoint for pointopoint interfaces. +Again, the +.I ADDRESS +may be followed by a slash and a decimal number, encoding the network +prefix length. If a peer address is specified, the local address +cannot have a prefix length. The network prefix is associated +with the peer rather than with the local address. + +.TP +.BI broadcast " ADDRESS" +the broadcast address on the interface. +.sp +It is possible to use the special symbols +.B '+' +and +.B '-' +instead of the broadcast address. In this case, the broadcast address +is derived by setting/resetting the host bits of the interface prefix. + +.TP +.BI label " NAME" +Each address may be tagged with a label string. +In order to preserve compatibility with Linux-2.0 net aliases, +this string must coincide with the name of the device or must be prefixed +with the device name followed by colon. + +.TP +.BI scope " SCOPE_VALUE" +the scope of the area where this address is valid. +The available scopes are listed in file +.BR "/etc/iproute2/rt_scopes" . +Predefined scope values are: + +.in +8 +.B global +- the address is globally valid. +.sp +.B site +- (IPv6 only) the address is site local, i.e. it is +valid inside this site. +.sp +.B link +- the address is link local, i.e. it is valid only on this device. +.sp +.B host +- the address is valid only inside this host. +.in -8 + +.SS ip address delete - delete protocol address +.B Arguments: +coincide with the arguments of +.B ip addr add. +The device name is a required argument. The rest are optional. +If no arguments are given, the first address is deleted. + +.SS ip address show - look at protocol addresses + +.TP +.BI dev " NAME " (default) +name of device. + +.TP +.BI scope " SCOPE_VAL" +only list addresses with this scope. + +.TP +.BI to " PREFIX" +only list addresses matching this prefix. + +.TP +.BI label " PATTERN" +only list addresses with labels matching the +.IR "PATTERN" . +.I PATTERN +is a usual shell style pattern. + +.TP +.BR dynamic " and " permanent +(IPv6 only) only list addresses installed due to stateless +address configuration or only list permanent (not dynamic) +addresses. + +.TP +.B tentative +(IPv6 only) only list addresses which did not pass duplicate +address detection. + +.TP +.B deprecated +(IPv6 only) only list deprecated addresses. + +.TP +.BR primary " and " secondary +only list primary (or secondary) addresses. + +.SS ip address flush - flush protocol addresses +This command flushes the protocol addresses selected by some criteria. + +.PP +This command has the same arguments as +.B show. +The difference is that it does not run when no arguments are given. + +.PP +.B Warning: +This command (and other +.B flush +commands described below) is pretty dangerous. If you make a mistake, +it will not forgive it, but will cruelly purge all the addresses. + +.PP +With the +.B -statistics +option, the command becomes verbose. It prints out the number of deleted +addresses and the number of rounds made to flush the address list. If +this option is given twice, +.B ip addr flush +also dumps all the deleted addresses in the format described in the +previous subsection. + +.SH ip neighbour - neighbour/arp tables management. + +.B neighbour +objects establish bindings between protocol addresses and +link layer addresses for hosts sharing the same link. +Neighbour entries are organized into tables. The IPv4 neighbour table +is known by another name - the ARP table. + +.P +The corresponding commands display neighbour bindings +and their properties, add new neighbour entries and delete old ones. + +.SS ip neighbour add - add a new neighbour entry +.SS ip neighbour change - change an existing entry +.SS ip neighbour replace - add a new entry or change an existing one + +These commands create new neighbour records or update existing ones. + +.TP +.BI to " ADDRESS " (default) +the protocol address of the neighbour. It is either an IPv4 or IPv6 address. + +.TP +.BI dev " NAME" +the interface to which this neighbour is attached. + +.TP +.BI lladdr " LLADDRESS" +the link layer address of the neighbour. +.I LLADDRESS +can also be +.BR "null" . + +.TP +.BI nud " NUD_STATE" +the state of the neighbour entry. +.B nud +is an abbreviation for 'Neigh bour Unreachability Detection'. +The state can take one of the following values: + +.in +8 +.B permanent +- the neighbour entry is valid forever and can be only +be removed administratively. +.sp + +.B noarp +- the neighbour entry is valid. No attempts to validate +this entry will be made but it can be removed when its lifetime expires. +.sp + +.B reachable +- the neighbour entry is valid until the reachability +timeout expires. +.sp + +.B stale +- the neighbour entry is valid but suspicious. +This option to +.B ip neigh +does not change the neighbour state if it was valid and the address +is not changed by this command. +.in -8 + +.SS ip neighbour delete - delete a neighbour entry +This command invalidates a neighbour entry. + +.PP +The arguments are the same as with +.BR "ip neigh add" , +except that +.B lladdr +and +.B nud +are ignored. + +.PP +.B Warning: +Attempts to delete or manually change a +.B noarp +entry created by the kernel may result in unpredictable behaviour. +Particularly, the kernel may try to resolve this address even +on a +.B NOARP +interface or if the address is multicast or broadcast. + +.SS ip neighbour show - list neighbour entries + +This commands displays neighbour tables. + +.TP +.BI to " ADDRESS " (default) +the prefix selecting the neighbours to list. + +.TP +.BI dev " NAME" +only list the neighbours attached to this device. + +.TP +.B unused +only list neighbours which are not currently in use. + +.TP +.BI nud " NUD_STATE" +only list neighbour entries in this state. +.I NUD_STATE +takes values listed below or the special value +.B all +which means all states. This option may occur more than once. +If this option is absent, +.B ip +lists all entries except for +.B none +and +.BR "noarp" . + +.SS ip neighbour flush - flush neighbour entries +This command flushes neighbour tables, selecting +entries to flush by some criteria. + +.PP +This command has the same arguments as +.B show. +The differences are that it does not run when no arguments are given, +and that the default neighbour states to be flushed do not include +.B permanent +and +.BR "noarp" . + +.PP +With the +.B -statistics +option, the command becomes verbose. It prints out the number of +deleted neighbours and the number of rounds made to flush the +neighbour table. If the option is given +twice, +.B ip neigh flush +also dumps all the deleted neighbours. + +.SH ip route - routing table management +Manipulate route entries in the kernel routing tables keep +information about paths to other networked nodes. +.sp +.B Route types: + +.in +8 +.B unicast +- the route entry describes real paths to the destinations covered +by the route prefix. + +.sp +.B unreachable +- these destinations are unreachable. Packets are discarded and the +ICMP message +.I host unreachable +is generated. +The local senders get an +.I EHOSTUNREACH +error. + +.sp +.B blackhole +- these destinations are unreachable. Packets are discarded silently. +The local senders get an +.I EINVAL +error. + +.sp +.B prohibit +- these destinations are unreachable. Packets are discarded and the +ICMP message +.I communication administratively prohibited +is generated. The local senders get an +.I EACCES +error. + +.sp +.B local +- the destinations are assigned to this host. The packets are looped +back and delivered locally. + +.sp +.B broadcast +- the destinations are broadcast addresses. The packets are sent as +link broadcasts. + +.sp +.B throw +- a special control route used together with policy rules. If such a +route is selected, lookup in this table is terminated pretending that +no route was found. Without policy routing it is equivalent to the +absence of the route in the routing table. The packets are dropped +and the ICMP message +.I net unreachable +is generated. The local senders get an +.I ENETUNREACH +error. + +.sp +.B nat +- a special NAT route. Destinations covered by the prefix +are considered to be dummy (or external) addresses which require translation +to real (or internal) ones before forwarding. The addresses to translate to +are selected with the attribute +.BR "via" . + +.sp +.B anycast +.RI "- " "not implemented" +the destinations are +.I anycast +addresses assigned to this host. They are mainly equivalent +to +.B local +with one difference: such addresses are invalid when used +as the source address of any packet. + +.sp +.B multicast +- a special type used for multicast routing. It is not present in +normal routing tables. +.in -8 + +.P +.B Route tables: +Linux-2.x can pack routes into several routing +tables identified by a number in the range from 1 to 255 or by +name from the file +.B /etc/iproute2/rt_tables +. By default all normal routes are inserted into the +.B main +table (ID 254) and the kernel only uses this table when calculating routes. + +.sp +Actually, one other table always exists, which is invisible but +even more important. It is the +.B local +table (ID 255). This table +consists of routes for local and broadcast addresses. The kernel maintains +this table automatically and the administrator usually need not modify it +or even look at it. + +The multiple routing tables enter the game when +.I policy routing +is used. + +.SS ip route add - add new route +.SS ip route change - change route +.SS ip route replace - change or add new one + +.TP +.BI to " TYPE PREFIX " (default) +the destination prefix of the route. If +.I TYPE +is omitted, +.B ip +assumes type +.BR "unicast" . +Other values of +.I TYPE +are listed above. +.I PREFIX +is an IP or IPv6 address optionally followed by a slash and the +prefix length. If the length of the prefix is missing, +.B ip +assumes a full-length host route. There is also a special +.I PREFIX +.B default +- which is equivalent to IP +.B 0/0 +or to IPv6 +.BR "::/0" . + +.TP +.BI tos " TOS" +.TP +.BI dsfield " TOS" +the Type Of Service (TOS) key. This key has no associated mask and +the longest match is understood as: First, compare the TOS +of the route and of the packet. If they are not equal, then the packet +may still match a route with a zero TOS. +.I TOS +is either an 8 bit hexadecimal number or an identifier +from +.BR "/etc/iproute2/rt_dsfield" . + +.TP +.BI metric " NUMBER" +.TP +.BI preference " NUMBER" +the preference value of the route. +.I NUMBER +is an arbitrary 32bit number. + +.TP +.BI table " TABLEID" +the table to add this route to. +.I TABLEID +may be a number or a string from the file +.BR "/etc/iproute2/rt_tables" . +If this parameter is omitted, +.B ip +assumes the +.B main +table, with the exception of +.BR local " , " broadcast " and " nat +routes, which are put into the +.B local +table by default. + +.TP +.BI dev " NAME" +the output device name. + +.TP +.BI via " ADDRESS" +the address of the nexthop router. Actually, the sense of this field +depends on the route type. For normal +.B unicast +routes it is either the true next hop router or, if it is a direct +route installed in BSD compatibility mode, it can be a local address +of the interface. For NAT routes it is the first address of the block +of translated IP destinations. + +.TP +.BI src " ADDRESS" +the source address to prefer when sending to the destinations +covered by the route prefix. + +.TP +.BI realm " REALMID" +the realm to which this route is assigned. +.I REALMID +may be a number or a string from the file +.BR "/etc/iproute2/rt_realms" . + +.TP +.BI mtu " MTU" +.TP +.BI "mtu lock" " MTU" +the MTU along the path to the destination. If the modifier +.B lock +is not used, the MTU may be updated by the kernel due to +Path MTU Discovery. If the modifier +.B lock +is used, no path MTU discovery will be tried, all packets +will be sent without the DF bit in IPv4 case or fragmented +to MTU for IPv6. + +.TP +.BI window " NUMBER" +the maximal window for TCP to advertise to these destinations, +measured in bytes. It limits maximal data bursts that our TCP +peers are allowed to send to us. + +.TP +.BI rtt " NUMBER" +the initial RTT ('Round Trip Time') estimate. + +.TP +.BI rttvar " NUMBER " "(2.3.15+ only)" +the initial RTT variance estimate. + +.TP +.BI ssthresh " NUMBER " "(2.3.15+ only)" +an estimate for the initial slow start threshold. + +.TP +.BI cwnd " NUMBER " "(2.3.15+ only)" +the clamp for congestion window. It is ignored if the +.B lock +flag is not used. + +.TP +.BI advmss " NUMBER " "(2.3.15+ only)" +the MSS ('Maximal Segment Size') to advertise to these +destinations when establishing TCP connections. If it is not given, +Linux uses a default value calculated from the first hop device MTU. +(If the path to these destination is asymmetric, this guess may be wrong.) + +.TP +.BI reordering " NUMBER " "(2.3.15+ only)" +Maximal reordering on the path to this destination. +If it is not given, Linux uses the value selected with +.B sysctl +variable +.BR "net/ipv4/tcp_reordering" . + +.TP +.BI nexthop " NEXTHOP" +the nexthop of a multipath route. +.I NEXTHOP +is a complex value with its own syntax similar to the top level +argument lists: + +.in +8 +.BI via " ADDRESS" +- is the nexthop router. +.sp + +.BI dev " NAME" +- is the output device. +.sp + +.BI weight " NUMBER" +- is a weight for this element of a multipath +route reflecting its relative bandwidth or quality. +.in -8 + +.TP +.BI scope " SCOPE_VAL" +the scope of the destinations covered by the route prefix. +.I SCOPE_VAL +may be a number or a string from the file +.BR "/etc/iproute2/rt_scopes" . +If this parameter is omitted, +.B ip +assumes scope +.B global +for all gatewayed +.B unicast +routes, scope +.B link +for direct +.BR unicast " and " broadcast +routes and scope +.BR host " for " local +routes. + +.TP +.BI protocol " RTPROTO" +the routing protocol identifier of this route. +.I RTPROTO +may be a number or a string from the file +.BR "/etc/iproute2/rt_protos" . +If the routing protocol ID is not given, +.B ip assumes protocol +.B boot +(i.e. it assumes the route was added by someone who doesn't +understand what they are doing). Several protocol values have +a fixed interpretation. +Namely: + +.in +8 +.B redirect +- the route was installed due to an ICMP redirect. +.sp + +.B kernel +- the route was installed by the kernel during autoconfiguration. +.sp + +.B boot +- the route was installed during the bootup sequence. +If a routing daemon starts, it will purge all of them. +.sp + +.B static +- the route was installed by the administrator +to override dynamic routing. Routing daemon will respect them +and, probably, even advertise them to its peers. +.sp + +.B ra +- the route was installed by Router Discovery protocol. +.in -8 + +.sp +The rest of the values are not reserved and the administrator is free +to assign (or not to assign) protocol tags. + +.TP +.B onlink +pretend that the nexthop is directly attached to this link, +even if it does not match any interface prefix. + +.TP +.B equalize +allow packet by packet randomization on multipath routes. +Without this modifier, the route will be frozen to one selected +nexthop, so that load splitting will only occur on per-flow base. +.B equalize +only works if the kernel is patched. + +.SS ip route delete - delete route + +.B ip route del +has the same arguments as +.BR "ip route add" , +but their semantics are a bit different. + +Key values +.RB "(" to ", " tos ", " preference " and " table ")" +select the route to delete. If optional attributes are present, +.B ip +verifies that they coincide with the attributes of the route to delete. +If no route with the given key and attributes was found, +.B ip route del +fails. + +.SS ip route show - list routes +the command displays the contents of the routing tables or the route(s) +selected by some criteria. + +.TP +.BI to " SELECTOR " (default) +only select routes from the given range of destinations. +.I SELECTOR +consists of an optional modifier +.RB "(" root ", " match " or " exact ")" +and a prefix. +.BI root " PREFIX" +selects routes with prefixes not shorter than +.IR PREFIX "." +F.e. +.BI root " 0/0" +selects the entire routing table. +.BI match " PREFIX" +selects routes with prefixes not longer than +.IR PREFIX "." +F.e. +.BI match " 10.0/16" +selects +.IR 10.0/16 "," +.IR 10/8 " and " 0/0 , +but it does not select +.IR 10.1/16 " and " 10.0.0/24 . +And +.BI exact " PREFIX" +(or just +.IR PREFIX ")" +selects routes with this exact prefix. If neither of these options +are present, +.B ip +assumes +.BI root " 0/0" +i.e. it lists the entire table. + +.TP +.BI tos " TOS" +.BI dsfield " TOS" +only select routes with the given TOS. + +.TP +.BI table " TABLEID" +show the routes from this table(s). The default setting is to show +.BR table main "." +.I TABLEID +may either be the ID of a real table or one of the special values: +.sp +.in +8 +.B all +- list all of the tables. +.sp +.B cache +- dump the routing cache. +.in -8 + +.TP +.B cloned +.TP +.B cached +list cloned routes i.e. routes which were dynamically forked from +other routes because some route attribute (f.e. MTU) was updated. +Actually, it is equivalent to +.BR "table cache" "." + +.TP +.BI from " SELECTOR" +the same syntax as for +.BR to "," +but it binds the source address range rather than destinations. +Note that the +.B from +option only works with cloned routes. + +.TP +.BI protocol " RTPROTO" +only list routes of this protocol. + +.TP +.BI scope " SCOPE_VAL" +only list routes with this scope. + +.TP +.BI type " TYPE" +only list routes of this type. + +.TP +.BI dev " NAME" +only list routes going via this device. + +.TP +.BI via " PREFIX" +only list routes going via the nexthop routers selected by +.IR PREFIX "." + +.TP +.BI src " PREFIX" +only list routes with preferred source addresses selected +by +.IR PREFIX "." + +.TP +.BI realm " REALMID" +.TP +.BI realms " FROMREALM/TOREALM" +only list routes with these realms. + +.SS ip route flush - flush routing tables +this command flushes routes selected by some criteria. + +.sp +The arguments have the same syntax and semantics as the arguments of +.BR "ip route show" , +but routing tables are not listed but purged. The only difference is +the default action: +.B show +dumps all the IP main routing table but +.B flush +prints the helper page. + +.sp +With the +.B -statistics +option, the command becomes verbose. It prints out the number of +deleted routes and the number of rounds made to flush the routing +table. If the option is given +twice, +.B ip route flush +also dumps all the deleted routes in the format described in the +previous subsection. + +.SS ip route get - get a single route +this command gets a single route to a destination and prints its +contents exactly as the kernel sees it. + +.TP +.BI to " ADDRESS " (default) +the destination address. + +.TP +.BI from " ADDRESS" +the source address. + +.TP +.BI tos " TOS" +.TP +.BI dsfield " TOS" +the Type Of Service. + +.TP +.BI iif " NAME" +the device from which this packet is expected to arrive. + +.TP +.BI oif " NAME" +force the output device on which this packet will be routed. + +.TP +.B connected +if no source address +.RB "(option " from ")" +was given, relookup the route with the source set to the preferred +address received from the first lookup. +If policy routing is used, it may be a different route. + +.P +Note that this operation is not equivalent to +.BR "ip route show" . +.B show +shows existing routes. +.B get +resolves them and creates new clones if necessary. Essentially, +.B get +is equivalent to sending a packet along this path. +If the +.B iif +argument is not given, the kernel creates a route +to output packets towards the requested destination. +This is equivalent to pinging the destination +with a subsequent +.BR "ip route ls cache" , +however, no packets are actually sent. With the +.B iif +argument, the kernel pretends that a packet arrived from this interface +and searches for a path to forward the packet. + +.SH ip rule - routing policy database management + +.BR "Rule" s +in the routing policy database control the route selection algorithm. + +.P +Classic routing algorithms used in the Internet make routing decisions +based only on the destination address of packets (and in theory, +but not in practice, on the TOS field). + +.P +In some circumstances we want to route packets differently depending not only +on destination addresses, but also on other packet fields: source address, +IP protocol, transport protocol ports or even packet payload. +This task is called 'policy routing'. + +.P +To solve this task, the conventional destination based routing table, ordered +according to the longest match rule, is replaced with a 'routing policy +database' (or RPDB), which selects routes by executing some set of rules. + +.P +Each policy routing rule consists of a +.B selector +and an +.B action predicate. +The RPDB is scanned in the order of increasing priority. The selector +of each rule is applied to {source address, destination address, incoming +interface, tos, fwmark} and, if the selector matches the packet, +the action is performed. The action predicate may return with success. +In this case, it will either give a route or failure indication +and the RPDB lookup is terminated. Otherwise, the RPDB program +continues on the next rule. + +.P +Semantically, natural action is to select the nexthop and the output device. + +.P +At startup time the kernel configures the default RPDB consisting of three +rules: + +.TP +1. +Priority: 0, Selector: match anything, Action: lookup routing +table +.B local +(ID 255). +The +.B local +table is a special routing table containing +high priority control routes for local and broadcast addresses. +.sp +Rule 0 is special. It cannot be deleted or overridden. + +.TP +2. +Priority: 32766, Selector: match anything, Action: lookup routing +table +.B main +(ID 254). +The +.B main +table is the normal routing table containing all non-policy +routes. This rule may be deleted and/or overridden with other +ones by the administrator. + +.TP +3. +Priority: 32767, Selector: match anything, Action: lookup routing +table +.B default +(ID 253). +The +.B default +table is empty. It is reserved for some post-processing if no previous +default rules selected the packet. +This rule may also be deleted. + +.P +Each RPDB entry has additional +attributes. F.e. each rule has a pointer to some routing +table. NAT and masquerading rules have an attribute to select new IP +address to translate/masquerade. Besides that, rules have some +optional attributes, which routes have, namely +.BR "realms" . +These values do not override those contained in the routing tables. They +are only used if the route did not select any attributes. + +.sp +The RPDB may contain rules of the following types: + +.in +8 +.B unicast +- the rule prescribes to return the route found +in the routing table referenced by the rule. + +.B blackhole +- the rule prescribes to silently drop the packet. + +.B unreachable +- the rule prescribes to generate a 'Network is unreachable' error. + +.B prohibit +- the rule prescribes to generate 'Communication is administratively +prohibited' error. + +.B nat +- the rule prescribes to translate the source address +of the IP packet into some other value. +.in -8 + +.SS ip rule add - insert a new rule +.SS ip rule delete - delete a rule + +.TP +.BI type " TYPE " (default) +the type of this rule. The list of valid types was given in the previous +subsection. + +.TP +.BI from " PREFIX" +select the source prefix to match. + +.TP +.BI to " PREFIX" +select the destination prefix to match. + +.TP +.BI iif " NAME" +select the incoming device to match. If the interface is loopback, +the rule only matches packets originating from this host. This means +that you may create separate routing tables for forwarded and local +packets and, hence, completely segregate them. + +.TP +.BI tos " TOS" +.TP +.BI dsfield " TOS" +select the TOS value to match. + +.TP +.BI fwmark " MARK" +select the +.B fwmark +value to match. + +.TP +.BI priority " PREFERENCE" +the priority of this rule. Each rule should have an explicitly +set +.I unique +priority value. + +.TP +.BI table " TABLEID" +the routing table identifier to lookup if the rule selector matches. + +.TP +.BI realms " FROM/TO" +Realms to select if the rule matched and the routing table lookup +succeeded. Realm +.I TO +is only used if the route did not select any realm. + +.TP +.BI nat " ADDRESS" +The base of the IP address block to translate (for source addresses). +The +.I ADDRESS +may be either the start of the block of NAT addresses (selected by NAT +routes) or a local host address (or even zero). +In the last case the router does not translate the packets, but +masquerades them to this address. + +.B Warning: +Changes to the RPDB made with these commands do not become active +immediately. It is assumed that after a script finishes a batch of +updates, it flushes the routing cache with +.BR "ip route flush cache" . + +.SS ip rule show - list rules +This command has no arguments. + +.SH ip maddress - multicast addresses management + +.B maddress +objects are multicast addresses. + +.SS ip maddress show - list multicast addresses + +.TP +.BI dev " NAME " (default) +the device name. + +.SS ip maddress add - add a multicast address +.SS ip maddress delete - delete a multicast address +these commands attach/detach a static link layer multicast address +to listen on the interface. +Note that it is impossible to join protocol multicast groups +statically. This command only manages link layer addresses. + +.TP +.BI address " LLADDRESS " (default) +the link layer multicast address. + +.TP +.BI dev " NAME" +the device to join/leave this multicast address. + +.SH ip mroute - multicast routing cache management +.B mroute +objects are multicast routing cache entries created by a user level +mrouting daemon (f.e. +.B pimd +or +.B mrouted +). + +Due to the limitations of the current interface to the multicast routing +engine, it is impossible to change +.B mroute +objects administratively, so we may only display them. This limitation +will be removed in the future. + +.SS ip mroute show - list mroute cache entries + +.TP +.BI to " PREFIX " (default) +the prefix selecting the destination multicast addresses to list. + +.TP +.BI iif " NAME" +the interface on which multicast packets are received. + +.TP +.BI from " PREFIX" +the prefix selecting the IP source addresses of the multicast route. + +.SH ip tunnel - tunnel configuration +.B tunnel +objects are tunnels, encapsulating packets in IPv4 packets and then +sending them over the IP infrastructure. + +.SS ip tunnel add - add a new tunnel +.SS ip tunnel change - change an existing tunnel +.SS ip tunnel delete - destroy a tunnel + +.TP +.BI name " NAME " (default) +select the tunnel device name. + +.TP +.BI mode " MODE" +set the tunnel mode. Three modes are currently available: +.BR ipip ", " sit " and " gre "." + +.TP +.BI remote " ADDRESS" +set the remote endpoint of the tunnel. + +.TP +.BI local " ADDRESS" +set the fixed local address for tunneled packets. +It must be an address on another interface of this host. + +.TP +.BI ttl " N" +set a fixed TTL +.I N +on tunneled packets. +.I N +is a number in the range 1--255. 0 is a special value +meaning that packets inherit the TTL value. +The default value is: +.BR "inherit" . + +.TP +.BI tos " T" +.TP +.BI dsfield " T" +set a fixed TOS +.I T +on tunneled packets. +The default value is: +.BR "inherit" . + +.TP +.BI dev " NAME" +bind the tunnel to the device +.I NAME +so that tunneled packets will only be routed via this device and will +not be able to escape to another device when the route to endpoint +changes. + +.TP +.B nopmtudisc +disable Path MTU Discovery on this tunnel. +It is enabled by default. Note that a fixed ttl is incompatible +with this option: tunnelling with a fixed ttl always makes pmtu +discovery. + +.TP +.BI key " K" +.TP +.BI ikey " K" +.TP +.BI okey " K" +.RB ( " only GRE tunnels " ) +use keyed GRE with key +.IR K ". " K +is either a number or an IP address-like dotted quad. +The +.B key +parameter sets the key to use in both directions. +The +.BR ikey " and " okey +parameters set different keys for input and output. + +.TP +.BR csum ", " icsum ", " ocsum +.RB ( " only GRE tunnels " ) +generate/require checksums for tunneled packets. +The +.B ocsum +flag calculates checksums for outgoing packets. +The +.B icsum +flag requires that all input packets have the correct +checksum. The +.B csum +flag is equivalent to the combination +.BR "icsum ocsum" . + +.TP +.BR seq ", " iseq ", " oseq +.RB ( " only GRE tunnels " ) +serialize packets. +The +.B oseq +flag enables sequencing of outgoing packets. +The +.B iseq +flag requires that all input packets are serialized. +The +.B seq +flag is equivalent to the combination +.BR "iseq oseq" . +.B It isn't work. Don't use it. + +.SS ip tunnel show - list tunnels +This command has no arguments. + +.SH ip monitor and rtmon - state monitoring + +The +.B ip +utility can monitor the state of devices, addresses +and routes continuously. This option has a slightly different format. +Namely, the +.B monitor +command is the first in the command line and then the object list follows: + +.BR "ip monitor" " [ " all " |" +.IR LISTofOBJECTS " ]" + +.I OBJECT-LIST +is the list of object types that we want to monitor. +It may contain +.BR link ", " address " and " route "." +If no +.B file +argument is given, +.B ip +opens RTNETLINK, listens on it and dumps state changes in the format +described in previous sections. + +.P +If a file name is given, it does not listen on RTNETLINK, +but opens the file containing RTNETLINK messages saved in binary format +and dumps them. Such a history file can be generated with the +.B rtmon +utility. This utility has a command line syntax similar to +.BR "ip monitor" . +Ideally, +.B rtmon +should be started before the first network configuration command +is issued. F.e. if you insert: +.sp +.in +8 +rtmon file /var/log/rtmon.log +.in -8 +.sp +in a startup script, you will be able to view the full history +later. + +.P +Certainly, it is possible to start +.B rtmon +at any time. +It prepends the history with the state snapshot dumped at the moment +of starting. + +.SH HISTORY + +.B ip +was written by Alexey N. Kuznetsov and added in Linux 2.2. +.SH SEE ALSO +.BR tc (8) +.br +.RB "IP Command reference " ip-cref.ps +.br +.RB "IP tunnels " ip-cref.ps + +.SH AUTHOR + +Manpage maintained by Michail Litvak diff --git a/tc-cbq-details.8 b/tc-cbq-details.8 new file mode 100644 index 0000000..e47da62 --- /dev/null +++ b/tc-cbq-details.8 @@ -0,0 +1,425 @@ +.TH CBQ 8 "8 December 2001" "iproute2" "Linux" +.SH NAME +CBQ \- Class Based Queueing +.SH SYNOPSIS +.B tc qdisc ... dev +dev +.B ( parent +classid +.B | root) [ handle +major: +.B ] cbq avpkt +bytes +.B bandwidth +rate +.B [ cell +bytes +.B ] [ ewma +log +.B ] [ mpu +bytes +.B ] + +.B tc class ... dev +dev +.B parent +major:[minor] +.B [ classid +major:minor +.B ] cbq allot +bytes +.B [ bandwidth +rate +.B ] [ rate +rate +.B ] prio +priority +.B [ weight +weight +.B ] [ minburst +packets +.B ] [ maxburst +packets +.B ] [ ewma +log +.B ] [ cell +bytes +.B ] avpkt +bytes +.B [ mpu +bytes +.B ] [ bounded isolated ] [ split +handle +.B & defmap +defmap +.B ] [ estimator +interval timeconstant +.B ] + +.SH DESCRIPTION +Class Based Queueing is a classful qdisc that implements a rich +linksharing hierarchy of classes. It contains shaping elements as +well as prioritizing capabilities. Shaping is performed using link +idle time calculations based on the timing of dequeue events and +underlying link bandwidth. + +.SH SHAPING ALGORITHM +Shaping is done using link idle time calculations, and actions taken if +these calculations deviate from set limits. + +When shaping a 10mbit/s connection to 1mbit/s, the link will +be idle 90% of the time. If it isn't, it needs to be throttled so that it +IS idle 90% of the time. + +From the kernel's perspective, this is hard to measure, so CBQ instead +derives the idle time from the number of microseconds (in fact, jiffies) +that elapse between requests from the device driver for more data. Combined +with the knowledge of packet sizes, this is used to approximate how full or +empty the link is. + +This is rather circumspect and doesn't always arrive at proper +results. For example, what is the actual link speed of an interface +that is not really able to transmit the full 100mbit/s of data, +perhaps because of a badly implemented driver? A PCMCIA network card +will also never achieve 100mbit/s because of the way the bus is +designed - again, how do we calculate the idle time? + +The physical link bandwidth may be ill defined in case of not-quite-real +network devices like PPP over Ethernet or PPTP over TCP/IP. The effective +bandwidth in that case is probably determined by the efficiency of pipes +to userspace - which not defined. + +During operations, the effective idletime is measured using an +exponential weighted moving average (EWMA), which considers recent +packets to be exponentially more important than past ones. The Unix +loadaverage is calculated in the same way. + +The calculated idle time is subtracted from the EWMA measured one, +the resulting number is called 'avgidle'. A perfectly loaded link has +an avgidle of zero: packets arrive exactly at the calculated +interval. + +An overloaded link has a negative avgidle and if it gets too negative, +CBQ throttles and is then 'overlimit'. + +Conversely, an idle link might amass a huge avgidle, which would then +allow infinite bandwidths after a few hours of silence. To prevent +this, avgidle is capped at +.B maxidle. + +If overlimit, in theory, the CBQ could throttle itself for exactly the +amount of time that was calculated to pass between packets, and then +pass one packet, and throttle again. Due to timer resolution constraints, +this may not be feasible, see the +.B minburst +parameter below. + +.SH CLASSIFICATION +Within the one CBQ instance many classes may exist. Each of these classes +contains another qdisc, by default +.BR tc-pfifo (8). + +When enqueueing a packet, CBQ starts at the root and uses various methods to +determine which class should receive the data. If a verdict is reached, this +process is repeated for the recipient class which might have further +means of classifying traffic to its children, if any. + +CBQ has the following methods available to classify a packet to any child +classes. +.TP +(i) +.B skb->priority class encoding. +Can be set from userspace by an application with the +.B SO_PRIORITY +setsockopt. +The +.B skb->priority class encoding +only applies if the skb->priority holds a major:minor handle of an existing +class within this qdisc. +.TP +(ii) +tc filters attached to the class. +.TP +(iii) +The defmap of a class, as set with the +.B split & defmap +parameters. The defmap may contain instructions for each possible Linux packet +priority. + +.P +Each class also has a +.B level. +Leaf nodes, attached to the bottom of the class hierarchy, have a level of 0. +.SH CLASSIFICATION ALGORITHM + +Classification is a loop, which terminates when a leaf class is found. At any +point the loop may jump to the fallback algorithm. + +The loop consists of the following steps: +.TP +(i) +If the packet is generated locally and has a valid classid encoded within its +.B skb->priority, +choose it and terminate. + +.TP +(ii) +Consult the tc filters, if any, attached to this child. If these return +a class which is not a leaf class, restart loop from the class returned. +If it is a leaf, choose it and terminate. +.TP +(iii) +If the tc filters did not return a class, but did return a classid, +try to find a class with that id within this qdisc. +Check if the found class is of a lower +.B level +than the current class. If so, and the returned class is not a leaf node, +restart the loop at the found class. If it is a leaf node, terminate. +If we found an upward reference to a higher level, enter the fallback +algorithm. +.TP +(iv) +If the tc filters did not return a class, nor a valid reference to one, +consider the minor number of the reference to be the priority. Retrieve +a class from the defmap of this class for the priority. If this did not +contain a class, consult the defmap of this class for the +.B BEST_EFFORT +class. If this is an upward reference, or no +.B BEST_EFFORT +class was defined, +enter the fallback algorithm. If a valid class was found, and it is not a +leaf node, restart the loop at this class. If it is a leaf, choose it and +terminate. If +neither the priority distilled from the classid, nor the +.B BEST_EFFORT +priority yielded a class, enter the fallback algorithm. +.P +The fallback algorithm resides outside of the loop and is as follows. +.TP +(i) +Consult the defmap of the class at which the jump to fallback occured. If +the defmap contains a class for the +.B +priority +of the class (which is related to the TOS field), choose this class and +terminate. +.TP +(ii) +Consult the map for a class for the +.B BEST_EFFORT +priority. If found, choose it, and terminate. +.TP +(iii) +Choose the class at which break out to the fallback algorithm occured. Terminate. +.P +The packet is enqueued to the class which was chosen when either algorithm +terminated. It is therefore possible for a packet to be enqueued *not* at a +leaf node, but in the middle of the hierarchy. + +.SH LINK SHARING ALGORITHM +When dequeuing for sending to the network device, CBQ decides which of its +classes will be allowed to send. It does so with a Weighted Round Robin process +in which each class with packets gets a chance to send in turn. The WRR process +starts by asking the highest priority classes (lowest numerically - +highest semantically) for packets, and will continue to do so until they +have no more data to offer, in which case the process repeats for lower +priorities. + +.B CERTAINTY ENDS HERE, ANK PLEASE HELP + +Each class is not allowed to send at length though - they can only dequeue a +configurable amount of data during each round. + +If a class is about to go overlimit, and it is not +.B bounded +it will try to borrow avgidle from siblings that are not +.B isolated. +This process is repeated from the bottom upwards. If a class is unable +to borrow enough avgidle to send a packet, it is throttled and not asked +for a packet for enough time for the avgidle to increase above zero. + +.B I REALLY NEED HELP FIGURING THIS OUT. REST OF DOCUMENT IS PRETTY CERTAIN +.B AGAIN. + +.SH QDISC +The root qdisc of a CBQ class tree has the following parameters: + +.TP +parent major:minor | root +This mandatory parameter determines the place of the CBQ instance, either at the +.B root +of an interface or within an existing class. +.TP +handle major: +Like all other qdiscs, the CBQ can be assigned a handle. Should consist only +of a major number, followed by a colon. Optional. +.TP +avpkt bytes +For calculations, the average packet size must be known. It is silently capped +at a minimum of 2/3 of the interface MTU. Mandatory. +.TP +bandwidth rate +To determine the idle time, CBQ must know the bandwidth of your underlying +physical interface, or parent qdisc. This is a vital parameter, more about it +later. Mandatory. +.TP +cell +The cell size determines he granularity of packet transmission time calculations. Has a sensible default. +.TP +mpu +A zero sized packet may still take time to transmit. This value is the lower +cap for packet transmission time calculations - packets smaller than this value +are still deemed to have this size. Defaults to zero. +.TP +ewma log +When CBQ needs to measure the average idle time, it does so using an +Exponentially Weighted Moving Average which smoothes out measurements into +a moving average. The EWMA LOG determines how much smoothing occurs. Defaults +to 5. Lower values imply greater sensitivity. Must be between 0 and 31. +.P +A CBQ qdisc does not shape out of its own accord. It only needs to know certain +parameters about the underlying link. Actual shaping is done in classes. + +.SH CLASSES +Classes have a host of parameters to configure their operation. + +.TP +parent major:minor +Place of this class within the hierarchy. If attached directly to a qdisc +and not to another class, minor can be omitted. Mandatory. +.TP +classid major:minor +Like qdiscs, classes can be named. The major number must be equal to the +major number of the qdisc to which it belongs. Optional, but needed if this +class is going to have children. +.TP +weight weight +When dequeuing to the interface, classes are tried for traffic in a +round-robin fashion. Classes with a higher configured qdisc will generally +have more traffic to offer during each round, so it makes sense to allow +it to dequeue more traffic. All weights under a class are normalized, so +only the ratios matter. Defaults to the configured rate, unless the priority +of this class is maximal, in which case it is set to 1. +.TP +allot bytes +Allot specifies how many bytes a qdisc can dequeue +during each round of the process. This parameter is weighted using the +renormalized class weight described above. + +.TP +priority priority +In the round-robin process, classes with the lowest priority field are tried +for packets first. Mandatory. + +.TP +rate rate +Maximum rate this class and all its children combined can send at. Mandatory. + +.TP +bandwidth rate +This is different from the bandwidth specified when creating a CBQ disc. Only +used to determine maxidle and offtime, which are only calculated when +specifying maxburst or minburst. Mandatory if specifying maxburst or minburst. + +.TP +maxburst +This number of packets is used to calculate maxidle so that when +avgidle is at maxidle, this number of average packets can be burst +before avgidle drops to 0. Set it higher to be more tolerant of +bursts. You can't set maxidle directly, only via this parameter. + +.TP +minburst +As mentioned before, CBQ needs to throttle in case of +overlimit. The ideal solution is to do so for exactly the calculated +idle time, and pass 1 packet. However, Unix kernels generally have a +hard time scheduling events shorter than 10ms, so it is better to +throttle for a longer period, and then pass minburst packets in one +go, and then sleep minburst times longer. + +The time to wait is called the offtime. Higher values of minburst lead +to more accurate shaping in the long term, but to bigger bursts at +millisecond timescales. + +.TP +minidle +If avgidle is below 0, we are overlimits and need to wait until +avgidle will be big enough to send one packet. To prevent a sudden +burst from shutting down the link for a prolonged period of time, +avgidle is reset to minidle if it gets too low. + +Minidle is specified in negative microseconds, so 10 means that +avgidle is capped at -10us. + +.TP +bounded +Signifies that this class will not borrow bandwidth from its siblings. +.TP +isolated +Means that this class will not borrow bandwidth to its siblings + +.TP +split major:minor & defmap bitmap[/bitmap] +If consulting filters attached to a class did not give a verdict, +CBQ can also classify based on the packet's priority. There are 16 +priorities available, numbered from 0 to 15. + +The defmap specifies which priorities this class wants to receive, +specified as a bitmap. The Least Significant Bit corresponds to priority +zero. The +.B split +parameter tells CBQ at which class the decision must be made, which should +be a (grand)parent of the class you are adding. + +As an example, 'tc class add ... classid 10:1 cbq .. split 10:0 defmap c0' +configures class 10:0 to send packets with priorities 6 and 7 to 10:1. + +The complimentary configuration would then +be: 'tc class add ... classid 10:2 cbq ... split 10:0 defmap 3f' +Which would send all packets 0, 1, 2, 3, 4 and 5 to 10:1. +.TP +estimator interval timeconstant +CBQ can measure how much bandwidth each class is using, which tc filters +can use to classify packets with. In order to determine the bandwidth +it uses a very simple estimator that measures once every +.B interval +microseconds how much traffic has passed. This again is a EWMA, for which +the time constant can be specified, also in microseconds. The +.B time constant +corresponds to the sluggishness of the measurement or, conversely, to the +sensitivity of the average to short bursts. Higher values mean less +sensitivity. + + + +.SH SOURCES +.TP +o +Sally Floyd and Van Jacobson, "Link-sharing and Resource +Management Models for Packet Networks", +IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 + +.TP +o +Sally Floyd, "Notes on CBQ and Guarantee Service", 1995 + +.TP +o +Sally Floyd, "Notes on Class-Based Queueing: Setting +Parameters", 1996 + +.TP +o +Sally Floyd and Michael Speer, "Experimental Results +for Class-Based Queueing", 1998, not published. + + + +.SH SEE ALSO +.BR tc (8) + +.SH AUTHOR +Alexey N. Kuznetsov, . This manpage maintained by +bert hubert + + diff --git a/tc-cbq.8 b/tc-cbq.8 new file mode 100644 index 0000000..79fb93b --- /dev/null +++ b/tc-cbq.8 @@ -0,0 +1,353 @@ +.TH CBQ 8 "16 December 2001" "iproute2" "Linux" +.SH NAME +CBQ \- Class Based Queueing +.SH SYNOPSIS +.B tc qdisc ... dev +dev +.B ( parent +classid +.B | root) [ handle +major: +.B ] cbq [ allot +bytes +.B ] avpkt +bytes +.B bandwidth +rate +.B [ cell +bytes +.B ] [ ewma +log +.B ] [ mpu +bytes +.B ] + +.B tc class ... dev +dev +.B parent +major:[minor] +.B [ classid +major:minor +.B ] cbq allot +bytes +.B [ bandwidth +rate +.B ] [ rate +rate +.B ] prio +priority +.B [ weight +weight +.B ] [ minburst +packets +.B ] [ maxburst +packets +.B ] [ ewma +log +.B ] [ cell +bytes +.B ] avpkt +bytes +.B [ mpu +bytes +.B ] [ bounded isolated ] [ split +handle +.B & defmap +defmap +.B ] [ estimator +interval timeconstant +.B ] + +.SH DESCRIPTION +Class Based Queueing is a classful qdisc that implements a rich +linksharing hierarchy of classes. It contains shaping elements as +well as prioritizing capabilities. Shaping is performed using link +idle time calculations based on the timing of dequeue events and +underlying link bandwidth. + +.SH SHAPING ALGORITHM +When shaping a 10mbit/s connection to 1mbit/s, the link will +be idle 90% of the time. If it isn't, it needs to be throttled so that it +IS idle 90% of the time. + +During operations, the effective idletime is measured using an +exponential weighted moving average (EWMA), which considers recent +packets to be exponentially more important than past ones. The Unix +loadaverage is calculated in the same way. + +The calculated idle time is subtracted from the EWMA measured one, +the resulting number is called 'avgidle'. A perfectly loaded link has +an avgidle of zero: packets arrive exactly at the calculated +interval. + +An overloaded link has a negative avgidle and if it gets too negative, +CBQ throttles and is then 'overlimit'. + +Conversely, an idle link might amass a huge avgidle, which would then +allow infinite bandwidths after a few hours of silence. To prevent +this, avgidle is capped at +.B maxidle. + +If overlimit, in theory, the CBQ could throttle itself for exactly the +amount of time that was calculated to pass between packets, and then +pass one packet, and throttle again. Due to timer resolution constraints, +this may not be feasible, see the +.B minburst +parameter below. + +.SH CLASSIFICATION +Within the one CBQ instance many classes may exist. Each of these classes +contains another qdisc, by default +.BR tc-pfifo (8). + +When enqueueing a packet, CBQ starts at the root and uses various methods to +determine which class should receive the data. + +In the absence of uncommon configuration options, the process is rather easy. +At each node we look for an instruction, and then go to the class the +instruction refers us to. If the class found is a barren leaf-node (without +children), we enqueue the packet there. If it is not yet a leaf node, we do +the whole thing over again starting from that node. + +The following actions are performed, in order at each node we visit, until one +sends us to another node, or terminates the process. +.TP +(i) +Consult filters attached to the class. If sent to a leafnode, we are done. +Otherwise, restart. +.TP +(ii) +Consult the defmap for the priority assigned to this packet, which depends +on the TOS bits. Check if the referral is leafless, otherwise restart. +.TP +(iii) +Ask the defmap for instructions for the 'best effort' priority. Check the +answer for leafness, otherwise restart. +.TP +(iv) +If none of the above returned with an instruction, enqueue at this node. +.P +This algorithm makes sure that a packet always ends up somewhere, even while +you are busy building your configuration. + +For more details, see +.BR tc-cbq-details(8). + +.SH LINK SHARING ALGORITHM +When dequeuing for sending to the network device, CBQ decides which of its +classes will be allowed to send. It does so with a Weighted Round Robin process +in which each class with packets gets a chance to send in turn. The WRR process +starts by asking the highest priority classes (lowest numerically - +highest semantically) for packets, and will continue to do so until they +have no more data to offer, in which case the process repeats for lower +priorities. + +Classes by default borrow bandwidth from their siblings. A class can be +prevented from doing so by declaring it 'bounded'. A class can also indicate +its unwillingness to lend out bandwidth by being 'isolated'. + +.SH QDISC +The root of a CBQ qdisc class tree has the following parameters: + +.TP +parent major:minor | root +This mandatory parameter determines the place of the CBQ instance, either at the +.B root +of an interface or within an existing class. +.TP +handle major: +Like all other qdiscs, the CBQ can be assigned a handle. Should consist only +of a major number, followed by a colon. Optional, but very useful if classes +will be generated within this qdisc. +.TP +allot bytes +This allotment is the 'chunkiness' of link sharing and is used for determining packet +transmission time tables. The qdisc allot differs slightly from the class allot discussed +below. Optional. Defaults to a reasonable value, related to avpkt. +.TP +avpkt bytes +The average size of a packet is needed for calculating maxidle, and is also used +for making sure 'allot' has a safe value. Mandatory. +.TP +bandwidth rate +To determine the idle time, CBQ must know the bandwidth of your underlying +physical interface, or parent qdisc. This is a vital parameter, more about it +later. Mandatory. +.TP +cell +The cell size determines he granularity of packet transmission time calculations. Has a sensible default. +.TP +mpu +A zero sized packet may still take time to transmit. This value is the lower +cap for packet transmission time calculations - packets smaller than this value +are still deemed to have this size. Defaults to zero. +.TP +ewma log +When CBQ needs to measure the average idle time, it does so using an +Exponentially Weighted Moving Average which smoothes out measurements into +a moving average. The EWMA LOG determines how much smoothing occurs. Lower +values imply greater sensitivity. Must be between 0 and 31. Defaults +to 5. +.P +A CBQ qdisc does not shape out of its own accord. It only needs to know certain +parameters about the underlying link. Actual shaping is done in classes. + +.SH CLASSES +Classes have a host of parameters to configure their operation. + +.TP +parent major:minor +Place of this class within the hierarchy. If attached directly to a qdisc +and not to another class, minor can be omitted. Mandatory. +.TP +classid major:minor +Like qdiscs, classes can be named. The major number must be equal to the +major number of the qdisc to which it belongs. Optional, but needed if this +class is going to have children. +.TP +weight weight +When dequeuing to the interface, classes are tried for traffic in a +round-robin fashion. Classes with a higher configured qdisc will generally +have more traffic to offer during each round, so it makes sense to allow +it to dequeue more traffic. All weights under a class are normalized, so +only the ratios matter. Defaults to the configured rate, unless the priority +of this class is maximal, in which case it is set to 1. +.TP +allot bytes +Allot specifies how many bytes a qdisc can dequeue +during each round of the process. This parameter is weighted using the +renormalized class weight described above. Silently capped at a minimum of +3/2 avpkt. Mandatory. + +.TP +prio priority +In the round-robin process, classes with the lowest priority field are tried +for packets first. Mandatory. + +.TP +avpkt +See the QDISC section. + +.TP +rate rate +Maximum rate this class and all its children combined can send at. Mandatory. + +.TP +bandwidth rate +This is different from the bandwidth specified when creating a CBQ disc! Only +used to determine maxidle and offtime, which are only calculated when +specifying maxburst or minburst. Mandatory if specifying maxburst or minburst. + +.TP +maxburst +This number of packets is used to calculate maxidle so that when +avgidle is at maxidle, this number of average packets can be burst +before avgidle drops to 0. Set it higher to be more tolerant of +bursts. You can't set maxidle directly, only via this parameter. + +.TP +minburst +As mentioned before, CBQ needs to throttle in case of +overlimit. The ideal solution is to do so for exactly the calculated +idle time, and pass 1 packet. However, Unix kernels generally have a +hard time scheduling events shorter than 10ms, so it is better to +throttle for a longer period, and then pass minburst packets in one +go, and then sleep minburst times longer. + +The time to wait is called the offtime. Higher values of minburst lead +to more accurate shaping in the long term, but to bigger bursts at +millisecond timescales. Optional. + +.TP +minidle +If avgidle is below 0, we are overlimits and need to wait until +avgidle will be big enough to send one packet. To prevent a sudden +burst from shutting down the link for a prolonged period of time, +avgidle is reset to minidle if it gets too low. + +Minidle is specified in negative microseconds, so 10 means that +avgidle is capped at -10us. Optional. + +.TP +bounded +Signifies that this class will not borrow bandwidth from its siblings. +.TP +isolated +Means that this class will not borrow bandwidth to its siblings + +.TP +split major:minor & defmap bitmap[/bitmap] +If consulting filters attached to a class did not give a verdict, +CBQ can also classify based on the packet's priority. There are 16 +priorities available, numbered from 0 to 15. + +The defmap specifies which priorities this class wants to receive, +specified as a bitmap. The Least Significant Bit corresponds to priority +zero. The +.B split +parameter tells CBQ at which class the decision must be made, which should +be a (grand)parent of the class you are adding. + +As an example, 'tc class add ... classid 10:1 cbq .. split 10:0 defmap c0' +configures class 10:0 to send packets with priorities 6 and 7 to 10:1. + +The complimentary configuration would then +be: 'tc class add ... classid 10:2 cbq ... split 10:0 defmap 3f' +Which would send all packets 0, 1, 2, 3, 4 and 5 to 10:1. +.TP +estimator interval timeconstant +CBQ can measure how much bandwidth each class is using, which tc filters +can use to classify packets with. In order to determine the bandwidth +it uses a very simple estimator that measures once every +.B interval +microseconds how much traffic has passed. This again is a EWMA, for which +the time constant can be specified, also in microseconds. The +.B time constant +corresponds to the sluggishness of the measurement or, conversely, to the +sensitivity of the average to short bursts. Higher values mean less +sensitivity. + +.SH BUGS +The actual bandwidth of the underlying link may not be known, for example +in the case of PPoE or PPTP connections which in fact may send over a +pipe, instead of over a physical device. CBQ is quite resilient to major +errors in the configured bandwidth, probably a the cost of coarser shaping. + +Default kernels rely on coarse timing information for making decisions. These +may make shaping precise in the long term, but inaccurate on second long scales. + +See +.BR tc-cbq-details(8) +for hints on how to improve this. + +.SH SOURCES +.TP +o +Sally Floyd and Van Jacobson, "Link-sharing and Resource +Management Models for Packet Networks", +IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 + +.TP +o +Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 + +.TP +o +Sally Floyd, "Notes on Class-Based Queueing: Setting +Parameters", 1996 + +.TP +o +Sally Floyd and Michael Speer, "Experimental Results +for Class-Based Queueing", 1998, not published. + + + +.SH SEE ALSO +.BR tc (8) + +.SH AUTHOR +Alexey N. Kuznetsov, . This manpage maintained by +bert hubert + + diff --git a/tc-htb.8 b/tc-htb.8 new file mode 100644 index 0000000..f61b818 --- /dev/null +++ b/tc-htb.8 @@ -0,0 +1,150 @@ +.TH HTB 8 "10 January 2002" "iproute2" "Linux" +.SH NAME +HTB \- Hierarchy Token Bucket +.SH SYNOPSIS +.B tc qdisc ... dev +dev +.B ( parent +classid +.B | root) [ handle +major: +.B ] htb [ default +minor-id +.B ] + +.B tc class ... dev +dev +.B parent +major:[minor] +.B [ classid +major:minor +.B ] htb rate +rate +.B [ ceil +rate +.B ] burst +bytes +.B [ cburst +bytes +.B ] [ prio +priority +.B ] + +.SH DESCRIPTION +HTB is meant as a more understandable and intuitive replacement for +the CBQ qdisc in Linux. Both CBQ and HTB help you to control the use +of the outbound bandwidth on a given link. Both allow you to use one +physical link to simulate several slower links and to send different +kinds of traffic on different simulated links. In both cases, you have +to specify how to divide the physical link into simulated links and +how to decide which simulated link to use for a given packet to be sent. + +Unlike CBQ, HTB shapes traffic based on the Token Bucket Filter algorithm +which does not depend on interface characteristics and so does not need to +know the underlying bandwidth of the outgoing interface. + +.SH SHAPING ALGORITHM +Shaping works as documented in +.B tc-tbf (8). + +.SH CLASSIFICATION +Within the one HRB instance many classes may exist. Each of these classes +contains another qdisc, by default +.BR tc-pfifo (8). + +When enqueueing a packet, HTB starts at the root and uses various methods to +determine which class should receive the data. + +In the absence of uncommon configuration options, the process is rather easy. +At each node we look for an instruction, and then go to the class the +instruction refers us to. If the class found is a barren leaf-node (without +children), we enqueue the packet there. If it is not yet a leaf node, we do +the whole thing over again starting from that node. + +The following actions are performed, in order at each node we visit, until one +sends us to another node, or terminates the process. +.TP +(i) +Consult filters attached to the class. If sent to a leafnode, we are done. +Otherwise, restart. +.TP +(ii) +If none of the above returned with an instruction, enqueue at this node. +.P +This algorithm makes sure that a packet always ends up somewhere, even while +you are busy building your configuration. + +.SH LINK SHARING ALGORITHM +FIXME + +.SH QDISC +The root of a HTB qdisc class tree has the following parameters: + +.TP +parent major:minor | root +This mandatory parameter determines the place of the HTB instance, either at the +.B root +of an interface or within an existing class. +.TP +handle major: +Like all other qdiscs, the HTB can be assigned a handle. Should consist only +of a major number, followed by a colon. Optional, but very useful if classes +will be generated within this qdisc. +.TP +default minor-id +Unclassified traffic gets sent to the class with this minor-id. + +.SH CLASSES +Classes have a host of parameters to configure their operation. + +.TP +parent major:minor +Place of this class within the hierarchy. If attached directly to a qdisc +and not to another class, minor can be omitted. Mandatory. +.TP +classid major:minor +Like qdiscs, classes can be named. The major number must be equal to the +major number of the qdisc to which it belongs. Optional, but needed if this +class is going to have children. +.TP +prio priority +In the round-robin process, classes with the lowest priority field are tried +for packets first. Mandatory. + +.TP +rate rate +Maximum rate this class and all its children are guaranteed. Mandatory. + +.TP +ceil rate +Maximum rate at which a class can send, if its parent has bandwidth to spare. +Defaults to the configured rate, which implies no borrowing + +.TP +burst bytes +Amount of bytes that can be burst at +.B ceil +speed, in excess of the configured +.B rate. +Should be at least as high as the highest burst of all children. + +.TP +cburst bytes +Amount of bytes that can be burst at 'infinite' speed, in other words, as fast +as the interface can transmit them. For perfect evening out, should be equal to at most one average +packet. Should be at least as high as the highest cburst of all children. + +.SH NOTES +Due to Unix timing constraints, the maximum ceil rate is not infinite and may in fact be quite low. On Intel, +there are 100 timer events per second, the maximum rate is that rate at which 'burst' bytes are sent each timer tick. +From this, the mininum burst size for a specified rate can be calculated. For i386, a 10mbit rate requires a 12 kilobyte +burst as 100*12kb*8 equals 10mbit. + +.SH SEE ALSO +.BR tc (8) +.P +HTB website: http://luxik.cdi.cz/~devik/qos/htb/ +.SH AUTHOR +Martin Devera . This manpage maintained by bert hubert + + diff --git a/tc-pbfifo.8 b/tc-pbfifo.8 new file mode 100644 index 0000000..8dda4bb --- /dev/null +++ b/tc-pbfifo.8 @@ -0,0 +1,72 @@ +.TH PBFIFO 8 "10 January 2002" "iproute2" "Linux" +.SH NAME +pfifo \- Packet limited First In, First Out queue +.P +bfifo \- Byte limited First In, First Out queue + +.SH SYNOPSIS +.B tc qdisc ... add pfifo +.B [ limit +packets +.B ] +.P +.B tc qdisc ... add bfifo +.B [ limit +bytes +.B ] + +.SH DESCRIPTION +The pfifo and bfifo qdiscs are unadorned First In, First Out queues. They are the +simplest queues possible and therefore have no overhead. +.B pfifo +constrains the queue size as measured in packets. +.B bfifo +does so as measured in bytes. + +Like all non-default qdiscs, they maintain statistics. This might be a reason to prefer +pfifo or bfifo over the default. + +.SH ALGORITHM +A list of packets is maintained, when a packet is enqueued it gets inserted at the tail of +a list. When a packet needs to be sent out to the network, it is taken from the head of the list. + +If the list is too long, no further packets are allowed on. This is called 'tail drop'. + +.SH PARAMETERS +.TP +limit +Maximum queue size. Specified in bytes for bfifo, in packets for pfifo. For pfifo, defaults +to the interface txqueuelen, as specified with +.BR ifconfig (8) +or +.BR ip (8). + +For bfifo, it defaults to the txqueuelen multiplied by the interface MTU. + +.SH OUTPUT +The output of +.B tc -s qdisc ls +contains the limit, either in packets or in bytes, and the number of bytes +and packets actually sent. An unsent and dropped packet only appears between braces +and is not counted as 'Sent'. + +In this example, the queue length is 100 packets, 45894 bytes were sent over 681 packets. +No packets were dropped, and as the pfifo queue does not slow down packets, there were also no +overlimits: +.P +.nf +# tc -s qdisc ls dev eth0 +qdisc pfifo 8001: dev eth0 limit 100p + Sent 45894 bytes 681 pkts (dropped 0, overlimits 0) +.fi + +If a backlog occurs, this is displayed as well. +.SH SEE ALSO +.BR tc (8) + +.SH AUTHORS +Alexey N. Kuznetsov, + +This manpage maintained by bert hubert + + diff --git a/tc-pfifo_fast.8 b/tc-pfifo_fast.8 new file mode 100644 index 0000000..43ab166 --- /dev/null +++ b/tc-pfifo_fast.8 @@ -0,0 +1,59 @@ +.TH PFIFO_FAST 8 "10 January 2002" "iproute2" "Linux" +.SH NAME +pfifo_fast \- three-band first in, first out queue + +.SH DESCRIPTION +pfifo_fast is the default qdisc of each interface. + +Whenever an interface is created, the pfifo_fast qdisc is automatically used +as a queue. If another qdisc is attached, it preempts the default +pfifo_fast, which automatically returns to function when an existing qdisc +is detached. + +In this sense this qdisc is magic, and unlike other qdiscs. + +.SH ALGORITHM +The algorithm is very similar to that of the classful +.BR tc-prio (8) +qdisc. +.B pfifo_fast +is like three +.BR tc-pfifo (8) +queues side by side, where packets can be enqueued in any of the three bands +based on their Type of Service bits or assigned priority. + +Not all three bands are dequeued simultaneously - as long as lower bands +have traffic, higher bands are never dequeued. This can be used to +prioritize interactive traffic or penalize 'lowest cost' traffic. + +Each band can be txqueuelen packets long, as configured with +.BR ifconfig (8) +or +.BR ip (8). +Additional packets coming in are not enqueued but are instead dropped. + +See +.BR tc-prio (8) +for complete details on how TOS bits are translated into bands. +.SH PARAMETERS +.TP +txqueuelen +The length of the three bands depends on the interface txqueuelen, as +specified with +.BR ifconfig (8) +or +.BR ip (8). + +.SH BUGS +Does not maintain statistics and does not show up in tc qdisc ls. This is because +it is the automatic default in the absence of a configured qdisc. + +.SH SEE ALSO +.BR tc (8) + +.SH AUTHORS +Alexey N. Kuznetsov, + +This manpage maintained by bert hubert + + diff --git a/tc-prio.8 b/tc-prio.8 new file mode 100644 index 0000000..e942e62 --- /dev/null +++ b/tc-prio.8 @@ -0,0 +1,187 @@ +.TH PRIO 8 "16 December 2001" "iproute2" "Linux" +.SH NAME +PRIO \- Priority qdisc +.SH SYNOPSIS +.B tc qdisc ... dev +dev +.B ( parent +classid +.B | root) [ handle +major: +.B ] prio [ bands +bands +.B ] [ priomap +band,band,band... +.B ] [ estimator +interval timeconstant +.B ] + +.SH DESCRIPTION +The PRIO qdisc is a simple classful queueing discipline that contains +an arbitrary number of classes of differing priority. The classes are +dequeued in numerical descending order of priority. PRIO is a scheduler +and never delays packets - it is a work-conserving qdisc, though the qdiscs +contained in the classes may not be. + +Very useful for lowering latency when there is no need for slowing down +traffic. + +.SH ALGORITHM +On creation with 'tc qdisc add', a fixed number of bands is created. Each +band is a class, although is not possible to add classes with 'tc qdisc +add', the number of bands to be created must instead be specified on the +commandline attaching PRIO to its root. + +When dequeueing, band 0 is tried first and only if it did not deliver a +packet does PRIO try band 1, and so onwards. Maximum reliability packets +should therefore go to band 0, minimum delay to band 1 and the rest to band +2. + +As the PRIO qdisc itself will have minor number 0, band 0 is actually +major:1, band 1 is major:2, etc. For major, substitute the major number +assigned to the qdisc on 'tc qdisc add' with the +.B handle +parameter. + +.SH CLASSIFICATION +Three methods are available to PRIO to determine in which band a packet will +be enqueued. +.TP +From userspace +A process with sufficient privileges can encode the destination class +directly with SO_PRIORITY, see +.BR tc(7). +.TP +with a tc filter +A tc filter attached to the root qdisc can point traffic directly to a class +.TP +with the priomap +Based on the packet priority, which in turn is derived from the Type of +Service assigned to the packet. +.P +Only the priomap is specific to this qdisc. +.SH QDISC PARAMETERS +.TP +bands +Number of bands. If changed from the default of 3, +.B priomap +must be updated as well. +.TP +priomap +The priomap maps the priority of +a packet to a class. The priority can either be set directly from userspace, +or be derived from the Type of Service of the packet. + +Determines how packet priorities, as assigned by the kernel, map to +bands. Mapping occurs based on the TOS octet of the packet, which looks like +this: + +.nf +0 1 2 3 4 5 6 7 ++---+---+---+---+---+---+---+---+ +| | | | +|PRECEDENCE | TOS |MBZ| +| | | | ++---+---+---+---+---+---+---+---+ +.fi + +The four TOS bits (the 'TOS field') are defined as: + +.nf +Binary Decimcal Meaning +----------------------------------------- +1000 8 Minimize delay (md) +0100 4 Maximize throughput (mt) +0010 2 Maximize reliability (mr) +0001 1 Minimize monetary cost (mmc) +0000 0 Normal Service +.fi + +As there is 1 bit to the right of these four bits, the actual value of the +TOS field is double the value of the TOS bits. Tcpdump -v -v shows you the +value of the entire TOS field, not just the four bits. It is the value you +see in the first column of this table: + +.nf +TOS Bits Means Linux Priority Band +------------------------------------------------------------ +0x0 0 Normal Service 0 Best Effort 1 +0x2 1 Minimize Monetary Cost 1 Filler 2 +0x4 2 Maximize Reliability 0 Best Effort 1 +0x6 3 mmc+mr 0 Best Effort 1 +0x8 4 Maximize Throughput 2 Bulk 2 +0xa 5 mmc+mt 2 Bulk 2 +0xc 6 mr+mt 2 Bulk 2 +0xe 7 mmc+mr+mt 2 Bulk 2 +0x10 8 Minimize Delay 6 Interactive 0 +0x12 9 mmc+md 6 Interactive 0 +0x14 10 mr+md 6 Interactive 0 +0x16 11 mmc+mr+md 6 Interactive 0 +0x18 12 mt+md 4 Int. Bulk 1 +0x1a 13 mmc+mt+md 4 Int. Bulk 1 +0x1c 14 mr+mt+md 4 Int. Bulk 1 +0x1e 15 mmc+mr+mt+md 4 Int. Bulk 1 +.fi + +The second column contains the value of the relevant +four TOS bits, followed by their translated meaning. For example, 15 stands +for a packet wanting Minimal Montetary Cost, Maximum Reliability, Maximum +Throughput AND Minimum Delay. + +The fourth column lists the way the Linux kernel interprets the TOS bits, by +showing to which Priority they are mapped. + +The last column shows the result of the default priomap. On the commandline, +the default priomap looks like this: + + 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 + +This means that priority 4, for example, gets mapped to band number 1. +The priomap also allows you to list higher priorities (> 7) which do not +correspond to TOS mappings, but which are set by other means. + +This table from RFC 1349 (read it for more details) explains how +applications might very well set their TOS bits: + +.nf +TELNET 1000 (minimize delay) +FTP + Control 1000 (minimize delay) + Data 0100 (maximize throughput) + +TFTP 1000 (minimize delay) + +SMTP + Command phase 1000 (minimize delay) + DATA phase 0100 (maximize throughput) + +Domain Name Service + UDP Query 1000 (minimize delay) + TCP Query 0000 + Zone Transfer 0100 (maximize throughput) + +NNTP 0001 (minimize monetary cost) + +ICMP + Errors 0000 + Requests 0000 (mostly) + Responses (mostly) +.fi + + +.SH CLASSES +PRIO classes cannot be configured further - they are automatically created +when the PRIO qdisc is attached. Each class however can contain yet a +further qdisc. + +.SH BUGS +Large amounts of traffic in the lower bands can cause starvation of higher +bands. Can be prevented by attaching a shaper (for example, +.BR tc-tbf(8) +to these bands to make sure they cannot dominate the link. + +.SH AUTHORS +Alexey N. Kuznetsov, , J Hadi Salim +. This manpage maintained by bert hubert + + diff --git a/tc-red.8 b/tc-red.8 new file mode 100644 index 0000000..d02b411 --- /dev/null +++ b/tc-red.8 @@ -0,0 +1,131 @@ +.TH RED 8 "13 December 2001" "iproute2" "Linux" +.SH NAME +red \- Random Early Detection +.SH SYNOPSIS +.B tc qdisc ... red +.B limit +bytes +.B min +bytes +.B max +bytes +.B avpkt +bytes +.B burst +packets +.B [ ecn ] [ bandwidth +rate +.B ] probability +chance + +.SH DESCRIPTION +Random Early Detection is a classless qdisc which manages its queue size +smartly. Regular queues simply drop packets from the tail when they are +full, which may not be the optimal behaviour. RED also performs tail drop, +but does so in a more gradual way. + +Once the queue hits a certain average length, packets enqueued have a +configurable chance of being marked (which may mean dropped). This chance +increases linearly up to a point called the +.B max +average queue length, although the queue might get bigger. + +This has a host of benefits over simple taildrop, while not being processor +intensive. It prevents synchronous retransmits after a burst in traffic, +which cause further retransmits, etc. + +The goal is the have a small queue size, which is good for interactivity +while not disturbing TCP/IP traffic with too many sudden drops after a burst +of traffic. + +Depending on if ECN is configured, marking either means dropping or +purely marking a packet as overlimit. +.SH ALGORITHM +The average queue size is used for determining the marking +probability. This is calculated using an Exponential Weighted Moving +Average, which can be more or less sensitive to bursts. + +When the average queue size is below +.B min +bytes, no packet will ever be marked. When it exceeds +.B min, +the probability of doing so climbs linearly up +to +.B probability, +until the average queue size hits +.B max +bytes. Because +.B probability +is normally not set to 100%, the queue size might +conceivably rise above +.B max +bytes, so the +.B limit +parameter is provided to set a hard maximum for the size of the queue. + +.SH PARAMETERS +.TP +min +Average queue size at which marking becomes a possibility. +.TP +max +At this average queue size, the marking probability is maximal. Should be at +least twice +.B min +to prevent synchronous retransmits, higher for low +.B min. +.TP +probability +Maximum probability for marking, specified as a floating point +number from 0.0 to 1.0. Suggested values are 0.01 or 0.02 (1 or 2%, +respectively). +.TP +limit +Hard limit on the real (not average) queue size in bytes. Further packets +are dropped. Should be set higher than max+burst. It is advised to set this +a few times higher than +.B max. +.TP +burst +Used for determining how fast the average queue size is influenced by the +real queue size. Larger values make the calculation more sluggish, allowing +longer bursts of traffic before marking starts. Real life experiments +support the following guideline: (min+min+max)/(3*avpkt). +.TP +avpkt +Specified in bytes. Used with burst to determine the time constant for +average queue size calculations. 1000 is a good value. +.TP +bandwidth +This rate is used for calculating the average queue size after some +idle time. Should be set to the bandwidth of your interface. Does not mean +that RED will shape for you! Optional. +.TP +ecn +As mentioned before, RED can either 'mark' or 'drop'. Explicit Congestion +Notification allows RED to notify remote hosts that their rate exceeds the +amount of bandwidth available. Non-ECN capable hosts can only be notified by +dropping a packet. If this parameter is specified, packets which indicate +that their hosts honor ECN will only be marked and not dropped, unless the +queue size hits +.B limit +bytes. Needs a tc binary with RED support compiled in. Recommended. + +.SH SEE ALSO +.BR tc (8) + +.SH SOURCES +.TP +o +Floyd, S., and Jacobson, V., Random Early Detection gateways for +Congestion Avoidance. http://www.aciri.org/floyd/papers/red/red.html +.TP +o +Some changes to the algorithm by Alexey N. Kuznetsov. + +.SH AUTHORS +Alexey N. Kuznetsov, , Alexey Makarenko +, J Hadi Salim . +This manpage maintained by bert hubert + + diff --git a/tc-sfq.8 b/tc-sfq.8 new file mode 100644 index 0000000..337c795 --- /dev/null +++ b/tc-sfq.8 @@ -0,0 +1,107 @@ +.TH TC 8 "8 December 2001" "iproute2" "Linux" +.SH NAME +sfq \- Stochastic Fairness Queueing +.SH SYNOPSIS +.B tc qdisc ... perturb +seconds +.B quantum +bytes + +.SH DESCRIPTION + +Stochastic Fairness Queueing is a classless queueing discipline available for +traffic control with the +.BR tc (8) +command. + +SFQ does not shape traffic but only schedules the transmission of packets, based on 'flows'. +The goal is to ensure fairness so that each flow is able to send data in turn, thus preventing +any single flow from drowning out the rest. + +This may in fact have some effect in mitigating a Denial of Service attempt. + +SFQ is work-conserving and therefore always delivers a packet if it has one available. +.SH ALGORITHM +On enqueueing, each packet is assigned to a hash bucket, based on +.TP +(i) +Source address +.TP +(ii) +Destination address +.TP +(iii) +Source port +.P +If these are available. SFQ knows about ipv4 and ipv6 and also UDP, TCP and ESP. +Packets with other protocols are hashed based on the 32bits representation of their +destination and the socket they belong to. A flow corresponds mostly to a TCP/IP +connection. + +Each of these buckets should represent a unique flow. Because multiple flows may +get hashed to the same bucket, the hashing algorithm is perturbed at configurable +intervals so that the unfairness lasts only for a short while. Perturbation may +however cause some inadvertent packet reordering to occur. + +When dequeuing, each hashbucket with data is queried in a round robin fashion. + +The compile time maximum length of the SFQ is 128 packets, which can be spread over +at most 128 buckets of 1024 available. In case of overflow, tail-drop is performed +on the fullest bucket, thus maintaining fairness. + +.SH PARAMETERS +.TP +perturb +Interval in seconds for queue algorithm perturbation. Defaults to 0, which means that +no perturbation occurs. Do not set too low for each perturbation may cause some packet +reordering. Advised value: 10 +.TP +quantum +Amount of bytes a flow is allowed to dequeue during a round of the round robin process. +Defaults to the MTU of the interface which is also the advised value and the minimum value. + +.SH EXAMPLE & USAGE + +To attach to device ppp0: +.P +# tc qdisc add dev ppp0 root sfq perturb 10 +.P +Please note that SFQ, like all non-shaping (work-conserving) qdiscs, is only useful +if it owns the queue. +This is the case when the link speed equals the actually available bandwidth. This holds +for regular phone modems, ISDN connections and direct non-switched ethernet links. +.P +Most often, cable modems and DSL devices do not fall into this category. The same holds +for when connected to a switch and trying to send data to a congested segment also +connected to the switch. +.P +In this case, the effective queue does not reside within Linux and is therefore not +available for scheduling. +.P +Embed SFQ in a classful qdisc to make sure it owns the queue. + +.SH SOURCE +.TP +o +Paul E. McKenney "Stochastic Fairness Queuing", +IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. + +.TP +o +Paul E. McKenney "Stochastic Fairness Queuing", +"Interworking: Research and Experience", v.2, 1991, p.113-131. + +.TP +o +See also: +M. Shreedhar and George Varghese "Efficient Fair +Queuing using Deficit Round Robin", Proc. SIGCOMM 95. + +.SH SEE ALSO +.BR tc (8) + +.SH AUTHOR +Alexey N. Kuznetsov, . This manpage maintained by +bert hubert + + diff --git a/tc-tbf.8 b/tc-tbf.8 new file mode 100644 index 0000000..3abb238 --- /dev/null +++ b/tc-tbf.8 @@ -0,0 +1,138 @@ +.TH TC 8 "13 December 2001" "iproute2" "Linux" +.SH NAME +tbf \- Token Bucket Filter +.SH SYNOPSIS +.B tc qdisc ... tbf rate +rate +.B burst +bytes/cell +.B ( latency +ms +.B | limit +bytes +.B ) [ mpu +bytes +.B [ peakrate +rate +.B mtu +bytes/cell +.B ] ] +.P +burst is also known as buffer and maxburst. mtu is also known as minburst. +.SH DESCRIPTION + +The Token Bucket Filter is a classless queueing discipline available for +traffic control with the +.BR tc (8) +command. + +TBF is a pure shaper and never schedules traffic. It is non-work-conserving and may throttle +itself, although packets are available, to ensure that the configured rate is not exceeded. +On all platforms except for Alpha, +it is able to shape up to 1mbit/s of normal traffic with ideal minimal burstiness, +sending out data exactly at the configured rates. + +Much higher rates are possible but at the cost of losing the minimal burstiness. In that +case, data is on average dequeued at the configured rate but may be sent much faster at millisecond +timescales. Because of further queues living in network adaptors, this is often not a problem. + +Kernels with a higher 'HZ' can achieve higher rates with perfect burstiness. On Alpha, HZ is ten +times higher, leading to a 10mbit/s limit to perfection. These calculations hold for packets of on +average 1000 bytes. + +.SH ALGORITHM +As the name implies, traffic is filtered based on the expenditure of +.B tokens. +Tokens roughly correspond to bytes, with the additional constraint that each packet consumes +some tokens, no matter how small it is. This reflects the fact that even a zero-sized packet occupies +the link for some time. + +On creation, the TBF is stocked with tokens which correspond to the amount of traffic that can be burst +in one go. Tokens arrive at a steady rate, until the bucket is full. + +If no tokens are available, packets are queued, up to a configured limit. The TBF now +calculates the token deficit, and throttles until the first packet in the queue can be sent. + +If it is not acceptable to burst out packets at maximum speed, a peakrate can be configured +to limit the speed at which the bucket empties. This peakrate is implemented as a second TBF +with a very small bucket, so that it doesn't burst. + +To achieve perfection, the second bucket may contain only a single packet, which leads to +the earlier mentioned 1mbit/s limit. + +This limit is caused by the fact that the kernel can only throttle for at minimum 1 'jiffy', which depends +on HZ as 1/HZ. For perfect shaping, only a single packet can get sent per jiffy - for HZ=100, this means 100 +packets of on average 1000 bytes each, which roughly corresponds to 1mbit/s. + +.SH PARAMETERS +See +.BR tc (8) +for how to specify the units of these values. +.TP +limit or latency +Limit is the number of bytes that can be queued waiting for tokens to become +available. You can also specify this the other way around by setting the +latency parameter, which specifies the maximum amount of time a packet can +sit in the TBF. The latter calculation takes into account the size of the +bucket, the rate and possibly the peakrate (if set). These two parameters +are mutually exclusive. +.TP +burst +Also known as buffer or maxburst. +Size of the bucket, in bytes. This is the maximum amount of bytes that tokens can be available for instantaneously. +In general, larger shaping rates require a larger buffer. For 10mbit/s on Intel, you need at least 10kbyte buffer +if you want to reach your configured rate! + +If your buffer is too small, packets may be dropped because more tokens arrive per timer tick than fit in your bucket. +The minimum buffer size can be calculated by dividing the rate by HZ. + +Token usage calculations are performed using a table which by default has a resolution of 8 packets. +This resolution can be changed by specifying the +.B cell +size with the burst. For example, to specify a 6000 byte buffer with a 16 +byte cell size, set a burst of 6000/16. You will probably never have to set +this. Must be an integral power of 2. +.TP +mpu +A zero-sized packet does not use zero bandwidth. For ethernet, no packet uses less than 64 bytes. The Minimum Packet Unit +determines the minimal token usage (specified in bytes) for a packet. Defaults to zero. +.TP +rate +The speed knob. See remarks above about limits! See +.BR tc (8) +for units. +.PP +Furthermore, if a peakrate is desired, the following parameters are available: + +.TP +peakrate +Maximum depletion rate of the bucket. Limited to 1mbit/s on Intel, 10mbit/s on Alpha. The peakrate does +not need to be set, it is only necessary if perfect millisecond timescale shaping is required. + +.TP +mtu/minburst +Specifies the size of the peakrate bucket. For perfect accuracy, should be set to the MTU of the interface. +If a peakrate is needed, but some burstiness is acceptable, this size can be raised. A 3000 byte minburst +allows around 3mbit/s of peakrate, given 1000 byte packets. + +Like the regular burstsize you can also specify a +.B cell +size. +.SH EXAMPLE & USAGE + +To attach a TBF with a sustained maximum rate of 0.5mbit/s, a peakrate of 1.0mbit/s, +a 5kilobyte buffer, with a pre-bucket queue size limit calculated so the TBF causes +at most 70ms of latency, with perfect peakrate behaviour, issue: +.P +# tc qdisc add dev eth0 root tbf rate 0.5mbit \\ + burst 5kb latency 70ms peakrate 1mbit \\ + minburst 1540 + +.SH SEE ALSO +.BR tc (8) + +.SH AUTHOR +Alexey N. Kuznetsov, . This manpage maintained by +bert hubert + + diff --git a/tc.8 b/tc.8 new file mode 100644 index 0000000..b9b8039 --- /dev/null +++ b/tc.8 @@ -0,0 +1,348 @@ +.TH TC 8 "16 December 2001" "iproute2" "Linux" +.SH NAME +tc \- show / manipulate traffic control settings +.SH SYNOPSIS +.B tc qdisc [ add | change | replace | link ] dev +DEV +.B +[ parent +qdisc-id +.B | root ] +.B [ handle +qdisc-id ] qdisc +[ qdisc specific parameters ] +.P + +.B tc class [ add | change | replace ] dev +DEV +.B parent +qdisc-id +.B [ classid +class-id ] qdisc +[ qdisc specific parameters ] +.P + +.B tc filter [ add | change | replace ] dev +DEV +.B [ parent +qdisc-id +.B | root ] protocol +protocol +.B prio +priority filtertype +[ filtertype specific parameters ] +.B flowid +flow-id + +.B tc [-s | -d ] qdisc show [ dev +DEV +.B ] +.P +.B tc [-s | -d ] class show dev +DEV +.P +.B tc filter show dev +DEV + +.SH DESCRIPTION +.B Tc +is used to configure Traffic Control in the Linux kernel. Traffic Control consists +of the following: + +.TP +SHAPING +When traffic is shaped, its rate of transmission is under control. Shaping may +be more than lowering the available bandwidth - it is also used to smooth out +bursts in traffic for better network behaviour. Shaping occurs on egress. + +.TP +SCHEDULING +By scheduling the transmission of packets it is possible to improve interactivity +for traffic that needs it while still guaranteeing bandwidth to bulk transfers. Reordering +is also called prioritizing, and happens only on egress. + +.TP +POLICING +Where shaping deals with transmission of traffic, policing pertains to traffic +arriving. Policing thus occurs on ingress. + +.TP +DROPPING +Traffic exceeding a set bandwidth may also be dropped forthwith, both on +ingress and on egress. + +.P +Processing of traffic is controlled by three kinds of objects: qdiscs, +classes and filters. + +.SH QDISCS +.B qdisc +is short for 'queueing discipline' and it is elementary to +understanding traffic control. Whenever the kernel needs to send a +packet to an interface, it is +.B enqueued +to the qdisc configured for that interface. Immediately afterwards, the kernel +tries to get as many packets as possible from the qdisc, for giving them +to the network adaptor driver. + +A simple QDISC is the 'pfifo' one, which does no processing at all and is a pure +First In, First Out queue. It does however store traffic when the network interface +can't handle it momentarily. + +.SH CLASSES +Some qdiscs can contain classes, which contain further qdiscs - traffic may +then be enqueued in any of the inner qdiscs, which are within the +.B classes. +When the kernel tries to dequeue a packet from such a +.B classful qdisc +it can come from any of the classes. A qdisc may for example prioritize +certain kinds of traffic by trying to dequeue from certain classes +before others. + +.SH FILTERS +A +.B filter +is used by a classful qdisc to determine in which class a packet will +be enqueued. Whenever traffic arrives at a class with subclasses, it needs +to be classified. Various methods may be employed to do so, one of these +are the filters. All filters attached to the class are called, until one of +them returns with a verdict. If no verdict was made, other criteria may be +available. This differs per qdisc. + +It is important to notice that filters reside +.B within +qdiscs - they are not masters of what happens. + +.SH CLASSLESS QDISCS +The classless qdiscs are: +.TP +[p|b]fifo +Simplest usable qdisc, pure First In, First Out behaviour. Limited in +packets or in bytes. +.TP +pfifo_fast +Standard qdisc for 'Advanced Router' enabled kernels. Consists of a three-band +queue which honors Type of Service flags, as well as the priority that may be +assigned to a packet. +.TP +red +Random Early Detection simulates physical congestion by randomly dropping +packets when nearing configured bandwidth allocation. Well suited to very +large bandwidth applications. +.TP +sfq +Stochastic Fairness Queueing reorders queued traffic so each 'session' +gets to send a packet in turn. +.TP +tbf +The Token Bucket Filter is suited for slowing traffic down to a precisely +configured rate. Scales well to large bandwidths. +.SH CONFIGURING CLASSLESS QDISCS +In the absence of classful qdiscs, classless qdiscs can only be attached at +the root of a device. Full syntax: +.P +.B tc qdisc add dev +DEV +.B root +QDISC QDISC-PARAMETERS + +To remove, issue +.P +.B tc qdisc del dev +DEV +.B root + +The +.B pfifo_fast +qdisc is the automatic default in the absence of a configured qdisc. + +.SH CLASSFUL QDISCS +The classful qdiscs are: +.TP +CBQ +Class Based Queueing implements a rich linksharing hierarchy of classes. +It contains shaping elements as well as prioritizing capabilities. Shaping is +performed using link idle time calculations based on average packet size and +underlying link bandwidth. The latter may be ill-defined for some interfaces. +.TP +HTB +The Hierarchy Token Bucket implements a rich linksharing hierarchy of +classes with an emphasis on conforming to existing practices. HTB facilitates +guaranteeing bandwidth to classes, while also allowing specification of upper +limits to inter-class sharing. It contains shaping elements, based on TBF and +can prioritize classes. +.TP +PRIO +The PRIO qdisc is a non-shaping container for a configurable number of +classes which are dequeued in order. This allows for easy prioritization +of traffic, where lower classes are only able to send if higher ones have +no packets available. To facilitate configuration, Type Of Service bits are +honored by default. +.SH THEORY OF OPERATION +Classes form a tree, where each class has a single parent. +A class may have multiple children. Some qdiscs allow for runtime addition +of classes (CBQ, HTB) while others (PRIO) are created with a static number of +children. + +Qdiscs which allow dynamic addition of classes can have zero or more +subclasses to which traffic may be enqueued. + +Furthermore, each class contains a +.B leaf qdisc +which by default has +.B pfifo +behaviour though another qdisc can be attached in place. This qdisc may again +contain classes, but each class can have only one leaf qdisc. + +When a packet enters a classful qdisc it can be +.B classified +to one of the classes within. Three criteria are available, although not all +qdiscs will use all three: +.TP +tc filters +If tc filters are attached to a class, they are consulted first +for relevant instructions. Filters can match on all fields of a packet header, +as well as on the firewall mark applied by ipchains or iptables. See +.BR tc-filters (8). +.TP +Type of Service +Some qdiscs have built in rules for classifying packets based on the TOS field. +.TP +skb->priority +Userspace programs can encode a class-id in the 'skb->priority' field using +the SO_PRIORITY option. +.P +Each node within the tree can have its own filters but higher level filters +may also point directly to lower classes. + +If classification did not succeed, packets are enqueued to the leaf qdisc +attached to that class. Check qdisc specific manpages for details, however. + +.SH NAMING +All qdiscs, classes and filters have IDs, which can either be specified +or be automatically assigned. + +IDs consist of a major number and a minor number, separated by a colon. + +.TP +QDISCS +A qdisc, which potentially can have children, +gets assigned a major number, called a 'handle', leaving the minor +number namespace available for classes. The handle is expressed as '10:'. +It is customary to explicitly assign a handle to qdiscs expected to have +children. + +.TP +CLASSES +Classes residing under a qdisc share their qdisc major number, but each have +a separate minor number called a 'classid' that has no relation to their +parent classes, only to their parent qdisc. The same naming custom as for +qdiscs applies. + +.TP +FILTERS +Filters have a three part ID, which is only needed when using a hashed +filter hierarchy, for which see +.BR tc-filters (8). +.SH UNITS +All parameters accept a floating point number, possibly followed by a unit. +.P +Bandwidths or rates can be specified in: +.TP +kbps +Kilobytes per second +.TP +mbps +Megabytes per second +.TP +kbit +Kilobits per second +.TP +mbit +Megabits per second +.TP +bps or a bare number +Bytes per second +.P +Amounts of data can be specified in: +.TP +kb or k +Kilobytes +.TP +mb or m +Megabytes +.TP +mbit +Megabits +.TP +kbit +Kilobits +.TP +b or a bare number +Bytes. +.P +Lengths of time can be specified in: +.TP +s, sec or secs +Whole seconds +.TP +ms, msec or msecs +Milliseconds +.TP +us, usec, usecs or a bare number +Microseconds. + +.SH TC COMMANDS +The following commands are available for qdiscs, classes and filter: +.TP +add +Add a qdisc, class or filter to a node. For all entities, a +.B parent +must be passed, either by passing its ID or by attaching directly to the root of a device. +When creating a qdisc or a filter, it can be named with the +.B handle +parameter. A class is named with the +.B classid +parameter. + +.TP +remove +A qdisc can be removed by specifying its handle, which may also be 'root'. All subclasses and their leaf qdiscs +are automatically deleted, as well as any filters attached to them. + +.TP +change +Some entities can be modified 'in place'. Shares the syntax of 'add', with the exception +that the handle cannot be changed and neither can the parent. In other words, +.B +change +cannot move a node. + +.TP +replace +Performs a nearly atomic remove/add on an existing node id. If the node does not exist yet +it is created. + +.TP +link +Only available for qdiscs and performs a replace where the node +must exist already. + + +.SH HISTORY +.B tc +was written by Alexey N. Kuznetsov and added in Linux 2.2. +.SH SEE ALSO +.BR tc-cbq (8), +.BR tc-htb (8), +.BR tc-sfq (8), +.BR tc-red (8), +.BR tc-tbf (8), +.BR tc-pfifo (8), +.BR tc-bfifo (8), +.BR tc-pfifo_fast (8), +.BR tc-filters (8) + +.SH AUTHOR +Manpage maintained by bert hubert (ahu@ds9a.nl) + -- 2.43.0