net/sched/sch_red.c

   1 /*
   2  * net/sched/sch_red.c  Random Early Detection queue.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  *
  11  * Changes:
  12  * J Hadi Salim <hadi@nortel.com> 980914:       computation fixes
  13  * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
  14  * J Hadi Salim <hadi@nortelnetworks.com> 980816:  ECN support
  15  */
  16
  17 #include <linux/config.h>
  18 #include <linux/module.h>
  19 #include <asm/uaccess.h>
  20 #include <asm/system.h>
  21 #include <asm/bitops.h>
  22 #include <linux/types.h>
  23 #include <linux/kernel.h>
  24 #include <linux/sched.h>
  25 #include <linux/string.h>
  26 #include <linux/mm.h>
  27 #include <linux/socket.h>
  28 #include <linux/sockios.h>
  29 #include <linux/in.h>
  30 #include <linux/errno.h>
  31 #include <linux/interrupt.h>
  32 #include <linux/if_ether.h>
  33 #include <linux/inet.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/etherdevice.h>
  36 #include <linux/notifier.h>
  37 #include <net/ip.h>
  38 #include <net/route.h>
  39 #include <linux/skbuff.h>
  40 #include <net/sock.h>
  41 #include <net/pkt_sched.h>
  42 #include <net/inet_ecn.h>
  43
  44
  45 /*      Random Early Detection (RED) algorithm.
  46         =======================================
  47
  48         Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
  49         for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
  50
  51         This file codes a "divisionless" version of RED algorithm
  52         as written down in Fig.17 of the paper.
  53
  54 Short description.
  55 ------------------
  56
  57         When a new packet arrives we calculate the average queue length:
  58
  59         avg = (1-W)*avg + W*current_queue_len,
  60
  61         W is the filter time constant (chosen as 2^(-Wlog)), it controls
  62         the inertia of the algorithm. To allow larger bursts, W should be
  63         decreased.
  64
  65         if (avg > th_max) -> packet marked (dropped).
  66         if (avg < th_min) -> packet passes.
  67         if (th_min < avg < th_max) we calculate probability:
  68
  69         Pb = max_P * (avg - th_min)/(th_max-th_min)
  70
  71         and mark (drop) packet with this probability.
  72         Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
  73         max_P should be small (not 1), usually 0.01..0.02 is good value.
  74
  75         max_P is chosen as a number, so that max_P/(th_max-th_min)
  76         is a negative power of two in order arithmetics to contain
  77         only shifts.
  78
  79
  80         Parameters, settable by user:
  81         -----------------------------
  82
  83         limit           - bytes (must be > qth_max + burst)
  84
  85         Hard limit on queue length, should be chosen >qth_max
  86         to allow packet bursts. This parameter does not
  87         affect the algorithms behaviour and can be chosen
  88         arbitrarily high (well, less than ram size)
  89         Really, this limit will never be reached
  90         if RED works correctly.
  91
  92         qth_min         - bytes (should be < qth_max/2)
  93         qth_max         - bytes (should be at least 2*qth_min and less limit)
  94         Wlog            - bits (<32) log(1/W).
  95         Plog            - bits (<32)
  96
  97         Plog is related to max_P by formula:
  98
  99         max_P = (qth_max-qth_min)/2^Plog;
 100
 101         F.e. if qth_max=128K and qth_min=32K, then Plog=22
 102         corresponds to max_P=0.02
 103
 104         Scell_log
 105         Stab
 106
 107         Lookup table for log((1-W)^(t/t_ave).
 108
 109
 110 NOTES:
 111
 112 Upper bound on W.
 113 -----------------
 114
 115         If you want to allow bursts of L packets of size S,
 116         you should choose W:
 117
 118         L + 1 - th_min/S < (1-(1-W)^L)/W
 119
 120         th_min/S = 32         th_min/S = 4
 121
 122         log(W)  L
 123         -1      33
 124         -2      35
 125         -3      39
 126         -4      46
 127         -5      57
 128         -6      75
 129         -7      101
 130         -8      135
 131         -9      190
 132         etc.
 133  */
 134
 135 struct red_sched_data
 136 {
 137 /* Parameters */
 138         u32             limit;          /* HARD maximal queue length    */
 139         u32             qth_min;        /* Min average length threshold: A scaled */
 140         u32             qth_max;        /* Max average length threshold: A scaled */
 141         u32             Rmask;
 142         u32             Scell_max;
 143         unsigned char   flags;
 144         char            Wlog;           /* log(W)               */
 145         char            Plog;           /* random number bits   */
 146         char            Scell_log;
 147         u8              Stab[256];
 148
 149 /* Variables */
 150         unsigned long   qave;           /* Average queue length: A scaled */
 151         int             qcount;         /* Packets since last random number generation */
 152         u32             qR;             /* Cached random number */
 153
 154         psched_time_t   qidlestart;     /* Start of idle period         */
 155         struct tc_red_xstats st;
 156 };
 157
 158 static int red_ecn_mark(struct sk_buff *skb)
 159 {
 160         if (skb->nh.raw + 20 > skb->tail)
 161                 return 0;
 162
 163         switch (skb->protocol) {
 164         case __constant_htons(ETH_P_IP):
 165                 if (!INET_ECN_is_capable(skb->nh.iph->tos))
 166                         return 0;
 167                 if (INET_ECN_is_not_ce(skb->nh.iph->tos))
 168                         IP_ECN_set_ce(skb->nh.iph);
 169                 return 1;
 170         case __constant_htons(ETH_P_IPV6):
 171                 if (!INET_ECN_is_capable(ip6_get_dsfield(skb->nh.ipv6h)))
 172                         return 0;
 173                 IP6_ECN_set_ce(skb->nh.ipv6h);
 174                 return 1;
 175         default:
 176                 return 0;
 177         }
 178 }
 179
 180 static int
 181 red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 182 {
 183         struct red_sched_data *q = qdisc_priv(sch);
 184
 185         psched_time_t now;
 186
 187         if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
 188                 long us_idle;
 189                 int  shift;
 190
 191                 PSCHED_GET_TIME(now);
 192                 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
 193                 PSCHED_SET_PASTPERFECT(q->qidlestart);
 194
 195 /*
 196    The problem: ideally, average length queue recalcultion should
 197    be done over constant clock intervals. This is too expensive, so that
 198    the calculation is driven by outgoing packets.
 199    When the queue is idle we have to model this clock by hand.
 200
 201    SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
 202    dummy packets as a burst after idle time, i.e.
 203
 204           q->qave *= (1-W)^m
 205
 206    This is an apparently overcomplicated solution (f.e. we have to precompute
 207    a table to make this calculation in reasonable time)
 208    I believe that a simpler model may be used here,
 209    but it is field for experiments.
 210 */
 211                 shift = q->Stab[us_idle>>q->Scell_log];
 212
 213                 if (shift) {
 214                         q->qave >>= shift;
 215                 } else {
 216                         /* Approximate initial part of exponent
 217                            with linear function:
 218                            (1-W)^m ~= 1-mW + ...
 219
 220                            Seems, it is the best solution to
 221                            problem of too coarce exponent tabulation.
 222                          */
 223
 224                         us_idle = (q->qave * us_idle)>>q->Scell_log;
 225                         if (us_idle < q->qave/2)
 226                                 q->qave -= us_idle;
 227                         else
 228                                 q->qave >>= 1;
 229                 }
 230         } else {
 231                 q->qave += sch->stats.backlog - (q->qave >> q->Wlog);
 232                 /* NOTE:
 233                    q->qave is fixed point number with point at Wlog.
 234                    The formulae above is equvalent to floating point
 235                    version:
 236
 237                    qave = qave*(1-W) + sch->stats.backlog*W;
 238                                                            --ANK (980924)
 239                  */
 240         }
 241
 242         if (q->qave < q->qth_min) {
 243                 q->qcount = -1;
 244 enqueue:
 245                 if (sch->stats.backlog + skb->len <= q->limit) {
 246                         __skb_queue_tail(&sch->q, skb);
 247                         sch->stats.backlog += skb->len;
 248                         sch->stats.bytes += skb->len;
 249                         sch->stats.packets++;
 250                         return NET_XMIT_SUCCESS;
 251                 } else {
 252                         q->st.pdrop++;
 253                 }
 254                 kfree_skb(skb);
 255                 sch->stats.drops++;
 256                 return NET_XMIT_DROP;
 257         }
 258         if (q->qave >= q->qth_max) {
 259                 q->qcount = -1;
 260                 sch->stats.overlimits++;
 261 mark:
 262                 if  (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
 263                         q->st.early++;
 264                         goto drop;
 265                 }
 266                 q->st.marked++;
 267                 goto enqueue;
 268         }
 269
 270         if (++q->qcount) {
 271                 /* The formula used below causes questions.
 272
 273                    OK. qR is random number in the interval 0..Rmask
 274                    i.e. 0..(2^Plog). If we used floating point
 275                    arithmetics, it would be: (2^Plog)*rnd_num,
 276                    where rnd_num is less 1.
 277
 278                    Taking into account, that qave have fixed
 279                    point at Wlog, and Plog is related to max_P by
 280                    max_P = (qth_max-qth_min)/2^Plog; two lines
 281                    below have the following floating point equivalent:
 282
 283                    max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
 284
 285                    Any questions? --ANK (980924)
 286                  */
 287                 if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
 288                         goto enqueue;
 289                 q->qcount = 0;
 290                 q->qR = net_random()&q->Rmask;
 291                 sch->stats.overlimits++;
 292                 goto mark;
 293         }
 294         q->qR = net_random()&q->Rmask;
 295         goto enqueue;
 296
 297 drop:
 298         kfree_skb(skb);
 299         sch->stats.drops++;
 300         return NET_XMIT_CN;
 301 }
 302
 303 static int
 304 red_requeue(struct sk_buff *skb, struct Qdisc* sch)
 305 {
 306         struct red_sched_data *q = qdisc_priv(sch);
 307
 308         PSCHED_SET_PASTPERFECT(q->qidlestart);
 309
 310         __skb_queue_head(&sch->q, skb);
 311         sch->stats.backlog += skb->len;
 312         return 0;
 313 }
 314
 315 static struct sk_buff *
 316 red_dequeue(struct Qdisc* sch)
 317 {
 318         struct sk_buff *skb;
 319         struct red_sched_data *q = qdisc_priv(sch);
 320
 321         skb = __skb_dequeue(&sch->q);
 322         if (skb) {
 323                 sch->stats.backlog -= skb->len;
 324                 return skb;
 325         }
 326         PSCHED_GET_TIME(q->qidlestart);
 327         return NULL;
 328 }
 329
 330 static unsigned int red_drop(struct Qdisc* sch)
 331 {
 332         struct sk_buff *skb;
 333         struct red_sched_data *q = qdisc_priv(sch);
 334
 335         skb = __skb_dequeue_tail(&sch->q);
 336         if (skb) {
 337                 unsigned int len = skb->len;
 338                 sch->stats.backlog -= len;
 339                 sch->stats.drops++;
 340                 q->st.other++;
 341                 kfree_skb(skb);
 342                 return len;
 343         }
 344         PSCHED_GET_TIME(q->qidlestart);
 345         return 0;
 346 }
 347
 348 static void red_reset(struct Qdisc* sch)
 349 {
 350         struct red_sched_data *q = qdisc_priv(sch);
 351
 352         __skb_queue_purge(&sch->q);
 353         sch->stats.backlog = 0;
 354         PSCHED_SET_PASTPERFECT(q->qidlestart);
 355         q->qave = 0;
 356         q->qcount = -1;
 357 }
 358
 359 static int red_change(struct Qdisc *sch, struct rtattr *opt)
 360 {
 361         struct red_sched_data *q = qdisc_priv(sch);
 362         struct rtattr *tb[TCA_RED_STAB];
 363         struct tc_red_qopt *ctl;
 364
 365         if (opt == NULL ||
 366             rtattr_parse(tb, TCA_RED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
 367             tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
 368             RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
 369             RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
 370                 return -EINVAL;
 371
 372         ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
 373
 374         sch_tree_lock(sch);
 375         q->flags = ctl->flags;
 376         q->Wlog = ctl->Wlog;
 377         q->Plog = ctl->Plog;
 378         q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
 379         q->Scell_log = ctl->Scell_log;
 380         q->Scell_max = (255<<q->Scell_log);
 381         q->qth_min = ctl->qth_min<<ctl->Wlog;
 382         q->qth_max = ctl->qth_max<<ctl->Wlog;
 383         q->limit = ctl->limit;
 384         memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
 385
 386         q->qcount = -1;
 387         if (skb_queue_len(&sch->q) == 0)
 388                 PSCHED_SET_PASTPERFECT(q->qidlestart);
 389         sch_tree_unlock(sch);
 390         return 0;
 391 }
 392
 393 static int red_init(struct Qdisc* sch, struct rtattr *opt)
 394 {
 395         return red_change(sch, opt);
 396 }
 397
 398
 399 int red_copy_xstats(struct sk_buff *skb, struct tc_red_xstats *st)
 400 {
 401         RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st);
 402         return 0;
 403
 404 rtattr_failure:
 405         return 1;
 406 }
 407
 408 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 409 {
 410         struct red_sched_data *q = qdisc_priv(sch);
 411         unsigned char    *b = skb->tail;
 412         struct rtattr *rta;
 413         struct tc_red_qopt opt;
 414
 415         rta = (struct rtattr*)b;
 416         RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 417         opt.limit = q->limit;
 418         opt.qth_min = q->qth_min>>q->Wlog;
 419         opt.qth_max = q->qth_max>>q->Wlog;
 420         opt.Wlog = q->Wlog;
 421         opt.Plog = q->Plog;
 422         opt.Scell_log = q->Scell_log;
 423         opt.flags = q->flags;
 424         RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
 425         rta->rta_len = skb->tail - b;
 426
 427         if (red_copy_xstats(skb, &q->st))
 428                 goto rtattr_failure;
 429
 430         return skb->len;
 431
 432 rtattr_failure:
 433         skb_trim(skb, b - skb->data);
 434         return -1;
 435 }
 436
 437 static struct Qdisc_ops red_qdisc_ops = {
 438         .next           =       NULL,
 439         .cl_ops         =       NULL,
 440         .id             =       "red",
 441         .priv_size      =       sizeof(struct red_sched_data),
 442         .enqueue        =       red_enqueue,
 443         .dequeue        =       red_dequeue,
 444         .requeue        =       red_requeue,
 445         .drop           =       red_drop,
 446         .init           =       red_init,
 447         .reset          =       red_reset,
 448         .change         =       red_change,
 449         .dump           =       red_dump,
 450         .owner          =       THIS_MODULE,
 451 };
 452
 453 static int __init red_module_init(void)
 454 {
 455         return register_qdisc(&red_qdisc_ops);
 456 }
 457 static void __exit red_module_exit(void)
 458 {
 459         unregister_qdisc(&red_qdisc_ops);
 460 }
 461 module_init(red_module_init)
 462 module_exit(red_module_exit)
 463 MODULE_LICENSE("GPL");