/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */
#include <linux/config.h>
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>

#include <net/pkt_sched.h>
/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	----------------------------------------------------------------

	This started out as a simple way to delay outgoing packets to
	test TCP but has grown to include most of the functionality
	of a full blown network emulator like NISTnet. It can delay
	packets and add random jitter (and correlation). The random
	distribution can be loaded from a table as well to provide
	normal, Pareto, or experimental curves. Packet loss,
	duplication, and reordering can also be emulated.

	This qdisc does not do classification; that can be handled by
	layering other disciplines. It does not need to do bandwidth
	control either, since that can be handled by using a token
	bucket or other rate control.

	The simulator is limited by the Linux timer resolution
	and will create packet bursts on the HZ boundary (1 ms at
	HZ=1000).
*/
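/* As a usage sketch (assuming a matching iproute2/tc build is
 * installed; exact syntax depends on the tc version), a typical
 * configuration might be:
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *
 * i.e. 100ms of base delay, +/- 10ms of random jitter, with 25%
 * correlation between successive delay values.
 */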
struct netem_sched_data {
	struct Qdisc	*qdisc;
	struct sk_buff_head delayed;
	struct timer_list timer;

	u32 latency;
	u32 loss;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 jitter;
	u32 duplicate;

	struct crndstate {
		unsigned long last;
		unsigned long rho;
	} delay_cor, loss_cor, dup_cor;

	struct disttable {
		u32  size;
		s16 table[0];
	} *delay_dist;
};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};
/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}
/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static unsigned long get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
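/* In fixed point, the update above is an exponentially weighted moving
 * average: writing r = rho / 2^32 (so 0 <= r < 1),
 *
 *	answer = (1 - r) * value + r * last
 *
 * rho == 0 yields independent uniform values, while rho close to 2^32
 * makes each value track the previous one closely.
 */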
/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static long tabledist(unsigned long mu, long sigma,
		      struct crndstate *state, const struct disttable *dist)
{
	long t, x;
	unsigned long rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
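/* The table holds samples of the inverse CDF of the desired
 * distribution, scaled by NETEM_DIST_SCALE, so the result approximates
 * mu + sigma * t / NETEM_DIST_SCALE.  The quotient/remainder split of
 * sigma keeps the multiply from overflowing.  Worked example, assuming
 * NETEM_DIST_SCALE is 8192: sigma = 10000 and t = 4096 gives
 * (10000 % 8192) * 4096 / 8192 + (10000 / 8192) * 4096 = 904 + 4096,
 * i.e. 5000 = sigma * t / NETEM_DIST_SCALE, added to mu.
 */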
/* Put skb in the private delayed queue. */
static int delay_skb(struct Qdisc *sch, struct sk_buff *skb)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
	psched_tdiff_t td;
	psched_time_t now;

	PSCHED_GET_TIME(now);
	td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);
	PSCHED_TADD2(now, td, cb->time_to_send);

	/* Always queue at tail to keep packets in order */
	if (likely(q->delayed.qlen < q->limit)) {
		__skb_queue_tail(&q->delayed, skb);
		sch->q.qlen++;
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;

		if (!timer_pending(&q->timer)) {
			q->timer.expires = jiffies + PSCHED_US2JIFFIE(td);
			add_timer(&q->timer);
		}
		return NET_XMIT_SUCCESS;
	}

	sch->qstats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies);

	/* Random packet drop 0 => none, ~0 => all */
	if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
		pr_debug("netem_enqueue: random loss\n");
		sch->qstats.drops++;
		kfree_skb(skb);
		return 0;	/* lie about loss so TCP doesn't know */
	}

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) {
		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

		pr_debug("netem_enqueue: dup %p\n", skb2);
		if (skb2)
			delay_skb(sch, skb2);
	}

	/* If doing simple delay then gap == 0, so all packets
	 * go into the delayed holding queue.
	 * Otherwise, when doing out-of-order emulation, only
	 * "1 out of gap" packets will be delayed.
	 */
	if (q->counter < q->gap) {
		int ret;

		++q->counter;
		ret = q->qdisc->enqueue(skb, q->qdisc);
		if (likely(ret == NET_XMIT_SUCCESS)) {
			sch->q.qlen++;
			sch->bstats.bytes += skb->len;
			sch->bstats.packets++;
		} else
			sch->qstats.drops++;
		return ret;
	}

	q->counter = 0;

	return delay_skb(sch, skb);
}
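/* For illustration: with a large latency and gap set to 5, runs of
 * packets pass straight through to the inner qdisc and roughly one
 * packet per run is diverted into the delayed queue; the diverted
 * packet then leaves after packets transmitted later, which shows up
 * as reordering at the receiver.
 */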
/* Requeue packets but don't change time stamp */
static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}
static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}
/* Dequeue packet.
 * The watchdog timer moves packets that are ready to send from the
 * delay holding list to the underlying qdisc; here we just dequeue.
 */
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->dequeue(q->qdisc);
	if (skb)
		sch->q.qlen--;
	return skb;
}
static void netem_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct netem_sched_data *q = qdisc_priv(sch);
	struct net_device *dev = sch->dev;
	struct sk_buff *skb;
	psched_time_t now;

	pr_debug("netem_watchdog: fired @%lu\n", jiffies);

	spin_lock_bh(&dev->queue_lock);
	PSCHED_GET_TIME(now);

	while ((skb = skb_peek(&q->delayed)) != NULL) {
		const struct netem_skb_cb *cb
			= (const struct netem_skb_cb *)skb->cb;
		long delay
			= PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
		pr_debug("netem_watchdog: skb %p@%lu %ld\n",
			 skb, jiffies, delay);

		/* is there more time remaining? */
		if (delay > 0) {
			mod_timer(&q->timer, jiffies + delay);
			break;
		}

		__skb_unlink(skb, &q->delayed);

		if (q->qdisc->enqueue(skb, q->qdisc)) {
			sch->q.qlen--;
			sch->qstats.drops++;
		}
	}
	qdisc_run(dev);
	spin_unlock_bh(&dev->queue_lock);
}
static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	skb_queue_purge(&q->delayed);

	sch->q.qlen = 0;
	del_timer_sync(&q->timer);
}
static int set_fifo_limit(struct Qdisc *q, int limit)
{
	struct rtattr *rta;
	int ret = -ENOMEM;

	rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
	if (rta) {
		rta->rta_type = RTM_NEWQDISC;
		rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
		((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

		ret = q->ops->change(q, rta);
		kfree(rta);
	}
	return ret;
}
/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
	const __s16 *data = RTA_DATA(attr);
	struct disttable *d;
	int i;

	if (n > 65536)
		return -EINVAL;

	d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	spin_lock_bh(&sch->dev->queue_lock);
	d = xchg(&q->delay_dist, d);
	spin_unlock_bh(&sch->dev->queue_lock);

	kfree(d);
	return 0;
}
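/* The table itself is built in user space; the iproute2 package ships
 * ready-made distribution files (e.g. normal and Pareto) and a
 * maketable utility for turning measured delay data into an
 * inverse-CDF table, which tc passes down in the TCA_NETEM_DELAY_DIST
 * attribute handled here.
 */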
static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = RTA_DATA(attr);

	if (RTA_PAYLOAD(attr) != sizeof(*c))
		return -EINVAL;

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
	return 0;
}
static int netem_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
		return -EINVAL;

	qopt = RTA_DATA(opt);
	ret = set_fifo_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_debug("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* Handle nested options after initial queue options.
	 * Should have put all options in nested format but too late now.
	 */
	if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
		struct rtattr *tb[TCA_NETEM_MAX];
		if (rtattr_parse(tb, TCA_NETEM_MAX,
				 RTA_DATA(opt) + sizeof(*qopt),
				 RTA_PAYLOAD(opt) - sizeof(*qopt)))
			return -EINVAL;

		if (tb[TCA_NETEM_CORR-1]) {
			ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
			if (ret)
				return ret;
		}

		if (tb[TCA_NETEM_DELAY_DIST-1]) {
			ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
			if (ret)
				return ret;
		}
	}

	return 0;
}
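/* The resulting netlink option layout is a fixed struct tc_netem_qopt
 * header (latency, limit, loss, gap, duplicate, jitter) optionally
 * followed by nested attributes packed directly after it:
 *
 *	struct tc_netem_qopt
 *	[TCA_NETEM_CORR]       struct tc_netem_corr
 *	[TCA_NETEM_DELAY_DIST] array of __s16
 */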
static int netem_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	skb_queue_head_init(&q->delayed);
	init_timer(&q->timer);
	q->timer.function = netem_watchdog;
	q->timer.data = (unsigned long) sch;
	q->counter = 0;

	q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
	if (!q->qdisc) {
		pr_debug("netem: qdisc create failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_debug("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}
static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	del_timer_sync(&q->timer);
	qdisc_destroy(q->qdisc);
	kfree(q->delay_dist);
}
static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb->tail;
	struct rtattr *rta = (struct rtattr *) b;
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}
static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_reset(*old);
	sch->q.qlen = 0;
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			      struct rtattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int netem_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}
static struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.change		=	netem_change_class,
	.delete		=	netem_delete,
	.walk		=	netem_walk,
	.tcf_chain	=	netem_find_tcf,
	.dump		=	netem_dump_class,
};
static struct Qdisc_ops netem_qdisc_ops = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.requeue	=	netem_requeue,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};
static int __init netem_module_init(void)
{
	return register_qdisc(&netem_qdisc_ops);
}

static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}

module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");