2 * net/sched/sch_netem.c Network emulator
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Many of the algorithms and ideas for this came from
10 * NIST Net which is not copyrighted.
12 * Authors: Stephen Hemminger <shemminger@osdl.org>
13 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
16 #include <linux/config.h>
17 #include <linux/module.h>
18 #include <linux/bitops.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/netdevice.h>
23 #include <linux/skbuff.h>
24 #include <linux/rtnetlink.h>
26 #include <net/pkt_sched.h>
28 /* Network Emulation Queuing algorithm.
29 ====================================
31 Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
32 Network Emulation Tool
33 [2] Luigi Rizzo, DummyNet for FreeBSD
35 ----------------------------------------------------------------
37 This started out as a simple way to delay outgoing packets to
38 test TCP but has grown to include most of the functionality
39 of a full blown network emulator like NISTnet. It can delay
40 packets and add random jitter (and correlation). The random
41 distribution can be loaded from a table as well to provide
42 normal, Pareto, or experimental curves. Packet loss,
43 duplication, and reordering can also be emulated.
45 This qdisc does not do classification that can be handled in
46 layering other disciplines. It does not need to do bandwidth
47 control either since that can be handled by using token
48 bucket or other rate control.
50 The simulator is limited by the Linux timer resolution
51 and will create packet bursts on the HZ boundary (1ms).
/* Per-qdisc private state for netem.
 * NOTE(review): elided extract -- fields between the embedded original
 * line numbers (54..80) are missing from this view.
 */
54 struct netem_sched_data {
/* Packets held back until their scheduled departure time. */
56 struct sk_buff_head delayed;
/* Kicks netem_watchdog() to move ripe packets to the inner qdisc. */
57 struct timer_list timer;
/* Correlated-random state for delay jitter, loss and duplication. */
70 } delay_cor, loss_cor, dup_cor;
78 /* Time stamp put into socket buffer control block */
/* Scheduled send time, stored per-skb in skb->cb (netem_skb_cb). */
80 psched_time_t time_to_send;
83 /* init_crandom - initialize correlated random number generator
84 * Use entropy source for initial seed.
86 static void init_crandom(struct crndstate *state, unsigned long rho)
/* Seed the correlation history from the kernel PRNG.
 * NOTE(review): the rho assignment is elided from this view. */
89 state->last = net_random();
92 /* get_crandom - correlated random number generator
93 * Next number depends on last value.
94 * rho is scaled to avoid floating point.
96 static unsigned long get_crandom(struct crndstate *state)
/* rho == 0: successive values are independent, plain net_random(). */
101 if (state->rho == 0) /* no correlation */
104 value = net_random();
/* Fixed-point (32 fractional bits) weighted blend:
 * answer = value*(1 - rho) + last*rho, done in 64-bit to avoid overflow. */
105 rho = (u64)state->rho + 1;
106 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
/* Remember this value so the next call can correlate against it. */
107 state->last = answer;
111 /* tabledist - return a pseudo-randomly distributed value with mean mu and
112 * std deviation sigma. Uses table lookup to approximate the desired
113 * distribution, and a uniformly-distributed pseudo-random source.
115 static long tabledist(unsigned long mu, long sigma,
116 struct crndstate *state, const struct disttable *dist)
124 rnd = get_crandom(state);
126 /* default uniform distribution */
/* No table loaded: uniform in [mu - sigma, mu + sigma). */
128 return (rnd % (2*sigma)) - sigma + mu;
/* t is a signed table sample scaled by NETEM_DIST_SCALE. */
130 t = dist->table[rnd % dist->size];
/* Split sigma into quotient/remainder w.r.t. NETEM_DIST_SCALE so the
 * multiply stays in range; x carries the remainder contribution. */
131 x = (sigma % NETEM_DIST_SCALE) * t;
/* Round x to nearest before the integer division below
 * (the sign test selecting between these two is elided from view). */
133 x += NETEM_DIST_SCALE/2;
135 x -= NETEM_DIST_SCALE/2;
137 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
140 /* Put skb in the private delayed queue. */
141 static int delay_skb(struct Qdisc *sch, struct sk_buff *skb)
143 struct netem_sched_data *q = qdisc_priv(sch);
144 struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
/* Compute departure time = now + (latency with jitter, optionally from
 * a loaded distribution table), stored in the skb control block. */
148 PSCHED_GET_TIME(now);
149 td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);
150 PSCHED_TADD2(now, td, cb->time_to_send);
152 /* Always queue at tail to keep packets in order */
153 if (likely(q->delayed.qlen < q->limit)) {
154 __skb_queue_tail(&q->delayed, skb);
155 sch->bstats.bytes += skb->len;
156 sch->bstats.packets++;
/* Arm the watchdog only if not already pending; an earlier-expiring
 * timer is already good enough since packets are in send order. */
158 if (!timer_pending(&q->timer)) {
159 q->timer.expires = jiffies + PSCHED_US2JIFFIE(td);
160 add_timer(&q->timer);
162 return NET_XMIT_SUCCESS;
/* Holding queue full: drop (drop accounting elided from this view). */
167 return NET_XMIT_DROP;
/* Qdisc enqueue entry point: applies loss, duplication, and reorder/delay
 * policy to each packet before it reaches the inner fifo qdisc. */
170 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
172 struct netem_sched_data *q = qdisc_priv(sch);
174 pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies);
176 /* Random packet drop 0 => none, ~0 => all */
177 if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
178 pr_debug("netem_enqueue: random loss\n");
/* Report success so the sender's stack does not see a local drop
 * (the kfree_skb of the dropped packet is elided from this view). */
181 return 0; /* lie about loss so TCP doesn't know */
184 /* Random duplication */
185 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) {
/* Clone and push the copy through the delay path as well.
 * NOTE(review): the skb2 NULL check is elided from this view. */
186 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
188 pr_debug("netem_enqueue: dup %p\n", skb2);
190 delay_skb(sch, skb2);
193 /* If doing simple delay then gap == 0 so all packets
194 * go into the delayed holding queue
195 * otherwise if doing out of order only "1 out of gap"
196 * packets will be delayed.
/* Within the gap window: bypass the delay queue, enqueue directly on
 * the inner qdisc so this packet overtakes the delayed ones. */
198 if (q->counter < q->gap) {
202 ret = q->qdisc->enqueue(skb, q->qdisc);
203 if (likely(ret == NET_XMIT_SUCCESS)) {
205 sch->bstats.bytes += skb->len;
206 sch->bstats.packets++;
/* Default path: hold the packet in the delayed queue until ripe. */
214 return delay_skb(sch, skb);
217 /* Requeue packets but don't change time stamp */
218 static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
220 struct netem_sched_data *q = qdisc_priv(sch);
/* Delegate to the inner qdisc; on success bump requeue stats
 * (qlen accounting elided from this view). */
223 if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
225 sch->qstats.requeues++;
/* Drop one packet from the inner qdisc; returns the dropped length
 * (0 if nothing could be dropped). Stats updates elided from view. */
231 static unsigned int netem_drop(struct Qdisc* sch)
233 struct netem_sched_data *q = qdisc_priv(sch);
236 if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
244 * Move all packets that are ready to send from the delay holding
245 * list to the underlying qdisc, then just call dequeue
247 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
249 struct netem_sched_data *q = qdisc_priv(sch);
/* Ripe packets were migrated by netem_watchdog(); just pull from the
 * inner qdisc (qlen bookkeeping elided from this view). */
252 skb = q->qdisc->dequeue(q->qdisc);
/* Timer callback: under dev->queue_lock, move every packet whose
 * time_to_send has arrived from the delayed list into the inner qdisc,
 * re-arming the timer for the first packet that is still early. */
258 static void netem_watchdog(unsigned long arg)
260 struct Qdisc *sch = (struct Qdisc *)arg;
261 struct netem_sched_data *q = qdisc_priv(sch);
262 struct net_device *dev = sch->dev;
266 pr_debug("netem_watchdog: fired @%lu\n", jiffies);
268 spin_lock_bh(&dev->queue_lock);
269 PSCHED_GET_TIME(now);
/* Delayed list is in send order, so stop at the first early packet. */
271 while ((skb = skb_peek(&q->delayed)) != NULL) {
272 const struct netem_skb_cb *cb
273 = (const struct netem_skb_cb *)skb->cb;
/* delay = jiffies until this packet's scheduled send time
 * (the declaration line is elided from this view). */
275 = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
276 pr_debug("netem_watchdog: skb %p@%lu %ld\n",
277 skb, jiffies, delay);
279 /* if more time remaining? */
/* Head packet not ripe yet: re-arm and quit the loop. */
281 mod_timer(&q->timer, jiffies + delay);
/* Ripe: detach from the holding list and hand to the inner qdisc
 * (enqueue failure handling elided from this view). */
284 __skb_unlink(skb, &q->delayed);
286 if (q->qdisc->enqueue(skb, q->qdisc))
292 spin_unlock_bh(&dev->queue_lock);
/* Reset: flush the inner qdisc and the private holding queue, and stop
 * the watchdog (del_timer_sync waits for a running handler to finish). */
295 static void netem_reset(struct Qdisc *sch)
297 struct netem_sched_data *q = qdisc_priv(sch);
299 qdisc_reset(q->qdisc);
300 skb_queue_purge(&q->delayed);
303 del_timer_sync(&q->timer);
/* Push a new queue limit into the inner fifo qdisc by building a fake
 * netlink tc_fifo_qopt attribute and invoking its ->change() op.
 * NOTE(review): kmalloc NULL check and kfree are elided from this view. */
306 static int set_fifo_limit(struct Qdisc *q, int limit)
311 rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
313 rta->rta_type = RTM_NEWQDISC;
314 rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
315 ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
317 ret = q->ops->change(q, rta);
324 * Distribution data is a variable size payload containing
325 * signed 16 bit values.
327 static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
329 struct netem_sched_data *q = qdisc_priv(sch);
/* n = number of 16-bit samples in the netlink payload. */
330 unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
331 const __s16 *data = RTA_DATA(attr);
/* Table plus flexible sample array in one allocation.
 * NOTE(review): the NULL check and d->size assignment are elided. */
338 d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
343 for (i = 0; i < n; i++)
344 d->table[i] = data[i];
/* Swap in the new table under queue_lock so the enqueue path never
 * sees a half-built table; old table freed after unlock (elided). */
346 spin_lock_bh(&sch->dev->queue_lock);
347 d = xchg(&q->delay_dist, d);
348 spin_unlock_bh(&sch->dev->queue_lock);
/* Parse a TCA_NETEM_CORR attribute and (re)seed the three correlated
 * random generators. Rejects a payload of the wrong size. */
354 static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
356 struct netem_sched_data *q = qdisc_priv(sch);
357 const struct tc_netem_corr *c = RTA_DATA(attr);
359 if (RTA_PAYLOAD(attr) != sizeof(*c))
362 init_crandom(&q->delay_cor, c->delay_corr);
363 init_crandom(&q->loss_cor, c->loss_corr);
364 init_crandom(&q->dup_cor, c->dup_corr);
/* Apply new qdisc parameters from the netlink tc_netem_qopt blob,
 * then parse optional nested attributes appended after it. */
368 static int netem_change(struct Qdisc *sch, struct rtattr *opt)
370 struct netem_sched_data *q = qdisc_priv(sch);
371 struct tc_netem_qopt *qopt;
/* Reject a missing or too-short options payload. */
374 if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
377 qopt = RTA_DATA(opt);
/* Propagate the limit to the inner fifo before adopting it locally. */
378 ret = set_fifo_limit(q->qdisc, qopt->limit);
380 pr_debug("netem: can't set fifo limit\n");
384 q->latency = qopt->latency;
385 q->jitter = qopt->jitter;
386 q->limit = qopt->limit;
388 q->loss = qopt->loss;
389 q->duplicate = qopt->duplicate;
391 /* Handle nested options after initial queue options.
392 * Should have put all options in nested format but too late now.
394 if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
395 struct rtattr *tb[TCA_NETEM_MAX];
/* Nested attributes start immediately after the fixed qopt struct. */
396 if (rtattr_parse(tb, TCA_NETEM_MAX,
397 RTA_DATA(opt) + sizeof(*qopt),
398 RTA_PAYLOAD(opt) - sizeof(*qopt)))
401 if (tb[TCA_NETEM_CORR-1]) {
402 ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
407 if (tb[TCA_NETEM_DELAY_DIST-1]) {
408 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
/* Qdisc init: set up the holding queue and watchdog timer, create the
 * default inner pfifo, then apply the caller's options; the inner qdisc
 * is destroyed again if netem_change() fails. */
418 static int netem_init(struct Qdisc *sch, struct rtattr *opt)
420 struct netem_sched_data *q = qdisc_priv(sch);
426 skb_queue_head_init(&q->delayed);
427 init_timer(&q->timer);
428 q->timer.function = netem_watchdog;
/* Timer callback receives the qdisc pointer via the data word. */
429 q->timer.data = (unsigned long) sch;
432 q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
434 pr_debug("netem: qdisc create failed\n");
438 ret = netem_change(sch, opt);
440 pr_debug("netem: change failed\n");
441 qdisc_destroy(q->qdisc);
/* Teardown mirrors netem_init/get_dist_table: stop the watchdog,
 * destroy the inner qdisc, free the optional distribution table
 * (kfree is a no-op if no table was ever loaded). */
446 static void netem_destroy(struct Qdisc *sch)
448 struct netem_sched_data *q = qdisc_priv(sch);
450 del_timer_sync(&q->timer);
451 qdisc_destroy(q->qdisc);
452 kfree(q->delay_dist);
/* Dump current configuration to a netlink message: the fixed qopt
 * struct as TCA_OPTIONS, followed by a nested TCA_NETEM_CORR, then
 * patch rta_len to cover everything written.
 * RTA_PUT jumps to rtattr_failure on overflow, where the partial
 * message is trimmed away (label elided from this view). */
455 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
457 const struct netem_sched_data *q = qdisc_priv(sch);
458 unsigned char *b = skb->tail;
459 struct rtattr *rta = (struct rtattr *) b;
460 struct tc_netem_qopt qopt;
461 struct tc_netem_corr cor;
463 qopt.latency = q->latency;
464 qopt.jitter = q->jitter;
465 qopt.limit = q->limit;
468 qopt.duplicate = q->duplicate;
469 RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
471 cor.delay_corr = q->delay_cor.rho;
472 cor.loss_corr = q->loss_cor.rho;
473 cor.dup_corr = q->dup_cor.rho;
474 RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
/* Fix up the outer attribute length now that the size is known. */
475 rta->rta_len = skb->tail - b;
/* rtattr_failure path: undo everything written by this function. */
480 skb_trim(skb, b - skb->data);
/* Dump the single pseudo-class (netem wraps exactly one inner qdisc):
 * report its minor handle and the inner qdisc's handle. */
484 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
485 struct sk_buff *skb, struct tcmsg *tcm)
487 struct netem_sched_data *q = qdisc_priv(sch);
489 if (cl != 1) /* only one class */
492 tcm->tcm_handle |= TC_H_MIN(1);
493 tcm->tcm_info = q->qdisc->handle;
/* Replace the inner qdisc atomically (xchg) under the tree lock and
 * hand the previous one back to the caller via *old.
 * NOTE(review): sch_tree_lock/reset of the old qdisc are elided. */
498 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
501 struct netem_sched_data *q = qdisc_priv(sch);
507 *old = xchg(&q->qdisc, new);
510 sch_tree_unlock(sch);
/* Return the inner qdisc for the single pseudo-class. */
515 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
517 struct netem_sched_data *q = qdisc_priv(sch);
/* Class reference ops are trivial: there is exactly one class and no
 * refcounting (bodies elided from this view). */
521 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
526 static void netem_put(struct Qdisc *sch, unsigned long arg)
/* Classes cannot be created, changed, or deleted on netem; these
 * presumably return an error (bodies elided from this view). */
530 static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
531 struct rtattr **tca, unsigned long *arg)
536 static int netem_delete(struct Qdisc *sch, unsigned long arg)
/* Walk visits the one class, honoring the walker's skip/count/stop
 * protocol. */
541 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
544 if (walker->count >= walker->skip)
545 if (walker->fn(sch, 1, walker) < 0) {
/* No classifier chain is attached to netem. */
553 static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
/* Class-ops vtable exposing the single inner qdisc to the tc class API
 * (leaf/get/put/walk entries elided from this view). */
558 static struct Qdisc_class_ops netem_class_ops = {
559 .graft = netem_graft,
563 .change = netem_change_class,
564 .delete = netem_delete,
566 .tcf_chain = netem_find_tcf,
567 .dump = netem_dump_class,
/* Qdisc-ops vtable registered with the packet scheduler core
 * (.id, .init, .drop, .dump entries elided from this view). */
570 static struct Qdisc_ops netem_qdisc_ops = {
572 .cl_ops = &netem_class_ops,
573 .priv_size = sizeof(struct netem_sched_data),
574 .enqueue = netem_enqueue,
575 .dequeue = netem_dequeue,
576 .requeue = netem_requeue,
579 .reset = netem_reset,
580 .destroy = netem_destroy,
581 .change = netem_change,
583 .owner = THIS_MODULE,
/* Module entry/exit: register/unregister the "netem" qdisc ops with
 * the scheduler core. */
587 static int __init netem_module_init(void)
589 return register_qdisc(&netem_qdisc_ops);
591 static void __exit netem_module_exit(void)
593 unregister_qdisc(&netem_qdisc_ops);
595 module_init(netem_module_init)
596 module_exit(netem_module_exit)
597 MODULE_LICENSE("GPL");