/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */
#include <linux/config.h>
#include <linux/module.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>

#include <net/pkt_sched.h>

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	----------------------------------------------------------------

	This started out as a simple way to delay outgoing packets to
	test TCP but has grown to include most of the functionality
	of a full-blown network emulator like NISTnet. It can delay
	packets and add random jitter (and correlation). The random
	distribution can also be loaded from a table to provide
	normal, Pareto, or experimental curves. Packet loss,
	duplication, and reordering can also be emulated.

	This qdisc does not do classification; that can be handled by
	layering other disciplines. It does not need to do bandwidth
	control either, since that can be handled by using token
	bucket or other rate control.

	The emulator is limited by the Linux timer resolution
	and will create packet bursts on the HZ boundary (1ms).
*/
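
/* Example of driving this qdisc from userspace with the iproute2 "tc"
 * tool (exact syntax depends on the tc build paired with this module):
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *
 * adds 100ms of delay with +/-10ms of uniform jitter, each delay value
 * being 25% correlated with the previous one.
 */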

struct netem_sched_data {
	struct Qdisc	*qdisc;		/* inner (default pfifo) qdisc */
	struct sk_buff_head delayed;	/* packets waiting for their send time */
	struct timer_list timer;	/* fires when head of 'delayed' is due */

	u32 latency, jitter;		/* base delay and jitter */
	u32 limit, counter, gap;	/* queue limit, reorder state */
	u32 loss, duplicate;		/* probabilities scaled to 2^32 */

	/* correlated random number generator state */
	struct crndstate {
		unsigned long last;
		unsigned long rho;
	} delay_cor, loss_cor, dup_cor;

	/* optional delay distribution table */
	struct disttable {
		u32 size;
		s16 table[0];
	} *delay_dist;
};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static unsigned long get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
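
/* In fixed point, get_crandom() computes
 *
 *	answer = ((2^32 - rho) * U + rho * last) / 2^32
 *
 * a blend of fresh entropy U and the previous output, so rho/2^32 is
 * (approximately) the correlation coefficient: rho == 0 gives pure
 * noise, rho near 2^32 nearly repeats the last value.
 */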

/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static long tabledist(unsigned long mu, long sigma,
		      struct crndstate *state, const struct disttable *dist)
{
	long t, x;
	unsigned long rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
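
/* The table is expected to hold samples of the inverse CDF scaled by
 * NETEM_DIST_SCALE, so the result above is mu + sigma * t / NETEM_DIST_SCALE
 * evaluated as quotient plus remainder to avoid overflow, with the
 * +/- NETEM_DIST_SCALE/2 step rounding the remainder term to nearest.
 */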

/* Put skb in the private delayed queue. */
static int delay_skb(struct Qdisc *sch, struct sk_buff *skb)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
	psched_time_t now;

	PSCHED_GET_TIME(now);
	PSCHED_TADD2(now, tabledist(q->latency, q->jitter,
				    &q->delay_cor, q->delay_dist),
		     cb->time_to_send);

	/* Always queue at tail to keep packets in order */
	if (likely(q->delayed.qlen < q->limit)) {
		__skb_queue_tail(&q->delayed, skb);
		sch->q.qlen++;
		sch->stats.bytes += skb->len;
		sch->stats.packets++;
		return NET_XMIT_SUCCESS;
	}

	sch->stats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies);

	/* Random packet drop 0 => none, ~0 => all */
	if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
		pr_debug("netem_enqueue: random loss\n");
		sch->stats.drops++;
		kfree_skb(skb);
		return 0;	/* lie about loss so TCP doesn't know */
	}

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) {
		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

		pr_debug("netem_enqueue: dup %p\n", skb2);
		if (skb2)
			delay_skb(sch, skb2);
	}
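
	/* Note: by the "0 => none, ~0 => all" convention above, q->loss
	 * and q->duplicate are probabilities scaled to the full 32-bit
	 * range of get_crandom(); e.g. a 1% rate is roughly 0.01 * 2^32.
	 */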

	/* If doing simple delay then gap == 0, so all packets
	 * go into the delayed holding queue.
	 * Otherwise, if doing out-of-order, only "1 out of gap"
	 * packets will be delayed.
	 */
	if (q->counter < q->gap) {
		int ret;

		++q->counter;
		ret = q->qdisc->enqueue(skb, q->qdisc);
		if (likely(ret == NET_XMIT_SUCCESS)) {
			sch->q.qlen++;
			sch->stats.bytes += skb->len;
			sch->stats.packets++;
		} else
			sch->stats.drops++;
		return ret;
	}

	q->counter = 0;
	return delay_skb(sch, skb);
}
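
/* Example of the gap mechanism above: with "gap 5" and a 10ms delay,
 * most packets go straight to the inner qdisc and roughly one in five
 * is held for 10ms, so the held packet arrives behind packets sent
 * after it, i.e. reordered.
 */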

/* Requeue packets but don't change time stamp */
static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0)
		sch->q.qlen++;

	return ret;
}

static unsigned int netem_drop(struct Qdisc* sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->stats.drops++;
	}
	return len;
}

/* Dequeue a packet.
 * Move all packets that are ready to send from the delay holding
 * list to the underlying qdisc, then just call dequeue.
 */
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	psched_time_t now;

	PSCHED_GET_TIME(now);
	while ((skb = skb_peek(&q->delayed)) != NULL) {
		const struct netem_skb_cb *cb
			= (const struct netem_skb_cb *)skb->cb;
		long delay
			= PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));

		pr_debug("netem_dequeue: delay queue %p@%lu %ld\n",
			 skb, jiffies, delay);

		/* is there more time remaining? */
		if (delay > 0) {
			mod_timer(&q->timer, jiffies + delay);
			break;
		}

		__skb_unlink(skb, &q->delayed);

		if (q->qdisc->enqueue(skb, q->qdisc))
			sch->q.qlen--;
	}

	skb = q->qdisc->dequeue(q->qdisc);
	if (skb)
		sch->q.qlen--;
	return skb;
}
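
/* When the head of the delay queue is not yet due, netem_dequeue()
 * arms q->timer and falls through to the inner qdisc; the watchdog
 * below then reschedules the device so dequeue runs again once the
 * delay has expired.
 */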
static void netem_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;

	pr_debug("netem_watchdog: fired @%lu\n", jiffies);
	netif_schedule(sch->dev);
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	skb_queue_purge(&q->delayed);

	sch->q.qlen = 0;
	del_timer_sync(&q->timer);
}

static int set_fifo_limit(struct Qdisc *q, int limit)
{
	struct rtattr *rta;
	int ret = -ENOMEM;

	rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
	if (rta) {
		rta->rta_type = RTM_NEWQDISC;
		rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
		((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

		ret = q->ops->change(q, rta);
		kfree(rta);
	}
	return ret;
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
	const __s16 *data = RTA_DATA(attr);
	struct disttable *d;
	int i;

	d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	/* swap in the new table under the queue lock, free the old one */
	spin_lock_bh(&sch->dev->queue_lock);
	d = xchg(&q->delay_dist, d);
	spin_unlock_bh(&sch->dev->queue_lock);

	kfree(d);
	return 0;
}
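
/* The tables themselves are generated offline; the iproute2 sources
 * ship a maketable utility and ready-made tables (e.g. normal, pareto)
 * in the inverse-CDF form that tabledist() consumes.
 */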

static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = RTA_DATA(attr);

	if (RTA_PAYLOAD(attr) != sizeof(*c))
		return -EINVAL;

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
	return 0;
}

static int netem_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
		return -EINVAL;

	qopt = RTA_DATA(opt);
	ret = set_fifo_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_debug("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* Handle nested options after the initial queue options.
	 * All options should have been put in nested format,
	 * but it is too late to change now.
	 */
	if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
		struct rtattr *tb[TCA_NETEM_MAX];

		if (rtattr_parse(tb, TCA_NETEM_MAX,
				 RTA_DATA(opt) + sizeof(*qopt),
				 RTA_PAYLOAD(opt) - sizeof(*qopt)))
			return -EINVAL;

		if (tb[TCA_NETEM_CORR-1]) {
			ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
			if (ret)
				return ret;
		}

		if (tb[TCA_NETEM_DELAY_DIST-1]) {
			ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
			if (ret)
				return ret;
		}
	}

	return 0;
}
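
/* Resulting netlink message layout (fixed struct first, then TLVs):
 *
 *	[ struct tc_netem_qopt ]
 *	[ TCA_NETEM_CORR       -> struct tc_netem_corr ]  (optional)
 *	[ TCA_NETEM_DELAY_DIST -> array of __s16       ]  (optional)
 */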

static int netem_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	skb_queue_head_init(&q->delayed);
	init_timer(&q->timer);
	q->timer.function = netem_watchdog;
	q->timer.data = (unsigned long) sch;
	q->counter = 0;

	q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
	if (!q->qdisc) {
		pr_debug("netem: qdisc create failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_debug("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	del_timer_sync(&q->timer);
	qdisc_destroy(q->qdisc);
	kfree(q->delay_dist);
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb->tail;
	struct rtattr *rta = (struct rtattr *) b;
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_reset(*old);
	sch->q.qlen = 0;
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			      struct rtattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int netem_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}

static struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.change		=	netem_change_class,
	.delete		=	netem_delete,
	.walk		=	netem_walk,
	.tcf_chain	=	netem_find_tcf,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.requeue	=	netem_requeue,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};

static int __init netem_module_init(void)
{
	return register_qdisc(&netem_qdisc_ops);
}

static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}

module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");