X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fsched%2Fsch_netem.c;h=79542af9dab1fb3213778011d3222e34bd202e0c;hb=refs%2Fheads%2Fvserver;hp=d29df1cabf594ba609b354da8bf7ae2f148865c6;hpb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;p=linux-2.6.git diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index d29df1cab..79542af9d 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -4,7 +4,7 @@ * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. + * 2 of the License. * * Many of the algorithms and ideas for this came from * NIST Net which is not copyrighted. @@ -13,9 +13,8 @@ * Catalin(ux aka Dino) BOIE */ -#include #include -#include +#include #include #include #include @@ -25,6 +24,8 @@ #include +#define VERSION "1.2" + /* Network Emulation Queuing algorithm. ==================================== @@ -53,7 +54,6 @@ struct netem_sched_data { struct Qdisc *qdisc; - struct sk_buff_head delayed; struct timer_list timer; u32 latency; @@ -63,11 +63,13 @@ struct netem_sched_data { u32 gap; u32 jitter; u32 duplicate; + u32 reorder; + u32 corrupt; struct crndstate { unsigned long last; unsigned long rho; - } delay_cor, loss_cor, dup_cor; + } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; struct disttable { u32 size; @@ -137,72 +139,103 @@ static long tabledist(unsigned long mu, long sigma, return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } -/* Put skb in the private delayed queue. */ -static int delay_skb(struct Qdisc *sch, struct sk_buff *skb) -{ - struct netem_sched_data *q = qdisc_priv(sch); - struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb; - psched_time_t now; - - PSCHED_GET_TIME(now); - PSCHED_TADD2(now, tabledist(q->latency, q->jitter, - &q->delay_cor, q->delay_dist), - cb->time_to_send); - - /* Always queue at tail to keep packets in order */ - if (likely(q->delayed.qlen < q->limit)) { - __skb_queue_tail(&q->delayed, skb); - sch->q.qlen++; - sch->stats.bytes += skb->len; - sch->stats.packets++; - return NET_XMIT_SUCCESS; - } - - sch->stats.drops++; - kfree_skb(skb); - return NET_XMIT_DROP; -} - +/* + * Insert one skb into qdisc. + * Note: parent depends on return value to account for queue length. + * NET_XMIT_DROP: queue length didn't change. + * NET_XMIT_SUCCESS: one skb was queued. 
+ */
 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
+	/* We don't fill cb now as skb_unshare() may invalidate it */
+	struct netem_skb_cb *cb;
+	struct sk_buff *skb2;
+	int ret;
+	int count = 1;
 
-	pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies);
+	pr_debug("netem_enqueue skb=%p\n", skb);
+
+	/* Random duplication */
+	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
+		++count;
 
 	/* Random packet drop 0 => none, ~0 => all */
-	if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
-		pr_debug("netem_enqueue: random loss\n");
-		sch->stats.drops++;
-		return 0;	/* lie about loss so TCP doesn't know */
+	if (q->loss && q->loss >= get_crandom(&q->loss_cor))
+		--count;
+
+	if (count == 0) {
+		sch->qstats.drops++;
+		kfree_skb(skb);
+		return NET_XMIT_BYPASS;
 	}
 
-	/* Random duplication */
-	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) {
-		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+	skb_orphan(skb);
 
-		pr_debug("netem_enqueue: dup %p\n", skb2);
-		if (skb2)
-			delay_skb(sch, skb2);
+	/*
+	 * If we need to duplicate the packet, re-insert it at the top of the
+	 * qdisc tree, since the parent queuer expects that only one
+	 * skb will be queued.
+	 */
+	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
+		struct Qdisc *rootq = sch->dev->qdisc;
+		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
+		q->duplicate = 0;
+
+		rootq->enqueue(skb2, rootq);
+		q->duplicate = dupsave;
 	}
 
-	/* If doing simple delay then gap == 0 so all packets
-	 * go into the delayed holding queue
-	 * otherwise if doing out of order only "1 out of gap"
-	 * packets will be delayed.
+	/*
+	 * Randomized packet corruption.
+	 * Make a copy if needed since we are modifying the data.
+	 * If the packet is going to be hardware checksummed, then
+	 * do it now in software before we mangle it.
 	 */
-	if (q->counter < q->gap) {
-		int ret;
+	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+		if (!(skb = skb_unshare(skb, GFP_ATOMIC))
+		    || (skb->ip_summed == CHECKSUM_PARTIAL
+			&& skb_checksum_help(skb))) {
+			sch->qstats.drops++;
+			return NET_XMIT_DROP;
+		}
 
+		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
+	}
+
+	cb = (struct netem_skb_cb *)skb->cb;
+	if (q->gap == 0			/* not doing reordering */
+	    || q->counter < q->gap	/* inside last reordering gap */
+	    || q->reorder < get_crandom(&q->reorder_cor)) {
+		psched_time_t now;
+		psched_tdiff_t delay;
+
+		delay = tabledist(q->latency, q->jitter,
+				  &q->delay_cor, q->delay_dist);
+
+		PSCHED_GET_TIME(now);
+		PSCHED_TADD2(now, delay, cb->time_to_send);
 		++q->counter;
 		ret = q->qdisc->enqueue(skb, q->qdisc);
-		if (ret)
-			sch->stats.drops++;
-		return ret;
+	} else {
+		/*
+		 * Do re-ordering by putting one out of N packets at the front
+		 * of the queue.
+		 */
+		PSCHED_GET_TIME(cb->time_to_send);
+		q->counter = 0;
+		ret = q->qdisc->ops->requeue(skb, q->qdisc);
 	}
-
-	q->counter = 0;
-	return delay_skb(sch, skb);
+	if (likely(ret == NET_XMIT_SUCCESS)) {
+		sch->q.qlen++;
+		sch->bstats.bytes += skb->len;
+		sch->bstats.packets++;
+	} else
+		sch->qstats.drops++;
+
+	pr_debug("netem: enqueue ret %d\n", ret);
+	return ret;
 }
 
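Note on the random decisions above: loss, duplication, corruption and reordering all compare a u32 probability against a correlated random source (the crndstate pairs added earlier in this patch), using the convention from the "0 => none, ~0 => all" comment. get_crandom() itself is not shown in this diff, so the following userspace sketch is only a model of that contract, not the kernel code: each fresh uniform sample is blended with the previous one by a correlation factor rho, which is what makes netem's losses bursty. All names here are illustrative.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct crnd {
            uint32_t last;  /* previous sample */
            uint32_t rho;   /* correlation, as a fraction of UINT32_MAX */
    };

    /* blend a fresh uniform sample with the previous one: the higher rho,
     * the more consecutive samples cluster, so drops arrive in bursts */
    static uint32_t crnd_next(struct crnd *s)
    {
            uint32_t value = ((uint32_t)rand() << 16) ^ (uint32_t)rand();
            uint64_t rho = (uint64_t)s->rho + 1;
            uint64_t mixed = (value * ((1ULL << 32) - rho) +
                              (uint64_t)s->last * rho) >> 32;

            s->last = (uint32_t)mixed;
            return (uint32_t)mixed;
    }

    int main(void)
    {
            struct crnd loss_cor = { 0, 3221225472u };        /* rho = 75% */
            uint32_t loss = (uint32_t)(0.10 * 4294967295.0);  /* nominal 10% */
            int drops = 0, i;

            for (i = 0; i < 100000; i++)
                    if (loss >= crnd_next(&loss_cor))
                            drops++;

            printf("dropped %d of 100000 packets\n", drops);
            return 0;
    }

One consequence worth knowing when testing: with high correlation the effective loss rate drifts away from the nominal percentage, because the blended samples are no longer uniformly distributed.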
 /* Requeue packets but don't change time stamp */
@@ -211,8 +244,10 @@ static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
 	struct netem_sched_data *q = qdisc_priv(sch);
 	int ret;
 
-	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0)
+	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
 		sch->q.qlen++;
+		sch->qstats.requeues++;
+	}
 
 	return ret;
 }
@@ -220,56 +255,58 @@ static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
 static unsigned int netem_drop(struct Qdisc* sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
-	unsigned int len;
+	unsigned int len = 0;
 
-	if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
+	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
 		sch->q.qlen--;
-		sch->stats.drops++;
+		sch->qstats.drops++;
 	}
 	return len;
 }
 
-/* Dequeue packet.
- * Move all packets that are ready to send from the delay holding
- * list to the underlying qdisc, then just call dequeue
- */
 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
-	psched_time_t now;
 
-	PSCHED_GET_TIME(now);
-	while ((skb = skb_peek(&q->delayed)) != NULL) {
+	skb = q->qdisc->dequeue(q->qdisc);
+	if (skb) {
 		const struct netem_skb_cb *cb
 			= (const struct netem_skb_cb *)skb->cb;
-		long delay
-			= PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
-		pr_debug("netem_dequeue: delay queue %p@%lu %ld\n",
-			 skb, jiffies, delay);
+		psched_time_t now;
 
+		/* is more delay time remaining? */
-		if (delay > 0) {
-			mod_timer(&q->timer, jiffies + delay);
-			break;
-		}
+		PSCHED_GET_TIME(now);
 
-		__skb_unlink(skb, &q->delayed);
+		if (PSCHED_TLESS(cb->time_to_send, now)) {
+			pr_debug("netem_dequeue: return skb=%p\n", skb);
+			sch->q.qlen--;
+			sch->flags &= ~TCQ_F_THROTTLED;
+			return skb;
+		} else {
+			psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now);
+
+			if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
+				qdisc_tree_decrease_qlen(q->qdisc, 1);
+				sch->qstats.drops++;
+				printk(KERN_ERR "netem: queue discipline %s could not requeue\n",
+				       q->qdisc->ops->id);
+			}
 
-		if (q->qdisc->enqueue(skb, q->qdisc))
-			sch->stats.drops++;
+			mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay));
+			sch->flags |= TCQ_F_THROTTLED;
+		}
 	}
 
-	skb = q->qdisc->dequeue(q->qdisc);
-	if (skb)
-		sch->q.qlen--;
-	return skb;
+	return NULL;
 }
 
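Note on the dequeue path above: the child qdisc has no peek operation, so netem dequeues the head, and if that packet's departure time has not yet arrived it requeues it, arms the watchdog timer for the residual delay, and sets TCQ_F_THROTTLED so the device does not busy-poll. A minimal userspace model of that contract, with invented pkt/queue names, looks like this:

    #include <stdio.h>

    struct pkt {
            long time_to_send;      /* absolute departure time, in ticks */
            struct pkt *next;
    };

    static struct pkt *queue_head;

    /* release the head packet only once its departure time has passed;
     * otherwise leave it queued and report when to wake up again */
    static struct pkt *try_dequeue(long now, long *wakeup)
    {
            struct pkt *p = queue_head;

            if (p == NULL)
                    return NULL;

            if (p->time_to_send <= now) {
                    queue_head = p->next;   /* time reached: hand it over */
                    return p;
            }

            *wakeup = p->time_to_send;      /* too early: "arm the timer" */
            return NULL;
    }

    int main(void)
    {
            struct pkt late = { 5, NULL }, early = { 2, &late };
            long now, wakeup = 0;

            queue_head = &early;
            for (now = 0; now < 7; now++) {
                    struct pkt *p = try_dequeue(now, &wakeup);

                    if (p)
                            printf("t=%ld: sent packet due at t=%ld\n",
                                   now, p->time_to_send);
                    else if (queue_head)
                            printf("t=%ld: throttled until t=%ld\n",
                                   now, wakeup);
            }
            return 0;
    }

This only works if the head of the child queue is always the earliest departure, which is exactly what the tfifo child qdisc added later in this patch guarantees.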
 static void netem_watchdog(unsigned long arg)
 {
 	struct Qdisc *sch = (struct Qdisc *)arg;
 
-	pr_debug("netem_watchdog: fired @%lu\n", jiffies);
+	pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen);
+	sch->flags &= ~TCQ_F_THROTTLED;
 	netif_schedule(sch->dev);
 }
 
@@ -278,17 +315,21 @@ static void netem_reset(struct Qdisc *sch)
 	struct netem_sched_data *q = qdisc_priv(sch);
 
 	qdisc_reset(q->qdisc);
-	skb_queue_purge(&q->delayed);
-
 	sch->q.qlen = 0;
+	sch->flags &= ~TCQ_F_THROTTLED;
 	del_timer_sync(&q->timer);
 }
 
+/* Pass size change message down to embedded FIFO */
 static int set_fifo_limit(struct Qdisc *q, int limit)
 {
 	struct rtattr *rta;
 	int ret = -ENOMEM;
 
+	/* Hack to avoid sending change message to non-FIFO */
+	if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
+		return 0;
+
 	rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
 	if (rta) {
 		rta->rta_type = RTM_NEWQDISC;
@@ -346,6 +387,33 @@ static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
 	return 0;
 }
 
+static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	const struct tc_netem_reorder *r = RTA_DATA(attr);
+
+	if (RTA_PAYLOAD(attr) != sizeof(*r))
+		return -EINVAL;
+
+	q->reorder = r->probability;
+	init_crandom(&q->reorder_cor, r->correlation);
+	return 0;
+}
+
+static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	const struct tc_netem_corrupt *r = RTA_DATA(attr);
+
+	if (RTA_PAYLOAD(attr) != sizeof(*r))
+		return -EINVAL;
+
+	q->corrupt = r->probability;
+	init_crandom(&q->corrupt_cor, r->correlation);
+	return 0;
+}
+
+/* Parse netlink message to set options */
 static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
@@ -366,9 +434,15 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 	q->jitter = qopt->jitter;
 	q->limit = qopt->limit;
 	q->gap = qopt->gap;
+	q->counter = 0;
 	q->loss = qopt->loss;
 	q->duplicate = qopt->duplicate;
 
+	/* for compatibility with earlier versions:
+	 * if gap is set, need to assume 100% probability
+	 */
+	q->reorder = ~0;
+
 	/* Handle nested options after initial queue options.
 	 * Should have put all options in nested format but too late now.
 	 */
@@ -390,12 +464,101 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 		if (ret)
 			return ret;
 	}
+
+	if (tb[TCA_NETEM_REORDER-1]) {
+		ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
+		if (ret)
+			return ret;
+	}
+
+	if (tb[TCA_NETEM_CORRUPT-1]) {
+		ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]);
+		if (ret)
+			return ret;
+	}
 	}
 
 	return 0;
 }
 
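Note on the option parsing above: the probability and correlation fields carried by tc_netem_corr, tc_netem_reorder and tc_netem_corrupt use the same fixed-point encoding as q->loss, namely a u32 fraction of ~0, matching the "0 => none, ~0 => all" convention in netem_enqueue(). A userspace tool such as tc would convert percentages roughly as below; pct2prob() is an invented helper for illustration, not tc's actual code.

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t pct2prob(double percent)
    {
            if (percent <= 0.0)
                    return 0;               /* 0 => never */
            if (percent >= 100.0)
                    return UINT32_MAX;      /* ~0 => always */
            return (uint32_t)(percent / 100.0 * UINT32_MAX);
    }

    int main(void)
    {
            /* e.g. "reorder 25% 50%": 25% probability, 50% correlation */
            printf("probability=%u correlation=%u\n",
                   pct2prob(25.0), pct2prob(50.0));
            return 0;
    }

Also note the ordering: q->reorder = ~0 is assigned before the nested attributes are parsed, so a configuration that only sets gap keeps the old "every Nth packet is reordered" behavior, while an explicit TCA_NETEM_REORDER attribute overwrites the default.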
+/*
+ * Special case version of FIFO queue for use by netem.
+ * It queues in order based on timestamps in skbs.
+ */
+struct fifo_sched_data {
+	u32 limit;
+};
+
+static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+{
+	struct fifo_sched_data *q = qdisc_priv(sch);
+	struct sk_buff_head *list = &sch->q;
+	const struct netem_skb_cb *ncb
+		= (const struct netem_skb_cb *)nskb->cb;
+	struct sk_buff *skb;
+
+	if (likely(skb_queue_len(list) < q->limit)) {
+		skb_queue_reverse_walk(list, skb) {
+			const struct netem_skb_cb *cb
+				= (const struct netem_skb_cb *)skb->cb;
+
+			if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send))
+				break;
+		}
+
+		__skb_queue_after(list, skb, nskb);
+
+		sch->qstats.backlog += nskb->len;
+		sch->bstats.bytes += nskb->len;
+		sch->bstats.packets++;
+
+		return NET_XMIT_SUCCESS;
+	}
+
+	return qdisc_drop(nskb, sch);
+}
+
+static int tfifo_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct fifo_sched_data *q = qdisc_priv(sch);
+
+	if (opt) {
+		struct tc_fifo_qopt *ctl = RTA_DATA(opt);
+		if (RTA_PAYLOAD(opt) < sizeof(*ctl))
+			return -EINVAL;
+
+		q->limit = ctl->limit;
+	} else
+		q->limit = max_t(u32, sch->dev->tx_queue_len, 1);
+
+	return 0;
+}
+
+static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct fifo_sched_data *q = qdisc_priv(sch);
+	struct tc_fifo_qopt opt = { .limit = q->limit };
+
+	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	return skb->len;
+
+rtattr_failure:
+	return -1;
+}
+
+static struct Qdisc_ops tfifo_qdisc_ops = {
+	.id		=	"tfifo",
+	.priv_size	=	sizeof(struct fifo_sched_data),
+	.enqueue	=	tfifo_enqueue,
+	.dequeue	=	qdisc_dequeue_head,
+	.requeue	=	qdisc_requeue,
+	.drop		=	qdisc_queue_drop,
+	.init		=	tfifo_init,
+	.reset		=	qdisc_reset_queue,
+	.change		=	tfifo_init,
+	.dump		=	tfifo_dump,
+};
+
 static int netem_init(struct Qdisc *sch, struct rtattr *opt)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
@@ -404,13 +567,12 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt)
 	if (!opt)
 		return -EINVAL;
 
-	skb_queue_head_init(&q->delayed);
 	init_timer(&q->timer);
 	q->timer.function = netem_watchdog;
 	q->timer.data = (unsigned long) sch;
 
-	q->counter = 0;
-	q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
+	q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops,
+				     TC_H_MAKE(sch->handle, 1));
 	if (!q->qdisc) {
 		pr_debug("netem: qdisc create failed\n");
 		return -ENOMEM;
 	}
@@ -440,6 +602,8 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct rtattr *rta = (struct rtattr *) b;
 	struct tc_netem_qopt qopt;
 	struct tc_netem_corr cor;
+	struct tc_netem_reorder reorder;
+	struct tc_netem_corrupt corrupt;
 
 	qopt.latency = q->latency;
 	qopt.jitter = q->jitter;
@@ -453,6 +617,15 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	cor.loss_corr = q->loss_cor.rho;
 	cor.dup_corr = q->dup_cor.rho;
 	RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
+
+	reorder.probability = q->reorder;
+	reorder.correlation = q->reorder_cor.rho;
+	RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
+
+	corrupt.probability = q->corrupt;
+	corrupt.correlation = q->corrupt_cor.rho;
+	RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
+
 	rta->rta_len = skb->tail - b;
 
 	return skb->len;
@@ -486,8 +659,8 @@ static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 
 	sch_tree_lock(sch);
 	*old = xchg(&q->qdisc, new);
+	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
 	qdisc_reset(*old);
-	sch->q.qlen = 0;
 	sch_tree_unlock(sch);
 
 	return 0;
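Note on tfifo_enqueue() above: the queue is kept sorted by cb->time_to_send, and the scan starts from the tail because most packets arrive with monotonically increasing timestamps and therefore belong at the end; only packets tagged for reordering travel toward the front. The sketch below reproduces that insertion rule on a plain circular doubly-linked list with a sentinel head, the same shape as sk_buff_head; node and tfifo_insert are illustrative names.

    #include <stdio.h>

    struct node {
            long tstamp;
            struct node *prev, *next;
    };

    /* walk backwards from the tail: in-order arrivals match on the
     * first comparison, making the common case O(1) */
    static void tfifo_insert(struct node *head, struct node *n)
    {
            struct node *p;

            for (p = head->prev; p != head; p = p->prev)
                    if (p->tstamp <= n->tstamp)
                            break;

            /* insert n right after p */
            n->prev = p;
            n->next = p->next;
            p->next->prev = n;
            p->next = n;
    }

    int main(void)
    {
            struct node head = { 0, &head, &head };
            struct node a = { 10 }, b = { 30 }, c = { 20 };
            struct node *p;

            tfifo_insert(&head, &a);
            tfifo_insert(&head, &b);
            tfifo_insert(&head, &c);        /* lands between a and b */

            for (p = head.next; p != &head; p = p->next)
                    printf("%ld ", p->tstamp);
            printf("\n");                   /* prints: 10 20 30 */
            return 0;
    }

The break condition (existing timestamp <= new timestamp) mirrors !PSCHED_TLESS(ncb->time_to_send, cb->time_to_send) in the kernel code, so packets with equal timestamps keep their arrival order.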
@@ -567,6 +740,7 @@ static struct Qdisc_ops netem_qdisc_ops = {
 
 static int __init netem_module_init(void)
 {
+	pr_info("netem: version " VERSION "\n");
 	return register_qdisc(&netem_qdisc_ops);
 }
 static void __exit netem_module_exit(void)
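For reference, the delay that feeds cb->time_to_send comes from tabledist(q->latency, q->jitter, ...) near the top of this file: mu plus a jitter term shaped by an optional distribution table of NETEM_DIST_SCALE-scaled samples. The userspace sketch below shows the idea with an invented toy table, and it omits the uniform interpolation term x of the real function.

    #include <stdio.h>
    #include <stdlib.h>

    #define NETEM_DIST_SCALE 8192

    /* toy 8-entry "distribution"; a real netem table has thousands of
     * entries generated from measured delay data */
    static const short dist[] = { -3277, -1638, -819, 0, 0, 819, 1638, 3277 };

    /* pick a table entry at random and scale it by sigma */
    static long tabledist(long mu, long sigma)
    {
            long t;

            if (sigma == 0)
                    return mu;

            t = dist[rand() % (int)(sizeof(dist) / sizeof(dist[0]))];
            return mu + sigma * t / NETEM_DIST_SCALE;
    }

    int main(void)
    {
            int i;

            /* 100ms +/- up to ~40ms, expressed in microseconds */
            for (i = 0; i < 8; i++)
                    printf("%ld ", tabledist(100000, 100000));
            printf("\n");
            return 0;
    }

With no table loaded, the kernel falls back to a uniform distribution over +/- sigma; the table mechanism exists so that measured, non-uniform delay distributions can be replayed.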