X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fsched%2Fsch_api.c;h=07977f8f2679b30cf93808f93fbbcce3c3dbc776;hb=f7f1b0f1e2fbadeab12d24236000e778aa9b1ead;hp=432531dca1c8e9ebdf192c3fae72df027dcb9361;hpb=5273a3df6485dc2ad6aa7ddd441b9a21970f003b;p=linux-2.6.git

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 432531dca..07977f8f2 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -34,6 +34,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
@@ -41,7 +43,6 @@
 #include 
 #include 
 #include 
-#include 
 
 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
                         struct Qdisc *old, struct Qdisc *new);
@@ -130,7 +131,7 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
    */
 
 /* Protects list of registered TC modules. It is pure SMP lock. */
-static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(qdisc_mod_lock);
 
 
 /************************************************
@@ -195,14 +196,18 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 {
         struct Qdisc *q;
 
-        for (q = dev->qdisc_list; q; q = q->next) {
-                if (q->handle == handle)
+        read_lock_bh(&qdisc_tree_lock);
+        list_for_each_entry(q, &dev->qdisc_list, list) {
+                if (q->handle == handle) {
+                        read_unlock_bh(&qdisc_tree_lock);
                         return q;
+                }
         }
+        read_unlock_bh(&qdisc_tree_lock);
         return NULL;
 }
 
-struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
+static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 {
         unsigned long cl;
         struct Qdisc *leaf;
@@ -221,15 +226,18 @@ struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 
 /* Find queueing discipline by name */
 
-struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
+static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
 {
         struct Qdisc_ops *q = NULL;
 
         if (kind) {
                 read_lock(&qdisc_mod_lock);
                 for (q = qdisc_base; q; q = q->next) {
-                        if (rtattr_strcmp(kind, q->id) == 0)
+                        if (rtattr_strcmp(kind, q->id) == 0) {
+                                if (!try_module_get(q->owner))
+                                        q = NULL;
                                 break;
+                        }
                 }
                 read_unlock(&qdisc_mod_lock);
         }
@@ -282,7 +290,7 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab)
 
 /* Allocate an unique handle from space managed by kernel */
 
-u32 qdisc_alloc_handle(struct net_device *dev)
+static u32 qdisc_alloc_handle(struct net_device *dev)
 {
         int i = 0x10000;
         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
@@ -306,9 +314,8 @@ dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
         if (dev->flags & IFF_UP)
                 dev_deactivate(dev);
 
-        write_lock(&qdisc_tree_lock);
-        spin_lock_bh(&dev->queue_lock);
-        if (qdisc && qdisc->flags&TCQ_F_INGRES) {
+        qdisc_lock_tree(dev);
+        if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
                 oqdisc = dev->qdisc_ingress;
                 /* Prune old scheduler */
                 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
@@ -334,8 +341,7 @@ dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
                 dev->qdisc = &noop_qdisc;
         }
 
-        spin_unlock_bh(&dev->queue_lock);
-        write_unlock(&qdisc_tree_lock);
+        qdisc_unlock_tree(dev);
 
         if (dev->flags & IFF_UP)
                 dev_activate(dev);
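The two hunks above fold the open-coded lock pairs into qdisc_lock_tree()/qdisc_unlock_tree(). Those helpers are not part of this diff; a minimal sketch of what they presumably look like (in net/sched/sch_generic.c), taking the same two locks in the same order the removed lines did:

    /* sketch only -- assumed definitions, not shown in this diff */
    void qdisc_lock_tree(struct net_device *dev)
    {
            write_lock(&qdisc_tree_lock);
            spin_lock_bh(&dev->queue_lock);
    }

    void qdisc_unlock_tree(struct net_device *dev)
    {
            spin_unlock_bh(&dev->queue_lock);
            write_unlock(&qdisc_tree_lock);
    }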
@@ -350,15 +356,16 @@ dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
    Old qdisc is not destroyed but returned in *old.
  */
 
-int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid,
-                struct Qdisc *new, struct Qdisc **old)
+static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
+                       u32 classid,
+                       struct Qdisc *new, struct Qdisc **old)
 {
         int err = 0;
         struct Qdisc *q = *old;
 
         if (parent == NULL) {
-                if (q && q->flags&TCQ_F_INGRES) {
+                if (q && q->flags&TCQ_F_INGRESS) {
                         *old = dev_graft_qdisc(dev, q);
                 } else {
                         *old = dev_graft_qdisc(dev, new);
@@ -372,6 +379,8 @@ int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid,
                 unsigned long cl = cops->get(parent, classid);
                 if (cl) {
                         err = cops->graft(parent, cl, new, old);
+                        if (new)
+                                new->parent = classid;
                         cops->put(parent, cl);
                 }
         }
@@ -390,16 +399,36 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
 {
         int err;
         struct rtattr *kind = tca[TCA_KIND-1];
-        struct Qdisc *sch = NULL;
+        void *p = NULL;
+        struct Qdisc *sch;
         struct Qdisc_ops *ops;
         int size;
 
         ops = qdisc_lookup_ops(kind);
 #ifdef CONFIG_KMOD
-        if (ops==NULL && tca[TCA_KIND-1] != NULL) {
-                if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
-                        request_module("sch_%s", (char*)RTA_DATA(kind));
+        if (ops == NULL && kind != NULL) {
+                char name[IFNAMSIZ];
+                if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
+                        /* We dropped the RTNL semaphore in order to
+                         * perform the module load. So, even if we
+                         * succeeded in loading the module we have to
+                         * tell the caller to replay the request. We
+                         * indicate this using -EAGAIN.
+                         * We replay the request because the device may
+                         * go away in the mean time.
+                         */
+                        rtnl_unlock();
+                        request_module("sch_%s", name);
+                        rtnl_lock();
                         ops = qdisc_lookup_ops(kind);
+                        if (ops != NULL) {
+                                /* We will try again qdisc_lookup_ops,
+                                 * so don't keep a reference.
+                                 */
+                                module_put(ops->owner);
+                                err = -EAGAIN;
+                                goto err_out;
+                        }
                 }
         }
 #endif
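The -EAGAIN produced above is a request to replay, not a failure: the RTNL semaphore was dropped around request_module(), so the whole netlink request must be re-validated from scratch (the device may have vanished in the meantime). The consuming side appears later in this patch, in tc_modify_qdisc(); condensed to its shape:

    replay:
            /* re-derive everything from the message; RTNL was dropped */
            if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
                    return -ENODEV;
            ...
            q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
            if (q == NULL) {
                    if (err == -EAGAIN)
                            goto replay;    /* module is loaded now */
                    return err;
            }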
@@ -408,37 +437,37 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
         if (ops == NULL)
                 goto err_out;
 
-        size = sizeof(*sch) + ops->priv_size;
+        /* ensure that the Qdisc and the private data are 32-byte aligned */
+        size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
+        size += ops->priv_size + QDISC_ALIGN_CONST;
 
-        sch = kmalloc(size, GFP_KERNEL);
+        p = kmalloc(size, GFP_KERNEL);
         err = -ENOBUFS;
-        if (!sch)
-                goto err_out;
-
-        /* Grrr... Resolve race condition with module unload */
-
-        err = -EINVAL;
-        if (ops != qdisc_lookup_ops(kind))
-                goto err_out;
-
-        memset(sch, 0, size);
-
+        if (!p)
+                goto err_out2;
+        memset(p, 0, size);
+        sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
+                               & ~QDISC_ALIGN_CONST);
+        sch->padded = (char *)sch - (char *)p;
+
+        INIT_LIST_HEAD(&sch->list);
         skb_queue_head_init(&sch->q);
 
         if (handle == TC_H_INGRESS)
-                sch->flags |= TCQ_F_INGRES;
+                sch->flags |= TCQ_F_INGRESS;
 
         sch->ops = ops;
         sch->enqueue = ops->enqueue;
         sch->dequeue = ops->dequeue;
         sch->dev = dev;
+        dev_hold(dev);
         atomic_set(&sch->refcnt, 1);
-        sch->stats.lock = &dev->queue_lock;
+        sch->stats_lock = &dev->queue_lock;
         if (handle == 0) {
                 handle = qdisc_alloc_handle(dev);
                 err = -ENOMEM;
                 if (handle == 0)
-                        goto err_out;
+                        goto err_out3;
         }
 
         if (handle == TC_H_INGRESS)
@@ -446,27 +475,26 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
         else
                 sch->handle = handle;
 
-        err = -EBUSY;
-        if (!try_module_get(ops->owner))
-                goto err_out;
-
         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
-                write_lock(&qdisc_tree_lock);
-                sch->next = dev->qdisc_list;
-                dev->qdisc_list = sch;
-                write_unlock(&qdisc_tree_lock);
+                qdisc_lock_tree(dev);
+                list_add_tail(&sch->list, &dev->qdisc_list);
+                qdisc_unlock_tree(dev);
+
 #ifdef CONFIG_NET_ESTIMATOR
                 if (tca[TCA_RATE-1])
-                        qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
+                        gen_new_estimator(&sch->bstats, &sch->rate_est,
+                                          sch->stats_lock, tca[TCA_RATE-1]);
 #endif
                 return sch;
         }
+err_out3:
+        dev_put(dev);
+err_out2:
         module_put(ops->owner);
-
 err_out:
         *errp = err;
-        if (sch)
-                kfree(sch);
+        if (p)
+                kfree(p);
         return NULL;
 }
@@ -482,10 +510,9 @@ static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
                         return err;
         }
 #ifdef CONFIG_NET_ESTIMATOR
-        if (tca[TCA_RATE-1]) {
-                qdisc_kill_estimator(&sch->stats);
-                qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
-        }
+        if (tca[TCA_RATE-1])
+                gen_replace_estimator(&sch->bstats, &sch->rate_est,
+                                      sch->stats_lock, tca[TCA_RATE-1]);
 #endif
         return 0;
 }
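The new allocation scheme over-allocates, then rounds the Qdisc pointer up, so both the struct and the ops-private area behind it are 32-byte aligned no matter what kmalloc() returns. A worked example, assuming QDISC_ALIGN_CONST is 31 (a 32-byte alignment mask, as the comment implies; its definition is not part of this diff):

    /* say sizeof(*sch) == 140 and kmalloc() returns p == 0x1008 */
    size  = (140 + 31) & ~31;              /* 160: struct rounded to 32   */
    size += ops->priv_size + 31;           /* slack to realign priv data  */
    sch   = (struct Qdisc *)((0x1008 + 31) & ~31);   /* sch == 0x1020     */
    sch->padded = 0x1020 - 0x1008;         /* 24 bytes of leading padding */

The recorded sch->padded offset is what lets the free path (presumably kfree((char *)sch - sch->padded) in qdisc_destroy()) recover the original kmalloc() pointer; the error path here still holds p and can kfree(p) directly.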
@@ -597,14 +624,20 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 
 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 {
-        struct tcmsg *tcm = NLMSG_DATA(n);
-        struct rtattr **tca = arg;
+        struct tcmsg *tcm;
+        struct rtattr **tca;
         struct net_device *dev;
-        u32 clid = tcm->tcm_parent;
-        struct Qdisc *q = NULL;
-        struct Qdisc *p = NULL;
+        u32 clid;
+        struct Qdisc *q, *p;
         int err;
 
+replay:
+        /* Reinit, just in case something touches this. */
+        tcm = NLMSG_DATA(n);
+        tca = arg;
+        clid = tcm->tcm_parent;
+        q = p = NULL;
+
         if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
                 return -ENODEV;
@@ -698,8 +731,11 @@ create_n_graft:
                 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
         else
                 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
-        if (q == NULL)
+        if (q == NULL) {
+                if (err == -EAGAIN)
+                        goto replay;
                 return err;
+        }
 
 graft:
         if (1) {
@@ -723,40 +759,44 @@ graft:
         return 0;
 }
 
-int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st)
-{
-        spin_lock_bh(st->lock);
-        RTA_PUT(skb, TCA_STATS, (char*)&st->lock - (char*)st, st);
-        spin_unlock_bh(st->lock);
-        return 0;
-
-rtattr_failure:
-        spin_unlock_bh(st->lock);
-        return -1;
-}
-
-
 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                          u32 pid, u32 seq, unsigned flags, int event)
 {
         struct tcmsg *tcm;
         struct nlmsghdr *nlh;
         unsigned char *b = skb->tail;
+        struct gnet_dump d;
 
         nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
         nlh->nlmsg_flags = flags;
         tcm = NLMSG_DATA(nlh);
         tcm->tcm_family = AF_UNSPEC;
-        tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
+        tcm->tcm_ifindex = q->dev->ifindex;
         tcm->tcm_parent = clid;
         tcm->tcm_handle = q->handle;
         tcm->tcm_info = atomic_read(&q->refcnt);
         RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
         if (q->ops->dump && q->ops->dump(q, skb) < 0)
                 goto rtattr_failure;
-        q->stats.qlen = q->q.qlen;
-        if (qdisc_copy_stats(skb, &q->stats))
+        q->qstats.qlen = q->q.qlen;
+
+        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
+                                         TCA_XSTATS, q->stats_lock, &d) < 0)
                 goto rtattr_failure;
+
+        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
+                goto rtattr_failure;
+
+        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
+#ifdef CONFIG_NET_ESTIMATOR
+            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
+#endif
+            gnet_stats_copy_queue(&d, &q->qstats) < 0)
+                goto rtattr_failure;
+
+        if (gnet_stats_finish_copy(&d) < 0)
+                goto rtattr_failure;
+
         nlh->nlmsg_len = skb->tail - b;
         return skb->len;
@@ -808,18 +848,21 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
                         continue;
                 if (idx > s_idx)
                         s_q_idx = 0;
-                read_lock(&qdisc_tree_lock);
-                for (q = dev->qdisc_list, q_idx = 0; q;
-                     q = q->next, q_idx++) {
-                        if (q_idx < s_q_idx)
+                read_lock_bh(&qdisc_tree_lock);
+                q_idx = 0;
+                list_for_each_entry(q, &dev->qdisc_list, list) {
+                        if (q_idx < s_q_idx) {
+                                q_idx++;
                                 continue;
-                        if (tc_fill_qdisc(skb, q, 0, NETLINK_CB(cb->skb).pid,
+                        }
+                        if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
                                           cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
-                                read_unlock(&qdisc_tree_lock);
+                                read_unlock_bh(&qdisc_tree_lock);
                                 goto done;
                         }
+                        q_idx++;
                 }
-                read_unlock(&qdisc_tree_lock);
+                read_unlock_bh(&qdisc_tree_lock);
         }
 
 done:
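tc_fill_qdisc() now brackets all statistics between gnet_stats_start_copy_compat() and gnet_stats_finish_copy(). In compat mode, as used here, the same counters come out twice: inside the new nested TCA_STATS2 attribute and in the legacy TCA_STATS/TCA_XSTATS layout, so unconverted tc binaries keep working. Distilled from the two call sites in this patch, the contract every dumper follows:

    struct gnet_dump d;

    if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
                                     TCA_XSTATS, q->stats_lock, &d) < 0)
            goto rtattr_failure;
    /* ...any number of gnet_stats_copy_basic/rate_est/queue() calls... */
    if (gnet_stats_finish_copy(&d) < 0)
            goto rtattr_failure;    /* closes both attribute layouts */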
@@ -959,18 +1002,31 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
         struct tcmsg *tcm;
         struct nlmsghdr *nlh;
         unsigned char *b = skb->tail;
+        struct gnet_dump d;
+        struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
 
         nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
         nlh->nlmsg_flags = flags;
         tcm = NLMSG_DATA(nlh);
         tcm->tcm_family = AF_UNSPEC;
-        tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
+        tcm->tcm_ifindex = q->dev->ifindex;
         tcm->tcm_parent = q->handle;
         tcm->tcm_handle = q->handle;
         tcm->tcm_info = 0;
         RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
-        if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0)
+        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
                 goto rtattr_failure;
+
+        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
+                                         TCA_XSTATS, q->stats_lock, &d) < 0)
+                goto rtattr_failure;
+
+        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
+                goto rtattr_failure;
+
+        if (gnet_stats_finish_copy(&d) < 0)
+                goto rtattr_failure;
+
         nlh->nlmsg_len = skb->tail - b;
         return skb->len;
@@ -1028,13 +1084,16 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
                 return 0;
 
         s_t = cb->args[0];
-
-        read_lock(&qdisc_tree_lock);
-        for (q=dev->qdisc_list, t=0; q; q = q->next, t++) {
-                if (t < s_t) continue;
-                if (!q->ops->cl_ops) continue;
-                if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle)
+        t = 0;
+
+        read_lock_bh(&qdisc_tree_lock);
+        list_for_each_entry(q, &dev->qdisc_list, list) {
+                if (t < s_t || !q->ops->cl_ops ||
+                    (tcm->tcm_parent &&
+                     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
+                        t++;
                         continue;
+                }
                 if (t > s_t)
                         memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
                 arg.w.fn = qdisc_class_dump;
@@ -1047,8 +1106,9 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
                 cb->args[1] = arg.w.count;
                 if (arg.w.stop)
                         break;
+                t++;
         }
-        read_unlock(&qdisc_tree_lock);
+        read_unlock_bh(&qdisc_tree_lock);
 
         cb->args[0] = t;
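The next hunk introduces tc_classify(), the shared filter-chain walker. For orientation, a hedged sketch of the expected caller side, i.e. what a classful qdisc's ->enqueue() does with the verdict; q->filter_list, struct my_class and my_default_class() are illustrative names only, and under CONFIG_NET_CLS_ACT the return may instead be an action code such as TC_ACT_SHOT, which real callers must handle before trusting res:

    struct tcf_result res;
    struct my_class *cl;

    if (tc_classify(skb, q->filter_list, &res) >= 0)
            cl = (struct my_class *)res.class;   /* a filter matched   */
    else
            cl = my_default_class(q);            /* -1: nothing matched */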
+ */ +int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + int err = 0; + u32 protocol = skb->protocol; +#ifdef CONFIG_NET_CLS_ACT + struct tcf_proto *otp = tp; +reclassify: +#endif + protocol = skb->protocol; + + for ( ; tp; tp = tp->next) { + if ((tp->protocol == protocol || + tp->protocol == __constant_htons(ETH_P_ALL)) && + (err = tp->classify(skb, tp, res)) >= 0) { +#ifdef CONFIG_NET_CLS_ACT + if ( TC_ACT_RECLASSIFY == err) { + __u32 verd = (__u32) G_TC_VERD(skb->tc_verd); + tp = otp; + + if (MAX_REC_LOOP < verd++) { + printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n", + tp->prio&0xffff, ntohs(tp->protocol)); + return TC_ACT_SHOT; + } + skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd); + goto reclassify; + } else { + if (skb->tc_verd) + skb->tc_verd = SET_TC_VERD(skb->tc_verd,0); + return err; + } +#else + + return err; +#endif + } + + } + return -1; +} + +static int psched_us_per_tick = 1; +static int psched_tick_per_us = 1; #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) @@ -1083,52 +1189,34 @@ static struct file_operations psched_fops = { }; #endif -#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY -int psched_tod_diff(int delta_sec, int bound) -{ - int delta; - - if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1) - return bound; - delta = delta_sec * 1000000; - if (delta > bound) - delta = bound; - return delta; -} -#endif - -psched_time_t psched_time_base; - -#if PSCHED_CLOCK_SOURCE == PSCHED_CPU +#ifdef CONFIG_NET_SCH_CLK_CPU psched_tdiff_t psched_clock_per_hz; int psched_clock_scale; -#endif +EXPORT_SYMBOL(psched_clock_per_hz); +EXPORT_SYMBOL(psched_clock_scale); -#ifdef PSCHED_WATCHER -PSCHED_WATCHER psched_time_mark; +psched_time_t psched_time_base; +cycles_t psched_time_mark; +EXPORT_SYMBOL(psched_time_mark); +EXPORT_SYMBOL(psched_time_base); +/* + * Periodically adjust psched_time_base to avoid overflow + * with 32-bit get_cycles(). Safe up to 4GHz CPU. + */ static void psched_tick(unsigned long); - static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0); static void psched_tick(unsigned long dummy) { -#if PSCHED_CLOCK_SOURCE == PSCHED_CPU - psched_time_t dummy_stamp; - PSCHED_GET_TIME(dummy_stamp); - /* It is OK up to 4GHz cpu */ - psched_timer.expires = jiffies + 1*HZ; -#else - unsigned long now = jiffies; - psched_time_base += ((u64)(now-psched_time_mark))<