2 * net/sched/sch_api.c Packet scheduler API.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
18 #include <linux/config.h>
19 #include <linux/module.h>
20 #include <linux/types.h>
21 #include <linux/kernel.h>
22 #include <linux/sched.h>
23 #include <linux/string.h>
25 #include <linux/socket.h>
26 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/interrupt.h>
30 #include <linux/netdevice.h>
31 #include <linux/skbuff.h>
32 #include <linux/rtnetlink.h>
33 #include <linux/init.h>
34 #include <linux/proc_fs.h>
35 #include <linux/seq_file.h>
36 #include <linux/kmod.h>
37 #include <linux/list.h>
38 #include <linux/bitops.h>
41 #include <net/pkt_sched.h>
43 #include <asm/processor.h>
44 #include <asm/uaccess.h>
45 #include <asm/system.h>
47 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
48 struct Qdisc *old, struct Qdisc *new);
49 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
50 struct Qdisc *q, unsigned long cl, int event);
57 This file consists of two interrelated parts:
59 1. queueing disciplines manager frontend.
60 2. traffic classes manager frontend.
62 Generally, queueing discipline ("qdisc") is a black box,
63 which is able to enqueue packets and to dequeue them (when
64 device is ready to send something) in order and at times
65 determined by algorithm hidden in it.
67 qdiscs are divided into two categories:
68 - "queues", which have no internal structure visible from outside.
69 - "schedulers", which split all the packets to "traffic classes",
70 using "packet classifiers" (look at cls_api.c)
72 In turn, classes may have child qdiscs (as a rule, queues)
73 attached to them etc. etc. etc.
75 The goal of the routines in this file is to translate
76 information supplied by user in the form of handles
77 into a form more intelligible to the kernel, to make some sanity
78 checks and part of work, which is common to all qdiscs
79 and to provide rtnetlink notifications.
81 All real intelligent work is done inside qdisc modules.
85 Every discipline has two major routines: enqueue and dequeue.
89 dequeue usually returns a skb to send. It is allowed to return NULL,
90 but it does not mean that queue is empty, it just means that
91 discipline does not want to send anything this time.
92 Queue is really empty if q->q.qlen == 0.
93 For complicated disciplines with multiple queues q->q is not
94 real packet queue, but however q->q.qlen must be valid.
98 enqueue returns 0, if packet was enqueued successfully.
99 If packet (this one or another one) was dropped, it returns
101 NET_XMIT_DROP - this packet dropped
102 Expected action: do not backoff, but wait until queue will clear.
103 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
104 Expected action: backoff or ignore
105 NET_XMIT_POLICED - dropped by police.
106 Expected action: backoff or error to real-time apps.
112 requeues once dequeued packet. It is used for non-standard or
113 just buggy devices, which can defer output even if dev->tbusy=0.
117 returns qdisc to initial state: purge all buffers, clear all
118 timers, counters (except for statistics) etc.
122 initializes newly created qdisc.
126 destroys resources allocated by init and during lifetime of qdisc.
130 changes qdisc parameters.
133 /* Protects list of registered TC modules. It is pure SMP lock. */
134 static DEFINE_RWLOCK(qdisc_mod_lock);
137 /************************************************
138 * Queueing disciplines manipulation. *
139 ************************************************/
142 /* The list of all installed queueing disciplines. */
144 static struct Qdisc_ops *qdisc_base;
146 /* Register/unregister queueing discipline */
/*
 * register_qdisc - add a qdisc_ops to the global list of TC modules.
 * Scans qdisc_base under qdisc_mod_lock for a duplicate id, then fills
 * in noop defaults for missing callbacks.
 * NOTE(review): this excerpt is elided — the duplicate-id error path,
 * the list insertion and the return are not visible here.
 */
148 int register_qdisc(struct Qdisc_ops *qops)
150 struct Qdisc_ops *q, **qp;
/* Serialize against concurrent (un)registration. */
153 write_lock(&qdisc_mod_lock);
/* Reject an ops whose id is already registered. */
154 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
155 if (!strcmp(qops->id, q->id))
/* Default any unset callbacks to the no-op qdisc implementations. */
158 if (qops->enqueue == NULL)
159 qops->enqueue = noop_qdisc_ops.enqueue;
160 if (qops->requeue == NULL)
161 qops->requeue = noop_qdisc_ops.requeue;
162 if (qops->dequeue == NULL)
163 qops->dequeue = noop_qdisc_ops.dequeue;
169 write_unlock(&qdisc_mod_lock);
/*
 * unregister_qdisc - remove a qdisc_ops from the global list.
 * Walks qdisc_base under qdisc_mod_lock looking for the entry.
 * NOTE(review): the unlink and return logic is elided in this excerpt.
 */
173 int unregister_qdisc(struct Qdisc_ops *qops)
175 struct Qdisc_ops *q, **qp;
178 write_lock(&qdisc_mod_lock);
179 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
187 write_unlock(&qdisc_mod_lock);
191 /* We know handle. Find qdisc among all qdisc's attached to device
192 (root qdisc, all its children, children of children etc.)
/*
 * qdisc_lookup - find a qdisc by handle on a device.
 * Walks dev->qdisc_list under qdisc_tree_lock; returns the matching
 * qdisc (return statements elided in this excerpt) or, presumably,
 * NULL when no handle matches.
 */
195 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
199 read_lock_bh(&qdisc_tree_lock);
200 list_for_each_entry(q, &dev->qdisc_list, list) {
201 if (q->handle == handle) {
/* Drop the lock before returning the match. */
202 read_unlock_bh(&qdisc_tree_lock);
206 read_unlock_bh(&qdisc_tree_lock);
/*
 * qdisc_leaf - resolve classid inside parent qdisc p to its leaf qdisc.
 * Uses the parent's class ops: get() the class, then leaf().
 * NOTE(review): the NULL-checks and the put()/return path are elided
 * from this excerpt.
 */
210 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
214 struct Qdisc_class_ops *cops = p->ops->cl_ops;
218 cl = cops->get(p, classid);
222 leaf = cops->leaf(p, cl);
227 /* Find queueing discipline by name */
/*
 * qdisc_lookup_ops - find registered Qdisc_ops by the TCA_KIND name.
 * On a match, takes a module reference via try_module_get before the
 * (elided) return, so the caller owns a module ref on success.
 */
229 static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
231 struct Qdisc_ops *q = NULL;
234 read_lock(&qdisc_mod_lock);
235 for (q = qdisc_base; q; q = q->next) {
236 if (rtattr_strcmp(kind, q->id) == 0) {
/* Pin the owning module; failure path is elided in this excerpt. */
237 if (!try_module_get(q->owner))
242 read_unlock(&qdisc_mod_lock);
247 static struct qdisc_rate_table *qdisc_rtab_list;
/*
 * qdisc_get_rtab - get a (possibly shared) rate table for ratespec r.
 * First tries to reuse an existing entry with an identical tc_ratespec;
 * otherwise validates the netlink attribute (payload must be exactly
 * 1024 bytes) and allocates a new table, pushing it onto the list head.
 * NOTE(review): refcount handling and the kmalloc NULL-check are elided
 * in this excerpt — confirm against the full source.
 */
249 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
251 struct qdisc_rate_table *rtab;
/* Reuse an existing table whose ratespec matches byte-for-byte. */
253 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
254 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
/* Sanity-check the user-supplied attribute before allocating. */
260 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
263 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
267 memcpy(rtab->data, RTA_DATA(tab), 1024);
268 rtab->next = qdisc_rtab_list;
269 qdisc_rtab_list = rtab;
/*
 * qdisc_put_rtab - drop a reference on a rate table; when the refcount
 * hits zero, unlink it from qdisc_rtab_list (unlink/free elided here).
 * free(NULL)-style: a NULL tab is a no-op.
 */
274 void qdisc_put_rtab(struct qdisc_rate_table *tab)
276 struct qdisc_rate_table *rtab, **rtabp;
/* Nothing to do unless this was the last reference. */
278 if (!tab || --tab->refcnt)
281 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
291 /* Allocate a unique handle from the space managed by the kernel */
/*
 * qdisc_alloc_handle - pick an unused automatic handle (major 0x8000+).
 * Increments a static counter, wrapping back to 0x8000:0000 before it
 * reaches TC_H_ROOT, and retries while the handle is already in use.
 * Returns 0 when the retry budget i (declared on an elided line) is
 * exhausted.
 */
293 static u32 qdisc_alloc_handle(struct net_device *dev)
296 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
/* Step the major number; skip the reserved TC_H_ROOT value. */
299 autohandle += TC_H_MAKE(0x10000U, 0);
300 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
301 autohandle = TC_H_MAKE(0x80000000U, 0);
302 } while (qdisc_lookup(dev, autohandle) && --i > 0);
304 return i>0 ? autohandle : 0;
307 /* Attach toplevel qdisc to device dev */
/*
 * dev_graft_qdisc - attach a toplevel qdisc to a device; returns the
 * previous qdisc.  The device is deactivated first if it is up (the
 * deactivate/activate calls sit on elided lines next to the IFF_UP
 * checks).  Ingress qdiscs go to dev->qdisc_ingress, everything else
 * replaces dev->qdisc_sleeping.
 */
309 static struct Qdisc *
310 dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
312 struct Qdisc *oqdisc;
314 if (dev->flags & IFF_UP)
317 qdisc_lock_tree(dev);
/* Ingress path: swap dev->qdisc_ingress. */
318 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
319 oqdisc = dev->qdisc_ingress;
320 /* Prune old scheduler */
/* Only destroy the old qdisc when we hold the last reference. */
321 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
324 dev->qdisc_ingress = NULL;
326 dev->qdisc_ingress = qdisc;
/* Egress path: swap dev->qdisc_sleeping. */
331 oqdisc = dev->qdisc_sleeping;
333 /* Prune old scheduler */
334 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
337 /* ... and graft new one */
/* Park the running qdisc on noop until the device is reactivated. */
340 dev->qdisc_sleeping = qdisc;
341 dev->qdisc = &noop_qdisc;
344 qdisc_unlock_tree(dev);
346 if (dev->flags & IFF_UP)
353 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
356 Old qdisc is not destroyed but returned in *old.
/*
 * qdisc_graft - graft qdisc "new" under "parent" (or as device root
 * when parent is NULL); the displaced qdisc is returned in *old, not
 * destroyed.  For a classful parent, delegates to the class ops'
 * graft() callback.  Error paths are elided in this excerpt.
 */
359 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
361 struct Qdisc *new, struct Qdisc **old)
364 struct Qdisc *q = *old;
/* No parent: replace the device's toplevel (root or ingress) qdisc. */
367 if (parent == NULL) {
368 if (q && q->flags&TCQ_F_INGRESS) {
369 *old = dev_graft_qdisc(dev, q);
371 *old = dev_graft_qdisc(dev, new);
/* Classful parent: resolve the class and graft through its ops. */
374 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
379 unsigned long cl = cops->get(parent, classid);
381 err = cops->graft(parent, cl, new, old);
383 new->parent = classid;
384 cops->put(parent, cl);
392 Allocate and initialize new qdisc.
394 Parameters are passed via opt.
/*
 * qdisc_create - allocate and initialize a new qdisc from netlink
 * attributes.  Looks up the ops by TCA_KIND (autoloading "sch_<name>"
 * via request_module if needed), allocates an aligned Qdisc + private
 * area, assigns/allocates the handle, runs ops->init, links the qdisc
 * into dev->qdisc_list and optionally attaches a rate estimator.
 * Errors are reported through *errp; most error paths are elided in
 * this excerpt.
 */
397 static struct Qdisc *
398 qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
401 struct rtattr *kind = tca[TCA_KIND-1];
404 struct Qdisc_ops *ops;
407 ops = qdisc_lookup_ops(kind);
/* Unknown kind: try to autoload the module, then look up again. */
409 if (ops==NULL && tca[TCA_KIND-1] != NULL) {
411 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
412 request_module("sch_%s", name);
413 ops = qdisc_lookup_ops(kind);
422 /* ensure that the Qdisc and the private data are 32-byte aligned */
423 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
424 size += ops->priv_size + QDISC_ALIGN_CONST;
426 p = kmalloc(size, GFP_KERNEL);
/* Round the raw pointer up; remember the offset for later free. */
431 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
432 & ~QDISC_ALIGN_CONST);
433 sch->padded = (char *)sch - (char *)p;
435 INIT_LIST_HEAD(&sch->list);
436 skb_queue_head_init(&sch->q);
438 if (handle == TC_H_INGRESS)
439 sch->flags |= TCQ_F_INGRESS;
442 sch->enqueue = ops->enqueue;
443 sch->dequeue = ops->dequeue;
446 atomic_set(&sch->refcnt, 1);
447 sch->stats_lock = &dev->queue_lock;
/* No handle supplied: allocate one from the kernel pool. */
449 handle = qdisc_alloc_handle(dev);
455 if (handle == TC_H_INGRESS)
456 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
458 sch->handle = handle;
/* Qdiscs without an init callback are accepted as-is. */
460 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
461 qdisc_lock_tree(dev);
462 list_add_tail(&sch->list, &dev->qdisc_list);
463 qdisc_unlock_tree(dev);
465 #ifdef CONFIG_NET_ESTIMATOR
467 gen_new_estimator(&sch->bstats, &sch->rate_est,
468 sch->stats_lock, tca[TCA_RATE-1]);
/* Failure path: release the module reference taken by lookup_ops. */
475 module_put(ops->owner);
/*
 * qdisc_change - apply new parameters to an existing qdisc via its
 * ops->change callback (qdiscs without one cannot be changed), then
 * refresh the rate estimator when TCA_RATE is present.
 */
483 static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
485 if (tca[TCA_OPTIONS-1]) {
/* -EINVAL-style rejection presumably follows on an elided line. */
488 if (sch->ops->change == NULL)
490 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
494 #ifdef CONFIG_NET_ESTIMATOR
496 gen_replace_estimator(&sch->bstats, &sch->rate_est,
497 sch->stats_lock, tca[TCA_RATE-1]);
502 struct check_loop_arg
504 struct qdisc_walker w;
509 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
/*
 * check_loop - detect whether grafting q under p would create a cycle.
 * Walks all classes of q with check_loop_fn; the walker sets w.stop
 * when it finds p (or exceeds the depth bound), which maps to -ELOOP.
 * Classless qdiscs cannot contain p, so they pass trivially.
 */
511 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
513 struct check_loop_arg arg;
515 if (q->ops->cl_ops == NULL)
518 arg.w.stop = arg.w.skip = arg.w.count = 0;
519 arg.w.fn = check_loop_fn;
522 q->ops->cl_ops->walk(q, &arg.w);
523 return arg.w.stop ? -ELOOP : 0;
/*
 * check_loop_fn - per-class walker callback for check_loop.
 * Recurses into each class's leaf qdisc; stops when the leaf is the
 * qdisc being grafted (arg->p) or the depth bound of 7 is exceeded.
 */
527 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
530 struct Qdisc_class_ops *cops = q->ops->cl_ops;
531 struct check_loop_arg *arg = (struct check_loop_arg *)w;
533 leaf = cops->leaf(q, cl);
535 if (leaf == arg->p || arg->depth > 7)
537 return check_loop(leaf, arg->p, arg->depth + 1);
/*
 * tc_get_qdisc - RTM_DELQDISC / RTM_GETQDISC handler.
 * Resolves the target qdisc from tcm_parent (root, ingress, or a
 * class's leaf) or from tcm_handle, checks the optional TCA_KIND
 * against the qdisc's ops->id, then either ungrafts+destroys it
 * (DELQDISC) or replies with a notification (GETQDISC).  Several
 * error-return lines are elided in this excerpt.
 */
546 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
548 struct tcmsg *tcm = NLMSG_DATA(n);
549 struct rtattr **tca = arg;
550 struct net_device *dev;
551 u32 clid = tcm->tcm_parent;
552 struct Qdisc *q = NULL;
553 struct Qdisc *p = NULL;
556 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
/* Locate the qdisc named by tcm_parent. */
560 if (clid != TC_H_ROOT) {
561 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
562 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
564 q = qdisc_leaf(p, clid);
565 } else { /* ingress */
566 q = dev->qdisc_ingress;
569 q = dev->qdisc_sleeping;
/* A supplied handle must agree with the qdisc we resolved. */
574 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
577 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
581 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
584 if (n->nlmsg_type == RTM_DELQDISC) {
/* Detach the qdisc, notify userspace, then destroy it under
 * dev->queue_lock (the qdisc_destroy call is on an elided line). */
589 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
592 qdisc_notify(skb, n, clid, q, NULL);
593 spin_lock_bh(&dev->queue_lock);
595 spin_unlock_bh(&dev->queue_lock);
598 qdisc_notify(skb, n, clid, NULL, q);
/*
 * tc_modify_qdisc - RTM_NEWQDISC handler: create, replace, or change a
 * qdisc depending on the netlink flags (CREATE/REPLACE/EXCL).
 * Resolves the parent (root, ingress, or class leaf), decides between
 * changing the existing qdisc and creating/grafting a new one, and
 * finally grafts + notifies.  Many goto targets and error returns are
 * elided in this excerpt.
 */
607 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
609 struct tcmsg *tcm = NLMSG_DATA(n);
610 struct rtattr **tca = arg;
611 struct net_device *dev;
612 u32 clid = tcm->tcm_parent;
613 struct Qdisc *q = NULL;
614 struct Qdisc *p = NULL;
617 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
/* Resolve the qdisc currently attached at tcm_parent. */
621 if (clid != TC_H_ROOT) {
622 if (clid != TC_H_INGRESS) {
623 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
625 q = qdisc_leaf(p, clid);
626 } else { /*ingress */
627 q = dev->qdisc_ingress;
630 q = dev->qdisc_sleeping;
633 /* It may be default qdisc, ignore it */
634 if (q && q->handle == 0)
637 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
638 if (tcm->tcm_handle) {
/* Handle given but differs from the attached qdisc: only valid
 * with NLM_F_REPLACE; the handle's minor must be zero. */
639 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
641 if (TC_H_MIN(tcm->tcm_handle))
643 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
645 if (n->nlmsg_flags&NLM_F_EXCL)
647 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
/* Re-grafting an existing qdisc must not create a cycle. */
650 (p && check_loop(q, p, 0)))
652 atomic_inc(&q->refcnt);
658 /* This magic test requires explanation.
660 * We know, that some child q is already
661 * attached to this parent and have choice:
662 * either to change it or to create/graft new one.
664 * 1. We are allowed to create/graft only
665 * if CREATE and REPLACE flags are set.
667 * 2. If EXCL is set, requestor wanted to say,
668 * that qdisc tcm_handle is not expected
669 * to exist, so that we choose create/graft too.
671 * 3. The last case is when no flags are set.
672 * Alas, it is sort of hole in API, we
673 * cannot decide what to do unambiguously.
674 * For now we select create/graft, if
675 * user gave KIND, which does not match existing.
677 if ((n->nlmsg_flags&NLM_F_CREATE) &&
678 (n->nlmsg_flags&NLM_F_REPLACE) &&
679 ((n->nlmsg_flags&NLM_F_EXCL) ||
681 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
686 if (!tcm->tcm_handle)
688 q = qdisc_lookup(dev, tcm->tcm_handle);
691 /* Change qdisc parameters */
694 if (n->nlmsg_flags&NLM_F_EXCL)
696 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
698 err = qdisc_change(q, tca);
700 qdisc_notify(skb, n, clid, NULL, q);
/* Create path: requires NLM_F_CREATE; ingress uses tcm_parent as
 * the handle, everything else uses tcm_handle. */
704 if (!(n->nlmsg_flags&NLM_F_CREATE))
706 if (clid == TC_H_INGRESS)
707 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
709 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
/* Graft the new qdisc in; destroy whatever it displaced. */
715 struct Qdisc *old_q = NULL;
716 err = qdisc_graft(dev, p, clid, q, &old_q);
719 spin_lock_bh(&dev->queue_lock);
721 spin_unlock_bh(&dev->queue_lock);
725 qdisc_notify(skb, n, clid, old_q, q);
727 spin_lock_bh(&dev->queue_lock);
728 qdisc_destroy(old_q);
729 spin_unlock_bh(&dev->queue_lock);
/*
 * tc_fill_qdisc - serialize one qdisc into an skb as an RTM_* netlink
 * message: tcmsg header, TCA_KIND, the qdisc's own dump, and the
 * compat stats block (basic / rate-est / queue stats).  On overflow the
 * rtattr_failure path (label elided) trims the skb back to where the
 * message started.
 */
735 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
736 u32 pid, u32 seq, unsigned flags, int event)
739 struct nlmsghdr *nlh;
/* Remember the tail so a failed fill can be rolled back. */
740 unsigned char *b = skb->tail;
743 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
744 nlh->nlmsg_flags = flags;
745 tcm = NLMSG_DATA(nlh);
746 tcm->tcm_family = AF_UNSPEC;
747 tcm->tcm_ifindex = q->dev->ifindex;
748 tcm->tcm_parent = clid;
749 tcm->tcm_handle = q->handle;
/* tcm_info carries the refcount for userspace diagnostics. */
750 tcm->tcm_info = atomic_read(&q->refcnt);
751 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
752 if (q->ops->dump && q->ops->dump(q, skb) < 0)
754 q->qstats.qlen = q->q.qlen;
756 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
757 TCA_XSTATS, q->stats_lock, &d) < 0)
760 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
763 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
764 #ifdef CONFIG_NET_ESTIMATOR
765 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
767 gnet_stats_copy_queue(&d, &q->qstats) < 0)
770 if (gnet_stats_finish_copy(&d) < 0)
773 nlh->nlmsg_len = skb->tail - b;
/* Rollback: discard the partially-built message. */
778 skb_trim(skb, b - skb->data);
/*
 * qdisc_notify - broadcast a qdisc change to the RTMGRP_TC multicast
 * group: a DELQDISC record for the old qdisc (if it had a real handle)
 * and/or a NEWQDISC record for the new one, echoed back to the
 * requester when NLM_F_ECHO is set.  The allocation-failure and
 * fill-failure paths are elided in this excerpt.
 */
782 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
783 u32 clid, struct Qdisc *old, struct Qdisc *new)
786 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
788 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
/* Skip qdiscs with handle 0 (the invisible default). */
792 if (old && old->handle) {
793 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
797 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
802 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
/*
 * tc_dump_qdisc - RTM_GETQDISC dump callback: iterate every device and
 * every qdisc on it, resuming from cb->args[] (device index in args[0]
 * on an elided line, qdisc index in args[1]).  Stops early when the
 * skb fills up, saving the position for the next dump round.
 */
809 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
813 struct net_device *dev;
817 s_q_idx = q_idx = cb->args[1];
818 read_lock(&dev_base_lock);
819 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
824 read_lock_bh(&qdisc_tree_lock);
826 list_for_each_entry(q, &dev->qdisc_list, list) {
/* Skip entries already dumped in a previous round. */
827 if (q_idx < s_q_idx) {
831 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
832 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
833 read_unlock_bh(&qdisc_tree_lock);
838 read_unlock_bh(&qdisc_tree_lock);
842 read_unlock(&dev_base_lock);
852 /************************************************
853 * Traffic classes manipulation. *
854 ************************************************/
/*
 * tc_ctl_tclass - RTM_{NEW,DEL,GET}TCLASS handler.
 * Normalizes the (parent, handle) pair into a qdisc major + class id,
 * locates the owning qdisc, then dispatches on nlmsg_type: delete the
 * class, report it, or create/change it via cops->change.  Error
 * returns and the final put() are elided in this excerpt.
 */
858 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
860 struct tcmsg *tcm = NLMSG_DATA(n);
861 struct rtattr **tca = arg;
862 struct net_device *dev;
863 struct Qdisc *q = NULL;
864 struct Qdisc_class_ops *cops;
865 unsigned long cl = 0;
866 unsigned long new_cl;
867 u32 pid = tcm->tcm_parent;
868 u32 clid = tcm->tcm_handle;
869 u32 qid = TC_H_MAJ(clid);
872 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
876 parent == TC_H_UNSPEC - unspecified parent.
877 parent == TC_H_ROOT - class is root, which has no parent.
878 parent == X:0 - parent is root class.
879 parent == X:Y - parent is a node in hierarchy.
880 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
882 handle == 0:0 - generate handle from kernel pool.
883 handle == 0:Y - class is X:Y, where X:0 is qdisc.
884 handle == X:Y - clear.
885 handle == X:0 - root class.
888 /* Step 1. Determine qdisc handle X:0 */
890 if (pid != TC_H_ROOT) {
891 u32 qid1 = TC_H_MAJ(pid);
894 /* If both majors are known, they must be identical. */
/* Neither side named the qdisc: default to the device root. */
900 qid = dev->qdisc_sleeping->handle;
902 /* Now qid is genuine qdisc handle consistent
903 both with parent and child.
905 TC_H_MAJ(pid) still may be unspecified, complete it now.
908 pid = TC_H_MAKE(qid, pid);
911 qid = dev->qdisc_sleeping->handle;
914 /* OK. Locate qdisc */
915 if ((q = qdisc_lookup(dev, qid)) == NULL)
918 /* And check that it supports classes */
919 cops = q->ops->cl_ops;
923 /* Now try to get class */
925 if (pid == TC_H_ROOT)
/* Complete the class id with the qdisc major. */
928 clid = TC_H_MAKE(qid, clid);
931 cl = cops->get(q, clid);
/* Class not found: only NEWTCLASS with NLM_F_CREATE may proceed. */
935 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
938 switch (n->nlmsg_type) {
941 if (n->nlmsg_flags&NLM_F_EXCL)
945 err = cops->delete(q, cl);
947 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
950 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
/* Create/change: the class ops decide and return the class id. */
959 err = cops->change(q, clid, pid, tca, &new_cl);
961 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
/*
 * tc_fill_tclass - serialize one traffic class into an skb, mirroring
 * tc_fill_qdisc: tcmsg header, TCA_KIND, the class ops' dump, then the
 * compat stats block.  On overflow the rtattr_failure path trims the
 * skb back to where this message started.
 * NOTE(review): tcm_handle is set from q->handle here — in the full
 * source the class dump callback overwrites it; confirm upstream.
 */
971 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
973 u32 pid, u32 seq, unsigned flags, int event)
976 struct nlmsghdr *nlh;
/* Remember the tail so a failed fill can be rolled back. */
977 unsigned char *b = skb->tail;
979 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
981 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
982 nlh->nlmsg_flags = flags;
983 tcm = NLMSG_DATA(nlh);
984 tcm->tcm_family = AF_UNSPEC;
985 tcm->tcm_ifindex = q->dev->ifindex;
986 tcm->tcm_parent = q->handle;
987 tcm->tcm_handle = q->handle;
989 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
990 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
993 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
994 TCA_XSTATS, q->stats_lock, &d) < 0)
997 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1000 if (gnet_stats_finish_copy(&d) < 0)
1001 goto rtattr_failure;
1003 nlh->nlmsg_len = skb->tail - b;
/* Rollback: discard the partially-built message. */
1008 skb_trim(skb, b - skb->data);
/*
 * tclass_notify - broadcast a traffic-class event (new/del) to the
 * RTMGRP_TC multicast group, echoing to the requester when NLM_F_ECHO
 * is set.  Allocation-failure handling is elided in this excerpt.
 */
1012 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1013 struct Qdisc *q, unsigned long cl, int event)
1015 struct sk_buff *skb;
1016 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1018 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1022 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1027 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1030 struct qdisc_dump_args
1032 struct qdisc_walker w;
1033 struct sk_buff *skb;
1034 struct netlink_callback *cb;
/*
 * qdisc_class_dump - walker callback used by tc_dump_tclass: emit one
 * class as an RTM_NEWTCLASS multi-part record into the dump skb.
 */
1037 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1039 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1041 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1042 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
/*
 * tc_dump_tclass - RTM_GETTCLASS dump callback: for each qdisc on the
 * requested device (optionally filtered by tcm_parent's major), walk
 * its classes with qdisc_class_dump, resuming from cb->args[].
 * args[0] (qdisc index t/s_t, assignments elided) and args[1] (class
 * skip count) carry the resume position between rounds.
 */
1045 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1049 struct net_device *dev;
1051 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1052 struct qdisc_dump_args arg;
1054 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1056 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1062 read_lock_bh(&qdisc_tree_lock);
1063 list_for_each_entry(q, &dev->qdisc_list, list) {
/* Skip classless qdiscs, already-dumped ones, and parent mismatches. */
1064 if (t < s_t || !q->ops->cl_ops ||
1066 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
/* New qdisc: reset the per-qdisc resume state in args[1..]. */
1071 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1072 arg.w.fn = qdisc_class_dump;
1076 arg.w.skip = cb->args[1];
1078 q->ops->cl_ops->walk(q, &arg.w);
1079 cb->args[1] = arg.w.count;
1084 read_unlock_bh(&qdisc_tree_lock);
1092 /* Main classifier routine: scans classifier chain attached
1093 to this qdisc, (optionally) tests for protocol and asks
1094 specific classifiers.
/*
 * tc_classify - run an skb through a chain of classifiers.
 * Walks the tcf_proto list, trying each classifier whose protocol
 * matches the skb (or is ETH_P_ALL).  With CONFIG_NET_CLS_ACT, a
 * TC_ACT_RECLASSIFY verdict restarts from the head of the chain (otp),
 * bounded by MAX_REC_LOOP to break classifier loops; the restart jump
 * itself sits on elided lines.
 */
1096 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1097 struct tcf_result *res)
1100 u32 protocol = skb->protocol;
1101 #ifdef CONFIG_NET_CLS_ACT
/* Keep the chain head so a reclassify can restart from it. */
1102 struct tcf_proto *otp = tp;
1105 protocol = skb->protocol;
1107 for ( ; tp; tp = tp->next) {
1108 if ((tp->protocol == protocol ||
1109 tp->protocol == __constant_htons(ETH_P_ALL)) &&
1110 (err = tp->classify(skb, tp, res)) >= 0) {
1111 #ifdef CONFIG_NET_CLS_ACT
1112 if ( TC_ACT_RECLASSIFY == err) {
1113 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
/* Too many reclassify rounds: drop and warn about the buggy rule. */
1116 if (MAX_REC_LOOP < verd++) {
1117 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1118 tp->prio&0xffff, ntohs(tp->protocol));
1121 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
/* Final verdict: clear the reclassify counter before returning. */
1125 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1138 static int psched_us_per_tick = 1;
1139 static int psched_tick_per_us = 1;
1141 #ifdef CONFIG_PROC_FS
/*
 * psched_show - /proc/net/psched seq_file handler: prints the clock
 * resolution parameters as four hex words (trailing arguments are on
 * elided lines).
 */
1142 static int psched_show(struct seq_file *seq, void *v)
1144 seq_printf(seq, "%08x %08x %08x %08x\n",
1145 psched_tick_per_us, psched_us_per_tick,
/* psched_open - open handler: single-shot seq_file wrapping psched_show. */
1151 static int psched_open(struct inode *inode, struct file *file)
1153 return single_open(file, psched_show, PDE(inode)->data);
/* File operations for /proc/net/psched (.read entry elided here). */
1156 static struct file_operations psched_fops = {
1157 .owner = THIS_MODULE,
1158 .open = psched_open,
1160 .llseek = seq_lseek,
1161 .release = single_release,
1165 #ifdef CONFIG_NET_SCH_CLK_CPU
1166 psched_tdiff_t psched_clock_per_hz;
1167 int psched_clock_scale;
1168 EXPORT_SYMBOL(psched_clock_per_hz);
1169 EXPORT_SYMBOL(psched_clock_scale);
1171 psched_time_t psched_time_base;
1172 cycles_t psched_time_mark;
1173 EXPORT_SYMBOL(psched_time_mark);
1174 EXPORT_SYMBOL(psched_time_base);
1177 * Periodically adjust psched_time_base to avoid overflow
1178 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
1180 static void psched_tick(unsigned long);
1181 static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);
/*
 * psched_tick - periodic timer (1s) that re-reads the scheduler clock
 * so psched_time_base keeps up with a 32-bit get_cycles() counter; on
 * 64-bit cycle counters the body (compile-time dead) does nothing.
 */
1183 static void psched_tick(unsigned long dummy)
1185 if (sizeof(cycles_t) == sizeof(u32)) {
1186 psched_time_t dummy_stamp;
/* Reading the time has the side effect of updating the base. */
1187 PSCHED_GET_TIME(dummy_stamp);
1188 psched_timer.expires = jiffies + 1*HZ;
1189 add_timer(&psched_timer);
/*
 * psched_calibrate_clock - boot-time calibration of the CPU-cycle
 * scheduler clock: busy-waits ~HZ/10 jiffies, measures elapsed cycles
 * (PSCHED_TDIFF) against gettimeofday microseconds (rdelay), then
 * derives psched_tick_per_us / psched_clock_scale / psched_clock_per_hz.
 * NOTE(review): the rdelay-vs-delay normalization between lines 1214
 * and 1218 is elided in this excerpt.
 */
1193 int __init psched_calibrate_clock(void)
1195 psched_time_t stamp, stamp1;
1196 struct timeval tv, tv1;
1197 psched_tdiff_t delay;
/* Sample both clocks at the start and end of a ~100ms busy wait. */
1202 stop = jiffies + HZ/10;
1203 PSCHED_GET_TIME(stamp);
1204 do_gettimeofday(&tv);
1205 while (time_before(jiffies, stop)) {
1209 PSCHED_GET_TIME(stamp1);
1210 do_gettimeofday(&tv1);
1212 delay = PSCHED_TDIFF(stamp1, stamp);
1213 rdelay = tv1.tv_usec - tv.tv_usec;
1214 rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1218 psched_tick_per_us = delay;
/* clock_scale = floor(log2(delay)); us_per_tick = 2^scale. */
1219 while ((delay>>=1) != 0)
1220 psched_clock_scale++;
1221 psched_us_per_tick = 1<<psched_clock_scale;
1222 psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
/*
 * pktsched_init - subsystem init: calibrate the scheduler clock (CPU
 * or jiffies based, per config), wire the RTM_*QDISC / RTM_*TCLASS
 * rtnetlink handlers into the PF_UNSPEC link table, register the
 * built-in fifo qdiscs and create /proc/net/psched.
 */
1227 static int __init pktsched_init(void)
1229 struct rtnetlink_link *link_p;
1231 #ifdef CONFIG_NET_SCH_CLK_CPU
1232 if (psched_calibrate_clock() < 0)
1234 #elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1235 psched_tick_per_us = HZ<<PSCHED_JSCALE;
1236 psched_us_per_tick = 1000000;
1239 link_p = rtnetlink_links[PF_UNSPEC];
1241 /* Setup rtnetlink links. It is made here to avoid
1242 exporting large number of public symbols.
1246 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1247 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1248 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1249 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1250 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1251 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1252 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1253 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
/* pfifo/bfifo are built in and always available. */
1256 register_qdisc(&pfifo_qdisc_ops);
1257 register_qdisc(&bfifo_qdisc_ops);
1258 proc_net_fops_create("psched", 0, &psched_fops);
1263 subsys_initcall(pktsched_init);
1265 EXPORT_SYMBOL(qdisc_get_rtab);
1266 EXPORT_SYMBOL(qdisc_put_rtab);
1267 EXPORT_SYMBOL(register_qdisc);
1268 EXPORT_SYMBOL(unregister_qdisc);
1269 EXPORT_SYMBOL(tc_classify);