VServer 1.9.2 (patch-2.6.8.1-vs1.9.2.diff)
[linux-2.6.git] / net / sched / sch_generic.c
index 97b5625..82117f9 100644 (file)
@@ -30,6 +30,8 @@
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 
    The idea is the following:
    - enqueue, dequeue are serialized via top level device
      spinlock dev->queue_lock.
-   - tree walking is protected by read_lock(qdisc_tree_lock)
+   - tree walking is protected by read_lock_bh(qdisc_tree_lock)
      and this lock is used only in process context.
-   - updates to tree are made only under rtnl semaphore,
-     hence this lock may be made without local bh disabling.
+   - updates to tree are made under rtnl semaphore or
+     from softirq context (__qdisc_destroy rcu-callback)
+     hence this lock needs local bh disabling.
 
    qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
  */
 rwlock_t qdisc_tree_lock = RW_LOCK_UNLOCKED;
 
+void qdisc_lock_tree(struct net_device *dev)
+{
+       write_lock_bh(&qdisc_tree_lock);        /* tree lock first: it must be grabbed before dev->queue_lock */
+       spin_lock_bh(&dev->queue_lock);         /* then the queue lock of this device */
+}
+
+void qdisc_unlock_tree(struct net_device *dev)
+{
+       spin_unlock_bh(&dev->queue_lock);       /* release in the reverse order of qdisc_lock_tree() */
+       write_unlock_bh(&qdisc_tree_lock);
+}
+
 /* 
    dev->queue_lock serializes queue accesses for this device
    AND dev->qdisc pointer itself.
@@ -270,10 +285,9 @@ static const u8 prio2band[TC_PRIO_MAX+1] =
 static int
 pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
 {
-       struct sk_buff_head *list;
+       struct sk_buff_head *list = qdisc_priv(qdisc);
 
-       list = ((struct sk_buff_head*)qdisc->data) +
-               prio2band[skb->priority&TC_PRIO_MAX];
+       list += prio2band[skb->priority&TC_PRIO_MAX];
 
        if (list->qlen < qdisc->dev->tx_queue_len) {
                __skb_queue_tail(list, skb);
@@ -291,7 +305,7 @@ static struct sk_buff *
 pfifo_fast_dequeue(struct Qdisc* qdisc)
 {
        int prio;
-       struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
+       struct sk_buff_head *list = qdisc_priv(qdisc);
        struct sk_buff *skb;
 
        for (prio = 0; prio < 3; prio++, list++) {
@@ -307,10 +321,9 @@ pfifo_fast_dequeue(struct Qdisc* qdisc)
 static int
 pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
 {
-       struct sk_buff_head *list;
+       struct sk_buff_head *list = qdisc_priv(qdisc);
 
-       list = ((struct sk_buff_head*)qdisc->data) +
-               prio2band[skb->priority&TC_PRIO_MAX];
+       list += prio2band[skb->priority&TC_PRIO_MAX];
 
        __skb_queue_head(list, skb);
        qdisc->q.qlen++;
@@ -321,7 +334,7 @@ static void
 pfifo_fast_reset(struct Qdisc* qdisc)
 {
        int prio;
-       struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
+       struct sk_buff_head *list = qdisc_priv(qdisc);
 
        for (prio=0; prio < 3; prio++)
                skb_queue_purge(list+prio);
@@ -346,9 +359,7 @@ rtattr_failure:
 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
 {
        int i;
-       struct sk_buff_head *list;
-
-       list = ((struct sk_buff_head*)qdisc->data);
+       struct sk_buff_head *list = qdisc_priv(qdisc);
 
        for (i=0; i<3; i++)
                skb_queue_head_init(list+i);
@@ -372,25 +383,45 @@ static struct Qdisc_ops pfifo_fast_ops = {
 
+/* Allocate and initialise a default qdisc of type ops for dev.
+ * Takes a reference on dev; returns the new qdisc (refcnt 1), or NULL if
+ * the allocation or ops->init() fails. */
 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
 {
+       void *p;
        struct Qdisc *sch;
-       int size = sizeof(*sch) + ops->priv_size;
+       int size;
 
-       sch = kmalloc(size, GFP_KERNEL);
-       if (!sch)
+       /* ensure that the Qdisc and the private data are 32-byte aligned */
+       size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
+       size += ops->priv_size + QDISC_ALIGN_CONST;
+
+       p = kmalloc(size, GFP_KERNEL);
+       if (!p)
                return NULL;
-       memset(sch, 0, size);
+       memset(p, 0, size);
+
+       sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) 
+                              & ~QDISC_ALIGN_CONST);
+       sch->padded = (char *)sch - (char *)p;  /* offset needed later to recover p for kfree() */
 
+       INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
-       sch->stats.lock = &dev->queue_lock;
+       dev_hold(dev);
+       sch->stats_lock = &dev->queue_lock;
        atomic_set(&sch->refcnt, 1);
+       /* enqueue is accessed locklessly - make sure it's visible
+        * before we set a netdevice's qdisc pointer to sch */
+       smp_wmb();
        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;
 
-       kfree(sch);
+       /* init failed: nobody will ever destroy sch, so release the device
+        * reference taken by dev_hold() above before freeing the memory */
+       dev_put(dev);
+       kfree(p);
        return NULL;
 }
 
@@ -404,39 +429,39 @@ void qdisc_reset(struct Qdisc *qdisc)
                ops->reset(qdisc);
 }
 
-/* Under dev->queue_lock and BH! */
+/* RCU callback queued by qdisc_destroy(): resets and destroys the qdisc,
+ * then drops its module and device references and frees its memory. */
 
-void qdisc_destroy(struct Qdisc *qdisc)
+static void __qdisc_destroy(struct rcu_head *head)
 {
-       struct Qdisc_ops *ops = qdisc->ops;
-       struct net_device *dev;
-
-       if (!atomic_dec_and_test(&qdisc->refcnt))
-               return;
-
-       dev = qdisc->dev;
+       struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
+       struct Qdisc_ops  *ops = qdisc->ops;
 
-       if (dev) {
-               struct Qdisc *q, **qp;
-               for (qp = &qdisc->dev->qdisc_list; (q=*qp) != NULL; qp = &q->next) {
-                       if (q == qdisc) {
-                               *qp = q->next;
-                               break;
-                       }
-               }
-       }
 #ifdef CONFIG_NET_ESTIMATOR
        qdisc_kill_estimator(&qdisc->stats);
 #endif
+       write_lock(&qdisc_tree_lock);   /* ops->reset/ops->destroy run under the tree write lock */
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);
+       write_unlock(&qdisc_tree_lock);
        module_put(ops->owner);
+
+       dev_put(qdisc->dev);    /* pairs with a dev_hold() at creation, e.g. in qdisc_create_dflt() */
        if (!(qdisc->flags&TCQ_F_BUILTIN))
-               kfree(qdisc);
+               kfree((char *) qdisc - qdisc->padded);  /* step back by the alignment offset to the kmalloc() pointer */
 }
 
+/* Drops one reference to qdisc.  Call under dev->queue_lock and BH! */
+
+void qdisc_destroy(struct Qdisc *qdisc)
+{
+       if (!atomic_dec_and_test(&qdisc->refcnt))
+               return;
+       list_del(&qdisc->list);         /* last reference gone: unlink (dev_activate() links it on dev->qdisc_list) */
+       call_rcu(&qdisc->q_rcu, __qdisc_destroy);       /* the actual teardown is deferred to the RCU callback */
+}
 
 void dev_activate(struct net_device *dev)
 {
@@ -454,18 +479,15 @@ void dev_activate(struct net_device *dev)
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }
-
-                       write_lock(&qdisc_tree_lock);
-                       qdisc->next = dev->qdisc_list;
-                       dev->qdisc_list = qdisc;
-                       write_unlock(&qdisc_tree_lock);
-
+                       write_lock_bh(&qdisc_tree_lock);
+                       list_add_tail(&qdisc->list, &dev->qdisc_list);
+                       write_unlock_bh(&qdisc_tree_lock);
                } else {
                        qdisc =  &noqueue_qdisc;
                }
-               write_lock(&qdisc_tree_lock);
+               write_lock_bh(&qdisc_tree_lock);
                dev->qdisc_sleeping = qdisc;
-               write_unlock(&qdisc_tree_lock);
+               write_unlock_bh(&qdisc_tree_lock);
        }
 
        spin_lock_bh(&dev->queue_lock);
@@ -498,13 +520,11 @@ void dev_deactivate(struct net_device *dev)
 
 void dev_init_scheduler(struct net_device *dev)
 {
-       write_lock(&qdisc_tree_lock);
-       spin_lock_bh(&dev->queue_lock);
+       qdisc_lock_tree(dev);   /* write-locks qdisc_tree_lock, then dev->queue_lock, both with BH disabled */
        dev->qdisc = &noop_qdisc;
-       spin_unlock_bh(&dev->queue_lock);
        dev->qdisc_sleeping = &noop_qdisc;
-       dev->qdisc_list = NULL;
-       write_unlock(&qdisc_tree_lock);
+       INIT_LIST_HEAD(&dev->qdisc_list);       /* qdisc_list is a list_head here (was a NULL-terminated chain) */
+       qdisc_unlock_tree(dev); /* qdisc_sleeping and qdisc_list are now set before dev->queue_lock is dropped */
 
        dev_watchdog_init(dev);
 }
@@ -513,8 +533,7 @@ void dev_shutdown(struct net_device *dev)
 {
        struct Qdisc *qdisc;
 
-       write_lock(&qdisc_tree_lock);
-       spin_lock_bh(&dev->queue_lock);
+       qdisc_lock_tree(dev);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
@@ -525,11 +544,8 @@ void dev_shutdown(struct net_device *dev)
                qdisc_destroy(qdisc);
         }
 #endif
-       BUG_TRAP(dev->qdisc_list == NULL);
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
-       dev->qdisc_list = NULL;
-       spin_unlock_bh(&dev->queue_lock);
-       write_unlock(&qdisc_tree_lock);
+       qdisc_unlock_tree(dev);
 }
 
 EXPORT_SYMBOL(__netdev_watchdog_up);
@@ -539,4 +555,5 @@ EXPORT_SYMBOL(qdisc_create_dflt);
 EXPORT_SYMBOL(qdisc_destroy);
 EXPORT_SYMBOL(qdisc_reset);
 EXPORT_SYMBOL(qdisc_restart);
-EXPORT_SYMBOL(qdisc_tree_lock);
+EXPORT_SYMBOL(qdisc_lock_tree);
+EXPORT_SYMBOL(qdisc_unlock_tree);