vserver 1.9.3
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 97b5625..2516dd9 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -30,6 +30,8 @@
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 
    The idea is the following:
    - enqueue, dequeue are serialized via top level device
      spinlock dev->queue_lock.
-   - tree walking is protected by read_lock(qdisc_tree_lock)
+   - tree walking is protected by read_lock_bh(qdisc_tree_lock)
      and this lock is used only in process context.
-   - updates to tree are made only under rtnl semaphore,
-     hence this lock may be made without local bh disabling.
+   - updates to tree are made either under rtnl semaphore or
+     from softirq context (__qdisc_destroy rcu-callback),
+     hence this lock needs local bh disabling.
 
    qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
  */
 rwlock_t qdisc_tree_lock = RW_LOCK_UNLOCKED;
 
+void qdisc_lock_tree(struct net_device *dev)
+{
+       write_lock_bh(&qdisc_tree_lock);
+       spin_lock_bh(&dev->queue_lock);
+}
+
+void qdisc_unlock_tree(struct net_device *dev)
+{
+       spin_unlock_bh(&dev->queue_lock);
+       write_unlock_bh(&qdisc_tree_lock);
+}
+
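For illustration, this is how a caller would use the new helpers: they bundle the documented lock ordering (qdisc_tree_lock strictly before dev->queue_lock) with BH disabling, so individual call sites cannot get it wrong. The function below is a hypothetical sketch, not part of the patch:

    /* Hypothetical caller: swap in a new sleeping qdisc under both locks.
     * qdisc_lock_tree() takes qdisc_tree_lock before dev->queue_lock,
     * matching the ordering rule documented above; unlock reverses it. */
    static void example_set_sleeping_qdisc(struct net_device *dev,
                                           struct Qdisc *qdisc)
    {
            qdisc_lock_tree(dev);
            dev->qdisc_sleeping = qdisc;
            qdisc_unlock_tree(dev);
    }
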
 /* 
    dev->queue_lock serializes queue accesses for this device
    AND dev->qdisc pointer itself.
@@ -82,46 +97,71 @@ int qdisc_restart(struct net_device *dev)
 
        /* Dequeue packet */
        if ((skb = q->dequeue(q)) != NULL) {
-               if (spin_trylock(&dev->xmit_lock)) {
+               unsigned nolock = (dev->features & NETIF_F_LLTX);
+               /*
+                * When the driver has LLTX set, it does its own locking
+                * in start_xmit. No need to add additional overhead by
+                * locking again. These checks are worth it because
+                * even uncontended locks can be quite expensive.
+                * The driver can do a trylock, as we do here; on lock
+                * contention it should return NETDEV_TX_LOCKED and the
+                * packet will be requeued.
+                */
+               if (!nolock) {
+                       if (!spin_trylock(&dev->xmit_lock)) {
+                       collision:
+                               /* So, someone grabbed the driver. */
+
+                               /* It may be transient configuration error,
+                                  when hard_start_xmit() recurses. We detect
+                                  it by checking xmit owner and drop the
+                                  packet when deadloop is detected.
+                               */
+                               if (dev->xmit_lock_owner == smp_processor_id()) {
+                                       kfree_skb(skb);
+                                       if (net_ratelimit())
+                                               printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
+                                       return -1;
+                               }
+                               __get_cpu_var(netdev_rx_stat).cpu_collision++;
+                               goto requeue;
+                       }
                        /* Remember that the driver is grabbed by us. */
                        dev->xmit_lock_owner = smp_processor_id();
-
+               }
+
+               {
                        /* And release queue */
                        spin_unlock(&dev->queue_lock);
 
                        if (!netif_queue_stopped(dev)) {
+                               int ret;
                                if (netdev_nit)
                                        dev_queue_xmit_nit(skb, dev);
 
-                               if (dev->hard_start_xmit(skb, dev) == 0) {
-                                       dev->xmit_lock_owner = -1;
-                                       spin_unlock(&dev->xmit_lock);
-
+                               ret = dev->hard_start_xmit(skb, dev);
+                               if (ret == NETDEV_TX_OK) {
+                                       if (!nolock) {
+                                               dev->xmit_lock_owner = -1;
+                                               spin_unlock(&dev->xmit_lock);
+                                       }
                                        spin_lock(&dev->queue_lock);
                                        return -1;
                                }
+                               if (ret == NETDEV_TX_LOCKED && nolock) {
+                                       spin_lock(&dev->queue_lock);
+                                       goto collision;
+                               }
                        }
 
+                       /* NETDEV_TX_BUSY - we need to requeue */
                        /* Release the driver */
-                       dev->xmit_lock_owner = -1;
-                       spin_unlock(&dev->xmit_lock);
+                       if (!nolock) {
+                               dev->xmit_lock_owner = -1;
+                               spin_unlock(&dev->xmit_lock);
+                       }
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
-               } else {
-                       /* So, someone grabbed the driver. */
-
-                       /* It may be transient configuration error,
-                          when hard_start_xmit() recurses. We detect
-                          it by checking xmit owner and drop the
-                          packet when deadloop is detected.
-                        */
-                       if (dev->xmit_lock_owner == smp_processor_id()) {
-                               kfree_skb(skb);
-                               if (net_ratelimit())
-                                       printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
-                               return -1;
-                       }
-                       __get_cpu_var(netdev_rx_stat).cpu_collision++;
                }
 
                /* Device kicked us out :(
@@ -134,6 +174,7 @@ int qdisc_restart(struct net_device *dev)
                   3. device is buggy (ppp)
                 */
 
+requeue:
                q->ops->requeue(skb, q);
                netif_schedule(dev);
                return 1;
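The NETDEV_TX_LOCKED handling above assumes an LLTX driver that does its own trylock, as the comment describes. A minimal sketch of such a driver's transmit routine (driver name, private structure, and lock field are hypothetical):

    /* Hypothetical LLTX driver transmit routine. Because NETIF_F_LLTX is
     * set, qdisc_restart() skips dev->xmit_lock entirely; the driver
     * serializes with its own lock and reports contention back to the
     * core, which then jumps to the collision: path and requeues the skb. */
    static int foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
    {
            struct foo_priv *priv = dev->priv;

            if (!spin_trylock(&priv->tx_lock))
                    return NETDEV_TX_LOCKED;

            /* ... hand skb to the hardware ring here ... */

            spin_unlock(&priv->tx_lock);
            return NETDEV_TX_OK;
    }
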
@@ -270,10 +311,9 @@ static const u8 prio2band[TC_PRIO_MAX+1] =
 static int
 pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
 {
-       struct sk_buff_head *list;
+       struct sk_buff_head *list = qdisc_priv(qdisc);
 
-       list = ((struct sk_buff_head*)qdisc->data) +
-               prio2band[skb->priority&TC_PRIO_MAX];
+       list += prio2band[skb->priority&TC_PRIO_MAX];
 
        if (list->qlen < qdisc->dev->tx_queue_len) {
                __skb_queue_tail(list, skb);
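qdisc_priv() is defined in the scheduler headers, not in this file. A sketch consistent with the aligned layout set up by qdisc_create_dflt() further down (an assumption about its shape, not a verbatim copy of the header):

    /* Sketch: the private area begins at the first QDISC_ALIGN boundary
     * past struct Qdisc, mirroring the size computation in
     * qdisc_create_dflt(). */
    static inline void *qdisc_priv(struct Qdisc *q)
    {
            return (char *)q + ((sizeof(struct Qdisc) + QDISC_ALIGN_CONST)
                                & ~QDISC_ALIGN_CONST);
    }
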
@@ -291,7 +331,7 @@ static struct sk_buff *
 pfifo_fast_dequeue(struct Qdisc* qdisc)
 {
        int prio;
-       struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
+       struct sk_buff_head *list = qdisc_priv(qdisc);
        struct sk_buff *skb;
 
        for (prio = 0; prio < 3; prio++, list++) {
@@ -307,10 +347,9 @@ pfifo_fast_dequeue(struct Qdisc* qdisc)
 static int
 pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
 {
-       struct sk_buff_head *list;
+       struct sk_buff_head *list = qdisc_priv(qdisc);
 
-       list = ((struct sk_buff_head*)qdisc->data) +
-               prio2band[skb->priority&TC_PRIO_MAX];
+       list += prio2band[skb->priority&TC_PRIO_MAX];
 
        __skb_queue_head(list, skb);
        qdisc->q.qlen++;
@@ -321,7 +360,7 @@ static void
 pfifo_fast_reset(struct Qdisc* qdisc)
 {
        int prio;
-       struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
+       struct sk_buff_head *list = qdisc_priv(qdisc);
 
        for (prio=0; prio < 3; prio++)
                skb_queue_purge(list+prio);
@@ -346,9 +385,7 @@ rtattr_failure:
 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
 {
        int i;
-       struct sk_buff_head *list;
-
-       list = ((struct sk_buff_head*)qdisc->data);
+       struct sk_buff_head *list = qdisc_priv(qdisc);
 
        for (i=0; i<3; i++)
                skb_queue_head_init(list+i);
@@ -372,25 +409,40 @@ static struct Qdisc_ops pfifo_fast_ops = {
 
 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
 {
+       void *p;
        struct Qdisc *sch;
-       int size = sizeof(*sch) + ops->priv_size;
+       int size;
+
+       /* ensure that the Qdisc and the private data are 32-byte aligned */
+       size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
+       size += ops->priv_size + QDISC_ALIGN_CONST;
 
-       sch = kmalloc(size, GFP_KERNEL);
-       if (!sch)
+       p = kmalloc(size, GFP_KERNEL);
+       if (!p)
                return NULL;
-       memset(sch, 0, size);
+       memset(p, 0, size);
 
+       sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
+                              & ~QDISC_ALIGN_CONST);
+       sch->padded = (char *)sch - (char *)p;
+
+       INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
-       sch->stats.lock = &dev->queue_lock;
+       dev_hold(dev);
+       sch->stats_lock = &dev->queue_lock;
        atomic_set(&sch->refcnt, 1);
+       /* enqueue is accessed locklessly - make sure it's visible
+        * before we set a netdevice's qdisc pointer to sch */
+       smp_wmb();
        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;
 
-       kfree(sch);
+       dev_put(dev);
+       kfree(p);
        return NULL;
 }
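A worked instance of the alignment bookkeeping, with made-up numbers (QDISC_ALIGN_CONST assumed to be 31, i.e. 32-byte alignment):

    /* Assume kmalloc() returns p = 0x1008, which is not 32-byte aligned:
     *
     *   sch    = (0x1008 + 31) & ~31 = 0x1020   -- 32-byte aligned
     *   padded = 0x1020 - 0x1008     = 0x18     -- at most 31, so the
     *                                              extra slack added to
     *                                              size always covers it
     *
     * Later, kfree((char *)sch - sch->padded) reconstructs exactly p,
     * the pointer kmalloc() originally handed out. */
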
 
@@ -404,39 +456,39 @@ void qdisc_reset(struct Qdisc *qdisc)
                ops->reset(qdisc);
 }
 
-/* Under dev->queue_lock and BH! */
+/* This is the RCU callback function to clean up a qdisc when there
+ * are no further references to it. */
 
-void qdisc_destroy(struct Qdisc *qdisc)
+static void __qdisc_destroy(struct rcu_head *head)
 {
-       struct Qdisc_ops *ops = qdisc->ops;
-       struct net_device *dev;
+       struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
+       struct Qdisc_ops  *ops = qdisc->ops;
 
-       if (!atomic_dec_and_test(&qdisc->refcnt))
-               return;
-
-       dev = qdisc->dev;
-
-       if (dev) {
-               struct Qdisc *q, **qp;
-               for (qp = &qdisc->dev->qdisc_list; (q=*qp) != NULL; qp = &q->next) {
-                       if (q == qdisc) {
-                               *qp = q->next;
-                               break;
-                       }
-               }
-       }
 #ifdef CONFIG_NET_ESTIMATOR
        qdisc_kill_estimator(&qdisc->stats);
 #endif
+       write_lock(&qdisc_tree_lock);
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);
+       write_unlock(&qdisc_tree_lock);
        module_put(ops->owner);
+
+       dev_put(qdisc->dev);
        if (!(qdisc->flags&TCQ_F_BUILTIN))
-               kfree(qdisc);
+               kfree((char *) qdisc - qdisc->padded);
 }
 
+/* Under dev->queue_lock and BH! */
+
+void qdisc_destroy(struct Qdisc *qdisc)
+{
+       if (!atomic_dec_and_test(&qdisc->refcnt))
+               return;
+       list_del(&qdisc->list);
+       call_rcu(&qdisc->q_rcu, __qdisc_destroy);
+}
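The reason destruction must go through call_rcu(): the transmit path dereferences dev->qdisc without taking qdisc_tree_lock. A simplified sketch of such a lockless reader (hypothetical; the real one is the dev_queue_xmit() fast path, which also runs with BHs disabled):

    /* Hypothetical lockless reader. The qdisc pointer is only guaranteed
     * to stay valid inside the RCU read-side critical section;
     * __qdisc_destroy() cannot run until all such sections have exited. */
    static int example_enqueue(struct sk_buff *skb, struct net_device *dev)
    {
            struct Qdisc *q;
            int rc;

            rcu_read_lock();
            q = dev->qdisc;
            spin_lock(&dev->queue_lock);
            rc = q->enqueue(skb, q);
            spin_unlock(&dev->queue_lock);
            rcu_read_unlock();

            return rc;
    }

This is also what the smp_wmb() in qdisc_create_dflt() pairs with: the qdisc must be fully initialized before its pointer becomes visible to readers like this.
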
 
 void dev_activate(struct net_device *dev)
 {
@@ -454,18 +506,15 @@ void dev_activate(struct net_device *dev)
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }
-
-                       write_lock(&qdisc_tree_lock);
-                       qdisc->next = dev->qdisc_list;
-                       dev->qdisc_list = qdisc;
-                       write_unlock(&qdisc_tree_lock);
-
+                       write_lock_bh(&qdisc_tree_lock);
+                       list_add_tail(&qdisc->list, &dev->qdisc_list);
+                       write_unlock_bh(&qdisc_tree_lock);
                } else {
                        qdisc =  &noqueue_qdisc;
                }
-               write_lock(&qdisc_tree_lock);
+               write_lock_bh(&qdisc_tree_lock);
                dev->qdisc_sleeping = qdisc;
-               write_unlock(&qdisc_tree_lock);
+               write_unlock_bh(&qdisc_tree_lock);
        }
 
        spin_lock_bh(&dev->queue_lock);
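With __qdisc_destroy() now running from softirq context, anyone walking dev->qdisc_list from process context must hold the tree lock with BHs disabled, per the updated rule at the top of the file. A hypothetical walker (real ones live in the qdisc dump code):

    /* Hypothetical walker over the per-device qdisc list. read_lock_bh()
     * keeps the rcu callback's write_lock(&qdisc_tree_lock) from running
     * on this CPU while we traverse. */
    struct Qdisc *q;

    read_lock_bh(&qdisc_tree_lock);
    list_for_each_entry(q, &dev->qdisc_list, list) {
            /* ... inspect q ... */
    }
    read_unlock_bh(&qdisc_tree_lock);
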
@@ -498,13 +547,11 @@ void dev_deactivate(struct net_device *dev)
 
 void dev_init_scheduler(struct net_device *dev)
 {
-       write_lock(&qdisc_tree_lock);
-       spin_lock_bh(&dev->queue_lock);
+       qdisc_lock_tree(dev);
        dev->qdisc = &noop_qdisc;
-       spin_unlock_bh(&dev->queue_lock);
        dev->qdisc_sleeping = &noop_qdisc;
-       dev->qdisc_list = NULL;
-       write_unlock(&qdisc_tree_lock);
+       INIT_LIST_HEAD(&dev->qdisc_list);
+       qdisc_unlock_tree(dev);
 
        dev_watchdog_init(dev);
 }
@@ -513,8 +560,7 @@ void dev_shutdown(struct net_device *dev)
 {
        struct Qdisc *qdisc;
 
-       write_lock(&qdisc_tree_lock);
-       spin_lock_bh(&dev->queue_lock);
+       qdisc_lock_tree(dev);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
@@ -525,11 +571,8 @@ void dev_shutdown(struct net_device *dev)
                qdisc_destroy(qdisc);
         }
 #endif
-       BUG_TRAP(dev->qdisc_list == NULL);
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
-       dev->qdisc_list = NULL;
-       spin_unlock_bh(&dev->queue_lock);
-       write_unlock(&qdisc_tree_lock);
+       qdisc_unlock_tree(dev);
 }
 
 EXPORT_SYMBOL(__netdev_watchdog_up);
@@ -539,4 +582,5 @@ EXPORT_SYMBOL(qdisc_create_dflt);
 EXPORT_SYMBOL(qdisc_destroy);
 EXPORT_SYMBOL(qdisc_reset);
 EXPORT_SYMBOL(qdisc_restart);
-EXPORT_SYMBOL(qdisc_tree_lock);
+EXPORT_SYMBOL(qdisc_lock_tree);
+EXPORT_SYMBOL(qdisc_unlock_tree);