/*
 * net/sched/sch_generic.c      Generic packet scheduler routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   However, modifications to data participating in scheduling
   must additionally be protected by the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top-level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock_bh(qdisc_tree_lock),
     and this lock is used only in process context.
   - updates to the tree are made under the rtnl semaphore or
     from softirq context (the __qdisc_destroy rcu-callback),
     hence this lock needs local bh disabling.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
DEFINE_RWLOCK(qdisc_tree_lock);

void qdisc_lock_tree(struct net_device *dev)
{
        write_lock_bh(&qdisc_tree_lock);
        spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
        spin_unlock_bh(&dev->queue_lock);
        write_unlock_bh(&qdisc_tree_lock);
}

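/* Illustrative usage of the locking helpers above (a sketch only,
 * mirroring what dev_init_scheduler() and dev_shutdown() do later in
 * this file):
 *
 *         qdisc_lock_tree(dev);
 *         dev->qdisc_sleeping = &noop_qdisc;
 *         qdisc_unlock_tree(dev);
 *
 * The helpers encode the ordering rule stated above: qdisc_tree_lock
 * is grabbed before dev->queue_lock.
 */
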
/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is held, the other must be free.
 */
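
/* The transmit path below follows this rule: qdisc_restart() dequeues
 * under dev->queue_lock, (try)locks the driver with netif_tx_trylock(),
 * then drops dev->queue_lock before calling dev_hard_start_xmit(), and
 * re-takes dev->queue_lock once the driver lock has been released.
 * A rough sketch of the non-LLTX path:
 *
 *         skb = q->dequeue(q);                  (queue_lock held)
 *         if (netif_tx_trylock(dev)) {
 *                 spin_unlock(&dev->queue_lock);
 *                 dev_hard_start_xmit(skb, dev);
 *                 netif_tx_unlock(dev);
 *                 spin_lock(&dev->queue_lock);
 *         }
 */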


/* Kick device.
   Note that this procedure can be called by a watchdog timer, so
   we do not check the dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled.
            <0  - queue is not empty. Device is throttled if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
        struct Qdisc *q = dev->qdisc;
        struct sk_buff *skb;

        /* Dequeue packet */
        if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
                unsigned nolock = (dev->features & NETIF_F_LLTX);

                dev->gso_skb = NULL;

                /*
                 * When the driver has LLTX set it does its own locking
                 * in start_xmit. No need to add additional overhead by
                 * locking again. These checks are worth it because
                 * even uncontended locks can be quite expensive.
                 * The driver can do a trylock, as we do here; in case
                 * of lock contention it should return -1 and the packet
                 * will be requeued.
                 */
                if (!nolock) {
                        if (!netif_tx_trylock(dev)) {
                        collision:
                                /* So, someone grabbed the driver. */

                                /* It may be a transient configuration error,
                                   when hard_start_xmit() recurses. We detect
                                   it by checking the xmit owner and drop the
                                   packet when a dead loop is detected.
                                */
                                if (dev->xmit_lock_owner == smp_processor_id()) {
                                        kfree_skb(skb);
                                        if (net_ratelimit())
                                                printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
                                        return -1;
                                }
                                __get_cpu_var(netdev_rx_stat).cpu_collision++;
                                goto requeue;
                        }
                }

                {
                        /* And release queue */
                        spin_unlock(&dev->queue_lock);

                        if (!netif_queue_stopped(dev)) {
                                int ret;

                                ret = dev_hard_start_xmit(skb, dev);
                                if (ret == NETDEV_TX_OK) {
                                        if (!nolock) {
                                                netif_tx_unlock(dev);
                                        }
                                        spin_lock(&dev->queue_lock);
                                        return -1;
                                }
                                if (ret == NETDEV_TX_LOCKED && nolock) {
                                        spin_lock(&dev->queue_lock);
                                        goto collision;
                                }
                        }

                        /* NETDEV_TX_BUSY - we need to requeue */
                        /* Release the driver */
                        if (!nolock) {
                                netif_tx_unlock(dev);
                        }
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
                }

                /* Device kicked us out :(
                   This is possible in the following cases:

                   0. driver is locked
                   1. fastroute is enabled
                   2. device cannot determine busy state
                      before start of transmission (e.g. dialout)
                   3. device is buggy (ppp)
                 */

requeue:
                if (skb->next)
                        dev->gso_skb = skb;
                else
                        q->ops->requeue(skb, q);
                netif_schedule(dev);
                return 1;
        }
        BUG_ON((int) q->q.qlen < 0);
        return q->q.qlen;
}

void __qdisc_run(struct net_device *dev)
{
        if (unlikely(dev->qdisc == &noop_qdisc))
                goto out;

        while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
                /* NOTHING */;

out:
        clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

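/* For reference: __qdisc_run() is normally entered through the
 * qdisc_run() wrapper in include/net/pkt_sched.h, which (roughly,
 * as a sketch for this kernel generation) looks like:
 *
 *         static inline void qdisc_run(struct net_device *dev)
 *         {
 *                 if (!netif_queue_stopped(dev) &&
 *                     !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 *                         __qdisc_run(dev);
 *         }
 *
 * so only one CPU at a time runs the qdisc of a given device, and
 * __qdisc_run() clears the bit when it is done.
 */
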
static void dev_watchdog(unsigned long arg)
{
        struct net_device *dev = (struct net_device *)arg;

        netif_tx_lock(dev);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
                    netif_carrier_ok(dev)) {
                        if (netif_queue_stopped(dev) &&
                            time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

                                printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
                                       dev->name);
                                dev->tx_timeout(dev);
                        }
                        if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                                dev_hold(dev);
                }
        }
        netif_tx_unlock(dev);

        dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
        init_timer(&dev->watchdog_timer);
        dev->watchdog_timer.data = (unsigned long)dev;
        dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
        if (dev->tx_timeout) {
                if (dev->watchdog_timeo <= 0)
                        dev->watchdog_timeo = 5*HZ;
                if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                        dev_hold(dev);
        }
}

static void dev_watchdog_up(struct net_device *dev)
{
        netif_tx_lock_bh(dev);
        __netdev_watchdog_up(dev);
        netif_tx_unlock_bh(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
        netif_tx_lock_bh(dev);
        if (del_timer(&dev->watchdog_timer))
                dev_put(dev);
        netif_tx_unlock_bh(dev);
}

void netif_carrier_on(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
        if (netif_running(dev))
                __netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
        kfree_skb(skb);
        return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
        return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        if (net_ratelimit())
                printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
                       skb->dev->name);
        kfree_skb(skb);
        return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
        .id             =       "noop",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

struct Qdisc noop_qdisc = {
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noop_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
        .id             =       "noqueue",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
        .enqueue        =       NULL,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noqueue_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   the generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

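/* A note on the prio2band table above (for illustration): skb->priority
 * values from <linux/pkt_sched.h> are folded into the three bands, and
 * band 0 is always dequeued first.  For example, with the table as
 * defined here:
 *
 *         TC_PRIO_BESTEFFORT  (0) -> band 1
 *         TC_PRIO_BULK        (2) -> band 2
 *         TC_PRIO_INTERACTIVE (6) -> band 0
 *         TC_PRIO_CONTROL     (7) -> band 0
 */
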
static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
                                             struct Qdisc *qdisc)
{
        struct sk_buff_head *list = qdisc_priv(qdisc);
        return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        struct sk_buff_head *list = prio2list(skb, qdisc);

        if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
                qdisc->q.qlen++;
                return __qdisc_enqueue_tail(skb, qdisc, list);
        }

        return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
                if (!skb_queue_empty(list + prio)) {
                        qdisc->q.qlen--;
                        return __qdisc_dequeue_head(qdisc, list + prio);
                }
        }

        return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        qdisc->q.qlen++;
        return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                __qdisc_reset_queue(qdisc, list + prio);

        qdisc->qstats.backlog = 0;
        qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
        struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

        memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

rtattr_failure:
        return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                skb_queue_head_init(list + prio);

        return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
        .id             =       "pfifo_fast",
        .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
        .enqueue        =       pfifo_fast_enqueue,
        .dequeue        =       pfifo_fast_dequeue,
        .requeue        =       pfifo_fast_requeue,
        .init           =       pfifo_fast_init,
        .reset          =       pfifo_fast_reset,
        .dump           =       pfifo_fast_dump,
        .owner          =       THIS_MODULE,
};

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
        void *p;
        struct Qdisc *sch;
        unsigned int size;
        int err = -ENOBUFS;

        /* ensure that the Qdisc and the private data are 32-byte aligned */
        size = QDISC_ALIGN(sizeof(*sch));
        size += ops->priv_size + (QDISC_ALIGNTO - 1);

        p = kmalloc(size, GFP_KERNEL);
        if (!p)
                goto errout;
        memset(p, 0, size);
        sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
        sch->padded = (char *) sch - (char *) p;
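        /* At this point p is the start of the raw allocation and sch is
         * the aligned Qdisc inside it; ->padded records the offset so
         * __qdisc_destroy() can recover the original pointer for kfree().
         */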

        INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        dev_hold(dev);
        sch->stats_lock = &dev->queue_lock;
        atomic_set(&sch->refcnt, 1);

        return sch;
errout:
        return ERR_PTR(err);
}

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
        struct Qdisc *sch;

        sch = qdisc_alloc(dev, ops);
        if (IS_ERR(sch))
                goto errout;

        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;

        qdisc_destroy(sch);
errout:
        return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (ops->reset)
                ops->reset(qdisc);
}

/* This is the RCU callback function to clean up a qdisc when there
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
        struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
        struct Qdisc_ops *ops = qdisc->ops;

#ifdef CONFIG_NET_ESTIMATOR
        gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
        write_lock(&qdisc_tree_lock);
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);
        write_unlock(&qdisc_tree_lock);
        module_put(ops->owner);

        dev_put(qdisc->dev);
        kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
        struct list_head cql = LIST_HEAD_INIT(cql);
        struct Qdisc *cq, *q, *n;

        if (qdisc->flags & TCQ_F_BUILTIN ||
                !atomic_dec_and_test(&qdisc->refcnt))
                return;

        if (!list_empty(&qdisc->list)) {
                if (qdisc->ops->cl_ops == NULL)
                        list_del(&qdisc->list);
                else
                        list_move(&qdisc->list, &cql);
        }

        /* unlink inner qdiscs from dev->qdisc_list immediately */
        list_for_each_entry(cq, &cql, list)
                list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
                        if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
                                if (q->ops->cl_ops == NULL)
                                        list_del_init(&q->list);
                                else
                                        list_move_tail(&q->list, &cql);
                        }
        list_for_each_entry_safe(cq, n, &cql, list)
                list_del_init(&cq->list);

        call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
        /* If no queueing discipline is attached to the device, create
           a default one: pfifo_fast for devices that need queueing,
           and noqueue_qdisc for virtual interfaces.
         */

        if (dev->qdisc_sleeping == &noop_qdisc) {
                struct Qdisc *qdisc;
                if (dev->tx_queue_len) {
                        qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
                        if (qdisc == NULL) {
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }
                        write_lock_bh(&qdisc_tree_lock);
                        list_add_tail(&qdisc->list, &dev->qdisc_list);
                        write_unlock_bh(&qdisc_tree_lock);
                } else {
                        qdisc = &noqueue_qdisc;
                }
                write_lock_bh(&qdisc_tree_lock);
                dev->qdisc_sleeping = qdisc;
                write_unlock_bh(&qdisc_tree_lock);
        }

        if (!netif_carrier_ok(dev))
                /* Delay activation until next carrier-on event */
                return;

        spin_lock_bh(&dev->queue_lock);
        rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
        if (dev->qdisc != &noqueue_qdisc) {
                dev->trans_start = jiffies;
                dev_watchdog_up(dev);
        }
        spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
        struct Qdisc *qdisc;

        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc;
        dev->qdisc = &noop_qdisc;

        qdisc_reset(qdisc);

        spin_unlock_bh(&dev->queue_lock);

        dev_watchdog_down(dev);

        /* Wait for outstanding dev_queue_xmit calls. */
        synchronize_rcu();

        /* Wait for outstanding qdisc_run calls. */
        while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
                yield();

        if (dev->gso_skb) {
                kfree_skb(dev->gso_skb);
                dev->gso_skb = NULL;
        }
}

void dev_init_scheduler(struct net_device *dev)
{
        qdisc_lock_tree(dev);
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        INIT_LIST_HEAD(&dev->qdisc_list);
        qdisc_unlock_tree(dev);

        dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
        struct Qdisc *qdisc;

        qdisc_lock_tree(dev);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
        if ((qdisc = dev->qdisc_ingress) != NULL) {
                dev->qdisc_ingress = NULL;
                qdisc_destroy(qdisc);
        }
#endif
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
        qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_alloc);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);