/*
 * net/sched/sch_generic.c      Generic packet scheduler routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   Modifications to data participating in scheduling must additionally
   be protected with the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top-level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock(qdisc_tree_lock),
     and this lock is used only in process context.
   - updates to the tree are made only under the rtnl semaphore,
     hence this lock may be taken without disabling local BHs.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
DEFINE_RWLOCK(qdisc_tree_lock);

void qdisc_lock_tree(struct net_device *dev)
{
        write_lock(&qdisc_tree_lock);
        spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
        spin_unlock_bh(&dev->queue_lock);
        write_unlock(&qdisc_tree_lock);
}
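
/*
 * Example sketch (kept under #if 0, never compiled): how the locking rules
 * above are typically honoured.  Read-only tree walks take
 * read_lock(&qdisc_tree_lock) in process context, while anything that also
 * touches scheduling data goes through qdisc_lock_tree(), which grabs
 * qdisc_tree_lock before dev->queue_lock as required.  The helper name
 * example_dump_qdiscs() is made up for illustration.
 */
#if 0
static void example_dump_qdiscs(struct net_device *dev)
{
        struct Qdisc *q;

        read_lock(&qdisc_tree_lock);            /* walk only, process context */
        list_for_each_entry(q, &dev->qdisc_list, list)
                printk(KERN_DEBUG "%s: qdisc %s handle %x\n",
                       dev->name, q->ops->id, q->handle);
        read_unlock(&qdisc_tree_lock);

        qdisc_lock_tree(dev);                   /* tree lock, then queue lock */
        /* ... modify dev->qdisc or scheduling data here ... */
        qdisc_unlock_tree(dev);
}
#endif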

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is held, the other must be free.
 */
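
/*
 * Example sketch (kept under #if 0, never compiled): roughly the pattern
 * used by dev_queue_xmit() to honour this split.  The queue and the
 * dev->qdisc pointer are touched only under dev->queue_lock; the driver's
 * transmit routine is entered under netif_tx_lock inside qdisc_restart(),
 * which drops queue_lock around the actual transmit call.
 * example_xmit_path() is a made-up name.
 */
#if 0
static int example_xmit_path(struct sk_buff *skb, struct net_device *dev)
{
        struct Qdisc *q;
        int rc;

        spin_lock_bh(&dev->queue_lock);
        q = dev->qdisc;                 /* pointer protected by queue_lock */
        rc = q->enqueue(skb, q);        /* queue access under queue_lock   */
        qdisc_run(dev);                 /* drops queue_lock around xmit    */
        spin_unlock_bh(&dev->queue_lock);
        return rc;
}
#endif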


/* Kick device.
   Note that this procedure can be called by a watchdog timer, so
   we do not check the dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled.
            <0  - queue is not empty. Device is throttled if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
        struct Qdisc *q = dev->qdisc;
        struct sk_buff *skb;

        /* Dequeue packet */
        if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
                unsigned nolock = (dev->features & NETIF_F_LLTX);

                dev->gso_skb = NULL;

                /*
                 * When the driver has LLTX set it does its own locking
                 * in start_xmit. No need to add additional overhead by
                 * locking again. These checks are worth it because
                 * even uncongested locks can be quite expensive.
                 * The driver can do a trylock, as we do here; on lock
                 * contention it should return NETDEV_TX_LOCKED and the
                 * packet will be requeued.
                 */
                if (!nolock) {
                        if (!netif_tx_trylock(dev)) {
                        collision:
                                /* So, someone grabbed the driver. */

                                /* It may be a transient configuration error,
                                   when hard_start_xmit() recurses. We detect
                                   it by checking the xmit owner and drop the
                                   packet when a dead loop is detected.
                                 */
                                if (dev->xmit_lock_owner == smp_processor_id()) {
                                        kfree_skb(skb);
                                        if (net_ratelimit())
                                                printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
                                        return -1;
                                }
                                __get_cpu_var(netdev_rx_stat).cpu_collision++;
                                goto requeue;
                        }
                }

                {
                        /* And release the queue */
                        spin_unlock(&dev->queue_lock);

                        if (!netif_queue_stopped(dev)) {
                                int ret;

                                ret = dev_hard_start_xmit(skb, dev);
                                if (ret == NETDEV_TX_OK) {
                                        if (!nolock) {
                                                netif_tx_unlock(dev);
                                        }
                                        spin_lock(&dev->queue_lock);
                                        return -1;
                                }
                                if (ret == NETDEV_TX_LOCKED && nolock) {
                                        spin_lock(&dev->queue_lock);
                                        goto collision;
                                }
                        }

                        /* NETDEV_TX_BUSY - we need to requeue */
                        /* Release the driver */
                        if (!nolock) {
                                netif_tx_unlock(dev);
                        }
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
                }

                /* Device kicked us out :(
                   This is possible in the following cases:

                   0. driver is locked
                   1. fastroute is enabled
                   2. device cannot determine busy state
                      before start of transmission (f.e. dialout)
                   3. device is buggy (ppp)
                 */

requeue:
                if (skb->next)
                        dev->gso_skb = skb;
                else
                        q->ops->requeue(skb, q);
                netif_schedule(dev);
                return 1;
        }
        BUG_ON((int) q->q.qlen < 0);
        return q->q.qlen;
}
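
/*
 * Example sketch (kept under #if 0, never compiled): what the LLTX contract
 * described in qdisc_restart() looks like from the driver side.  A driver
 * that sets NETIF_F_LLTX does its own locking in hard_start_xmit() and, on
 * contention, returns NETDEV_TX_LOCKED so the packet gets requeued above.
 * struct example_priv and example_lltx_start_xmit() are made-up names.
 */
#if 0
struct example_priv {
        spinlock_t tx_lock;
};

static int example_lltx_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct example_priv *priv = netdev_priv(dev);

        if (!spin_trylock(&priv->tx_lock))
                return NETDEV_TX_LOCKED;        /* contended: requeue above */

        /* ... hand skb to the hardware here ... */

        spin_unlock(&priv->tx_lock);
        return NETDEV_TX_OK;
}
#endif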

void __qdisc_run(struct net_device *dev)
{
        if (unlikely(dev->qdisc == &noop_qdisc))
                goto out;

        while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
                /* NOTHING */;

out:
        clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

static void dev_watchdog(unsigned long arg)
{
        struct net_device *dev = (struct net_device *)arg;

        netif_tx_lock(dev);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
                    netif_carrier_ok(dev)) {
                        if (netif_queue_stopped(dev) &&
                            time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

                                printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
                                       dev->name);
                                dev->tx_timeout(dev);
                        }
                        if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                                dev_hold(dev);
                }
        }
        netif_tx_unlock(dev);

        dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
        init_timer(&dev->watchdog_timer);
        dev->watchdog_timer.data = (unsigned long)dev;
        dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
        if (dev->tx_timeout) {
                if (dev->watchdog_timeo <= 0)
                        dev->watchdog_timeo = 5*HZ;
                if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                        dev_hold(dev);
        }
}
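
/*
 * Example sketch (kept under #if 0, never compiled): the watchdog only runs
 * for drivers that provide a tx_timeout handler; if watchdog_timeo is left
 * at zero, __netdev_watchdog_up() falls back to 5*HZ.  A driver would
 * normally set both before register_netdev().  example_tx_timeout() and
 * example_setup_watchdog() are made-up names.
 */
#if 0
static void example_tx_timeout(struct net_device *dev)
{
        /* ... reset the hardware, then wake the queue ... */
        netif_wake_queue(dev);
}

static void example_setup_watchdog(struct net_device *dev)
{
        dev->tx_timeout = example_tx_timeout;
        dev->watchdog_timeo = 2 * HZ;   /* fire after 2s of stalled TX */
}
#endif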

static void dev_watchdog_up(struct net_device *dev)
{
        __netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
        netif_tx_lock_bh(dev);
        if (del_timer(&dev->watchdog_timer))
                dev_put(dev);
        netif_tx_unlock_bh(dev);
}

void netif_carrier_on(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
        if (netif_running(dev))
                __netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
        kfree_skb(skb);
        return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
        return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        if (net_ratelimit())
                printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
                       skb->dev->name);
        kfree_skb(skb);
        return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
        .id             =       "noop",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

struct Qdisc noop_qdisc = {
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noop_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
        .id             =       "noqueue",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
        .enqueue        =       NULL,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noqueue_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3
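
/*
 * Example sketch (kept under #if 0, never compiled): how a packet's
 * priority selects one of the three bands.  Lower bands are dequeued
 * first, so e.g. TC_PRIO_INTERACTIVE (6) and TC_PRIO_CONTROL (7) map to
 * band 0, TC_PRIO_BESTEFFORT (0) to band 1, and TC_PRIO_BULK (2) to
 * band 2.  example_band_of() is a made-up helper.
 */
#if 0
static int example_band_of(struct sk_buff *skb)
{
        /* Same lookup as prio2list() below: mask to TC_PRIO_MAX, map to a band. */
        return prio2band[skb->priority & TC_PRIO_MAX];
}
#endif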

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
                                             struct Qdisc *qdisc)
{
        struct sk_buff_head *list = qdisc_priv(qdisc);
        return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        struct sk_buff_head *list = prio2list(skb, qdisc);

        if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
                qdisc->q.qlen++;
                return __qdisc_enqueue_tail(skb, qdisc, list);
        }

        return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
                if (!skb_queue_empty(list + prio)) {
                        qdisc->q.qlen--;
                        return __qdisc_dequeue_head(qdisc, list + prio);
                }
        }

        return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        qdisc->q.qlen++;
        return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                __qdisc_reset_queue(qdisc, list + prio);

        qdisc->qstats.backlog = 0;
        qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
        struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

        memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

rtattr_failure:
        return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                skb_queue_head_init(list + prio);

        return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
        .id             =       "pfifo_fast",
        .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
        .enqueue        =       pfifo_fast_enqueue,
        .dequeue        =       pfifo_fast_dequeue,
        .requeue        =       pfifo_fast_requeue,
        .init           =       pfifo_fast_init,
        .reset          =       pfifo_fast_reset,
        .dump           =       pfifo_fast_dump,
        .owner          =       THIS_MODULE,
};

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
        void *p;
        struct Qdisc *sch;
        unsigned int size;
        int err = -ENOBUFS;

        /* ensure that the Qdisc and the private data are 32-byte aligned */
        size = QDISC_ALIGN(sizeof(*sch));
        size += ops->priv_size + (QDISC_ALIGNTO - 1);

        p = kzalloc(size, GFP_KERNEL);
        if (!p)
                goto errout;
        sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
        sch->padded = (char *) sch - (char *) p;

        INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        dev_hold(dev);
        sch->stats_lock = &dev->queue_lock;
        atomic_set(&sch->refcnt, 1);

        return sch;
errout:
        return ERR_PTR(err);    /* err is already negative (-ENOBUFS) */
}
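
/*
 * Example sketch (kept under #if 0, never compiled): the alignment
 * arithmetic above in isolation.  QDISC_ALIGNTO is 32, so the allocation
 * is padded by up to 31 extra bytes, the Qdisc pointer is rounded up to
 * the next 32-byte boundary, and the offset is kept in sch->padded so
 * that __qdisc_destroy() can kfree() the original allocation.
 * example_alignment() is a made-up helper with a pretend pointer value.
 */
#if 0
static void example_alignment(void)
{
        void *p = (void *) 0x1008;      /* pretend kzalloc() returned this */
        struct Qdisc *sch;

        sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
        /* sch == 0x1020; the 0x18 (24-byte) offset is what sch->padded stores */
        printk(KERN_DEBUG "padded by %lu bytes\n",
               (unsigned long) ((char *) sch - (char *) p));
}
#endif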

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
        struct Qdisc *sch;

        sch = qdisc_alloc(dev, ops);
        if (IS_ERR(sch))
                goto errout;

        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;

        qdisc_destroy(sch);
errout:
        return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (ops->reset)
                ops->reset(qdisc);
}

/* This is the RCU callback function to clean up a qdisc when there
 * are no further references to it. */

static void __qdisc_destroy(struct rcu_head *head)
{
        struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
        kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (qdisc->flags & TCQ_F_BUILTIN ||
            !atomic_dec_and_test(&qdisc->refcnt))
                return;

        list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
        gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);

        module_put(ops->owner);
        dev_put(qdisc->dev);
        call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
        /* If no queueing discipline is attached to the device, create a
           default one, i.e. pfifo_fast for devices which need queueing
           and noqueue_qdisc for virtual interfaces.
         */

        if (dev->qdisc_sleeping == &noop_qdisc) {
                struct Qdisc *qdisc;
                if (dev->tx_queue_len) {
                        qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
                        if (qdisc == NULL) {
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }
                        write_lock(&qdisc_tree_lock);
                        list_add_tail(&qdisc->list, &dev->qdisc_list);
                        write_unlock(&qdisc_tree_lock);
                } else {
                        qdisc = &noqueue_qdisc;
                }
                write_lock(&qdisc_tree_lock);
                dev->qdisc_sleeping = qdisc;
                write_unlock(&qdisc_tree_lock);
        }

        if (!netif_carrier_ok(dev))
                /* Delay activation until next carrier-on event */
                return;

        spin_lock_bh(&dev->queue_lock);
        rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
        if (dev->qdisc != &noqueue_qdisc) {
                dev->trans_start = jiffies;
                dev_watchdog_up(dev);
        }
        spin_unlock_bh(&dev->queue_lock);
}
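
/*
 * Example sketch (kept under #if 0, never compiled): the default qdisc
 * chosen above is driven by dev->tx_queue_len.  A normal Ethernet device
 * keeps its nonzero tx_queue_len and gets pfifo_fast, while a virtual
 * interface that zeroes tx_queue_len in its setup routine ends up with
 * noqueue_qdisc once dev_activate() runs.  example_virt_setup() is a
 * made-up setup callback for alloc_netdev().
 */
#if 0
static void example_virt_setup(struct net_device *dev)
{
        ether_setup(dev);
        dev->tx_queue_len = 0;  /* dev_activate() will pick noqueue_qdisc */
}
#endif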

void dev_deactivate(struct net_device *dev)
{
        struct Qdisc *qdisc;

        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc;
        dev->qdisc = &noop_qdisc;

        qdisc_reset(qdisc);

        spin_unlock_bh(&dev->queue_lock);

        dev_watchdog_down(dev);

        /* Wait for outstanding dev_queue_xmit calls. */
        synchronize_rcu();

        /* Wait for outstanding qdisc_run calls. */
        while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
                yield();

        if (dev->gso_skb) {
                kfree_skb(dev->gso_skb);
                dev->gso_skb = NULL;
        }
}

void dev_init_scheduler(struct net_device *dev)
{
        qdisc_lock_tree(dev);
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        INIT_LIST_HEAD(&dev->qdisc_list);
        qdisc_unlock_tree(dev);

        dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
        struct Qdisc *qdisc;

        qdisc_lock_tree(dev);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
        if ((qdisc = dev->qdisc_ingress) != NULL) {
                dev->qdisc_ingress = NULL;
                qdisc_destroy(qdisc);
        }
#endif
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
        qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_alloc);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);