/*
 * net/sched/sch_generic.c      Generic packet scheduler routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   Read access to the qdisc tree is serialized by this rwlock;
   however, modifications to data participating in scheduling
   must additionally be protected by the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock(qdisc_tree_lock),
     and this lock is used only in process context.
   - updates to the tree are made only under the rtnl semaphore,
     hence this lock may be taken without disabling local BHs.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
DEFINE_RWLOCK(qdisc_tree_lock);

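/* Take both locks in the documented order: qdisc_tree_lock first, then
 * dev->queue_lock with BHs disabled.  Used below by dev_init_scheduler()
 * and dev_shutdown(), and by the qdisc configuration code.
 */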
void qdisc_lock_tree(struct net_device *dev)
{
        write_lock(&qdisc_tree_lock);
        spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
        spin_unlock_bh(&dev->queue_lock);
        write_unlock(&qdisc_tree_lock);
}

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   dev->xmit_lock serializes accesses to the device driver.

   dev->queue_lock and dev->xmit_lock are mutually exclusive:
   if one is grabbed, the other must be free.
 */


/* Kick the device.
   Note that this procedure can be called by a watchdog timer, so
   we do not check the dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled (the packet was
                  requeued or the qdisc is holding traffic back).
            <0  - a packet was handed to the driver (or dropped); the
                  queue may not be empty, so the caller may try again.

   NOTE: Called under dev->queue_lock with locally disabled BHs.
*/

int qdisc_restart(struct net_device *dev)
{
        struct Qdisc *q = dev->qdisc;
        struct sk_buff *skb;

        /* Dequeue packet */
        if ((skb = q->dequeue(q)) != NULL) {
                unsigned nolock = (dev->features & NETIF_F_LLTX);
                /*
                 * When the driver has LLTX set it does its own locking
                 * in start_xmit. No need to add additional overhead by
                 * locking again. These checks are worth it because
                 * even uncontended locks can be quite expensive.
                 * The driver can do a trylock, as we do here, too; on
                 * lock contention it should return NETDEV_TX_LOCKED
                 * and the packet will be requeued.
                 */
                if (!nolock) {
                        if (!spin_trylock(&dev->xmit_lock)) {
                        collision:
                                /* So, someone grabbed the driver. */

                                /* It may be a transient configuration error
                                   when hard_start_xmit() recurses.  We detect
                                   it by checking the xmit owner and drop the
                                   packet when a dead loop is detected.
                                */
                                if (dev->xmit_lock_owner == smp_processor_id()) {
                                        kfree_skb(skb);
                                        if (net_ratelimit())
                                                printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
                                        return -1;
                                }
                                __get_cpu_var(netdev_rx_stat).cpu_collision++;
                                goto requeue;
                        }
                        /* Remember that the driver is grabbed by us. */
                        dev->xmit_lock_owner = smp_processor_id();
                }

                {
                        /* And release the queue lock. */
                        spin_unlock(&dev->queue_lock);

                        if (!netif_queue_stopped(dev)) {
                                int ret;
                                if (netdev_nit)
                                        dev_queue_xmit_nit(skb, dev);

                                ret = dev->hard_start_xmit(skb, dev);
                                if (ret == NETDEV_TX_OK) {
                                        if (!nolock) {
                                                dev->xmit_lock_owner = -1;
                                                spin_unlock(&dev->xmit_lock);
                                        }
                                        spin_lock(&dev->queue_lock);
                                        return -1;
                                }
                                if (ret == NETDEV_TX_LOCKED && nolock) {
                                        spin_lock(&dev->queue_lock);
                                        goto collision;
                                }
                        }

                        /* NETDEV_TX_BUSY - we need to requeue */
                        /* Release the driver */
                        if (!nolock) {
                                dev->xmit_lock_owner = -1;
                                spin_unlock(&dev->xmit_lock);
                        }
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
                }

                /* The device kicked us out :(
                   This is possible in the following cases:

                   0. the driver is locked
                   1. fastroute is enabled
                   2. the device cannot determine its busy state
                      before the start of transmission (e.g. dialout)
                   3. the device is buggy (ppp)
                 */

requeue:
                q->ops->requeue(skb, q);
                netif_schedule(dev);
                return 1;
        }
        BUG_ON((int) q->q.qlen < 0);
        return q->q.qlen;
}

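/* Per-device transmit watchdog.  Fires watchdog_timeo jiffies after it
 * was last armed; if the queue is stopped and nothing has been
 * transmitted since then, the driver's tx_timeout() handler is invoked.
 * The timer re-arms itself while the device is present, running and has
 * carrier; each pending timer holds a reference on the device.
 */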
static void dev_watchdog(unsigned long arg)
{
        struct net_device *dev = (struct net_device *)arg;

        spin_lock(&dev->xmit_lock);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
                    netif_carrier_ok(dev)) {
                        if (netif_queue_stopped(dev) &&
                            (jiffies - dev->trans_start) > dev->watchdog_timeo) {
                                printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name);
                                dev->tx_timeout(dev);
                        }
                        if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                                dev_hold(dev);
                }
        }
        spin_unlock(&dev->xmit_lock);

        dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
        init_timer(&dev->watchdog_timer);
        dev->watchdog_timer.data = (unsigned long)dev;
        dev->watchdog_timer.function = dev_watchdog;
}

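/* (Re)arm the transmit watchdog if the driver provides a tx_timeout()
 * handler.  A watchdog_timeo of zero or less defaults to 5*HZ, and a
 * device reference is taken whenever the timer was not already pending.
 */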
void __netdev_watchdog_up(struct net_device *dev)
{
        if (dev->tx_timeout) {
                if (dev->watchdog_timeo <= 0)
                        dev->watchdog_timeo = 5*HZ;
                if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                        dev_hold(dev);
        }
}

static void dev_watchdog_up(struct net_device *dev)
{
        spin_lock_bh(&dev->xmit_lock);
        __netdev_watchdog_up(dev);
        spin_unlock_bh(&dev->xmit_lock);
}

static void dev_watchdog_down(struct net_device *dev)
{
        spin_lock_bh(&dev->xmit_lock);
        if (del_timer(&dev->watchdog_timer))
                __dev_put(dev);
        spin_unlock_bh(&dev->xmit_lock);
}

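/* Record link-state transitions.  A change in carrier state fires a
 * linkwatch event so the rest of the stack is notified; when the
 * carrier comes back on a running device the transmit watchdog is
 * restarted as well.
 */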
void netif_carrier_on(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
        if (netif_running(dev))
                __netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

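/* noop: everything that is enqueued is dropped and reported as
 * congestion, and there is never anything to dequeue.  Used as the
 * scheduler while a device is down and whenever no real qdisc is
 * attached.
 */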
static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
        kfree_skb(skb);
        return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
        return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        if (net_ratelimit())
                printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
                       skb->dev->name);
        kfree_skb(skb);
        return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
        .id             =       "noop",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

struct Qdisc noop_qdisc = {
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noop_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
        .id             =       "noqueue",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
        .enqueue        =       NULL,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noqueue_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
};


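/* prio2band maps the 16 TC_PRIO_* values onto the three pfifo_fast
 * bands: band 0 is the highest-priority band (interactive and control
 * traffic), band 1 the default best-effort band and band 2 the bulk
 * band.  Lower-numbered bands are always drained first on dequeue.
 */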
static const u8 prio2band[TC_PRIO_MAX+1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   the generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
                                             struct Qdisc *qdisc)
{
        struct sk_buff_head *list = qdisc_priv(qdisc);
        return list + prio2band[skb->priority & TC_PRIO_MAX];
}

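/* Enqueue onto the band selected by the packet's priority.  The device's
 * tx_queue_len bounds each band individually; beyond that the packet is
 * dropped and accounted as a qdisc drop.
 */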
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        struct sk_buff_head *list = prio2list(skb, qdisc);

        if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
                qdisc->q.qlen++;
                return __qdisc_enqueue_tail(skb, qdisc, list);
        }

        return qdisc_drop(skb, qdisc);
}

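/* Dequeue strictly by band: band 0 is emptied before band 1 is looked
 * at, and so on.  Returns NULL when all three bands are empty.
 */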
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
                if (!skb_queue_empty(list + prio)) {
                        qdisc->q.qlen--;
                        return __qdisc_dequeue_head(qdisc, list + prio);
                }
        }

        return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        qdisc->q.qlen++;
        return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                __qdisc_reset_queue(qdisc, list + prio);

        qdisc->qstats.backlog = 0;
        qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
        struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

        memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

rtattr_failure:
        return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                skb_queue_head_init(list + prio);

        return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
        .id             =       "pfifo_fast",
        .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
        .enqueue        =       pfifo_fast_enqueue,
        .dequeue        =       pfifo_fast_dequeue,
        .requeue        =       pfifo_fast_requeue,
        .init           =       pfifo_fast_init,
        .reset          =       pfifo_fast_reset,
        .dump           =       pfifo_fast_dump,
        .owner          =       THIS_MODULE,
};

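/* Allocate and initialise a qdisc of the given type without calling its
 * init() hook.  The struct Qdisc and the per-type private area are
 * carved out of a single allocation, aligned to QDISC_ALIGNTO bytes,
 * with the padding recorded in sch->padded so the original pointer can
 * be recovered on free.  Returns an ERR_PTR() value on failure.
 */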
struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
        void *p;
        struct Qdisc *sch;
        unsigned int size;
        int err = -ENOBUFS;

        /* ensure that the Qdisc and the private data are 32-byte aligned */
        size = QDISC_ALIGN(sizeof(*sch));
        size += ops->priv_size + (QDISC_ALIGNTO - 1);

        p = kmalloc(size, GFP_KERNEL);
        if (!p)
                goto errout;
        memset(p, 0, size);
        sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
        sch->padded = (char *) sch - (char *) p;

        INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        dev_hold(dev);
        sch->stats_lock = &dev->queue_lock;
        atomic_set(&sch->refcnt, 1);

        return sch;
errout:
        /* err is already negative (-ENOBUFS); do not negate it again. */
        return ERR_PTR(err);
}

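/* Convenience wrapper around qdisc_alloc(): create a qdisc of the given
 * type with its default (NULL) configuration.  Returns NULL rather than
 * an ERR_PTR() value when allocation or init fails.
 */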
struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
        struct Qdisc *sch;

        sch = qdisc_alloc(dev, ops);
        if (IS_ERR(sch))
                goto errout;

        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;

        qdisc_destroy(sch);
errout:
        return NULL;
}

/* Under dev->queue_lock and BH! */

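/* Drop any packets the qdisc currently holds by invoking its reset()
 * hook, if it has one.
 */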
void qdisc_reset(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (ops->reset)
                ops->reset(qdisc);
}

/* This is the RCU callback function to clean up a qdisc when there
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
        struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
        kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

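/* Drop a reference to the qdisc.  The last reference (builtin qdiscs
 * excepted) unlinks it from the device's qdisc list, kills its rate
 * estimator, resets and destroys it, and finally frees the memory via
 * RCU once concurrent readers of dev->qdisc are done with it.
 */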
void qdisc_destroy(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (qdisc->flags & TCQ_F_BUILTIN ||
            !atomic_dec_and_test(&qdisc->refcnt))
                return;

        list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
        gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);

        module_put(ops->owner);
        dev_put(qdisc->dev);
        call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

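/* Bring the device's queueing online.  Called when the device is
 * brought up; if the carrier is not yet present, only the sleeping
 * qdisc is prepared and the switch to it is deferred until a later
 * carrier-on event (see the comment below).
 */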
void dev_activate(struct net_device *dev)
{
        /* No queueing discipline is attached to the device;
           create a default one, i.e. pfifo_fast for devices
           which need queueing and noqueue_qdisc for virtual
           interfaces.
         */

        if (dev->qdisc_sleeping == &noop_qdisc) {
                struct Qdisc *qdisc;
                if (dev->tx_queue_len) {
                        qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
                        if (qdisc == NULL) {
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }
                        write_lock(&qdisc_tree_lock);
                        list_add_tail(&qdisc->list, &dev->qdisc_list);
                        write_unlock(&qdisc_tree_lock);
                } else {
                        qdisc = &noqueue_qdisc;
                }
                write_lock(&qdisc_tree_lock);
                dev->qdisc_sleeping = qdisc;
                write_unlock(&qdisc_tree_lock);
        }

        if (!netif_carrier_ok(dev))
                /* Delay activation until next carrier-on event */
                return;

        spin_lock_bh(&dev->queue_lock);
        rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
        if (dev->qdisc != &noqueue_qdisc) {
                dev->trans_start = jiffies;
                dev_watchdog_up(dev);
        }
        spin_unlock_bh(&dev->queue_lock);
}

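/* Take the device's queueing offline: point dev->qdisc at noop_qdisc so
 * new packets are dropped, purge whatever was queued, stop the watchdog
 * and wait until no CPU is still inside the old qdisc or the driver's
 * transmit routine.
 */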
void dev_deactivate(struct net_device *dev)
{
        struct Qdisc *qdisc;

        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc;
        dev->qdisc = &noop_qdisc;

        qdisc_reset(qdisc);

        spin_unlock_bh(&dev->queue_lock);

        dev_watchdog_down(dev);

        while (test_bit(__LINK_STATE_SCHED, &dev->state))
                yield();

        spin_unlock_wait(&dev->xmit_lock);
}

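/* Give a freshly created device its initial scheduler state: the noop
 * qdisc is attached as both the active and the sleeping qdisc, the
 * qdisc list is initialised and the watchdog timer is prepared.
 */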
void dev_init_scheduler(struct net_device *dev)
{
        qdisc_lock_tree(dev);
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        INIT_LIST_HEAD(&dev->qdisc_list);
        qdisc_unlock_tree(dev);

        dev_watchdog_init(dev);
}

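/* Tear down all scheduler state for a device that is going away: both
 * the regular and (if configured) ingress qdiscs are released and
 * replaced with the noop qdisc.
 */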
void dev_shutdown(struct net_device *dev)
{
        struct Qdisc *qdisc;

        qdisc_lock_tree(dev);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
        if ((qdisc = dev->qdisc_ingress) != NULL) {
                dev->qdisc_ingress = NULL;
                qdisc_destroy(qdisc);
        }
#endif
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
        qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_alloc);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_restart);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);