/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *		- Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   However, modifications to the data participating in scheduling
   must additionally be protected by the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock(qdisc_tree_lock)
     and this lock is used only in process context.
   - updates to the tree are made only under the rtnl semaphore,
     hence the write lock may be taken without disabling local BHs.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
   (An illustrative sketch of these rules follows the next comment.)
 */
rwlock_t qdisc_tree_lock = RW_LOCK_UNLOCKED;

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   dev->xmit_lock serializes accesses to the device driver.

   dev->queue_lock and dev->xmit_lock are mutually exclusive:
   if one is grabbed, the other must be free.
 */

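/*
 * Illustrative sketch of the locking rules documented above.  This is an
 * editor-added example, not code from the original file: the helpers
 * example_walk_qdiscs() and example_link_qdisc() are hypothetical and
 * exist only to show the ordering (read_lock for process-context walks,
 * qdisc_tree_lock taken before dev->queue_lock for updates made under
 * the rtnl semaphore).
 */
static void example_walk_qdiscs(struct net_device *dev)
{
	struct Qdisc *q;

	/* Process context: a tree walk only needs the read side. */
	read_lock(&qdisc_tree_lock);
	for (q = dev->qdisc_list; q != NULL; q = q->next)
		printk(KERN_DEBUG "%s: qdisc %s\n", dev->name, q->ops->id);
	read_unlock(&qdisc_tree_lock);
}

static void example_link_qdisc(struct net_device *dev, struct Qdisc *q)
{
	/* Update path (caller holds rtnl): tree lock BEFORE queue lock. */
	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
	q->next = dev->qdisc_list;
	dev->qdisc_list = q;
	spin_unlock_bh(&dev->queue_lock);
	write_unlock(&qdisc_tree_lock);
}
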
/* Kick the device.
   Note that this procedure can be called by a watchdog timer, so
   we do not check the dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled.
            <0  - queue is not empty. The device is throttled if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if ((skb = q->dequeue(q)) != NULL) {
		if (spin_trylock(&dev->xmit_lock)) {
			/* Remember that the driver is grabbed by us. */
			dev->xmit_lock_owner = smp_processor_id();

			/* And release queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				if (netdev_nit)
					dev_queue_xmit_nit(skb, dev);

				if (dev->hard_start_xmit(skb, dev) == 0) {
					dev->xmit_lock_owner = -1;
					spin_unlock(&dev->xmit_lock);

					spin_lock(&dev->queue_lock);
					return -1;
				}
			}

			/* Release the driver */
			dev->xmit_lock_owner = -1;
			spin_unlock(&dev->xmit_lock);
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		} else {
			/* So, someone grabbed the driver. */

			/* It may be a transient configuration error,
			   when hard_start_xmit() recurses.  We detect
			   it by checking the xmit owner and drop the
			   packet when a dead loop is detected.
			 */
			if (dev->xmit_lock_owner == smp_processor_id()) {
				kfree_skb(skb);
				if (net_ratelimit())
					printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
				return -1;
			}
			__get_cpu_var(netdev_rx_stat).cpu_collision++;
		}

		/* The device kicked us out :(
		   This is possible in the following cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (f.e. dialout)
		   3. device is buggy (ppp)
		 */

		q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 1;
	}
	return q->q.qlen;
}

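/*
 * Editor-added example: how the return convention of qdisc_restart() is
 * consumed by its caller.  This sketch is patterned on the qdisc_run()
 * helper in include/net/pkt_sched.h and is shown here only for
 * illustration: keep kicking the device while the queue is non-empty and
 * the driver accepted the previous packet (return value < 0), and stop
 * as soon as the queue empties (0) or a packet had to be requeued (> 0).
 */
static inline void example_qdisc_run(struct net_device *dev)
{
	while (!netif_queue_stopped(dev) && qdisc_restart(dev) < 0)
		/* NOTHING */;
}
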
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	spin_lock(&dev->xmit_lock);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    (jiffies - dev->trans_start) > dev->watchdog_timeo) {
				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
				dev_hold(dev);
		}
	}
	spin_unlock(&dev->xmit_lock);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	spin_lock_bh(&dev->xmit_lock);
	__netdev_watchdog_up(dev);
	spin_unlock_bh(&dev->xmit_lock);
}

static void dev_watchdog_down(struct net_device *dev)
{
	spin_lock_bh(&dev->xmit_lock);
	if (del_timer(&dev->watchdog_timer))
		__dev_put(dev);
	spin_unlock_bh(&dev->xmit_lock);
}

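/*
 * Editor-added example: how a driver arms the transmit watchdog above.
 * The driver only supplies dev->tx_timeout and (optionally)
 * dev->watchdog_timeo; the timer itself is started and stopped by
 * dev_watchdog_up()/dev_watchdog_down() from dev_activate() and
 * dev_deactivate() below.  example_tx_timeout() and example_setup()
 * are hypothetical names used purely for illustration.
 */
static void example_tx_timeout(struct net_device *dev)
{
	/* Called from dev_watchdog() with dev->xmit_lock held. */
	printk(KERN_WARNING "%s: transmitter stalled, restarting queue\n",
	       dev->name);
	netif_wake_queue(dev);
}

static void example_setup(struct net_device *dev)
{
	dev->tx_timeout = example_tx_timeout;
	dev->watchdog_timeo = 2*HZ;	/* defaults to 5*HZ if left <= 0 */
}
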
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int
noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *
noop_dequeue(struct Qdisc * qdisc)
{
	return NULL;
}

static int
noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	NULL,
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
};

struct Qdisc_ops noqueue_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	NULL,
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
};

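/*
 * Editor-added example: the reason noqueue_qdisc leaves .enqueue NULL.
 * The transmit path treats a NULL enqueue hook as "this device has no
 * queue" and hands the skb straight to the driver instead of scheduling
 * it.  example_xmit_decision() is a hypothetical, heavily simplified
 * stand-in for that logic, not the real dev_queue_xmit().
 */
static int example_xmit_decision(struct sk_buff *skb, struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;

	if (q->enqueue)
		return q->enqueue(skb, q);	/* queue it; the scheduler kicks the device */

	return dev->hard_start_xmit(skb, dev);	/* noqueue: transmit directly */
}
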
static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

static int
pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	struct sk_buff_head *list;

	list = ((struct sk_buff_head*)qdisc->data) +
		prio2band[skb->priority&TC_PRIO_MAX];

	if (list->qlen < qdisc->dev->tx_queue_len) {
		__skb_queue_tail(list, skb);
		qdisc->q.qlen++;
		qdisc->stats.bytes += skb->len;
		qdisc->stats.packets++;
		return 0;
	}
	qdisc->stats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static struct sk_buff *
pfifo_fast_dequeue(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
	struct sk_buff *skb;

	for (prio = 0; prio < 3; prio++, list++) {
		skb = __skb_dequeue(list);
		if (skb) {
			qdisc->q.qlen--;
			return skb;
		}
	}
	return NULL;
}

static int
pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	struct sk_buff_head *list;

	list = ((struct sk_buff_head*)qdisc->data) +
		prio2band[skb->priority&TC_PRIO_MAX];

	__skb_queue_head(list, skb);
	qdisc->q.qlen++;
	return 0;
}

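/*
 * Editor-added example: the band selection used by pfifo_fast_enqueue()
 * and pfifo_fast_requeue() above.  Band 0 is dequeued first, so with the
 * prio2band table e.g. TC_PRIO_INTERACTIVE traffic (priority 6 -> band 0)
 * is sent ahead of best-effort (priority 0 -> band 1) and bulk traffic
 * (priority 2 -> band 2).  example_band_list() is a hypothetical helper
 * added only to spell the mapping out.
 */
static inline struct sk_buff_head *
example_band_list(struct Qdisc *qdisc, struct sk_buff *skb)
{
	return ((struct sk_buff_head *)qdisc->data) +
		prio2band[skb->priority & TC_PRIO_MAX];
}
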
static void
pfifo_fast_reset(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);

	for (prio = 0; prio < 3; prio++)
		skb_queue_purge(list+prio);
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	unsigned char *b = skb->tail;
	struct tc_prio_qopt opt;

	opt.bands = 3;
	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int i;
	struct sk_buff_head *list;

	list = ((struct sk_buff_head*)qdisc->data);

	for (i = 0; i < 3; i++)
		skb_queue_head_init(list+i);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.next		=	NULL,
	.cl_ops		=	NULL,
	.id		=	"pfifo_fast",
	.priv_size	=	3 * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
	struct Qdisc *sch;
	int size = sizeof(*sch) + ops->priv_size;

	sch = kmalloc(size, GFP_KERNEL);
	if (!sch)
		return NULL;
	memset(sch, 0, size);

	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	sch->stats_lock = &dev->queue_lock;
	atomic_set(&sch->refcnt, 1);
	/* enqueue is accessed locklessly - make sure it's visible
	 * before we set a netdevice's qdisc pointer to sch */
	smp_wmb();
	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	kfree(sch);
	return NULL;
}

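/*
 * Editor-added example: how a qdisc reaches the private area allocated
 * by qdisc_create_dflt().  The kmalloc() above grabs sizeof(struct Qdisc)
 * plus ops->priv_size in one block, and qdisc->data addresses the
 * trailing priv_size bytes; pfifo_fast above keeps its three sk_buff_head
 * lists there.  struct example_priv and example_priv() are hypothetical
 * and shown only to illustrate the layout.
 */
struct example_priv {
	struct sk_buff_head queue;
	u32 limit;
};

static inline struct example_priv *example_priv(struct Qdisc *sch)
{
	return (struct example_priv *)sch->data;
}
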
/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* This is the RCU callback that cleans up a qdisc once there are
 * no further references to it. */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	struct Qdisc_ops *ops = qdisc->ops;

#ifdef CONFIG_NET_ESTIMATOR
	qdisc_kill_estimator(&qdisc->stats);
#endif
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);
	module_put(ops->owner);

	if (!(qdisc->flags&TCQ_F_BUILTIN))
		kfree(qdisc);
}

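/*
 * Editor-added example: why the free above is deferred with call_rcu().
 * Lockless readers such as the transmit path may still hold a reference
 * to an old dev->qdisc; waiting for an RCU grace period guarantees all
 * such readers have finished before the memory is released.
 * example_peek_qdisc() is a hypothetical reader, not code from this file.
 */
static void example_peek_qdisc(struct net_device *dev)
{
	struct Qdisc *q;

	rcu_read_lock();
	q = dev->qdisc;		/* may be replaced concurrently */
	printk(KERN_DEBUG "%s: current qdisc is %s\n", dev->name, q->ops->id);
	rcu_read_unlock();
}
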
/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct net_device *dev = qdisc->dev;

	if (!atomic_dec_and_test(&qdisc->refcnt))
		return;

	if (dev) {
		struct Qdisc *q, **qp;
		for (qp = &qdisc->dev->qdisc_list; (q = *qp) != NULL; qp = &q->next) {
			if (q == qdisc) {
				*qp = q->next;
				break;
			}
		}
	}

	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
	/* If no queueing discipline is attached to the device, create a
	   default one: pfifo_fast for devices that need queueing, and
	   noqueue_qdisc for virtual interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}

			write_lock(&qdisc_tree_lock);
			qdisc->next = dev->qdisc_list;
			dev->qdisc_list = qdisc;
			write_unlock(&qdisc_tree_lock);

		} else {
			qdisc = &noqueue_qdisc;
		}
		write_lock(&qdisc_tree_lock);
		dev->qdisc_sleeping = qdisc;
		write_unlock(&qdisc_tree_lock);
	}

	spin_lock_bh(&dev->queue_lock);
	if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

	while (test_bit(__LINK_STATE_SCHED, &dev->state))
		yield();

	spin_unlock_wait(&dev->xmit_lock);
}

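/*
 * Editor-added example: the pairing the two functions above expect from
 * their callers.  In this kernel generation dev_open() and dev_close()
 * are the real callers; example_restart_tx() is a hypothetical helper
 * shown only to illustrate the ordering a driver-level reconfiguration
 * would follow.
 */
static void example_restart_tx(struct net_device *dev)
{
	dev_deactivate(dev);	/* detach qdisc, wait for in-flight TX softirqs */
	/* ... reconfigure transmit resources here ... */
	dev_activate(dev);	/* reinstall qdisc_sleeping and re-arm the watchdog */
}
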
void dev_init_scheduler(struct net_device *dev)
{
	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
	dev->qdisc = &noop_qdisc;
	spin_unlock_bh(&dev->queue_lock);
	dev->qdisc_sleeping = &noop_qdisc;
	dev->qdisc_list = NULL;
	write_unlock(&qdisc_tree_lock);

	dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(dev->qdisc_list == NULL);
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	dev->qdisc_list = NULL;
	spin_unlock_bh(&dev->queue_lock);
	write_unlock(&qdisc_tree_lock);
}

EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_restart);
EXPORT_SYMBOL(qdisc_tree_lock);