/*
 * net/sched/sch_generic.c      Generic packet scheduler routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   Modifications to data that participates in scheduling must
   additionally be protected by the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock(qdisc_tree_lock),
     and this lock is used only in process context.
   - updates to the tree are made only under the rtnl semaphore,
     hence this lock may be taken without disabling local BH.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
rwlock_t qdisc_tree_lock = RW_LOCK_UNLOCKED;

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   dev->xmit_lock serializes accesses to the device driver.

   dev->queue_lock and dev->xmit_lock are mutually exclusive:
   if one is held, the other must be free.
 */

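/* Illustrative sketch (not part of the original flow): a configuration
 * path that obeys the ordering rule above nests the locks the way
 * dev_init_scheduler() and dev_shutdown() below do:
 *
 *      write_lock(&qdisc_tree_lock);
 *      spin_lock_bh(&dev->queue_lock);
 *      ... modify dev->qdisc / dev->qdisc_list ...
 *      spin_unlock_bh(&dev->queue_lock);
 *      write_unlock(&qdisc_tree_lock);
 *
 * i.e. qdisc_tree_lock is always the outer lock.
 */
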
/* Kick device.
   Note that this procedure can be called by a watchdog timer, so
   we do not check the dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled.
            <0  - queue is not empty. Device is throttled, if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

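/* Illustrative only -- the loop that actually drives qdisc_restart()
 * lives outside this file.  Given the return convention above, a
 * hypothetical caller holding dev->queue_lock would look roughly like:
 *
 *      while (!netif_queue_stopped(dev) && qdisc_restart(dev) < 0)
 *              ;       // keep going while packets are sent successfully
 */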
int qdisc_restart(struct net_device *dev)
{
        struct Qdisc *q = dev->qdisc;
        struct sk_buff *skb;

        /* Dequeue packet */
        if ((skb = q->dequeue(q)) != NULL) {
                if (spin_trylock(&dev->xmit_lock)) {
                        /* Remember that the driver is grabbed by us. */
                        dev->xmit_lock_owner = smp_processor_id();

                        /* And release queue */
                        spin_unlock(&dev->queue_lock);

                        if (!netif_queue_stopped(dev)) {
                                if (netdev_nit)
                                        dev_queue_xmit_nit(skb, dev);

                                if (dev->hard_start_xmit(skb, dev) == 0) {
                                        dev->xmit_lock_owner = -1;
                                        spin_unlock(&dev->xmit_lock);

                                        spin_lock(&dev->queue_lock);
                                        return -1;
                                }
                        }

                        /* Release the driver */
                        dev->xmit_lock_owner = -1;
                        spin_unlock(&dev->xmit_lock);
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
                } else {
                        /* So, someone grabbed the driver. */

                        /* It may be a transient configuration error,
                           when hard_start_xmit() recurses.  We detect
                           it by checking the xmit owner and drop the
                           packet when a dead loop is detected.
                         */
                        if (dev->xmit_lock_owner == smp_processor_id()) {
                                kfree_skb(skb);
                                if (net_ratelimit())
                                        printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
                                return -1;
                        }
                        __get_cpu_var(netdev_rx_stat).cpu_collision++;
                }

                /* Device kicked us out :(
                   This is possible in the following cases:

                   0. driver is locked
                   1. fastroute is enabled
                   2. device cannot determine busy state
                      before start of transmission (e.g. dialout)
                   3. device is buggy (ppp)
                 */

                q->ops->requeue(skb, q);
                netif_schedule(dev);
                return 1;
        }
        return q->q.qlen;
}

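/* Transmit watchdog.  Note the reference counting protocol: whenever the
 * timer is (re)armed the device is dev_hold()'d, and every watchdog run
 * ends with dev_put(), so the net_device stays referenced while a run is
 * pending (see also __netdev_watchdog_up() and dev_watchdog_down() below).
 */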
static void dev_watchdog(unsigned long arg)
{
        struct net_device *dev = (struct net_device *)arg;

        spin_lock(&dev->xmit_lock);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
                    netif_carrier_ok(dev)) {
                        if (netif_queue_stopped(dev) &&
                            (jiffies - dev->trans_start) > dev->watchdog_timeo) {
                                printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name);
                                dev->tx_timeout(dev);
                        }
                        if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                                dev_hold(dev);
                }
        }
        spin_unlock(&dev->xmit_lock);

        dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
        init_timer(&dev->watchdog_timer);
        dev->watchdog_timer.data = (unsigned long)dev;
        dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
        if (dev->tx_timeout) {
                if (dev->watchdog_timeo <= 0)
                        dev->watchdog_timeo = 5*HZ;
                if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                        dev_hold(dev);
        }
}

static void dev_watchdog_up(struct net_device *dev)
{
        spin_lock_bh(&dev->xmit_lock);
        __netdev_watchdog_up(dev);
        spin_unlock_bh(&dev->xmit_lock);
}

static void dev_watchdog_down(struct net_device *dev)
{
        spin_lock_bh(&dev->xmit_lock);
        if (del_timer(&dev->watchdog_timer))
                __dev_put(dev);
        spin_unlock_bh(&dev->xmit_lock);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int
noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
        kfree_skb(skb);
        return NET_XMIT_CN;
}

static struct sk_buff *
noop_dequeue(struct Qdisc * qdisc)
{
        return NULL;
}

static int
noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        if (net_ratelimit())
                printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name);
        kfree_skb(skb);
        return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
        .next           =       NULL,
        .cl_ops         =       NULL,
        .id             =       "noop",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

struct Qdisc noop_qdisc = {
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noop_qdisc_ops,
};

struct Qdisc_ops noqueue_qdisc_ops = {
        .next           =       NULL,
        .cl_ops         =       NULL,
        .id             =       "noqueue",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

struct Qdisc noqueue_qdisc = {
        .enqueue        =       NULL,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noqueue_qdisc_ops,
};
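
/* Note the difference between the two built-in qdiscs above: noop_qdisc
 * silently drops everything that is enqueued, while noqueue_qdisc has a
 * NULL ->enqueue.  In this kernel the core transmit path (dev_queue_xmit(),
 * outside this file) treats a NULL enqueue as "no queueing at all" and
 * hands packets straight to the driver, which is what dev_activate()
 * below installs for devices with tx_queue_len == 0.
 */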


static const u8 prio2band[TC_PRIO_MAX+1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };
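/* prio2band maps skb->priority (masked with TC_PRIO_MAX) to one of the
 * three pfifo_fast bands.  Band 0 is dequeued first (see
 * pfifo_fast_dequeue() below), so priorities 6 and 7 (interactive and
 * control traffic) get the best service, while bulk-type priorities are
 * relegated to band 2.
 */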

/* 3-band FIFO queue: old style, but should be a bit faster than
   the generic prio+fifo combination.
 */

static int
pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        struct sk_buff_head *list;

        list = ((struct sk_buff_head*)qdisc->data) +
                prio2band[skb->priority&TC_PRIO_MAX];

        if (list->qlen < qdisc->dev->tx_queue_len) {
                __skb_queue_tail(list, skb);
                qdisc->q.qlen++;
                qdisc->stats.bytes += skb->len;
                qdisc->stats.packets++;
                return 0;
        }
        qdisc->stats.drops++;
        kfree_skb(skb);
        return NET_XMIT_DROP;
}
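
/* Note that the dev->tx_queue_len limit above is applied per band, so a
 * pfifo_fast qdisc can hold up to three times tx_queue_len packets in
 * total; a full band tail-drops and the drop is accounted in stats.drops.
 */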

static struct sk_buff *
pfifo_fast_dequeue(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
        struct sk_buff *skb;

        for (prio = 0; prio < 3; prio++, list++) {
                skb = __skb_dequeue(list);
                if (skb) {
                        qdisc->q.qlen--;
                        return skb;
                }
        }
        return NULL;
}

static int
pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        struct sk_buff_head *list;

        list = ((struct sk_buff_head*)qdisc->data) +
                prio2band[skb->priority&TC_PRIO_MAX];

        __skb_queue_head(list, skb);
        qdisc->q.qlen++;
        return 0;
}
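
/* Requeue puts the skb back at the head of its band so it is the next
 * packet dequeued; unlike enqueue it does not touch the byte/packet
 * counters (the packet was already accounted when first enqueued) and it
 * is not bounded by tx_queue_len.
 */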

static void
pfifo_fast_reset(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);

        for (prio=0; prio < 3; prio++)
                skb_queue_purge(list+prio);
        qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
        unsigned char    *b = skb->tail;
        struct tc_prio_qopt opt;

        opt.bands = 3;
        memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}
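
/* For netlink dumps pfifo_fast describes itself with the same
 * tc_prio_qopt structure that the prio scheduler uses (3 bands plus the
 * priomap above), so userspace tools presumably render it the same way.
 */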

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
        int i;
        struct sk_buff_head *list;

        list = ((struct sk_buff_head*)qdisc->data);

        for (i=0; i<3; i++)
                skb_queue_head_init(list+i);

        return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
        .next           =       NULL,
        .cl_ops         =       NULL,
        .id             =       "pfifo_fast",
        .priv_size      =       3 * sizeof(struct sk_buff_head),
        .enqueue        =       pfifo_fast_enqueue,
        .dequeue        =       pfifo_fast_dequeue,
        .requeue        =       pfifo_fast_requeue,
        .init           =       pfifo_fast_init,
        .reset          =       pfifo_fast_reset,
        .dump           =       pfifo_fast_dump,
        .owner          =       THIS_MODULE,
};

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
        struct Qdisc *sch;
        int size = sizeof(*sch) + ops->priv_size;

        sch = kmalloc(size, GFP_KERNEL);
        if (!sch)
                return NULL;
        memset(sch, 0, size);

        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        sch->stats.lock = &dev->queue_lock;
        atomic_set(&sch->refcnt, 1);
        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;

        kfree(sch);
        return NULL;
}
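
/* The qdisc and its private area are a single allocation: the struct
 * Qdisc header is followed by ops->priv_size bytes which the scheduler
 * reaches through qdisc->data (see the pfifo_fast casts above).  A
 * hypothetical caller would attach the result roughly the way
 * dev_activate() does below:
 *
 *      struct Qdisc *q = qdisc_create_dflt(dev, &pfifo_fast_ops);
 *      if (q)
 *              dev->qdisc_sleeping = q;        // under the proper locks
 */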

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (ops->reset)
                ops->reset(qdisc);
}

/* Under dev->queue_lock and BH! */

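/* Drops one reference; the qdisc is only torn down when the last
 * reference goes away.  Built-in qdiscs (TCQ_F_BUILTIN, i.e. noop and
 * noqueue above) are static objects and are never kfree()'d.
 */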
void qdisc_destroy(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;
        struct net_device *dev;

        if (!atomic_dec_and_test(&qdisc->refcnt))
                return;

        dev = qdisc->dev;

        if (dev) {
                struct Qdisc *q, **qp;
                for (qp = &qdisc->dev->qdisc_list; (q=*qp) != NULL; qp = &q->next) {
                        if (q == qdisc) {
                                *qp = q->next;
                                break;
                        }
                }
        }
#ifdef CONFIG_NET_ESTIMATOR
        qdisc_kill_estimator(&qdisc->stats);
#endif
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);
        module_put(ops->owner);
        if (!(qdisc->flags&TCQ_F_BUILTIN))
                kfree(qdisc);
}


void dev_activate(struct net_device *dev)
{
        /* No queueing discipline is attached to the device;
           create a default one, i.e. pfifo_fast for devices
           which need queueing and noqueue_qdisc for virtual
           interfaces.
         */

        if (dev->qdisc_sleeping == &noop_qdisc) {
                struct Qdisc *qdisc;
                if (dev->tx_queue_len) {
                        qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
                        if (qdisc == NULL) {
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }

                        write_lock(&qdisc_tree_lock);
                        qdisc->next = dev->qdisc_list;
                        dev->qdisc_list = qdisc;
                        write_unlock(&qdisc_tree_lock);

                } else {
                        qdisc = &noqueue_qdisc;
                }
                write_lock(&qdisc_tree_lock);
                dev->qdisc_sleeping = qdisc;
                write_unlock(&qdisc_tree_lock);
        }

        spin_lock_bh(&dev->queue_lock);
        if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) {
                dev->trans_start = jiffies;
                dev_watchdog_up(dev);
        }
        spin_unlock_bh(&dev->queue_lock);
}

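/* Deactivation swaps in noop_qdisc (so new packets are dropped rather
 * than queued), resets the old qdisc, and then waits until no CPU is
 * still inside the transmit path for this device -- the loop on
 * __LINK_STATE_SCHED and the spin_unlock_wait() on dev->xmit_lock below
 * make sure of that before the caller proceeds with teardown.
 */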
void dev_deactivate(struct net_device *dev)
{
        struct Qdisc *qdisc;

        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc;
        dev->qdisc = &noop_qdisc;

        qdisc_reset(qdisc);

        spin_unlock_bh(&dev->queue_lock);

        dev_watchdog_down(dev);

        while (test_bit(__LINK_STATE_SCHED, &dev->state))
                yield();

        spin_unlock_wait(&dev->xmit_lock);
}

void dev_init_scheduler(struct net_device *dev)
{
        write_lock(&qdisc_tree_lock);
        spin_lock_bh(&dev->queue_lock);
        dev->qdisc = &noop_qdisc;
        spin_unlock_bh(&dev->queue_lock);
        dev->qdisc_sleeping = &noop_qdisc;
        dev->qdisc_list = NULL;
        write_unlock(&qdisc_tree_lock);

        dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
        struct Qdisc *qdisc;

        write_lock(&qdisc_tree_lock);
        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
        if ((qdisc = dev->qdisc_ingress) != NULL) {
                dev->qdisc_ingress = NULL;
                qdisc_destroy(qdisc);
        }
#endif
        BUG_TRAP(dev->qdisc_list == NULL);
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
        dev->qdisc_list = NULL;
        spin_unlock_bh(&dev->queue_lock);
        write_unlock(&qdisc_tree_lock);
}

EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_restart);
EXPORT_SYMBOL(qdisc_tree_lock);