VServer 1.9.2 (patch-2.6.8.1-vs1.9.2.diff)
[linux-2.6.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K 
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD;
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *              
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *
58  *              This program is free software; you can redistribute it and/or
59  *              modify it under the terms of the GNU General Public License
60  *              as published by the Free Software Foundation; either version
61  *              2 of the License, or (at your option) any later version.
62  */
63
64 #include <linux/config.h>
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <asm/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/sched.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/rtnetlink.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/arp.h>
100 #include <net/tcp.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #ifdef CONFIG_SYSCTL
104 #include <linux/sysctl.h>
105 #endif
106
107 #define IP_MAX_MTU      0xFFF0
108
109 #define RT_GC_TIMEOUT (300*HZ)
110
111 int ip_rt_min_delay             = 2 * HZ;
112 int ip_rt_max_delay             = 10 * HZ;
113 int ip_rt_max_size;
114 int ip_rt_gc_timeout            = RT_GC_TIMEOUT;
115 int ip_rt_gc_interval           = 60 * HZ;
116 int ip_rt_gc_min_interval       = HZ / 2;
117 int ip_rt_redirect_number       = 9;
118 int ip_rt_redirect_load         = HZ / 50;
119 int ip_rt_redirect_silence      = ((HZ / 50) << (9 + 1));
120 int ip_rt_error_cost            = HZ;
121 int ip_rt_error_burst           = 5 * HZ;
122 int ip_rt_gc_elasticity         = 8;
123 int ip_rt_mtu_expires           = 10 * 60 * HZ;
124 int ip_rt_min_pmtu              = 512 + 20 + 20;
125 int ip_rt_min_advmss            = 256;
126 int ip_rt_secret_interval       = 10 * 60 * HZ;
127 static unsigned long rt_deadline;
128
129 #define RTprint(a...)   printk(KERN_DEBUG a)
130
131 static struct timer_list rt_flush_timer;
132 static struct timer_list rt_periodic_timer;
133 static struct timer_list rt_secret_timer;
134
135 /*
136  *      Interface to generic destination cache.
137  */
138
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static void              ipv4_dst_destroy(struct dst_entry *dst);
141 static void              ipv4_dst_ifdown(struct dst_entry *dst, int how);
142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143 static void              ipv4_link_failure(struct sk_buff *skb);
144 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
145 static int rt_garbage_collect(void);
146
147
148 static struct dst_ops ipv4_dst_ops = {
149         .family =               AF_INET,
150         .protocol =             __constant_htons(ETH_P_IP),
151         .gc =                   rt_garbage_collect,
152         .check =                ipv4_dst_check,
153         .destroy =              ipv4_dst_destroy,
154         .ifdown =               ipv4_dst_ifdown,
155         .negative_advice =      ipv4_negative_advice,
156         .link_failure =         ipv4_link_failure,
157         .update_pmtu =          ip_rt_update_pmtu,
158         .entry_size =           sizeof(struct rtable),
159 };
160
161 #define ECN_OR_COST(class)      TC_PRIO_##class
162
163 __u8 ip_tos2prio[16] = {
164         TC_PRIO_BESTEFFORT,
165         ECN_OR_COST(FILLER),
166         TC_PRIO_BESTEFFORT,
167         ECN_OR_COST(BESTEFFORT),
168         TC_PRIO_BULK,
169         ECN_OR_COST(BULK),
170         TC_PRIO_BULK,
171         ECN_OR_COST(BULK),
172         TC_PRIO_INTERACTIVE,
173         ECN_OR_COST(INTERACTIVE),
174         TC_PRIO_INTERACTIVE,
175         ECN_OR_COST(INTERACTIVE),
176         TC_PRIO_INTERACTIVE_BULK,
177         ECN_OR_COST(INTERACTIVE_BULK),
178         TC_PRIO_INTERACTIVE_BULK,
179         ECN_OR_COST(INTERACTIVE_BULK)
180 };
181
182
183 /*
184  * Route cache.
185  */
186
187 /* The locking scheme is rather straightforward:
188  *
189  * 1) Read-Copy Update protects the buckets of the central route hash.
190  * 2) Only writers remove entries, and they hold the lock
191  *    as they look at rtable reference counts.
192  * 3) Only readers acquire references to rtable entries,
193  *    they do so with atomic increments and with the
194  *    lock held.
195  */
196
197 struct rt_hash_bucket {
198         struct rtable   *chain;
199         spinlock_t      lock;
200 } __attribute__((__aligned__(8)));
201
202 static struct rt_hash_bucket    *rt_hash_table;
203 static unsigned                 rt_hash_mask;
204 static int                      rt_hash_log;
205 static unsigned int             rt_hash_rnd;
206
207 struct rt_cache_stat *rt_cache_stat;
208
209 static int rt_intern_hash(unsigned hash, struct rtable *rth,
210                                 struct rtable **res);
211
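/*
 * Hash the (daddr, saddr, tos) triple of a flow into a bucket index.
 * rt_hash_rnd is a random key that is re-generated on every cache flush
 * (see rt_run_flush), which makes the bucket distribution hard to
 * predict from outside.
 */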
212 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
213 {
214         return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
215                 & rt_hash_mask);
216 }
217
218 #ifdef CONFIG_PROC_FS
219 struct rt_cache_iter_state {
220         int bucket;
221 };
222
223 static struct rtable *rt_cache_get_first(struct seq_file *seq)
224 {
225         struct rtable *r = NULL;
226         struct rt_cache_iter_state *st = seq->private;
227
228         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
229                 rcu_read_lock();
230                 r = rt_hash_table[st->bucket].chain;
231                 if (r)
232                         break;
233                 rcu_read_unlock();
234         }
235         return r;
236 }
237
238 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
239 {
240         struct rt_cache_iter_state *st = seq->private;
241
242         smp_read_barrier_depends();
243         r = r->u.rt_next;
244         while (!r) {
245                 rcu_read_unlock();
246                 if (--st->bucket < 0)
247                         break;
248                 rcu_read_lock();
249                 r = rt_hash_table[st->bucket].chain;
250         }
251         return r;
252 }
253
254 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
255 {
256         struct rtable *r = rt_cache_get_first(seq);
257
258         if (r)
259                 while (pos && (r = rt_cache_get_next(seq, r)))
260                         --pos;
261         return pos ? NULL : r;
262 }
263
264 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
265 {
266         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
267 }
268
269 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
270 {
271         struct rtable *r = NULL;
272
273         if (v == SEQ_START_TOKEN)
274                 r = rt_cache_get_first(seq);
275         else
276                 r = rt_cache_get_next(seq, v);
277         ++*pos;
278         return r;
279 }
280
281 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
282 {
283         if (v && v != SEQ_START_TOKEN)
284                 rcu_read_unlock();
285 }
286
287 static int rt_cache_seq_show(struct seq_file *seq, void *v)
288 {
289         if (v == SEQ_START_TOKEN)
290                 seq_printf(seq, "%-127s\n",
291                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
292                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
293                            "HHUptod\tSpecDst");
294         else {
295                 struct rtable *r = v;
296                 char temp[256];
297
298                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
299                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
300                         r->u.dst.dev ? r->u.dst.dev->name : "*",
301                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
302                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
303                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
304                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
305                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
306                         dst_metric(&r->u.dst, RTAX_WINDOW),
307                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
308                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
309                         r->fl.fl4_tos,
310                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
311                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
312                                        dev_queue_xmit) : 0,
313                         r->rt_spec_dst);
314                 seq_printf(seq, "%-127s\n", temp);
315         }
316         return 0;
317 }
318
319 static struct seq_operations rt_cache_seq_ops = {
320         .start  = rt_cache_seq_start,
321         .next   = rt_cache_seq_next,
322         .stop   = rt_cache_seq_stop,
323         .show   = rt_cache_seq_show,
324 };
325
326 static int rt_cache_seq_open(struct inode *inode, struct file *file)
327 {
328         struct seq_file *seq;
329         int rc = -ENOMEM;
330         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
331
332         if (!s)
333                 goto out;
334         rc = seq_open(file, &rt_cache_seq_ops);
335         if (rc)
336                 goto out_kfree;
337         seq          = file->private_data;
338         seq->private = s;
339         memset(s, 0, sizeof(*s));
340 out:
341         return rc;
342 out_kfree:
343         kfree(s);
344         goto out;
345 }
346
347 static struct file_operations rt_cache_seq_fops = {
348         .owner   = THIS_MODULE,
349         .open    = rt_cache_seq_open,
350         .read    = seq_read,
351         .llseek  = seq_lseek,
352         .release = seq_release_private,
353 };
354
355
356 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
357 {
358         int cpu;
359
360         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
361                 if (!cpu_possible(cpu))
362                         continue;
363                 *pos = cpu;
364                 return per_cpu_ptr(rt_cache_stat, cpu);
365         }
366         return NULL;
367 }
368
369 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
370 {
371         int cpu;
372
373         for (cpu = *pos + 1; cpu < NR_CPUS; ++cpu) {
374                 if (!cpu_possible(cpu))
375                         continue;
376                 *pos = cpu;
377                 return per_cpu_ptr(rt_cache_stat, cpu);
378         }
379         return NULL;
380         
381 }
382
383 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
384 {
385
386 }
387
388 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
389 {
390         struct rt_cache_stat *st = v;
391         
392         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
393                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
394                    atomic_read(&ipv4_dst_ops.entries),
395                    st->in_hit,
396                    st->in_slow_tot,
397                    st->in_slow_mc,
398                    st->in_no_route,
399                    st->in_brd,
400                    st->in_martian_dst,
401                    st->in_martian_src,
402
403                    st->out_hit,
404                    st->out_slow_tot,
405                    st->out_slow_mc, 
406
407                    st->gc_total,
408                    st->gc_ignored,
409                    st->gc_goal_miss,
410                    st->gc_dst_overflow,
411                    st->in_hlist_search,
412                    st->out_hlist_search
413                 );
414         return 0;
415 }
416
417 static struct seq_operations rt_cpu_seq_ops = {
418         .start  = rt_cpu_seq_start,
419         .next   = rt_cpu_seq_next,
420         .stop   = rt_cpu_seq_stop,
421         .show   = rt_cpu_seq_show,
422 };
423
424
425 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
426 {
427         return seq_open(file, &rt_cpu_seq_ops);
428 }
429
430 static struct file_operations rt_cpu_seq_fops = {
431         .owner   = THIS_MODULE,
432         .open    = rt_cpu_seq_open,
433         .read    = seq_read,
434         .llseek  = seq_lseek,
435         .release = seq_release,
436 };
437
438 #endif /* CONFIG_PROC_FS */
439   
440 static __inline__ void rt_free(struct rtable *rt)
441 {
442         call_rcu(&rt->u.dst.rcu_head, dst_rcu_free);
443 }
444
445 static __inline__ void rt_drop(struct rtable *rt)
446 {
447         ip_rt_put(rt);
448         call_rcu(&rt->u.dst.rcu_head, dst_rcu_free);
449 }
450
451 static __inline__ int rt_fast_clean(struct rtable *rth)
452 {
453         /* Kill broadcast/multicast entries very aggressively if they
454            collide in the hash table with more useful entries */
455         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
456                 rth->fl.iif && rth->u.rt_next;
457 }
458
459 static __inline__ int rt_valuable(struct rtable *rth)
460 {
461         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
462                 rth->u.dst.expires;
463 }
464
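/*
 * May this cache entry be reclaimed?  Referenced entries never may.
 * Entries whose hard expiry time has passed always may.  Otherwise an
 * entry survives while it is younger than tmo1 (unless it is a
 * colliding broadcast/multicast entry, see rt_fast_clean) or while it
 * is younger than tmo2 and "valuable" (see rt_valuable).
 */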
465 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
466 {
467         unsigned long age;
468         int ret = 0;
469
470         if (atomic_read(&rth->u.dst.__refcnt))
471                 goto out;
472
473         ret = 1;
474         if (rth->u.dst.expires &&
475             time_after_eq(jiffies, rth->u.dst.expires))
476                 goto out;
477
478         age = jiffies - rth->u.dst.lastuse;
479         ret = 0;
480         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
481             (age <= tmo2 && rt_valuable(rth)))
482                 goto out;
483         ret = 1;
484 out:    return ret;
485 }
486
487 /* Bits of score are:
488  * 31: very valuable
489  * 30: not quite useless
490  * 29..0: inverted age (more recently used entries score higher)
491  */
492 static inline u32 rt_score(struct rtable *rt)
493 {
494         u32 score = jiffies - rt->u.dst.lastuse;
495
496         score = ~score & ~(3<<30);
497
498         if (rt_valuable(rt))
499                 score |= (1<<31);
500
501         if (!rt->fl.iif ||
502             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
503                 score |= (1<<30);
504
505         return score;
506 }
507
508 /* This runs via a timer and thus is always in BH context. */
509 static void rt_check_expire(unsigned long dummy)
510 {
511         static int rover;
512         int i = rover, t;
513         struct rtable *rth, **rthp;
514         unsigned long now = jiffies;
515
516         for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
517              t -= ip_rt_gc_timeout) {
518                 unsigned long tmo = ip_rt_gc_timeout;
519
520                 i = (i + 1) & rt_hash_mask;
521                 rthp = &rt_hash_table[i].chain;
522
523                 spin_lock(&rt_hash_table[i].lock);
524                 while ((rth = *rthp) != NULL) {
525                         if (rth->u.dst.expires) {
526                                 /* Entry is expired even if it is in use */
527                                 if (time_before_eq(now, rth->u.dst.expires)) {
528                                         tmo >>= 1;
529                                         rthp = &rth->u.rt_next;
530                                         continue;
531                                 }
532                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
533                                 tmo >>= 1;
534                                 rthp = &rth->u.rt_next;
535                                 continue;
536                         }
537
538                         /* Cleanup aged off entries. */
539                         *rthp = rth->u.rt_next;
540                         rt_free(rth);
541                 }
542                 spin_unlock(&rt_hash_table[i].lock);
543
544                 /* Fallback loop breaker. */
545                 if (time_after(jiffies, now))
546                         break;
547         }
548         rover = i;
549         mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
550 }
551
552 /* This can run from both BH and non-BH contexts, the latter
553  * in the case of a forced flush event.
554  */
555 static void rt_run_flush(unsigned long dummy)
556 {
557         int i;
558         struct rtable *rth, *next;
559
560         rt_deadline = 0;
561
562         get_random_bytes(&rt_hash_rnd, 4);
563
564         for (i = rt_hash_mask; i >= 0; i--) {
565                 spin_lock_bh(&rt_hash_table[i].lock);
566                 rth = rt_hash_table[i].chain;
567                 if (rth)
568                         rt_hash_table[i].chain = NULL;
569                 spin_unlock_bh(&rt_hash_table[i].lock);
570
571                 for (; rth; rth = next) {
572                         next = rth->u.rt_next;
573                         rt_free(rth);
574                 }
575         }
576 }
577
578 static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
579
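/*
 * Schedule (or perform) a flush of the whole routing cache.  A negative
 * delay means "use ip_rt_min_delay"; a zero or negative resulting delay
 * flushes synchronously, otherwise rt_flush_timer is armed so that the
 * flush happens at most ip_rt_max_delay after the first request.
 */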
580 void rt_cache_flush(int delay)
581 {
582         unsigned long now = jiffies;
583         int user_mode = !in_softirq();
584
585         if (delay < 0)
586                 delay = ip_rt_min_delay;
587
588         spin_lock_bh(&rt_flush_lock);
589
590         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
591                 long tmo = (long)(rt_deadline - now);
592
593                 /* If flush timer is already running
594                    and flush request is not immediate (delay > 0):
595
596                    if the deadline has not been reached, prolong the timer to "delay",
597                    otherwise fire it at the deadline.
598                  */
599
600                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
601                         tmo = 0;
602                 
603                 if (delay > tmo)
604                         delay = tmo;
605         }
606
607         if (delay <= 0) {
608                 spin_unlock_bh(&rt_flush_lock);
609                 rt_run_flush(0);
610                 return;
611         }
612
613         if (rt_deadline == 0)
614                 rt_deadline = now + ip_rt_max_delay;
615
616         mod_timer(&rt_flush_timer, now+delay);
617         spin_unlock_bh(&rt_flush_lock);
618 }
619
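/*
 * Runs every ip_rt_secret_interval: flushing the cache also re-keys
 * rt_hash_rnd (see rt_run_flush), so the hash distribution changes
 * periodically.
 */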
620 static void rt_secret_rebuild(unsigned long dummy)
621 {
622         unsigned long now = jiffies;
623
624         rt_cache_flush(0);
625         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
626 }
627
628 /*
629    Short description of GC goals.
630
631    We want to build an algorithm which keeps the routing cache
632    at an equilibrium point, where the number of aged-off entries
633    stays approximately equal to the number of newly generated ones.
634
635    The current expiration strength is the variable "expire".
636    We try to adjust it dynamically, so that when the network
637    is idle "expire" is large enough to keep enough warm entries,
638    and when load increases it shrinks to limit the cache size.
639  */
640
641 static int rt_garbage_collect(void)
642 {
643         static unsigned long expire = RT_GC_TIMEOUT;
644         static unsigned long last_gc;
645         static int rover;
646         static int equilibrium;
647         struct rtable *rth, **rthp;
648         unsigned long now = jiffies;
649         int goal;
650
651         /*
652          * Garbage collection is pretty expensive,
653          * do not make it too frequently.
654          */
655
656         RT_CACHE_STAT_INC(gc_total);
657
658         if (now - last_gc < ip_rt_gc_min_interval &&
659             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
660                 RT_CACHE_STAT_INC(gc_ignored);
661                 goto out;
662         }
663
664         /* Calculate number of entries, which we want to expire now. */
665         goal = atomic_read(&ipv4_dst_ops.entries) -
666                 (ip_rt_gc_elasticity << rt_hash_log);
667         if (goal <= 0) {
668                 if (equilibrium < ipv4_dst_ops.gc_thresh)
669                         equilibrium = ipv4_dst_ops.gc_thresh;
670                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
671                 if (goal > 0) {
672                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
673                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
674                 }
675         } else {
676                 /* We are in dangerous area. Try to reduce cache really
677                  * aggressively.
678                  */
679                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
680                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
681         }
682
683         if (now - last_gc >= ip_rt_gc_min_interval)
684                 last_gc = now;
685
686         if (goal <= 0) {
687                 equilibrium += goal;
688                 goto work_done;
689         }
690
691         do {
692                 int i, k;
693
694                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
695                         unsigned long tmo = expire;
696
697                         k = (k + 1) & rt_hash_mask;
698                         rthp = &rt_hash_table[k].chain;
699                         spin_lock_bh(&rt_hash_table[k].lock);
700                         while ((rth = *rthp) != NULL) {
701                                 if (!rt_may_expire(rth, tmo, expire)) {
702                                         tmo >>= 1;
703                                         rthp = &rth->u.rt_next;
704                                         continue;
705                                 }
706                                 *rthp = rth->u.rt_next;
707                                 rt_free(rth);
708                                 goal--;
709                         }
710                         spin_unlock_bh(&rt_hash_table[k].lock);
711                         if (goal <= 0)
712                                 break;
713                 }
714                 rover = k;
715
716                 if (goal <= 0)
717                         goto work_done;
718
719                 /* Goal is not achieved. We stop the process if:
720
721                    - expire has been reduced to zero (otherwise it is halved),
722                    - the table is not full,
723                    - we are called from interrupt context.
724                    - the jiffies check is just a fallback/debug loop breaker;
725                      we will not spin here for a long time in any case.
726                  */
727
728                 RT_CACHE_STAT_INC(gc_goal_miss);
729
730                 if (expire == 0)
731                         break;
732
733                 expire >>= 1;
734 #if RT_CACHE_DEBUG >= 2
735                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
736                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
737 #endif
738
739                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
740                         goto out;
741         } while (!in_softirq() && time_before_eq(jiffies, now));
742
743         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
744                 goto out;
745         if (net_ratelimit())
746                 printk(KERN_WARNING "dst cache overflow\n");
747         RT_CACHE_STAT_INC(gc_dst_overflow);
748         return 1;
749
750 work_done:
751         expire += ip_rt_gc_min_interval;
752         if (expire > ip_rt_gc_timeout ||
753             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
754                 expire = ip_rt_gc_timeout;
755 #if RT_CACHE_DEBUG >= 2
756         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
757                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
758 #endif
759 out:    return 0;
760 }
761
762 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
763 {
764         return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
765                fl1->oif     == fl2->oif &&
766                fl1->iif     == fl2->iif;
767 }
768
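/*
 * Insert a new entry into the hash chain identified by "hash".  If an
 * entry with the same flow keys already exists, it is moved to the
 * front of the chain and reused instead, and the new route is dropped.
 * While walking the chain we also remember the least valuable
 * unreferenced entry so that an overlong chain can be trimmed, and an
 * ENOBUFS from arp_bind_neighbour triggers an emergency garbage
 * collection before retrying.
 */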
769 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
770 {
771         struct rtable   *rth, **rthp;
772         unsigned long   now;
773         struct rtable *cand, **candp;
774         u32             min_score;
775         int             chain_length;
776         int attempts = !in_softirq();
777
778 restart:
779         chain_length = 0;
780         min_score = ~(u32)0;
781         cand = NULL;
782         candp = NULL;
783         now = jiffies;
784
785         rthp = &rt_hash_table[hash].chain;
786
787         spin_lock_bh(&rt_hash_table[hash].lock);
788         while ((rth = *rthp) != NULL) {
789                 if (compare_keys(&rth->fl, &rt->fl)) {
790                         /* Put it first */
791                         *rthp = rth->u.rt_next;
792                         /*
793                          * Since lookup is lockfree, the deletion
794                          * must be visible to another weakly ordered CPU before
795                          * the insertion at the start of the hash chain.
796                          */
797                         smp_wmb();
798                         rth->u.rt_next = rt_hash_table[hash].chain;
799                         /*
800                          * Since lookup is lockfree, the update writes
801                          * must be ordered for consistency on SMP.
802                          */
803                         smp_wmb();
804                         rt_hash_table[hash].chain = rth;
805
806                         rth->u.dst.__use++;
807                         dst_hold(&rth->u.dst);
808                         rth->u.dst.lastuse = now;
809                         spin_unlock_bh(&rt_hash_table[hash].lock);
810
811                         rt_drop(rt);
812                         *rp = rth;
813                         return 0;
814                 }
815
816                 if (!atomic_read(&rth->u.dst.__refcnt)) {
817                         u32 score = rt_score(rth);
818
819                         if (score <= min_score) {
820                                 cand = rth;
821                                 candp = rthp;
822                                 min_score = score;
823                         }
824                 }
825
826                 chain_length++;
827
828                 rthp = &rth->u.rt_next;
829         }
830
831         if (cand) {
832                 /* ip_rt_gc_elasticity used to be the average chain
833                  * length; when it is exceeded, gc becomes really aggressive.
834                  *
835                  * The second limit is less certain. At the moment it allows
836                  * only 2 entries per bucket. We will see.
837                  */
838                 if (chain_length > ip_rt_gc_elasticity) {
839                         *candp = cand->u.rt_next;
840                         rt_free(cand);
841                 }
842         }
843
844         /* Try to bind route to arp only if it is output
845            route or unicast forwarding path.
846          */
847         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
848                 int err = arp_bind_neighbour(&rt->u.dst);
849                 if (err) {
850                         spin_unlock_bh(&rt_hash_table[hash].lock);
851
852                         if (err != -ENOBUFS) {
853                                 rt_drop(rt);
854                                 return err;
855                         }
856
857                         /* Neighbour tables are full and nothing
858                            can be released. Try to shrink route cache,
859                            it is most likely it holds some neighbour records.
860                          */
861                         if (attempts-- > 0) {
862                                 int saved_elasticity = ip_rt_gc_elasticity;
863                                 int saved_int = ip_rt_gc_min_interval;
864                                 ip_rt_gc_elasticity     = 1;
865                                 ip_rt_gc_min_interval   = 0;
866                                 rt_garbage_collect();
867                                 ip_rt_gc_min_interval   = saved_int;
868                                 ip_rt_gc_elasticity     = saved_elasticity;
869                                 goto restart;
870                         }
871
872                         if (net_ratelimit())
873                                 printk(KERN_WARNING "Neighbour table overflow.\n");
874                         rt_drop(rt);
875                         return -ENOBUFS;
876                 }
877         }
878
879         rt->u.rt_next = rt_hash_table[hash].chain;
880 #if RT_CACHE_DEBUG >= 2
881         if (rt->u.rt_next) {
882                 struct rtable *trt;
883                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
884                        NIPQUAD(rt->rt_dst));
885                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
886                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
887                 printk("\n");
888         }
889 #endif
890         rt_hash_table[hash].chain = rt;
891         spin_unlock_bh(&rt_hash_table[hash].lock);
892         *rp = rt;
893         return 0;
894 }
895
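/*
 * Attach the long-living inet_peer entry for the destination to this
 * route.  rt_peer_lock only serializes the assignment; a peer that
 * lost the race is released again.
 */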
896 void rt_bind_peer(struct rtable *rt, int create)
897 {
898         static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
899         struct inet_peer *peer;
900
901         peer = inet_getpeer(rt->rt_dst, create);
902
903         spin_lock_bh(&rt_peer_lock);
904         if (rt->peer == NULL) {
905                 rt->peer = peer;
906                 peer = NULL;
907         }
908         spin_unlock_bh(&rt_peer_lock);
909         if (peer)
910                 inet_putpeer(peer);
911 }
912
913 /*
914  * Peer allocation may fail only in serious out-of-memory conditions.  However
915  * we still can generate some output.
916  * Random ID selection looks a bit dangerous because we have no chance to
917  * select an ID that stays unique for a reasonable period of time.
918  * But a broken packet identifier may be better than no packet at all.
919  */
920 static void ip_select_fb_ident(struct iphdr *iph)
921 {
922         static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
923         static u32 ip_fallback_id;
924         u32 salt;
925
926         spin_lock_bh(&ip_fb_id_lock);
927         salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
928         iph->id = htons(salt & 0xFFFF);
929         ip_fallback_id = salt;
930         spin_unlock_bh(&ip_fb_id_lock);
931 }
932
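/*
 * Select the IP identification for an outgoing packet: use the counter
 * in the destination's inet_peer entry when one can be attached,
 * otherwise fall back to the keyed global generator above.
 */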
933 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
934 {
935         struct rtable *rt = (struct rtable *) dst;
936
937         if (rt) {
938                 if (rt->peer == NULL)
939                         rt_bind_peer(rt, 1);
940
941                 /* If a peer is attached to the destination, it is never detached,
942                    so we do not need to grab a lock to dereference it.
943                  */
944                 if (rt->peer) {
945                         iph->id = htons(inet_getid(rt->peer, more));
946                         return;
947                 }
948         } else
949                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
950
951         ip_select_fb_ident(iph);
952 }
953
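/*
 * Drop the caller's reference and unlink one specific entry from its
 * hash chain; the entry itself is freed via RCU.
 */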
954 static void rt_del(unsigned hash, struct rtable *rt)
955 {
956         struct rtable **rthp;
957
958         spin_lock_bh(&rt_hash_table[hash].lock);
959         ip_rt_put(rt);
960         for (rthp = &rt_hash_table[hash].chain; *rthp;
961              rthp = &(*rthp)->u.rt_next)
962                 if (*rthp == rt) {
963                         *rthp = rt->u.rt_next;
964                         rt_free(rt);
965                         break;
966                 }
967         spin_unlock_bh(&rt_hash_table[hash].lock);
968 }
969
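/*
 * Process an ICMP redirect: after sanity-checking the advised gateway,
 * every matching output cache entry is cloned with the new gateway,
 * marked RTCF_REDIRECTED, bound to a neighbour entry and re-inserted
 * in place of the old one.
 */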
970 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
971                     u32 saddr, u8 tos, struct net_device *dev)
972 {
973         int i, k;
974         struct in_device *in_dev = in_dev_get(dev);
975         struct rtable *rth, **rthp;
976         u32  skeys[2] = { saddr, 0 };
977         int  ikeys[2] = { dev->ifindex, 0 };
978
979         tos &= IPTOS_RT_MASK;
980
981         if (!in_dev)
982                 return;
983
984         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
985             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
986                 goto reject_redirect;
987
988         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
989                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
990                         goto reject_redirect;
991                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
992                         goto reject_redirect;
993         } else {
994                 if (inet_addr_type(new_gw) != RTN_UNICAST)
995                         goto reject_redirect;
996         }
997
998         for (i = 0; i < 2; i++) {
999                 for (k = 0; k < 2; k++) {
1000                         unsigned hash = rt_hash_code(daddr,
1001                                                      skeys[i] ^ (ikeys[k] << 5),
1002                                                      tos);
1003
1004                         rthp=&rt_hash_table[hash].chain;
1005
1006                         rcu_read_lock();
1007                         while ((rth = *rthp) != NULL) {
1008                                 struct rtable *rt;
1009
1010                                 smp_read_barrier_depends();
1011                                 if (rth->fl.fl4_dst != daddr ||
1012                                     rth->fl.fl4_src != skeys[i] ||
1013                                     rth->fl.fl4_tos != tos ||
1014                                     rth->fl.oif != ikeys[k] ||
1015                                     rth->fl.iif != 0) {
1016                                         rthp = &rth->u.rt_next;
1017                                         continue;
1018                                 }
1019
1020                                 if (rth->rt_dst != daddr ||
1021                                     rth->rt_src != saddr ||
1022                                     rth->u.dst.error ||
1023                                     rth->rt_gateway != old_gw ||
1024                                     rth->u.dst.dev != dev)
1025                                         break;
1026
1027                                 dst_hold(&rth->u.dst);
1028                                 rcu_read_unlock();
1029
1030                                 rt = dst_alloc(&ipv4_dst_ops);
1031                                 if (rt == NULL) {
1032                                         ip_rt_put(rth);
1033                                         in_dev_put(in_dev);
1034                                         return;
1035                                 }
1036
1037                                 /* Copy all the information. */
1038                                 *rt = *rth;
1039                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1040                                 rt->u.dst.__use         = 1;
1041                                 atomic_set(&rt->u.dst.__refcnt, 1);
1042                                 rt->u.dst.child         = NULL;
1043                                 if (rt->u.dst.dev)
1044                                         dev_hold(rt->u.dst.dev);
1045                                 if (rt->idev)
1046                                         in_dev_hold(rt->idev);
1047                                 rt->u.dst.obsolete      = 0;
1048                                 rt->u.dst.lastuse       = jiffies;
1049                                 rt->u.dst.path          = &rt->u.dst;
1050                                 rt->u.dst.neighbour     = NULL;
1051                                 rt->u.dst.hh            = NULL;
1052                                 rt->u.dst.xfrm          = NULL;
1053
1054                                 rt->rt_flags            |= RTCF_REDIRECTED;
1055
1056                                 /* Gateway is different ... */
1057                                 rt->rt_gateway          = new_gw;
1058
1059                                 /* Redirect received -> path was valid */
1060                                 dst_confirm(&rth->u.dst);
1061
1062                                 if (rt->peer)
1063                                         atomic_inc(&rt->peer->refcnt);
1064
1065                                 if (arp_bind_neighbour(&rt->u.dst) ||
1066                                     !(rt->u.dst.neighbour->nud_state &
1067                                             NUD_VALID)) {
1068                                         if (rt->u.dst.neighbour)
1069                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1070                                         ip_rt_put(rth);
1071                                         rt_drop(rt);
1072                                         goto do_next;
1073                                 }
1074
1075                                 rt_del(hash, rth);
1076                                 if (!rt_intern_hash(hash, rt, &rt))
1077                                         ip_rt_put(rt);
1078                                 goto do_next;
1079                         }
1080                         rcu_read_unlock();
1081                 do_next:
1082                         ;
1083                 }
1084         }
1085         in_dev_put(in_dev);
1086         return;
1087
1088 reject_redirect:
1089 #ifdef CONFIG_IP_ROUTE_VERBOSE
1090         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1091                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1092                         "%u.%u.%u.%u ignored.\n"
1093                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1094                         "tos %02x\n",
1095                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1096                        NIPQUAD(saddr), NIPQUAD(daddr), tos);
1097 #endif
1098         in_dev_put(in_dev);
1099 }
1100
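/*
 * Called when a socket gives negative advice about this route:
 * obsolete entries are simply released, while redirected or expiring
 * entries are removed from the hash so that the next lookup goes
 * through the slow path again.
 */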
1101 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1102 {
1103         struct rtable *rt = (struct rtable*)dst;
1104         struct dst_entry *ret = dst;
1105
1106         if (rt) {
1107                 if (dst->obsolete) {
1108                         ip_rt_put(rt);
1109                         ret = NULL;
1110                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1111                            rt->u.dst.expires) {
1112                         unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1113                                                      rt->fl.fl4_src ^
1114                                                         (rt->fl.oif << 5),
1115                                                      rt->fl.fl4_tos);
1116 #if RT_CACHE_DEBUG >= 1
1117                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1118                                           "%u.%u.%u.%u/%02x dropped\n",
1119                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1120 #endif
1121                         rt_del(hash, rt);
1122                         ret = NULL;
1123                 }
1124         }
1125         return ret;
1126 }
1127
1128 /*
1129  * Algorithm:
1130  *      1. The first ip_rt_redirect_number redirects are sent
1131  *         with exponential backoff, then we stop sending them at all,
1132  *         assuming that the host ignores our redirects.
1133  *      2. If we did not see packets requiring redirects
1134  *         during ip_rt_redirect_silence, we assume that the host
1135  *         forgot redirected route and start to send redirects again.
1136  *
1137  * This algorithm is much cheaper and more intelligent than dumb load limiting
1138  * in icmp.c.
1139  *
1140  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1141  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1142  */
1143
1144 void ip_rt_send_redirect(struct sk_buff *skb)
1145 {
1146         struct rtable *rt = (struct rtable*)skb->dst;
1147         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1148
1149         if (!in_dev)
1150                 return;
1151
1152         if (!IN_DEV_TX_REDIRECTS(in_dev))
1153                 goto out;
1154
1155         /* No redirected packets during ip_rt_redirect_silence;
1156          * reset the algorithm.
1157          */
1158         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1159                 rt->u.dst.rate_tokens = 0;
1160
1161         /* Too many ignored redirects; do not send anything.
1162          * Set u.dst.rate_last to the last seen redirected packet.
1163          */
1164         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1165                 rt->u.dst.rate_last = jiffies;
1166                 goto out;
1167         }
1168
1169         /* Check for load limit; set rate_last to the latest sent
1170          * redirect.
1171          */
1172         if (time_after(jiffies,
1173                        (rt->u.dst.rate_last +
1174                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1175                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1176                 rt->u.dst.rate_last = jiffies;
1177                 ++rt->u.dst.rate_tokens;
1178 #ifdef CONFIG_IP_ROUTE_VERBOSE
1179                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1180                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1181                     net_ratelimit())
1182                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1183                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1184                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1185                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1186 #endif
1187         }
1188 out:
1189         in_dev_put(in_dev);
1190 }
1191
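/*
 * Input handler for routes with a pending error: translate the dst
 * error code into an ICMP destination-unreachable message, subject to
 * a simple token bucket limiter (ip_rt_error_burst / ip_rt_error_cost),
 * then free the skb.
 */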
1192 static int ip_error(struct sk_buff *skb)
1193 {
1194         struct rtable *rt = (struct rtable*)skb->dst;
1195         unsigned long now;
1196         int code;
1197
1198         switch (rt->u.dst.error) {
1199                 case EINVAL:
1200                 default:
1201                         goto out;
1202                 case EHOSTUNREACH:
1203                         code = ICMP_HOST_UNREACH;
1204                         break;
1205                 case ENETUNREACH:
1206                         code = ICMP_NET_UNREACH;
1207                         break;
1208                 case EACCES:
1209                         code = ICMP_PKT_FILTERED;
1210                         break;
1211         }
1212
1213         now = jiffies;
1214         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1215         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1216                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1217         rt->u.dst.rate_last = now;
1218         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1219                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1220                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1221         }
1222
1223 out:    kfree_skb(skb);
1224         return 0;
1225 }
1226
1227 /*
1228  *      The last two values are not from the RFC but
1229  *      are needed for AMPRnet AX.25 paths.
1230  */
1231
1232 static unsigned short mtu_plateau[] =
1233 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1234
1235 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1236 {
1237         int i;
1238         
1239         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1240                 if (old_mtu > mtu_plateau[i])
1241                         return mtu_plateau[i];
1242         return 68;
1243 }
1244
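/*
 * Handle an ICMP "fragmentation needed" message: update the path MTU
 * of every matching output route in the cache and return the MTU that
 * should be reported to the transport layer.  A bogus or missing MTU
 * in the ICMP message is replaced by the next lower plateau value.
 */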
1245 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1246 {
1247         int i;
1248         unsigned short old_mtu = ntohs(iph->tot_len);
1249         struct rtable *rth;
1250         u32  skeys[2] = { iph->saddr, 0, };
1251         u32  daddr = iph->daddr;
1252         u8   tos = iph->tos & IPTOS_RT_MASK;
1253         unsigned short est_mtu = 0;
1254
1255         if (ipv4_config.no_pmtu_disc)
1256                 return 0;
1257
1258         for (i = 0; i < 2; i++) {
1259                 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1260
1261                 rcu_read_lock();
1262                 for (rth = rt_hash_table[hash].chain; rth;
1263                      rth = rth->u.rt_next) {
1264                         smp_read_barrier_depends();
1265                         if (rth->fl.fl4_dst == daddr &&
1266                             rth->fl.fl4_src == skeys[i] &&
1267                             rth->rt_dst  == daddr &&
1268                             rth->rt_src  == iph->saddr &&
1269                             rth->fl.fl4_tos == tos &&
1270                             rth->fl.iif == 0 &&
1271                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1272                                 unsigned short mtu = new_mtu;
1273
1274                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1275
1276                                         /* BSD 4.2 compatibility hack :-( */
1277                                         if (mtu == 0 &&
1278                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1279                                             old_mtu >= 68 + (iph->ihl << 2))
1280                                                 old_mtu -= iph->ihl << 2;
1281
1282                                         mtu = guess_mtu(old_mtu);
1283                                 }
1284                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1285                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 
1286                                                 dst_confirm(&rth->u.dst);
1287                                                 if (mtu < ip_rt_min_pmtu) {
1288                                                         mtu = ip_rt_min_pmtu;
1289                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1290                                                                 (1 << RTAX_MTU);
1291                                                 }
1292                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1293                                                 dst_set_expires(&rth->u.dst,
1294                                                         ip_rt_mtu_expires);
1295                                         }
1296                                         est_mtu = mtu;
1297                                 }
1298                         }
1299                 }
1300                 rcu_read_unlock();
1301         }
1302         return est_mtu ? : new_mtu;
1303 }
1304
1305 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1306 {
1307         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1308             !(dst_metric_locked(dst, RTAX_MTU))) {
1309                 if (mtu < ip_rt_min_pmtu) {
1310                         mtu = ip_rt_min_pmtu;
1311                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1312                 }
1313                 dst->metrics[RTAX_MTU-1] = mtu;
1314                 dst_set_expires(dst, ip_rt_mtu_expires);
1315         }
1316 }
1317
1318 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1319 {
1320         dst_release(dst);
1321         return NULL;
1322 }
1323
1324 static void ipv4_dst_destroy(struct dst_entry *dst)
1325 {
1326         struct rtable *rt = (struct rtable *) dst;
1327         struct inet_peer *peer = rt->peer;
1328         struct in_device *idev = rt->idev;
1329
1330         if (peer) {
1331                 rt->peer = NULL;
1332                 inet_putpeer(peer);
1333         }
1334
1335         if (idev) {
1336                 rt->idev = NULL;
1337                 in_dev_put(idev);
1338         }
1339 }
1340
1341 static void ipv4_dst_ifdown(struct dst_entry *dst, int how)
1342 {
1343         struct rtable *rt = (struct rtable *) dst;
1344         struct in_device *idev = rt->idev;
1345         if (idev) {
1346                 rt->idev = NULL;
1347                 in_dev_put(idev);
1348         }
1349 }
1350
1351 static void ipv4_link_failure(struct sk_buff *skb)
1352 {
1353         struct rtable *rt;
1354
1355         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1356
1357         rt = (struct rtable *) skb->dst;
1358         if (rt)
1359                 dst_set_expires(&rt->u.dst, 0);
1360 }
1361
1362 static int ip_rt_bug(struct sk_buff **pskb)
1363 {
1364         struct sk_buff *skb = *pskb;
1365
1366         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1367                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1368                 skb->dev ? skb->dev->name : "?");
1369         kfree_skb(skb);
1370         return 0;
1371 }
1372
1373 /*
1374    We do not cache the source address of the outgoing interface,
1375    because it is used only by the IP RR, TS and SRR options,
1376    so it is out of the fast path.
1377
1378    BTW remember: "addr" is allowed to be unaligned
1379    in IP options!
1380  */
1381
1382 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1383 {
1384         u32 src;
1385         struct fib_result res;
1386
1387         if (rt->fl.iif == 0)
1388                 src = rt->rt_src;
1389         else if (fib_lookup(&rt->fl, &res) == 0) {
1390 #ifdef CONFIG_IP_ROUTE_NAT
1391                 if (res.type == RTN_NAT)
1392                         src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1393                                                 RT_SCOPE_UNIVERSE);
1394                 else
1395 #endif
1396                         src = FIB_RES_PREFSRC(res);
1397                 fib_res_put(&res);
1398         } else
1399                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1400                                         RT_SCOPE_UNIVERSE);
1401         memcpy(addr, &src, 4);
1402 }
1403
1404 #ifdef CONFIG_NET_CLS_ROUTE
1405 static void set_class_tag(struct rtable *rt, u32 tag)
1406 {
1407         if (!(rt->u.dst.tclassid & 0xFFFF))
1408                 rt->u.dst.tclassid |= tag & 0xFFFF;
1409         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1410                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1411 }
1412 #endif
1413
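/*
 * Fill in the gateway, metrics and classification tag of a new cache
 * entry from the FIB lookup result, applying the usual bounds on MTU,
 * hop limit and advertised MSS.
 */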
1414 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1415 {
1416         struct fib_info *fi = res->fi;
1417
1418         if (fi) {
1419                 if (FIB_RES_GW(*res) &&
1420                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1421                         rt->rt_gateway = FIB_RES_GW(*res);
1422                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1423                        sizeof(rt->u.dst.metrics));
1424                 if (fi->fib_mtu == 0) {
1425                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1426                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1427                             rt->rt_gateway != rt->rt_dst &&
1428                             rt->u.dst.dev->mtu > 576)
1429                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1430                 }
1431 #ifdef CONFIG_NET_CLS_ROUTE
1432                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1433 #endif
1434         } else
1435                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1436
1437         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1438                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1439         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1440                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1441         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1442                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1443                                        ip_rt_min_advmss);
1444         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1445                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1446
1447 #ifdef CONFIG_NET_CLS_ROUTE
1448 #ifdef CONFIG_IP_MULTIPLE_TABLES
1449         set_class_tag(rt, fib_rules_tclass(res));
1450 #endif
1451         set_class_tag(rt, itag);
1452 #endif
1453         rt->rt_type = res->type;
1454 }
1455
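/*
 * Build a cache entry for a received multicast packet.  The source
 * address is validated first; the resulting route delivers locally
 * when we are a member of the group (and may hand the packet to
 * ip_mr_input when multicast forwarding is enabled) and must never be
 * used for output, hence u.dst.output = ip_rt_bug.
 */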
1456 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1457                                 u8 tos, struct net_device *dev, int our)
1458 {
1459         unsigned hash;
1460         struct rtable *rth;
1461         u32 spec_dst;
1462         struct in_device *in_dev = in_dev_get(dev);
1463         u32 itag = 0;
1464
1465         /* Primary sanity checks. */
1466
1467         if (in_dev == NULL)
1468                 return -EINVAL;
1469
1470         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1471             skb->protocol != htons(ETH_P_IP))
1472                 goto e_inval;
1473
1474         if (ZERONET(saddr)) {
1475                 if (!LOCAL_MCAST(daddr))
1476                         goto e_inval;
1477                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1478         } else if (fib_validate_source(saddr, 0, tos, 0,
1479                                         dev, &spec_dst, &itag) < 0)
1480                 goto e_inval;
1481
1482         rth = dst_alloc(&ipv4_dst_ops);
1483         if (!rth)
1484                 goto e_nobufs;
1485
1486         rth->u.dst.output= ip_rt_bug;
1487
1488         atomic_set(&rth->u.dst.__refcnt, 1);
1489         rth->u.dst.flags= DST_HOST;
1490         if (in_dev->cnf.no_policy)
1491                 rth->u.dst.flags |= DST_NOPOLICY;
1492         rth->fl.fl4_dst = daddr;
1493         rth->rt_dst     = daddr;
1494         rth->fl.fl4_tos = tos;
1495 #ifdef CONFIG_IP_ROUTE_FWMARK
1496         rth->fl.fl4_fwmark= skb->nfmark;
1497 #endif
1498         rth->fl.fl4_src = saddr;
1499         rth->rt_src     = saddr;
1500 #ifdef CONFIG_IP_ROUTE_NAT
1501         rth->rt_dst_map = daddr;
1502         rth->rt_src_map = saddr;
1503 #endif
1504 #ifdef CONFIG_NET_CLS_ROUTE
1505         rth->u.dst.tclassid = itag;
1506 #endif
1507         rth->rt_iif     =
1508         rth->fl.iif     = dev->ifindex;
1509         rth->u.dst.dev  = &loopback_dev;
1510         dev_hold(rth->u.dst.dev);
1511         rth->idev       = in_dev_get(rth->u.dst.dev);
1512         rth->fl.oif     = 0;
1513         rth->rt_gateway = daddr;
1514         rth->rt_spec_dst= spec_dst;
1515         rth->rt_type    = RTN_MULTICAST;
1516         rth->rt_flags   = RTCF_MULTICAST;
1517         if (our) {
1518                 rth->u.dst.input= ip_local_deliver;
1519                 rth->rt_flags |= RTCF_LOCAL;
1520         }
1521
1522 #ifdef CONFIG_IP_MROUTE
1523         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1524                 rth->u.dst.input = ip_mr_input;
1525 #endif
1526         RT_CACHE_STAT_INC(in_slow_mc);
1527
1528         in_dev_put(in_dev);
1529         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1530         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1531
1532 e_nobufs:
1533         in_dev_put(in_dev);
1534         return -ENOBUFS;
1535
1536 e_inval:
1537         in_dev_put(in_dev);
1538         return -EINVAL;
1539 }
1540
1541 /*
1542  *      NOTE. We drop all packets that have local source
1543  *      addresses, because every properly looped-back packet
1544  *      must already have the correct destination attached by the output routine.
1545  *
1546  *      This approach solves two big problems:
1547  *      1. Non-simplex devices are handled properly.
1548  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1549  */
1550
1551 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1552                         u8 tos, struct net_device *dev)
1553 {
1554         struct fib_result res;
1555         struct in_device *in_dev = in_dev_get(dev);
1556         struct in_device *out_dev = NULL;
1557         struct flowi fl = { .nl_u = { .ip4_u =
1558                                       { .daddr = daddr,
1559                                         .saddr = saddr,
1560                                         .tos = tos,
1561                                         .scope = RT_SCOPE_UNIVERSE,
1562 #ifdef CONFIG_IP_ROUTE_FWMARK
1563                                         .fwmark = skb->nfmark
1564 #endif
1565                                       } },
1566                             .iif = dev->ifindex };
1567         unsigned        flags = 0;
1568         u32             itag = 0;
1569         struct rtable * rth;
1570         unsigned        hash;
1571         u32             spec_dst;
1572         int             err = -EINVAL;
1573         int             free_res = 0;
1574
1575         /* IP on this device is disabled. */
1576
1577         if (!in_dev)
1578                 goto out;
1579
1580         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
1581
1582         /* Check for the weirdest martians, which cannot be detected
1583            by fib_lookup.
1584          */
1585
1586         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1587                 goto martian_source;
1588
1589         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1590                 goto brd_input;
1591
1592         /* Accept zero addresses only for limited broadcast;
1593          * it is not clear whether this should change. Waiting for complaints :-)
1594          */
1595         if (ZERONET(saddr))
1596                 goto martian_source;
1597
1598         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1599                 goto martian_destination;
1600
1601         /*
1602          *      Now we are ready to route packet.
1603          */
1604         if ((err = fib_lookup(&fl, &res)) != 0) {
1605                 if (!IN_DEV_FORWARD(in_dev))
1606                         goto e_inval;
1607                 goto no_route;
1608         }
1609         free_res = 1;
1610
1611         RT_CACHE_STAT_INC(in_slow_tot);
1612
1613 #ifdef CONFIG_IP_ROUTE_NAT
1614         /* Policy is applied before mapping the destination,
1615            but rerouting after the map must be done with the old source.
1616          */
1617
1618         if (1) {
1619                 u32 src_map = saddr;
1620                 if (res.r)
1621                         src_map = fib_rules_policy(saddr, &res, &flags);
1622
1623                 if (res.type == RTN_NAT) {
1624                         fl.fl4_dst = fib_rules_map_destination(daddr, &res);
1625                         fib_res_put(&res);
1626                         free_res = 0;
1627                         if (fib_lookup(&fl, &res))
1628                                 goto e_inval;
1629                         free_res = 1;
1630                         if (res.type != RTN_UNICAST)
1631                                 goto e_inval;
1632                         flags |= RTCF_DNAT;
1633                 }
1634                 fl.fl4_src = src_map;
1635         }
1636 #endif
1637
1638         if (res.type == RTN_BROADCAST)
1639                 goto brd_input;
1640
1641         if (res.type == RTN_LOCAL) {
1642                 int result;
1643                 result = fib_validate_source(saddr, daddr, tos,
1644                                              loopback_dev.ifindex,
1645                                              dev, &spec_dst, &itag);
1646                 if (result < 0)
1647                         goto martian_source;
1648                 if (result)
1649                         flags |= RTCF_DIRECTSRC;
1650                 spec_dst = daddr;
1651                 goto local_input;
1652         }
1653
1654         if (!IN_DEV_FORWARD(in_dev))
1655                 goto e_inval;
1656         if (res.type != RTN_UNICAST)
1657                 goto martian_destination;
1658
1659 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1660         if (res.fi->fib_nhs > 1 && fl.oif == 0)
1661                 fib_select_multipath(&fl, &res);
1662 #endif
1663         out_dev = in_dev_get(FIB_RES_DEV(res));
1664         if (out_dev == NULL) {
1665                 if (net_ratelimit())
1666                         printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1667                                          "Please, report\n");
1668                 goto e_inval;
1669         }
1670
1671         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1672                                   &spec_dst, &itag);
1673         if (err < 0)
1674                 goto martian_source;
1675
1676         if (err)
1677                 flags |= RTCF_DIRECTSRC;
1678
1679         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1680             (IN_DEV_SHARED_MEDIA(out_dev) ||
1681              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1682                 flags |= RTCF_DOREDIRECT;
1683
1684         if (skb->protocol != htons(ETH_P_IP)) {
1685                 /* Not IP (i.e. ARP). Do not create a route if it is
1686                  * invalid for proxy ARP. DNAT routes are always valid.
1687                  */
1688                 if (out_dev == in_dev && !(flags & RTCF_DNAT))
1689                         goto e_inval;
1690         }
1691
1692         rth = dst_alloc(&ipv4_dst_ops);
1693         if (!rth)
1694                 goto e_nobufs;
1695
1696         atomic_set(&rth->u.dst.__refcnt, 1);
1697         rth->u.dst.flags= DST_HOST;
1698         if (in_dev->cnf.no_policy)
1699                 rth->u.dst.flags |= DST_NOPOLICY;
1700         if (in_dev->cnf.no_xfrm)
1701                 rth->u.dst.flags |= DST_NOXFRM;
1702         rth->fl.fl4_dst = daddr;
1703         rth->rt_dst     = daddr;
1704         rth->fl.fl4_tos = tos;
1705 #ifdef CONFIG_IP_ROUTE_FWMARK
1706         rth->fl.fl4_fwmark= skb->nfmark;
1707 #endif
1708         rth->fl.fl4_src = saddr;
1709         rth->rt_src     = saddr;
1710         rth->rt_gateway = daddr;
1711 #ifdef CONFIG_IP_ROUTE_NAT
1712         rth->rt_src_map = fl.fl4_src;
1713         rth->rt_dst_map = fl.fl4_dst;
1714         if (flags&RTCF_DNAT)
1715                 rth->rt_gateway = fl.fl4_dst;
1716 #endif
1717         rth->rt_iif     =
1718         rth->fl.iif     = dev->ifindex;
1719         rth->u.dst.dev  = out_dev->dev;
1720         dev_hold(rth->u.dst.dev);
1721         rth->idev       = in_dev_get(rth->u.dst.dev);
1722         rth->fl.oif     = 0;
1723         rth->rt_spec_dst= spec_dst;
1724
1725         rth->u.dst.input = ip_forward;
1726         rth->u.dst.output = ip_output;
1727
1728         rt_set_nexthop(rth, &res, itag);
1729
1730         rth->rt_flags = flags;
1731
1732 intern:
1733         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1734 done:
1735         in_dev_put(in_dev);
1736         if (out_dev)
1737                 in_dev_put(out_dev);
1738         if (free_res)
1739                 fib_res_put(&res);
1740 out:    return err;
1741
1742 brd_input:
1743         if (skb->protocol != htons(ETH_P_IP))
1744                 goto e_inval;
1745
1746         if (ZERONET(saddr))
1747                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1748         else {
1749                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1750                                           &itag);
1751                 if (err < 0)
1752                         goto martian_source;
1753                 if (err)
1754                         flags |= RTCF_DIRECTSRC;
1755         }
1756         flags |= RTCF_BROADCAST;
1757         res.type = RTN_BROADCAST;
1758         RT_CACHE_STAT_INC(in_brd);
1759
1760 local_input:
1761         rth = dst_alloc(&ipv4_dst_ops);
1762         if (!rth)
1763                 goto e_nobufs;
1764
1765         rth->u.dst.output= ip_rt_bug;
1766
1767         atomic_set(&rth->u.dst.__refcnt, 1);
1768         rth->u.dst.flags= DST_HOST;
1769         if (in_dev->cnf.no_policy)
1770                 rth->u.dst.flags |= DST_NOPOLICY;
1771         rth->fl.fl4_dst = daddr;
1772         rth->rt_dst     = daddr;
1773         rth->fl.fl4_tos = tos;
1774 #ifdef CONFIG_IP_ROUTE_FWMARK
1775         rth->fl.fl4_fwmark= skb->nfmark;
1776 #endif
1777         rth->fl.fl4_src = saddr;
1778         rth->rt_src     = saddr;
1779 #ifdef CONFIG_IP_ROUTE_NAT
1780         rth->rt_dst_map = fl.fl4_dst;
1781         rth->rt_src_map = fl.fl4_src;
1782 #endif
1783 #ifdef CONFIG_NET_CLS_ROUTE
1784         rth->u.dst.tclassid = itag;
1785 #endif
1786         rth->rt_iif     =
1787         rth->fl.iif     = dev->ifindex;
1788         rth->u.dst.dev  = &loopback_dev;
1789         dev_hold(rth->u.dst.dev);
1790         rth->idev       = in_dev_get(rth->u.dst.dev);
1791         rth->rt_gateway = daddr;
1792         rth->rt_spec_dst= spec_dst;
1793         rth->u.dst.input= ip_local_deliver;
1794         rth->rt_flags   = flags|RTCF_LOCAL;
1795         if (res.type == RTN_UNREACHABLE) {
1796                 rth->u.dst.input= ip_error;
1797                 rth->u.dst.error= -err;
1798                 rth->rt_flags   &= ~RTCF_LOCAL;
1799         }
1800         rth->rt_type    = res.type;
1801         goto intern;
1802
1803 no_route:
1804         RT_CACHE_STAT_INC(in_no_route);
1805         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1806         res.type = RTN_UNREACHABLE;
1807         goto local_input;
1808
1809         /*
1810          *      Do not cache martian addresses: they should be logged (RFC1812)
1811          */
1812 martian_destination:
1813         RT_CACHE_STAT_INC(in_martian_dst);
1814 #ifdef CONFIG_IP_ROUTE_VERBOSE
1815         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1816                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1817                         "%u.%u.%u.%u, dev %s\n",
1818                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1819 #endif
1820 e_inval:
1821         err = -EINVAL;
1822         goto done;
1823
1824 e_nobufs:
1825         err = -ENOBUFS;
1826         goto done;
1827
1828 martian_source:
1829
1830         RT_CACHE_STAT_INC(in_martian_src);
1831 #ifdef CONFIG_IP_ROUTE_VERBOSE
1832         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1833                 /*
1834                  *      RFC1812 recommendation: if the source is martian,
1835                  *      the only hint is the MAC header.
1836                  */
1837                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1838                         "%u.%u.%u.%u, on dev %s\n",
1839                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1840                 if (dev->hard_header_len) {
1841                         int i;
1842                         unsigned char *p = skb->mac.raw;
1843                         printk(KERN_WARNING "ll header: ");
1844                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1845                                 printk("%02x", *p);
1846                                 if (i < (dev->hard_header_len - 1))
1847                                         printk(":");
1848                         }
1849                         printk("\n");
1850                 }
1851         }
1852 #endif
1853         goto e_inval;
1854 }
1855
1856 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1857                    u8 tos, struct net_device *dev)
1858 {
1859         struct rtable * rth;
1860         unsigned        hash;
1861         int iif = dev->ifindex;
1862
1863         tos &= IPTOS_RT_MASK;
1864         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1865
1866         rcu_read_lock();
1867         for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1868                 smp_read_barrier_depends();
1869                 if (rth->fl.fl4_dst == daddr &&
1870                     rth->fl.fl4_src == saddr &&
1871                     rth->fl.iif == iif &&
1872                     rth->fl.oif == 0 &&
1873 #ifdef CONFIG_IP_ROUTE_FWMARK
1874                     rth->fl.fl4_fwmark == skb->nfmark &&
1875 #endif
1876                     rth->fl.fl4_tos == tos) {
1877                         rth->u.dst.lastuse = jiffies;
1878                         dst_hold(&rth->u.dst);
1879                         rth->u.dst.__use++;
1880                         RT_CACHE_STAT_INC(in_hit);
1881                         rcu_read_unlock();
1882                         skb->dst = (struct dst_entry*)rth;
1883                         return 0;
1884                 }
1885                 RT_CACHE_STAT_INC(in_hlist_search);
1886         }
1887         rcu_read_unlock();
1888
1889         /* Multicast recognition logic was moved from the route cache to here.
1890            The problem was that too many Ethernet cards have broken/missing
1891            hardware multicast filters :-( As a result, a host on a multicast
1892            network acquires a lot of useless route cache entries, e.g. for
1893            SDR messages from all over the world. Now we try to get rid of them.
1894            Really, provided the software IP multicast filter is organized
1895            reasonably (at least, hashed), this does not cause a slowdown
1896            compared with route cache reject entries.
1897            Note that multicast routers are not affected, because a
1898            route cache entry is created eventually.
1899          */
1900         if (MULTICAST(daddr)) {
1901                 struct in_device *in_dev;
1902
1903                 read_lock(&inetdev_lock);
1904                 if ((in_dev = __in_dev_get(dev)) != NULL) {
1905                         int our = ip_check_mc(in_dev, daddr, saddr,
1906                                 skb->nh.iph->protocol);
1907                         if (our
1908 #ifdef CONFIG_IP_MROUTE
1909                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1910 #endif
1911                             ) {
1912                                 read_unlock(&inetdev_lock);
1913                                 return ip_route_input_mc(skb, daddr, saddr,
1914                                                          tos, dev, our);
1915                         }
1916                 }
1917                 read_unlock(&inetdev_lock);
1918                 return -EINVAL;
1919         }
1920         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1921 }
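
/*
 * [Editorial sketch -- not part of the original file.]
 * A hypothetical receive-path caller of ip_route_input(): route an incoming
 * IPv4 packet by the fields of its header, as the cache lookup above expects.
 * On a hit the cached dst is attached to the skb; otherwise the multicast or
 * slow path runs.  example_route_incoming() is illustrative only.
 */
static inline int example_route_incoming(struct sk_buff *skb)
{
        struct iphdr *iph = skb->nh.iph;

        return ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev);
}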
1922
1923 /*
1924  * Major route resolver routine.
1925  */
1926
1927 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
1928 {
1929         u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
1930         struct flowi fl = { .nl_u = { .ip4_u =
1931                                       { .daddr = oldflp->fl4_dst,
1932                                         .saddr = oldflp->fl4_src,
1933                                         .tos = tos & IPTOS_RT_MASK,
1934                                         .scope = ((tos & RTO_ONLINK) ?
1935                                                   RT_SCOPE_LINK :
1936                                                   RT_SCOPE_UNIVERSE),
1937 #ifdef CONFIG_IP_ROUTE_FWMARK
1938                                         .fwmark = oldflp->fl4_fwmark
1939 #endif
1940                                       } },
1941                             .iif = loopback_dev.ifindex,
1942                             .oif = oldflp->oif };
1943         struct fib_result res;
1944         unsigned flags = 0;
1945         struct rtable *rth;
1946         struct net_device *dev_out = NULL;
1947         struct in_device *in_dev = NULL;
1948         unsigned hash;
1949         int free_res = 0;
1950         int err;
1951
1952         res.fi          = NULL;
1953 #ifdef CONFIG_IP_MULTIPLE_TABLES
1954         res.r           = NULL;
1955 #endif
1956
1957         if (oldflp->fl4_src) {
1958                 err = -EINVAL;
1959                 if (MULTICAST(oldflp->fl4_src) ||
1960                     BADCLASS(oldflp->fl4_src) ||
1961                     ZERONET(oldflp->fl4_src))
1962                         goto out;
1963
1964                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1965                 dev_out = ip_dev_find(oldflp->fl4_src);
1966                 if (dev_out == NULL)
1967                         goto out;
1968
1969                 /* I removed the check for oif == dev_out->oif here.
1970                    It was wrong for two reasons:
1971                    1. ip_dev_find(saddr) can return the wrong iface if saddr is
1972                       assigned to multiple interfaces.
1973                    2. Moreover, we are allowed to send packets with the saddr
1974                       of another iface. --ANK
1975                  */
1976
1977                 if (oldflp->oif == 0
1978                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
1979                         /* Special hack: the user can direct multicasts
1980                            and limited broadcast via the necessary interface
1981                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1982                            This hack is not just for fun, it allows
1983                            vic, vat and friends to work.
1984                            They bind a socket to loopback, set the ttl to zero
1985                            and expect that it will work.
1986                            From the viewpoint of the routing cache they are broken,
1987                            because we are not allowed to build a multicast path
1988                            with a loopback source addr (the routing cache
1989                            cannot know that the ttl is zero, so that the packet
1990                            will not leave this host and the route is valid).
1991                            Luckily, this hack is a good workaround.
1992                          */
1993
1994                         fl.oif = dev_out->ifindex;
1995                         goto make_route;
1996                 }
1997                 if (dev_out)
1998                         dev_put(dev_out);
1999                 dev_out = NULL;
2000         }
2001         if (oldflp->oif) {
2002                 dev_out = dev_get_by_index(oldflp->oif);
2003                 err = -ENODEV;
2004                 if (dev_out == NULL)
2005                         goto out;
2006                 if (__in_dev_get(dev_out) == NULL) {
2007                         dev_put(dev_out);
2008                         goto out;       /* Wrong error code */
2009                 }
2010
2011                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2012                         if (!fl.fl4_src)
2013                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2014                                                               RT_SCOPE_LINK);
2015                         goto make_route;
2016                 }
2017                 if (!fl.fl4_src) {
2018                         if (MULTICAST(oldflp->fl4_dst))
2019                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2020                                                               fl.fl4_scope);
2021                         else if (!oldflp->fl4_dst)
2022                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2023                                                               RT_SCOPE_HOST);
2024                 }
2025         }
2026
2027         if (!fl.fl4_dst) {
2028                 fl.fl4_dst = fl.fl4_src;
2029                 if (!fl.fl4_dst)
2030                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2031                 if (dev_out)
2032                         dev_put(dev_out);
2033                 dev_out = &loopback_dev;
2034                 dev_hold(dev_out);
2035                 fl.oif = loopback_dev.ifindex;
2036                 res.type = RTN_LOCAL;
2037                 flags |= RTCF_LOCAL;
2038                 goto make_route;
2039         }
2040
2041         if (fib_lookup(&fl, &res)) {
2042                 res.fi = NULL;
2043                 if (oldflp->oif) {
2044                         /* Apparently, the routing tables are wrong. Assume
2045                            that the destination is on-link.
2046
2047                            WHY? DW.
2048                            Because we are allowed to send to an iface
2049                            even if it has NO routes and NO assigned
2050                            addresses. When oif is specified, the routing
2051                            tables are looked up with only one purpose:
2052                            to catch whether the destination is gatewayed rather than
2053                            direct. Moreover, if MSG_DONTROUTE is set,
2054                            we send the packet, ignoring both routing tables
2055                            and ifaddr state. --ANK
2056
2057
2058                            We could do this even if oif is unknown,
2059                            likely IPv6, but we do not.
2060                          */
2061
2062                         if (fl.fl4_src == 0)
2063                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2064                                                               RT_SCOPE_LINK);
2065                         res.type = RTN_UNICAST;
2066                         goto make_route;
2067                 }
2068                 if (dev_out)
2069                         dev_put(dev_out);
2070                 err = -ENETUNREACH;
2071                 goto out;
2072         }
2073         free_res = 1;
2074
2075         if (res.type == RTN_NAT)
2076                 goto e_inval;
2077
2078         if (res.type == RTN_LOCAL) {
2079                 if (!fl.fl4_src)
2080                         fl.fl4_src = fl.fl4_dst;
2081                 if (dev_out)
2082                         dev_put(dev_out);
2083                 dev_out = &loopback_dev;
2084                 dev_hold(dev_out);
2085                 fl.oif = dev_out->ifindex;
2086                 if (res.fi)
2087                         fib_info_put(res.fi);
2088                 res.fi = NULL;
2089                 flags |= RTCF_LOCAL;
2090                 goto make_route;
2091         }
2092
2093 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2094         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2095                 fib_select_multipath(&fl, &res);
2096         else
2097 #endif
2098         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2099                 fib_select_default(&fl, &res);
2100
2101         if (!fl.fl4_src)
2102                 fl.fl4_src = FIB_RES_PREFSRC(res);
2103
2104         if (dev_out)
2105                 dev_put(dev_out);
2106         dev_out = FIB_RES_DEV(res);
2107         dev_hold(dev_out);
2108         fl.oif = dev_out->ifindex;
2109
2110 make_route:
2111         if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2112                 goto e_inval;
2113
2114         if (fl.fl4_dst == 0xFFFFFFFF)
2115                 res.type = RTN_BROADCAST;
2116         else if (MULTICAST(fl.fl4_dst))
2117                 res.type = RTN_MULTICAST;
2118         else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
2119                 goto e_inval;
2120
2121         if (dev_out->flags & IFF_LOOPBACK)
2122                 flags |= RTCF_LOCAL;
2123
2124         in_dev = in_dev_get(dev_out);
2125         if (!in_dev)
2126                 goto e_inval;
2127
2128         if (res.type == RTN_BROADCAST) {
2129                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2130                 if (res.fi) {
2131                         fib_info_put(res.fi);
2132                         res.fi = NULL;
2133                 }
2134         } else if (res.type == RTN_MULTICAST) {
2135                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2136                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
2137                         flags &= ~RTCF_LOCAL;
2138                 /* If a multicast route does not exist, use the
2139                    default one, but do not use a gateway in this case.
2140                    Yes, it is a hack.
2141                  */
2142                 if (res.fi && res.prefixlen < 4) {
2143                         fib_info_put(res.fi);
2144                         res.fi = NULL;
2145                 }
2146         }
2147
2148         rth = dst_alloc(&ipv4_dst_ops);
2149         if (!rth)
2150                 goto e_nobufs;
2151
2152         atomic_set(&rth->u.dst.__refcnt, 1);
2153         rth->u.dst.flags= DST_HOST;
2154         if (in_dev->cnf.no_xfrm)
2155                 rth->u.dst.flags |= DST_NOXFRM;
2156         if (in_dev->cnf.no_policy)
2157                 rth->u.dst.flags |= DST_NOPOLICY;
2158         rth->fl.fl4_dst = oldflp->fl4_dst;
2159         rth->fl.fl4_tos = tos;
2160         rth->fl.fl4_src = oldflp->fl4_src;
2161         rth->fl.oif     = oldflp->oif;
2162 #ifdef CONFIG_IP_ROUTE_FWMARK
2163         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2164 #endif
2165         rth->rt_dst     = fl.fl4_dst;
2166         rth->rt_src     = fl.fl4_src;
2167 #ifdef CONFIG_IP_ROUTE_NAT
2168         rth->rt_dst_map = fl.fl4_dst;
2169         rth->rt_src_map = fl.fl4_src;
2170 #endif
2171         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2172         rth->u.dst.dev  = dev_out;
2173         dev_hold(dev_out);
2174         rth->idev       = in_dev_get(dev_out);
2175         rth->rt_gateway = fl.fl4_dst;
2176         rth->rt_spec_dst= fl.fl4_src;
2177
2178         rth->u.dst.output=ip_output;
2179
2180         RT_CACHE_STAT_INC(out_slow_tot);
2181
2182         if (flags & RTCF_LOCAL) {
2183                 rth->u.dst.input = ip_local_deliver;
2184                 rth->rt_spec_dst = fl.fl4_dst;
2185         }
2186         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2187                 rth->rt_spec_dst = fl.fl4_src;
2188                 if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2189                         rth->u.dst.output = ip_mc_output;
2190                         RT_CACHE_STAT_INC(out_slow_mc);
2191                 }
2192 #ifdef CONFIG_IP_MROUTE
2193                 if (res.type == RTN_MULTICAST) {
2194                         if (IN_DEV_MFORWARD(in_dev) &&
2195                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2196                                 rth->u.dst.input = ip_mr_input;
2197                                 rth->u.dst.output = ip_mc_output;
2198                         }
2199                 }
2200 #endif
2201         }
2202
2203         rt_set_nexthop(rth, &res, 0);
2204         
2205
2206         rth->rt_flags = flags;
2207
2208         hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2209         err = rt_intern_hash(hash, rth, rp);
2210 done:
2211         if (free_res)
2212                 fib_res_put(&res);
2213         if (dev_out)
2214                 dev_put(dev_out);
2215         if (in_dev)
2216                 in_dev_put(in_dev);
2217 out:    return err;
2218
2219 e_inval:
2220         err = -EINVAL;
2221         goto done;
2222 e_nobufs:
2223         err = -ENOBUFS;
2224         goto done;
2225 }
2226
2227 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2228 {
2229         unsigned hash;
2230         struct rtable *rth;
2231
2232         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2233
2234         rcu_read_lock();
2235         for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
2236                 smp_read_barrier_depends();
2237                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2238                     rth->fl.fl4_src == flp->fl4_src &&
2239                     rth->fl.iif == 0 &&
2240                     rth->fl.oif == flp->oif &&
2241 #ifdef CONFIG_IP_ROUTE_FWMARK
2242                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2243 #endif
2244                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2245                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2246                         rth->u.dst.lastuse = jiffies;
2247                         dst_hold(&rth->u.dst);
2248                         rth->u.dst.__use++;
2249                         RT_CACHE_STAT_INC(out_hit);
2250                         rcu_read_unlock();
2251                         *rp = rth;
2252                         return 0;
2253                 }
2254                 RT_CACHE_STAT_INC(out_hlist_search);
2255         }
2256         rcu_read_unlock();
2257
2258         return ip_route_output_slow(rp, flp);
2259 }
2260
2261 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2262 {
2263         int err;
2264
2265         if ((err = __ip_route_output_key(rp, flp)) != 0)
2266                 return err;
2267         return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, NULL, 0) : 0;
2268 }
2269
2270 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2271 {
2272         int err;
2273
2274         if ((err = __ip_route_output_key(rp, flp)) != 0)
2275                 return err;
2276         return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, sk, flags) : 0;
2277 }
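
/*
 * [Editorial sketch -- not part of the original file.]
 * A hypothetical output-path caller: build a flow key and resolve it through
 * ip_route_output_flow(), mirroring the flowi fields used in this file.
 * example_route_outgoing() and its parameters are illustrative only.
 */
static inline int example_route_outgoing(u32 daddr, u32 saddr, int oif,
                                         struct rtable **rtp)
{
        struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
                                                 .saddr = saddr,
                                                 .tos = 0 } },
                            .oif = oif,
                            .proto = IPPROTO_UDP };

        return ip_route_output_flow(rtp, &fl, NULL, 0);
}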
2278
2279 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2280                         int nowait)
2281 {
2282         struct rtable *rt = (struct rtable*)skb->dst;
2283         struct rtmsg *r;
2284         struct nlmsghdr  *nlh;
2285         unsigned char    *b = skb->tail;
2286         struct rta_cacheinfo ci;
2287 #ifdef CONFIG_IP_MROUTE
2288         struct rtattr *eptr;
2289 #endif
2290         nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2291         r = NLMSG_DATA(nlh);
2292         nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2293         r->rtm_family    = AF_INET;
2294         r->rtm_dst_len  = 32;
2295         r->rtm_src_len  = 0;
2296         r->rtm_tos      = rt->fl.fl4_tos;
2297         r->rtm_table    = RT_TABLE_MAIN;
2298         r->rtm_type     = rt->rt_type;
2299         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2300         r->rtm_protocol = RTPROT_UNSPEC;
2301         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2302         if (rt->rt_flags & RTCF_NOTIFY)
2303                 r->rtm_flags |= RTM_F_NOTIFY;
2304         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2305         if (rt->fl.fl4_src) {
2306                 r->rtm_src_len = 32;
2307                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2308         }
2309         if (rt->u.dst.dev)
2310                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2311 #ifdef CONFIG_NET_CLS_ROUTE
2312         if (rt->u.dst.tclassid)
2313                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2314 #endif
2315         if (rt->fl.iif)
2316                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2317         else if (rt->rt_src != rt->fl.fl4_src)
2318                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2319         if (rt->rt_dst != rt->rt_gateway)
2320                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2321         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2322                 goto rtattr_failure;
2323         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2324         ci.rta_used     = rt->u.dst.__use;
2325         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2326         if (rt->u.dst.expires)
2327                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2328         else
2329                 ci.rta_expires = 0;
2330         ci.rta_error    = rt->u.dst.error;
2331         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2332         if (rt->peer) {
2333                 ci.rta_id = rt->peer->ip_id_count;
2334                 if (rt->peer->tcp_ts_stamp) {
2335                         ci.rta_ts = rt->peer->tcp_ts;
2336                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2337                 }
2338         }
2339 #ifdef CONFIG_IP_MROUTE
2340         eptr = (struct rtattr*)skb->tail;
2341 #endif
2342         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2343         if (rt->fl.iif) {
2344 #ifdef CONFIG_IP_MROUTE
2345                 u32 dst = rt->rt_dst;
2346
2347                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2348                     ipv4_devconf.mc_forwarding) {
2349                         int err = ipmr_get_route(skb, r, nowait);
2350                         if (err <= 0) {
2351                                 if (!nowait) {
2352                                         if (err == 0)
2353                                                 return 0;
2354                                         goto nlmsg_failure;
2355                                 } else {
2356                                         if (err == -EMSGSIZE)
2357                                                 goto nlmsg_failure;
2358                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2359                                 }
2360                         }
2361                 } else
2362 #endif
2363                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2364         }
2365
2366         nlh->nlmsg_len = skb->tail - b;
2367         return skb->len;
2368
2369 nlmsg_failure:
2370 rtattr_failure:
2371         skb_trim(skb, b - skb->data);
2372         return -1;
2373 }
2374
2375 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2376 {
2377         struct rtattr **rta = arg;
2378         struct rtmsg *rtm = NLMSG_DATA(nlh);
2379         struct rtable *rt = NULL;
2380         u32 dst = 0;
2381         u32 src = 0;
2382         int iif = 0;
2383         int err = -ENOBUFS;
2384         struct sk_buff *skb;
2385
2386         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2387         if (!skb)
2388                 goto out;
2389
2390         /* Reserve room for dummy headers; this skb can pass
2391            through a good chunk of the routing engine.
2392          */
2393         skb->mac.raw = skb->data;
2394         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2395
2396         if (rta[RTA_SRC - 1])
2397                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2398         if (rta[RTA_DST - 1])
2399                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2400         if (rta[RTA_IIF - 1])
2401                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2402
2403         if (iif) {
2404                 struct net_device *dev = __dev_get_by_index(iif);
2405                 err = -ENODEV;
2406                 if (!dev)
2407                         goto out_free;
2408                 skb->protocol   = htons(ETH_P_IP);
2409                 skb->dev        = dev;
2410                 local_bh_disable();
2411                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2412                 local_bh_enable();
2413                 rt = (struct rtable*)skb->dst;
2414                 if (!err && rt->u.dst.error)
2415                         err = -rt->u.dst.error;
2416         } else {
2417                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2418                                                          .saddr = src,
2419                                                          .tos = rtm->rtm_tos } } };
2420                 int oif = 0;
2421                 if (rta[RTA_OIF - 1])
2422                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2423                 fl.oif = oif;
2424                 err = ip_route_output_key(&rt, &fl);
2425         }
2426         if (err)
2427                 goto out_free;
2428
2429         skb->dst = &rt->u.dst;
2430         if (rtm->rtm_flags & RTM_F_NOTIFY)
2431                 rt->rt_flags |= RTCF_NOTIFY;
2432
2433         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2434
2435         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2436                                 RTM_NEWROUTE, 0);
2437         if (!err)
2438                 goto out_free;
2439         if (err < 0) {
2440                 err = -EMSGSIZE;
2441                 goto out_free;
2442         }
2443
2444         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2445         if (err > 0)
2446                 err = 0;
2447 out:    return err;
2448
2449 out_free:
2450         kfree_skb(skb);
2451         goto out;
2452 }
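
/*
 * [Editorial sketch -- not part of the original file.]
 * A small, hypothetical user-space counterpart to inet_rtm_getroute() above:
 * send one RTM_GETROUTE request carrying an RTA_DST attribute over
 * NETLINK_ROUTE and read back the RTM_NEWROUTE reply that rt_fill_info()
 * builds.  Error handling is minimal; guarded out of the kernel build.
 */
#if 0   /* user-space example only */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
        struct {
                struct nlmsghdr nlh;
                struct rtmsg    rtm;
                char            buf[64];
        } req;
        struct rtattr *rta;
        struct in_addr dst;
        char reply[4096];
        int fd, len;

        inet_aton("192.0.2.1", &dst);   /* example destination */

        memset(&req, 0, sizeof(req));
        req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg));
        req.nlh.nlmsg_type  = RTM_GETROUTE;
        req.nlh.nlmsg_flags = NLM_F_REQUEST;
        req.rtm.rtm_family  = AF_INET;
        req.rtm.rtm_dst_len = 32;

        rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
        rta->rta_type = RTA_DST;
        rta->rta_len  = RTA_LENGTH(4);
        memcpy(RTA_DATA(rta), &dst, 4);
        req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(4);

        fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
        if (fd < 0 || send(fd, &req, req.nlh.nlmsg_len, 0) < 0)
                return 1;
        len = recv(fd, reply, sizeof(reply), 0);
        printf("received %d bytes (RTM_NEWROUTE reply)\n", len);
        close(fd);
        return 0;
}
#endif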
2453
2454 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2455 {
2456         struct rtable *rt;
2457         int h, s_h;
2458         int idx, s_idx;
2459
2460         s_h = cb->args[0];
2461         s_idx = idx = cb->args[1];
2462         for (h = 0; h <= rt_hash_mask; h++) {
2463                 if (h < s_h) continue;
2464                 if (h > s_h)
2465                         s_idx = 0;
2466                 rcu_read_lock();
2467                 for (rt = rt_hash_table[h].chain, idx = 0; rt;
2468                      rt = rt->u.rt_next, idx++) {
2469                         smp_read_barrier_depends();
2470                         if (idx < s_idx)
2471                                 continue;
2472                         skb->dst = dst_clone(&rt->u.dst);
2473                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2474                                          cb->nlh->nlmsg_seq,
2475                                          RTM_NEWROUTE, 1) <= 0) {
2476                                 dst_release(xchg(&skb->dst, NULL));
2477                                 rcu_read_unlock();
2478                                 goto done;
2479                         }
2480                         dst_release(xchg(&skb->dst, NULL));
2481                 }
2482                 rcu_read_unlock();
2483         }
2484
2485 done:
2486         cb->args[0] = h;
2487         cb->args[1] = idx;
2488         return skb->len;
2489 }
2490
2491 void ip_rt_multicast_event(struct in_device *in_dev)
2492 {
2493         rt_cache_flush(0);
2494 }
2495
2496 #ifdef CONFIG_SYSCTL
2497 static int flush_delay;
2498
2499 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2500                                         struct file *filp, void __user *buffer,
2501                                         size_t *lenp, loff_t *ppos)
2502 {
2503         if (write) {
2504                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2505                 rt_cache_flush(flush_delay);
2506                 return 0;
2507         } 
2508
2509         return -EINVAL;
2510 }
2511
2512 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2513                                                 int __user *name,
2514                                                 int nlen,
2515                                                 void __user *oldval,
2516                                                 size_t __user *oldlenp,
2517                                                 void __user *newval,
2518                                                 size_t newlen,
2519                                                 void **context)
2520 {
2521         int delay;
2522         if (newlen != sizeof(int))
2523                 return -EINVAL;
2524         if (get_user(delay, (int __user *)newval))
2525                 return -EFAULT; 
2526         rt_cache_flush(delay); 
2527         return 0;
2528 }
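
/*
 * [Editorial note -- not part of the original file.]
 * The two handlers above make the route-cache flush reachable from user
 * space: writing an integer delay to /proc/sys/net/ipv4/route/flush ends up
 * in rt_cache_flush().  A hypothetical user-space snippet, guarded out of
 * the kernel build:
 */
#if 0   /* user-space example only */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

        if (fd < 0)
                return 1;
        write(fd, "0\n", 2);    /* request an immediate flush (zero delay) */
        close(fd);
        return 0;
}
#endif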
2529
2530 ctl_table ipv4_route_table[] = {
2531         {
2532                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2533                 .procname       = "flush",
2534                 .data           = &flush_delay,
2535                 .maxlen         = sizeof(int),
2536                 .mode           = 0644,
2537                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2538                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2539         },
2540         {
2541                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2542                 .procname       = "min_delay",
2543                 .data           = &ip_rt_min_delay,
2544                 .maxlen         = sizeof(int),
2545                 .mode           = 0644,
2546                 .proc_handler   = &proc_dointvec_jiffies,
2547                 .strategy       = &sysctl_jiffies,
2548         },
2549         {
2550                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2551                 .procname       = "max_delay",
2552                 .data           = &ip_rt_max_delay,
2553                 .maxlen         = sizeof(int),
2554                 .mode           = 0644,
2555                 .proc_handler   = &proc_dointvec_jiffies,
2556                 .strategy       = &sysctl_jiffies,
2557         },
2558         {
2559                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2560                 .procname       = "gc_thresh",
2561                 .data           = &ipv4_dst_ops.gc_thresh,
2562                 .maxlen         = sizeof(int),
2563                 .mode           = 0644,
2564                 .proc_handler   = &proc_dointvec,
2565         },
2566         {
2567                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2568                 .procname       = "max_size",
2569                 .data           = &ip_rt_max_size,
2570                 .maxlen         = sizeof(int),
2571                 .mode           = 0644,
2572                 .proc_handler   = &proc_dointvec,
2573         },
2574         {
2575                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2576                 .procname       = "gc_min_interval",
2577                 .data           = &ip_rt_gc_min_interval,
2578                 .maxlen         = sizeof(int),
2579                 .mode           = 0644,
2580                 .proc_handler   = &proc_dointvec_jiffies,
2581                 .strategy       = &sysctl_jiffies,
2582         },
2583         {
2584                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2585                 .procname       = "gc_timeout",
2586                 .data           = &ip_rt_gc_timeout,
2587                 .maxlen         = sizeof(int),
2588                 .mode           = 0644,
2589                 .proc_handler   = &proc_dointvec_jiffies,
2590                 .strategy       = &sysctl_jiffies,
2591         },
2592         {
2593                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2594                 .procname       = "gc_interval",
2595                 .data           = &ip_rt_gc_interval,
2596                 .maxlen         = sizeof(int),
2597                 .mode           = 0644,
2598                 .proc_handler   = &proc_dointvec_jiffies,
2599                 .strategy       = &sysctl_jiffies,
2600         },
2601         {
2602                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2603                 .procname       = "redirect_load",
2604                 .data           = &ip_rt_redirect_load,
2605                 .maxlen         = sizeof(int),
2606                 .mode           = 0644,
2607                 .proc_handler   = &proc_dointvec,
2608         },
2609         {
2610                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2611                 .procname       = "redirect_number",
2612                 .data           = &ip_rt_redirect_number,
2613                 .maxlen         = sizeof(int),
2614                 .mode           = 0644,
2615                 .proc_handler   = &proc_dointvec,
2616         },
2617         {
2618                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2619                 .procname       = "redirect_silence",
2620                 .data           = &ip_rt_redirect_silence,
2621                 .maxlen         = sizeof(int),
2622                 .mode           = 0644,
2623                 .proc_handler   = &proc_dointvec,
2624         },
2625         {
2626                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2627                 .procname       = "error_cost",
2628                 .data           = &ip_rt_error_cost,
2629                 .maxlen         = sizeof(int),
2630                 .mode           = 0644,
2631                 .proc_handler   = &proc_dointvec,
2632         },
2633         {
2634                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2635                 .procname       = "error_burst",
2636                 .data           = &ip_rt_error_burst,
2637                 .maxlen         = sizeof(int),
2638                 .mode           = 0644,
2639                 .proc_handler   = &proc_dointvec,
2640         },
2641         {
2642                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2643                 .procname       = "gc_elasticity",
2644                 .data           = &ip_rt_gc_elasticity,
2645                 .maxlen         = sizeof(int),
2646                 .mode           = 0644,
2647                 .proc_handler   = &proc_dointvec,
2648         },
2649         {
2650                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2651                 .procname       = "mtu_expires",
2652                 .data           = &ip_rt_mtu_expires,
2653                 .maxlen         = sizeof(int),
2654                 .mode           = 0644,
2655                 .proc_handler   = &proc_dointvec_jiffies,
2656                 .strategy       = &sysctl_jiffies,
2657         },
2658         {
2659                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2660                 .procname       = "min_pmtu",
2661                 .data           = &ip_rt_min_pmtu,
2662                 .maxlen         = sizeof(int),
2663                 .mode           = 0644,
2664                 .proc_handler   = &proc_dointvec,
2665         },
2666         {
2667                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2668                 .procname       = "min_adv_mss",
2669                 .data           = &ip_rt_min_advmss,
2670                 .maxlen         = sizeof(int),
2671                 .mode           = 0644,
2672                 .proc_handler   = &proc_dointvec,
2673         },
2674         {
2675                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2676                 .procname       = "secret_interval",
2677                 .data           = &ip_rt_secret_interval,
2678                 .maxlen         = sizeof(int),
2679                 .mode           = 0644,
2680                 .proc_handler   = &proc_dointvec_jiffies,
2681                 .strategy       = &sysctl_jiffies,
2682         },
2683         { .ctl_name = 0 }
2684 };
2685 #endif
2686
2687 #ifdef CONFIG_NET_CLS_ROUTE
2688 struct ip_rt_acct *ip_rt_acct;
2689
2690 /* This code sucks.  But you should have seen it before! --RR */
2691
2692 /* IP route accounting ptr for this logical cpu number. */
2693 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
2694
2695 #ifdef CONFIG_PROC_FS
2696 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2697                            int length, int *eof, void *data)
2698 {
2699         unsigned int i;
2700
2701         if ((offset & 3) || (length & 3))
2702                 return -EIO;
2703
2704         if (offset >= sizeof(struct ip_rt_acct) * 256) {
2705                 *eof = 1;
2706                 return 0;
2707         }
2708
2709         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2710                 length = sizeof(struct ip_rt_acct) * 256 - offset;
2711                 *eof = 1;
2712         }
2713
2714         offset /= sizeof(u32);
2715
2716         if (length > 0) {
2717                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2718                 u32 *dst = (u32 *) buffer;
2719
2720                 /* Copy first cpu. */
2721                 *start = buffer;
2722                 memcpy(dst, src, length);
2723
2724                 /* Add the other cpus in, one int at a time */
2725                 for_each_cpu(i) {
2726                         unsigned int j;
2727
2728                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2729
2730                         for (j = 0; j < length/4; j++)
2731                                 dst[j] += src[j];
2732                 }
2733         }
2734         return length;
2735 }
2736 #endif /* CONFIG_PROC_FS */
2737 #endif /* CONFIG_NET_CLS_ROUTE */
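
/*
 * [Editorial sketch -- not part of the original file.]
 * ip_rt_acct_read() above backs /proc/net/rt_acct (created in ip_rt_init()
 * below): it returns the 256-entry realm accounting table with the per-CPU
 * counters already summed, and rejects offsets or lengths that are not
 * multiples of four.  A hypothetical user-space reader, guarded out of the
 * kernel build:
 */
#if 0   /* user-space example only */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        unsigned int buf[1024];         /* 4 KB, a multiple of four bytes */
        int fd = open("/proc/net/rt_acct", O_RDONLY);
        ssize_t n;

        if (fd < 0)
                return 1;
        n = read(fd, buf, sizeof(buf)); /* aggregated realm counters */
        printf("read %ld bytes of realm accounting\n", (long)n);
        close(fd);
        return 0;
}
#endif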
2738
2739 static __initdata unsigned long rhash_entries;
2740 static int __init set_rhash_entries(char *str)
2741 {
2742         if (!str)
2743                 return 0;
2744         rhash_entries = simple_strtoul(str, &str, 0);
2745         return 1;
2746 }
2747 __setup("rhash_entries=", set_rhash_entries);
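
/*
 * [Editorial note -- not part of the original file.]
 * Worked example of the hash sizing done in ip_rt_init() below, assuming
 * 4 KB pages (PAGE_SHIFT = 12), an 8-byte struct rt_hash_bucket and no
 * rhash_entries= boot parameter: with 256 MB of RAM, num_physpages = 65536,
 * so goal = 65536 >> 14 = 4 pages and order = 2.  That gives
 * 4 * 4096 / 8 = 2048 buckets (already a power of two), hence
 * rt_hash_log = 11 and rt_hash_mask = 2047; gc_thresh then starts at 2048
 * and ip_rt_max_size at 32768 entries.  Passing rhash_entries=N on the
 * kernel command line replaces the RAM-based goal with
 * (N * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT pages.
 */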
2748
2749 int __init ip_rt_init(void)
2750 {
2751         int i, order, goal, rc = 0;
2752
2753         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2754                              (jiffies ^ (jiffies >> 7)));
2755
2756 #ifdef CONFIG_NET_CLS_ROUTE
2757         for (order = 0;
2758              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2759                 /* NOTHING */;
2760         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2761         if (!ip_rt_acct)
2762                 panic("IP: failed to allocate ip_rt_acct\n");
2763         memset(ip_rt_acct, 0, PAGE_SIZE << order);
2764 #endif
2765
2766         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2767                                                      sizeof(struct rtable),
2768                                                      0, SLAB_HWCACHE_ALIGN,
2769                                                      NULL, NULL);
2770
2771         if (!ipv4_dst_ops.kmem_cachep)
2772                 panic("IP: failed to allocate ip_dst_cache\n");
2773
2774         goal = num_physpages >> (26 - PAGE_SHIFT);
2775         if (rhash_entries)
2776                 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
2777         for (order = 0; (1UL << order) < goal; order++)
2778                 /* NOTHING */;
2779
2780         do {
2781                 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2782                         sizeof(struct rt_hash_bucket);
2783                 while (rt_hash_mask & (rt_hash_mask - 1))
2784                         rt_hash_mask--;
2785                 rt_hash_table = (struct rt_hash_bucket *)
2786                         __get_free_pages(GFP_ATOMIC, order);
2787         } while (rt_hash_table == NULL && --order > 0);
2788
2789         if (!rt_hash_table)
2790                 panic("Failed to allocate IP route cache hash table\n");
2791
2792         printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2793                rt_hash_mask,
2794                (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2795
2796         for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2797                 /* NOTHING */;
2798
2799         rt_hash_mask--;
2800         for (i = 0; i <= rt_hash_mask; i++) {
2801                 rt_hash_table[i].lock = SPIN_LOCK_UNLOCKED;
2802                 rt_hash_table[i].chain = NULL;
2803         }
2804
2805         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2806         ip_rt_max_size = (rt_hash_mask + 1) * 16;
2807
2808         rt_cache_stat = alloc_percpu(struct rt_cache_stat);
2809         if (!rt_cache_stat)
2810                 return -ENOMEM;
2811
2812         devinet_init();
2813         ip_fib_init();
2814
2815         init_timer(&rt_flush_timer);
2816         rt_flush_timer.function = rt_run_flush;
2817         init_timer(&rt_periodic_timer);
2818         rt_periodic_timer.function = rt_check_expire;
2819         init_timer(&rt_secret_timer);
2820         rt_secret_timer.function = rt_secret_rebuild;
2821
2822         /* All the timers started at system startup tend
2823            to synchronize. Perturb them a bit.
2824          */
2825         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2826                                         ip_rt_gc_interval;
2827         add_timer(&rt_periodic_timer);
2828
2829         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2830                 ip_rt_secret_interval;
2831         add_timer(&rt_secret_timer);
2832
2833 #ifdef CONFIG_PROC_FS
2834         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2835             !proc_net_fops_create("rt_cache_stat", S_IRUGO, &rt_cpu_seq_fops)) {
2836                 free_percpu(rt_cache_stat);
2837                 return -ENOMEM;
2838         }
2839
2840 #ifdef CONFIG_NET_CLS_ROUTE
2841         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
2842 #endif
2843 #endif
2844 #ifdef CONFIG_XFRM
2845         xfrm_init();
2846         xfrm4_init();
2847 #endif
2848         return rc;
2849 }
2850
2851 EXPORT_SYMBOL(__ip_select_ident);
2852 EXPORT_SYMBOL(ip_route_input);
2853 EXPORT_SYMBOL(ip_route_output_key);