net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

int ip_rt_min_delay             = 2 * HZ;
int ip_rt_max_delay             = 10 * HZ;
int ip_rt_max_size;
int ip_rt_gc_timeout            = RT_GC_TIMEOUT;
int ip_rt_gc_interval           = 60 * HZ;
int ip_rt_gc_min_interval       = HZ / 2;
int ip_rt_redirect_number       = 9;
int ip_rt_redirect_load         = HZ / 50;
int ip_rt_redirect_silence      = ((HZ / 50) << (9 + 1));
int ip_rt_error_cost            = HZ;
int ip_rt_error_burst           = 5 * HZ;
int ip_rt_gc_elasticity         = 8;
int ip_rt_mtu_expires           = 10 * 60 * HZ;
int ip_rt_min_pmtu              = 512 + 20 + 20;
int ip_rt_min_advmss            = 256;
int ip_rt_secret_interval       = 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
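
/*
 * Illustrative sketch (hypothetical helper, not part of this file):
 * the four IPv4 TOS bits, shifted down by one, index ip_tos2prio to
 * pick a queueing priority; the real helper for this lives in
 * <net/route.h>.
 */
static inline __u8 example_tos2prio(__u8 tos)
{
        /* IPTOS_TOS() keeps only the four TOS bits (mask 0x1E);
         * e.g. tos 0x10 (low delay) maps to index 8, TC_PRIO_INTERACTIVE.
         */
        return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}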


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
        struct rtable   *chain;
        spinlock_t      lock;
} __attribute__((__aligned__(8)));

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
        return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
                & rt_hash_mask);
}
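
/*
 * A minimal sketch (hypothetical helper, not part of this file) of the
 * lockless read side described above: walk one bucket chain under
 * rcu_read_lock() and take a reference with an atomic dst_hold()
 * before leaving the read-side critical section; only writers take
 * the per-bucket spinlock.
 */
static inline struct rtable *example_rt_find(u32 daddr, u32 saddr, u8 tos)
{
        unsigned hash = rt_hash_code(daddr, saddr, tos);
        struct rtable *rth;

        rcu_read_lock();
        for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
                smp_read_barrier_depends();     /* pairs with writer's smp_wmb() */
                if (rth->fl.fl4_dst == daddr &&
                    rth->fl.fl4_src == saddr &&
                    rth->fl.fl4_tos == tos) {
                        dst_hold(&rth->u.dst);  /* atomic refcount increment */
                        break;
                }
        }
        rcu_read_unlock();
        return rth;
}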

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rtable *r = NULL;
        struct rt_cache_iter_state *st = seq->private;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                rcu_read_lock();
                r = rt_hash_table[st->bucket].chain;
                if (r)
                        break;
                rcu_read_unlock();
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
        struct rt_cache_iter_state *st = seq->private;

        smp_read_barrier_depends();
        r = r->u.rt_next;
        while (!r) {
                rcu_read_unlock();
                if (--st->bucket < 0)
                        break;
                rcu_read_lock();
                r = rt_hash_table[st->bucket].chain;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r = NULL;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                char temp[256];

                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst);
                seq_printf(seq, "%-127s\n", temp);
        }
        return 0;
}

static struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;
        rc = seq_open(file, &rt_cache_seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
        memset(s, 0, sizeof(*s));
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos + 1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}
static struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
        call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
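
/*
 * Worked example of the scoring above (illustrative numbers): an
 * unreferenced output route (fl.iif == 0) with an expiry set, last
 * used 100 jiffies ago, gets
 *
 *      score = (~100 & ~(3 << 30)) | (1 << 31) | (1 << 30)
 *
 * i.e. both high bits set, with the low 30 bits shrinking as the entry
 * ages; rt_intern_hash() below evicts the lowest-scoring unreferenced
 * entry in an overlong chain, so old, unremarkable routes go first.
 */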

/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
        static int rover;
        int i = rover, t;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;

        for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
             t -= ip_rt_gc_timeout) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                spin_lock(&rt_hash_table[i].lock);
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
                        *rthp = rth->u.rt_next;
                        rt_free(rth);
                }
                spin_unlock(&rt_hash_table[i].lock);

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                spin_lock_bh(&rt_hash_table[i].lock);
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                spin_unlock_bh(&rt_hash_table[i].lock);

                for (; rth; rth = next) {
                        next = rth->u.rt_next;
                        rt_free(rth);
                }
        }
}

static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached, prolong the timer
                   to "delay"; otherwise fire it at the deadline time.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   stays approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */
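
/*
 * Paraphrase of how "expire" adapts in rt_garbage_collect() below
 * (a restatement of the real control flow, not new behaviour):
 *
 *      goal missed after a full scan:  expire >>= 1;
 *      goal met:                       expire += ip_rt_gc_min_interval,
 *                                      clamped to ip_rt_gc_timeout
 *
 * so sustained pressure drives the effective timeout down
 * exponentially, and an idle cache lets it creep back up linearly.
 */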

static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * so do not run it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate the number of entries which we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache
                 * really aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(&rt_hash_table[k].lock);
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                goal--;
                        }
                        spin_unlock_bh(&rt_hash_table[k].lock);
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* Goal is not achieved. We stop the process if:

                   - expire has been reduced to zero; otherwise expire is halved.
                   - the table is not full.
                   - we are called from interrupt.
                   - the jiffies check is just a fallback/debug loop breaker.
                     We will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %lu %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %lu %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
               fl1->oif     == fl2->oif &&
               fl1->iif     == fl2->iif;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(&rt_hash_table[hash].lock);
        while ((rth = *rthp) != NULL) {
                if (compare_keys(&rth->fl, &rt->fl)) {
                        /* Put it first */
                        *rthp = rth->u.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        smp_wmb();
                        rth->u.rt_next = rt_hash_table[hash].chain;
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        smp_wmb();
                        rt_hash_table[hash].chain = rth;

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when it is exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind the route to arp only if it is an output
           route or a unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* The neighbour tables are full and nothing
                           can be released. Try to shrink the route cache;
                           most likely it holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        spin_unlock_bh(&rt_hash_table[hash].lock);
        *rp = rt;
        return 0;
}
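
/*
 * Typical caller pattern (as used by ip_route_input_mc() later in this
 * file): mix the interface index into the source key, hash, then let
 * rt_intern_hash() either insert the new entry or hand back an
 * equivalent cached one through the result pointer:
 *
 *      hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
 *      err  = rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);
 */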

void rt_bind_peer(struct rtable *rt, int create)
{
        static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.
 * However, we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance
 * of selecting an ID that stays unique for a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If a peer is attached to the destination, it is never
                   detached, so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer, more));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        spin_lock_bh(&rt_hash_table[hash].lock);
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.rt_next;
                        rt_free(rt);
                        break;
                }
        spin_unlock_bh(&rt_hash_table[hash].lock);
}

void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, u8 tos, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };

        tos &= IPTOS_RT_MASK;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5),
                                                     tos);

                        rthp=&rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = *rthp) != NULL) {
                                struct rtable *rt;

                                smp_read_barrier_depends();
                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.fl4_tos != tos ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
                        "tos %02x\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable*)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash_code(rt->fl.fl4_dst,
                                                     rt->fl.fl4_src ^
                                                        (rt->fl.oif << 5),
                                                     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
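
/*
 * Worked example of the backoff above, using the defaults from the top
 * of this file (ip_rt_redirect_number = 9, ip_rt_redirect_load = HZ/50,
 * ip_rt_redirect_silence = (HZ/50) << 10): the k-th redirect is sent
 * only (HZ/50) << k jiffies after the previous one, so with HZ=1000
 * redirects are spaced ~20ms, 40ms, 80ms, ... and stop after 9 of
 * them; about 20s of silence resets rate_tokens and re-arms the
 * algorithm.
 */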

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
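
/*
 * Examples: guess_mtu(1500) returns 1492 (the next plateau below
 * 1500), guess_mtu(576) returns 296, and anything at or below 128
 * falls through to the 68-byte IPv4 minimum.
 */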

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
        int i;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        u32  skeys[2] = { iph->saddr, 0, };
        u32  daddr = iph->daddr;
        u8   tos = iph->tos & IPTOS_RT_MASK;
        unsigned short est_mtu = 0;

        if (ipv4_config.no_pmtu_disc)
                return 0;

        for (i = 0; i < 2; i++) {
                unsigned hash = rt_hash_code(daddr, skeys[i], tos);

                rcu_read_lock();
                for (rth = rt_hash_table[hash].chain; rth;
                     rth = rth->u.rt_next) {
                        smp_read_barrier_depends();
                        if (rth->fl.fl4_dst == daddr &&
                            rth->fl.fl4_src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
                            rth->fl.fl4_tos == tos &&
                            rth->fl.iif == 0 &&
                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
                                unsigned short mtu = new_mtu;

                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
                                        if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
                                                dst_confirm(&rth->u.dst);
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                        ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                }
                rcu_read_unlock();
        }
        return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
        if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
            !(dst_metric_locked(dst, RTAX_MTU))) {
                if (mtu < ip_rt_min_pmtu) {
                        mtu = ip_rt_min_pmtu;
                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
                }
                dst->metrics[RTAX_MTU-1] = mtu;
                dst_set_expires(dst, ip_rt_mtu_expires);
        }
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        dst_release(dst);
        return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;
        struct inet_peer *peer = rt->peer;
        struct in_device *idev = rt->idev;

        if (peer) {
                rt->peer = NULL;
                inet_putpeer(peer);
        }

        if (idev) {
                rt->idev = NULL;
                in_dev_put(idev);
        }
}

static void ipv4_dst_ifdown(struct dst_entry *dst, int how)
{
        struct rtable *rt = (struct rtable *) dst;
        struct in_device *idev = rt->idev;
        if (idev) {
                rt->idev = NULL;
                in_dev_put(idev);
        }
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = (struct rtable *) skb->dst;
        if (rt)
                dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;

        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
                NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
                skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
        u32 src;
        struct fib_result res;

        if (rt->fl.iif == 0)
                src = rt->rt_src;
        else if (fib_lookup(&rt->fl, &res) == 0) {
#ifdef CONFIG_IP_ROUTE_NAT
                if (res.type == RTN_NAT)
                        src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
                                                RT_SCOPE_UNIVERSE);
                else
#endif
                        src = FIB_RES_PREFSRC(res);
                fib_res_put(&res);
        } else
                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
                                        RT_SCOPE_UNIVERSE);
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->u.dst.tclassid & 0xFFFF))
                rt->u.dst.tclassid |= tag & 0xFFFF;
        if (!(rt->u.dst.tclassid & 0xFFFF0000))
                rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
        struct fib_info *fi = res->fi;

        if (fi) {
                if (FIB_RES_GW(*res) &&
                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
                        rt->rt_gateway = FIB_RES_GW(*res);
                memcpy(rt->u.dst.metrics, fi->fib_metrics,
                       sizeof(rt->u.dst.metrics));
                if (fi->fib_mtu == 0) {
                        rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
                        if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
                            rt->rt_gateway != rt->rt_dst &&
                            rt->u.dst.dev->mtu > 576)
                                rt->u.dst.metrics[RTAX_MTU-1] = 576;
                }
#ifdef CONFIG_NET_CLS_ROUTE
                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
        } else
                rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

        if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
        if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
                rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
        if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
                rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
                                       ip_rt_min_advmss);
        if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
                rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, fib_rules_tclass(res));
#endif
        set_class_tag(rt, itag);
#endif
        rt->rt_type = res->type;
}

static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        unsigned hash;
        struct rtable *rth;
        u32 spec_dst;
        struct in_device *in_dev = in_dev_get(dev);
        u32 itag = 0;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ZERONET(saddr)) {
                if (!LOCAL_MCAST(daddr))
                        goto e_inval;
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
        } else if (fib_validate_source(saddr, 0, tos, 0,
                                        dev, &spec_dst, &itag) < 0)
                goto e_inval;

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        rth->u.dst.output= ip_rt_bug;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags= DST_HOST;
        if (in_dev->cnf.no_policy)
                rth->u.dst.flags |= DST_NOPOLICY;
        rth->fl.fl4_dst = daddr;
        rth->rt_dst     = daddr;
        rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->fl.fl4_fwmark= skb->nfmark;
#endif
        rth->fl.fl4_src = saddr;
        rth->rt_src     = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = daddr;
        rth->rt_src_map = saddr;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
        rth->u.dst.tclassid = itag;
#endif
        rth->rt_iif     =
        rth->fl.iif     = dev->ifindex;
        rth->u.dst.dev  = &loopback_dev;
        dev_hold(rth->u.dst.dev);
        rth->idev       = in_dev_get(rth->u.dst.dev);
        rth->fl.oif     = 0;
        rth->rt_gateway = daddr;
        rth->rt_spec_dst= spec_dst;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_flags   = RTCF_MULTICAST;
        if (our) {
                rth->u.dst.input= ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->u.dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        in_dev_put(in_dev);
        hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
        return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
        in_dev_put(in_dev);
        return -ENOBUFS;

e_inval:
        in_dev_put(in_dev);
        return -EINVAL;
}
1540
1541 /*
1542  *      NOTE. We drop all packets that have a local source
1543  *      address, because every properly looped-back packet
1544  *      must already have the correct destination attached by the output routine.
1545  *
1546  *      This approach solves two big problems:
1547  *      1. Non-simplex devices are handled properly.
1548  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1549  */
1550
1551 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1552                         u8 tos, struct net_device *dev)
1553 {
1554         struct fib_result res;
1555         struct in_device *in_dev = in_dev_get(dev);
1556         struct in_device *out_dev = NULL;
1557         struct flowi fl = { .nl_u = { .ip4_u =
1558                                       { .daddr = daddr,
1559                                         .saddr = saddr,
1560                                         .tos = tos,
1561                                         .scope = RT_SCOPE_UNIVERSE,
1562 #ifdef CONFIG_IP_ROUTE_FWMARK
1563                                         .fwmark = skb->nfmark
1564 #endif
1565                                       } },
1566                             .iif = dev->ifindex };
1567         unsigned        flags = 0;
1568         u32             itag = 0;
1569         struct rtable * rth;
1570         unsigned        hash;
1571         u32             spec_dst;
1572         int             err = -EINVAL;
1573         int             free_res = 0;
1574
1575         /* IP on this device is disabled. */
1576
1577         if (!in_dev)
1578                 goto out;
1579
1580         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
1581
1582         /* Check for the most weird martians, which cannot be detected
1583            by fib_lookup.
1584          */
1585
1586         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1587                 goto martian_source;
1588
1589         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1590                 goto brd_input;
1591
1592         /* Accept zero addresses only to limited broadcast;
1593          * I am not even sure whether to fix this or not. Waiting for complaints :-)
1594          */
1595         if (ZERONET(saddr))
1596                 goto martian_source;
1597
1598         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1599                 goto martian_destination;
1600
1601         /*
1602          *      Now we are ready to route the packet.
1603          */
1604         if ((err = fib_lookup(&fl, &res)) != 0) {
1605                 if (!IN_DEV_FORWARD(in_dev))
1606                         goto e_inval;
1607                 goto no_route;
1608         }
1609         free_res = 1;
1610
1611         RT_CACHE_STAT_INC(in_slow_tot);
1612
1613 #ifdef CONFIG_IP_ROUTE_NAT
1614         /* Policy is applied before the destination is mapped,
1615            but rerouting after the mapping must use the old source.
1616          */
1617
1618         if (1) {
1619                 u32 src_map = saddr;
1620                 if (res.r)
1621                         src_map = fib_rules_policy(saddr, &res, &flags);
1622
1623                 if (res.type == RTN_NAT) {
1624                         fl.fl4_dst = fib_rules_map_destination(daddr, &res);
1625                         fib_res_put(&res);
1626                         free_res = 0;
1627                         if (fib_lookup(&fl, &res))
1628                                 goto e_inval;
1629                         free_res = 1;
1630                         if (res.type != RTN_UNICAST)
1631                                 goto e_inval;
1632                         flags |= RTCF_DNAT;
1633                 }
1634                 fl.fl4_src = src_map;
1635         }
1636 #endif
1637
1638         if (res.type == RTN_BROADCAST)
1639                 goto brd_input;
1640
1641         if (res.type == RTN_LOCAL) {
1642                 int result;
1643                 result = fib_validate_source(saddr, daddr, tos,
1644                                              loopback_dev.ifindex,
1645                                              dev, &spec_dst, &itag);
1646                 if (result < 0)
1647                         goto martian_source;
1648                 if (result)
1649                         flags |= RTCF_DIRECTSRC;
1650                 spec_dst = daddr;
1651                 goto local_input;
1652         }
1653
1654         if (!IN_DEV_FORWARD(in_dev))
1655                 goto e_inval;
1656         if (res.type != RTN_UNICAST)
1657                 goto martian_destination;
1658
1659 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1660         if (res.fi->fib_nhs > 1 && fl.oif == 0)
1661                 fib_select_multipath(&fl, &res);
1662 #endif
1663         out_dev = in_dev_get(FIB_RES_DEV(res));
1664         if (out_dev == NULL) {
1665                 if (net_ratelimit())
1666                         printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1667                                          "Please, report\n");
1668                 goto e_inval;
1669         }
1670
1671         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1672                                   &spec_dst, &itag);
1673         if (err < 0)
1674                 goto martian_source;
1675
1676         if (err)
1677                 flags |= RTCF_DIRECTSRC;
1678
1679         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1680             (IN_DEV_SHARED_MEDIA(out_dev) ||
1681              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1682                 flags |= RTCF_DOREDIRECT;
1683
1684         if (skb->protocol != htons(ETH_P_IP)) {
1685                 /* Not IP (i.e. ARP). Do not create a route if it is
1686                  * invalid for proxy ARP. DNAT routes are always valid.
1687                  */
1688                 if (out_dev == in_dev && !(flags & RTCF_DNAT))
1689                         goto e_inval;
1690         }
1691
1692         rth = dst_alloc(&ipv4_dst_ops);
1693         if (!rth)
1694                 goto e_nobufs;
1695
1696         atomic_set(&rth->u.dst.__refcnt, 1);
1697         rth->u.dst.flags= DST_HOST;
1698         if (in_dev->cnf.no_policy)
1699                 rth->u.dst.flags |= DST_NOPOLICY;
1700         if (in_dev->cnf.no_xfrm)
1701                 rth->u.dst.flags |= DST_NOXFRM;
1702         rth->fl.fl4_dst = daddr;
1703         rth->rt_dst     = daddr;
1704         rth->fl.fl4_tos = tos;
1705 #ifdef CONFIG_IP_ROUTE_FWMARK
1706         rth->fl.fl4_fwmark= skb->nfmark;
1707 #endif
1708         rth->fl.fl4_src = saddr;
1709         rth->rt_src     = saddr;
1710         rth->rt_gateway = daddr;
1711 #ifdef CONFIG_IP_ROUTE_NAT
1712         rth->rt_src_map = fl.fl4_src;
1713         rth->rt_dst_map = fl.fl4_dst;
1714         if (flags&RTCF_DNAT)
1715                 rth->rt_gateway = fl.fl4_dst;
1716 #endif
1717         rth->rt_iif     =
1718         rth->fl.iif     = dev->ifindex;
1719         rth->u.dst.dev  = out_dev->dev;
1720         dev_hold(rth->u.dst.dev);
1721         rth->idev       = in_dev_get(rth->u.dst.dev);
1722         rth->fl.oif     = 0;
1723         rth->rt_spec_dst= spec_dst;
1724
1725         rth->u.dst.input = ip_forward;
1726         rth->u.dst.output = ip_output;
1727
1728         rt_set_nexthop(rth, &res, itag);
1729
1730         rth->rt_flags = flags;
1731
1732 #ifdef CONFIG_NET_FASTROUTE
1733         if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1734                 struct net_device *odev = rth->u.dst.dev;
1735                 if (odev != dev &&
1736                     dev->accept_fastpath &&
1737                     odev->mtu >= dev->mtu &&
1738                     dev->accept_fastpath(dev, &rth->u.dst) == 0)
1739                         rth->rt_flags |= RTCF_FAST;
1740         }
1741 #endif
1742
1743 intern:
1744         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1745 done:
1746         in_dev_put(in_dev);
1747         if (out_dev)
1748                 in_dev_put(out_dev);
1749         if (free_res)
1750                 fib_res_put(&res);
1751 out:    return err;
1752
1753 brd_input:
1754         if (skb->protocol != htons(ETH_P_IP))
1755                 goto e_inval;
1756
1757         if (ZERONET(saddr))
1758                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1759         else {
1760                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1761                                           &itag);
1762                 if (err < 0)
1763                         goto martian_source;
1764                 if (err)
1765                         flags |= RTCF_DIRECTSRC;
1766         }
1767         flags |= RTCF_BROADCAST;
1768         res.type = RTN_BROADCAST;
1769         RT_CACHE_STAT_INC(in_brd);
1770
1771 local_input:
1772         rth = dst_alloc(&ipv4_dst_ops);
1773         if (!rth)
1774                 goto e_nobufs;
1775
1776         rth->u.dst.output= ip_rt_bug;
1777
1778         atomic_set(&rth->u.dst.__refcnt, 1);
1779         rth->u.dst.flags= DST_HOST;
1780         if (in_dev->cnf.no_policy)
1781                 rth->u.dst.flags |= DST_NOPOLICY;
1782         rth->fl.fl4_dst = daddr;
1783         rth->rt_dst     = daddr;
1784         rth->fl.fl4_tos = tos;
1785 #ifdef CONFIG_IP_ROUTE_FWMARK
1786         rth->fl.fl4_fwmark= skb->nfmark;
1787 #endif
1788         rth->fl.fl4_src = saddr;
1789         rth->rt_src     = saddr;
1790 #ifdef CONFIG_IP_ROUTE_NAT
1791         rth->rt_dst_map = fl.fl4_dst;
1792         rth->rt_src_map = fl.fl4_src;
1793 #endif
1794 #ifdef CONFIG_NET_CLS_ROUTE
1795         rth->u.dst.tclassid = itag;
1796 #endif
1797         rth->rt_iif     =
1798         rth->fl.iif     = dev->ifindex;
1799         rth->u.dst.dev  = &loopback_dev;
1800         dev_hold(rth->u.dst.dev);
1801         rth->idev       = in_dev_get(rth->u.dst.dev);
1802         rth->rt_gateway = daddr;
1803         rth->rt_spec_dst= spec_dst;
1804         rth->u.dst.input= ip_local_deliver;
1805         rth->rt_flags   = flags|RTCF_LOCAL;
1806         if (res.type == RTN_UNREACHABLE) {
1807                 rth->u.dst.input= ip_error;
1808                 rth->u.dst.error= -err;
1809                 rth->rt_flags   &= ~RTCF_LOCAL;
1810         }
1811         rth->rt_type    = res.type;
1812         goto intern;
1813
1814 no_route:
1815         RT_CACHE_STAT_INC(in_no_route);
1816         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1817         res.type = RTN_UNREACHABLE;
1818         goto local_input;
1819
1820         /*
1821          *      Do not cache martian addresses: they should be logged (RFC1812)
1822          */
1823 martian_destination:
1824         RT_CACHE_STAT_INC(in_martian_dst);
1825 #ifdef CONFIG_IP_ROUTE_VERBOSE
1826         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1827                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1828                         "%u.%u.%u.%u, dev %s\n",
1829                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1830 #endif
1831 e_inval:
1832         err = -EINVAL;
1833         goto done;
1834
1835 e_nobufs:
1836         err = -ENOBUFS;
1837         goto done;
1838
1839 martian_source:
1840
1841         RT_CACHE_STAT_INC(in_martian_src);
1842 #ifdef CONFIG_IP_ROUTE_VERBOSE
1843         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1844                 /*
1845                  *      RFC1812 recommendation: if the source is martian,
1846                  *      the only hint is the MAC header.
1847                  */
1848                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1849                         "%u.%u.%u.%u, on dev %s\n",
1850                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1851                 if (dev->hard_header_len) {
1852                         int i;
1853                         unsigned char *p = skb->mac.raw;
1854                         printk(KERN_WARNING "ll header: ");
1855                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1856                                 printk("%02x", *p);
1857                                 if (i < (dev->hard_header_len - 1))
1858                                         printk(":");
1859                         }
1860                         printk("\n");
1861                 }
1862         }
1863 #endif
1864         goto e_inval;
1865 }
1866
1867 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1868                    u8 tos, struct net_device *dev)
1869 {
1870         struct rtable * rth;
1871         unsigned        hash;
1872         int iif = dev->ifindex;
1873
1874         tos &= IPTOS_RT_MASK;
1875         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1876
1877         rcu_read_lock();
1878         for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1879                 smp_read_barrier_depends();
1880                 if (rth->fl.fl4_dst == daddr &&
1881                     rth->fl.fl4_src == saddr &&
1882                     rth->fl.iif == iif &&
1883                     rth->fl.oif == 0 &&
1884 #ifdef CONFIG_IP_ROUTE_FWMARK
1885                     rth->fl.fl4_fwmark == skb->nfmark &&
1886 #endif
1887                     rth->fl.fl4_tos == tos) {
1888                         rth->u.dst.lastuse = jiffies;
1889                         dst_hold(&rth->u.dst);
1890                         rth->u.dst.__use++;
1891                         RT_CACHE_STAT_INC(in_hit);
1892                         rcu_read_unlock();
1893                         skb->dst = (struct dst_entry*)rth;
1894                         return 0;
1895                 }
1896                 RT_CACHE_STAT_INC(in_hlist_search);
1897         }
1898         rcu_read_unlock();
1899
1900         /* Multicast recognition logic was moved from the route cache to here.
1901            The problem was that too many Ethernet cards have broken/missing
1902            hardware multicast filters :-( As a result, a host on a multicast
1903            network acquires a lot of useless route cache entries, e.g. for
1904            SDR messages from all over the world. Now we try to get rid of them.
1905            Really, provided the software IP multicast filter is organized
1906            reasonably (at least, hashed), this does not slow things down
1907            compared with route cache reject entries.
1908            Note that multicast routers are not affected, because a
1909            route cache entry is created for them eventually.
1910          */
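        /* Editor's example: a host that has joined 224.0.0.251 (mDNS) on
           this device gets a cache entry via ip_route_input_mc() below,
           while traffic to an unjoined group fails ip_check_mc() and is
           rejected with -EINVAL without polluting the cache. */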
1911         if (MULTICAST(daddr)) {
1912                 struct in_device *in_dev;
1913
1914                 read_lock(&inetdev_lock);
1915                 if ((in_dev = __in_dev_get(dev)) != NULL) {
1916                         int our = ip_check_mc(in_dev, daddr, saddr,
1917                                 skb->nh.iph->protocol);
1918                         if (our
1919 #ifdef CONFIG_IP_MROUTE
1920                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1921 #endif
1922                             ) {
1923                                 read_unlock(&inetdev_lock);
1924                                 return ip_route_input_mc(skb, daddr, saddr,
1925                                                          tos, dev, our);
1926                         }
1927                 }
1928                 read_unlock(&inetdev_lock);
1929                 return -EINVAL;
1930         }
1931         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1932 }
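
/* Editor's note: a sketch of the expected caller, roughly what the input
 * path (ip_rcv_finish() in net/ipv4/ip_input.c) does once the header has
 * been validated:
 *
 *	if (skb->dst == NULL &&
 *	    ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
 *		goto drop;	// no route: free the skb
 *
 * On success the chosen dst is attached to skb->dst, and dst->input()
 * (ip_local_deliver, ip_forward, ...) finishes the job.
 */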
1933
1934 /*
1935  * Major route resolver routine.
1936  */
1937
1938 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
1939 {
1940         u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
1941         struct flowi fl = { .nl_u = { .ip4_u =
1942                                       { .daddr = oldflp->fl4_dst,
1943                                         .saddr = oldflp->fl4_src,
1944                                         .tos = tos & IPTOS_RT_MASK,
1945                                         .scope = ((tos & RTO_ONLINK) ?
1946                                                   RT_SCOPE_LINK :
1947                                                   RT_SCOPE_UNIVERSE),
1948 #ifdef CONFIG_IP_ROUTE_FWMARK
1949                                         .fwmark = oldflp->fl4_fwmark
1950 #endif
1951                                       } },
1952                             .iif = loopback_dev.ifindex,
1953                             .oif = oldflp->oif };
1954         struct fib_result res;
1955         unsigned flags = 0;
1956         struct rtable *rth;
1957         struct net_device *dev_out = NULL;
1958         struct in_device *in_dev = NULL;
1959         unsigned hash;
1960         int free_res = 0;
1961         int err;
1962
1963         res.fi          = NULL;
1964 #ifdef CONFIG_IP_MULTIPLE_TABLES
1965         res.r           = NULL;
1966 #endif
1967
1968         if (oldflp->fl4_src) {
1969                 err = -EINVAL;
1970                 if (MULTICAST(oldflp->fl4_src) ||
1971                     BADCLASS(oldflp->fl4_src) ||
1972                     ZERONET(oldflp->fl4_src))
1973                         goto out;
1974
1975                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1976                 dev_out = ip_dev_find(oldflp->fl4_src);
1977                 if (dev_out == NULL)
1978                         goto out;
1979
1980                 /* I removed the check for oif == dev_out->oif here.
1981                    It was wrong for two reasons:
1982                    1. ip_dev_find(saddr) can return the wrong iface, if saddr
1983                       is assigned to multiple interfaces.
1984                    2. Moreover, we are allowed to send packets with the saddr
1985                       of another iface. --ANK
1986                  */
1987
1988                 if (oldflp->oif == 0
1989                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
1990                         /* Special hack: the user can direct multicasts
1991                            and limited broadcast via the necessary interface
1992                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1993                            This hack is not just for fun, it allows
1994                            vic, vat and friends to work.
1995                            They bind a socket to loopback, set ttl to zero
1996                            and expect that it will work.
1997                            From the viewpoint of the routing cache they are broken,
1998                            because we are not allowed to build a multicast path
1999                            with a loopback source addr (look, the routing cache
2000                            cannot know that ttl is zero, so the packet
2001                            will not leave this host and the route looks valid).
2002                            Luckily, this hack is a good workaround.
2003                          */
2004
2005                         fl.oif = dev_out->ifindex;
2006                         goto make_route;
2007                 }
2008                 if (dev_out)
2009                         dev_put(dev_out);
2010                 dev_out = NULL;
2011         }
2012         if (oldflp->oif) {
2013                 dev_out = dev_get_by_index(oldflp->oif);
2014                 err = -ENODEV;
2015                 if (dev_out == NULL)
2016                         goto out;
2017                 if (__in_dev_get(dev_out) == NULL) {
2018                         dev_put(dev_out);
2019                         goto out;       /* Wrong error code */
2020                 }
2021
2022                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2023                         if (!fl.fl4_src)
2024                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2025                                                               RT_SCOPE_LINK);
2026                         goto make_route;
2027                 }
2028                 if (!fl.fl4_src) {
2029                         if (MULTICAST(oldflp->fl4_dst))
2030                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2031                                                               fl.fl4_scope);
2032                         else if (!oldflp->fl4_dst)
2033                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2034                                                               RT_SCOPE_HOST);
2035                 }
2036         }
2037
2038         if (!fl.fl4_dst) {
2039                 fl.fl4_dst = fl.fl4_src;
2040                 if (!fl.fl4_dst)
2041                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2042                 if (dev_out)
2043                         dev_put(dev_out);
2044                 dev_out = &loopback_dev;
2045                 dev_hold(dev_out);
2046                 fl.oif = loopback_dev.ifindex;
2047                 res.type = RTN_LOCAL;
2048                 flags |= RTCF_LOCAL;
2049                 goto make_route;
2050         }
2051
2052         if (fib_lookup(&fl, &res)) {
2053                 res.fi = NULL;
2054                 if (oldflp->oif) {
2055                         /* Apparently, the routing tables are wrong. Assume
2056                            that the destination is on-link.
2057
2058                            WHY? DW.
2059                            Because we are allowed to send to an iface
2060                            even if it has NO routes and NO assigned
2061                            addresses. When oif is specified, the routing
2062                            tables are looked up with only one purpose:
2063                            to catch whether the destination is gatewayed rather
2064                            than direct. Moreover, if MSG_DONTROUTE is set,
2065                            we send the packet, ignoring both the routing tables
2066                            and the ifaddr state. --ANK
2067
2068
2069                            We could do the same even when oif is unknown,
2070                            as IPv6 likely does, but we do not.
2071                          */
2072
2073                         if (fl.fl4_src == 0)
2074                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2075                                                               RT_SCOPE_LINK);
2076                         res.type = RTN_UNICAST;
2077                         goto make_route;
2078                 }
2079                 if (dev_out)
2080                         dev_put(dev_out);
2081                 err = -ENETUNREACH;
2082                 goto out;
2083         }
2084         free_res = 1;
2085
2086         if (res.type == RTN_NAT)
2087                 goto e_inval;
2088
2089         if (res.type == RTN_LOCAL) {
2090                 if (!fl.fl4_src)
2091                         fl.fl4_src = fl.fl4_dst;
2092                 if (dev_out)
2093                         dev_put(dev_out);
2094                 dev_out = &loopback_dev;
2095                 dev_hold(dev_out);
2096                 fl.oif = dev_out->ifindex;
2097                 if (res.fi)
2098                         fib_info_put(res.fi);
2099                 res.fi = NULL;
2100                 flags |= RTCF_LOCAL;
2101                 goto make_route;
2102         }
2103
2104 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2105         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2106                 fib_select_multipath(&fl, &res);
2107         else
2108 #endif
2109         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2110                 fib_select_default(&fl, &res);
2111
2112         if (!fl.fl4_src)
2113                 fl.fl4_src = FIB_RES_PREFSRC(res);
2114
2115         if (dev_out)
2116                 dev_put(dev_out);
2117         dev_out = FIB_RES_DEV(res);
2118         dev_hold(dev_out);
2119         fl.oif = dev_out->ifindex;
2120
2121 make_route:
2122         if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2123                 goto e_inval;
2124
2125         if (fl.fl4_dst == 0xFFFFFFFF)
2126                 res.type = RTN_BROADCAST;
2127         else if (MULTICAST(fl.fl4_dst))
2128                 res.type = RTN_MULTICAST;
2129         else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
2130                 goto e_inval;
2131
2132         if (dev_out->flags & IFF_LOOPBACK)
2133                 flags |= RTCF_LOCAL;
2134
2135         in_dev = in_dev_get(dev_out);
2136         if (!in_dev)
2137                 goto e_inval;
2138
2139         if (res.type == RTN_BROADCAST) {
2140                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2141                 if (res.fi) {
2142                         fib_info_put(res.fi);
2143                         res.fi = NULL;
2144                 }
2145         } else if (res.type == RTN_MULTICAST) {
2146                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2147                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
2148                         flags &= ~RTCF_LOCAL;
2149                 /* If a multicast route does not exist, use
2150                    the default one, but do not gateway in this case.
2151                    Yes, it is a hack.
2152                  */
2153                 if (res.fi && res.prefixlen < 4) {
2154                         fib_info_put(res.fi);
2155                         res.fi = NULL;
2156                 }
2157         }
2158
2159         rth = dst_alloc(&ipv4_dst_ops);
2160         if (!rth)
2161                 goto e_nobufs;
2162
2163         atomic_set(&rth->u.dst.__refcnt, 1);
2164         rth->u.dst.flags= DST_HOST;
2165         if (in_dev->cnf.no_xfrm)
2166                 rth->u.dst.flags |= DST_NOXFRM;
2167         if (in_dev->cnf.no_policy)
2168                 rth->u.dst.flags |= DST_NOPOLICY;
2169         rth->fl.fl4_dst = oldflp->fl4_dst;
2170         rth->fl.fl4_tos = tos;
2171         rth->fl.fl4_src = oldflp->fl4_src;
2172         rth->fl.oif     = oldflp->oif;
2173 #ifdef CONFIG_IP_ROUTE_FWMARK
2174         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2175 #endif
2176         rth->rt_dst     = fl.fl4_dst;
2177         rth->rt_src     = fl.fl4_src;
2178 #ifdef CONFIG_IP_ROUTE_NAT
2179         rth->rt_dst_map = fl.fl4_dst;
2180         rth->rt_src_map = fl.fl4_src;
2181 #endif
2182         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2183         rth->u.dst.dev  = dev_out;
2184         dev_hold(dev_out);
2185         rth->idev       = in_dev_get(dev_out);
2186         rth->rt_gateway = fl.fl4_dst;
2187         rth->rt_spec_dst= fl.fl4_src;
2188
2189         rth->u.dst.output=ip_output;
2190
2191         RT_CACHE_STAT_INC(out_slow_tot);
2192
2193         if (flags & RTCF_LOCAL) {
2194                 rth->u.dst.input = ip_local_deliver;
2195                 rth->rt_spec_dst = fl.fl4_dst;
2196         }
2197         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2198                 rth->rt_spec_dst = fl.fl4_src;
2199                 if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2200                         rth->u.dst.output = ip_mc_output;
2201                         RT_CACHE_STAT_INC(out_slow_mc);
2202                 }
2203 #ifdef CONFIG_IP_MROUTE
2204                 if (res.type == RTN_MULTICAST) {
2205                         if (IN_DEV_MFORWARD(in_dev) &&
2206                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2207                                 rth->u.dst.input = ip_mr_input;
2208                                 rth->u.dst.output = ip_mc_output;
2209                         }
2210                 }
2211 #endif
2212         }
2213
2214         rt_set_nexthop(rth, &res, 0);
2215
2216
2217         rth->rt_flags = flags;
2218
2219         hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2220         err = rt_intern_hash(hash, rth, rp);
2221 done:
2222         if (free_res)
2223                 fib_res_put(&res);
2224         if (dev_out)
2225                 dev_put(dev_out);
2226         if (in_dev)
2227                 in_dev_put(in_dev);
2228 out:    return err;
2229
2230 e_inval:
2231         err = -EINVAL;
2232         goto done;
2233 e_nobufs:
2234         err = -ENOBUFS;
2235         goto done;
2236 }
2237
2238 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2239 {
2240         unsigned hash;
2241         struct rtable *rth;
2242
2243         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2244
2245         rcu_read_lock();
2246         for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
2247                 smp_read_barrier_depends();
2248                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2249                     rth->fl.fl4_src == flp->fl4_src &&
2250                     rth->fl.iif == 0 &&
2251                     rth->fl.oif == flp->oif &&
2252 #ifdef CONFIG_IP_ROUTE_FWMARK
2253                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2254 #endif
2255                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2256                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2257                         rth->u.dst.lastuse = jiffies;
2258                         dst_hold(&rth->u.dst);
2259                         rth->u.dst.__use++;
2260                         RT_CACHE_STAT_INC(out_hit);
2261                         rcu_read_unlock();
2262                         *rp = rth;
2263                         return 0;
2264                 }
2265                 RT_CACHE_STAT_INC(out_hlist_search);
2266         }
2267         rcu_read_unlock();
2268
2269         return ip_route_output_slow(rp, flp);
2270 }
2271
2272 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2273 {
2274         int err;
2275
2276         if ((err = __ip_route_output_key(rp, flp)) != 0)
2277                 return err;
2278         return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, NULL, 0) : 0;
2279 }
2280
2281 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2282 {
2283         int err;
2284
2285         if ((err = __ip_route_output_key(rp, flp)) != 0)
2286                 return err;
2287         return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, sk, flags) : 0;
2288 }
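
/* Editor's note: a minimal caller sketch (names are illustrative).  A
 * protocol that wants an output route builds a flow key and lets the
 * cache/resolver pair above do the rest:
 *
 *	struct rtable *rt;
 *	struct flowi fl = { .oif = 0,
 *			    .nl_u = { .ip4_u = { .daddr = daddr,
 *						 .saddr = saddr,
 *						 .tos = RT_TOS(tos) } },
 *			    .proto = IPPROTO_UDP };
 *
 *	if (ip_route_output_flow(&rt, &fl, sk, 0))
 *		return -EHOSTUNREACH;	// or propagate the error
 *	// ... use rt, then ip_rt_put(rt) when done
 */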
2289
2290 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2291                         int nowait)
2292 {
2293         struct rtable *rt = (struct rtable*)skb->dst;
2294         struct rtmsg *r;
2295         struct nlmsghdr  *nlh;
2296         unsigned char    *b = skb->tail;
2297         struct rta_cacheinfo ci;
2298 #ifdef CONFIG_IP_MROUTE
2299         struct rtattr *eptr;
2300 #endif
2301         nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2302         r = NLMSG_DATA(nlh);
2303         nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2304         r->rtm_family    = AF_INET;
2305         r->rtm_dst_len  = 32;
2306         r->rtm_src_len  = 0;
2307         r->rtm_tos      = rt->fl.fl4_tos;
2308         r->rtm_table    = RT_TABLE_MAIN;
2309         r->rtm_type     = rt->rt_type;
2310         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2311         r->rtm_protocol = RTPROT_UNSPEC;
2312         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2313         if (rt->rt_flags & RTCF_NOTIFY)
2314                 r->rtm_flags |= RTM_F_NOTIFY;
2315         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2316         if (rt->fl.fl4_src) {
2317                 r->rtm_src_len = 32;
2318                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2319         }
2320         if (rt->u.dst.dev)
2321                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2322 #ifdef CONFIG_NET_CLS_ROUTE
2323         if (rt->u.dst.tclassid)
2324                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2325 #endif
2326         if (rt->fl.iif)
2327                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2328         else if (rt->rt_src != rt->fl.fl4_src)
2329                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2330         if (rt->rt_dst != rt->rt_gateway)
2331                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2332         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2333                 goto rtattr_failure;
2334         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2335         ci.rta_used     = rt->u.dst.__use;
2336         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2337         if (rt->u.dst.expires)
2338                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2339         else
2340                 ci.rta_expires = 0;
2341         ci.rta_error    = rt->u.dst.error;
2342         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2343         if (rt->peer) {
2344                 ci.rta_id = rt->peer->ip_id_count;
2345                 if (rt->peer->tcp_ts_stamp) {
2346                         ci.rta_ts = rt->peer->tcp_ts;
2347                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2348                 }
2349         }
2350 #ifdef CONFIG_IP_MROUTE
2351         eptr = (struct rtattr*)skb->tail;
2352 #endif
2353         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2354         if (rt->fl.iif) {
2355 #ifdef CONFIG_IP_MROUTE
2356                 u32 dst = rt->rt_dst;
2357
2358                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2359                     ipv4_devconf.mc_forwarding) {
2360                         int err = ipmr_get_route(skb, r, nowait);
2361                         if (err <= 0) {
2362                                 if (!nowait) {
2363                                         if (err == 0)
2364                                                 return 0;
2365                                         goto nlmsg_failure;
2366                                 } else {
2367                                         if (err == -EMSGSIZE)
2368                                                 goto nlmsg_failure;
2369                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2370                                 }
2371                         }
2372                 } else
2373 #endif
2374                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2375         }
2376
2377         nlh->nlmsg_len = skb->tail - b;
2378         return skb->len;
2379
2380 nlmsg_failure:
2381 rtattr_failure:
2382         skb_trim(skb, b - skb->data);
2383         return -1;
2384 }
2385
2386 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2387 {
2388         struct rtattr **rta = arg;
2389         struct rtmsg *rtm = NLMSG_DATA(nlh);
2390         struct rtable *rt = NULL;
2391         u32 dst = 0;
2392         u32 src = 0;
2393         int iif = 0;
2394         int err = -ENOBUFS;
2395         struct sk_buff *skb;
2396
2397         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2398         if (!skb)
2399                 goto out;
2400
2401         /* Reserve room for dummy headers; this skb can pass
2402            through a good chunk of the routing engine.
2403          */
2404         skb->mac.raw = skb->data;
2405         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2406
2407         if (rta[RTA_SRC - 1])
2408                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2409         if (rta[RTA_DST - 1])
2410                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2411         if (rta[RTA_IIF - 1])
2412                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2413
2414         if (iif) {
2415                 struct net_device *dev = __dev_get_by_index(iif);
2416                 err = -ENODEV;
2417                 if (!dev)
2418                         goto out_free;
2419                 skb->protocol   = htons(ETH_P_IP);
2420                 skb->dev        = dev;
2421                 local_bh_disable();
2422                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2423                 local_bh_enable();
2424                 rt = (struct rtable*)skb->dst;
2425                 if (!err && rt->u.dst.error)
2426                         err = -rt->u.dst.error;
2427         } else {
2428                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2429                                                          .saddr = src,
2430                                                          .tos = rtm->rtm_tos } } };
2431                 int oif = 0;
2432                 if (rta[RTA_OIF - 1])
2433                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2434                 fl.oif = oif;
2435                 err = ip_route_output_key(&rt, &fl);
2436         }
2437         if (err)
2438                 goto out_free;
2439
2440         skb->dst = &rt->u.dst;
2441         if (rtm->rtm_flags & RTM_F_NOTIFY)
2442                 rt->rt_flags |= RTCF_NOTIFY;
2443
2444         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2445
2446         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2447                                 RTM_NEWROUTE, 0);
2448         if (!err)
2449                 goto out_free;
2450         if (err < 0) {
2451                 err = -EMSGSIZE;
2452                 goto out_free;
2453         }
2454
2455         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2456         if (err > 0)
2457                 err = 0;
2458 out:    return err;
2459
2460 out_free:
2461         kfree_skb(skb);
2462         goto out;
2463 }
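
/* Editor's note: this handler is what answers "ip route get" from
 * iproute2.  E.g. "ip route get 10.0.0.1 from 10.0.0.2 iif eth0" sends an
 * RTM_GETROUTE message carrying RTA_DST, RTA_SRC and RTA_IIF attributes;
 * the resolved (or cloned) cache entry is returned via rt_fill_info().
 */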
2464
2465 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2466 {
2467         struct rtable *rt;
2468         int h, s_h;
2469         int idx, s_idx;
2470
2471         s_h = cb->args[0];
2472         s_idx = idx = cb->args[1];
2473         for (h = 0; h <= rt_hash_mask; h++) {
2474                 if (h < s_h) continue;
2475                 if (h > s_h)
2476                         s_idx = 0;
2477                 rcu_read_lock();
2478                 for (rt = rt_hash_table[h].chain, idx = 0; rt;
2479                      rt = rt->u.rt_next, idx++) {
2480                         smp_read_barrier_depends();
2481                         if (idx < s_idx)
2482                                 continue;
2483                         skb->dst = dst_clone(&rt->u.dst);
2484                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2485                                          cb->nlh->nlmsg_seq,
2486                                          RTM_NEWROUTE, 1) <= 0) {
2487                                 dst_release(xchg(&skb->dst, NULL));
2488                                 rcu_read_unlock();
2489                                 goto done;
2490                         }
2491                         dst_release(xchg(&skb->dst, NULL));
2492                 }
2493                 rcu_read_unlock();
2494         }
2495
2496 done:
2497         cb->args[0] = h;
2498         cb->args[1] = idx;
2499         return skb->len;
2500 }
2501
2502 void ip_rt_multicast_event(struct in_device *in_dev)
2503 {
2504         rt_cache_flush(0);
2505 }
2506
2507 #ifdef CONFIG_SYSCTL
2508 static int flush_delay;
2509
2510 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2511                                         struct file *filp, void __user *buffer,
2512                                         size_t *lenp)
2513 {
2514         if (write) {
2515                 proc_dointvec(ctl, write, filp, buffer, lenp);
2516                 rt_cache_flush(flush_delay);
2517                 return 0;
2518         } 
2519
2520         return -EINVAL;
2521 }
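
/* Editor's note: the usual way to trigger this from userspace:
 *
 *	# echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * The written value becomes flush_delay, so 0 requests an immediate
 * flush; reading the file is deliberately refused (-EINVAL above).
 */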
2522
2523 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2524                                                 int __user *name,
2525                                                 int nlen,
2526                                                 void __user *oldval,
2527                                                 size_t __user *oldlenp,
2528                                                 void __user *newval,
2529                                                 size_t newlen,
2530                                                 void **context)
2531 {
2532         int delay;
2533         if (newlen != sizeof(int))
2534                 return -EINVAL;
2535         if (get_user(delay, (int __user *)newval))
2536                 return -EFAULT; 
2537         rt_cache_flush(delay); 
2538         return 0;
2539 }
2540
2541 ctl_table ipv4_route_table[] = {
2542         {
2543                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2544                 .procname       = "flush",
2545                 .data           = &flush_delay,
2546                 .maxlen         = sizeof(int),
2547                 .mode           = 0644,
2548                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2549                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2550         },
2551         {
2552                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2553                 .procname       = "min_delay",
2554                 .data           = &ip_rt_min_delay,
2555                 .maxlen         = sizeof(int),
2556                 .mode           = 0644,
2557                 .proc_handler   = &proc_dointvec_jiffies,
2558                 .strategy       = &sysctl_jiffies,
2559         },
2560         {
2561                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2562                 .procname       = "max_delay",
2563                 .data           = &ip_rt_max_delay,
2564                 .maxlen         = sizeof(int),
2565                 .mode           = 0644,
2566                 .proc_handler   = &proc_dointvec_jiffies,
2567                 .strategy       = &sysctl_jiffies,
2568         },
2569         {
2570                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2571                 .procname       = "gc_thresh",
2572                 .data           = &ipv4_dst_ops.gc_thresh,
2573                 .maxlen         = sizeof(int),
2574                 .mode           = 0644,
2575                 .proc_handler   = &proc_dointvec,
2576         },
2577         {
2578                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2579                 .procname       = "max_size",
2580                 .data           = &ip_rt_max_size,
2581                 .maxlen         = sizeof(int),
2582                 .mode           = 0644,
2583                 .proc_handler   = &proc_dointvec,
2584         },
2585         {
2586                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2587                 .procname       = "gc_min_interval",
2588                 .data           = &ip_rt_gc_min_interval,
2589                 .maxlen         = sizeof(int),
2590                 .mode           = 0644,
2591                 .proc_handler   = &proc_dointvec_jiffies,
2592                 .strategy       = &sysctl_jiffies,
2593         },
2594         {
2595                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2596                 .procname       = "gc_timeout",
2597                 .data           = &ip_rt_gc_timeout,
2598                 .maxlen         = sizeof(int),
2599                 .mode           = 0644,
2600                 .proc_handler   = &proc_dointvec_jiffies,
2601                 .strategy       = &sysctl_jiffies,
2602         },
2603         {
2604                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2605                 .procname       = "gc_interval",
2606                 .data           = &ip_rt_gc_interval,
2607                 .maxlen         = sizeof(int),
2608                 .mode           = 0644,
2609                 .proc_handler   = &proc_dointvec_jiffies,
2610                 .strategy       = &sysctl_jiffies,
2611         },
2612         {
2613                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2614                 .procname       = "redirect_load",
2615                 .data           = &ip_rt_redirect_load,
2616                 .maxlen         = sizeof(int),
2617                 .mode           = 0644,
2618                 .proc_handler   = &proc_dointvec,
2619         },
2620         {
2621                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2622                 .procname       = "redirect_number",
2623                 .data           = &ip_rt_redirect_number,
2624                 .maxlen         = sizeof(int),
2625                 .mode           = 0644,
2626                 .proc_handler   = &proc_dointvec,
2627         },
2628         {
2629                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2630                 .procname       = "redirect_silence",
2631                 .data           = &ip_rt_redirect_silence,
2632                 .maxlen         = sizeof(int),
2633                 .mode           = 0644,
2634                 .proc_handler   = &proc_dointvec,
2635         },
2636         {
2637                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2638                 .procname       = "error_cost",
2639                 .data           = &ip_rt_error_cost,
2640                 .maxlen         = sizeof(int),
2641                 .mode           = 0644,
2642                 .proc_handler   = &proc_dointvec,
2643         },
2644         {
2645                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2646                 .procname       = "error_burst",
2647                 .data           = &ip_rt_error_burst,
2648                 .maxlen         = sizeof(int),
2649                 .mode           = 0644,
2650                 .proc_handler   = &proc_dointvec,
2651         },
2652         {
2653                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2654                 .procname       = "gc_elasticity",
2655                 .data           = &ip_rt_gc_elasticity,
2656                 .maxlen         = sizeof(int),
2657                 .mode           = 0644,
2658                 .proc_handler   = &proc_dointvec,
2659         },
2660         {
2661                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2662                 .procname       = "mtu_expires",
2663                 .data           = &ip_rt_mtu_expires,
2664                 .maxlen         = sizeof(int),
2665                 .mode           = 0644,
2666                 .proc_handler   = &proc_dointvec_jiffies,
2667                 .strategy       = &sysctl_jiffies,
2668         },
2669         {
2670                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2671                 .procname       = "min_pmtu",
2672                 .data           = &ip_rt_min_pmtu,
2673                 .maxlen         = sizeof(int),
2674                 .mode           = 0644,
2675                 .proc_handler   = &proc_dointvec,
2676         },
2677         {
2678                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2679                 .procname       = "min_adv_mss",
2680                 .data           = &ip_rt_min_advmss,
2681                 .maxlen         = sizeof(int),
2682                 .mode           = 0644,
2683                 .proc_handler   = &proc_dointvec,
2684         },
2685         {
2686                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2687                 .procname       = "secret_interval",
2688                 .data           = &ip_rt_secret_interval,
2689                 .maxlen         = sizeof(int),
2690                 .mode           = 0644,
2691                 .proc_handler   = &proc_dointvec_jiffies,
2692                 .strategy       = &sysctl_jiffies,
2693         },
2694         { .ctl_name = 0 }
2695 };
2696 #endif
2697
2698 #ifdef CONFIG_NET_CLS_ROUTE
2699 struct ip_rt_acct *ip_rt_acct;
2700
2701 /* This code sucks.  But you should have seen it before! --RR */
2702
2703 /* IP route accounting ptr for this logical cpu number. */
2704 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
2705
2706 #ifdef CONFIG_PROC_FS
2707 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2708                            int length, int *eof, void *data)
2709 {
2710         unsigned int i;
2711
2712         if ((offset & 3) || (length & 3))
2713                 return -EIO;
2714
2715         if (offset >= sizeof(struct ip_rt_acct) * 256) {
2716                 *eof = 1;
2717                 return 0;
2718         }
2719
2720         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2721                 length = sizeof(struct ip_rt_acct) * 256 - offset;
2722                 *eof = 1;
2723         }
2724
2725         offset /= sizeof(u32);
2726
2727         if (length > 0) {
2728                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2729                 u32 *dst = (u32 *) buffer;
2730
2731                 /* Copy first cpu. */
2732                 *start = buffer;
2733                 memcpy(dst, src, length);
2734
2735                 /* Add the other cpus in, one int at a time */
2736                 for_each_cpu(i) {
2737                         unsigned int j;
2738
2739                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2740
2741                         for (j = 0; j < length/4; j++)
2742                                 dst[j] += src[j];
2743                 }
2744         }
2745         return length;
2746 }
2747 #endif /* CONFIG_PROC_FS */
2748 #endif /* CONFIG_NET_CLS_ROUTE */
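
/* Editor's note: the proc handler above exposes the 256 per-realm
 * counters as one flat binary array, already summed across CPUs.
 * Userspace (e.g. the "rtacct" tool shipped with iproute2) simply reads
 * /proc/net/rt_acct as 256 consecutive struct ip_rt_acct records.
 */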
2749
2750 static __initdata unsigned long rhash_entries;
2751 static int __init set_rhash_entries(char *str)
2752 {
2753         if (!str)
2754                 return 0;
2755         rhash_entries = simple_strtoul(str, &str, 0);
2756         return 1;
2757 }
2758 __setup("rhash_entries=", set_rhash_entries);
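
/* Editor's note: "rhash_entries" is a kernel boot parameter.  Booting
 * with e.g. "rhash_entries=65536" overrides the memory-based sizing
 * heuristic in ip_rt_init() below when computing the hash table size.
 */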
2759
2760 int __init ip_rt_init(void)
2761 {
2762         int i, order, goal, rc = 0;
2763
2764         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2765                              (jiffies ^ (jiffies >> 7)));
2766
2767 #ifdef CONFIG_NET_CLS_ROUTE
2768         for (order = 0;
2769              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2770                 /* NOTHING */;
2771         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2772         if (!ip_rt_acct)
2773                 panic("IP: failed to allocate ip_rt_acct\n");
2774         memset(ip_rt_acct, 0, PAGE_SIZE << order);
2775 #endif
2776
2777         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2778                                                      sizeof(struct rtable),
2779                                                      0, SLAB_HWCACHE_ALIGN,
2780                                                      NULL, NULL);
2781
2782         if (!ipv4_dst_ops.kmem_cachep)
2783                 panic("IP: failed to allocate ip_dst_cache\n");
2784
2785         goal = num_physpages >> (26 - PAGE_SHIFT);
2786         if (rhash_entries)
2787                 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
2788         for (order = 0; (1UL << order) < goal; order++)
2789                 /* NOTHING */;
2790
2791         do {
2792                 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2793                         sizeof(struct rt_hash_bucket);
2794                 while (rt_hash_mask & (rt_hash_mask - 1))
2795                         rt_hash_mask--;
2796                 rt_hash_table = (struct rt_hash_bucket *)
2797                         __get_free_pages(GFP_ATOMIC, order);
2798         } while (rt_hash_table == NULL && --order > 0);
2799
2800         if (!rt_hash_table)
2801                 panic("Failed to allocate IP route cache hash table\n");
2802
2803         printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2804                rt_hash_mask,
2805                (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2806
2807         for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2808                 /* NOTHING */;
2809
2810         rt_hash_mask--;
2811         for (i = 0; i <= rt_hash_mask; i++) {
2812                 rt_hash_table[i].lock = SPIN_LOCK_UNLOCKED;
2813                 rt_hash_table[i].chain = NULL;
2814         }
2815
2816         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2817         ip_rt_max_size = (rt_hash_mask + 1) * 16;
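
        /* Editor's worked example (assuming 4KB pages and a 16-byte
           rt_hash_bucket): with 256MB of RAM, num_physpages = 65536, so
           goal = 65536 >> 14 = 4 pages -> order 2 -> 16KB -> 1024 buckets.
           rt_hash_mask becomes 1023, gc_thresh 1024 and max_size 16384. */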
2818
2819         rt_cache_stat = alloc_percpu(struct rt_cache_stat);
2820         if (!rt_cache_stat)
2821                 return -ENOMEM;
2822
2823         devinet_init();
2824         ip_fib_init();
2825
2826         init_timer(&rt_flush_timer);
2827         rt_flush_timer.function = rt_run_flush;
2828         init_timer(&rt_periodic_timer);
2829         rt_periodic_timer.function = rt_check_expire;
2830         init_timer(&rt_secret_timer);
2831         rt_secret_timer.function = rt_secret_rebuild;
2832
2833         /* All the timers started at system startup tend
2834            to synchronize. Perturb them a bit.
2835          */
2836         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2837                                         ip_rt_gc_interval;
2838         add_timer(&rt_periodic_timer);
2839
2840         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2841                 ip_rt_secret_interval;
2842         add_timer(&rt_secret_timer);
2843
2844 #ifdef CONFIG_PROC_FS
2845         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2846             !proc_net_fops_create("rt_cache_stat", S_IRUGO, &rt_cpu_seq_fops)) {
2847                 free_percpu(rt_cache_stat);
2848                 return -ENOMEM;
2849         }
2850
2851 #ifdef CONFIG_NET_CLS_ROUTE
2852         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
2853 #endif
2854 #endif
2855 #ifdef CONFIG_XFRM
2856         xfrm_init();
2857         xfrm4_init();
2858 #endif
2859         return rc;
2860 }
2861
2862 EXPORT_SYMBOL(__ip_select_ident);
2863 EXPORT_SYMBOL(ip_route_input);
2864 EXPORT_SYMBOL(ip_route_output_key);