/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

int ip_rt_min_delay             = 2 * HZ;
int ip_rt_max_delay             = 10 * HZ;
int ip_rt_max_size;
int ip_rt_gc_timeout            = RT_GC_TIMEOUT;
int ip_rt_gc_interval           = 60 * HZ;
int ip_rt_gc_min_interval       = HZ / 2;
int ip_rt_redirect_number       = 9;
int ip_rt_redirect_load         = HZ / 50;
int ip_rt_redirect_silence      = ((HZ / 50) << (9 + 1));
int ip_rt_error_cost            = HZ;
int ip_rt_error_burst           = 5 * HZ;
int ip_rt_gc_elasticity         = 8;
int ip_rt_mtu_expires           = 10 * 60 * HZ;
int ip_rt_min_pmtu              = 512 + 20 + 20;
int ip_rt_min_advmss            = 256;
int ip_rt_secret_interval       = 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
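
/*
 * For illustration: include/net/route.h indexes this table with the TOS
 * nibble of the IP header, roughly
 *
 *      skb->priority = rt_tos2priority(iph->tos);
 *
 * where rt_tos2priority(tos) reduces to ip_tos2prio[IPTOS_TOS(tos)>>1].
 */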


/*
 * Route cache.
 */

/* The locking scheme is rather straight forward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the bucket lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries; they do so
 *    with atomic increments, under rcu_read_lock() rather than the
 *    bucket lock.
 */
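
/*
 * A sketch of the lock-free reader side, assuming a precomputed bucket
 * index "hash" and some key-match predicate; this is the pattern the
 * lookup paths below follow:
 *
 *      rcu_read_lock();
 *      for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
 *              smp_read_barrier_depends();
 *              if (keys_match(rth))
 *                      break;
 *      }
 *      if (rth)
 *              dst_hold(&rth->u.dst);  (atomic ref, no spinlock taken)
 *      rcu_read_unlock();
 */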

struct rt_hash_bucket {
        struct rtable   *chain;
        spinlock_t      lock;
} __attribute__((__aligned__(8)));

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
        return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
                & rt_hash_mask);
}
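
/*
 * Callers fold the interface index into the source key so the same
 * addresses seen on different devices land in different buckets, e.g.
 * (as in ip_route_input_mc() below):
 *
 *      hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
 */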

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rtable *r = NULL;
        struct rt_cache_iter_state *st = seq->private;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                rcu_read_lock();
                r = rt_hash_table[st->bucket].chain;
                if (r)
                        break;
                rcu_read_unlock();
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
        struct rt_cache_iter_state *st = seq->private;

        smp_read_barrier_depends();
        r = r->u.rt_next;
        while (!r) {
                rcu_read_unlock();
                if (--st->bucket < 0)
                        break;
                rcu_read_lock();
                r = rt_hash_table[st->bucket].chain;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r = NULL;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                char temp[256];

                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst);
                seq_printf(seq, "%-127s\n", temp);
        }
        return 0;
}

static struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;
        rc = seq_open(file, &rt_cache_seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
        memset(s, 0, sizeof(*s));
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};
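
/*
 * These operations back /proc/net/rt_cache (registered from the init code
 * later in this file, outside this excerpt); each line of that file is one
 * cached route, in the format built by rt_cache_seq_show() above.
 */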


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos + 1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,         /* plain seq_open(), no private state to free */
};
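
/*
 * These operations back the per-CPU statistics file (/proc/net/stat/rt_cache,
 * also registered later in this file); one row is printed per possible CPU,
 * with the shared entry count repeated in the first column.
 */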

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
        call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst);
}
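
/*
 * Both helpers defer dst_free() to an RCU callback rather than calling it
 * directly: a lock-free reader may still be walking the hash chain, so the
 * memory must remain valid until a grace period has elapsed.
 */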

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
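
/*
 * Worked example: an unreferenced output route idle for 100 jiffies and not
 * "valuable" scores (~100 & ~(3<<30)) | (1<<30); an older entry has a smaller
 * ~age and hence a lower score. rt_intern_hash() evicts the lowest-scoring
 * unreferenced entry in an overlong chain, so bits 30/31 shield output and
 * valuable routes from being evicted first.
 */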

/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
        static int rover;
        int i = rover, t;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;

        for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
             t -= ip_rt_gc_timeout) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                spin_lock(&rt_hash_table[i].lock);
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
                        *rthp = rth->u.rt_next;
                        rt_free(rth);
                }
                spin_unlock(&rt_hash_table[i].lock);

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                spin_lock_bh(&rt_hash_table[i].lock);
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                spin_unlock_bh(&rt_hash_table[i].lock);

                for (; rth; rth = next) {
                        next = rth->u.rt_next;
                        rt_free(rth);
                }
        }
}

static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached yet, extend the
                   timer to "delay", otherwise fire it at the deadline.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}
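
/*
 * Note that a flush also re-keys rt_hash_rnd (see rt_run_flush() above), so
 * this periodic rebuild makes it hard for a remote host to deliberately
 * construct long hash chains once it has learned the hash function's inputs.
 */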

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache at some
   equilibrium point, where the number of aged-off entries stays
   approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep plenty of warm entries, and when
   load increases it shrinks to limit the cache size.
 */
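
/*
 * For instance, with rt_hash_log = 10 (1024 buckets) and the default
 * ip_rt_gc_elasticity of 8, "goal" below only goes positive once the cache
 * holds more than 8 << 10 = 8192 entries; under that threshold the collector
 * merely nudges "equilibrium" and exits via work_done.
 */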

static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * do not run it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate the number of entries we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache really
                 * aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(&rt_hash_table[k].lock);
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                goal--;
                        }
                        spin_unlock_bh(&rt_hash_table[k].lock);
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* The goal was not achieved. We stop the process if:

                   - expire was reduced to zero; otherwise expire is halved.
                   - the table is not full.
                   - we are called from softirq context.
                   - the jiffies check is just a fallback/debug loop breaker;
                     we will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
               fl1->oif     == fl2->oif &&
               fl1->iif     == fl2->iif;
}
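
/*
 * The single memcmp() compares the whole ip4_u key (addresses, firewall
 * mark, tos, scope) in one go, since those fields all live inside
 * fl->nl_u.ip4_u; only oif and iif need separate checks.
 */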

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(&rt_hash_table[hash].lock);
        while ((rth = *rthp) != NULL) {
                if (compare_keys(&rth->fl, &rt->fl)) {
                        /* Put it first */
                        *rthp = rth->u.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        smp_wmb();
                        rth->u.rt_next = rt_hash_table[hash].chain;
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        smp_wmb();
                        rt_hash_table[hash].chain = rth;

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when it is exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind route to arp only if it is output
           route or unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* Neighbour tables are full and nothing
                           can be released. Try to shrink route cache,
                           it is most likely it holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        spin_unlock_bh(&rt_hash_table[hash].lock);
        *rp = rt;
        return 0;
}
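
/*
 * On success rt_intern_hash() returns 0 and *rp points at the entry that
 * actually ended up in the cache, which may be a pre-existing equivalent
 * route rather than the one passed in; callers such as ip_route_input_mc()
 * below simply hand it their skb->dst slot.
 */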

void rt_bind_peer(struct rtable *rt, int create)
{
        static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that stays unique for a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If a peer is attached to the destination, it is never
                   detached, so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer, more));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        spin_lock_bh(&rt_hash_table[hash].lock);
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.rt_next;
                        rt_free(rt);
                        break;
                }
        spin_unlock_bh(&rt_hash_table[hash].lock);
}

void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, u8 tos, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };

        tos &= IPTOS_RT_MASK;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5),
                                                     tos);

                        rthp = &rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = *rthp) != NULL) {
                                struct rtable *rt;

                                smp_read_barrier_depends();
                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.fl4_tos != tos ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
                        "tos %02x\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable*)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash_code(rt->fl.fl4_dst,
                                                     rt->fl.fl4_src ^
                                                        (rt->fl.oif << 5),
                                                     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending
 *         redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
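
/*
 * For example, at HZ=100 the defaults give ip_rt_redirect_load = 2 jiffies,
 * so successive redirects to one destination are spaced at least
 * 2 << rate_tokens jiffies apart (2, 4, 8, ...); after ip_rt_redirect_number
 * (9) of them we go silent until ip_rt_redirect_silence = 2 << 10 = 2048
 * jiffies (about 20 s) have passed without further provocation.
 */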

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
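
/*
 * Example: guess_mtu(1500) returns 1492, the largest plateau strictly below
 * the old MTU; anything at or below the smallest plateau (128) falls through
 * to 68, the minimum IPv4 MTU.
 */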

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
        int i;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        u32  skeys[2] = { iph->saddr, 0, };
        u32  daddr = iph->daddr;
        u8   tos = iph->tos & IPTOS_RT_MASK;
        unsigned short est_mtu = 0;

        if (ipv4_config.no_pmtu_disc)
                return 0;

        for (i = 0; i < 2; i++) {
                unsigned hash = rt_hash_code(daddr, skeys[i], tos);

                rcu_read_lock();
                for (rth = rt_hash_table[hash].chain; rth;
                     rth = rth->u.rt_next) {
                        smp_read_barrier_depends();
                        if (rth->fl.fl4_dst == daddr &&
                            rth->fl.fl4_src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
                            rth->fl.fl4_tos == tos &&
                            rth->fl.iif == 0 &&
                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
                                unsigned short mtu = new_mtu;

                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
                                        if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
                                                dst_confirm(&rth->u.dst);
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                        ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                }
                rcu_read_unlock();
        }
        return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
        if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
            !(dst_metric_locked(dst, RTAX_MTU))) {
                if (mtu < ip_rt_min_pmtu) {
                        mtu = ip_rt_min_pmtu;
                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
                }
                dst->metrics[RTAX_MTU-1] = mtu;
                dst_set_expires(dst, ip_rt_mtu_expires);
        }
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        /* Returning NULL marks the entry obsolete; the caller drops its
         * own reference, so no dst_release() here.
         */
        return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;
        struct inet_peer *peer = rt->peer;
        struct in_device *idev = rt->idev;

        if (peer) {
                rt->peer = NULL;
                inet_putpeer(peer);
        }

        if (idev) {
                rt->idev = NULL;
                in_dev_put(idev);
        }
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = (struct rtable *) skb->dst;
        if (rt)
                dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
                NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
                skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
        u32 src;
        struct fib_result res;

        if (rt->fl.iif == 0)
                src = rt->rt_src;
        else if (fib_lookup(&rt->fl, &res) == 0) {
#ifdef CONFIG_IP_ROUTE_NAT
                if (res.type == RTN_NAT)
                        src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
                                                RT_SCOPE_UNIVERSE);
                else
#endif
                        src = FIB_RES_PREFSRC(res);
                fib_res_put(&res);
        } else
                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
                                        RT_SCOPE_UNIVERSE);
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->u.dst.tclassid & 0xFFFF))
                rt->u.dst.tclassid |= tag & 0xFFFF;
        if (!(rt->u.dst.tclassid & 0xFFFF0000))
                rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
        struct fib_info *fi = res->fi;

        if (fi) {
                if (FIB_RES_GW(*res) &&
                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
                        rt->rt_gateway = FIB_RES_GW(*res);
                memcpy(rt->u.dst.metrics, fi->fib_metrics,
                       sizeof(rt->u.dst.metrics));
                if (fi->fib_mtu == 0) {
                        rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
                        if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
                            rt->rt_gateway != rt->rt_dst &&
                            rt->u.dst.dev->mtu > 576)
                                rt->u.dst.metrics[RTAX_MTU-1] = 576;
                }
#ifdef CONFIG_NET_CLS_ROUTE
                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
        } else
                rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;

        if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
        if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
                rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
        if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
                rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
                                       ip_rt_min_advmss);
        if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
                rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, fib_rules_tclass(res));
#endif
        set_class_tag(rt, itag);
#endif
        rt->rt_type = res->type;
}

static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        unsigned hash;
        struct rtable *rth;
        u32 spec_dst;
        struct in_device *in_dev = in_dev_get(dev);
        u32 itag = 0;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ZERONET(saddr)) {
                if (!LOCAL_MCAST(daddr))
                        goto e_inval;
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
        } else if (fib_validate_source(saddr, 0, tos, 0,
                                        dev, &spec_dst, &itag) < 0)
                goto e_inval;

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        rth->u.dst.output= ip_rt_bug;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags= DST_HOST;
        if (in_dev->cnf.no_policy)
                rth->u.dst.flags |= DST_NOPOLICY;
        rth->fl.fl4_dst = daddr;
        rth->rt_dst     = daddr;
        rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->fl.fl4_fwmark= skb->nfmark;
#endif
        rth->fl.fl4_src = saddr;
        rth->rt_src     = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = daddr;
        rth->rt_src_map = saddr;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
        rth->u.dst.tclassid = itag;
#endif
        rth->rt_iif     =
        rth->fl.iif     = dev->ifindex;
        rth->u.dst.dev  = &loopback_dev;
        dev_hold(rth->u.dst.dev);
        rth->idev       = in_dev_get(rth->u.dst.dev);
        rth->fl.oif     = 0;
        rth->rt_gateway = daddr;
        rth->rt_spec_dst= spec_dst;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_flags   = RTCF_MULTICAST;
        if (our) {
                rth->u.dst.input= ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->u.dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        in_dev_put(in_dev);
        hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
        return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
        in_dev_put(in_dev);
        return -ENOBUFS;

e_inval:
        in_dev_put(in_dev);
        return -EINVAL;
}

/*
 *      NOTE.  We drop all packets that have a local source
 *      address, because every properly looped-back packet
 *      must have the correct destination already attached by the output routine.
 *
 *      This approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 */

static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
                               u8 tos, struct net_device *dev)
{
        struct fib_result res;
        struct in_device *in_dev = in_dev_get(dev);
        struct in_device *out_dev = NULL;
        struct flowi fl = { .nl_u = { .ip4_u =
                                      { .daddr = daddr,
                                        .saddr = saddr,
                                        .tos = tos,
                                        .scope = RT_SCOPE_UNIVERSE,
#ifdef CONFIG_IP_ROUTE_FWMARK
                                        .fwmark = skb->nfmark
#endif
                                      } },
                            .iif = dev->ifindex };
        unsigned        flags = 0;
        u32             itag = 0;
        struct rtable  *rth;
        unsigned        hash;
        u32             spec_dst;
        int             err = -EINVAL;
        int             free_res = 0;

        /* IP on this device is disabled. */

        if (!in_dev)
                goto out;

        hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);

        /* Check for the most weird martians, which cannot be detected
           by fib_lookup.
         */

        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
                goto martian_source;

        if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
                goto brd_input;

        /* Accept zero addresses only for the limited broadcast;
         * I do not even know whether to fix this or not.  Waiting for complaints :-)
         */
        if (ZERONET(saddr))
                goto martian_source;

        if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
                goto martian_destination;

        /*
         *      Now we are ready to route the packet.
         */
        if ((err = fib_lookup(&fl, &res)) != 0) {
                if (!IN_DEV_FORWARD(in_dev))
                        goto e_inval;
                goto no_route;
        }
        free_res = 1;

        RT_CACHE_STAT_INC(in_slow_tot);

#ifdef CONFIG_IP_ROUTE_NAT
        /* Policy is applied before mapping the destination, but rerouting
           after the map must be done with the old source.
         */

        if (1) {
                u32 src_map = saddr;
                if (res.r)
                        src_map = fib_rules_policy(saddr, &res, &flags);

                if (res.type == RTN_NAT) {
                        fl.fl4_dst = fib_rules_map_destination(daddr, &res);
                        fib_res_put(&res);
                        free_res = 0;
                        if (fib_lookup(&fl, &res))
                                goto e_inval;
                        free_res = 1;
                        if (res.type != RTN_UNICAST)
                                goto e_inval;
                        flags |= RTCF_DNAT;
                }
                fl.fl4_src = src_map;
        }
#endif

        if (res.type == RTN_BROADCAST)
                goto brd_input;

        if (res.type == RTN_LOCAL) {
                int result;
                result = fib_validate_source(saddr, daddr, tos,
                                             loopback_dev.ifindex,
                                             dev, &spec_dst, &itag);
                if (result < 0)
                        goto martian_source;
                if (result)
                        flags |= RTCF_DIRECTSRC;
                spec_dst = daddr;
                goto local_input;
        }

        if (!IN_DEV_FORWARD(in_dev))
                goto e_inval;
        if (res.type != RTN_UNICAST)
                goto martian_destination;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && fl.oif == 0)
                fib_select_multipath(&fl, &res);
#endif
        out_dev = in_dev_get(FIB_RES_DEV(res));
        if (out_dev == NULL) {
                if (net_ratelimit())
                        printk(KERN_CRIT "Bug in ip_route_input_slow(). "
                                         "Please, report\n");
                goto e_inval;
        }

        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
                                  &spec_dst, &itag);
        if (err < 0)
                goto martian_source;

        if (err)
                flags |= RTCF_DIRECTSRC;

        if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
                flags |= RTCF_DOREDIRECT;

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP).  Do not create a route if it is
                 * invalid for proxy ARP.  DNAT routes are always valid.
                 */
                if (out_dev == in_dev && !(flags & RTCF_DNAT))
                        goto e_inval;
        }

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags = DST_HOST;
        if (in_dev->cnf.no_policy)
                rth->u.dst.flags |= DST_NOPOLICY;
        if (in_dev->cnf.no_xfrm)
                rth->u.dst.flags |= DST_NOXFRM;
        rth->fl.fl4_dst = daddr;
        rth->rt_dst     = daddr;
        rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->fl.fl4_fwmark = skb->nfmark;
#endif
        rth->fl.fl4_src = saddr;
        rth->rt_src     = saddr;
        rth->rt_gateway = daddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_src_map = fl.fl4_src;
        rth->rt_dst_map = fl.fl4_dst;
        if (flags & RTCF_DNAT)
                rth->rt_gateway = fl.fl4_dst;
#endif
        rth->rt_iif     =
        rth->fl.iif     = dev->ifindex;
        rth->u.dst.dev  = out_dev->dev;
        dev_hold(rth->u.dst.dev);
        rth->idev       = in_dev_get(rth->u.dst.dev);
        rth->fl.oif     = 0;
        rth->rt_spec_dst = spec_dst;

        rth->u.dst.input = ip_forward;
        rth->u.dst.output = ip_output;

        rt_set_nexthop(rth, &res, itag);

        rth->rt_flags = flags;

#ifdef CONFIG_NET_FASTROUTE
        if (netdev_fastroute && !(flags & (RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
                struct net_device *odev = rth->u.dst.dev;
                if (odev != dev &&
                    dev->accept_fastpath &&
                    odev->mtu >= dev->mtu &&
                    dev->accept_fastpath(dev, &rth->u.dst) == 0)
                        rth->rt_flags |= RTCF_FAST;
        }
#endif

intern:
        err = rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);
done:
        in_dev_put(in_dev);
        if (out_dev)
                in_dev_put(out_dev);
        if (free_res)
                fib_res_put(&res);
out:    return err;

brd_input:
        if (skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ZERONET(saddr))
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
        else {
                err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
                                          &itag);
                if (err < 0)
                        goto martian_source;
                if (err)
                        flags |= RTCF_DIRECTSRC;
        }
        flags |= RTCF_BROADCAST;
        res.type = RTN_BROADCAST;
        RT_CACHE_STAT_INC(in_brd);

local_input:
        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        rth->u.dst.output = ip_rt_bug;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags = DST_HOST;
        if (in_dev->cnf.no_policy)
                rth->u.dst.flags |= DST_NOPOLICY;
        rth->fl.fl4_dst = daddr;
        rth->rt_dst     = daddr;
        rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->fl.fl4_fwmark = skb->nfmark;
#endif
        rth->fl.fl4_src = saddr;
        rth->rt_src     = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = fl.fl4_dst;
        rth->rt_src_map = fl.fl4_src;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
        rth->u.dst.tclassid = itag;
#endif
        rth->rt_iif     =
        rth->fl.iif     = dev->ifindex;
        rth->u.dst.dev  = &loopback_dev;
        dev_hold(rth->u.dst.dev);
        rth->idev       = in_dev_get(rth->u.dst.dev);
        rth->rt_gateway = daddr;
        rth->rt_spec_dst = spec_dst;
        rth->u.dst.input = ip_local_deliver;
        rth->rt_flags   = flags|RTCF_LOCAL;
        if (res.type == RTN_UNREACHABLE) {
                rth->u.dst.input = ip_error;
                rth->u.dst.error = -err;
                rth->rt_flags   &= ~RTCF_LOCAL;
        }
        rth->rt_type    = res.type;
        goto intern;

no_route:
        RT_CACHE_STAT_INC(in_no_route);
        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
        res.type = RTN_UNREACHABLE;
        goto local_input;

        /*
         *      Do not cache martian addresses: they should be logged (RFC1812)
         */
martian_destination:
        RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
                        "%u.%u.%u.%u, dev %s\n",
                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif
e_inval:
        err = -EINVAL;
        goto done;

e_nobufs:
        err = -ENOBUFS;
        goto done;

martian_source:

        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC 1812 recommendation: if the source is martian,
                 *      the only hint is the MAC header.
                 */
                printk(KERN_WARNING "martian source %u.%u.%u.%u from "
                        "%u.%u.%u.%u, on dev %s\n",
                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
                if (dev->hard_header_len) {
                        int i;
                        unsigned char *p = skb->mac.raw;
                        printk(KERN_WARNING "ll header: ");
                        for (i = 0; i < dev->hard_header_len; i++, p++) {
                                printk("%02x", *p);
                                if (i < (dev->hard_header_len - 1))
                                        printk(":");
                        }
                        printk("\n");
                }
        }
#endif
        goto e_inval;
}

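/* Entry point for input route resolution.  Probe the hash bucket for a
 * cached entry first; on a miss, fall back to the multicast handler or
 * to ip_route_input_slow().  The cache key is the (daddr, saddr, iif,
 * tos[, fwmark]) tuple taken from the incoming packet.
 */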
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
                   u8 tos, struct net_device *dev)
{
        struct rtable *rth;
        unsigned        hash;
        int iif = dev->ifindex;

        tos &= IPTOS_RT_MASK;
        hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);

        rcu_read_lock();
        for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
                smp_read_barrier_depends();
                if (rth->fl.fl4_dst == daddr &&
                    rth->fl.fl4_src == saddr &&
                    rth->fl.iif == iif &&
                    rth->fl.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
                    rth->fl.fl4_fwmark == skb->nfmark &&
#endif
                    rth->fl.fl4_tos == tos) {
                        rth->u.dst.lastuse = jiffies;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.__use++;
                        RT_CACHE_STAT_INC(in_hit);
                        rcu_read_unlock();
                        skb->dst = (struct dst_entry *)rth;
                        return 0;
                }
                RT_CACHE_STAT_INC(in_hlist_search);
        }
        rcu_read_unlock();

        /* Multicast recognition logic is moved from the route cache to here.
           The problem was that too many Ethernet cards have broken/missing
           hardware multicast filters :-( As a result, a host on a multicast
           network acquires a lot of useless route cache entries, e.g. for
           SDR messages from all over the world.  Now we try to get rid of
           them.  Really, provided the software IP multicast filter is
           organized reasonably (at least, hashed), it does not result in a
           slowdown compared with route cache reject entries.
           Note that multicast routers are not affected, because a route
           cache entry is created eventually.
         */
        if (MULTICAST(daddr)) {
                struct in_device *in_dev;

                read_lock(&inetdev_lock);
                if ((in_dev = __in_dev_get(dev)) != NULL) {
                        int our = ip_check_mc(in_dev, daddr, saddr,
                                skb->nh.iph->protocol);
                        if (our
#ifdef CONFIG_IP_MROUTE
                            || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
                            ) {
                                read_unlock(&inetdev_lock);
                                return ip_route_input_mc(skb, daddr, saddr,
                                                         tos, dev, our);
                        }
                }
                read_unlock(&inetdev_lock);
                return -EINVAL;
        }
        return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}

/*
 * Major route resolver routine.
 */

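/* Resolve an output route the slow way: validate any requested source
 * address, honour an explicit output interface, special-case local,
 * broadcast and multicast destinations, consult the FIB for everything
 * else, and finally build and hash a new cache entry at make_route.
 */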
static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
{
        u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
        struct flowi fl = { .nl_u = { .ip4_u =
                                      { .daddr = oldflp->fl4_dst,
                                        .saddr = oldflp->fl4_src,
                                        .tos = tos & IPTOS_RT_MASK,
                                        .scope = ((tos & RTO_ONLINK) ?
                                                  RT_SCOPE_LINK :
                                                  RT_SCOPE_UNIVERSE),
#ifdef CONFIG_IP_ROUTE_FWMARK
                                        .fwmark = oldflp->fl4_fwmark
#endif
                                      } },
                            .iif = loopback_dev.ifindex,
                            .oif = oldflp->oif };
        struct fib_result res;
        unsigned flags = 0;
        struct rtable *rth;
        struct net_device *dev_out = NULL;
        struct in_device *in_dev = NULL;
        unsigned hash;
        int free_res = 0;
        int err;

        res.fi          = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
        res.r           = NULL;
#endif

        if (oldflp->fl4_src) {
                err = -EINVAL;
                if (MULTICAST(oldflp->fl4_src) ||
                    BADCLASS(oldflp->fl4_src) ||
                    ZERONET(oldflp->fl4_src))
                        goto out;

                /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                dev_out = ip_dev_find(oldflp->fl4_src);
                if (dev_out == NULL)
                        goto out;

                /* I removed the check for oif == dev_out->oif here.
                   It was wrong for two reasons:
                   1. ip_dev_find(saddr) can return the wrong iface if saddr
                      is assigned to multiple interfaces.
                   2. Moreover, we are allowed to send packets with the saddr
                      of another iface. --ANK
                 */

                if (oldflp->oif == 0
                    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
                        /* Special hack: the user can direct multicasts
                           and limited broadcasts via the necessary interface
                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
                           This hack is not just for fun, it allows
                           vic, vat and friends to work.
                           They bind a socket to loopback, set the ttl to zero
                           and expect that it will work.
                           From the viewpoint of the routing cache they are
                           broken, because we are not allowed to build a
                           multicast path with a loopback source address
                           (the routing cache cannot know that the ttl is
                           zero, so that the packet will not leave this host
                           and the route is valid).
                           Luckily, this hack is a good workaround.
                         */

                        fl.oif = dev_out->ifindex;
                        goto make_route;
                }
                if (dev_out)
                        dev_put(dev_out);
                dev_out = NULL;
        }
        if (oldflp->oif) {
                dev_out = dev_get_by_index(oldflp->oif);
                err = -ENODEV;
                if (dev_out == NULL)
                        goto out;
                if (__in_dev_get(dev_out) == NULL) {
                        dev_put(dev_out);
                        goto out;       /* Wrong error code */
                }

                if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
                        if (!fl.fl4_src)
                                fl.fl4_src = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        goto make_route;
                }
                if (!fl.fl4_src) {
                        if (MULTICAST(oldflp->fl4_dst))
                                fl.fl4_src = inet_select_addr(dev_out, 0,
                                                              fl.fl4_scope);
                        else if (!oldflp->fl4_dst)
                                fl.fl4_src = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_HOST);
                }
        }

        if (!fl.fl4_dst) {
                fl.fl4_dst = fl.fl4_src;
                if (!fl.fl4_dst)
                        fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
                if (dev_out)
                        dev_put(dev_out);
                dev_out = &loopback_dev;
                dev_hold(dev_out);
                fl.oif = loopback_dev.ifindex;
                res.type = RTN_LOCAL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

        if (fib_lookup(&fl, &res)) {
                res.fi = NULL;
                if (oldflp->oif) {
                        /* Apparently, routing tables are wrong.  Assume
                           that the destination is on-link.

                           WHY?  DW.
                           Because we are allowed to send to an iface
                           even if it has NO routes and NO assigned
                           addresses.  When oif is specified, routing
                           tables are looked up with only one purpose:
                           to determine whether the destination is gatewayed,
                           rather than direct.  Moreover, if MSG_DONTROUTE
                           is set, we send the packet, ignoring both routing
                           tables and ifaddr state. --ANK


                           We could do this even if oif is unknown,
                           as IPv6 likely does, but we do not.
                         */

                        if (fl.fl4_src == 0)
                                fl.fl4_src = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        res.type = RTN_UNICAST;
                        goto make_route;
                }
                if (dev_out)
                        dev_put(dev_out);
                err = -ENETUNREACH;
                goto out;
        }
        free_res = 1;

        if (res.type == RTN_NAT)
                goto e_inval;

        if (res.type == RTN_LOCAL) {
                if (!fl.fl4_src)
                        fl.fl4_src = fl.fl4_dst;
                if (dev_out)
                        dev_put(dev_out);
                dev_out = &loopback_dev;
                dev_hold(dev_out);
                fl.oif = dev_out->ifindex;
                if (res.fi)
                        fib_info_put(res.fi);
                res.fi = NULL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && fl.oif == 0)
                fib_select_multipath(&fl, &res);
        else
#endif
        if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
                fib_select_default(&fl, &res);

        if (!fl.fl4_src)
                fl.fl4_src = FIB_RES_PREFSRC(res);

        if (dev_out)
                dev_put(dev_out);
        dev_out = FIB_RES_DEV(res);
        dev_hold(dev_out);
        fl.oif = dev_out->ifindex;

make_route:
        if (LOOPBACK(fl.fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
                goto e_inval;

        if (fl.fl4_dst == 0xFFFFFFFF)
                res.type = RTN_BROADCAST;
        else if (MULTICAST(fl.fl4_dst))
                res.type = RTN_MULTICAST;
        else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
                goto e_inval;

        if (dev_out->flags & IFF_LOOPBACK)
                flags |= RTCF_LOCAL;

        in_dev = in_dev_get(dev_out);
        if (!in_dev)
                goto e_inval;

        if (res.type == RTN_BROADCAST) {
                flags |= RTCF_BROADCAST | RTCF_LOCAL;
                if (res.fi) {
                        fib_info_put(res.fi);
                        res.fi = NULL;
                }
        } else if (res.type == RTN_MULTICAST) {
                flags |= RTCF_MULTICAST|RTCF_LOCAL;
                if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
                        flags &= ~RTCF_LOCAL;
                /* If a multicast route does not exist, use the
                   default one, but do not gateway in this case.
                   Yes, it is a hack.
                 */
                if (res.fi && res.prefixlen < 4) {
                        fib_info_put(res.fi);
                        res.fi = NULL;
                }
        }

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags = DST_HOST;
        if (in_dev->cnf.no_xfrm)
                rth->u.dst.flags |= DST_NOXFRM;
        if (in_dev->cnf.no_policy)
                rth->u.dst.flags |= DST_NOPOLICY;
        rth->fl.fl4_dst = oldflp->fl4_dst;
        rth->fl.fl4_tos = tos;
        rth->fl.fl4_src = oldflp->fl4_src;
        rth->fl.oif     = oldflp->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->fl.fl4_fwmark = oldflp->fl4_fwmark;
#endif
        rth->rt_dst     = fl.fl4_dst;
        rth->rt_src     = fl.fl4_src;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = fl.fl4_dst;
        rth->rt_src_map = fl.fl4_src;
#endif
        rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
        rth->u.dst.dev  = dev_out;
        dev_hold(dev_out);
        rth->idev       = in_dev_get(dev_out);
        rth->rt_gateway = fl.fl4_dst;
        rth->rt_spec_dst = fl.fl4_src;

        rth->u.dst.output = ip_output;

        RT_CACHE_STAT_INC(out_slow_tot);

        if (flags & RTCF_LOCAL) {
                rth->u.dst.input = ip_local_deliver;
                rth->rt_spec_dst = fl.fl4_dst;
        }
        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                rth->rt_spec_dst = fl.fl4_src;
                if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
                        rth->u.dst.output = ip_mc_output;
                        RT_CACHE_STAT_INC(out_slow_mc);
                }
#ifdef CONFIG_IP_MROUTE
                if (res.type == RTN_MULTICAST) {
                        if (IN_DEV_MFORWARD(in_dev) &&
                            !LOCAL_MCAST(oldflp->fl4_dst)) {
                                rth->u.dst.input = ip_mr_input;
                                rth->u.dst.output = ip_mc_output;
                        }
                }
#endif
        }

        rt_set_nexthop(rth, &res, 0);

        rth->rt_flags = flags;

        hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
        err = rt_intern_hash(hash, rth, rp);
done:
        if (free_res)
                fib_res_put(&res);
        if (dev_out)
                dev_put(dev_out);
        if (in_dev)
                in_dev_put(in_dev);
out:    return err;

e_inval:
        err = -EINVAL;
        goto done;
e_nobufs:
        err = -ENOBUFS;
        goto done;
}

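/* Fast path for output route resolution: walk the hash chain under RCU,
 * matching the flow key (dst, src, oif[, fwmark] and tos including the
 * RTO_ONLINK bit) against each cached entry, and fall back to
 * ip_route_output_slow() on a miss.
 */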
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
        unsigned hash;
        struct rtable *rth;

        hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);

        rcu_read_lock();
        for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
                smp_read_barrier_depends();
                if (rth->fl.fl4_dst == flp->fl4_dst &&
                    rth->fl.fl4_src == flp->fl4_src &&
                    rth->fl.iif == 0 &&
                    rth->fl.oif == flp->oif &&
#ifdef CONFIG_IP_ROUTE_FWMARK
                    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
#endif
                    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
                            (IPTOS_RT_MASK | RTO_ONLINK))) {
                        rth->u.dst.lastuse = jiffies;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.__use++;
                        RT_CACHE_STAT_INC(out_hit);
                        rcu_read_unlock();
                        *rp = rth;
                        return 0;
                }
                RT_CACHE_STAT_INC(out_hlist_search);
        }
        rcu_read_unlock();

        return ip_route_output_slow(rp, flp);
}

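/* The two wrappers below resolve an output route and then, when the flow
 * names a protocol, hand the result to xfrm_lookup() so that IPsec
 * policy can be applied.  A minimal usage sketch (daddr and tos are
 * hypothetical caller variables, error handling trimmed):
 *
 *      struct rtable *rt;
 *      struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
 *                                               .tos = RT_TOS(tos) } },
 *                          .proto = IPPROTO_ICMP };
 *
 *      if (ip_route_output_key(&rt, &fl))
 *              return;                 (no route to host)
 *      ...
 *      ip_rt_put(rt);                  (drop the reference when done)
 */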
int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
        int err;

        if ((err = __ip_route_output_key(rp, flp)) != 0)
                return err;
        return flp->proto ? xfrm_lookup((struct dst_entry **)rp, flp, NULL, 0) : 0;
}

int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
        int err;

        if ((err = __ip_route_output_key(rp, flp)) != 0)
                return err;
        return flp->proto ? xfrm_lookup((struct dst_entry **)rp, flp, sk, flags) : 0;
}

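/* Encode the route attached to @skb as a netlink route message: a
 * struct rtmsg header followed by RTA_* attributes (addresses, oif,
 * gateway, metrics and cache info).  Trims the skb and returns -1 if
 * the message does not fit.
 */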
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
                        int nowait)
{
        struct rtable *rt = (struct rtable *)skb->dst;
        struct rtmsg *r;
        struct nlmsghdr  *nlh;
        unsigned char    *b = skb->tail;
        struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
        struct rtattr *eptr;
#endif
        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
        r = NLMSG_DATA(nlh);
        nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
        r->rtm_family    = AF_INET;
        r->rtm_dst_len  = 32;
        r->rtm_src_len  = 0;
        r->rtm_tos      = rt->fl.fl4_tos;
        r->rtm_table    = RT_TABLE_MAIN;
        r->rtm_type     = rt->rt_type;
        r->rtm_scope    = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;
        RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
        if (rt->fl.fl4_src) {
                r->rtm_src_len = 32;
                RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
        }
        if (rt->u.dst.dev)
                RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
        if (rt->u.dst.tclassid)
                RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
#endif
        if (rt->fl.iif)
                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
        else if (rt->rt_src != rt->fl.fl4_src)
                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
        if (rt->rt_dst != rt->rt_gateway)
                RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
        if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
                goto rtattr_failure;
        ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
        ci.rta_used     = rt->u.dst.__use;
        ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
        if (rt->u.dst.expires)
                ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
        else
                ci.rta_expires = 0;
        ci.rta_error    = rt->u.dst.error;
        ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
        if (rt->peer) {
                ci.rta_id = rt->peer->ip_id_count;
                if (rt->peer->tcp_ts_stamp) {
                        ci.rta_ts = rt->peer->tcp_ts;
                        ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
                }
        }
#ifdef CONFIG_IP_MROUTE
        eptr = (struct rtattr *)skb->tail;
#endif
        RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
        if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
                u32 dst = rt->rt_dst;

                if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
                    ipv4_devconf.mc_forwarding) {
                        int err = ipmr_get_route(skb, r, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
                                                return 0;
                                        goto nlmsg_failure;
                                } else {
                                        if (err == -EMSGSIZE)
                                                goto nlmsg_failure;
                                        ((struct rta_cacheinfo *)RTA_DATA(eptr))->rta_error = err;
                                }
                        }
                } else
#endif
                        RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
        }

        nlh->nlmsg_len = skb->tail - b;
        return skb->len;

nlmsg_failure:
rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}

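/* RTM_GETROUTE handler: resolve the route for the address pair supplied
 * over netlink.  When an input interface is given, the lookup is done
 * as if a packet had arrived on it, by running ip_route_input() on a
 * dummy skb; otherwise an output lookup is performed.  The result is
 * sent back to the requesting socket via rt_fill_info().
 */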
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
        struct rtattr **rta = arg;
        struct rtmsg *rtm = NLMSG_DATA(nlh);
        struct rtable *rt = NULL;
        u32 dst = 0;
        u32 src = 0;
        int iif = 0;
        int err = -ENOBUFS;
        struct sk_buff *skb;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                goto out;

        /* Reserve room for dummy headers; this skb can pass
           through a good chunk of the routing engine.
         */
        skb->mac.raw = skb->data;
        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

        if (rta[RTA_SRC - 1])
                memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
        if (rta[RTA_DST - 1])
                memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
        if (rta[RTA_IIF - 1])
                memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));

        if (iif) {
                struct net_device *dev = __dev_get_by_index(iif);
                err = -ENODEV;
                if (!dev)
                        goto out_free;
                skb->protocol   = htons(ETH_P_IP);
                skb->dev        = dev;
                local_bh_disable();
                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
                local_bh_enable();
                rt = (struct rtable *)skb->dst;
                if (!err && rt->u.dst.error)
                        err = -rt->u.dst.error;
        } else {
                struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
                                                         .saddr = src,
                                                         .tos = rtm->rtm_tos } } };
                int oif = 0;
                if (rta[RTA_OIF - 1])
                        memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
                fl.oif = oif;
                err = ip_route_output_key(&rt, &fl);
        }
        if (err)
                goto out_free;

        skb->dst = &rt->u.dst;
        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;

        err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
                           RTM_NEWROUTE, 0);
        if (!err)
                goto out_free;
        if (err < 0) {
                err = -EMSGSIZE;
                goto out_free;
        }

        err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
        if (err > 0)
                err = 0;
out:    return err;

out_free:
        kfree_skb(skb);
        goto out;
}

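/* Netlink dump callback: walk every hash bucket and emit one route
 * message per cached entry, resuming from the bucket/index pair saved
 * in cb->args when a previous pass filled the skb.
 */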
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct rtable *rt;
        int h, s_h;
        int idx, s_idx;

        s_h = cb->args[0];
        s_idx = idx = cb->args[1];
        for (h = 0; h <= rt_hash_mask; h++) {
                if (h < s_h)
                        continue;
                if (h > s_h)
                        s_idx = 0;
                rcu_read_lock();
                for (rt = rt_hash_table[h].chain, idx = 0; rt;
                     rt = rt->u.rt_next, idx++) {
                        smp_read_barrier_depends();
                        if (idx < s_idx)
                                continue;
                        skb->dst = dst_clone(&rt->u.dst);
                        if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
                                         cb->nlh->nlmsg_seq,
                                         RTM_NEWROUTE, 1) <= 0) {
                                dst_release(xchg(&skb->dst, NULL));
                                rcu_read_unlock();
                                goto done;
                        }
                        dst_release(xchg(&skb->dst, NULL));
                }
                rcu_read_unlock();
        }

done:
        cb->args[0] = h;
        cb->args[1] = idx;
        return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(0);
}

#ifdef CONFIG_SYSCTL
static int flush_delay;

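/* Writing an integer to /proc/sys/net/ipv4/route/flush flushes the
 * route cache after that delay; a negative value falls back to the
 * default minimum delay inside rt_cache_flush().  Reading the file is
 * not supported and fails with -EINVAL.
 */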
static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
                                     struct file *filp, void __user *buffer,
                                     size_t *lenp)
{
        if (write) {
                proc_dointvec(ctl, write, filp, buffer, lenp);
                rt_cache_flush(flush_delay);
                return 0;
        }

        return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
                                              int __user *name,
                                              int nlen,
                                              void __user *oldval,
                                              size_t __user *oldlenp,
                                              void __user *newval,
                                              size_t newlen,
                                              void **context)
{
        int delay;
        if (newlen != sizeof(int))
                return -EINVAL;
        if (get_user(delay, (int __user *)newval))
                return -EFAULT;
        rt_cache_flush(delay);
        return 0;
}

ctl_table ipv4_route_table[] = {
        {
                .ctl_name       = NET_IPV4_ROUTE_FLUSH,
                .procname       = "flush",
                .data           = &flush_delay,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &ipv4_sysctl_rtcache_flush,
                .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
                .procname       = "min_delay",
                .data           = &ip_rt_min_delay,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
                .procname       = "max_delay",
                .data           = &ip_rt_max_delay,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
                .procname       = "secret_interval",
                .data           = &ip_rt_secret_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        { .ctl_name = 0 }
};
#endif

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)

#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
                           int length, int *eof, void *data)
{
        unsigned int i;

        if ((offset & 3) || (length & 3))
                return -EIO;

        if (offset >= sizeof(struct ip_rt_acct) * 256) {
                *eof = 1;
                return 0;
        }

        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
                length = sizeof(struct ip_rt_acct) * 256 - offset;
                *eof = 1;
        }

        offset /= sizeof(u32);

        if (length > 0) {
                u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
                u32 *dst = (u32 *) buffer;

                /* Copy first cpu. */
                *start = buffer;
                memcpy(dst, src, length);

                /* Add the other cpus in, one int at a time */
                for_each_cpu(i) {
                        unsigned int j;

                        if (i == 0)     /* cpu 0 was copied above */
                                continue;

                        src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

                        for (j = 0; j < length/4; j++)
                                dst[j] += src[j];
                }
        }
        return length;
}
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
        if (!str)
                return 0;
        rhash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("rhash_entries=", set_rhash_entries);

int __init ip_rt_init(void)
{
        int i, order, goal, rc = 0;

        rt_hash_rnd = (int) ((num_physpages ^ (num_physpages >> 8)) ^
                             (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
        for (order = 0;
             (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
                /* NOTHING */;
        ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
        memset(ip_rt_acct, 0, PAGE_SIZE << order);
#endif

        ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
                                                     sizeof(struct rtable),
                                                     0, SLAB_HWCACHE_ALIGN,
                                                     NULL, NULL);

        if (!ipv4_dst_ops.kmem_cachep)
                panic("IP: failed to allocate ip_dst_cache\n");

        goal = num_physpages >> (26 - PAGE_SHIFT);
        if (rhash_entries)
                goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
        for (order = 0; (1UL << order) < goal; order++)
                /* NOTHING */;

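        /* The goal computed above sizes the table at roughly one page of
         * buckets per 64MB of physical memory, unless rhash_entries=
         * overrides it.  The loop below allocates the pages, rounding
         * the bucket count down to a power of two so that hash values
         * can be masked rather than divided, and retrying with half as
         * many pages when an allocation fails.
         */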
        do {
                rt_hash_mask = (1UL << order) * PAGE_SIZE /
                        sizeof(struct rt_hash_bucket);
                while (rt_hash_mask & (rt_hash_mask - 1))
                        rt_hash_mask--;
                rt_hash_table = (struct rt_hash_bucket *)
                        __get_free_pages(GFP_ATOMIC, order);
        } while (rt_hash_table == NULL && --order > 0);

        if (!rt_hash_table)
                panic("Failed to allocate IP route cache hash table\n");

        printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
               rt_hash_mask,
               (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);

        for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
                /* NOTHING */;

        rt_hash_mask--;
        for (i = 0; i <= rt_hash_mask; i++) {
                rt_hash_table[i].lock = SPIN_LOCK_UNLOCKED;
                rt_hash_table[i].chain = NULL;
        }

        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
        ip_rt_max_size = (rt_hash_mask + 1) * 16;

        rt_cache_stat = alloc_percpu(struct rt_cache_stat);
        if (!rt_cache_stat)
                return -ENOMEM;

        devinet_init();
        ip_fib_init();

        init_timer(&rt_flush_timer);
        rt_flush_timer.function = rt_run_flush;
        init_timer(&rt_periodic_timer);
        rt_periodic_timer.function = rt_check_expire;
        init_timer(&rt_secret_timer);
        rt_secret_timer.function = rt_secret_rebuild;

        /* All the timers started at system startup tend
           to synchronize.  Perturb them a bit.
         */
        rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
                                    ip_rt_gc_interval;
        add_timer(&rt_periodic_timer);

        rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
                                  ip_rt_secret_interval;
        add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
        if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
            !proc_net_fops_create("rt_cache_stat", S_IRUGO, &rt_cpu_seq_fops)) {
                free_percpu(rt_cache_stat);
                return -ENOMEM;
        }

#ifdef CONFIG_NET_CLS_ROUTE
        create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        return rc;
}

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);