/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

int ip_rt_min_delay             = 2 * HZ;
int ip_rt_max_delay             = 10 * HZ;
int ip_rt_max_size;
int ip_rt_gc_timeout            = RT_GC_TIMEOUT;
int ip_rt_gc_interval           = 60 * HZ;
int ip_rt_gc_min_interval       = HZ / 2;
int ip_rt_redirect_number       = 9;
int ip_rt_redirect_load         = HZ / 50;
int ip_rt_redirect_silence      = ((HZ / 50) << (9 + 1));
int ip_rt_error_cost            = HZ;
int ip_rt_error_burst           = 5 * HZ;
int ip_rt_gc_elasticity         = 8;
int ip_rt_mtu_expires           = 10 * 60 * HZ;
int ip_rt_min_pmtu              = 512 + 20 + 20;
int ip_rt_min_advmss            = 256;
int ip_rt_secret_interval       = 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
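
/* Usage sketch: this table is indexed by the four TOS bits shifted right
 * by one, so a TOS value and its low-bit (ECN_OR_COST) twin map to the
 * same queueing priority.  The helper in include/net/route.h looks
 * roughly like:
 *
 *      static inline char rt_tos2priority(u8 tos)
 *      {
 *              return ip_tos2prio[IPTOS_TOS(tos)>>1];
 *      }
 */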


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
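
/* Illustrative reader-side sketch (not part of the original file): a
 * lock-free lookup under this scheme walks a chain inside an RCU read
 * section and takes a reference before leaving it, much like the lookup
 * loops later in this file:
 *
 *      rcu_read_lock();
 *      for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
 *              smp_read_barrier_depends();
 *              if (compare_keys(&rth->fl, &key)) {
 *                      dst_hold(&rth->u.dst);
 *                      break;
 *              }
 *      }
 *      rcu_read_unlock();
 */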

struct rt_hash_bucket {
        struct rtable   *chain;
        spinlock_t      lock;
} __attribute__((__aligned__(8)));

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
        return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
                & rt_hash_mask);
}
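
/* Note: call sites below fold the interface index into the source key
 * before hashing, e.g. in ip_route_input_mc():
 *
 *      hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
 *
 * so that otherwise-identical flows on different interfaces land in
 * different buckets.
 */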

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rtable *r = NULL;
        struct rt_cache_iter_state *st = seq->private;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                rcu_read_lock();
                r = rt_hash_table[st->bucket].chain;
                if (r)
                        break;
                rcu_read_unlock();
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
        struct rt_cache_iter_state *st = seq->private;

        smp_read_barrier_depends();
        r = r->u.rt_next;
        while (!r) {
                rcu_read_unlock();
                if (--st->bucket < 0)
                        break;
                rcu_read_lock();
                r = rt_hash_table[st->bucket].chain;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r = NULL;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                char temp[256];

                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst);
                seq_printf(seq, "%-127s\n", temp);
        }
        return 0;
}

static struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;
        rc = seq_open(file, &rt_cache_seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
        memset(s, 0, sizeof(*s));
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos + 1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
        call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively if they
           collide in the hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
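
/* Worked example: an unreferenced output-route entry (fl.iif == 0) used
 * this very jiffy and not otherwise valuable scores
 *
 *      (~(u32)0 & ~(3<<30)) | (1<<30) == 0x7fffffff
 *
 * while the same entry idle for 1000 jiffies scores 0x7ffffc17;
 * rt_intern_hash() below uses the minimum score to pick which
 * unreferenced entry to drop when a chain grows too long, so the oldest
 * and least valuable one goes first.
 */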

/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
        static int rover;
        int i = rover, t;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;

        for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
             t -= ip_rt_gc_timeout) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                spin_lock(&rt_hash_table[i].lock);
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
                        *rthp = rth->u.rt_next;
                        rt_free(rth);
                }
                spin_unlock(&rt_hash_table[i].lock);

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                spin_lock_bh(&rt_hash_table[i].lock);
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                spin_unlock_bh(&rt_hash_table[i].lock);

                for (; rth; rth = next) {
                        next = rth->u.rt_next;
                        rt_free(rth);
                }
        }
}

static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached, prolong the timer
                   to "delay", otherwise fire it at the deadline time.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   expire is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */

static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * do not run it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate the number of entries we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache
                 * really aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(&rt_hash_table[k].lock);
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                goal--;
                        }
                        spin_unlock_bh(&rt_hash_table[k].lock);
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* The goal was not achieved. We stop the process if:

                   - expire was reduced to zero (otherwise expire is halved);
                   - the table is not full;
                   - we are called from interrupt context.
                   The jiffies check is just a fallback/debug loop breaker;
                   we will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
               fl1->oif     == fl2->oif &&
               fl1->iif     == fl2->iif;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(&rt_hash_table[hash].lock);
        while ((rth = *rthp) != NULL) {
                if (compare_keys(&rth->fl, &rt->fl)) {
                        /* Put it first */
                        *rthp = rth->u.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        smp_wmb();
                        rth->u.rt_next = rt_hash_table[hash].chain;
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        smp_wmb();
                        rt_hash_table[hash].chain = rth;

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when it is exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind the route to an ARP neighbour only if it is an
           output route or on the unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* The neighbour tables are full and nothing
                           can be released. Try to shrink the route cache;
                           it most likely holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        spin_unlock_bh(&rt_hash_table[hash].lock);
        *rp = rt;
        return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
        static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If a peer is attached to the destination, it is never
                   detached, so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer, more));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        spin_lock_bh(&rt_hash_table[hash].lock);
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.rt_next;
                        rt_free(rt);
                        break;
                }
        spin_unlock_bh(&rt_hash_table[hash].lock);
}

void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, u8 tos, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };

        tos &= IPTOS_RT_MASK;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5),
                                                     tos);

                        rthp = &rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = *rthp) != NULL) {
                                struct rtable *rt;

                                smp_read_barrier_depends();
                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.fl4_tos != tos ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
                        "tos %02x\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable*)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash_code(rt->fl.fl4_dst,
                                                     rt->fl.fl4_src ^
                                                        (rt->fl.oif << 5),
                                                     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
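
/* Worked example of the backoff (assuming HZ == 100 and the defaults
 * above): ip_rt_redirect_load is HZ/50 == 2 jiffies, so successive
 * redirects are gated at least 2, 4, 8, ... jiffies apart as rate_tokens
 * grows.  After ip_rt_redirect_number (9) redirects we go silent;
 * ip_rt_redirect_silence == (HZ/50) << 10 == 2048 jiffies (about 20
 * seconds) without redirect-worthy packets resets rate_tokens and the
 * cycle restarts.
 */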

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything
         * and set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
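
/* Worked examples: guess_mtu() returns the first plateau strictly below
 * old_mtu, so guess_mtu(1500) == 1492 and guess_mtu(576) == 296; any
 * old_mtu at or below the last plateau (128) falls through to the IPv4
 * minimum of 68.
 */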

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
        int i;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        u32  skeys[2] = { iph->saddr, 0, };
        u32  daddr = iph->daddr;
        u8   tos = iph->tos & IPTOS_RT_MASK;
        unsigned short est_mtu = 0;

        if (ipv4_config.no_pmtu_disc)
                return 0;

        for (i = 0; i < 2; i++) {
                unsigned hash = rt_hash_code(daddr, skeys[i], tos);

                rcu_read_lock();
                for (rth = rt_hash_table[hash].chain; rth;
                     rth = rth->u.rt_next) {
                        smp_read_barrier_depends();
                        if (rth->fl.fl4_dst == daddr &&
                            rth->fl.fl4_src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
                            rth->fl.fl4_tos == tos &&
                            rth->fl.iif == 0 &&
                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
                                unsigned short mtu = new_mtu;

                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
                                        if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
                                                dst_confirm(&rth->u.dst);
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                        ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                }
                rcu_read_unlock();
        }
        return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
        if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
            !(dst_metric_locked(dst, RTAX_MTU))) {
                if (mtu < ip_rt_min_pmtu) {
                        mtu = ip_rt_min_pmtu;
                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
                }
                dst->metrics[RTAX_MTU-1] = mtu;
                dst_set_expires(dst, ip_rt_mtu_expires);
        }
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        dst_release(dst);
        return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;
        struct inet_peer *peer = rt->peer;

        if (peer) {
                rt->peer = NULL;
                inet_putpeer(peer);
        }
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = (struct rtable *) skb->dst;
        if (rt)
                dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
                NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
                skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
        u32 src;
        struct fib_result res;

        if (rt->fl.iif == 0)
                src = rt->rt_src;
        else if (fib_lookup(&rt->fl, &res) == 0) {
#ifdef CONFIG_IP_ROUTE_NAT
                if (res.type == RTN_NAT)
                        src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
                                                RT_SCOPE_UNIVERSE);
                else
#endif
                        src = FIB_RES_PREFSRC(res);
                fib_res_put(&res);
        } else
                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
                                        RT_SCOPE_UNIVERSE);
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->u.dst.tclassid & 0xFFFF))
                rt->u.dst.tclassid |= tag & 0xFFFF;
        if (!(rt->u.dst.tclassid & 0xFFFF0000))
                rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
        struct fib_info *fi = res->fi;

        if (fi) {
                if (FIB_RES_GW(*res) &&
                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
                        rt->rt_gateway = FIB_RES_GW(*res);
                memcpy(rt->u.dst.metrics, fi->fib_metrics,
                       sizeof(rt->u.dst.metrics));
                if (fi->fib_mtu == 0) {
                        rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
                        if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
                            rt->rt_gateway != rt->rt_dst &&
                            rt->u.dst.dev->mtu > 576)
                                rt->u.dst.metrics[RTAX_MTU-1] = 576;
                }
#ifdef CONFIG_NET_CLS_ROUTE
                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
        } else
                rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;

        if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
        if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
                rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
        if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
                rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
                                       ip_rt_min_advmss);
        if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
                rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, fib_rules_tclass(res));
#endif
        set_class_tag(rt, itag);
#endif
        rt->rt_type = res->type;
}

static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        unsigned hash;
        struct rtable *rth;
        u32 spec_dst;
        struct in_device *in_dev = in_dev_get(dev);
        u32 itag = 0;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ZERONET(saddr)) {
                if (!LOCAL_MCAST(daddr))
                        goto e_inval;
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
        } else if (fib_validate_source(saddr, 0, tos, 0,
                                        dev, &spec_dst, &itag) < 0)
                goto e_inval;

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        rth->u.dst.output = ip_rt_bug;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags = DST_HOST;
        if (in_dev->cnf.no_policy)
                rth->u.dst.flags |= DST_NOPOLICY;
        rth->fl.fl4_dst = daddr;
        rth->rt_dst     = daddr;
        rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->fl.fl4_fwmark = skb->nfmark;
#endif
        rth->fl.fl4_src = saddr;
        rth->rt_src     = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = daddr;
        rth->rt_src_map = saddr;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
        rth->u.dst.tclassid = itag;
#endif
        rth->rt_iif     =
        rth->fl.iif     = dev->ifindex;
        rth->u.dst.dev  = &loopback_dev;
        dev_hold(rth->u.dst.dev);
        rth->fl.oif     = 0;
        rth->rt_gateway = daddr;
        rth->rt_spec_dst = spec_dst;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_flags   = RTCF_MULTICAST;
        if (our) {
                rth->u.dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->u.dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        in_dev_put(in_dev);
        hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
        return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
        in_dev_put(in_dev);
        return -ENOBUFS;

e_inval:
        in_dev_put(in_dev);
        return -EINVAL;
}
1517
1518 /*
1519  *      NOTE. We drop all the packets that has local source
1520  *      addresses, because every properly looped back packet
1521  *      must have correct destination already attached by output routine.
1522  *
1523  *      Such approach solves two big problems:
1524  *      1. Not simplex devices are handled properly.
1525  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1526  */
1527
static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct in_device *out_dev = NULL;
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = skb->nfmark
#endif
				      } },
			    .iif = dev->ifindex };
	unsigned flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	unsigned hash;
	u32 spec_dst;
	int err = -EINVAL;
	int free_res = 0;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);

	/* Check for the weirdest martians, which cannot be detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for the limited broadcast;
	 * it is not clear whether this should be fixed. Waiting for
	 * complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_inval;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

#ifdef CONFIG_IP_ROUTE_NAT
	/* Policy is applied before mapping the destination, but
	   rerouting after the mapping must be done with the old
	   source.
	 */

	if (1) {
		u32 src_map = saddr;
		if (res.r)
			src_map = fib_rules_policy(saddr, &res, &flags);

		if (res.type == RTN_NAT) {
			fl.fl4_dst = fib_rules_map_destination(daddr, &res);
			fib_res_put(&res);
			free_res = 0;
			if (fib_lookup(&fl, &res))
				goto e_inval;
			free_res = 1;
			if (res.type != RTN_UNICAST)
				goto e_inval;
			flags |= RTCF_DNAT;
		}
		fl.fl4_src = src_map;
	}
#endif

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_inval;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
#endif
	out_dev = in_dev_get(FIB_RES_DEV(res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
					 "Please report it.\n");
		goto e_inval;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
				  &spec_dst, &itag);
	if (err < 0)
		goto martian_source;

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP).  Do not create a route if it
		 * would be invalid for proxy ARP.  DNAT routes are
		 * always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT))
			goto e_inval;
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark = skb->nfmark;
#endif
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
	rth->rt_gateway = daddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_src_map = fl.fl4_src;
	rth->rt_dst_map = fl.fl4_dst;
	if (flags & RTCF_DNAT)
		rth->rt_gateway = fl.fl4_dst;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	rth->u.dst.dev = out_dev->dev;
	dev_hold(rth->u.dst.dev);
	rth->fl.oif = 0;
	rth->rt_spec_dst = spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, &res, itag);

	rth->rt_flags = flags;

#ifdef CONFIG_NET_FASTROUTE
	if (netdev_fastroute && !(flags & (RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
		struct net_device *odev = rth->u.dst.dev;
		if (odev != dev &&
		    dev->accept_fastpath &&
		    odev->mtu >= dev->mtu &&
		    dev->accept_fastpath(dev, &rth->u.dst) == 0)
			rth->rt_flags |= RTCF_FAST;
	}
#endif

intern:
	err = rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);
done:
	in_dev_put(in_dev);
	if (out_dev)
		in_dev_put(out_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

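	/* Broadcast input: validate the source address, then fall
	 * through to local delivery below.
	 */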
brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

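	/* Build a cache entry that delivers to the local stack; for an
	 * unreachable destination the entry reports the error instead.
	 */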
local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark = skb->nfmark;
#endif
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map = fl.fl4_dst;
	rth->rt_src_map = fl.fl4_src;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	rth->u.dst.dev = &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->rt_gateway = daddr;
	rth->rt_spec_dst = spec_dst;
	rth->u.dst.input = ip_local_deliver;
	rth->rt_flags = flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input = ip_error;
		rth->u.dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	rth->rt_type = res.type;
	goto intern;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif
e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	Per the RFC1812 recommendation: if the source
		 *	is martian, the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
			"%u.%u.%u.%u, on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len) {
			int i;
			unsigned char *p = skb->mac.raw;
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
	goto e_inval;
}

int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);

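	/* Probe the cache first.  The hash chain is walked under
	 * rcu_read_lock(), so no per-bucket lock is taken for lookups.
	 */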
	rcu_read_lock();
	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
		smp_read_barrier_depends();
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == skb->nfmark &&
#endif
		    rth->fl.fl4_tos == tos) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry *)rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic was moved from the route cache
	   to here.  The problem was that too many Ethernet cards have
	   broken/missing hardware multicast filters :-(  As a result
	   a host on a multicast network may acquire a lot of useless
	   route cache entries, e.g. for SDR messages from all over the
	   world.  Now we try to get rid of them.  Really, provided the
	   software IP multicast filter is organized reasonably (at
	   least, hashed), it does not result in a slowdown compared
	   with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		read_lock(&inetdev_lock);
		if ((in_dev = __in_dev_get(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
				skb->nh.iph->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				read_unlock(&inetdev_lock);
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		read_unlock(&inetdev_lock);
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}

/*
 * Major route resolver routine.
 */

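/* Resolve an output route from scratch: validate and pick the source
 * address and output device, consult the FIB, then build and hash a
 * new cache entry.
 */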
static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
{
	u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = oldflp->fl4_fwmark
#endif
				      } },
			    .iif = loopback_dev.ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct rtable *rth;
	struct net_device *dev_out = NULL;
	struct in_device *in_dev = NULL;
	unsigned hash;
	int free_res = 0;
	int err;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (MULTICAST(oldflp->fl4_src) ||
		    BADCLASS(oldflp->fl4_src) ||
		    ZERONET(oldflp->fl4_src))
			goto out;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(saddr) can return the wrong interface
		      if saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with a
		      saddr of another interface. --ANK
		 */

		if (oldflp->oif == 0
		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary
			   interface without fiddling with
			   IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set the ttl
			   to zero and expect that it will work.
			   From the viewpoint of the routing cache
			   they are broken: we are not allowed to
			   build a multicast path with a loopback
			   source address (the routing cache cannot
			   know that the ttl is zero, hence that the
			   packet will not leave this host, making the
			   route valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}
	if (oldflp->oif) {
		dev_out = dev_get_by_index(oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;
		if (__in_dev_get(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (MULTICAST(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = loopback_dev.ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, the routing tables are wrong.
			   Assume that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses.  When oif is specified, the
			   routing tables are looked up with only one
			   purpose: to catch whether the destination
			   is gatewayed rather than direct.  Moreover,
			   if MSG_DONTROUTE is set, we send the packet
			   ignoring both the routing tables and the
			   ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_NAT)
		goto e_inval;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;

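	/* Final sanity checks on the chosen flow, then construction
	 * of the new cache entry.
	 */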
make_route:
	if (LOOPBACK(fl.fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		goto e_inval;

	if (fl.fl4_dst == 0xFFFFFFFF)
		res.type = RTN_BROADCAST;
	else if (MULTICAST(fl.fl4_dst))
		res.type = RTN_MULTICAST;
	else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
		goto e_inval;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		goto e_inval;

	if (res.type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res.fi) {
			fib_info_put(res.fi);
			res.fi = NULL;
		}
	} else if (res.type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the
		   default one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
		if (res.fi && res.prefixlen < 4) {
			fib_info_put(res.fi);
			res.fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = oldflp->fl4_dst;
	rth->fl.fl4_tos = tos;
	rth->fl.fl4_src = oldflp->fl4_src;
	rth->fl.oif = oldflp->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark = oldflp->fl4_fwmark;
#endif
	rth->rt_dst = fl.fl4_dst;
	rth->rt_src = fl.fl4_src;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map = fl.fl4_dst;
	rth->rt_src_map = fl.fl4_src;
#endif
	rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
	rth->u.dst.dev = dev_out;
	dev_hold(dev_out);
	rth->rt_gateway = fl.fl4_dst;
	rth->rt_spec_dst = fl.fl4_src;

	rth->u.dst.output = ip_output;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl.fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl.fl4_src;
		if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res.type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !LOCAL_MCAST(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, &res, 0);

	rth->rt_flags = flags;

	hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
	err = rt_intern_hash(hash, rth, rp);
done:
	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
	if (in_dev)
		in_dev_put(in_dev);
out:	return err;

e_inval:
	err = -EINVAL;
	goto done;
e_nobufs:
	err = -ENOBUFS;
	goto done;
}

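/* Fast path for output routes: probe the cache under RCU and fall
 * back to ip_route_output_slow() on a miss.
 */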
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);

	rcu_read_lock();
	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
		smp_read_barrier_depends();
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
#endif
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK))) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock();

	return ip_route_output_slow(rp, flp);
}

int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
	int err;

	if ((err = __ip_route_output_key(rp, flp)) != 0)
		return err;
	return flp->proto ? xfrm_lookup((struct dst_entry **)rp, flp, NULL, 0) : 0;
}

int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(rp, flp)) != 0)
		return err;
	return flp->proto ? xfrm_lookup((struct dst_entry **)rp, flp, sk, flags) : 0;
}

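/* Fill a netlink message describing the route cache entry attached
 * to skb->dst; returns skb->len on success, or -1 if the attributes
 * did not fit.
 */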
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;
	struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
	struct rtattr *eptr;
#endif
	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
	r = NLMSG_DATA(nlh);
	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->fl.fl4_tos;
	r->rtm_table = RT_TABLE_MAIN;
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
	if (rt->rt_dst != rt->rt_gateway)
		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto rtattr_failure;
	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
	ci.rta_used = rt->u.dst.__use;
	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
	if (rt->u.dst.expires)
		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
	else
		ci.rta_expires = 0;
	ci.rta_error = rt->u.dst.error;
	ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
	if (rt->peer) {
		ci.rta_id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ci.rta_ts = rt->peer->tcp_ts;
			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
		}
	}
#ifdef CONFIG_IP_MROUTE
	eptr = (struct rtattr *)skb->tail;
#endif
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		u32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
		    ipv4_devconf.mc_forwarding) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nlmsg_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nlmsg_failure;
					((struct rta_cacheinfo *)RTA_DATA(eptr))->rta_error = err;
				}
			}
		} else
#endif
			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
	}

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

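/* Handle RTM_GETROUTE: resolve the requested flow through the input
 * or output path and report the resulting cache entry via netlink.
 */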
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct rtattr **rta = arg;
	struct rtmsg *rtm = NLMSG_DATA(nlh);
	struct rtable *rt = NULL;
	u32 dst = 0;
	u32 src = 0;
	int iif = 0;
	int err = -ENOBUFS;
	struct sk_buff *skb;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		goto out;

	/* Reserve room for dummy headers; this skb can pass through
	   a good chunk of the routing engine.
	 */
	skb->mac.raw = skb->data;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	if (rta[RTA_SRC - 1])
		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
	if (rta[RTA_DST - 1])
		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
	if (rta[RTA_IIF - 1])
		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));

	if (iif) {
		struct net_device *dev = __dev_get_by_index(iif);
		err = -ENODEV;
		if (!dev)
			goto out_free;
		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();
		rt = (struct rtable *)skb->dst;
		if (!err && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
							 .saddr = src,
							 .tos = rtm->rtm_tos } } };
		int oif = 0;
		if (rta[RTA_OIF - 1])
			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
		fl.oif = oif;
		err = ip_route_output_key(&rt, &fl);
	}
	if (err)
		goto out_free;

	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
				RTM_NEWROUTE, 0);
	if (!err)
		goto out_free;
	if (err < 0) {
		err = -EMSGSIZE;
		goto out_free;
	}

	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
	if (err > 0)
		err = 0;
out:	return err;

out_free:
	kfree_skb(skb);
	goto out;
}

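/* Dump the whole route cache to netlink.  cb->args[0] holds the hash
 * bucket and cb->args[1] the chain index, so an interrupted dump can
 * be resumed where it left off.
 */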
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;

	s_h = cb->args[0];
	s_idx = idx = cb->args[1];
	for (h = 0; h <= rt_hash_mask; h++) {
		if (h < s_h)
			continue;
		if (h > s_h)
			s_idx = 0;
		rcu_read_lock();
		for (rt = rt_hash_table[h].chain, idx = 0; rt;
		     rt = rt->u.rt_next, idx++) {
			smp_read_barrier_depends();
			if (idx < s_idx)
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq,
					 RTM_NEWROUTE, 1) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}

#ifdef CONFIG_SYSCTL
static int flush_delay;

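/* Writing an integer to /proc/sys/net/ipv4/route/flush triggers an
 * immediate cache flush; the value written is the delay passed to
 * rt_cache_flush().  Reading the file is not supported.
 */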
static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
					struct file *filp, void *buffer,
					size_t *lenp)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp);
		rt_cache_flush(flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name,
						int nlen, void *oldval,
						size_t *oldlenp, void *newval,
						size_t newlen, void **context)
{
	int delay;
	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int *)newval))
		return -EFAULT;
	rt_cache_flush(delay);
	return 0;
}

ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
		.procname	= "min_delay",
		.data		= &ip_rt_min_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
		.procname	= "max_delay",
		.data		= &ip_rt_max_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
#endif

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)

#ifdef CONFIG_PROC_FS
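/* /proc read handler: sum the per-cpu accounting counters into the
 * caller's buffer, one 32-bit word at a time.
 */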
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
		u32 *dst = (u32 *) buffer;

		/* Copy first cpu. */
		*start = buffer;
		memcpy(dst, src, length);

		/* Add the other cpus in, one int at a time */
		for_each_cpu(i) {
			unsigned int j;

			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */

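/* "rhash_entries=N" on the kernel command line overrides the
 * automatic sizing of the routing cache hash table.
 */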
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

int __init ip_rt_init(void)
{
	int i, order, goal, rc = 0;

	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages >> 8)) ^
			     (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
	for (order = 0;
	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
		/* NOTHING */;
	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
	memset(ip_rt_acct, 0, PAGE_SIZE << order);
#endif

	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
						     sizeof(struct rtable),
						     0, SLAB_HWCACHE_ALIGN,
						     NULL, NULL);

	if (!ipv4_dst_ops.kmem_cachep)
		panic("IP: failed to allocate ip_dst_cache\n");

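	/* Size the hash table: by default roughly one page of buckets
	 * per 64 MB of physical memory, unless rhash_entries=
	 * overrides it; round up to a power-of-two page order and
	 * fall back to smaller orders if the allocation fails.
	 */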
	goal = num_physpages >> (26 - PAGE_SHIFT);
	if (rhash_entries)
		goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
	for (order = 0; (1UL << order) < goal; order++)
		/* NOTHING */;

	do {
		rt_hash_mask = (1UL << order) * PAGE_SIZE /
			sizeof(struct rt_hash_bucket);
		while (rt_hash_mask & (rt_hash_mask - 1))
			rt_hash_mask--;
		rt_hash_table = (struct rt_hash_bucket *)
			__get_free_pages(GFP_ATOMIC, order);
	} while (rt_hash_table == NULL && --order > 0);

	if (!rt_hash_table)
		panic("Failed to allocate IP route cache hash table\n");

	printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
	       rt_hash_mask,
	       (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);

	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
		/* NOTHING */;

	rt_hash_mask--;
	for (i = 0; i <= rt_hash_mask; i++) {
		rt_hash_table[i].lock = SPIN_LOCK_UNLOCKED;
		rt_hash_table[i].chain = NULL;
	}

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
	if (!rt_cache_stat)
		return -ENOMEM;

	devinet_init();
	ip_fib_init();

	init_timer(&rt_flush_timer);
	rt_flush_timer.function = rt_run_flush;
	init_timer(&rt_periodic_timer);
	rt_periodic_timer.function = rt_check_expire;
	init_timer(&rt_secret_timer);
	rt_secret_timer.function = rt_secret_rebuild;

	/* All timers started at system startup tend to synchronize.
	 * Perturb them a bit.
	 */
	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
					ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
		ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
	    !proc_net_fops_create("rt_cache_stat", S_IRUGO, &rt_cpu_seq_fops)) {
		free_percpu(rt_cache_stat);
		return -ENOMEM;
	}

#ifdef CONFIG_NET_CLS_ROUTE
	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	return rc;
}

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);