/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

int ip_rt_min_delay             = 2 * HZ;
int ip_rt_max_delay             = 10 * HZ;
int ip_rt_max_size;
int ip_rt_gc_timeout            = RT_GC_TIMEOUT;
int ip_rt_gc_interval           = 60 * HZ;
int ip_rt_gc_min_interval       = HZ / 2;
int ip_rt_redirect_number       = 9;
int ip_rt_redirect_load         = HZ / 50;
int ip_rt_redirect_silence      = ((HZ / 50) << (9 + 1));
int ip_rt_error_cost            = HZ;
int ip_rt_error_burst           = 5 * HZ;
int ip_rt_gc_elasticity         = 8;
int ip_rt_mtu_expires           = 10 * 60 * HZ;
int ip_rt_min_pmtu              = 512 + 20 + 20;
int ip_rt_min_advmss            = 256;
int ip_rt_secret_interval       = 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
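
/*
 * Illustrative sketch (not part of the original file): the table is
 * indexed by the four RFC 1349 TOS bits, each odd slot being the
 * ECN_OR_COST() twin of the class before it.  Assuming the
 * rt_tos2priority() helper from include/net/route.h of this era, the
 * mapping is consumed roughly like this:
 */
#if 0	/* illustration only */
static inline char example_tos2priority(u8 tos)
{
	/* IPTOS_TOS() keeps bits 1..4 of the TOS byte; >> 1 turns them
	 * into an index 0..15 into ip_tos2prio[]. */
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}
/* e.g. example_tos2priority(IPTOS_LOWDELAY) == TC_PRIO_INTERACTIVE */
#endif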


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
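
/*
 * A minimal reader-side sketch (illustration, assuming compare_keys() as
 * defined later in this file): lookups walk a bucket chain under
 * rcu_read_lock_bh() and take their reference with an atomic increment
 * (dst_hold) before leaving the read-side critical section.
 */
#if 0	/* illustration only */
static struct rtable *example_lookup(unsigned hash, struct flowi *key)
{
	struct rtable *rth;

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.rt_next)) {
		if (compare_keys(&rth->fl, key)) {
			dst_hold(&rth->u.dst);	/* reader-side reference */
			break;
		}
	}
	rcu_read_unlock_bh();
	return rth;
}
#endif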

struct rt_hash_bucket {
        struct rtable   *chain;
        spinlock_t      lock;
} __attribute__((__aligned__(8)));

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
        return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
                & rt_hash_mask);
}
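
/*
 * Usage note (illustration): callers fold the interface index into the
 * source key, e.g. rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos),
 * and rt_hash_rnd -- reseeded on every cache flush by rt_run_flush() --
 * keeps remote hosts from predicting bucket placement and flooding a
 * single chain.
 */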

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rtable *r = NULL;
        struct rt_cache_iter_state *st = seq->private;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
                if (r)
                        break;
                rcu_read_unlock_bh();
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
        struct rt_cache_iter_state *st = seq->private;

        r = r->u.rt_next;
        while (!r) {
                rcu_read_unlock_bh();
                if (--st->bucket < 0)
                        break;
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r = NULL;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                char temp[256];

                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst);
                seq_printf(seq, "%-127s\n", temp);
        }
        return 0;
}

static struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;
        rc = seq_open(file, &rt_cache_seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
        memset(s, 0, sizeof(*s));
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#endif /* CONFIG_PROC_FS */
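
/*
 * Usage note (an assumption: ip_rt_init() registers these seq_files
 * elsewhere as /proc/net/rt_cache and /proc/net/stat/rt_cache):
 *
 *	cat /proc/net/rt_cache        # one line per cached route
 *	cat /proc/net/stat/rt_cache   # one counter line per possible CPU
 */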

static __inline__ void rt_free(struct rtable *rt)
{
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
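
/*
 * Worked example (illustration): an unreferenced output route last used
 * 10 seconds ago, neither redirected nor expiring, scores
 * (~(10 * HZ) & ~(3 << 30)) | (1 << 30): bit 31 clear, bit 30 set
 * ("not quite useless"), and low bits that decay with age, so
 * rt_intern_hash() below evicts the oldest, least valuable candidate.
 */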

/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
        static int rover;
        int i = rover, t;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;

        for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
             t -= ip_rt_gc_timeout) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                spin_lock(&rt_hash_table[i].lock);
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
                        *rthp = rth->u.rt_next;
                        rt_free(rth);
                }
                spin_unlock(&rt_hash_table[i].lock);

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                spin_lock_bh(&rt_hash_table[i].lock);
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                spin_unlock_bh(&rt_hash_table[i].lock);

                for (; rth; rth = next) {
                        next = rth->u.rt_next;
                        rt_free(rth);
                }
        }
}

static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached, prolong the timer
                   to "delay", otherwise fire it at the deadline time.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */
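
/*
 * A condensed sketch of the goal computation below (illustration only;
 * the real function also adapts "equilibrium" and "expire" over time):
 */
#if 0	/* illustration only */
	int goal = atomic_read(&ipv4_dst_ops.entries) -
		   (ip_rt_gc_elasticity << rt_hash_log);
	/* goal <= 0: the cache is comfortably sized, trim gently toward
	 * gc_thresh; goal > 0: over budget, free about half the excess
	 * per pass, halving "expire" after each pass that misses. */
#endif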

static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * do not make it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate number of entries, which we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in dangerous area. Try to reduce cache really
                 * aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(&rt_hash_table[k].lock);
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                goal--;
                        }
                        spin_unlock_bh(&rt_hash_table[k].lock);
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* Goal is not achieved. We stop the process if:

                   - expire is reduced to zero; otherwise expire is halved.
                   - the table is not full.
                   - we are called from interrupt context.
                   - the jiffies check is just a fallback/debug loop breaker;
                     we will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
               fl1->oif     == fl2->oif &&
               fl1->iif     == fl2->iif;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(&rt_hash_table[hash].lock);
        while ((rth = *rthp) != NULL) {
                if (compare_keys(&rth->fl, &rt->fl)) {
                        /* Put it first */
                        *rthp = rth->u.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        smp_wmb();
                        rth->u.rt_next = rt_hash_table[hash].chain;
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        smp_wmb();
                        rt_hash_table[hash].chain = rth;

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when it is exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind route to arp only if it is output
           route or unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* Neighbour tables are full and nothing
                           can be released. Try to shrink route cache,
                           it is most likely it holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        spin_unlock_bh(&rt_hash_table[hash].lock);
        *rp = rt;
        return 0;
}
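
/*
 * Caller pattern (illustration, mirroring the real call sites below): on
 * a successful return *rp holds a referenced entry -- either rt itself,
 * or an equivalent cached entry, in which case rt was already dropped.
 */
#if 0	/* illustration only */
	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
	err = rt_intern_hash(hash, rth, (struct rtable **) &skb->dst);
#endif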

void rt_bind_peer(struct rtable *rt, int create)
{
        static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If peer is attached to destination, it is never detached,
                   so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer, more));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        spin_lock_bh(&rt_hash_table[hash].lock);
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.rt_next;
                        rt_free(rt);
                        break;
                }
        spin_unlock_bh(&rt_hash_table[hash].lock);
}

void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, u8 tos, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };

        tos &= IPTOS_RT_MASK;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5),
                                                     tos);

                        rthp=&rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.fl4_tos != tos ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
                        "tos %02x\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable*)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash_code(rt->fl.fl4_dst,
                                                     rt->fl.fl4_src ^
                                                        (rt->fl.oif << 5),
                                                     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
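
/*
 * Backoff arithmetic (illustration): with the defaults above,
 * ip_rt_redirect_load == HZ/50 (20ms) and ip_rt_redirect_number == 9,
 * successive redirects are spaced 20ms, 40ms, ... ~10s apart; after the
 * ninth, nothing is sent until ip_rt_redirect_silence (~20s) of quiet
 * time resets rate_tokens in ip_rt_send_redirect() below.
 */
#if 0	/* illustration only */
	if (time_after(jiffies, rt->u.dst.rate_last +
		       (ip_rt_redirect_load << rt->u.dst.rate_tokens)))
		/* ok to transmit the next ICMP redirect */;
#endif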

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
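
/*
 * Worked example (illustration): guess_mtu(1500) == 1492 and
 * guess_mtu(296) == 216 -- the next plateau strictly below the old MTU --
 * with 68, the IPv4 minimum, as the floor when no plateau fits.
 */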

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
        int i;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        u32  skeys[2] = { iph->saddr, 0, };
        u32  daddr = iph->daddr;
        u8   tos = iph->tos & IPTOS_RT_MASK;
        unsigned short est_mtu = 0;

        if (ipv4_config.no_pmtu_disc)
                return 0;

        for (i = 0; i < 2; i++) {
                unsigned hash = rt_hash_code(daddr, skeys[i], tos);

                rcu_read_lock();
                for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
                     rth = rcu_dereference(rth->u.rt_next)) {
                        if (rth->fl.fl4_dst == daddr &&
                            rth->fl.fl4_src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
                            rth->fl.fl4_tos == tos &&
                            rth->fl.iif == 0 &&
                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
                                unsigned short mtu = new_mtu;

                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
                                        if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
                                                dst_confirm(&rth->u.dst);
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                        ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                }
                rcu_read_unlock();
        }
        return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
        if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
            !(dst_metric_locked(dst, RTAX_MTU))) {
                if (mtu < ip_rt_min_pmtu) {
                        mtu = ip_rt_min_pmtu;
                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
                }
                dst->metrics[RTAX_MTU-1] = mtu;
                dst_set_expires(dst, ip_rt_mtu_expires);
        }
}
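
/*
 * Worked example (illustration): a bogus "frag needed, MTU 200" report is
 * clamped to ip_rt_min_pmtu (552 by default) and the RTAX_MTU lock bit is
 * set so later reports cannot shrink it further; the learned PMTU then
 * ages out after ip_rt_mtu_expires (10 minutes).
 */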

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;
        struct inet_peer *peer = rt->peer;
        struct in_device *idev = rt->idev;

        if (peer) {
                rt->peer = NULL;
                inet_putpeer(peer);
        }

        if (idev) {
                rt->idev = NULL;
                in_dev_put(idev);
        }
}

static void ipv4_dst_ifdown(struct dst_entry *dst, int how)
{
        struct rtable *rt = (struct rtable *) dst;
        struct in_device *idev = rt->idev;
        if (idev && idev->dev != &loopback_dev) {
                struct in_device *loopback_idev = in_dev_get(&loopback_dev);
                if (loopback_idev) {
                        rt->idev = loopback_idev;
                        in_dev_put(idev);
                }
        }
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = (struct rtable *) skb->dst;
        if (rt)
                dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;

        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
                NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
                skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1389
1390 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1391 {
1392         u32 src;
1393         struct fib_result res;
1394
1395         if (rt->fl.iif == 0)
1396                 src = rt->rt_src;
1397         else if (fib_lookup(&rt->fl, &res) == 0) {
1398                 src = FIB_RES_PREFSRC(res);
1399                 fib_res_put(&res);
1400         } else
1401                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1402                                         RT_SCOPE_UNIVERSE);
1403         memcpy(addr, &src, 4);
1404 }
1405
1406 #ifdef CONFIG_NET_CLS_ROUTE
1407 static void set_class_tag(struct rtable *rt, u32 tag)
1408 {
1409         if (!(rt->u.dst.tclassid & 0xFFFF))
1410                 rt->u.dst.tclassid |= tag & 0xFFFF;
1411         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1412                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1413 }
1414 #endif
1415
1416 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1417 {
1418         struct fib_info *fi = res->fi;
1419
1420         if (fi) {
1421                 if (FIB_RES_GW(*res) &&
1422                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1423                         rt->rt_gateway = FIB_RES_GW(*res);
1424                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1425                        sizeof(rt->u.dst.metrics));
1426                 if (fi->fib_mtu == 0) {
1427                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1428                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1429                             rt->rt_gateway != rt->rt_dst &&
1430                             rt->u.dst.dev->mtu > 576)
1431                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1432                 }
1433 #ifdef CONFIG_NET_CLS_ROUTE
1434                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1435 #endif
1436         } else
1437                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1438
1439         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1440                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1441         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1442                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1443         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1444                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1445                                        ip_rt_min_advmss);
1446         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1447                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1448
1449 #ifdef CONFIG_NET_CLS_ROUTE
1450 #ifdef CONFIG_IP_MULTIPLE_TABLES
1451         set_class_tag(rt, fib_rules_tclass(res));
1452 #endif
1453         set_class_tag(rt, itag);
1454 #endif
1455         rt->rt_type = res->type;
1456 }
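
/*
 * Worked example of the metric defaults above, assuming a plain Ethernet
 * device (mtu 1500) and a route with no metrics configured:
 *
 *	RTAX_MTU      -> dev->mtu = 1500
 *	RTAX_ADVMSS   -> max(1500 - 40, ip_rt_min_advmss) = 1460
 *	                 (40 = 20 bytes IP header + 20 bytes TCP header)
 *	RTAX_HOPLIMIT -> sysctl_ip_default_ttl
 *
 * The 576-byte clamp only applies to gatewayed routes whose MTU metric
 * is locked, echoing the classic RFC 791 minimum-reassembly figure.
 */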
1457
1458 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1459                                 u8 tos, struct net_device *dev, int our)
1460 {
1461         unsigned hash;
1462         struct rtable *rth;
1463         u32 spec_dst;
1464         struct in_device *in_dev = in_dev_get(dev);
1465         u32 itag = 0;
1466
1467         /* Primary sanity checks. */
1468
1469         if (in_dev == NULL)
1470                 return -EINVAL;
1471
1472         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1473             skb->protocol != htons(ETH_P_IP))
1474                 goto e_inval;
1475
1476         if (ZERONET(saddr)) {
1477                 if (!LOCAL_MCAST(daddr))
1478                         goto e_inval;
1479                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1480         } else if (fib_validate_source(saddr, 0, tos, 0,
1481                                         dev, &spec_dst, &itag) < 0)
1482                 goto e_inval;
1483
1484         rth = dst_alloc(&ipv4_dst_ops);
1485         if (!rth)
1486                 goto e_nobufs;
1487
1488         rth->u.dst.output= ip_rt_bug;
1489
1490         atomic_set(&rth->u.dst.__refcnt, 1);
1491         rth->u.dst.flags= DST_HOST;
1492         if (in_dev->cnf.no_policy)
1493                 rth->u.dst.flags |= DST_NOPOLICY;
1494         rth->fl.fl4_dst = daddr;
1495         rth->rt_dst     = daddr;
1496         rth->fl.fl4_tos = tos;
1497 #ifdef CONFIG_IP_ROUTE_FWMARK
1498         rth->fl.fl4_fwmark= skb->nfmark;
1499 #endif
1500         rth->fl.fl4_src = saddr;
1501         rth->rt_src     = saddr;
1502 #ifdef CONFIG_NET_CLS_ROUTE
1503         rth->u.dst.tclassid = itag;
1504 #endif
1505         rth->rt_iif     =
1506         rth->fl.iif     = dev->ifindex;
1507         rth->u.dst.dev  = &loopback_dev;
1508         dev_hold(rth->u.dst.dev);
1509         rth->idev       = in_dev_get(rth->u.dst.dev);
1510         rth->fl.oif     = 0;
1511         rth->rt_gateway = daddr;
1512         rth->rt_spec_dst= spec_dst;
1513         rth->rt_type    = RTN_MULTICAST;
1514         rth->rt_flags   = RTCF_MULTICAST;
1515         if (our) {
1516                 rth->u.dst.input= ip_local_deliver;
1517                 rth->rt_flags |= RTCF_LOCAL;
1518         }
1519
1520 #ifdef CONFIG_IP_MROUTE
1521         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1522                 rth->u.dst.input = ip_mr_input;
1523 #endif
1524         RT_CACHE_STAT_INC(in_slow_mc);
1525
1526         in_dev_put(in_dev);
1527         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1528         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1529
1530 e_nobufs:
1531         in_dev_put(in_dev);
1532         return -ENOBUFS;
1533
1534 e_inval:
1535         in_dev_put(in_dev);
1536         return -EINVAL;
1537 }
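
/*
 * A note on the hash key used above (and by ip_route_input() below):
 * folding the ifindex into the source half as "saddr ^ (ifindex << 5)"
 * costs nothing extra per lookup yet still separates entries for the
 * same (daddr, saddr, tos) arriving on different devices, e.g.:
 *
 *	saddr = 0x0a000001, ifindex = 2  ->  key half = 0x0a000001 ^ 0x40
 *	saddr = 0x0a000001, ifindex = 3  ->  key half = 0x0a000001 ^ 0x60
 */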
1538
1539 /*
1540  *      NOTE. We drop all packets that have local source
1541  *      addresses, because every properly looped-back packet must
1542  *      already have the correct destination attached by the output routine.
1543  *
1544  *      This approach solves two big problems:
1545  *      1. Non-simplex devices are handled properly.
1546  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1547  */
1548
1549 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1550                         u8 tos, struct net_device *dev)
1551 {
1552         struct fib_result res;
1553         struct in_device *in_dev = in_dev_get(dev);
1554         struct in_device *out_dev = NULL;
1555         struct flowi fl = { .nl_u = { .ip4_u =
1556                                       { .daddr = daddr,
1557                                         .saddr = saddr,
1558                                         .tos = tos,
1559                                         .scope = RT_SCOPE_UNIVERSE,
1560 #ifdef CONFIG_IP_ROUTE_FWMARK
1561                                         .fwmark = skb->nfmark
1562 #endif
1563                                       } },
1564                             .iif = dev->ifindex };
1565         unsigned        flags = 0;
1566         u32             itag = 0;
1567         struct rtable * rth;
1568         unsigned        hash;
1569         u32             spec_dst;
1570         int             err = -EINVAL;
1571         int             free_res = 0;
1572
1573         /* IP on this device is disabled. */
1574
1575         if (!in_dev)
1576                 goto out;
1577
1578         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
1579
1580         /* Check for the most weird martians, which cannot be detected
1581            by fib_lookup.
1582          */
1583
1584         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1585                 goto martian_source;
1586
1587         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1588                 goto brd_input;
1589
1590         /* Accept a zero source address only for limited broadcast;
1591          * I do not even know whether to fix this or not. Waiting for complaints :-)
1592          */
1593         if (ZERONET(saddr))
1594                 goto martian_source;
1595
1596         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1597                 goto martian_destination;
1598
1599         /*
1600          *      Now we are ready to route the packet.
1601          */
1602         if ((err = fib_lookup(&fl, &res)) != 0) {
1603                 if (!IN_DEV_FORWARD(in_dev))
1604                         goto e_inval;
1605                 goto no_route;
1606         }
1607         free_res = 1;
1608
1609         RT_CACHE_STAT_INC(in_slow_tot);
1610
1611         if (res.type == RTN_BROADCAST)
1612                 goto brd_input;
1613
1614         if (res.type == RTN_LOCAL) {
1615                 int result;
1616                 result = fib_validate_source(saddr, daddr, tos,
1617                                              loopback_dev.ifindex,
1618                                              dev, &spec_dst, &itag);
1619                 if (result < 0)
1620                         goto martian_source;
1621                 if (result)
1622                         flags |= RTCF_DIRECTSRC;
1623                 spec_dst = daddr;
1624                 goto local_input;
1625         }
1626
1627         if (!IN_DEV_FORWARD(in_dev))
1628                 goto e_inval;
1629         if (res.type != RTN_UNICAST)
1630                 goto martian_destination;
1631
1632 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1633         if (res.fi->fib_nhs > 1 && fl.oif == 0)
1634                 fib_select_multipath(&fl, &res);
1635 #endif
1636         out_dev = in_dev_get(FIB_RES_DEV(res));
1637         if (out_dev == NULL) {
1638                 if (net_ratelimit())
1639                         printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1640                                          "Please, report\n");
1641                 goto e_inval;
1642         }
1643
1644         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1645                                   &spec_dst, &itag);
1646         if (err < 0)
1647                 goto martian_source;
1648
1649         if (err)
1650                 flags |= RTCF_DIRECTSRC;
1651
1652         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1653             (IN_DEV_SHARED_MEDIA(out_dev) ||
1654              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1655                 flags |= RTCF_DOREDIRECT;
1656
1657         if (skb->protocol != htons(ETH_P_IP)) {
1658                 /* Not IP (i.e. ARP). Do not create a route if it is
1659                  * invalid for proxy arp. DNAT routes are always valid.
1660                  */
1661                 if (out_dev == in_dev && !(flags & RTCF_DNAT))
1662                         goto e_inval;
1663         }
1664
1665         rth = dst_alloc(&ipv4_dst_ops);
1666         if (!rth)
1667                 goto e_nobufs;
1668
1669         atomic_set(&rth->u.dst.__refcnt, 1);
1670         rth->u.dst.flags= DST_HOST;
1671         if (in_dev->cnf.no_policy)
1672                 rth->u.dst.flags |= DST_NOPOLICY;
1673         if (in_dev->cnf.no_xfrm)
1674                 rth->u.dst.flags |= DST_NOXFRM;
1675         rth->fl.fl4_dst = daddr;
1676         rth->rt_dst     = daddr;
1677         rth->fl.fl4_tos = tos;
1678 #ifdef CONFIG_IP_ROUTE_FWMARK
1679         rth->fl.fl4_fwmark= skb->nfmark;
1680 #endif
1681         rth->fl.fl4_src = saddr;
1682         rth->rt_src     = saddr;
1683         rth->rt_gateway = daddr;
1684         rth->rt_iif     =
1685         rth->fl.iif     = dev->ifindex;
1686         rth->u.dst.dev  = out_dev->dev;
1687         dev_hold(rth->u.dst.dev);
1688         rth->idev       = in_dev_get(rth->u.dst.dev);
1689         rth->fl.oif     = 0;
1690         rth->rt_spec_dst= spec_dst;
1691
1692         rth->u.dst.input = ip_forward;
1693         rth->u.dst.output = ip_output;
1694
1695         rt_set_nexthop(rth, &res, itag);
1696
1697         rth->rt_flags = flags;
1698
1699 intern:
1700         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1701 done:
1702         in_dev_put(in_dev);
1703         if (out_dev)
1704                 in_dev_put(out_dev);
1705         if (free_res)
1706                 fib_res_put(&res);
1707 out:    return err;
1708
1709 brd_input:
1710         if (skb->protocol != htons(ETH_P_IP))
1711                 goto e_inval;
1712
1713         if (ZERONET(saddr))
1714                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1715         else {
1716                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1717                                           &itag);
1718                 if (err < 0)
1719                         goto martian_source;
1720                 if (err)
1721                         flags |= RTCF_DIRECTSRC;
1722         }
1723         flags |= RTCF_BROADCAST;
1724         res.type = RTN_BROADCAST;
1725         RT_CACHE_STAT_INC(in_brd);
1726
1727 local_input:
1728         rth = dst_alloc(&ipv4_dst_ops);
1729         if (!rth)
1730                 goto e_nobufs;
1731
1732         rth->u.dst.output= ip_rt_bug;
1733
1734         atomic_set(&rth->u.dst.__refcnt, 1);
1735         rth->u.dst.flags= DST_HOST;
1736         if (in_dev->cnf.no_policy)
1737                 rth->u.dst.flags |= DST_NOPOLICY;
1738         rth->fl.fl4_dst = daddr;
1739         rth->rt_dst     = daddr;
1740         rth->fl.fl4_tos = tos;
1741 #ifdef CONFIG_IP_ROUTE_FWMARK
1742         rth->fl.fl4_fwmark= skb->nfmark;
1743 #endif
1744         rth->fl.fl4_src = saddr;
1745         rth->rt_src     = saddr;
1746 #ifdef CONFIG_NET_CLS_ROUTE
1747         rth->u.dst.tclassid = itag;
1748 #endif
1749         rth->rt_iif     =
1750         rth->fl.iif     = dev->ifindex;
1751         rth->u.dst.dev  = &loopback_dev;
1752         dev_hold(rth->u.dst.dev);
1753         rth->idev       = in_dev_get(rth->u.dst.dev);
1754         rth->rt_gateway = daddr;
1755         rth->rt_spec_dst= spec_dst;
1756         rth->u.dst.input= ip_local_deliver;
1757         rth->rt_flags   = flags|RTCF_LOCAL;
1758         if (res.type == RTN_UNREACHABLE) {
1759                 rth->u.dst.input= ip_error;
1760                 rth->u.dst.error= -err;
1761                 rth->rt_flags   &= ~RTCF_LOCAL;
1762         }
1763         rth->rt_type    = res.type;
1764         goto intern;
1765
1766 no_route:
1767         RT_CACHE_STAT_INC(in_no_route);
1768         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1769         res.type = RTN_UNREACHABLE;
1770         goto local_input;
1771
1772         /*
1773          *      Do not cache martian addresses: they should be logged (RFC1812)
1774          */
1775 martian_destination:
1776         RT_CACHE_STAT_INC(in_martian_dst);
1777 #ifdef CONFIG_IP_ROUTE_VERBOSE
1778         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1779                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1780                         "%u.%u.%u.%u, dev %s\n",
1781                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1782 #endif
1783 e_inval:
1784         err = -EINVAL;
1785         goto done;
1786
1787 e_nobufs:
1788         err = -ENOBUFS;
1789         goto done;
1790
1791 martian_source:
1792
1793         RT_CACHE_STAT_INC(in_martian_src);
1794 #ifdef CONFIG_IP_ROUTE_VERBOSE
1795         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1796                 /*
1797                  *      RFC1812 recommendation: if the source is martian,
1798                  *      the only hint we can give is the MAC header.
1799                  */
1800                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1801                         "%u.%u.%u.%u, on dev %s\n",
1802                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1803                 if (dev->hard_header_len) {
1804                         int i;
1805                         unsigned char *p = skb->mac.raw;
1806                         printk(KERN_WARNING "ll header: ");
1807                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1808                                 printk("%02x", *p);
1809                                 if (i < (dev->hard_header_len - 1))
1810                                         printk(":");
1811                         }
1812                         printk("\n");
1813                 }
1814         }
1815 #endif
1816         goto e_inval;
1817 }
1818
1819 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1820                    u8 tos, struct net_device *dev)
1821 {
1822         struct rtable * rth;
1823         unsigned        hash;
1824         int iif = dev->ifindex;
1825
1826         tos &= IPTOS_RT_MASK;
1827         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1828
1829         rcu_read_lock();
1830         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1831              rth = rcu_dereference(rth->u.rt_next)) {
1832                 if (rth->fl.fl4_dst == daddr &&
1833                     rth->fl.fl4_src == saddr &&
1834                     rth->fl.iif == iif &&
1835                     rth->fl.oif == 0 &&
1836 #ifdef CONFIG_IP_ROUTE_FWMARK
1837                     rth->fl.fl4_fwmark == skb->nfmark &&
1838 #endif
1839                     rth->fl.fl4_tos == tos) {
1840                         rth->u.dst.lastuse = jiffies;
1841                         dst_hold(&rth->u.dst);
1842                         rth->u.dst.__use++;
1843                         RT_CACHE_STAT_INC(in_hit);
1844                         rcu_read_unlock();
1845                         skb->dst = (struct dst_entry*)rth;
1846                         return 0;
1847                 }
1848                 RT_CACHE_STAT_INC(in_hlist_search);
1849         }
1850         rcu_read_unlock();
1851
1852         /* Multicast recognition logic was moved from the route cache
1853            to here. The problem was that too many Ethernet cards have
1854            broken/missing hardware multicast filters :-( As a result, a
1855            host on a multicast network acquires lots of useless route
1856            cache entries, e.g. from SDR messages from all over the
1857            world. Now we try to get rid of them. Provided the software
1858            IP multicast filter is organized reasonably (at least,
1859            hashed), this does not cause a slowdown compared with route
1860            cache reject entries. Note that multicast routers are not
1861            affected, because a route cache entry is created eventually.
1862          */
1863         if (MULTICAST(daddr)) {
1864                 struct in_device *in_dev;
1865
1866                 rcu_read_lock();
1867                 if ((in_dev = __in_dev_get(dev)) != NULL) {
1868                         int our = ip_check_mc(in_dev, daddr, saddr,
1869                                 skb->nh.iph->protocol);
1870                         if (our
1871 #ifdef CONFIG_IP_MROUTE
1872                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1873 #endif
1874                             ) {
1875                                 rcu_read_unlock();
1876                                 return ip_route_input_mc(skb, daddr, saddr,
1877                                                          tos, dev, our);
1878                         }
1879                 }
1880                 rcu_read_unlock();
1881                 return -EINVAL;
1882         }
1883         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1884 }
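
/*
 * Caller-side sketch (a minimal illustration; compare ip_rcv_finish() in
 * net/ipv4/ip_input.c, the main user of this entry point):
 *
 *	struct iphdr *iph = skb->nh.iph;
 *
 *	if (skb->dst == NULL &&
 *	    ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
 *		goto drop;		// martian, no route, ...
 *	return dst_input(skb);		// forward or deliver locally
 */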
1885
1886 /*
1887  * Major route resolver routine.
1888  */
1889
1890 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
1891 {
1892         u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
1893         struct flowi fl = { .nl_u = { .ip4_u =
1894                                       { .daddr = oldflp->fl4_dst,
1895                                         .saddr = oldflp->fl4_src,
1896                                         .tos = tos & IPTOS_RT_MASK,
1897                                         .scope = ((tos & RTO_ONLINK) ?
1898                                                   RT_SCOPE_LINK :
1899                                                   RT_SCOPE_UNIVERSE),
1900 #ifdef CONFIG_IP_ROUTE_FWMARK
1901                                         .fwmark = oldflp->fl4_fwmark
1902 #endif
1903                                       } },
1904                             .iif = loopback_dev.ifindex,
1905                             .oif = oldflp->oif };
1906         struct fib_result res;
1907         unsigned flags = 0;
1908         struct rtable *rth;
1909         struct net_device *dev_out = NULL;
1910         struct in_device *in_dev = NULL;
1911         unsigned hash;
1912         int free_res = 0;
1913         int err;
1914
1915         res.fi          = NULL;
1916 #ifdef CONFIG_IP_MULTIPLE_TABLES
1917         res.r           = NULL;
1918 #endif
1919
1920         if (oldflp->fl4_src) {
1921                 err = -EINVAL;
1922                 if (MULTICAST(oldflp->fl4_src) ||
1923                     BADCLASS(oldflp->fl4_src) ||
1924                     ZERONET(oldflp->fl4_src))
1925                         goto out;
1926
1927                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1928                 dev_out = ip_dev_find(oldflp->fl4_src);
1929                 if (dev_out == NULL)
1930                         goto out;
1931
1932         /* I removed the check for oif == dev_out->oif here.
1933            It was wrong for two reasons:
1934            1. ip_dev_find(saddr) can return the wrong iface if saddr
1935               is assigned to multiple interfaces.
1936            2. Moreover, we are allowed to send packets with the saddr
1937               of another iface. --ANK
1938          */
1939
1940                 if (oldflp->oif == 0 &&
1941                     (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
1942                         /* Special hack: the user can direct multicasts
1943                            and limited broadcast via the necessary interface
1944                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1945                            This hack is not just for fun, it allows
1946                            vic, vat and friends to work.
1947                            They bind a socket to loopback, set the ttl to zero
1948                            and expect that this will work.
1949                            From the viewpoint of the routing cache they are
1950                            broken, because we are not allowed to build a
1951                            multicast path with a loopback source addr (the
1952                            routing cache cannot know that the ttl is zero, so
1953                            the packet will never leave this host and the route
1954                            looks valid). Luckily, this hack is a good workaround.
1955                          */
1956
1957                         fl.oif = dev_out->ifindex;
1958                         goto make_route;
1959                 }
1960                 if (dev_out)
1961                         dev_put(dev_out);
1962                 dev_out = NULL;
1963         }
1964         if (oldflp->oif) {
1965                 dev_out = dev_get_by_index(oldflp->oif);
1966                 err = -ENODEV;
1967                 if (dev_out == NULL)
1968                         goto out;
1969                 if (__in_dev_get(dev_out) == NULL) {
1970                         dev_put(dev_out);
1971                         goto out;       /* Wrong error code */
1972                 }
1973
1974                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
1975                         if (!fl.fl4_src)
1976                                 fl.fl4_src = inet_select_addr(dev_out, 0,
1977                                                               RT_SCOPE_LINK);
1978                         goto make_route;
1979                 }
1980                 if (!fl.fl4_src) {
1981                         if (MULTICAST(oldflp->fl4_dst))
1982                                 fl.fl4_src = inet_select_addr(dev_out, 0,
1983                                                               fl.fl4_scope);
1984                         else if (!oldflp->fl4_dst)
1985                                 fl.fl4_src = inet_select_addr(dev_out, 0,
1986                                                               RT_SCOPE_HOST);
1987                 }
1988         }
1989
1990         if (!fl.fl4_dst) {
1991                 fl.fl4_dst = fl.fl4_src;
1992                 if (!fl.fl4_dst)
1993                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
1994                 if (dev_out)
1995                         dev_put(dev_out);
1996                 dev_out = &loopback_dev;
1997                 dev_hold(dev_out);
1998                 fl.oif = loopback_dev.ifindex;
1999                 res.type = RTN_LOCAL;
2000                 flags |= RTCF_LOCAL;
2001                 goto make_route;
2002         }
2003
2004         if (fib_lookup(&fl, &res)) {
2005                 res.fi = NULL;
2006                 if (oldflp->oif) {
2007                         /* Apparently, the routing tables are wrong. Assume
2008                            that the destination is on-link.
2009
2010                            WHY? DW.
2011                            Because we are allowed to send to an iface
2012                            even if it has NO routes and NO assigned
2013                            addresses. When oif is specified, the routing
2014                            tables are looked up with only one purpose:
2015                            to check whether the destination is gatewayed
2016                            rather than direct. Moreover, if MSG_DONTROUTE
2017                            is set, we send the packet ignoring both the
2018                            routing tables and the ifaddr state. --ANK
2019
2020
2021                            We could do this even if oif is unknown,
2022                            as IPv6 likely does, but we do not.
2023                          */
2024
2025                         if (fl.fl4_src == 0)
2026                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2027                                                               RT_SCOPE_LINK);
2028                         res.type = RTN_UNICAST;
2029                         goto make_route;
2030                 }
2031                 if (dev_out)
2032                         dev_put(dev_out);
2033                 err = -ENETUNREACH;
2034                 goto out;
2035         }
2036         free_res = 1;
2037
2038         if (res.type == RTN_LOCAL) {
2039                 if (!fl.fl4_src)
2040                         fl.fl4_src = fl.fl4_dst;
2041                 if (dev_out)
2042                         dev_put(dev_out);
2043                 dev_out = &loopback_dev;
2044                 dev_hold(dev_out);
2045                 fl.oif = dev_out->ifindex;
2046                 if (res.fi)
2047                         fib_info_put(res.fi);
2048                 res.fi = NULL;
2049                 flags |= RTCF_LOCAL;
2050                 goto make_route;
2051         }
2052
2053 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2054         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2055                 fib_select_multipath(&fl, &res);
2056         else
2057 #endif
2058         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2059                 fib_select_default(&fl, &res);
2060
2061         if (!fl.fl4_src)
2062                 fl.fl4_src = FIB_RES_PREFSRC(res);
2063
2064         if (dev_out)
2065                 dev_put(dev_out);
2066         dev_out = FIB_RES_DEV(res);
2067         dev_hold(dev_out);
2068         fl.oif = dev_out->ifindex;
2069
2070 make_route:
2071         if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2072                 goto e_inval;
2073
2074         if (fl.fl4_dst == 0xFFFFFFFF)
2075                 res.type = RTN_BROADCAST;
2076         else if (MULTICAST(fl.fl4_dst))
2077                 res.type = RTN_MULTICAST;
2078         else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
2079                 goto e_inval;
2080
2081         if (dev_out->flags & IFF_LOOPBACK)
2082                 flags |= RTCF_LOCAL;
2083
2084         in_dev = in_dev_get(dev_out);
2085         if (!in_dev)
2086                 goto e_inval;
2087
2088         if (res.type == RTN_BROADCAST) {
2089                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2090                 if (res.fi) {
2091                         fib_info_put(res.fi);
2092                         res.fi = NULL;
2093                 }
2094         } else if (res.type == RTN_MULTICAST) {
2095                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2096                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
2097                         flags &= ~RTCF_LOCAL;
2098                 /* If a multicast route does not exist, use the
2099                    default one, but do not gateway in this case.
2100                    Yes, it is a hack.
2101                  */
2102                 if (res.fi && res.prefixlen < 4) {
2103                         fib_info_put(res.fi);
2104                         res.fi = NULL;
2105                 }
2106         }
2107
2108         rth = dst_alloc(&ipv4_dst_ops);
2109         if (!rth)
2110                 goto e_nobufs;
2111
2112         atomic_set(&rth->u.dst.__refcnt, 1);
2113         rth->u.dst.flags= DST_HOST;
2114         if (in_dev->cnf.no_xfrm)
2115                 rth->u.dst.flags |= DST_NOXFRM;
2116         if (in_dev->cnf.no_policy)
2117                 rth->u.dst.flags |= DST_NOPOLICY;
2118         rth->fl.fl4_dst = oldflp->fl4_dst;
2119         rth->fl.fl4_tos = tos;
2120         rth->fl.fl4_src = oldflp->fl4_src;
2121         rth->fl.oif     = oldflp->oif;
2122 #ifdef CONFIG_IP_ROUTE_FWMARK
2123         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2124 #endif
2125         rth->rt_dst     = fl.fl4_dst;
2126         rth->rt_src     = fl.fl4_src;
2127         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2128         rth->u.dst.dev  = dev_out;
2129         dev_hold(dev_out);
2130         rth->idev       = in_dev_get(dev_out);
2131         rth->rt_gateway = fl.fl4_dst;
2132         rth->rt_spec_dst= fl.fl4_src;
2133
2134         rth->u.dst.output=ip_output;
2135
2136         RT_CACHE_STAT_INC(out_slow_tot);
2137
2138         if (flags & RTCF_LOCAL) {
2139                 rth->u.dst.input = ip_local_deliver;
2140                 rth->rt_spec_dst = fl.fl4_dst;
2141         }
2142         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2143                 rth->rt_spec_dst = fl.fl4_src;
2144                 if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2145                         rth->u.dst.output = ip_mc_output;
2146                         RT_CACHE_STAT_INC(out_slow_mc);
2147                 }
2148 #ifdef CONFIG_IP_MROUTE
2149                 if (res.type == RTN_MULTICAST) {
2150                         if (IN_DEV_MFORWARD(in_dev) &&
2151                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2152                                 rth->u.dst.input = ip_mr_input;
2153                                 rth->u.dst.output = ip_mc_output;
2154                         }
2155                 }
2156 #endif
2157         }
2158
2159         rt_set_nexthop(rth, &res, 0);
2160
2161
2162         rth->rt_flags = flags;
2163
2164         hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2165         err = rt_intern_hash(hash, rth, rp);
2166 done:
2167         if (free_res)
2168                 fib_res_put(&res);
2169         if (dev_out)
2170                 dev_put(dev_out);
2171         if (in_dev)
2172                 in_dev_put(in_dev);
2173 out:    return err;
2174
2175 e_inval:
2176         err = -EINVAL;
2177         goto done;
2178 e_nobufs:
2179         err = -ENOBUFS;
2180         goto done;
2181 }
2182
2183 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2184 {
2185         unsigned hash;
2186         struct rtable *rth;
2187
2188         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2189
2190         rcu_read_lock_bh();
2191         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2192                 rth = rcu_dereference(rth->u.rt_next)) {
2193                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2194                     rth->fl.fl4_src == flp->fl4_src &&
2195                     rth->fl.iif == 0 &&
2196                     rth->fl.oif == flp->oif &&
2197 #ifdef CONFIG_IP_ROUTE_FWMARK
2198                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2199 #endif
2200                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2201                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2202                         rth->u.dst.lastuse = jiffies;
2203                         dst_hold(&rth->u.dst);
2204                         rth->u.dst.__use++;
2205                         RT_CACHE_STAT_INC(out_hit);
2206                         rcu_read_unlock_bh();
2207                         *rp = rth;
2208                         return 0;
2209                 }
2210                 RT_CACHE_STAT_INC(out_hlist_search);
2211         }
2212         rcu_read_unlock_bh();
2213
2214         return ip_route_output_slow(rp, flp);
2215 }
2216
2217 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2218 {
2219         int err;
2220
2221         if ((err = __ip_route_output_key(rp, flp)) != 0)
2222                 return err;
2223
2224         if (flp->proto) {
2225                 if (!flp->fl4_src)
2226                         flp->fl4_src = (*rp)->rt_src;
2227                 if (!flp->fl4_dst)
2228                         flp->fl4_dst = (*rp)->rt_dst;
2229                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2230         }
2231
2232         return 0;
2233 }
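
/*
 * Minimal output-path usage sketch (daddr/tos/sk are placeholders; the
 * flowi members are the real ones consumed by the resolver above):
 *
 *	struct rtable *rt;
 *	struct flowi fl = { .oif = 0,
 *			    .nl_u = { .ip4_u =
 *				      { .daddr = daddr,
 *					.saddr = 0,	// let the kernel pick
 *					.tos = RT_TOS(tos) } },
 *			    .proto = IPPROTO_UDP };
 *
 *	if (ip_route_output_flow(&rt, &fl, sk, 0))
 *		goto no_route;
 *	...
 *	ip_rt_put(rt);			// drop the cache reference
 *
 * Setting .proto is what routes the result through xfrm_lookup() above,
 * so IPsec policy is applied transparently for protocol sockets.
 */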
2234
2235 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2236 {
2237         return ip_route_output_flow(rp, flp, NULL, 0);
2238 }
2239
2240 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2241                         int nowait)
2242 {
2243         struct rtable *rt = (struct rtable*)skb->dst;
2244         struct rtmsg *r;
2245         struct nlmsghdr  *nlh;
2246         unsigned char    *b = skb->tail;
2247         struct rta_cacheinfo ci;
2248 #ifdef CONFIG_IP_MROUTE
2249         struct rtattr *eptr;
2250 #endif
2251         nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2252         r = NLMSG_DATA(nlh);
2253         nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2254         r->rtm_family    = AF_INET;
2255         r->rtm_dst_len  = 32;
2256         r->rtm_src_len  = 0;
2257         r->rtm_tos      = rt->fl.fl4_tos;
2258         r->rtm_table    = RT_TABLE_MAIN;
2259         r->rtm_type     = rt->rt_type;
2260         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2261         r->rtm_protocol = RTPROT_UNSPEC;
2262         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2263         if (rt->rt_flags & RTCF_NOTIFY)
2264                 r->rtm_flags |= RTM_F_NOTIFY;
2265         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2266         if (rt->fl.fl4_src) {
2267                 r->rtm_src_len = 32;
2268                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2269         }
2270         if (rt->u.dst.dev)
2271                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2272 #ifdef CONFIG_NET_CLS_ROUTE
2273         if (rt->u.dst.tclassid)
2274                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2275 #endif
2276         if (rt->fl.iif)
2277                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2278         else if (rt->rt_src != rt->fl.fl4_src)
2279                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2280         if (rt->rt_dst != rt->rt_gateway)
2281                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2282         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2283                 goto rtattr_failure;
2284         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2285         ci.rta_used     = rt->u.dst.__use;
2286         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2287         if (rt->u.dst.expires)
2288                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2289         else
2290                 ci.rta_expires = 0;
2291         ci.rta_error    = rt->u.dst.error;
2292         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2293         if (rt->peer) {
2294                 ci.rta_id = rt->peer->ip_id_count;
2295                 if (rt->peer->tcp_ts_stamp) {
2296                         ci.rta_ts = rt->peer->tcp_ts;
2297                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2298                 }
2299         }
2300 #ifdef CONFIG_IP_MROUTE
2301         eptr = (struct rtattr*)skb->tail;
2302 #endif
2303         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2304         if (rt->fl.iif) {
2305 #ifdef CONFIG_IP_MROUTE
2306                 u32 dst = rt->rt_dst;
2307
2308                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2309                     ipv4_devconf.mc_forwarding) {
2310                         int err = ipmr_get_route(skb, r, nowait);
2311                         if (err <= 0) {
2312                                 if (!nowait) {
2313                                         if (err == 0)
2314                                                 return 0;
2315                                         goto nlmsg_failure;
2316                                 } else {
2317                                         if (err == -EMSGSIZE)
2318                                                 goto nlmsg_failure;
2319                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2320                                 }
2321                         }
2322                 } else
2323 #endif
2324                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2325         }
2326
2327         nlh->nlmsg_len = skb->tail - b;
2328         return skb->len;
2329
2330 nlmsg_failure:
2331 rtattr_failure:
2332         skb_trim(skb, b - skb->data);
2333         return -1;
2334 }
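
/*
 * For orientation, the message built above for a typical unicast cache
 * entry carries roughly this attribute set (bracketed attributes depend
 * on the conditionals in the code; "ip route get" decodes exactly this):
 *
 *	RTM_NEWROUTE { RTA_DST, [RTA_SRC], [RTA_OIF], [RTA_FLOW],
 *		       [RTA_PREFSRC], [RTA_GATEWAY], [RTA_METRICS],
 *		       RTA_CACHEINFO, [RTA_IIF] }
 */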
2335
2336 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2337 {
2338         struct rtattr **rta = arg;
2339         struct rtmsg *rtm = NLMSG_DATA(nlh);
2340         struct rtable *rt = NULL;
2341         u32 dst = 0;
2342         u32 src = 0;
2343         int iif = 0;
2344         int err = -ENOBUFS;
2345         struct sk_buff *skb;
2346
2347         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2348         if (!skb)
2349                 goto out;
2350
2351         /* Reserve room for dummy headers; this skb can pass
2352            through a good chunk of the routing engine.
2353          */
2354         skb->mac.raw = skb->data;
2355         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2356
2357         if (rta[RTA_SRC - 1])
2358                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2359         if (rta[RTA_DST - 1])
2360                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2361         if (rta[RTA_IIF - 1])
2362                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2363
2364         if (iif) {
2365                 struct net_device *dev = __dev_get_by_index(iif);
2366                 err = -ENODEV;
2367                 if (!dev)
2368                         goto out_free;
2369                 skb->protocol   = htons(ETH_P_IP);
2370                 skb->dev        = dev;
2371                 local_bh_disable();
2372                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2373                 local_bh_enable();
2374                 rt = (struct rtable*)skb->dst;
2375                 if (!err && rt->u.dst.error)
2376                         err = -rt->u.dst.error;
2377         } else {
2378                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2379                                                          .saddr = src,
2380                                                          .tos = rtm->rtm_tos } } };
2381                 int oif = 0;
2382                 if (rta[RTA_OIF - 1])
2383                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2384                 fl.oif = oif;
2385                 err = ip_route_output_key(&rt, &fl);
2386         }
2387         if (err)
2388                 goto out_free;
2389
2390         skb->dst = &rt->u.dst;
2391         if (rtm->rtm_flags & RTM_F_NOTIFY)
2392                 rt->rt_flags |= RTCF_NOTIFY;
2393
2394         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2395
2396         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2397                                 RTM_NEWROUTE, 0);
2398         if (!err)
2399                 goto out_free;
2400         if (err < 0) {
2401                 err = -EMSGSIZE;
2402                 goto out_free;
2403         }
2404
2405         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2406         if (err > 0)
2407                 err = 0;
2408 out:    return err;
2409
2410 out_free:
2411         kfree_skb(skb);
2412         goto out;
2413 }
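
/*
 * This is the kernel side of RTM_GETROUTE; from userspace it is what,
 * for example,
 *
 *	ip route get 10.0.0.1 from 10.0.0.2 iif eth0
 *
 * exercises (addresses are made up): the RTA_* attributes parsed above
 * come straight from such a request, and the answer is the single
 * RTM_NEWROUTE message built by rt_fill_info().
 */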
2414
2415 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2416 {
2417         struct rtable *rt;
2418         int h, s_h;
2419         int idx, s_idx;
2420
2421         s_h = cb->args[0];
2422         s_idx = idx = cb->args[1];
2423         for (h = 0; h <= rt_hash_mask; h++) {
2424                 if (h < s_h) continue;
2425                 if (h > s_h)
2426                         s_idx = 0;
2427                 rcu_read_lock_bh();
2428                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2429                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2430                         if (idx < s_idx)
2431                                 continue;
2432                         skb->dst = dst_clone(&rt->u.dst);
2433                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2434                                          cb->nlh->nlmsg_seq,
2435                                          RTM_NEWROUTE, 1) <= 0) {
2436                                 dst_release(xchg(&skb->dst, NULL));
2437                                 rcu_read_unlock_bh();
2438                                 goto done;
2439                         }
2440                         dst_release(xchg(&skb->dst, NULL));
2441                 }
2442                 rcu_read_unlock_bh();
2443         }
2444
2445 done:
2446         cb->args[0] = h;
2447         cb->args[1] = idx;
2448         return skb->len;
2449 }
2450
2451 void ip_rt_multicast_event(struct in_device *in_dev)
2452 {
2453         rt_cache_flush(0);
2454 }
2455
2456 #ifdef CONFIG_SYSCTL
2457 static int flush_delay;
2458
2459 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2460                                         struct file *filp, void __user *buffer,
2461                                         size_t *lenp, loff_t *ppos)
2462 {
2463         if (write) {
2464                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2465                 rt_cache_flush(flush_delay);
2466                 return 0;
2467         }
2468
2469         return -EINVAL;
2470 }
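
/*
 * The handler above is deliberately write-only; a flush is requested
 * from userspace with, for example:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * The written value lands in flush_delay via proc_dointvec() and is
 * then handed to rt_cache_flush() as the requested flush delay.
 */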
2471
2472 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2473                                                 int __user *name,
2474                                                 int nlen,
2475                                                 void __user *oldval,
2476                                                 size_t __user *oldlenp,
2477                                                 void __user *newval,
2478                                                 size_t newlen,
2479                                                 void **context)
2480 {
2481         int delay;
2482         if (newlen != sizeof(int))
2483                 return -EINVAL;
2484         if (get_user(delay, (int __user *)newval))
2485                 return -EFAULT;
2486         rt_cache_flush(delay);
2487         return 0;
2488 }
2489
2490 ctl_table ipv4_route_table[] = {
2491         {
2492                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2493                 .procname       = "flush",
2494                 .data           = &flush_delay,
2495                 .maxlen         = sizeof(int),
2496                 .mode           = 0644,
2497                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2498                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2499         },
2500         {
2501                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2502                 .procname       = "min_delay",
2503                 .data           = &ip_rt_min_delay,
2504                 .maxlen         = sizeof(int),
2505                 .mode           = 0644,
2506                 .proc_handler   = &proc_dointvec_jiffies,
2507                 .strategy       = &sysctl_jiffies,
2508         },
2509         {
2510                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2511                 .procname       = "max_delay",
2512                 .data           = &ip_rt_max_delay,
2513                 .maxlen         = sizeof(int),
2514                 .mode           = 0644,
2515                 .proc_handler   = &proc_dointvec_jiffies,
2516                 .strategy       = &sysctl_jiffies,
2517         },
2518         {
2519                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2520                 .procname       = "gc_thresh",
2521                 .data           = &ipv4_dst_ops.gc_thresh,
2522                 .maxlen         = sizeof(int),
2523                 .mode           = 0644,
2524                 .proc_handler   = &proc_dointvec,
2525         },
2526         {
2527                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2528                 .procname       = "max_size",
2529                 .data           = &ip_rt_max_size,
2530                 .maxlen         = sizeof(int),
2531                 .mode           = 0644,
2532                 .proc_handler   = &proc_dointvec,
2533         },
2534         {
2535                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2536                 .procname       = "gc_min_interval",
2537                 .data           = &ip_rt_gc_min_interval,
2538                 .maxlen         = sizeof(int),
2539                 .mode           = 0644,
2540                 .proc_handler   = &proc_dointvec_jiffies,
2541                 .strategy       = &sysctl_jiffies,
2542         },
2543         {
2544                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2545                 .procname       = "gc_timeout",
2546                 .data           = &ip_rt_gc_timeout,
2547                 .maxlen         = sizeof(int),
2548                 .mode           = 0644,
2549                 .proc_handler   = &proc_dointvec_jiffies,
2550                 .strategy       = &sysctl_jiffies,
2551         },
2552         {
2553                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2554                 .procname       = "gc_interval",
2555                 .data           = &ip_rt_gc_interval,
2556                 .maxlen         = sizeof(int),
2557                 .mode           = 0644,
2558                 .proc_handler   = &proc_dointvec_jiffies,
2559                 .strategy       = &sysctl_jiffies,
2560         },
2561         {
2562                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2563                 .procname       = "redirect_load",
2564                 .data           = &ip_rt_redirect_load,
2565                 .maxlen         = sizeof(int),
2566                 .mode           = 0644,
2567                 .proc_handler   = &proc_dointvec,
2568         },
2569         {
2570                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2571                 .procname       = "redirect_number",
2572                 .data           = &ip_rt_redirect_number,
2573                 .maxlen         = sizeof(int),
2574                 .mode           = 0644,
2575                 .proc_handler   = &proc_dointvec,
2576         },
2577         {
2578                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2579                 .procname       = "redirect_silence",
2580                 .data           = &ip_rt_redirect_silence,
2581                 .maxlen         = sizeof(int),
2582                 .mode           = 0644,
2583                 .proc_handler   = &proc_dointvec,
2584         },
2585         {
2586                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2587                 .procname       = "error_cost",
2588                 .data           = &ip_rt_error_cost,
2589                 .maxlen         = sizeof(int),
2590                 .mode           = 0644,
2591                 .proc_handler   = &proc_dointvec,
2592         },
2593         {
2594                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2595                 .procname       = "error_burst",
2596                 .data           = &ip_rt_error_burst,
2597                 .maxlen         = sizeof(int),
2598                 .mode           = 0644,
2599                 .proc_handler   = &proc_dointvec,
2600         },
2601         {
2602                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2603                 .procname       = "gc_elasticity",
2604                 .data           = &ip_rt_gc_elasticity,
2605                 .maxlen         = sizeof(int),
2606                 .mode           = 0644,
2607                 .proc_handler   = &proc_dointvec,
2608         },
2609         {
2610                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2611                 .procname       = "mtu_expires",
2612                 .data           = &ip_rt_mtu_expires,
2613                 .maxlen         = sizeof(int),
2614                 .mode           = 0644,
2615                 .proc_handler   = &proc_dointvec_jiffies,
2616                 .strategy       = &sysctl_jiffies,
2617         },
2618         {
2619                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2620                 .procname       = "min_pmtu",
2621                 .data           = &ip_rt_min_pmtu,
2622                 .maxlen         = sizeof(int),
2623                 .mode           = 0644,
2624                 .proc_handler   = &proc_dointvec,
2625         },
2626         {
2627                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2628                 .procname       = "min_adv_mss",
2629                 .data           = &ip_rt_min_advmss,
2630                 .maxlen         = sizeof(int),
2631                 .mode           = 0644,
2632                 .proc_handler   = &proc_dointvec,
2633         },
2634         {
2635                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2636                 .procname       = "secret_interval",
2637                 .data           = &ip_rt_secret_interval,
2638                 .maxlen         = sizeof(int),
2639                 .mode           = 0644,
2640                 .proc_handler   = &proc_dointvec_jiffies,
2641                 .strategy       = &sysctl_jiffies,
2642         },
2643         { .ctl_name = 0 }
2644 };
2645 #endif
2646
2647 #ifdef CONFIG_NET_CLS_ROUTE
2648 struct ip_rt_acct *ip_rt_acct;
2649
2650 /* This code sucks.  But you should have seen it before! --RR */
2651
2652 /* IP route accounting ptr for this logical cpu number. */
2653 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
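
/*
 * Layout sketch: ip_rt_acct is one flat allocation holding NR_CPUS
 * blocks of 256 struct ip_rt_acct counters; assuming the realm is
 * carried in the low 8 bits of the class tag, cpu i's counters for
 * realm r live at
 *
 *	IP_RT_ACCT_CPU(i)[r]
 *
 * and the /proc reader below just sums the per-cpu blocks element-wise.
 */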
2654
2655 #ifdef CONFIG_PROC_FS
2656 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2657                            int length, int *eof, void *data)
2658 {
2659         unsigned int i;
2660
2661         if ((offset & 3) || (length & 3))
2662                 return -EIO;
2663
2664         if (offset >= sizeof(struct ip_rt_acct) * 256) {
2665                 *eof = 1;
2666                 return 0;
2667         }
2668
2669         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2670                 length = sizeof(struct ip_rt_acct) * 256 - offset;
2671                 *eof = 1;
2672         }
2673
2674         offset /= sizeof(u32);
2675
2676         if (length > 0) {
2677                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2678                 u32 *dst = (u32 *) buffer;
2679
2680                 /* Copy first cpu. */
2681                 *start = buffer;
2682                 memcpy(dst, src, length);
2683
2684                 /* Add the other cpus in, one int at a time */
2685                 for_each_cpu(i) {
2686                         unsigned int j;
2687
2688                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2689
2690                         for (j = 0; j < length/4; j++)
2691                                 dst[j] += src[j];
2692                 }
2693         }
2694         return length;
2695 }
2696 #endif /* CONFIG_PROC_FS */
2697 #endif /* CONFIG_NET_CLS_ROUTE */
2698
2699 static __initdata unsigned long rhash_entries;
2700 static int __init set_rhash_entries(char *str)
2701 {
2702         if (!str)
2703                 return 0;
2704         rhash_entries = simple_strtoul(str, &str, 0);
2705         return 1;
2706 }
2707 __setup("rhash_entries=", set_rhash_entries);
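
/*
 * Example: booting with "rhash_entries=32768" on the kernel command line
 * overrides the memory-based sizing in ip_rt_init() below, asking for a
 * 32768-bucket hash table (the request is converted to whole pages and
 * then rounded down to a power of two, so odd values are only hints).
 */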
2708
2709 int __init ip_rt_init(void)
2710 {
2711         int i, order, goal, rc = 0;
2712
2713         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2714                              (jiffies ^ (jiffies >> 7)));
2715
2716 #ifdef CONFIG_NET_CLS_ROUTE
2717         for (order = 0;
2718              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2719                 /* NOTHING */;
2720         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2721         if (!ip_rt_acct)
2722                 panic("IP: failed to allocate ip_rt_acct\n");
2723         memset(ip_rt_acct, 0, PAGE_SIZE << order);
2724 #endif
2725
2726         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2727                                                      sizeof(struct rtable),
2728                                                      0, SLAB_HWCACHE_ALIGN,
2729                                                      NULL, NULL);
2730
2731         if (!ipv4_dst_ops.kmem_cachep)
2732                 panic("IP: failed to allocate ip_dst_cache\n");
2733
2734         goal = num_physpages >> (26 - PAGE_SHIFT);
2735         if (rhash_entries)
2736                 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
2737         for (order = 0; (1UL << order) < goal; order++)
2738                 /* NOTHING */;
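
        /*
         * Worked sizing example (illustrative; assumes 4 KB pages and an
         * 8-byte struct rt_hash_bucket, i.e. one chain pointer plus a
         * non-debug spinlock):
         *
         *	512 MB RAM  ->  num_physpages = 131072
         *	goal  = 131072 >> (26 - 12) = 8 pages of buckets
         *	order = 3  ->  32 KB  ->  4096 chain heads
         *
         * i.e. roughly one page of hash buckets per 64 MB of memory; the
         * do/while below rounds the bucket count down to a power of two
         * and retries with smaller orders if the allocation fails.
         */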
2739
2740         do {
2741                 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2742                         sizeof(struct rt_hash_bucket);
2743                 while (rt_hash_mask & (rt_hash_mask - 1))
2744                         rt_hash_mask--;
2745                 rt_hash_table = (struct rt_hash_bucket *)
2746                         __get_free_pages(GFP_ATOMIC, order);
2747         } while (rt_hash_table == NULL && --order > 0);
2748
2749         if (!rt_hash_table)
2750                 panic("Failed to allocate IP route cache hash table\n");
2751
2752         printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2753                rt_hash_mask,
2754                (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2755
2756         for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2757                 /* NOTHING */;
2758
2759         rt_hash_mask--;
2760         for (i = 0; i <= rt_hash_mask; i++) {
2761                 rt_hash_table[i].lock = SPIN_LOCK_UNLOCKED;
2762                 rt_hash_table[i].chain = NULL;
2763         }
2764
2765         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2766         ip_rt_max_size = (rt_hash_mask + 1) * 16;
2767
2768         rt_cache_stat = alloc_percpu(struct rt_cache_stat);
2769         if (!rt_cache_stat)
2770                 return -ENOMEM;
2771
2772         devinet_init();
2773         ip_fib_init();
2774
2775         init_timer(&rt_flush_timer);
2776         rt_flush_timer.function = rt_run_flush;
2777         init_timer(&rt_periodic_timer);
2778         rt_periodic_timer.function = rt_check_expire;
2779         init_timer(&rt_secret_timer);
2780         rt_secret_timer.function = rt_secret_rebuild;
2781
2782         /* All the timers started at system startup tend
2783            to synchronize. Perturb them a bit.
2784          */
2785         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2786                                         ip_rt_gc_interval;
2787         add_timer(&rt_periodic_timer);
2788
2789         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2790                 ip_rt_secret_interval;
2791         add_timer(&rt_secret_timer);
2792
2793 #ifdef CONFIG_PROC_FS
2794         {
2795         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
2796         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2797             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
2798                                              proc_net_stat))) {
2799                 free_percpu(rt_cache_stat);
2800                 return -ENOMEM;
2801         }
2802         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
2803         }
2804 #ifdef CONFIG_NET_CLS_ROUTE
2805         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
2806 #endif
2807 #endif
2808 #ifdef CONFIG_XFRM
2809         xfrm_init();
2810         xfrm4_init();
2811 #endif
2812         return rc;
2813 }
2814
2815 EXPORT_SYMBOL(__ip_select_ident);
2816 EXPORT_SYMBOL(ip_route_input);
2817 EXPORT_SYMBOL(ip_route_output_key);