[linux-2.6.git] net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay              = 2 * HZ;
static int ip_rt_max_delay              = 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
static int ip_rt_gc_interval            = 60 * HZ;
static int ip_rt_gc_min_interval        = HZ / 2;
static int ip_rt_redirect_number        = 9;
static int ip_rt_redirect_load          = HZ / 50;
static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost             = HZ;
static int ip_rt_error_burst            = 5 * HZ;
static int ip_rt_gc_elasticity          = 8;
static int ip_rt_mtu_expires            = 10 * 60 * HZ;
static int ip_rt_min_pmtu               = 512 + 20 + 20;
static int ip_rt_min_advmss             = 256;
static int ip_rt_secret_interval        = 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst,
                                         struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
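
/*
 * How this table is used: the four IP TOS bits select a queueing priority;
 * rt_tos2priority() in include/net/route.h indexes the table as
 * ip_tos2prio[IPTOS_TOS(tos)>>1], i.e. the low (ECN/MBZ) bit is dropped and
 * bits 1..4 of the TOS byte form the index. A standalone userspace sketch of
 * that indexing follows (illustrative only, not part of the build; the sample
 * TOS values are made up):
 */
#if 0
#include <stdio.h>

#define IPTOS_TOS_MASK 0x1E                     /* as in <linux/ip.h> */
#define IPTOS_TOS(tos) ((tos) & IPTOS_TOS_MASK)

int main(void)
{
        /* 0x10 = IPTOS_LOWDELAY, 0x08 = IPTOS_THROUGHPUT, 0x04 = IPTOS_RELIABILITY */
        unsigned char samples[] = { 0x00, 0x04, 0x08, 0x10, 0x1C };
        int i;

        for (i = 0; i < 5; i++)
                printf("tos 0x%02x -> ip_tos2prio[%u]\n",
                       samples[i], IPTOS_TOS(samples[i]) >> 1);
        return 0;
}
#endif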


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
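
/*
 * A reader-side sketch of the scheme above, in the style of the lockless
 * lookups later in this file (compare ip_rt_frag_needed and ip_rt_redirect):
 * readers walk the chain under rcu_read_lock() and take a reference with an
 * atomic dst_hold(); only writers take rt_hash_table[hash].lock. Illustrative
 * only, not part of the build:
 */
#if 0
static struct rtable *rt_cache_lookup_sketch(unsigned hash, struct flowi *key)
{
        struct rtable *rth;

        rcu_read_lock();
        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
             rth = rcu_dereference(rth->u.rt_next)) {
                if (compare_keys(&rth->fl, key)) {
                        dst_hold(&rth->u.dst);  /* atomic refcount increment */
                        rcu_read_unlock();
                        return rth;
                }
        }
        rcu_read_unlock();
        return NULL;
}
#endif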

struct rt_hash_bucket {
        struct rtable   *chain;
        spinlock_t      lock;
} __attribute__((__aligned__(8)));

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
        return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
                & rt_hash_mask);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rtable *r = NULL;
        struct rt_cache_iter_state *st = seq->private;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
                if (r)
                        break;
                rcu_read_unlock_bh();
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
        struct rt_cache_iter_state *st = rcu_dereference(seq->private);

        r = r->u.rt_next;
        while (!r) {
                rcu_read_unlock_bh();
                if (--st->bucket < 0)
                        break;
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r = NULL;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                char temp[256];

                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst);
                seq_printf(seq, "%-127s\n", temp);
        }
        return 0;
}

static struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;
        rc = seq_open(file, &rt_cache_seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
        memset(s, 0, sizeof(*s));
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in the hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
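
/*
 * A standalone illustration of the score packing above: the candidate with
 * the lowest score is evicted first, so an older entry (larger age, hence a
 * smaller inverted age) loses to a fresher one unless bits 30/31 rescue it.
 * Illustrative only, not part of the build; the age value is made up:
 */
#if 0
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t age = 5000;                    /* pretend jiffies - lastuse */
        uint32_t score = ~age & ~(3u << 30);    /* bits 29..0: inverted age */

        score |= 1u << 30;                      /* e.g. an output route */

        printf("very valuable:     %u\n", (score >> 31) & 1);
        printf("not quite useless: %u\n", (score >> 30) & 1);
        printf("age recovered:     %u\n", ~score & 0x3FFFFFFF);
        return 0;
}
#endif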

/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
        static int rover;
        int i = rover, t;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;

        for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
             t -= ip_rt_gc_timeout) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                spin_lock(&rt_hash_table[i].lock);
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
                        *rthp = rth->u.rt_next;
                        rt_free(rth);
                }
                spin_unlock(&rt_hash_table[i].lock);

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                spin_lock_bh(&rt_hash_table[i].lock);
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                spin_unlock_bh(&rt_hash_table[i].lock);

                for (; rth; rth = next) {
                        next = rth->u.rt_next;
                        rt_free(rth);
                }
        }
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not yet been reached, prolong the
                   timer to "delay", otherwise fire it at the deadline.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at an equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   expire stays large enough to keep enough warm entries,
   and when load increases it shrinks to limit the cache size.
 */

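/*
 * A toy model of the feedback described above (illustrative only, not part
 * of the build; the iteration counts are made up): the expiration strength
 * "expire" is halved each time a GC pass misses its goal and is nudged back
 * up by ip_rt_gc_min_interval once a pass succeeds, capped at RT_GC_TIMEOUT.
 */
#if 0
#include <stdio.h>

#define HZ 1000
#define RT_GC_TIMEOUT (300*HZ)

int main(void)
{
        unsigned long expire = RT_GC_TIMEOUT;
        unsigned long gc_min_interval = HZ / 2;
        int miss;

        for (miss = 0; miss < 5; miss++) {      /* passes under pressure */
                expire >>= 1;
                printf("goal missed, expire -> %lu\n", expire);
        }
        expire += gc_min_interval;              /* one successful pass */
        if (expire > RT_GC_TIMEOUT)
                expire = RT_GC_TIMEOUT;
        printf("goal met,    expire -> %lu\n", expire);
        return 0;
}
#endif
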
static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * do not run it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate the number of entries which we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache really
                 * aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(&rt_hash_table[k].lock);
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                goal--;
                        }
                        spin_unlock_bh(&rt_hash_table[k].lock);
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* The goal was not achieved. We stop the process if:

                   - expire was reduced to zero; otherwise expire is halved.
                   - the table is not full.
                   - we are called from interrupt context.
                   - the jiffies check is just a fallback/debug loop breaker.
                     We will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
               fl1->oif     == fl2->oif &&
               fl1->iif     == fl2->iif;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(&rt_hash_table[hash].lock);
        while ((rth = *rthp) != NULL) {
                if (compare_keys(&rth->fl, &rt->fl)) {
                        /* Put it first */
                        *rthp = rth->u.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        rcu_assign_pointer(rth->u.rt_next,
                                           rt_hash_table[hash].chain);
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when it is exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind the route to arp only if it is an output
           route or a unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* The neighbour tables are full and nothing
                           can be released. Try to shrink the route cache;
                           it is most likely holding some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        spin_unlock_bh(&rt_hash_table[hash].lock);
        *rp = rt;
        return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
        static DEFINE_SPINLOCK(rt_peer_lock);
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that stays unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}
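
/*
 * A userspace sketch of the fallback ID chain above (illustrative only,
 * not part of the build): each ID is derived from the previous salt and
 * the destination, so the sequence is hard to predict without the hash
 * key. toy_hash() is a made-up stand-in for the kernel's keyed
 * secure_ip_id(); the destination address is made up as well.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static uint32_t toy_hash(uint32_t x)
{
        x ^= x >> 16;
        x *= 0x45d9f3bu;
        x ^= x >> 16;
        return x;
}

int main(void)
{
        uint32_t fallback_id = 0;
        uint32_t daddr = 0xC0A80001u;           /* 192.168.0.1 */
        int i;

        for (i = 0; i < 4; i++) {
                uint32_t salt = toy_hash(fallback_id ^ daddr);
                printf("id = 0x%04x\n", (unsigned)(salt & 0xFFFF));
                fallback_id = salt;
        }
        return 0;
}
#endif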

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If a peer is attached to the destination, it is never
                   detached, so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer, more));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        spin_lock_bh(&rt_hash_table[hash].lock);
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.rt_next;
                        rt_free(rt);
                        break;
                }
        spin_unlock_bh(&rt_hash_table[hash].lock);
}

void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, u8 tos, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };

        tos &= IPTOS_RT_MASK;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5),
                                                     tos);

                        rthp=&rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.fl4_tos != tos ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
                        "tos %02x\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable*)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash_code(rt->fl.fl4_dst,
                                                     rt->fl.fl4_src ^
                                                        (rt->fl.oif << 5),
                                                     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
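
/*
 * The send gate below admits the k-th redirect only after
 * ip_rt_redirect_load << k ticks have passed since the last one, so the
 * interval doubles each time. A standalone sketch with the default
 * tunables from the top of this file (illustrative only, not part of
 * the build):
 */
#if 0
#include <stdio.h>

#define HZ 1000

int main(void)
{
        unsigned long load = HZ / 50;           /* ip_rt_redirect_load */
        int number = 9;                         /* ip_rt_redirect_number */
        int tokens;

        for (tokens = 0; tokens < number; tokens++)
                printf("redirect %d: wait %lu ticks since the last one\n",
                       tokens + 1, load << tokens);
        printf("silence window: %lu ticks\n",
               (unsigned long)(HZ / 50) << (9 + 1));
        return 0;
}
#endif
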
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything;
         * set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

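/*
 * ip_error() above is a classic token bucket: tokens accrue one per tick
 * up to ip_rt_error_burst, and each ICMP error spends ip_rt_error_cost of
 * them. A standalone run with the default tunables and made-up packet
 * arrival times (illustrative only, not part of the build):
 */
#if 0
#include <stdio.h>

#define HZ 1000

int main(void)
{
        unsigned long tokens = 0, last = 0;
        unsigned long burst = 5 * HZ;           /* ip_rt_error_burst */
        unsigned long cost = HZ;                /* ip_rt_error_cost */
        unsigned long arrivals[] = { 100, 1200, 1300, 9000 };
        int i;

        for (i = 0; i < 4; i++) {
                unsigned long now = arrivals[i];

                tokens += now - last;
                if (tokens > burst)
                        tokens = burst;
                last = now;
                if (tokens >= cost) {
                        tokens -= cost;
                        printf("t=%lu: ICMP sent, %lu tokens left\n", now, tokens);
                } else {
                        printf("t=%lu: suppressed, %lu tokens\n", now, tokens);
                }
        }
        return 0;
}
#endif
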
/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
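
/*
 * Usage sketch for guess_mtu() (illustrative only, not part of the build;
 * a userspace copy of the function above): the search steps down to the
 * next plateau strictly below the old MTU, following the RFC 1191 plateau
 * table.
 */
#if 0
#include <stdio.h>

static unsigned short plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static unsigned short guess(unsigned short old_mtu)
{
        unsigned int i;

        for (i = 0; i < sizeof(plateau) / sizeof(plateau[0]); i++)
                if (old_mtu > plateau[i])
                        return plateau[i];
        return 68;
}

int main(void)
{
        /* prints 1492 576 68 */
        printf("%u %u %u\n", guess(1500), guess(1492), guess(100));
        return 0;
}
#endif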

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
        int i;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        u32  skeys[2] = { iph->saddr, 0, };
        u32  daddr = iph->daddr;
        u8   tos = iph->tos & IPTOS_RT_MASK;
        unsigned short est_mtu = 0;

        if (ipv4_config.no_pmtu_disc)
                return 0;

        for (i = 0; i < 2; i++) {
                unsigned hash = rt_hash_code(daddr, skeys[i], tos);

                rcu_read_lock();
                for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
                     rth = rcu_dereference(rth->u.rt_next)) {
                        if (rth->fl.fl4_dst == daddr &&
                            rth->fl.fl4_src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
                            rth->fl.fl4_tos == tos &&
                            rth->fl.iif == 0 &&
                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
                                unsigned short mtu = new_mtu;

                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
                                        if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
                                                dst_confirm(&rth->u.dst);
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                        ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                }
                rcu_read_unlock();
        }
        return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
        if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
            !(dst_metric_locked(dst, RTAX_MTU))) {
                if (mtu < ip_rt_min_pmtu) {
                        mtu = ip_rt_min_pmtu;
                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
                }
                dst->metrics[RTAX_MTU-1] = mtu;
                dst_set_expires(dst, ip_rt_mtu_expires);
        }
}
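
/*
 * The clamp applied above, in isolation (illustrative only, not part of
 * the build; the advertised MTU is made up): a learned PMTU below
 * ip_rt_min_pmtu is raised to that floor and the metric is locked so it
 * cannot shrink further.
 */
#if 0
#include <stdio.h>

int main(void)
{
        unsigned mtu = 296;                     /* from an ICMP frag-needed */
        unsigned min_pmtu = 512 + 20 + 20;      /* ip_rt_min_pmtu */
        int locked = 0;

        if (mtu < min_pmtu) {
                mtu = min_pmtu;
                locked = 1;
        }
        printf("pmtu=%u locked=%d\n", mtu, locked);
        return 0;
}
#endif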

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        /* IPv4 cache entries are never revalidated: returning NULL tells
         * the caller to discard its own reference; do not drop it here. */
        return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;
        struct inet_peer *peer = rt->peer;
        struct in_device *idev = rt->idev;

        if (peer) {
                rt->peer = NULL;
                inet_putpeer(peer);
        }

        if (idev) {
                rt->idev = NULL;
                in_dev_put(idev);
        }
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
        struct rtable *rt = (struct rtable *) dst;
        struct in_device *idev = rt->idev;
        if (dev != &loopback_dev && idev && idev->dev == dev) {
                struct in_device *loopback_idev = in_dev_get(&loopback_dev);
                if (loopback_idev) {
                        rt->idev = loopback_idev;
                        in_dev_put(idev);
                }
        }
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = (struct rtable *) skb->dst;
        if (rt)
                dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
                NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
                skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
        u32 src;
        struct fib_result res;

        if (rt->fl.iif == 0)
                src = rt->rt_src;
        else if (fib_lookup(&rt->fl, &res) == 0) {
                src = FIB_RES_PREFSRC(res);
                fib_res_put(&res);
        } else
                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
                                        RT_SCOPE_UNIVERSE);
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->u.dst.tclassid & 0xFFFF))
                rt->u.dst.tclassid |= tag & 0xFFFF;
        if (!(rt->u.dst.tclassid & 0xFFFF0000))
                rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
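
/*
 * A userspace run of the tag-merge rule above (illustrative only, not part
 * of the build; the tag values are made up): each 16-bit half of tclassid
 * is filled only while still zero, so a tag set earlier wins. In
 * rt_set_nexthop() below, the fib-rule tag is applied before itag.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static void set_tag(uint32_t *tclassid, uint32_t tag)
{
        if (!(*tclassid & 0xFFFF))
                *tclassid |= tag & 0xFFFF;
        if (!(*tclassid & 0xFFFF0000))
                *tclassid |= tag & 0xFFFF0000;
}

int main(void)
{
        uint32_t tclassid = 0;

        set_tag(&tclassid, 0x00050000);          /* rule tag: high half only */
        set_tag(&tclassid, 0x00030007);          /* itag: fills the low half */
        printf("tclassid = 0x%08x\n", tclassid); /* prints 0x00050007 */
        return 0;
}
#endif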

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
        struct fib_info *fi = res->fi;

        if (fi) {
                if (FIB_RES_GW(*res) &&
                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
                        rt->rt_gateway = FIB_RES_GW(*res);
                memcpy(rt->u.dst.metrics, fi->fib_metrics,
                       sizeof(rt->u.dst.metrics));
                if (fi->fib_mtu == 0) {
                        rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
                        if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
                            rt->rt_gateway != rt->rt_dst &&
                            rt->u.dst.dev->mtu > 576)
                                rt->u.dst.metrics[RTAX_MTU-1] = 576;
                }
#ifdef CONFIG_NET_CLS_ROUTE
                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
        } else
                rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

        if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
        if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
                rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
        if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
                rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
                                       ip_rt_min_advmss);
        if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
                rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, fib_rules_tclass(res));
#endif
        set_class_tag(rt, itag);
#endif
        rt->rt_type = res->type;
}
1456
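/*
 * ip_route_input_mc - build a cache entry for a multicast packet that
 * arrived on "dev".  After the sanity checks, the entry delivers
 * locally ("our" set) and/or hands the packet to the multicast
 * forwarding engine; output through such a route is a bug by
 * construction (see ip_rt_bug above).
 */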
1457 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1458                                 u8 tos, struct net_device *dev, int our)
1459 {
1460         unsigned hash;
1461         struct rtable *rth;
1462         u32 spec_dst;
1463         struct in_device *in_dev = in_dev_get(dev);
1464         u32 itag = 0;
1465
1466         /* Primary sanity checks. */
1467
1468         if (in_dev == NULL)
1469                 return -EINVAL;
1470
1471         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1472             skb->protocol != htons(ETH_P_IP))
1473                 goto e_inval;
1474
1475         if (ZERONET(saddr)) {
1476                 if (!LOCAL_MCAST(daddr))
1477                         goto e_inval;
1478                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1479         } else if (fib_validate_source(saddr, 0, tos, 0,
1480                                         dev, &spec_dst, &itag) < 0)
1481                 goto e_inval;
1482
1483         rth = dst_alloc(&ipv4_dst_ops);
1484         if (!rth)
1485                 goto e_nobufs;
1486
1487         rth->u.dst.output= ip_rt_bug;
1488
1489         atomic_set(&rth->u.dst.__refcnt, 1);
1490         rth->u.dst.flags= DST_HOST;
1491         if (in_dev->cnf.no_policy)
1492                 rth->u.dst.flags |= DST_NOPOLICY;
1493         rth->fl.fl4_dst = daddr;
1494         rth->rt_dst     = daddr;
1495         rth->fl.fl4_tos = tos;
1496 #ifdef CONFIG_IP_ROUTE_FWMARK
1497         rth->fl.fl4_fwmark= skb->nfmark;
1498 #endif
1499         rth->fl.fl4_src = saddr;
1500         rth->rt_src     = saddr;
1501 #ifdef CONFIG_NET_CLS_ROUTE
1502         rth->u.dst.tclassid = itag;
1503 #endif
1504         rth->rt_iif     =
1505         rth->fl.iif     = dev->ifindex;
1506         rth->u.dst.dev  = &loopback_dev;
1507         dev_hold(rth->u.dst.dev);
1508         rth->idev       = in_dev_get(rth->u.dst.dev);
1509         rth->fl.oif     = 0;
1510         rth->rt_gateway = daddr;
1511         rth->rt_spec_dst= spec_dst;
1512         rth->rt_type    = RTN_MULTICAST;
1513         rth->rt_flags   = RTCF_MULTICAST;
1514         if (our) {
1515                 rth->u.dst.input= ip_local_deliver;
1516                 rth->rt_flags |= RTCF_LOCAL;
1517         }
1518
1519 #ifdef CONFIG_IP_MROUTE
1520         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1521                 rth->u.dst.input = ip_mr_input;
1522 #endif
1523         RT_CACHE_STAT_INC(in_slow_mc);
1524
1525         in_dev_put(in_dev);
1526         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1527         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1528
1529 e_nobufs:
1530         in_dev_put(in_dev);
1531         return -ENOBUFS;
1532
1533 e_inval:
1534         in_dev_put(in_dev);
1535         return -EINVAL;
1536 }
1537
1538 /*
1539  *      NOTE. We drop all packets that have a local source
1540  *      address, because every properly looped-back packet must
1541  *      already have the correct destination attached by the output routine.
1542  *
1543  *      This approach solves two big problems:
1544  *      1. Non-simplex devices are handled properly.
1545  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1546  */
1547
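/*
 * ip_route_input_slow - the input-path resolver behind the cache:
 * screen out martians, consult the FIB, validate the source address
 * (reverse path), then build a forwarding, local, broadcast or
 * unreachable cache entry and insert it with rt_intern_hash().
 */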
1548 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1549                         u8 tos, struct net_device *dev)
1550 {
1551         struct fib_result res;
1552         struct in_device *in_dev = in_dev_get(dev);
1553         struct in_device *out_dev = NULL;
1554         struct flowi fl = { .nl_u = { .ip4_u =
1555                                       { .daddr = daddr,
1556                                         .saddr = saddr,
1557                                         .tos = tos,
1558                                         .scope = RT_SCOPE_UNIVERSE,
1559 #ifdef CONFIG_IP_ROUTE_FWMARK
1560                                         .fwmark = skb->nfmark
1561 #endif
1562                                       } },
1563                             .iif = dev->ifindex };
1564         unsigned        flags = 0;
1565         u32             itag = 0;
1566         struct rtable * rth;
1567         unsigned        hash;
1568         u32             spec_dst;
1569         int             err = -EINVAL;
1570         int             free_res = 0;
1571
1572         /* IP on this device is disabled. */
1573
1574         if (!in_dev)
1575                 goto out;
1576
1577         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
1578
1579         /* Check for the weirdest martians, which cannot be detected
1580            by fib_lookup.
1581          */
1582
1583         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1584                 goto martian_source;
1585
1586         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1587                 goto brd_input;
1588
1589         /* Accept zero addresses only for limited broadcast;
1590          * I do not even know whether to fix it or not. Waiting for complaints :-)
1591          */
1592         if (ZERONET(saddr))
1593                 goto martian_source;
1594
1595         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1596                 goto martian_destination;
1597
1598         /*
1599          *      Now we are ready to route the packet.
1600          */
1601         if ((err = fib_lookup(&fl, &res)) != 0) {
1602                 if (!IN_DEV_FORWARD(in_dev))
1603                         goto e_inval;
1604                 goto no_route;
1605         }
1606         free_res = 1;
1607
1608         RT_CACHE_STAT_INC(in_slow_tot);
1609
1610         if (res.type == RTN_BROADCAST)
1611                 goto brd_input;
1612
1613         if (res.type == RTN_LOCAL) {
1614                 int result;
1615                 result = fib_validate_source(saddr, daddr, tos,
1616                                              loopback_dev.ifindex,
1617                                              dev, &spec_dst, &itag);
1618                 if (result < 0)
1619                         goto martian_source;
1620                 if (result)
1621                         flags |= RTCF_DIRECTSRC;
1622                 spec_dst = daddr;
1623                 goto local_input;
1624         }
1625
1626         if (!IN_DEV_FORWARD(in_dev))
1627                 goto e_inval;
1628         if (res.type != RTN_UNICAST)
1629                 goto martian_destination;
1630
1631 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1632         if (res.fi->fib_nhs > 1 && fl.oif == 0)
1633                 fib_select_multipath(&fl, &res);
1634 #endif
1635         out_dev = in_dev_get(FIB_RES_DEV(res));
1636         if (out_dev == NULL) {
1637                 if (net_ratelimit())
1638                         printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1639                                          "Please, report\n");
1640                 goto e_inval;
1641         }
1642
1643         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1644                                   &spec_dst, &itag);
1645         if (err < 0)
1646                 goto martian_source;
1647
1648         if (err)
1649                 flags |= RTCF_DIRECTSRC;
1650
1651         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1652             (IN_DEV_SHARED_MEDIA(out_dev) ||
1653              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1654                 flags |= RTCF_DOREDIRECT;
1655
1656         if (skb->protocol != htons(ETH_P_IP)) {
1657                 /* Not IP (i.e. ARP). Do not create a route if it is
1658                  * invalid for proxy ARP. DNAT routes are always valid.
1659                  */
1660                 if (out_dev == in_dev && !(flags & RTCF_DNAT))
1661                         goto e_inval;
1662         }
1663
1664         rth = dst_alloc(&ipv4_dst_ops);
1665         if (!rth)
1666                 goto e_nobufs;
1667
1668         atomic_set(&rth->u.dst.__refcnt, 1);
1669         rth->u.dst.flags= DST_HOST;
1670         if (in_dev->cnf.no_policy)
1671                 rth->u.dst.flags |= DST_NOPOLICY;
1672         if (in_dev->cnf.no_xfrm)
1673                 rth->u.dst.flags |= DST_NOXFRM;
1674         rth->fl.fl4_dst = daddr;
1675         rth->rt_dst     = daddr;
1676         rth->fl.fl4_tos = tos;
1677 #ifdef CONFIG_IP_ROUTE_FWMARK
1678         rth->fl.fl4_fwmark= skb->nfmark;
1679 #endif
1680         rth->fl.fl4_src = saddr;
1681         rth->rt_src     = saddr;
1682         rth->rt_gateway = daddr;
1683         rth->rt_iif     =
1684         rth->fl.iif     = dev->ifindex;
1685         rth->u.dst.dev  = out_dev->dev;
1686         dev_hold(rth->u.dst.dev);
1687         rth->idev       = in_dev_get(rth->u.dst.dev);
1688         rth->fl.oif     = 0;
1689         rth->rt_spec_dst= spec_dst;
1690
1691         rth->u.dst.input = ip_forward;
1692         rth->u.dst.output = ip_output;
1693
1694         rt_set_nexthop(rth, &res, itag);
1695
1696         rth->rt_flags = flags;
1697
1698 intern:
1699         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1700 done:
1701         in_dev_put(in_dev);
1702         if (out_dev)
1703                 in_dev_put(out_dev);
1704         if (free_res)
1705                 fib_res_put(&res);
1706 out:    return err;
1707
1708 brd_input:
1709         if (skb->protocol != htons(ETH_P_IP))
1710                 goto e_inval;
1711
1712         if (ZERONET(saddr))
1713                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1714         else {
1715                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1716                                           &itag);
1717                 if (err < 0)
1718                         goto martian_source;
1719                 if (err)
1720                         flags |= RTCF_DIRECTSRC;
1721         }
1722         flags |= RTCF_BROADCAST;
1723         res.type = RTN_BROADCAST;
1724         RT_CACHE_STAT_INC(in_brd);
1725
1726 local_input:
1727         rth = dst_alloc(&ipv4_dst_ops);
1728         if (!rth)
1729                 goto e_nobufs;
1730
1731         rth->u.dst.output= ip_rt_bug;
1732
1733         atomic_set(&rth->u.dst.__refcnt, 1);
1734         rth->u.dst.flags= DST_HOST;
1735         if (in_dev->cnf.no_policy)
1736                 rth->u.dst.flags |= DST_NOPOLICY;
1737         rth->fl.fl4_dst = daddr;
1738         rth->rt_dst     = daddr;
1739         rth->fl.fl4_tos = tos;
1740 #ifdef CONFIG_IP_ROUTE_FWMARK
1741         rth->fl.fl4_fwmark= skb->nfmark;
1742 #endif
1743         rth->fl.fl4_src = saddr;
1744         rth->rt_src     = saddr;
1745 #ifdef CONFIG_NET_CLS_ROUTE
1746         rth->u.dst.tclassid = itag;
1747 #endif
1748         rth->rt_iif     =
1749         rth->fl.iif     = dev->ifindex;
1750         rth->u.dst.dev  = &loopback_dev;
1751         dev_hold(rth->u.dst.dev);
1752         rth->idev       = in_dev_get(rth->u.dst.dev);
1753         rth->rt_gateway = daddr;
1754         rth->rt_spec_dst= spec_dst;
1755         rth->u.dst.input= ip_local_deliver;
1756         rth->rt_flags   = flags|RTCF_LOCAL;
1757         if (res.type == RTN_UNREACHABLE) {
1758                 rth->u.dst.input= ip_error;
1759                 rth->u.dst.error= -err;
1760                 rth->rt_flags   &= ~RTCF_LOCAL;
1761         }
1762         rth->rt_type    = res.type;
1763         goto intern;
1764
1765 no_route:
1766         RT_CACHE_STAT_INC(in_no_route);
1767         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1768         res.type = RTN_UNREACHABLE;
1769         goto local_input;
1770
1771         /*
1772          *      Do not cache martian addresses: they should be logged (RFC1812)
1773          */
1774 martian_destination:
1775         RT_CACHE_STAT_INC(in_martian_dst);
1776 #ifdef CONFIG_IP_ROUTE_VERBOSE
1777         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1778                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1779                         "%u.%u.%u.%u, dev %s\n",
1780                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1781 #endif
1782 e_inval:
1783         err = -EINVAL;
1784         goto done;
1785
1786 e_nobufs:
1787         err = -ENOBUFS;
1788         goto done;
1789
1790 martian_source:
1791
1792         RT_CACHE_STAT_INC(in_martian_src);
1793 #ifdef CONFIG_IP_ROUTE_VERBOSE
1794         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1795                 /*
1796                  *      RFC1812 recommendation, if source is martian,
1797                  *      the only hint is MAC header.
1798                  */
1799                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1800                         "%u.%u.%u.%u, on dev %s\n",
1801                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1802                 if (dev->hard_header_len) {
1803                         int i;
1804                         unsigned char *p = skb->mac.raw;
1805                         printk(KERN_WARNING "ll header: ");
1806                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1807                                 printk("%02x", *p);
1808                                 if (i < (dev->hard_header_len - 1))
1809                                         printk(":");
1810                         }
1811                         printk("\n");
1812                 }
1813         }
1814 #endif
1815         goto e_inval;
1816 }
1817
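/*
 * ip_route_input - fast path for input routing: probe the hash chain
 * under rcu_read_lock() with (daddr, saddr, iif, tos) as the key, and
 * fall back to multicast classification or ip_route_input_slow() on a
 * miss.  A minimal caller sketch (assumed; the real caller is the IP
 * receive path):
 *
 *	if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
 *		goto drop;
 *	return dst_input(skb);	(this invokes rth->u.dst.input)
 */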
1818 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1819                    u8 tos, struct net_device *dev)
1820 {
1821         struct rtable * rth;
1822         unsigned        hash;
1823         int iif = dev->ifindex;
1824
1825         tos &= IPTOS_RT_MASK;
1826         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1827
1828         rcu_read_lock();
1829         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1830              rth = rcu_dereference(rth->u.rt_next)) {
1831                 if (rth->fl.fl4_dst == daddr &&
1832                     rth->fl.fl4_src == saddr &&
1833                     rth->fl.iif == iif &&
1834                     rth->fl.oif == 0 &&
1835 #ifdef CONFIG_IP_ROUTE_FWMARK
1836                     rth->fl.fl4_fwmark == skb->nfmark &&
1837 #endif
1838                     rth->fl.fl4_tos == tos) {
1839                         rth->u.dst.lastuse = jiffies;
1840                         dst_hold(&rth->u.dst);
1841                         rth->u.dst.__use++;
1842                         RT_CACHE_STAT_INC(in_hit);
1843                         rcu_read_unlock();
1844                         skb->dst = (struct dst_entry*)rth;
1845                         return 0;
1846                 }
1847                 RT_CACHE_STAT_INC(in_hlist_search);
1848         }
1849         rcu_read_unlock();
1850
1851         /* Multicast recognition logic is moved from the route cache
1852            to here.  The problem was that too many Ethernet cards have
1853            broken/missing hardware multicast filters :-( As a result, a
1854            host on a multicast network acquires a lot of useless route
1855            cache entries, e.g. for SDR messages from all over the world.
1856            Now we try to get rid of them.  Really, provided the software
1857            IP multicast filter is organized reasonably (at least, hashed),
1858            it does not result in a slowdown compared with route cache
1859            reject entries.  Note that multicast routers are not affected,
1860            because a route cache entry is created eventually.
1861          */
1862         if (MULTICAST(daddr)) {
1863                 struct in_device *in_dev;
1864
1865                 rcu_read_lock();
1866                 if ((in_dev = __in_dev_get(dev)) != NULL) {
1867                         int our = ip_check_mc(in_dev, daddr, saddr,
1868                                 skb->nh.iph->protocol);
1869                         if (our
1870 #ifdef CONFIG_IP_MROUTE
1871                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1872 #endif
1873                             ) {
1874                                 rcu_read_unlock();
1875                                 return ip_route_input_mc(skb, daddr, saddr,
1876                                                          tos, dev, our);
1877                         }
1878                 }
1879                 rcu_read_unlock();
1880                 return -EINVAL;
1881         }
1882         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1883 }
1884
1885 /*
1886  * Major route resolver routine.
1887  */
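/*
 * (Overview of the steps below: validate a caller-supplied source
 * address, honour an explicit oif, short-circuit to the loopback
 * device when no destination is given, otherwise consult the FIB;
 * then classify the destination, build the rtable entry and insert
 * it with rt_intern_hash().)
 */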
1888
1889 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
1890 {
1891         u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
1892         struct flowi fl = { .nl_u = { .ip4_u =
1893                                       { .daddr = oldflp->fl4_dst,
1894                                         .saddr = oldflp->fl4_src,
1895                                         .tos = tos & IPTOS_RT_MASK,
1896                                         .scope = ((tos & RTO_ONLINK) ?
1897                                                   RT_SCOPE_LINK :
1898                                                   RT_SCOPE_UNIVERSE),
1899 #ifdef CONFIG_IP_ROUTE_FWMARK
1900                                         .fwmark = oldflp->fl4_fwmark
1901 #endif
1902                                       } },
1903                             .iif = loopback_dev.ifindex,
1904                             .oif = oldflp->oif };
1905         struct fib_result res;
1906         unsigned flags = 0;
1907         struct rtable *rth;
1908         struct net_device *dev_out = NULL;
1909         struct in_device *in_dev = NULL;
1910         unsigned hash;
1911         int free_res = 0;
1912         int err;
1913
1914         res.fi          = NULL;
1915 #ifdef CONFIG_IP_MULTIPLE_TABLES
1916         res.r           = NULL;
1917 #endif
1918
1919         if (oldflp->fl4_src) {
1920                 err = -EINVAL;
1921                 if (MULTICAST(oldflp->fl4_src) ||
1922                     BADCLASS(oldflp->fl4_src) ||
1923                     ZERONET(oldflp->fl4_src))
1924                         goto out;
1925
1926                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1927                 dev_out = ip_dev_find(oldflp->fl4_src);
1928                 if (dev_out == NULL)
1929                         goto out;
1930
1931                 /* I removed the check for oif == dev_out->oif here.
1932                    It was wrong for two reasons:
1933                    1. ip_dev_find(saddr) can return the wrong iface if
1934                       saddr is assigned to multiple interfaces.
1935                    2. Moreover, we are allowed to send packets with the
1936                       saddr of another iface. --ANK
1937                  */
1938
1939                 if (oldflp->oif == 0
1940                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
1941                         /* Special hack: the user can direct multicasts
1942                            and limited broadcast via the desired interface
1943                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1944                            This hack is not just for fun, it allows
1945                            vic, vat and friends to work.
1946                            They bind a socket to loopback, set ttl to zero
1947                            and expect that it will work.
1948                            From the viewpoint of the routing cache they are
1949                            broken: we are not allowed to build a multicast
1950                            path with a loopback source addr (the routing cache
1951                            cannot know that ttl is zero, so that the packet
1952                            will not leave this host and the route is valid).
1953                            Luckily, this hack is a good workaround.
1954                          */
1955
1956                         fl.oif = dev_out->ifindex;
1957                         goto make_route;
1958                 }
1959                 if (dev_out)
1960                         dev_put(dev_out);
1961                 dev_out = NULL;
1962         }
1963         if (oldflp->oif) {
1964                 dev_out = dev_get_by_index(oldflp->oif);
1965                 err = -ENODEV;
1966                 if (dev_out == NULL)
1967                         goto out;
1968                 if (__in_dev_get(dev_out) == NULL) {
1969                         dev_put(dev_out);
1970                         goto out;       /* Wrong error code */
1971                 }
1972
1973                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
1974                         if (!fl.fl4_src)
1975                                 fl.fl4_src = inet_select_addr(dev_out, 0,
1976                                                               RT_SCOPE_LINK);
1977                         goto make_route;
1978                 }
1979                 if (!fl.fl4_src) {
1980                         if (MULTICAST(oldflp->fl4_dst))
1981                                 fl.fl4_src = inet_select_addr(dev_out, 0,
1982                                                               fl.fl4_scope);
1983                         else if (!oldflp->fl4_dst)
1984                                 fl.fl4_src = inet_select_addr(dev_out, 0,
1985                                                               RT_SCOPE_HOST);
1986                 }
1987         }
1988
1989         if (!fl.fl4_dst) {
1990                 fl.fl4_dst = fl.fl4_src;
1991                 if (!fl.fl4_dst)
1992                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
1993                 if (dev_out)
1994                         dev_put(dev_out);
1995                 dev_out = &loopback_dev;
1996                 dev_hold(dev_out);
1997                 fl.oif = loopback_dev.ifindex;
1998                 res.type = RTN_LOCAL;
1999                 flags |= RTCF_LOCAL;
2000                 goto make_route;
2001         }
2002
2003         if (fib_lookup(&fl, &res)) {
2004                 res.fi = NULL;
2005                 if (oldflp->oif) {
2006                         /* Apparently, the routing tables are wrong. Assume
2007                            that the destination is on-link.
2008
2009                            WHY? DW.
2010                            Because we are allowed to send to an iface
2011                            even if it has NO routes and NO assigned
2012                            addresses. When oif is specified, the routing
2013                            tables are looked up with only one purpose:
2014                            to catch if the destination is gatewayed rather
2015                            than direct. Moreover, if MSG_DONTROUTE is set,
2016                            we send the packet, ignoring both the routing
2017                            tables and the ifaddr state. --ANK
2018
2019
2020                            We could do this even if oif is unknown,
2021                            as IPv6 likely does, but we do not.
2022                          */
2023
2024                         if (fl.fl4_src == 0)
2025                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2026                                                               RT_SCOPE_LINK);
2027                         res.type = RTN_UNICAST;
2028                         goto make_route;
2029                 }
2030                 if (dev_out)
2031                         dev_put(dev_out);
2032                 err = -ENETUNREACH;
2033                 goto out;
2034         }
2035         free_res = 1;
2036
2037         if (res.type == RTN_LOCAL) {
2038                 if (!fl.fl4_src)
2039                         fl.fl4_src = fl.fl4_dst;
2040                 if (dev_out)
2041                         dev_put(dev_out);
2042                 dev_out = &loopback_dev;
2043                 dev_hold(dev_out);
2044                 fl.oif = dev_out->ifindex;
2045                 if (res.fi)
2046                         fib_info_put(res.fi);
2047                 res.fi = NULL;
2048                 flags |= RTCF_LOCAL;
2049                 goto make_route;
2050         }
2051
2052 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2053         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2054                 fib_select_multipath(&fl, &res);
2055         else
2056 #endif
2057         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2058                 fib_select_default(&fl, &res);
2059
2060         if (!fl.fl4_src)
2061                 fl.fl4_src = FIB_RES_PREFSRC(res);
2062
2063         if (dev_out)
2064                 dev_put(dev_out);
2065         dev_out = FIB_RES_DEV(res);
2066         dev_hold(dev_out);
2067         fl.oif = dev_out->ifindex;
2068
2069 make_route:
2070         if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2071                 goto e_inval;
2072
2073         if (fl.fl4_dst == 0xFFFFFFFF)
2074                 res.type = RTN_BROADCAST;
2075         else if (MULTICAST(fl.fl4_dst))
2076                 res.type = RTN_MULTICAST;
2077         else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
2078                 goto e_inval;
2079
2080         if (dev_out->flags & IFF_LOOPBACK)
2081                 flags |= RTCF_LOCAL;
2082
2083         in_dev = in_dev_get(dev_out);
2084         if (!in_dev)
2085                 goto e_inval;
2086
2087         if (res.type == RTN_BROADCAST) {
2088                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2089                 if (res.fi) {
2090                         fib_info_put(res.fi);
2091                         res.fi = NULL;
2092                 }
2093         } else if (res.type == RTN_MULTICAST) {
2094                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2095                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
2096                         flags &= ~RTCF_LOCAL;
2097                 /* If a multicast route does not exist, use
2098                    the default one, but do not gateway in this case.
2099                    Yes, it is a hack.
2100                  */
2101                 if (res.fi && res.prefixlen < 4) {
2102                         fib_info_put(res.fi);
2103                         res.fi = NULL;
2104                 }
2105         }
2106
2107         rth = dst_alloc(&ipv4_dst_ops);
2108         if (!rth)
2109                 goto e_nobufs;
2110
2111         atomic_set(&rth->u.dst.__refcnt, 1);
2112         rth->u.dst.flags= DST_HOST;
2113         if (in_dev->cnf.no_xfrm)
2114                 rth->u.dst.flags |= DST_NOXFRM;
2115         if (in_dev->cnf.no_policy)
2116                 rth->u.dst.flags |= DST_NOPOLICY;
2117         rth->fl.fl4_dst = oldflp->fl4_dst;
2118         rth->fl.fl4_tos = tos;
2119         rth->fl.fl4_src = oldflp->fl4_src;
2120         rth->fl.oif     = oldflp->oif;
2121 #ifdef CONFIG_IP_ROUTE_FWMARK
2122         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2123 #endif
2124         rth->rt_dst     = fl.fl4_dst;
2125         rth->rt_src     = fl.fl4_src;
2126         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2127         rth->u.dst.dev  = dev_out;
2128         dev_hold(dev_out);
2129         rth->idev       = in_dev_get(dev_out);
2130         rth->rt_gateway = fl.fl4_dst;
2131         rth->rt_spec_dst= fl.fl4_src;
2132
2133         rth->u.dst.output=ip_output;
2134
2135         RT_CACHE_STAT_INC(out_slow_tot);
2136
2137         if (flags & RTCF_LOCAL) {
2138                 rth->u.dst.input = ip_local_deliver;
2139                 rth->rt_spec_dst = fl.fl4_dst;
2140         }
2141         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2142                 rth->rt_spec_dst = fl.fl4_src;
2143                 if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2144                         rth->u.dst.output = ip_mc_output;
2145                         RT_CACHE_STAT_INC(out_slow_mc);
2146                 }
2147 #ifdef CONFIG_IP_MROUTE
2148                 if (res.type == RTN_MULTICAST) {
2149                         if (IN_DEV_MFORWARD(in_dev) &&
2150                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2151                                 rth->u.dst.input = ip_mr_input;
2152                                 rth->u.dst.output = ip_mc_output;
2153                         }
2154                 }
2155 #endif
2156         }
2157
2158         rt_set_nexthop(rth, &res, 0);
2159
2160
2161         rth->rt_flags = flags;
2162
2163         hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2164         err = rt_intern_hash(hash, rth, rp);
2165 done:
2166         if (free_res)
2167                 fib_res_put(&res);
2168         if (dev_out)
2169                 dev_put(dev_out);
2170         if (in_dev)
2171                 in_dev_put(in_dev);
2172 out:    return err;
2173
2174 e_inval:
2175         err = -EINVAL;
2176         goto done;
2177 e_nobufs:
2178         err = -ENOBUFS;
2179         goto done;
2180 }
2181
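/*
 * __ip_route_output_key - output-path cache probe.  It runs under
 * rcu_read_lock_bh() since callers may be in process context while the
 * chains are also manipulated from softirqs.  Note the TOS match: only
 * the IPTOS_RT_MASK | RTO_ONLINK bits participate, so the on-link hint
 * is part of the key.  A miss falls through to ip_route_output_slow().
 */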
2182 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2183 {
2184         unsigned hash;
2185         struct rtable *rth;
2186
2187         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2188
2189         rcu_read_lock_bh();
2190         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2191                 rth = rcu_dereference(rth->u.rt_next)) {
2192                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2193                     rth->fl.fl4_src == flp->fl4_src &&
2194                     rth->fl.iif == 0 &&
2195                     rth->fl.oif == flp->oif &&
2196 #ifdef CONFIG_IP_ROUTE_FWMARK
2197                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2198 #endif
2199                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2200                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2201                         rth->u.dst.lastuse = jiffies;
2202                         dst_hold(&rth->u.dst);
2203                         rth->u.dst.__use++;
2204                         RT_CACHE_STAT_INC(out_hit);
2205                         rcu_read_unlock_bh();
2206                         *rp = rth;
2207                         return 0;
2208                 }
2209                 RT_CACHE_STAT_INC(out_hlist_search);
2210         }
2211         rcu_read_unlock_bh();
2212
2213         return ip_route_output_slow(rp, flp);
2214 }
2215
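/*
 * ip_route_output_flow - resolve an output route and, when the flow
 * carries a protocol, fill in the selected addresses and pass the
 * result through xfrm_lookup() so IPsec policy can transform or veto
 * the route.
 */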
2216 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2217 {
2218         int err;
2219
2220         if ((err = __ip_route_output_key(rp, flp)) != 0)
2221                 return err;
2222
2223         if (flp->proto) {
2224                 if (!flp->fl4_src)
2225                         flp->fl4_src = (*rp)->rt_src;
2226                 if (!flp->fl4_dst)
2227                         flp->fl4_dst = (*rp)->rt_dst;
2228                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2229         }
2230
2231         return 0;
2232 }
2233
2234 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2235 {
2236         return ip_route_output_flow(rp, flp, NULL, 0);
2237 }
2238
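/*
 * rt_fill_info - encode one cache entry as an RTM_NEWROUTE netlink
 * message: the rtmsg header, then RTA_* attributes (addresses, oif,
 * metrics) and an RTA_CACHEINFO block with usage counters and
 * timestamps.  Returns skb->len on success, -1 if the message did not
 * fit.
 */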
2239 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2240                         int nowait)
2241 {
2242         struct rtable *rt = (struct rtable*)skb->dst;
2243         struct rtmsg *r;
2244         struct nlmsghdr  *nlh;
2245         unsigned char    *b = skb->tail;
2246         struct rta_cacheinfo ci;
2247 #ifdef CONFIG_IP_MROUTE
2248         struct rtattr *eptr;
2249 #endif
2250         nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2251         r = NLMSG_DATA(nlh);
2252         nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2253         r->rtm_family    = AF_INET;
2254         r->rtm_dst_len  = 32;
2255         r->rtm_src_len  = 0;
2256         r->rtm_tos      = rt->fl.fl4_tos;
2257         r->rtm_table    = RT_TABLE_MAIN;
2258         r->rtm_type     = rt->rt_type;
2259         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2260         r->rtm_protocol = RTPROT_UNSPEC;
2261         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2262         if (rt->rt_flags & RTCF_NOTIFY)
2263                 r->rtm_flags |= RTM_F_NOTIFY;
2264         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2265         if (rt->fl.fl4_src) {
2266                 r->rtm_src_len = 32;
2267                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2268         }
2269         if (rt->u.dst.dev)
2270                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2271 #ifdef CONFIG_NET_CLS_ROUTE
2272         if (rt->u.dst.tclassid)
2273                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2274 #endif
2275         if (rt->fl.iif)
2276                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2277         else if (rt->rt_src != rt->fl.fl4_src)
2278                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2279         if (rt->rt_dst != rt->rt_gateway)
2280                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2281         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2282                 goto rtattr_failure;
2283         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2284         ci.rta_used     = rt->u.dst.__use;
2285         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2286         if (rt->u.dst.expires)
2287                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2288         else
2289                 ci.rta_expires = 0;
2290         ci.rta_error    = rt->u.dst.error;
2291         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2292         if (rt->peer) {
2293                 ci.rta_id = rt->peer->ip_id_count;
2294                 if (rt->peer->tcp_ts_stamp) {
2295                         ci.rta_ts = rt->peer->tcp_ts;
2296                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2297                 }
2298         }
2299 #ifdef CONFIG_IP_MROUTE
2300         eptr = (struct rtattr*)skb->tail;
2301 #endif
2302         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2303         if (rt->fl.iif) {
2304 #ifdef CONFIG_IP_MROUTE
2305                 u32 dst = rt->rt_dst;
2306
2307                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2308                     ipv4_devconf.mc_forwarding) {
2309                         int err = ipmr_get_route(skb, r, nowait);
2310                         if (err <= 0) {
2311                                 if (!nowait) {
2312                                         if (err == 0)
2313                                                 return 0;
2314                                         goto nlmsg_failure;
2315                                 } else {
2316                                         if (err == -EMSGSIZE)
2317                                                 goto nlmsg_failure;
2318                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2319                                 }
2320                         }
2321                 } else
2322 #endif
2323                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2324         }
2325
2326         nlh->nlmsg_len = skb->tail - b;
2327         return skb->len;
2328
2329 nlmsg_failure:
2330 rtattr_failure:
2331         skb_trim(skb, b - skb->data);
2332         return -1;
2333 }
2334
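/*
 * inet_rtm_getroute - RTM_GETROUTE handler.  With RTA_IIF the lookup is
 * replayed through the input path against a dummy skb; otherwise it is
 * an ordinary output lookup.  The answer is a single RTM_NEWROUTE
 * unicast back to the requester; this is what, e.g.,
 * "ip route get 10.0.0.1" exercises from userspace.
 */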
2335 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2336 {
2337         struct rtattr **rta = arg;
2338         struct rtmsg *rtm = NLMSG_DATA(nlh);
2339         struct rtable *rt = NULL;
2340         u32 dst = 0;
2341         u32 src = 0;
2342         int iif = 0;
2343         int err = -ENOBUFS;
2344         struct sk_buff *skb;
2345
2346         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2347         if (!skb)
2348                 goto out;
2349
2350         /* Reserve room for dummy headers; this skb can pass
2351            through a good chunk of the routing engine.
2352          */
2353         skb->mac.raw = skb->data;
2354         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2355
2356         if (rta[RTA_SRC - 1])
2357                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2358         if (rta[RTA_DST - 1])
2359                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2360         if (rta[RTA_IIF - 1])
2361                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2362
2363         if (iif) {
2364                 struct net_device *dev = __dev_get_by_index(iif);
2365                 err = -ENODEV;
2366                 if (!dev)
2367                         goto out_free;
2368                 skb->protocol   = htons(ETH_P_IP);
2369                 skb->dev        = dev;
2370                 local_bh_disable();
2371                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2372                 local_bh_enable();
2373                 rt = (struct rtable*)skb->dst;
2374                 if (!err && rt->u.dst.error)
2375                         err = -rt->u.dst.error;
2376         } else {
2377                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2378                                                          .saddr = src,
2379                                                          .tos = rtm->rtm_tos } } };
2380                 int oif = 0;
2381                 if (rta[RTA_OIF - 1])
2382                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2383                 fl.oif = oif;
2384                 err = ip_route_output_key(&rt, &fl);
2385         }
2386         if (err)
2387                 goto out_free;
2388
2389         skb->dst = &rt->u.dst;
2390         if (rtm->rtm_flags & RTM_F_NOTIFY)
2391                 rt->rt_flags |= RTCF_NOTIFY;
2392
2393         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2394
2395         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2396                                 RTM_NEWROUTE, 0);
2397         if (!err)
2398                 goto out_free;
2399         if (err < 0) {
2400                 err = -EMSGSIZE;
2401                 goto out_free;
2402         }
2403
2404         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2405         if (err > 0)
2406                 err = 0;
2407 out:    return err;
2408
2409 out_free:
2410         kfree_skb(skb);
2411         goto out;
2412 }
2413
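/*
 * ip_rt_dump - dump the whole route cache for NLM_F_DUMP requests.
 * cb->args[0]/args[1] record the hash bucket and chain position so an
 * skb-full dump can resume exactly where the previous batch stopped.
 */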
2414 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2415 {
2416         struct rtable *rt;
2417         int h, s_h;
2418         int idx, s_idx;
2419
2420         s_h = cb->args[0];
2421         s_idx = idx = cb->args[1];
2422         for (h = 0; h <= rt_hash_mask; h++) {
2423                 if (h < s_h) continue;
2424                 if (h > s_h)
2425                         s_idx = 0;
2426                 rcu_read_lock_bh();
2427                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2428                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2429                         if (idx < s_idx)
2430                                 continue;
2431                         skb->dst = dst_clone(&rt->u.dst);
2432                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2433                                          cb->nlh->nlmsg_seq,
2434                                          RTM_NEWROUTE, 1) <= 0) {
2435                                 dst_release(xchg(&skb->dst, NULL));
2436                                 rcu_read_unlock_bh();
2437                                 goto done;
2438                         }
2439                         dst_release(xchg(&skb->dst, NULL));
2440                 }
2441                 rcu_read_unlock_bh();
2442         }
2443
2444 done:
2445         cb->args[0] = h;
2446         cb->args[1] = idx;
2447         return skb->len;
2448 }
2449
2450 void ip_rt_multicast_event(struct in_device *in_dev)
2451 {
2452         rt_cache_flush(0);
2453 }
2454
2455 #ifdef CONFIG_SYSCTL
2456 static int flush_delay;
2457
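/*
 * Writing an integer to /proc/sys/net/ipv4/route/flush flushes the
 * route cache; the value is passed to rt_cache_flush() as the delay,
 * e.g. "echo 0 > /proc/sys/net/ipv4/route/flush" for an immediate
 * flush.  Reads are rejected.
 */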
2458 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2459                                         struct file *filp, void __user *buffer,
2460                                         size_t *lenp, loff_t *ppos)
2461 {
2462         if (write) {
2463                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2464                 rt_cache_flush(flush_delay);
2465                 return 0;
2466         }
2467
2468         return -EINVAL;
2469 }
2470
2471 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2472                                                 int __user *name,
2473                                                 int nlen,
2474                                                 void __user *oldval,
2475                                                 size_t __user *oldlenp,
2476                                                 void __user *newval,
2477                                                 size_t newlen,
2478                                                 void **context)
2479 {
2480         int delay;
2481         if (newlen != sizeof(int))
2482                 return -EINVAL;
2483         if (get_user(delay, (int __user *)newval))
2484                 return -EFAULT;
2485         rt_cache_flush(delay);
2486         return 0;
2487 }
2488
2489 ctl_table ipv4_route_table[] = {
2490         {
2491                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2492                 .procname       = "flush",
2493                 .data           = &flush_delay,
2494                 .maxlen         = sizeof(int),
2495                 .mode           = 0644,
2496                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2497                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2498         },
2499         {
2500                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2501                 .procname       = "min_delay",
2502                 .data           = &ip_rt_min_delay,
2503                 .maxlen         = sizeof(int),
2504                 .mode           = 0644,
2505                 .proc_handler   = &proc_dointvec_jiffies,
2506                 .strategy       = &sysctl_jiffies,
2507         },
2508         {
2509                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2510                 .procname       = "max_delay",
2511                 .data           = &ip_rt_max_delay,
2512                 .maxlen         = sizeof(int),
2513                 .mode           = 0644,
2514                 .proc_handler   = &proc_dointvec_jiffies,
2515                 .strategy       = &sysctl_jiffies,
2516         },
2517         {
2518                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2519                 .procname       = "gc_thresh",
2520                 .data           = &ipv4_dst_ops.gc_thresh,
2521                 .maxlen         = sizeof(int),
2522                 .mode           = 0644,
2523                 .proc_handler   = &proc_dointvec,
2524         },
2525         {
2526                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2527                 .procname       = "max_size",
2528                 .data           = &ip_rt_max_size,
2529                 .maxlen         = sizeof(int),
2530                 .mode           = 0644,
2531                 .proc_handler   = &proc_dointvec,
2532         },
2533         {
2534                 /* Deprecated. Use gc_min_interval_ms */
2535
2536                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2537                 .procname       = "gc_min_interval",
2538                 .data           = &ip_rt_gc_min_interval,
2539                 .maxlen         = sizeof(int),
2540                 .mode           = 0644,
2541                 .proc_handler   = &proc_dointvec_jiffies,
2542                 .strategy       = &sysctl_jiffies,
2543         },
2544         {
2545                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2546                 .procname       = "gc_min_interval_ms",
2547                 .data           = &ip_rt_gc_min_interval,
2548                 .maxlen         = sizeof(int),
2549                 .mode           = 0644,
2550                 .proc_handler   = &proc_dointvec_ms_jiffies,
2551                 .strategy       = &sysctl_ms_jiffies,
2552         },
2553         {
2554                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2555                 .procname       = "gc_timeout",
2556                 .data           = &ip_rt_gc_timeout,
2557                 .maxlen         = sizeof(int),
2558                 .mode           = 0644,
2559                 .proc_handler   = &proc_dointvec_jiffies,
2560                 .strategy       = &sysctl_jiffies,
2561         },
2562         {
2563                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2564                 .procname       = "gc_interval",
2565                 .data           = &ip_rt_gc_interval,
2566                 .maxlen         = sizeof(int),
2567                 .mode           = 0644,
2568                 .proc_handler   = &proc_dointvec_jiffies,
2569                 .strategy       = &sysctl_jiffies,
2570         },
2571         {
2572                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2573                 .procname       = "redirect_load",
2574                 .data           = &ip_rt_redirect_load,
2575                 .maxlen         = sizeof(int),
2576                 .mode           = 0644,
2577                 .proc_handler   = &proc_dointvec,
2578         },
2579         {
2580                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2581                 .procname       = "redirect_number",
2582                 .data           = &ip_rt_redirect_number,
2583                 .maxlen         = sizeof(int),
2584                 .mode           = 0644,
2585                 .proc_handler   = &proc_dointvec,
2586         },
2587         {
2588                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2589                 .procname       = "redirect_silence",
2590                 .data           = &ip_rt_redirect_silence,
2591                 .maxlen         = sizeof(int),
2592                 .mode           = 0644,
2593                 .proc_handler   = &proc_dointvec,
2594         },
2595         {
2596                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2597                 .procname       = "error_cost",
2598                 .data           = &ip_rt_error_cost,
2599                 .maxlen         = sizeof(int),
2600                 .mode           = 0644,
2601                 .proc_handler   = &proc_dointvec,
2602         },
2603         {
2604                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2605                 .procname       = "error_burst",
2606                 .data           = &ip_rt_error_burst,
2607                 .maxlen         = sizeof(int),
2608                 .mode           = 0644,
2609                 .proc_handler   = &proc_dointvec,
2610         },
2611         {
2612                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2613                 .procname       = "gc_elasticity",
2614                 .data           = &ip_rt_gc_elasticity,
2615                 .maxlen         = sizeof(int),
2616                 .mode           = 0644,
2617                 .proc_handler   = &proc_dointvec,
2618         },
2619         {
2620                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2621                 .procname       = "mtu_expires",
2622                 .data           = &ip_rt_mtu_expires,
2623                 .maxlen         = sizeof(int),
2624                 .mode           = 0644,
2625                 .proc_handler   = &proc_dointvec_jiffies,
2626                 .strategy       = &sysctl_jiffies,
2627         },
2628         {
2629                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2630                 .procname       = "min_pmtu",
2631                 .data           = &ip_rt_min_pmtu,
2632                 .maxlen         = sizeof(int),
2633                 .mode           = 0644,
2634                 .proc_handler   = &proc_dointvec,
2635         },
2636         {
2637                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2638                 .procname       = "min_adv_mss",
2639                 .data           = &ip_rt_min_advmss,
2640                 .maxlen         = sizeof(int),
2641                 .mode           = 0644,
2642                 .proc_handler   = &proc_dointvec,
2643         },
2644         {
2645                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2646                 .procname       = "secret_interval",
2647                 .data           = &ip_rt_secret_interval,
2648                 .maxlen         = sizeof(int),
2649                 .mode           = 0644,
2650                 .proc_handler   = &proc_dointvec_jiffies,
2651                 .strategy       = &sysctl_jiffies,
2652         },
2653         { .ctl_name = 0 }
2654 };
2655 #endif
2656
2657 #ifdef CONFIG_NET_CLS_ROUTE
2658 struct ip_rt_acct *ip_rt_acct;
2659
2660 /* This code sucks.  But you should have seen it before! --RR */
2661
2662 /* IP route accounting ptr for this logical cpu number. */
2663 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
2664
2665 #ifdef CONFIG_PROC_FS
2666 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2667                            int length, int *eof, void *data)
2668 {
2669         unsigned int i;
2670
2671         if ((offset & 3) || (length & 3))
2672                 return -EIO;
2673
2674         if (offset >= sizeof(struct ip_rt_acct) * 256) {
2675                 *eof = 1;
2676                 return 0;
2677         }
2678
2679         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2680                 length = sizeof(struct ip_rt_acct) * 256 - offset;
2681                 *eof = 1;
2682         }
2683
2684         offset /= sizeof(u32);
2685
2686         if (length > 0) {
2687                 u32 *src;
2688                 u32 *dst = (u32 *) buffer;
2689
2690                 /* Zero first: for_each_cpu() below includes cpu 0. */
2691                 *start = buffer;
2692                 memset(dst, 0, length);
2693
2694                 /* Add each cpu in, one int at a time */
2695                 for_each_cpu(i) {
2696                         unsigned int j;
2697
2698                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2699
2700                         for (j = 0; j < length/4; j++)
2701                                 dst[j] += src[j];
2702                 }
2703         }
2704         return length;
2705 }
2706 #endif /* CONFIG_PROC_FS */
2707 #endif /* CONFIG_NET_CLS_ROUTE */
2708
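/*
 * "rhash_entries=N" on the kernel command line overrides the
 * memory-based sizing of the route cache hash table computed in
 * ip_rt_init() below, e.g. booting with rhash_entries=131072.
 */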
2709 static __initdata unsigned long rhash_entries;
2710 static int __init set_rhash_entries(char *str)
2711 {
2712         if (!str)
2713                 return 0;
2714         rhash_entries = simple_strtoul(str, &str, 0);
2715         return 1;
2716 }
2717 __setup("rhash_entries=", set_rhash_entries);
2718
2719 int __init ip_rt_init(void)
2720 {
2721         int i, order, goal, rc = 0;
2722
2723         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2724                              (jiffies ^ (jiffies >> 7)));
2725
2726 #ifdef CONFIG_NET_CLS_ROUTE
2727         for (order = 0;
2728              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2729                 /* NOTHING */;
2730         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2731         if (!ip_rt_acct)
2732                 panic("IP: failed to allocate ip_rt_acct\n");
2733         memset(ip_rt_acct, 0, PAGE_SIZE << order);
2734 #endif
2735
2736         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2737                                                      sizeof(struct rtable),
2738                                                      0, SLAB_HWCACHE_ALIGN,
2739                                                      NULL, NULL);
2740
2741         if (!ipv4_dst_ops.kmem_cachep)
2742                 panic("IP: failed to allocate ip_dst_cache\n");
2743
2744         goal = num_physpages >> (26 - PAGE_SHIFT);
2745         if (rhash_entries)
2746                 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
2747         for (order = 0; (1UL << order) < goal; order++)
2748                 /* NOTHING */;
2749
2750         do {
2751                 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2752                         sizeof(struct rt_hash_bucket);
2753                 while (rt_hash_mask & (rt_hash_mask - 1))
2754                         rt_hash_mask--;
2755                 rt_hash_table = (struct rt_hash_bucket *)
2756                         __get_free_pages(GFP_ATOMIC, order);
2757         } while (rt_hash_table == NULL && --order > 0);
2758
2759         if (!rt_hash_table)
2760                 panic("Failed to allocate IP route cache hash table\n");
2761
2762         printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2763                rt_hash_mask,
2764                (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2765
2766         for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2767                 /* NOTHING */;
2768
2769         rt_hash_mask--;
2770         for (i = 0; i <= rt_hash_mask; i++) {
2771                 spin_lock_init(&rt_hash_table[i].lock);
2772                 rt_hash_table[i].chain = NULL;
2773         }
2774
2775         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2776         ip_rt_max_size = (rt_hash_mask + 1) * 16;
2777
2778         rt_cache_stat = alloc_percpu(struct rt_cache_stat);
2779         if (!rt_cache_stat)
2780                 return -ENOMEM;
2781
2782         devinet_init();
2783         ip_fib_init();
2784
2785         init_timer(&rt_flush_timer);
2786         rt_flush_timer.function = rt_run_flush;
2787         init_timer(&rt_periodic_timer);
2788         rt_periodic_timer.function = rt_check_expire;
2789         init_timer(&rt_secret_timer);
2790         rt_secret_timer.function = rt_secret_rebuild;
2791
2792         /* All the timers started at system startup tend
2793            to synchronize. Perturb them a bit.
2794          */
2795         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2796                                         ip_rt_gc_interval;
2797         add_timer(&rt_periodic_timer);
2798
2799         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2800                 ip_rt_secret_interval;
2801         add_timer(&rt_secret_timer);
2802
2803 #ifdef CONFIG_PROC_FS
2804         {
2805         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
2806         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2807             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
2808                                              proc_net_stat))) {
2809                 free_percpu(rt_cache_stat);
2810                 return -ENOMEM;
2811         }
2812         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
2813         }
2814 #ifdef CONFIG_NET_CLS_ROUTE
2815         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
2816 #endif
2817 #endif
2818 #ifdef CONFIG_XFRM
2819         xfrm_init();
2820         xfrm4_init();
2821 #endif
2822         return rc;
2823 }
2824
2825 EXPORT_SYMBOL(__ip_select_ident);
2826 EXPORT_SYMBOL(ip_route_input);
2827 EXPORT_SYMBOL(ip_route_output_key);