2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD,
37 * though our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after a year-long coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
58 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License
60 * as published by the Free Software Foundation; either version
61 * 2 of the License, or (at your option) any later version.
64 #include <linux/config.h>
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <asm/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/sched.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/rtnetlink.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/protocol.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
98 #include <net/ip_fib.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
104 #include <linux/sysctl.h>
107 #define IP_MAX_MTU 0xFFF0
109 #define RT_GC_TIMEOUT (300*HZ)
111 int ip_rt_min_delay = 2 * HZ;
112 int ip_rt_max_delay = 10 * HZ;
114 int ip_rt_gc_timeout = RT_GC_TIMEOUT;
115 int ip_rt_gc_interval = 60 * HZ;
116 int ip_rt_gc_min_interval = HZ / 2;
117 int ip_rt_redirect_number = 9;
118 int ip_rt_redirect_load = HZ / 50;
119 int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
120 int ip_rt_error_cost = HZ;
121 int ip_rt_error_burst = 5 * HZ;
122 int ip_rt_gc_elasticity = 8;
123 int ip_rt_mtu_expires = 10 * 60 * HZ;
124 int ip_rt_min_pmtu = 512 + 20 + 20;
125 int ip_rt_min_advmss = 256;
126 int ip_rt_secret_interval = 10 * 60 * HZ;
127 static unsigned long rt_deadline;
129 #define RTprint(a...) printk(KERN_DEBUG a)
131 static struct timer_list rt_flush_timer;
132 static struct timer_list rt_periodic_timer;
133 static struct timer_list rt_secret_timer;
136 * Interface to generic destination cache.
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static void ipv4_dst_destroy(struct dst_entry *dst);
141 static void ipv4_dst_ifdown(struct dst_entry *dst, int how);
142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143 static void ipv4_link_failure(struct sk_buff *skb);
144 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
145 static int rt_garbage_collect(void);
148 static struct dst_ops ipv4_dst_ops = {
150 .protocol = __constant_htons(ETH_P_IP),
151 .gc = rt_garbage_collect,
152 .check = ipv4_dst_check,
153 .destroy = ipv4_dst_destroy,
154 .ifdown = ipv4_dst_ifdown,
155 .negative_advice = ipv4_negative_advice,
156 .link_failure = ipv4_link_failure,
157 .update_pmtu = ip_rt_update_pmtu,
158 .entry_size = sizeof(struct rtable),
161 #define ECN_OR_COST(class) TC_PRIO_##class
163 __u8 ip_tos2prio[16] = {
167 ECN_OR_COST(BESTEFFORT),
173 ECN_OR_COST(INTERACTIVE),
175 ECN_OR_COST(INTERACTIVE),
176 TC_PRIO_INTERACTIVE_BULK,
177 ECN_OR_COST(INTERACTIVE_BULK),
178 TC_PRIO_INTERACTIVE_BULK,
179 ECN_OR_COST(INTERACTIVE_BULK)
187 /* The locking scheme is rather straightforward:
189 * 1) Read-Copy Update protects the buckets of the central route hash.
190 * 2) Only writers remove entries, and they hold the lock
191 * as they look at rtable reference counts.
192 * 3) Only readers acquire references to rtable entries,
193 * they do so with atomic increments and with the
197 struct rt_hash_bucket {
198 struct rtable *chain;
200 } __attribute__((__aligned__(8)));
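/*
 * Editor's sketch, not part of the original file: a minimal userspace
 * model of the bucket discipline described above.  A pthread mutex
 * stands in for the per-bucket spinlock and a C11 atomic counter for
 * the rtable refcount; the real read side uses RCU, which this toy
 * model does not reproduce.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct toy_rt {
	struct toy_rt *next;
	atomic_int refcnt;
	int key;
};

struct toy_rt_bucket {
	struct toy_rt *chain;
	pthread_mutex_t lock;
};

/* Reader: walks the chain without the lock, takes only a refcount. */
static struct toy_rt *toy_lookup(struct toy_rt_bucket *b, int key)
{
	struct toy_rt *r;

	for (r = b->chain; r; r = r->next)
		if (r->key == key) {
			atomic_fetch_add(&r->refcnt, 1);
			return r;
		}
	return NULL;
}

/* Writer: unlinks under the bucket lock, checking the refcount first. */
static struct toy_rt *toy_unlink(struct toy_rt_bucket *b, int key)
{
	struct toy_rt **rp, *r;

	pthread_mutex_lock(&b->lock);
	for (rp = &b->chain; (r = *rp) != NULL; rp = &r->next)
		if (r->key == key && atomic_load(&r->refcnt) == 0) {
			*rp = r->next;
			break;
		}
	pthread_mutex_unlock(&b->lock);
	return r;
}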
202 static struct rt_hash_bucket *rt_hash_table;
203 static unsigned rt_hash_mask;
204 static int rt_hash_log;
205 static unsigned int rt_hash_rnd;
207 struct rt_cache_stat *rt_cache_stat;
209 static int rt_intern_hash(unsigned hash, struct rtable *rth,
210 struct rtable **res);
212 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
214 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
215 & rt_hash_mask);
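/*
 * Editor's sketch, not part of the original file: how the bucket index
 * is derived.  toy_mix3() is only a stand-in for jhash_3words(); the
 * point is that a secret seed (rt_hash_rnd) keys the hash and a
 * power-of-two mask (rt_hash_mask) folds the result onto the table.
 */
#include <stdint.h>

static uint32_t toy_mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
	uint32_t h = seed ^ a;

	h = (h ^ (h >> 16)) * 0x85ebca6bu ^ b;
	h = (h ^ (h >> 13)) * 0xc2b2ae35u ^ c;
	return h ^ (h >> 16);
}

static unsigned int toy_hash_code(uint32_t daddr, uint32_t saddr,
				  uint8_t tos, uint32_t seed, uint32_t mask)
{
	return toy_mix3(daddr, saddr, tos, seed) & mask;
}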
218 #ifdef CONFIG_PROC_FS
219 struct rt_cache_iter_state {
223 static struct rtable *rt_cache_get_first(struct seq_file *seq)
225 struct rtable *r = NULL;
226 struct rt_cache_iter_state *st = seq->private;
228 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
230 r = rt_hash_table[st->bucket].chain;
233 rcu_read_unlock_bh();
238 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
240 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
244 rcu_read_unlock_bh();
245 if (--st->bucket < 0)
248 r = rt_hash_table[st->bucket].chain;
253 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
255 struct rtable *r = rt_cache_get_first(seq);
258 while (pos && (r = rt_cache_get_next(seq, r)))
260 return pos ? NULL : r;
263 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
265 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
268 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
270 struct rtable *r = NULL;
272 if (v == SEQ_START_TOKEN)
273 r = rt_cache_get_first(seq);
275 r = rt_cache_get_next(seq, v);
280 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
282 if (v && v != SEQ_START_TOKEN)
283 rcu_read_unlock_bh();
286 static int rt_cache_seq_show(struct seq_file *seq, void *v)
288 if (v == SEQ_START_TOKEN)
289 seq_printf(seq, "%-127s\n",
290 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
291 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
294 struct rtable *r = v;
297 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
298 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
299 r->u.dst.dev ? r->u.dst.dev->name : "*",
300 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
301 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
302 r->u.dst.__use, 0, (unsigned long)r->rt_src,
303 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
304 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
305 dst_metric(&r->u.dst, RTAX_WINDOW),
306 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
307 dst_metric(&r->u.dst, RTAX_RTTVAR)),
309 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
310 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
313 seq_printf(seq, "%-127s\n", temp);
318 static struct seq_operations rt_cache_seq_ops = {
319 .start = rt_cache_seq_start,
320 .next = rt_cache_seq_next,
321 .stop = rt_cache_seq_stop,
322 .show = rt_cache_seq_show,
325 static int rt_cache_seq_open(struct inode *inode, struct file *file)
327 struct seq_file *seq;
329 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
333 rc = seq_open(file, &rt_cache_seq_ops);
336 seq = file->private_data;
338 memset(s, 0, sizeof(*s));
346 static struct file_operations rt_cache_seq_fops = {
347 .owner = THIS_MODULE,
348 .open = rt_cache_seq_open,
351 .release = seq_release_private,
355 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
360 return SEQ_START_TOKEN;
362 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
363 if (!cpu_possible(cpu))
366 return per_cpu_ptr(rt_cache_stat, cpu);
371 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
375 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
376 if (!cpu_possible(cpu))
379 return per_cpu_ptr(rt_cache_stat, cpu);
385 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
390 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
392 struct rt_cache_stat *st = v;
394 if (v == SEQ_START_TOKEN) {
395 seq_printf(seq, "entries in_hit in_slow_tot in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
399 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
400 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
401 atomic_read(&ipv4_dst_ops.entries),
424 static struct seq_operations rt_cpu_seq_ops = {
425 .start = rt_cpu_seq_start,
426 .next = rt_cpu_seq_next,
427 .stop = rt_cpu_seq_stop,
428 .show = rt_cpu_seq_show,
432 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
434 return seq_open(file, &rt_cpu_seq_ops);
437 static struct file_operations rt_cpu_seq_fops = {
438 .owner = THIS_MODULE,
439 .open = rt_cpu_seq_open,
442 .release = seq_release,
445 #endif /* CONFIG_PROC_FS */
447 static __inline__ void rt_free(struct rtable *rt)
449 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
452 static __inline__ void rt_drop(struct rtable *rt)
455 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
458 static __inline__ int rt_fast_clean(struct rtable *rth)
460 /* Kill broadcast/multicast entries very aggressively, if they
461 collide in the hash table with more useful entries */
462 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
463 rth->fl.iif && rth->u.rt_next;
466 static __inline__ int rt_valuable(struct rtable *rth)
468 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
472 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
477 if (atomic_read(&rth->u.dst.__refcnt))
481 if (rth->u.dst.expires &&
482 time_after_eq(jiffies, rth->u.dst.expires))
485 age = jiffies - rth->u.dst.lastuse;
487 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
488 (age <= tmo2 && rt_valuable(rth)))
494 /* Bits of score are:
496 * 30: not quite useless
497 * 29..0: usage counter
499 static inline u32 rt_score(struct rtable *rt)
501 u32 score = jiffies - rt->u.dst.lastuse;
503 score = ~score & ~(3<<30);
509 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
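/*
 * Editor's sketch, not part of the original file: the score packing
 * used above, standalone.  Bits 31 and 30 are flag bits (set by the
 * elided branches for "valuable" entries and for output/unicast
 * routes) that dominate the comparison, and the low 30 bits hold the
 * bitwise-inverted age, so among equally flagged entries the fresher
 * one wins.
 */
#include <stdint.h>

static uint32_t toy_rt_score(uint32_t age, int valuable, int output_or_unicast)
{
	uint32_t score = ~age & ~(3u << 30);	/* invert age, clear flag bits */

	if (valuable)
		score |= 1u << 31;
	if (output_or_unicast)
		score |= 1u << 30;
	return score;	/* e.g. toy_rt_score(5000,1,0) > toy_rt_score(10,0,0) */
}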
515 /* This runs via a timer and thus is always in BH context. */
516 static void rt_check_expire(unsigned long dummy)
520 struct rtable *rth, **rthp;
521 unsigned long now = jiffies;
523 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
524 t -= ip_rt_gc_timeout) {
525 unsigned long tmo = ip_rt_gc_timeout;
527 i = (i + 1) & rt_hash_mask;
528 rthp = &rt_hash_table[i].chain;
530 spin_lock(&rt_hash_table[i].lock);
531 while ((rth = *rthp) != NULL) {
532 if (rth->u.dst.expires) {
533 /* Entry is expired even if it is in use */
534 if (time_before_eq(now, rth->u.dst.expires)) {
536 rthp = &rth->u.rt_next;
539 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
541 rthp = &rth->u.rt_next;
545 /* Clean up aged-off entries. */
546 *rthp = rth->u.rt_next;
549 spin_unlock(&rt_hash_table[i].lock);
551 /* Fallback loop breaker. */
552 if (time_after(jiffies, now))
556 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
559 /* This can run from both BH and non-BH contexts; the latter
560 * in the case of a forced flush event.
562 static void rt_run_flush(unsigned long dummy)
565 struct rtable *rth, *next;
569 get_random_bytes(&rt_hash_rnd, 4);
571 for (i = rt_hash_mask; i >= 0; i--) {
572 spin_lock_bh(&rt_hash_table[i].lock);
573 rth = rt_hash_table[i].chain;
575 rt_hash_table[i].chain = NULL;
576 spin_unlock_bh(&rt_hash_table[i].lock);
578 for (; rth; rth = next) {
579 next = rth->u.rt_next;
585 static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
587 void rt_cache_flush(int delay)
589 unsigned long now = jiffies;
590 int user_mode = !in_softirq();
593 delay = ip_rt_min_delay;
595 spin_lock_bh(&rt_flush_lock);
597 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
598 long tmo = (long)(rt_deadline - now);
600 /* If the flush timer is already running
601 and the flush request is not immediate (delay > 0):
603 if the deadline has not been reached, prolong the timer to "delay",
604 otherwise fire it at the deadline.
607 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
615 spin_unlock_bh(&rt_flush_lock);
620 if (rt_deadline == 0)
621 rt_deadline = now + ip_rt_max_delay;
623 mod_timer(&rt_flush_timer, now+delay);
624 spin_unlock_bh(&rt_flush_lock);
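/*
 * Editor's sketch, not part of the original file: the delay clamping
 * above as a pure function.  A pending flush may be pulled forward but
 * is never pushed past the deadline; values are jiffies, and
 * min_delay/max_delay mirror the ip_rt_min_delay/ip_rt_max_delay
 * tunables.
 */
static long toy_flush_delay(long now, long deadline, long delay,
			    long min_delay, long max_delay, int user_mode)
{
	if (delay < 0)				/* "flush soon" request */
		delay = min_delay;

	if (delay > 0 && deadline) {		/* a flush is already queued */
		long tmo = deadline - now;

		if (user_mode && tmo < max_delay - min_delay)
			tmo = 0;		/* deadline near: flush now */
		if (delay > tmo)
			delay = tmo;		/* never overshoot deadline */
	}
	return delay;				/* <= 0 means flush at once */
}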
627 static void rt_secret_rebuild(unsigned long dummy)
629 unsigned long now = jiffies;
632 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
636 Short description of GC goals.
638 We want to build an algorithm which keeps the routing cache
639 at an equilibrium point, where the number of aged-off entries
640 is kept approximately equal to the number of newly generated ones.
642 The current expiration strength is the variable "expire".
643 We try to adjust it dynamically, so that when the network
644 is idle "expire" is large enough to keep enough warm entries,
645 and when load increases it shrinks to limit the cache size.
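/*
 * Editor's sketch, not part of the original file: the goal/equilibrium
 * arithmetic rt_garbage_collect() performs below, as a standalone
 * calculation with sample numbers (4096 buckets, elasticity 8,
 * 40000 cached entries).
 */
#include <stdio.h>

static void toy_gc_goal_demo(void)
{
	int entries = 40000, hash_log = 12, mask = (1 << 12) - 1;
	int elasticity = 8, gc_thresh = mask + 1, equilibrium = 0;
	int goal = entries - (elasticity << hash_log);	/* over the soft cap? */

	if (goal <= 0) {
		if (equilibrium < gc_thresh)
			equilibrium = gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += goal / 2 < mask + 1 ? goal / 2 : mask + 1;
			goal = entries - equilibrium;
		}
	} else {
		/* Dangerous area: shrink the cache hard. */
		goal = goal / 2 > mask + 1 ? goal / 2 : mask + 1;
		equilibrium = entries - goal;
	}
	printf("expire %d entries, keep ~%d\n", goal, equilibrium);
	/* prints: expire 4096 entries, keep ~35904 */
}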
648 static int rt_garbage_collect(void)
650 static unsigned long expire = RT_GC_TIMEOUT;
651 static unsigned long last_gc;
653 static int equilibrium;
654 struct rtable *rth, **rthp;
655 unsigned long now = jiffies;
659 * Garbage collection is pretty expensive,
660 * so do not run it too frequently.
663 RT_CACHE_STAT_INC(gc_total);
665 if (now - last_gc < ip_rt_gc_min_interval &&
666 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
667 RT_CACHE_STAT_INC(gc_ignored);
671 /* Calculate the number of entries we want to expire now. */
672 goal = atomic_read(&ipv4_dst_ops.entries) -
673 (ip_rt_gc_elasticity << rt_hash_log);
675 if (equilibrium < ipv4_dst_ops.gc_thresh)
676 equilibrium = ipv4_dst_ops.gc_thresh;
677 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
679 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
680 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
683 /* We are in a dangerous area. Try to reduce the cache really aggressively. */
686 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
687 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
690 if (now - last_gc >= ip_rt_gc_min_interval)
701 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
702 unsigned long tmo = expire;
704 k = (k + 1) & rt_hash_mask;
705 rthp = &rt_hash_table[k].chain;
706 spin_lock_bh(&rt_hash_table[k].lock);
707 while ((rth = *rthp) != NULL) {
708 if (!rt_may_expire(rth, tmo, expire)) {
710 rthp = &rth->u.rt_next;
713 *rthp = rth->u.rt_next;
717 spin_unlock_bh(&rt_hash_table[k].lock);
726 /* The goal was not achieved. We stop the process if:
728 - expire has been reduced to zero; otherwise, expire is halved.
729 - the table is not full.
730 - we are called from interrupt context.
731 - the jiffies check is just a fallback/debug loop breaker;
732 we will not spin here for a long time in any case.
735 RT_CACHE_STAT_INC(gc_goal_miss);
741 #if RT_CACHE_DEBUG >= 2
742 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
743 atomic_read(&ipv4_dst_ops.entries), goal, i);
746 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
748 } while (!in_softirq() && time_before_eq(jiffies, now));
750 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
753 printk(KERN_WARNING "dst cache overflow\n");
754 RT_CACHE_STAT_INC(gc_dst_overflow);
758 expire += ip_rt_gc_min_interval;
759 if (expire > ip_rt_gc_timeout ||
760 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
761 expire = ip_rt_gc_timeout;
762 #if RT_CACHE_DEBUG >= 2
763 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
764 atomic_read(&ipv4_dst_ops.entries), goal, rover);
769 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
771 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
772 fl1->oif == fl2->oif &&
773 fl1->iif == fl2->iif;
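/*
 * Editor's sketch, not part of the original file: why one memcmp()
 * suffices in compare_keys().  All IPv4 flow selectors sit contiguously
 * in a single struct, so comparing the raw bytes compares every field
 * at once -- valid only as long as any padding is kept zeroed, as the
 * flowi initializers in this file do in practice.
 */
#include <string.h>

struct toy_ip4_key {
	unsigned int daddr, saddr;
	unsigned char tos, scope;
	unsigned char pad[2];		/* must stay zeroed for memcmp() */
};

static int toy_same_flow(const struct toy_ip4_key *a,
			 const struct toy_ip4_key *b)
{
	return memcmp(a, b, sizeof(*a)) == 0;
}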
776 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
778 struct rtable *rth, **rthp;
780 struct rtable *cand, **candp;
783 int attempts = !in_softirq();
792 rthp = &rt_hash_table[hash].chain;
794 spin_lock_bh(&rt_hash_table[hash].lock);
795 while ((rth = *rthp) != NULL) {
796 if (compare_keys(&rth->fl, &rt->fl)) {
798 *rthp = rth->u.rt_next;
800 * Since lookup is lockfree, the deletion
801 * must be visible to another weakly ordered CPU before
802 * the insertion at the start of the hash chain.
805 rth->u.rt_next = rt_hash_table[hash].chain;
807 * Since lookup is lockfree, the update writes
808 * must be ordered for consistency on SMP.
811 rt_hash_table[hash].chain = rth;
814 dst_hold(&rth->u.dst);
815 rth->u.dst.lastuse = now;
816 spin_unlock_bh(&rt_hash_table[hash].lock);
823 if (!atomic_read(&rth->u.dst.__refcnt)) {
824 u32 score = rt_score(rth);
826 if (score <= min_score) {
835 rthp = &rth->u.rt_next;
839 /* ip_rt_gc_elasticity used to be the average chain length;
840 * when exceeded, gc becomes really aggressive.
842 * The second limit is less certain. At the moment it allows
843 * only 2 entries per bucket. We will see.
845 if (chain_length > ip_rt_gc_elasticity) {
846 *candp = cand->u.rt_next;
851 /* Try to bind the route to an ARP neighbour only if it is an output
852 route or on the unicast forwarding path.
854 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
855 int err = arp_bind_neighbour(&rt->u.dst);
857 spin_unlock_bh(&rt_hash_table[hash].lock);
859 if (err != -ENOBUFS) {
864 /* Neighbour tables are full and nothing
865 can be released. Try to shrink the route cache;
866 most likely it holds some neighbour records.
868 if (attempts-- > 0) {
869 int saved_elasticity = ip_rt_gc_elasticity;
870 int saved_int = ip_rt_gc_min_interval;
871 ip_rt_gc_elasticity = 1;
872 ip_rt_gc_min_interval = 0;
873 rt_garbage_collect();
874 ip_rt_gc_min_interval = saved_int;
875 ip_rt_gc_elasticity = saved_elasticity;
880 printk(KERN_WARNING "Neighbour table overflow.\n");
886 rt->u.rt_next = rt_hash_table[hash].chain;
887 #if RT_CACHE_DEBUG >= 2
890 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
891 NIPQUAD(rt->rt_dst));
892 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
893 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
897 rt_hash_table[hash].chain = rt;
898 spin_unlock_bh(&rt_hash_table[hash].lock);
903 void rt_bind_peer(struct rtable *rt, int create)
905 static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
906 struct inet_peer *peer;
908 peer = inet_getpeer(rt->rt_dst, create);
910 spin_lock_bh(&rt_peer_lock);
911 if (rt->peer == NULL) {
915 spin_unlock_bh(&rt_peer_lock);
921 * Peer allocation may fail only in serious out-of-memory conditions. However,
922 * we can still generate some output.
923 * Random ID selection looks a bit dangerous because we have no chance of
924 * selecting an ID that is unique within a reasonable period of time.
925 * But a broken packet identifier may be better than no packet at all.
927 static void ip_select_fb_ident(struct iphdr *iph)
929 static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
930 static u32 ip_fallback_id;
933 spin_lock_bh(&ip_fb_id_lock);
934 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
935 iph->id = htons(salt & 0xFFFF);
936 ip_fallback_id = salt;
937 spin_unlock_bh(&ip_fb_id_lock);
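/*
 * Editor's sketch, not part of the original file: the fallback ID
 * scheme above in miniature.  toy_secure_id() is only a stand-in for
 * secure_ip_id(); the previous salt is folded back in, so consecutive
 * packets to the same destination still get differing IDs.
 */
#include <stdint.h>

static uint32_t toy_fallback_id;

static uint32_t toy_secure_id(uint32_t x)	/* stand-in mixer */
{
	x ^= x >> 16;
	x *= 0x45d9f3bu;
	x ^= x >> 16;
	return x;
}

static uint16_t toy_select_fb_ident(uint32_t daddr)
{
	uint32_t salt = toy_secure_id(toy_fallback_id ^ daddr);

	toy_fallback_id = salt;
	return (uint16_t)(salt & 0xFFFF);	/* byte-order step elided */
}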
940 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
942 struct rtable *rt = (struct rtable *) dst;
945 if (rt->peer == NULL)
948 /* If a peer is attached to the destination, it is never detached,
949 so we need not grab a lock to dereference it.
952 iph->id = htons(inet_getid(rt->peer, more));
956 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
958 ip_select_fb_ident(iph);
961 static void rt_del(unsigned hash, struct rtable *rt)
963 struct rtable **rthp;
965 spin_lock_bh(&rt_hash_table[hash].lock);
967 for (rthp = &rt_hash_table[hash].chain; *rthp;
968 rthp = &(*rthp)->u.rt_next)
970 *rthp = rt->u.rt_next;
974 spin_unlock_bh(&rt_hash_table[hash].lock);
977 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
978 u32 saddr, u8 tos, struct net_device *dev)
981 struct in_device *in_dev = in_dev_get(dev);
982 struct rtable *rth, **rthp;
983 u32 skeys[2] = { saddr, 0 };
984 int ikeys[2] = { dev->ifindex, 0 };
986 tos &= IPTOS_RT_MASK;
991 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
992 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
993 goto reject_redirect;
995 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
996 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
997 goto reject_redirect;
998 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
999 goto reject_redirect;
1001 if (inet_addr_type(new_gw) != RTN_UNICAST)
1002 goto reject_redirect;
1005 for (i = 0; i < 2; i++) {
1006 for (k = 0; k < 2; k++) {
1007 unsigned hash = rt_hash_code(daddr,
1008 skeys[i] ^ (ikeys[k] << 5),
1011 rthp = &rt_hash_table[hash].chain;
1014 while ((rth = rcu_dereference(*rthp)) != NULL) {
1017 if (rth->fl.fl4_dst != daddr ||
1018 rth->fl.fl4_src != skeys[i] ||
1019 rth->fl.fl4_tos != tos ||
1020 rth->fl.oif != ikeys[k] ||
1022 rthp = &rth->u.rt_next;
1026 if (rth->rt_dst != daddr ||
1027 rth->rt_src != saddr ||
1029 rth->rt_gateway != old_gw ||
1030 rth->u.dst.dev != dev)
1033 dst_hold(&rth->u.dst);
1036 rt = dst_alloc(&ipv4_dst_ops);
1043 /* Copy all the information. */
1045 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1046 rt->u.dst.__use = 1;
1047 atomic_set(&rt->u.dst.__refcnt, 1);
1048 rt->u.dst.child = NULL;
1050 dev_hold(rt->u.dst.dev);
1052 in_dev_hold(rt->idev);
1053 rt->u.dst.obsolete = 0;
1054 rt->u.dst.lastuse = jiffies;
1055 rt->u.dst.path = &rt->u.dst;
1056 rt->u.dst.neighbour = NULL;
1057 rt->u.dst.hh = NULL;
1058 rt->u.dst.xfrm = NULL;
1060 rt->rt_flags |= RTCF_REDIRECTED;
1062 /* Gateway is different ... */
1063 rt->rt_gateway = new_gw;
1065 /* Redirect received -> path was valid */
1066 dst_confirm(&rth->u.dst);
1069 atomic_inc(&rt->peer->refcnt);
1071 if (arp_bind_neighbour(&rt->u.dst) ||
1072 !(rt->u.dst.neighbour->nud_state &
1074 if (rt->u.dst.neighbour)
1075 neigh_event_send(rt->u.dst.neighbour, NULL);
1082 if (!rt_intern_hash(hash, rt, &rt))
1095 #ifdef CONFIG_IP_ROUTE_VERBOSE
1096 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1097 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1098 "%u.%u.%u.%u ignored.\n"
1099 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1101 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1102 NIPQUAD(saddr), NIPQUAD(daddr), tos);
1107 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1109 struct rtable *rt = (struct rtable*)dst;
1110 struct dst_entry *ret = dst;
1113 if (dst->obsolete) {
1116 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1117 rt->u.dst.expires) {
1118 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1122 #if RT_CACHE_DEBUG >= 1
1123 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1124 "%u.%u.%u.%u/%02x dropped\n",
1125 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1136 * 1. The first ip_rt_redirect_number redirects are sent
1137 * with exponential backoff, then we stop sending them at all,
1138 * assuming that the host ignores our redirects.
1139 * 2. If we did not see packets requiring redirects
1140 * during ip_rt_redirect_silence, we assume that the host
1141 * has forgotten the redirected route, and we start sending redirects again.
1143 * This algorithm is much cheaper and more intelligent than dumb load limiting
1146 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1147 * and "frag. need" (breaks PMTU discovery) in icmp.c.
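/*
 * Editor's sketch, not part of the original file: the backoff schedule
 * this comment describes, with the default tunables (HZ = 1000, so
 * load = HZ/50 = 20 and number = 9).  Note that the default silence
 * window equals load << (number + 1), matching its initializer above.
 */
#include <stdio.h>

static void toy_redirect_schedule(void)
{
	int hz = 1000, load = hz / 50, number = 9;
	int tokens;

	for (tokens = 0; tokens < number; tokens++)
		printf("redirect %d: earliest %d jiffies after the last\n",
		       tokens + 1, load << tokens);
	printf("silence window: %d jiffies\n", load << (number + 1));
}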
1150 void ip_rt_send_redirect(struct sk_buff *skb)
1152 struct rtable *rt = (struct rtable*)skb->dst;
1153 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1158 if (!IN_DEV_TX_REDIRECTS(in_dev))
1161 /* No redirected packets during ip_rt_redirect_silence;
1162 * reset the algorithm.
1164 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1165 rt->u.dst.rate_tokens = 0;
1167 /* Too many ignored redirects; do not send anything.
1168 * Set u.dst.rate_last to the last seen redirected packet.
1170 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1171 rt->u.dst.rate_last = jiffies;
1175 /* Check for load limit; set rate_last to the latest sent redirect.
1178 if (time_after(jiffies,
1179 (rt->u.dst.rate_last +
1180 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1181 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1182 rt->u.dst.rate_last = jiffies;
1183 ++rt->u.dst.rate_tokens;
1184 #ifdef CONFIG_IP_ROUTE_VERBOSE
1185 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1186 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1188 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1189 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1190 NIPQUAD(rt->rt_src), rt->rt_iif,
1191 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1198 static int ip_error(struct sk_buff *skb)
1200 struct rtable *rt = (struct rtable*)skb->dst;
1204 switch (rt->u.dst.error) {
1209 code = ICMP_HOST_UNREACH;
1212 code = ICMP_NET_UNREACH;
1215 code = ICMP_PKT_FILTERED;
1220 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1221 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1222 rt->u.dst.rate_tokens = ip_rt_error_burst;
1223 rt->u.dst.rate_last = now;
1224 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1225 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1226 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1229 out: kfree_skb(skb);
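/*
 * Editor's sketch, not part of the original file: the token bucket that
 * paces ICMP errors above.  One token accrues per jiffy up to
 * ip_rt_error_burst, and each ICMP error spends ip_rt_error_cost.
 */
struct toy_err_bucket {
	unsigned long tokens;
	unsigned long last;		/* jiffies of last update */
};

static int toy_may_send_error(struct toy_err_bucket *s, unsigned long now,
			      unsigned long cost, unsigned long burst)
{
	s->tokens += now - s->last;
	if (s->tokens > burst)
		s->tokens = burst;
	s->last = now;
	if (s->tokens >= cost) {
		s->tokens -= cost;
		return 1;		/* ok to icmp_send() */
	}
	return 0;			/* rate limited */
}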
1234 * The last two values are not from the RFC but
1235 * are needed for AMPRnet AX.25 paths.
1238 static unsigned short mtu_plateau[] =
1239 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1241 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1245 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1246 if (old_mtu > mtu_plateau[i])
1247 return mtu_plateau[i];
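/*
 * Editor's sketch, not part of the original file: the plateau search
 * above, runnable standalone.  Given an old MTU of 1500 it guesses the
 * next lower plateau, 1492; the 68-byte floor is the IPv4 minimum MTU.
 */
static const unsigned short toy_plateau[] =
	{ 32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static unsigned short toy_guess_mtu(unsigned short old_mtu)
{
	unsigned int i;

	for (i = 0; i < sizeof(toy_plateau) / sizeof(toy_plateau[0]); i++)
		if (old_mtu > toy_plateau[i])
			return toy_plateau[i];
	return 68;	/* e.g. toy_guess_mtu(1500) == 1492 */
}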
1251 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1254 unsigned short old_mtu = ntohs(iph->tot_len);
1256 u32 skeys[2] = { iph->saddr, 0, };
1257 u32 daddr = iph->daddr;
1258 u8 tos = iph->tos & IPTOS_RT_MASK;
1259 unsigned short est_mtu = 0;
1261 if (ipv4_config.no_pmtu_disc)
1264 for (i = 0; i < 2; i++) {
1265 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1268 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1269 rth = rcu_dereference(rth->u.rt_next)) {
1270 if (rth->fl.fl4_dst == daddr &&
1271 rth->fl.fl4_src == skeys[i] &&
1272 rth->rt_dst == daddr &&
1273 rth->rt_src == iph->saddr &&
1274 rth->fl.fl4_tos == tos &&
1276 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1277 unsigned short mtu = new_mtu;
1279 if (new_mtu < 68 || new_mtu >= old_mtu) {
1281 /* BSD 4.2 compatibility hack :-( */
1283 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1284 old_mtu >= 68 + (iph->ihl << 2))
1285 old_mtu -= iph->ihl << 2;
1287 mtu = guess_mtu(old_mtu);
1289 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1290 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1291 dst_confirm(&rth->u.dst);
1292 if (mtu < ip_rt_min_pmtu) {
1293 mtu = ip_rt_min_pmtu;
1294 rth->u.dst.metrics[RTAX_LOCK-1] |=
1297 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1298 dst_set_expires(&rth->u.dst,
1307 return est_mtu ? : new_mtu;
1310 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1312 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1313 !(dst_metric_locked(dst, RTAX_MTU))) {
1314 if (mtu < ip_rt_min_pmtu) {
1315 mtu = ip_rt_min_pmtu;
1316 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1318 dst->metrics[RTAX_MTU-1] = mtu;
1319 dst_set_expires(dst, ip_rt_mtu_expires);
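/*
 * Editor's sketch, not part of the original file: the clamp-and-lock
 * step above.  An MTU below the floor is raised to ip_rt_min_pmtu and
 * the RTAX_MTU lock bit is set so PMTU discovery cannot lower it
 * again; the bit index here is a stand-in mirroring RTAX_MTU.
 */
#define TOY_RTAX_MTU	2

static unsigned int toy_clamp_pmtu(unsigned int mtu, unsigned int min_pmtu,
				   unsigned int *lock_bits)
{
	if (mtu < min_pmtu) {
		mtu = min_pmtu;
		*lock_bits |= 1u << TOY_RTAX_MTU;	/* pin the metric */
	}
	return mtu;
}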
1323 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1329 static void ipv4_dst_destroy(struct dst_entry *dst)
1331 struct rtable *rt = (struct rtable *) dst;
1332 struct inet_peer *peer = rt->peer;
1333 struct in_device *idev = rt->idev;
1346 static void ipv4_dst_ifdown(struct dst_entry *dst, int how)
1348 struct rtable *rt = (struct rtable *) dst;
1349 struct in_device *idev = rt->idev;
1350 if (idev && idev->dev != &loopback_dev) {
1351 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1352 if (loopback_idev) {
1353 rt->idev = loopback_idev;
1359 static void ipv4_link_failure(struct sk_buff *skb)
1363 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1365 rt = (struct rtable *) skb->dst;
1367 dst_set_expires(&rt->u.dst, 0);
1370 static int ip_rt_bug(struct sk_buff **pskb)
1372 struct sk_buff *skb = *pskb;
1374 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1375 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1376 skb->dev ? skb->dev->name : "?");
1382 We do not cache the source address of the outgoing interface,
1383 because it is used only by IP RR, TS and SRR options,
1384 so it is out of the fast path.
1386 BTW remember: "addr" is allowed to be unaligned
1390 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1393 struct fib_result res;
1395 if (rt->fl.iif == 0)
1397 else if (fib_lookup(&rt->fl, &res) == 0) {
1398 src = FIB_RES_PREFSRC(res);
1401 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1403 memcpy(addr, &src, 4);
1406 #ifdef CONFIG_NET_CLS_ROUTE
1407 static void set_class_tag(struct rtable *rt, u32 tag)
1409 if (!(rt->u.dst.tclassid & 0xFFFF))
1410 rt->u.dst.tclassid |= tag & 0xFFFF;
1411 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1412 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1416 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1418 struct fib_info *fi = res->fi;
1421 if (FIB_RES_GW(*res) &&
1422 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1423 rt->rt_gateway = FIB_RES_GW(*res);
1424 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1425 sizeof(rt->u.dst.metrics));
1426 if (fi->fib_mtu == 0) {
1427 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1428 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1429 rt->rt_gateway != rt->rt_dst &&
1430 rt->u.dst.dev->mtu > 576)
1431 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1433 #ifdef CONFIG_NET_CLS_ROUTE
1434 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1437 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1439 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1440 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1441 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1442 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1443 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1444 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1446 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1447 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1449 #ifdef CONFIG_NET_CLS_ROUTE
1450 #ifdef CONFIG_IP_MULTIPLE_TABLES
1451 set_class_tag(rt, fib_rules_tclass(res));
1453 set_class_tag(rt, itag);
1455 rt->rt_type = res->type;
1458 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1459 u8 tos, struct net_device *dev, int our)
1464 struct in_device *in_dev = in_dev_get(dev);
1467 /* Primary sanity checks. */
1472 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1473 skb->protocol != htons(ETH_P_IP))
1476 if (ZERONET(saddr)) {
1477 if (!LOCAL_MCAST(daddr))
1479 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1480 } else if (fib_validate_source(saddr, 0, tos, 0,
1481 dev, &spec_dst, &itag) < 0)
1484 rth = dst_alloc(&ipv4_dst_ops);
1488 rth->u.dst.output= ip_rt_bug;
1490 atomic_set(&rth->u.dst.__refcnt, 1);
1491 rth->u.dst.flags= DST_HOST;
1492 if (in_dev->cnf.no_policy)
1493 rth->u.dst.flags |= DST_NOPOLICY;
1494 rth->fl.fl4_dst = daddr;
1495 rth->rt_dst = daddr;
1496 rth->fl.fl4_tos = tos;
1497 #ifdef CONFIG_IP_ROUTE_FWMARK
1498 rth->fl.fl4_fwmark= skb->nfmark;
1500 rth->fl.fl4_src = saddr;
1501 rth->rt_src = saddr;
1502 #ifdef CONFIG_NET_CLS_ROUTE
1503 rth->u.dst.tclassid = itag;
1506 rth->fl.iif = dev->ifindex;
1507 rth->u.dst.dev = &loopback_dev;
1508 dev_hold(rth->u.dst.dev);
1509 rth->idev = in_dev_get(rth->u.dst.dev);
1511 rth->rt_gateway = daddr;
1512 rth->rt_spec_dst= spec_dst;
1513 rth->rt_type = RTN_MULTICAST;
1514 rth->rt_flags = RTCF_MULTICAST;
1516 rth->u.dst.input= ip_local_deliver;
1517 rth->rt_flags |= RTCF_LOCAL;
1520 #ifdef CONFIG_IP_MROUTE
1521 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1522 rth->u.dst.input = ip_mr_input;
1524 RT_CACHE_STAT_INC(in_slow_mc);
1527 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1528 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1540 * NOTE. We drop all packets that have a local source
1541 * address, because every properly looped-back packet
1542 * must already have the correct destination attached by the output routine.
1544 * This approach solves two big problems:
1545 * 1. Non-simplex devices are handled properly.
1546 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1549 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1550 u8 tos, struct net_device *dev)
1552 struct fib_result res;
1553 struct in_device *in_dev = in_dev_get(dev);
1554 struct in_device *out_dev = NULL;
1555 struct flowi fl = { .nl_u = { .ip4_u =
1559 .scope = RT_SCOPE_UNIVERSE,
1560 #ifdef CONFIG_IP_ROUTE_FWMARK
1561 .fwmark = skb->nfmark
1564 .iif = dev->ifindex };
1567 struct rtable * rth;
1573 /* IP on this device is disabled. */
1578 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
1580 /* Check for the most weird martians, which cannot be detected
1584 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1585 goto martian_source;
1587 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1590 /* Accept zero addresses only to limited broadcast;
1591 * I do not even know whether to fix this or not. Waiting for complaints :-)
1594 goto martian_source;
1596 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1597 goto martian_destination;
1600 * Now we are ready to route packet.
1602 if ((err = fib_lookup(&fl, &res)) != 0) {
1603 if (!IN_DEV_FORWARD(in_dev))
1609 RT_CACHE_STAT_INC(in_slow_tot);
1611 if (res.type == RTN_BROADCAST)
1614 if (res.type == RTN_LOCAL) {
1616 result = fib_validate_source(saddr, daddr, tos,
1617 loopback_dev.ifindex,
1618 dev, &spec_dst, &itag);
1620 goto martian_source;
1622 flags |= RTCF_DIRECTSRC;
1627 if (!IN_DEV_FORWARD(in_dev))
1629 if (res.type != RTN_UNICAST)
1630 goto martian_destination;
1632 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1633 if (res.fi->fib_nhs > 1 && fl.oif == 0)
1634 fib_select_multipath(&fl, &res);
1636 out_dev = in_dev_get(FIB_RES_DEV(res));
1637 if (out_dev == NULL) {
1638 if (net_ratelimit())
1639 printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1640 "Please, report\n");
1644 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1647 goto martian_source;
1650 flags |= RTCF_DIRECTSRC;
1652 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1653 (IN_DEV_SHARED_MEDIA(out_dev) ||
1654 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1655 flags |= RTCF_DOREDIRECT;
1657 if (skb->protocol != htons(ETH_P_IP)) {
1658 /* Not IP (i.e. ARP). Do not create a route if it is
1659 * invalid for proxy ARP. DNAT routes are always valid.
1661 if (out_dev == in_dev && !(flags & RTCF_DNAT))
1665 rth = dst_alloc(&ipv4_dst_ops);
1669 atomic_set(&rth->u.dst.__refcnt, 1);
1670 rth->u.dst.flags= DST_HOST;
1671 if (in_dev->cnf.no_policy)
1672 rth->u.dst.flags |= DST_NOPOLICY;
1673 if (in_dev->cnf.no_xfrm)
1674 rth->u.dst.flags |= DST_NOXFRM;
1675 rth->fl.fl4_dst = daddr;
1676 rth->rt_dst = daddr;
1677 rth->fl.fl4_tos = tos;
1678 #ifdef CONFIG_IP_ROUTE_FWMARK
1679 rth->fl.fl4_fwmark= skb->nfmark;
1681 rth->fl.fl4_src = saddr;
1682 rth->rt_src = saddr;
1683 rth->rt_gateway = daddr;
1685 rth->fl.iif = dev->ifindex;
1686 rth->u.dst.dev = out_dev->dev;
1687 dev_hold(rth->u.dst.dev);
1688 rth->idev = in_dev_get(rth->u.dst.dev);
1690 rth->rt_spec_dst= spec_dst;
1692 rth->u.dst.input = ip_forward;
1693 rth->u.dst.output = ip_output;
1695 rt_set_nexthop(rth, &res, itag);
1697 rth->rt_flags = flags;
1700 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1704 in_dev_put(out_dev);
1710 if (skb->protocol != htons(ETH_P_IP))
1714 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1716 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1719 goto martian_source;
1721 flags |= RTCF_DIRECTSRC;
1723 flags |= RTCF_BROADCAST;
1724 res.type = RTN_BROADCAST;
1725 RT_CACHE_STAT_INC(in_brd);
1728 rth = dst_alloc(&ipv4_dst_ops);
1732 rth->u.dst.output= ip_rt_bug;
1734 atomic_set(&rth->u.dst.__refcnt, 1);
1735 rth->u.dst.flags= DST_HOST;
1736 if (in_dev->cnf.no_policy)
1737 rth->u.dst.flags |= DST_NOPOLICY;
1738 rth->fl.fl4_dst = daddr;
1739 rth->rt_dst = daddr;
1740 rth->fl.fl4_tos = tos;
1741 #ifdef CONFIG_IP_ROUTE_FWMARK
1742 rth->fl.fl4_fwmark= skb->nfmark;
1744 rth->fl.fl4_src = saddr;
1745 rth->rt_src = saddr;
1746 #ifdef CONFIG_NET_CLS_ROUTE
1747 rth->u.dst.tclassid = itag;
1750 rth->fl.iif = dev->ifindex;
1751 rth->u.dst.dev = &loopback_dev;
1752 dev_hold(rth->u.dst.dev);
1753 rth->idev = in_dev_get(rth->u.dst.dev);
1754 rth->rt_gateway = daddr;
1755 rth->rt_spec_dst= spec_dst;
1756 rth->u.dst.input= ip_local_deliver;
1757 rth->rt_flags = flags|RTCF_LOCAL;
1758 if (res.type == RTN_UNREACHABLE) {
1759 rth->u.dst.input= ip_error;
1760 rth->u.dst.error= -err;
1761 rth->rt_flags &= ~RTCF_LOCAL;
1763 rth->rt_type = res.type;
1767 RT_CACHE_STAT_INC(in_no_route);
1768 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1769 res.type = RTN_UNREACHABLE;
1773 * Do not cache martian addresses: they should be logged (RFC1812)
1775 martian_destination:
1776 RT_CACHE_STAT_INC(in_martian_dst);
1777 #ifdef CONFIG_IP_ROUTE_VERBOSE
1778 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1779 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1780 "%u.%u.%u.%u, dev %s\n",
1781 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1793 RT_CACHE_STAT_INC(in_martian_src);
1794 #ifdef CONFIG_IP_ROUTE_VERBOSE
1795 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1797 * RFC1812 recommendation: if the source is martian,
1798 * the only hint is the MAC header.
1800 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1801 "%u.%u.%u.%u, on dev %s\n",
1802 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1803 if (dev->hard_header_len) {
1805 unsigned char *p = skb->mac.raw;
1806 printk(KERN_WARNING "ll header: ");
1807 for (i = 0; i < dev->hard_header_len; i++, p++) {
1809 if (i < (dev->hard_header_len - 1))
1819 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1820 u8 tos, struct net_device *dev)
1822 struct rtable * rth;
1824 int iif = dev->ifindex;
1826 tos &= IPTOS_RT_MASK;
1827 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1830 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1831 rth = rcu_dereference(rth->u.rt_next)) {
1832 if (rth->fl.fl4_dst == daddr &&
1833 rth->fl.fl4_src == saddr &&
1834 rth->fl.iif == iif &&
1836 #ifdef CONFIG_IP_ROUTE_FWMARK
1837 rth->fl.fl4_fwmark == skb->nfmark &&
1839 rth->fl.fl4_tos == tos) {
1840 rth->u.dst.lastuse = jiffies;
1841 dst_hold(&rth->u.dst);
1843 RT_CACHE_STAT_INC(in_hit);
1845 skb->dst = (struct dst_entry*)rth;
1848 RT_CACHE_STAT_INC(in_hlist_search);
1852 /* Multicast recognition logic is moved from the route cache to here.
1853 The problem was that too many Ethernet cards have broken/missing
1854 hardware multicast filters :-( As a result, a host on a multicast
1855 network acquires a lot of useless route cache entries, a sort of
1856 SDR messages from all over the world. Now we try to get rid of them.
1857 Really, provided the software IP multicast filter is organized
1858 reasonably (at least, hashed), it does not result in a slowdown
1859 compared with route cache reject entries.
1860 Note that multicast routers are not affected, because a
1861 route cache entry is created eventually.
1863 if (MULTICAST(daddr)) {
1864 struct in_device *in_dev;
1867 if ((in_dev = __in_dev_get(dev)) != NULL) {
1868 int our = ip_check_mc(in_dev, daddr, saddr,
1869 skb->nh.iph->protocol);
1871 #ifdef CONFIG_IP_MROUTE
1872 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1876 return ip_route_input_mc(skb, daddr, saddr,
1883 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1887 * Major route resolver routine.
1890 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
1892 u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
1893 struct flowi fl = { .nl_u = { .ip4_u =
1894 { .daddr = oldflp->fl4_dst,
1895 .saddr = oldflp->fl4_src,
1896 .tos = tos & IPTOS_RT_MASK,
1897 .scope = ((tos & RTO_ONLINK) ?
1900 #ifdef CONFIG_IP_ROUTE_FWMARK
1901 .fwmark = oldflp->fl4_fwmark
1904 .iif = loopback_dev.ifindex,
1905 .oif = oldflp->oif };
1906 struct fib_result res;
1909 struct net_device *dev_out = NULL;
1910 struct in_device *in_dev = NULL;
1916 #ifdef CONFIG_IP_MULTIPLE_TABLES
1920 if (oldflp->fl4_src) {
1922 if (MULTICAST(oldflp->fl4_src) ||
1923 BADCLASS(oldflp->fl4_src) ||
1924 ZERONET(oldflp->fl4_src))
1927 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1928 dev_out = ip_dev_find(oldflp->fl4_src);
1929 if (dev_out == NULL)
1932 /* I removed the check for oif == dev_out->oif here.
1933 It was wrong for two reasons:
1934 1. ip_dev_find(saddr) can return the wrong iface if saddr is
1935 assigned to multiple interfaces.
1936 2. Moreover, we are allowed to send packets with a saddr
1937 of another iface. --ANK
1940 if (oldflp->oif == 0
1941 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
1942 /* Special hack: the user can direct multicasts
1943 and limited broadcast via the necessary interface
1944 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1945 This hack is not just for fun, it allows
1946 vic, vat and friends to work.
1947 They bind a socket to loopback, set the ttl to zero
1948 and expect that it will work.
1949 From the viewpoint of the routing cache they are broken,
1950 because we are not allowed to build a multicast path
1951 with a loopback source address (look, the routing cache
1952 cannot know that the ttl is zero, so the packet
1953 will not leave this host and the route is valid).
1954 Luckily, this hack is a good workaround.
1957 fl.oif = dev_out->ifindex;
1965 dev_out = dev_get_by_index(oldflp->oif);
1967 if (dev_out == NULL)
1969 if (__in_dev_get(dev_out) == NULL) {
1971 goto out; /* Wrong error code */
1974 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
1976 fl.fl4_src = inet_select_addr(dev_out, 0,
1981 if (MULTICAST(oldflp->fl4_dst))
1982 fl.fl4_src = inet_select_addr(dev_out, 0,
1984 else if (!oldflp->fl4_dst)
1985 fl.fl4_src = inet_select_addr(dev_out, 0,
1991 fl.fl4_dst = fl.fl4_src;
1993 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
1996 dev_out = &loopback_dev;
1998 fl.oif = loopback_dev.ifindex;
1999 res.type = RTN_LOCAL;
2000 flags |= RTCF_LOCAL;
2004 if (fib_lookup(&fl, &res)) {
2007 /* Apparently, the routing tables are wrong. Assume
2008 that the destination is on-link.
2011 Because we are allowed to send to an iface
2012 even if it has NO routes and NO assigned
2013 addresses. When oif is specified, the routing
2014 tables are looked up with only one purpose:
2015 to catch whether the destination is gatewayed, rather than
2016 direct. Moreover, if MSG_DONTROUTE is set,
2017 we send the packet, ignoring both the routing tables
2018 and the ifaddr state. --ANK
2021 We could do this even if oif is unknown
2022 (as IPv6 likely does), but we do not.
2025 if (fl.fl4_src == 0)
2026 fl.fl4_src = inet_select_addr(dev_out, 0,
2028 res.type = RTN_UNICAST;
2038 if (res.type == RTN_LOCAL) {
2040 fl.fl4_src = fl.fl4_dst;
2043 dev_out = &loopback_dev;
2045 fl.oif = dev_out->ifindex;
2047 fib_info_put(res.fi);
2049 flags |= RTCF_LOCAL;
2053 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2054 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2055 fib_select_multipath(&fl, &res);
2058 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2059 fib_select_default(&fl, &res);
2062 fl.fl4_src = FIB_RES_PREFSRC(res);
2066 dev_out = FIB_RES_DEV(res);
2068 fl.oif = dev_out->ifindex;
2071 if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2074 if (fl.fl4_dst == 0xFFFFFFFF)
2075 res.type = RTN_BROADCAST;
2076 else if (MULTICAST(fl.fl4_dst))
2077 res.type = RTN_MULTICAST;
2078 else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
2081 if (dev_out->flags & IFF_LOOPBACK)
2082 flags |= RTCF_LOCAL;
2084 in_dev = in_dev_get(dev_out);
2088 if (res.type == RTN_BROADCAST) {
2089 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2091 fib_info_put(res.fi);
2094 } else if (res.type == RTN_MULTICAST) {
2095 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2096 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
2097 flags &= ~RTCF_LOCAL;
2098 /* If a multicast route does not exist, use
2099 the default one, but do not gateway in this case.
2102 if (res.fi && res.prefixlen < 4) {
2103 fib_info_put(res.fi);
2108 rth = dst_alloc(&ipv4_dst_ops);
2112 atomic_set(&rth->u.dst.__refcnt, 1);
2113 rth->u.dst.flags= DST_HOST;
2114 if (in_dev->cnf.no_xfrm)
2115 rth->u.dst.flags |= DST_NOXFRM;
2116 if (in_dev->cnf.no_policy)
2117 rth->u.dst.flags |= DST_NOPOLICY;
2118 rth->fl.fl4_dst = oldflp->fl4_dst;
2119 rth->fl.fl4_tos = tos;
2120 rth->fl.fl4_src = oldflp->fl4_src;
2121 rth->fl.oif = oldflp->oif;
2122 #ifdef CONFIG_IP_ROUTE_FWMARK
2123 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2125 rth->rt_dst = fl.fl4_dst;
2126 rth->rt_src = fl.fl4_src;
2127 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2128 rth->u.dst.dev = dev_out;
2130 rth->idev = in_dev_get(dev_out);
2131 rth->rt_gateway = fl.fl4_dst;
2132 rth->rt_spec_dst= fl.fl4_src;
2134 rth->u.dst.output=ip_output;
2136 RT_CACHE_STAT_INC(out_slow_tot);
2138 if (flags & RTCF_LOCAL) {
2139 rth->u.dst.input = ip_local_deliver;
2140 rth->rt_spec_dst = fl.fl4_dst;
2142 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2143 rth->rt_spec_dst = fl.fl4_src;
2144 if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2145 rth->u.dst.output = ip_mc_output;
2146 RT_CACHE_STAT_INC(out_slow_mc);
2148 #ifdef CONFIG_IP_MROUTE
2149 if (res.type == RTN_MULTICAST) {
2150 if (IN_DEV_MFORWARD(in_dev) &&
2151 !LOCAL_MCAST(oldflp->fl4_dst)) {
2152 rth->u.dst.input = ip_mr_input;
2153 rth->u.dst.output = ip_mc_output;
2159 rt_set_nexthop(rth, &res, 0);
2162 rth->rt_flags = flags;
2164 hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2165 err = rt_intern_hash(hash, rth, rp);
2183 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2188 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2191 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2192 rth = rcu_dereference(rth->u.rt_next)) {
2193 if (rth->fl.fl4_dst == flp->fl4_dst &&
2194 rth->fl.fl4_src == flp->fl4_src &&
2196 rth->fl.oif == flp->oif &&
2197 #ifdef CONFIG_IP_ROUTE_FWMARK
2198 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2200 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2201 (IPTOS_RT_MASK | RTO_ONLINK))) {
2202 rth->u.dst.lastuse = jiffies;
2203 dst_hold(&rth->u.dst);
2205 RT_CACHE_STAT_INC(out_hit);
2206 rcu_read_unlock_bh();
2210 RT_CACHE_STAT_INC(out_hlist_search);
2212 rcu_read_unlock_bh();
2214 return ip_route_output_slow(rp, flp);
2217 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2221 if ((err = __ip_route_output_key(rp, flp)) != 0)
2226 flp->fl4_src = (*rp)->rt_src;
2228 flp->fl4_dst = (*rp)->rt_dst;
2229 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2235 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2237 return ip_route_output_flow(rp, flp, NULL, 0);
2240 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2243 struct rtable *rt = (struct rtable*)skb->dst;
2245 struct nlmsghdr *nlh;
2246 unsigned char *b = skb->tail;
2247 struct rta_cacheinfo ci;
2248 #ifdef CONFIG_IP_MROUTE
2249 struct rtattr *eptr;
2251 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2252 r = NLMSG_DATA(nlh);
2253 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2254 r->rtm_family = AF_INET;
2255 r->rtm_dst_len = 32;
2257 r->rtm_tos = rt->fl.fl4_tos;
2258 r->rtm_table = RT_TABLE_MAIN;
2259 r->rtm_type = rt->rt_type;
2260 r->rtm_scope = RT_SCOPE_UNIVERSE;
2261 r->rtm_protocol = RTPROT_UNSPEC;
2262 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2263 if (rt->rt_flags & RTCF_NOTIFY)
2264 r->rtm_flags |= RTM_F_NOTIFY;
2265 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2266 if (rt->fl.fl4_src) {
2267 r->rtm_src_len = 32;
2268 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2271 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2272 #ifdef CONFIG_NET_CLS_ROUTE
2273 if (rt->u.dst.tclassid)
2274 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2277 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2278 else if (rt->rt_src != rt->fl.fl4_src)
2279 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2280 if (rt->rt_dst != rt->rt_gateway)
2281 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2282 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2283 goto rtattr_failure;
2284 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2285 ci.rta_used = rt->u.dst.__use;
2286 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2287 if (rt->u.dst.expires)
2288 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2291 ci.rta_error = rt->u.dst.error;
2292 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2294 ci.rta_id = rt->peer->ip_id_count;
2295 if (rt->peer->tcp_ts_stamp) {
2296 ci.rta_ts = rt->peer->tcp_ts;
2297 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2300 #ifdef CONFIG_IP_MROUTE
2301 eptr = (struct rtattr*)skb->tail;
2303 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2305 #ifdef CONFIG_IP_MROUTE
2306 u32 dst = rt->rt_dst;
2308 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2309 ipv4_devconf.mc_forwarding) {
2310 int err = ipmr_get_route(skb, r, nowait);
2317 if (err == -EMSGSIZE)
2319 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2324 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2327 nlh->nlmsg_len = skb->tail - b;
2332 skb_trim(skb, b - skb->data);
2336 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2338 struct rtattr **rta = arg;
2339 struct rtmsg *rtm = NLMSG_DATA(nlh);
2340 struct rtable *rt = NULL;
2345 struct sk_buff *skb;
2347 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2351 /* Reserve room for dummy headers; this skb can pass
2352 through a good chunk of the routing engine.
2354 skb->mac.raw = skb->data;
2355 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2357 if (rta[RTA_SRC - 1])
2358 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2359 if (rta[RTA_DST - 1])
2360 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2361 if (rta[RTA_IIF - 1])
2362 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2365 struct net_device *dev = __dev_get_by_index(iif);
2369 skb->protocol = htons(ETH_P_IP);
2372 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2374 rt = (struct rtable*)skb->dst;
2375 if (!err && rt->u.dst.error)
2376 err = -rt->u.dst.error;
2378 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2380 .tos = rtm->rtm_tos } } };
2382 if (rta[RTA_OIF - 1])
2383 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2385 err = ip_route_output_key(&rt, &fl);
2390 skb->dst = &rt->u.dst;
2391 if (rtm->rtm_flags & RTM_F_NOTIFY)
2392 rt->rt_flags |= RTCF_NOTIFY;
2394 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2396 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2405 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2415 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2422 s_idx = idx = cb->args[1];
2423 for (h = 0; h <= rt_hash_mask; h++) {
2424 if (h < s_h) continue;
2428 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2429 rt = rcu_dereference(rt->u.rt_next), idx++) {
2432 skb->dst = dst_clone(&rt->u.dst);
2433 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2435 RTM_NEWROUTE, 1) <= 0) {
2436 dst_release(xchg(&skb->dst, NULL));
2437 rcu_read_unlock_bh();
2440 dst_release(xchg(&skb->dst, NULL));
2442 rcu_read_unlock_bh();
2451 void ip_rt_multicast_event(struct in_device *in_dev)
2456 #ifdef CONFIG_SYSCTL
2457 static int flush_delay;
2459 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2460 struct file *filp, void __user *buffer,
2461 size_t *lenp, loff_t *ppos)
2464 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2465 rt_cache_flush(flush_delay);
2472 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2475 void __user *oldval,
2476 size_t __user *oldlenp,
2477 void __user *newval,
2482 if (newlen != sizeof(int))
2484 if (get_user(delay, (int __user *)newval))
2486 rt_cache_flush(delay);
2490 ctl_table ipv4_route_table[] = {
2492 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2493 .procname = "flush",
2494 .data = &flush_delay,
2495 .maxlen = sizeof(int),
2497 .proc_handler = &ipv4_sysctl_rtcache_flush,
2498 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2501 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2502 .procname = "min_delay",
2503 .data = &ip_rt_min_delay,
2504 .maxlen = sizeof(int),
2506 .proc_handler = &proc_dointvec_jiffies,
2507 .strategy = &sysctl_jiffies,
2510 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2511 .procname = "max_delay",
2512 .data = &ip_rt_max_delay,
2513 .maxlen = sizeof(int),
2515 .proc_handler = &proc_dointvec_jiffies,
2516 .strategy = &sysctl_jiffies,
2519 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2520 .procname = "gc_thresh",
2521 .data = &ipv4_dst_ops.gc_thresh,
2522 .maxlen = sizeof(int),
2524 .proc_handler = &proc_dointvec,
2527 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2528 .procname = "max_size",
2529 .data = &ip_rt_max_size,
2530 .maxlen = sizeof(int),
2532 .proc_handler = &proc_dointvec,
2535 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2536 .procname = "gc_min_interval",
2537 .data = &ip_rt_gc_min_interval,
2538 .maxlen = sizeof(int),
2540 .proc_handler = &proc_dointvec_jiffies,
2541 .strategy = &sysctl_jiffies,
2544 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2545 .procname = "gc_timeout",
2546 .data = &ip_rt_gc_timeout,
2547 .maxlen = sizeof(int),
2549 .proc_handler = &proc_dointvec_jiffies,
2550 .strategy = &sysctl_jiffies,
2553 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2554 .procname = "gc_interval",
2555 .data = &ip_rt_gc_interval,
2556 .maxlen = sizeof(int),
2558 .proc_handler = &proc_dointvec_jiffies,
2559 .strategy = &sysctl_jiffies,
2562 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2563 .procname = "redirect_load",
2564 .data = &ip_rt_redirect_load,
2565 .maxlen = sizeof(int),
2567 .proc_handler = &proc_dointvec,
2570 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2571 .procname = "redirect_number",
2572 .data = &ip_rt_redirect_number,
2573 .maxlen = sizeof(int),
2575 .proc_handler = &proc_dointvec,
2578 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2579 .procname = "redirect_silence",
2580 .data = &ip_rt_redirect_silence,
2581 .maxlen = sizeof(int),
2583 .proc_handler = &proc_dointvec,
2586 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2587 .procname = "error_cost",
2588 .data = &ip_rt_error_cost,
2589 .maxlen = sizeof(int),
2591 .proc_handler = &proc_dointvec,
2594 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2595 .procname = "error_burst",
2596 .data = &ip_rt_error_burst,
2597 .maxlen = sizeof(int),
2599 .proc_handler = &proc_dointvec,
2602 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2603 .procname = "gc_elasticity",
2604 .data = &ip_rt_gc_elasticity,
2605 .maxlen = sizeof(int),
2607 .proc_handler = &proc_dointvec,
2610 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2611 .procname = "mtu_expires",
2612 .data = &ip_rt_mtu_expires,
2613 .maxlen = sizeof(int),
2615 .proc_handler = &proc_dointvec_jiffies,
2616 .strategy = &sysctl_jiffies,
2619 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2620 .procname = "min_pmtu",
2621 .data = &ip_rt_min_pmtu,
2622 .maxlen = sizeof(int),
2624 .proc_handler = &proc_dointvec,
2627 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2628 .procname = "min_adv_mss",
2629 .data = &ip_rt_min_advmss,
2630 .maxlen = sizeof(int),
2632 .proc_handler = &proc_dointvec,
2635 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2636 .procname = "secret_interval",
2637 .data = &ip_rt_secret_interval,
2638 .maxlen = sizeof(int),
2640 .proc_handler = &proc_dointvec_jiffies,
2641 .strategy = &sysctl_jiffies,
2647 #ifdef CONFIG_NET_CLS_ROUTE
2648 struct ip_rt_acct *ip_rt_acct;
2650 /* This code sucks. But you should have seen it before! --RR */
2652 /* IP route accounting ptr for this logical cpu number. */
2653 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
2655 #ifdef CONFIG_PROC_FS
2656 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2657 int length, int *eof, void *data)
2661 if ((offset & 3) || (length & 3))
2664 if (offset >= sizeof(struct ip_rt_acct) * 256) {
2669 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2670 length = sizeof(struct ip_rt_acct) * 256 - offset;
2674 offset /= sizeof(u32);
2677 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2678 u32 *dst = (u32 *) buffer;
2680 /* Copy first cpu. */
2682 memcpy(dst, src, length);
2684 /* Add the other cpus in, one int at a time */
2688 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2690 for (j = 0; j < length/4; j++)
2696 #endif /* CONFIG_PROC_FS */
2697 #endif /* CONFIG_NET_CLS_ROUTE */
2699 static __initdata unsigned long rhash_entries;
2700 static int __init set_rhash_entries(char *str)
2704 rhash_entries = simple_strtoul(str, &str, 0);
2707 __setup("rhash_entries=", set_rhash_entries);
2709 int __init ip_rt_init(void)
2711 int i, order, goal, rc = 0;
2713 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2714 (jiffies ^ (jiffies >> 7)));
2716 #ifdef CONFIG_NET_CLS_ROUTE
2718 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2720 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2722 panic("IP: failed to allocate ip_rt_acct\n");
2723 memset(ip_rt_acct, 0, PAGE_SIZE << order);
2726 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2727 sizeof(struct rtable),
2728 0, SLAB_HWCACHE_ALIGN,
2731 if (!ipv4_dst_ops.kmem_cachep)
2732 panic("IP: failed to allocate ip_dst_cache\n");
2734 goal = num_physpages >> (26 - PAGE_SHIFT);
2736 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
2737 for (order = 0; (1UL << order) < goal; order++)
2741 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2742 sizeof(struct rt_hash_bucket);
2743 while (rt_hash_mask & (rt_hash_mask - 1))
2745 rt_hash_table = (struct rt_hash_bucket *)
2746 __get_free_pages(GFP_ATOMIC, order);
2747 } while (rt_hash_table == NULL && --order > 0);
2750 panic("Failed to allocate IP route cache hash table\n");
2752 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2754 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2756 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2760 for (i = 0; i <= rt_hash_mask; i++) {
2761 rt_hash_table[i].lock = SPIN_LOCK_UNLOCKED;
2762 rt_hash_table[i].chain = NULL;
2765 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2766 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2768 rt_cache_stat = alloc_percpu(struct rt_cache_stat);
2775 init_timer(&rt_flush_timer);
2776 rt_flush_timer.function = rt_run_flush;
2777 init_timer(&rt_periodic_timer);
2778 rt_periodic_timer.function = rt_check_expire;
2779 init_timer(&rt_secret_timer);
2780 rt_secret_timer.function = rt_secret_rebuild;
2782 /* All the timers started at system startup tend
2783 to synchronize. Perturb them a bit.
2785 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2787 add_timer(&rt_periodic_timer);
2789 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2790 ip_rt_secret_interval;
2791 add_timer(&rt_secret_timer);
2793 #ifdef CONFIG_PROC_FS
2795 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
2796 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2797 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
2799 free_percpu(rt_cache_stat);
2802 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
2804 #ifdef CONFIG_NET_CLS_ROUTE
2805 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
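/*
 * Editor's sketch, not part of the original file: the hash table
 * sizing math in ip_rt_init() above, with sample values (4 KiB pages,
 * 16-byte buckets, 128 MiB of RAM, i.e. roughly one bucket per 64 KiB
 * of memory).
 */
#include <stdio.h>

static void toy_hash_sizing_demo(void)
{
	unsigned long page_size = 4096, bucket_size = 16;
	unsigned long num_physpages = (128UL << 20) / page_size;
	unsigned long goal = num_physpages >> (26 - 12);  /* pages wanted */
	unsigned long mask;
	int order, log;

	for (order = 0; (1UL << order) < goal; order++)
		;
	mask = (1UL << order) * page_size / bucket_size;
	while (mask & (mask - 1))		/* round down to 2^n */
		mask--;
	for (log = 0; (1UL << log) != mask; log++)
		;
	mask--;					/* buckets - 1 */
	printf("order %d, %lu buckets (log %d), gc_thresh %lu, max_size %lu\n",
	       order, mask + 1, log, mask + 1, (mask + 1) * 16);
	/* prints: order 1, 512 buckets (log 9), gc_thresh 512, max_size 8192 */
}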
2815 EXPORT_SYMBOL(__ip_select_ident);
2816 EXPORT_SYMBOL(ip_route_input);
2817 EXPORT_SYMBOL(ip_route_output_key);