/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *	Alan Cox	:	Verify area fixes.
 *	Alan Cox	:	cli() protects routing changes
 *	Rui Oliveira	:	ICMP routing table updates
 *	(rco@di.uminho.pt)	Routing table insertion and update
 *	Linus Torvalds	:	Rewrote bits to be sensible
 *	Alan Cox	:	Added BSD route gw semantics
 *	Alan Cox	:	Super /proc >4K
 *	Alan Cox	:	MTU in route table
 *	Alan Cox	:	MSS actually. Also added the window clamping.
 *	Sam Lantinga	:	Fixed route matching in rt_del()
 *	Alan Cox	:	Routing cache support.
 *	Alan Cox	:	Removed compatibility cruft.
 *	Alan Cox	:	RTF_REJECT support.
 *	Alan Cox	:	TCP irtt support.
 *	Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *	Alan Cox	:	Use __u32 properly
 *	Alan Cox	:	Aligned routing errors more closely with BSD;
 *				our system is still very different.
 *	Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *				routing caches and better behaviour.
 *	Olaf Erb	:	irtt wasn't being copied right.
 *	Bjorn Ekwall	:	Kerneld route support.
 *	Alan Cox	:	Multicast fixed (I hope)
 *	Pavel Krauz	:	Limited broadcast fixed
 *	Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *				route.c and rewritten from scratch.
 *	Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year in a coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *	Marc Boucher	:	routing by fwmark
 *	Robert Olsson	:	Added rt_cache statistics
 *	Arnaldo C. Melo	:	Convert proc stuff to seq_file
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300 * HZ)

int ip_rt_min_delay		= 2 * HZ;
int ip_rt_max_delay		= 10 * HZ;
int ip_rt_max_size;
int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
int ip_rt_gc_interval		= 60 * HZ;
int ip_rt_gc_min_interval	= HZ / 2;
int ip_rt_redirect_number	= 9;
int ip_rt_redirect_load		= HZ / 50;
int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
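
/*
 * A worked relation between the redirect knobs above (a note, not original
 * text): with these defaults, ip_rt_redirect_silence ==
 * ip_rt_redirect_load << (ip_rt_redirect_number + 1), i.e. (HZ/50) << 10 --
 * one full exponential-backoff cycle of redirects plus one extra doubling.
 */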
int ip_rt_error_cost		= HZ;
int ip_rt_error_burst		= 5 * HZ;
int ip_rt_gc_elasticity		= 8;
int ip_rt_mtu_expires		= 10 * 60 * HZ;
int ip_rt_min_pmtu		= 512 + 20 + 20;
int ip_rt_min_advmss		= 256;
int ip_rt_secret_interval	= 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;
/*
 *	Interface to generic destination cache.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.entry_size =		sizeof(struct rtable),
};
#define ECN_OR_COST(class)	TC_PRIO_##class

__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries; they do so
 *    with atomic increments and under RCU read-side protection.
 */
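
/*
 * A minimal sketch of the reader side described above, condensed from
 * ip_route_input() further below; the "flow keys match" placeholder
 * stands in for the real field-by-field comparison. Illustration only:
 */
#if 0
	rcu_read_lock();
	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
		smp_read_barrier_depends();
		if (/* flow keys match */ 0) {
			dst_hold(&rth->u.dst);	/* atomic reference grab */
			rcu_read_unlock();
			return 0;
		}
	}
	rcu_read_unlock();
#endif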
struct rt_hash_bucket {
	struct rtable	*chain;
	spinlock_t	lock;
} __attribute__((__aligned__(8)));
202 static struct rt_hash_bucket *rt_hash_table;
203 static unsigned rt_hash_mask;
204 static int rt_hash_log;
205 static unsigned int rt_hash_rnd;
207 struct rt_cache_stat *rt_cache_stat;
209 static int rt_intern_hash(unsigned hash, struct rtable *rth,
210 struct rtable **res);
static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
		& rt_hash_mask);
}
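
/*
 * Example of how the lookup paths below build the hash inputs: the
 * source address is perturbed with the interface index, so the same
 * address pair hashes differently per device, and rt_hash_rnd
 * (re-randomized by rt_secret_rebuild()) keeps bucket placement
 * unpredictable to outsiders.
 */
#if 0
	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
#endif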
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};
223 static struct rtable *rt_cache_get_first(struct seq_file *seq)
225 struct rtable *r = NULL;
226 struct rt_cache_iter_state *st = seq->private;
228 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
230 r = rt_hash_table[st->bucket].chain;
238 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
240 struct rt_cache_iter_state *st = seq->private;
242 smp_read_barrier_depends();
246 if (--st->bucket < 0)
249 r = rt_hash_table[st->bucket].chain;
254 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
256 struct rtable *r = rt_cache_get_first(seq);
259 while (pos && (r = rt_cache_get_next(seq, r)))
261 return pos ? NULL : r;
264 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
266 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
269 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 struct rtable *r = NULL;
273 if (v == SEQ_START_TOKEN)
274 r = rt_cache_get_first(seq);
276 r = rt_cache_get_next(seq, v);
281 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
283 if (v && v != SEQ_START_TOKEN)
287 static int rt_cache_seq_show(struct seq_file *seq, void *v)
289 if (v == SEQ_START_TOKEN)
290 seq_printf(seq, "%-127s\n",
291 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
292 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
295 struct rtable *r = v;
298 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
299 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
300 r->u.dst.dev ? r->u.dst.dev->name : "*",
301 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
302 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
303 r->u.dst.__use, 0, (unsigned long)r->rt_src,
304 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
305 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
306 dst_metric(&r->u.dst, RTAX_WINDOW),
307 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
308 dst_metric(&r->u.dst, RTAX_RTTVAR)),
310 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
311 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
314 seq_printf(seq, "%-127s\n", temp);
319 static struct seq_operations rt_cache_seq_ops = {
320 .start = rt_cache_seq_start,
321 .next = rt_cache_seq_next,
322 .stop = rt_cache_seq_stop,
323 .show = rt_cache_seq_show,
326 static int rt_cache_seq_open(struct inode *inode, struct file *file)
328 struct seq_file *seq;
330 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
334 rc = seq_open(file, &rt_cache_seq_ops);
337 seq = file->private_data;
339 memset(s, 0, sizeof(*s));
347 static struct file_operations rt_cache_seq_fops = {
348 .owner = THIS_MODULE,
349 .open = rt_cache_seq_open,
352 .release = seq_release_private,
356 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
360 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
361 if (!cpu_possible(cpu))
364 return per_cpu_ptr(rt_cache_stat, cpu);
369 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
373 for (cpu = *pos + 1; cpu < NR_CPUS; ++cpu) {
374 if (!cpu_possible(cpu))
377 return per_cpu_ptr(rt_cache_stat, cpu);
383 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
388 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
390 struct rt_cache_stat *st = v;
392 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
393 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
394 atomic_read(&ipv4_dst_ops.entries),
417 static struct seq_operations rt_cpu_seq_ops = {
418 .start = rt_cpu_seq_start,
419 .next = rt_cpu_seq_next,
420 .stop = rt_cpu_seq_stop,
421 .show = rt_cpu_seq_show,
425 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
427 return seq_open(file, &rt_cpu_seq_ops);
430 static struct file_operations rt_cpu_seq_fops = {
431 .owner = THIS_MODULE,
432 .open = rt_cpu_seq_open,
435 .release = seq_release,
438 #endif /* CONFIG_PROC_FS */
440 static __inline__ void rt_free(struct rtable *rt)
442 call_rcu(&rt->u.dst.rcu_head, dst_rcu_free);
445 static __inline__ void rt_drop(struct rtable *rt)
448 call_rcu(&rt->u.dst.rcu_head, dst_rcu_free);
static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in the hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.rt_next;
}
static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}
465 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
470 if (atomic_read(&rth->u.dst.__refcnt))
474 if (rth->u.dst.expires &&
475 time_after_eq(jiffies, rth->u.dst.expires))
478 age = jiffies - rth->u.dst.lastuse;
480 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
481 (age <= tmo2 && rt_valuable(rth)))
/* Bits of score are:
 * 31: pretty good route
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
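
/*
 * Worked example (hypothetical numbers): score is ~age with the top two
 * bits cleared, so a just-used entry (age near 0) keeps almost all
 * counter bits set, while one idle for hours scores far lower; bit 31
 * marks "valuable" routes and bit 30 output/plain-unicast ones.
 * rt_intern_hash() below evicts the lowest-scoring entry of an
 * over-long chain, so fresh and valuable routes survive the longest.
 */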
/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
	static int rover;
	int i = rover, t;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
516 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
517 t -= ip_rt_gc_timeout) {
518 unsigned long tmo = ip_rt_gc_timeout;
520 i = (i + 1) & rt_hash_mask;
521 rthp = &rt_hash_table[i].chain;
523 spin_lock(&rt_hash_table[i].lock);
524 while ((rth = *rthp) != NULL) {
525 if (rth->u.dst.expires) {
526 /* Entry is expired even if it is in use */
527 if (time_before_eq(now, rth->u.dst.expires)) {
529 rthp = &rth->u.rt_next;
532 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
534 rthp = &rth->u.rt_next;
			/* Clean up aged-off entries. */
539 *rthp = rth->u.rt_next;
542 spin_unlock(&rt_hash_table[i].lock);
544 /* Fallback loop breaker. */
545 if (time_after(jiffies, now))
549 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
552 /* This can run from both BH and non-BH contexts, the latter
553 * in the case of a forced flush event.
555 static void rt_run_flush(unsigned long dummy)
558 struct rtable *rth, *next;
562 get_random_bytes(&rt_hash_rnd, 4);
564 for (i = rt_hash_mask; i >= 0; i--) {
565 spin_lock_bh(&rt_hash_table[i].lock);
566 rth = rt_hash_table[i].chain;
568 rt_hash_table[i].chain = NULL;
569 spin_unlock_bh(&rt_hash_table[i].lock);
571 for (; rth; rth = next) {
572 next = rth->u.rt_next;
578 static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
580 void rt_cache_flush(int delay)
582 unsigned long now = jiffies;
583 int user_mode = !in_softirq();
586 delay = ip_rt_min_delay;
588 spin_lock_bh(&rt_flush_lock);
590 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
591 long tmo = (long)(rt_deadline - now);
		/* If the flush timer is already running
		   and the flush request is not immediate (delay > 0):

		   if the deadline has not been reached, prolong the timer to
		   "delay"; otherwise fire it at the deadline time.
		 */
600 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
608 spin_unlock_bh(&rt_flush_lock);
613 if (rt_deadline == 0)
614 rt_deadline = now + ip_rt_max_delay;
616 mod_timer(&rt_flush_timer, now+delay);
617 spin_unlock_bh(&rt_flush_lock);
620 static void rt_secret_rebuild(unsigned long dummy)
622 unsigned long now = jiffies;
625 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
/*
   Short description of GC goals.

   We want an algorithm that keeps the routing cache at an equilibrium
   point, where the number of entries aging off stays roughly equal to
   the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" stays large enough to keep plenty of warm entries, and when
   load increases it shrinks to limit the cache size.
 */
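
/*
 * Schematic of the feedback loop implemented below (a sketch with a
 * made-up helper name, not the real control flow):
 */
#if 0
	goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
	do {
		scan_one_hash_slice(expire);	/* hypothetical helper */
		if (goal <= 0)
			break;
		expire >>= 1;	/* goal missed: be twice as aggressive */
	} while (!in_softirq() && time_before_eq(jiffies, now));
	/* on a clean exit, expire creeps back up by ip_rt_gc_min_interval,
	 * capped at ip_rt_gc_timeout */
#endif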
static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;
	/*
	 * Garbage collection is pretty expensive;
	 * do not run it too frequently.
	 */
656 RT_CACHE_STAT_INC(gc_total);
658 if (now - last_gc < ip_rt_gc_min_interval &&
659 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
660 RT_CACHE_STAT_INC(gc_ignored);
	/* Calculate the number of entries we want to expire now. */
665 goal = atomic_read(&ipv4_dst_ops.entries) -
666 (ip_rt_gc_elasticity << rt_hash_log);
668 if (equilibrium < ipv4_dst_ops.gc_thresh)
669 equilibrium = ipv4_dst_ops.gc_thresh;
670 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
672 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
673 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively.
		 */
679 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
680 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
683 if (now - last_gc >= ip_rt_gc_min_interval)
694 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
695 unsigned long tmo = expire;
697 k = (k + 1) & rt_hash_mask;
698 rthp = &rt_hash_table[k].chain;
699 spin_lock_bh(&rt_hash_table[k].lock);
700 while ((rth = *rthp) != NULL) {
701 if (!rt_may_expire(rth, tmo, expire)) {
703 rthp = &rth->u.rt_next;
706 *rthp = rth->u.rt_next;
710 spin_unlock_bh(&rt_hash_table[k].lock);
	/* The goal is not achieved. We stop the process if:
	 *
	 * - expire has been reduced to zero; otherwise expire is halved.
	 * - the table is not full.
	 * - we are called from interrupt context.
	 * - the jiffies check is just a fallback/debug loop breaker.
	 *   We will not spin here for a long time in any case.
	 */
728 RT_CACHE_STAT_INC(gc_goal_miss);
734 #if RT_CACHE_DEBUG >= 2
735 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
736 atomic_read(&ipv4_dst_ops.entries), goal, i);
739 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
741 } while (!in_softirq() && time_before_eq(jiffies, now));
743 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
746 printk(KERN_WARNING "dst cache overflow\n");
747 RT_CACHE_STAT_INC(gc_dst_overflow);
751 expire += ip_rt_gc_min_interval;
752 if (expire > ip_rt_gc_timeout ||
753 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
754 expire = ip_rt_gc_timeout;
755 #if RT_CACHE_DEBUG >= 2
756 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
757 atomic_read(&ipv4_dst_ops.entries), goal, rover);
762 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
764 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
765 fl1->oif == fl2->oif &&
766 fl1->iif == fl2->iif;
769 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
771 struct rtable *rth, **rthp;
773 struct rtable *cand, **candp;
776 int attempts = !in_softirq();
785 rthp = &rt_hash_table[hash].chain;
787 spin_lock_bh(&rt_hash_table[hash].lock);
788 while ((rth = *rthp) != NULL) {
789 if (compare_keys(&rth->fl, &rt->fl)) {
791 *rthp = rth->u.rt_next;
793 * Since lookup is lockfree, the deletion
794 * must be visible to another weakly ordered CPU before
795 * the insertion at the start of the hash chain.
798 rth->u.rt_next = rt_hash_table[hash].chain;
800 * Since lookup is lockfree, the update writes
801 * must be ordered for consistency on SMP.
804 rt_hash_table[hash].chain = rth;
807 dst_hold(&rth->u.dst);
808 rth->u.dst.lastuse = now;
809 spin_unlock_bh(&rt_hash_table[hash].lock);
816 if (!atomic_read(&rth->u.dst.__refcnt)) {
817 u32 score = rt_score(rth);
819 if (score <= min_score) {
828 rthp = &rth->u.rt_next;
	/* ip_rt_gc_elasticity used to be the average chain length;
	 * when it is exceeded, GC becomes really aggressive.
	 *
	 * The second limit is less certain. At the moment it allows
	 * only 2 entries per bucket. We will see.
	 */
838 if (chain_length > ip_rt_gc_elasticity) {
839 *candp = cand->u.rt_next;
	/* Try to bind the route to an ARP neighbour only if it is an
	   output route or on the unicast forwarding path.
	 */
847 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
848 int err = arp_bind_neighbour(&rt->u.dst);
850 spin_unlock_bh(&rt_hash_table[hash].lock);
852 if (err != -ENOBUFS) {
			/* The neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it most likely holds some neighbour records.
			 */
861 if (attempts-- > 0) {
862 int saved_elasticity = ip_rt_gc_elasticity;
863 int saved_int = ip_rt_gc_min_interval;
864 ip_rt_gc_elasticity = 1;
865 ip_rt_gc_min_interval = 0;
866 rt_garbage_collect();
867 ip_rt_gc_min_interval = saved_int;
868 ip_rt_gc_elasticity = saved_elasticity;
873 printk(KERN_WARNING "Neighbour table overflow.\n");
879 rt->u.rt_next = rt_hash_table[hash].chain;
880 #if RT_CACHE_DEBUG >= 2
883 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
884 NIPQUAD(rt->rt_dst));
885 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
886 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
890 rt_hash_table[hash].chain = rt;
891 spin_unlock_bh(&rt_hash_table[hash].lock);
896 void rt_bind_peer(struct rtable *rt, int create)
898 static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
899 struct inet_peer *peer;
901 peer = inet_getpeer(rt->rt_dst, create);
903 spin_lock_bh(&rt_peer_lock);
904 if (rt->peer == NULL) {
908 spin_unlock_bh(&rt_peer_lock);
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
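
/*
 * Back-of-the-envelope illustration of why uniqueness is hopeless here
 * (a note, not original text): the IP ID field is 16 bits, so at a mere
 * 10,000 packets/s to one destination the 65536-value space wraps in
 * about 6.5 seconds, well inside the lifetime of queued fragments.
 */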
920 static void ip_select_fb_ident(struct iphdr *iph)
922 static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
923 static u32 ip_fallback_id;
926 spin_lock_bh(&ip_fb_id_lock);
927 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
928 iph->id = htons(salt & 0xFFFF);
929 ip_fallback_id = salt;
930 spin_unlock_bh(&ip_fb_id_lock);
933 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
935 struct rtable *rt = (struct rtable *) dst;
938 if (rt->peer == NULL)
	/* If a peer is attached to the destination, it is never detached,
	   so we need not grab a lock to dereference it.
	 */
945 iph->id = htons(inet_getid(rt->peer, more));
949 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
951 ip_select_fb_ident(iph);
954 static void rt_del(unsigned hash, struct rtable *rt)
956 struct rtable **rthp;
958 spin_lock_bh(&rt_hash_table[hash].lock);
960 for (rthp = &rt_hash_table[hash].chain; *rthp;
961 rthp = &(*rthp)->u.rt_next)
963 *rthp = rt->u.rt_next;
967 spin_unlock_bh(&rt_hash_table[hash].lock);
970 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
971 u32 saddr, u8 tos, struct net_device *dev)
974 struct in_device *in_dev = in_dev_get(dev);
975 struct rtable *rth, **rthp;
976 u32 skeys[2] = { saddr, 0 };
977 int ikeys[2] = { dev->ifindex, 0 };
979 tos &= IPTOS_RT_MASK;
984 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
985 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
986 goto reject_redirect;
988 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
989 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
990 goto reject_redirect;
991 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
992 goto reject_redirect;
994 if (inet_addr_type(new_gw) != RTN_UNICAST)
995 goto reject_redirect;
998 for (i = 0; i < 2; i++) {
999 for (k = 0; k < 2; k++) {
1000 unsigned hash = rt_hash_code(daddr,
1001 skeys[i] ^ (ikeys[k] << 5),
1004 rthp=&rt_hash_table[hash].chain;
1007 while ((rth = *rthp) != NULL) {
1010 smp_read_barrier_depends();
1011 if (rth->fl.fl4_dst != daddr ||
1012 rth->fl.fl4_src != skeys[i] ||
1013 rth->fl.fl4_tos != tos ||
1014 rth->fl.oif != ikeys[k] ||
1016 rthp = &rth->u.rt_next;
1020 if (rth->rt_dst != daddr ||
1021 rth->rt_src != saddr ||
1023 rth->rt_gateway != old_gw ||
1024 rth->u.dst.dev != dev)
1027 dst_hold(&rth->u.dst);
1030 rt = dst_alloc(&ipv4_dst_ops);
1037 /* Copy all the information. */
1039 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1040 rt->u.dst.__use = 1;
1041 atomic_set(&rt->u.dst.__refcnt, 1);
1042 rt->u.dst.child = NULL;
1044 dev_hold(rt->u.dst.dev);
1046 in_dev_hold(rt->idev);
1047 rt->u.dst.obsolete = 0;
1048 rt->u.dst.lastuse = jiffies;
1049 rt->u.dst.path = &rt->u.dst;
1050 rt->u.dst.neighbour = NULL;
1051 rt->u.dst.hh = NULL;
1052 rt->u.dst.xfrm = NULL;
1054 rt->rt_flags |= RTCF_REDIRECTED;
1056 /* Gateway is different ... */
1057 rt->rt_gateway = new_gw;
1059 /* Redirect received -> path was valid */
1060 dst_confirm(&rth->u.dst);
1063 atomic_inc(&rt->peer->refcnt);
1065 if (arp_bind_neighbour(&rt->u.dst) ||
1066 !(rt->u.dst.neighbour->nud_state &
1068 if (rt->u.dst.neighbour)
1069 neigh_event_send(rt->u.dst.neighbour, NULL);
1076 if (!rt_intern_hash(hash, rt, &rt))
1089 #ifdef CONFIG_IP_ROUTE_VERBOSE
1090 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1091 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1092 "%u.%u.%u.%u ignored.\n"
1093 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1095 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1096 NIPQUAD(saddr), NIPQUAD(daddr), tos);
1101 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1103 struct rtable *rt = (struct rtable*)dst;
1104 struct dst_entry *ret = dst;
1107 if (dst->obsolete) {
1110 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1111 rt->u.dst.expires) {
1112 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1116 #if RT_CACHE_DEBUG >= 1
1117 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1118 "%u.%u.%u.%u/%02x dropped\n",
1119 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
/*
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host has
 *	   forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
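
/*
 * Worked schedule under the default tunables (assuming HZ = 1000; an
 * illustration only): ip_rt_redirect_load = 20 jiffies, so successive
 * redirects are spaced 20 << rate_tokens apart -- 20 ms, 40 ms, 80 ms,
 * up to 20 << 8 = 5.12 s before the 9th. After ip_rt_redirect_number = 9
 * we go silent, and only a quiet spell of ip_rt_redirect_silence =
 * 20 << 10 = 20480 jiffies (~20 s) resets rate_tokens to zero.
 */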
1144 void ip_rt_send_redirect(struct sk_buff *skb)
1146 struct rtable *rt = (struct rtable*)skb->dst;
1147 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1152 if (!IN_DEV_TX_REDIRECTS(in_dev))
1155 /* No redirected packets during ip_rt_redirect_silence;
1156 * reset the algorithm.
1158 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1159 rt->u.dst.rate_tokens = 0;
	/* Too many ignored redirects; do not send anything.
	 * Set u.dst.rate_last to the last seen redirected packet.
	 */
1164 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1165 rt->u.dst.rate_last = jiffies;
	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
1172 if (time_after(jiffies,
1173 (rt->u.dst.rate_last +
1174 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1175 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1176 rt->u.dst.rate_last = jiffies;
1177 ++rt->u.dst.rate_tokens;
1178 #ifdef CONFIG_IP_ROUTE_VERBOSE
1179 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1180 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1182 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1183 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1184 NIPQUAD(rt->rt_src), rt->rt_iif,
1185 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1192 static int ip_error(struct sk_buff *skb)
1194 struct rtable *rt = (struct rtable*)skb->dst;
1198 switch (rt->u.dst.error) {
1203 code = ICMP_HOST_UNREACH;
1206 code = ICMP_NET_UNREACH;
1209 code = ICMP_PKT_FILTERED;
1214 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1215 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1216 rt->u.dst.rate_tokens = ip_rt_error_burst;
1217 rt->u.dst.rate_last = now;
1218 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1219 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1220 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1223 out: kfree_skb(skb);
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
	{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
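
/*
 * Example (illustrative): guess_mtu(1500) walks the plateau table and
 * returns 1492, the first entry strictly below the old MTU; anything at
 * or below the smallest plateau falls through to the 68-byte IPv4
 * minimum.
 */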
1245 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1248 unsigned short old_mtu = ntohs(iph->tot_len);
1250 u32 skeys[2] = { iph->saddr, 0, };
1251 u32 daddr = iph->daddr;
1252 u8 tos = iph->tos & IPTOS_RT_MASK;
1253 unsigned short est_mtu = 0;
1255 if (ipv4_config.no_pmtu_disc)
1258 for (i = 0; i < 2; i++) {
1259 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1262 for (rth = rt_hash_table[hash].chain; rth;
1263 rth = rth->u.rt_next) {
1264 smp_read_barrier_depends();
1265 if (rth->fl.fl4_dst == daddr &&
1266 rth->fl.fl4_src == skeys[i] &&
1267 rth->rt_dst == daddr &&
1268 rth->rt_src == iph->saddr &&
1269 rth->fl.fl4_tos == tos &&
1271 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1272 unsigned short mtu = new_mtu;
1274 if (new_mtu < 68 || new_mtu >= old_mtu) {
1276 /* BSD 4.2 compatibility hack :-( */
1278 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1279 old_mtu >= 68 + (iph->ihl << 2))
1280 old_mtu -= iph->ihl << 2;
1282 mtu = guess_mtu(old_mtu);
1284 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1285 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1286 dst_confirm(&rth->u.dst);
1287 if (mtu < ip_rt_min_pmtu) {
1288 mtu = ip_rt_min_pmtu;
1289 rth->u.dst.metrics[RTAX_LOCK-1] |=
1292 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1293 dst_set_expires(&rth->u.dst,
1302 return est_mtu ? : new_mtu;
1305 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1307 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1308 !(dst_metric_locked(dst, RTAX_MTU))) {
1309 if (mtu < ip_rt_min_pmtu) {
1310 mtu = ip_rt_min_pmtu;
1311 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1313 dst->metrics[RTAX_MTU-1] = mtu;
1314 dst_set_expires(dst, ip_rt_mtu_expires);
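
/*
 * Worked example (hypothetical values): an ICMP "frag. needed" quoting
 * an MTU of 300 is clamped up to ip_rt_min_pmtu = 552 (512 + 20 + 20)
 * and RTAX_MTU is locked, so smaller advertisements are ignored from
 * then on; the learned value ages out after ip_rt_mtu_expires (10 min).
 */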
1318 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1324 static void ipv4_dst_destroy(struct dst_entry *dst)
1326 struct rtable *rt = (struct rtable *) dst;
1327 struct inet_peer *peer = rt->peer;
1328 struct in_device *idev = rt->idev;
1341 static void ipv4_dst_ifdown(struct dst_entry *dst, int how)
1343 struct rtable *rt = (struct rtable *) dst;
1344 struct in_device *idev = rt->idev;
1351 static void ipv4_link_failure(struct sk_buff *skb)
1355 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1357 rt = (struct rtable *) skb->dst;
1359 dst_set_expires(&rt->u.dst, 0);
1362 static int ip_rt_bug(struct sk_buff **pskb)
1364 struct sk_buff *skb = *pskb;
1366 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1367 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1368 skb->dev ? skb->dev->name : "?");
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned in IP options!
 */
1382 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1385 struct fib_result res;
1387 if (rt->fl.iif == 0)
1389 else if (fib_lookup(&rt->fl, &res) == 0) {
1390 #ifdef CONFIG_IP_ROUTE_NAT
1391 if (res.type == RTN_NAT)
1392 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1396 src = FIB_RES_PREFSRC(res);
1399 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1401 memcpy(addr, &src, 4);
1404 #ifdef CONFIG_NET_CLS_ROUTE
1405 static void set_class_tag(struct rtable *rt, u32 tag)
1407 if (!(rt->u.dst.tclassid & 0xFFFF))
1408 rt->u.dst.tclassid |= tag & 0xFFFF;
1409 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1410 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1414 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1416 struct fib_info *fi = res->fi;
1419 if (FIB_RES_GW(*res) &&
1420 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1421 rt->rt_gateway = FIB_RES_GW(*res);
1422 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1423 sizeof(rt->u.dst.metrics));
1424 if (fi->fib_mtu == 0) {
1425 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1426 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1427 rt->rt_gateway != rt->rt_dst &&
1428 rt->u.dst.dev->mtu > 576)
1429 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1431 #ifdef CONFIG_NET_CLS_ROUTE
1432 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1435 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1437 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1438 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1439 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1440 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1441 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1442 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1444 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1445 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1447 #ifdef CONFIG_NET_CLS_ROUTE
1448 #ifdef CONFIG_IP_MULTIPLE_TABLES
1449 set_class_tag(rt, fib_rules_tclass(res));
1451 set_class_tag(rt, itag);
1453 rt->rt_type = res->type;
1456 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1457 u8 tos, struct net_device *dev, int our)
1462 struct in_device *in_dev = in_dev_get(dev);
1465 /* Primary sanity checks. */
1470 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1471 skb->protocol != htons(ETH_P_IP))
1474 if (ZERONET(saddr)) {
1475 if (!LOCAL_MCAST(daddr))
1477 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1478 } else if (fib_validate_source(saddr, 0, tos, 0,
1479 dev, &spec_dst, &itag) < 0)
1482 rth = dst_alloc(&ipv4_dst_ops);
1486 rth->u.dst.output= ip_rt_bug;
1488 atomic_set(&rth->u.dst.__refcnt, 1);
1489 rth->u.dst.flags= DST_HOST;
1490 if (in_dev->cnf.no_policy)
1491 rth->u.dst.flags |= DST_NOPOLICY;
1492 rth->fl.fl4_dst = daddr;
1493 rth->rt_dst = daddr;
1494 rth->fl.fl4_tos = tos;
1495 #ifdef CONFIG_IP_ROUTE_FWMARK
1496 rth->fl.fl4_fwmark= skb->nfmark;
1498 rth->fl.fl4_src = saddr;
1499 rth->rt_src = saddr;
1500 #ifdef CONFIG_IP_ROUTE_NAT
1501 rth->rt_dst_map = daddr;
1502 rth->rt_src_map = saddr;
1504 #ifdef CONFIG_NET_CLS_ROUTE
1505 rth->u.dst.tclassid = itag;
1508 rth->fl.iif = dev->ifindex;
1509 rth->u.dst.dev = &loopback_dev;
1510 dev_hold(rth->u.dst.dev);
1511 rth->idev = in_dev_get(rth->u.dst.dev);
1513 rth->rt_gateway = daddr;
1514 rth->rt_spec_dst= spec_dst;
1515 rth->rt_type = RTN_MULTICAST;
1516 rth->rt_flags = RTCF_MULTICAST;
1518 rth->u.dst.input= ip_local_deliver;
1519 rth->rt_flags |= RTCF_LOCAL;
1522 #ifdef CONFIG_IP_MROUTE
1523 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1524 rth->u.dst.input = ip_mr_input;
1526 RT_CACHE_STAT_INC(in_slow_mc);
1529 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1530 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
 *	NOTE. We drop all the packets that have a local source
 *	address, because every properly looped-back packet
 *	must have the correct destination already attached by the output
 *	routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1551 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1552 u8 tos, struct net_device *dev)
1554 struct fib_result res;
1555 struct in_device *in_dev = in_dev_get(dev);
1556 struct in_device *out_dev = NULL;
1557 struct flowi fl = { .nl_u = { .ip4_u =
1561 .scope = RT_SCOPE_UNIVERSE,
1562 #ifdef CONFIG_IP_ROUTE_FWMARK
1563 .fwmark = skb->nfmark
1566 .iif = dev->ifindex };
1569 struct rtable * rth;
1575 /* IP on this device is disabled. */
1580 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */
1586 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1587 goto martian_source;
1589 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix this or not. Waiting for
	 * complaints :-)
	 */
1596 goto martian_source;
1598 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1599 goto martian_destination;
	/*
	 *	Now we are ready to route the packet.
	 */
1604 if ((err = fib_lookup(&fl, &res)) != 0) {
1605 if (!IN_DEV_FORWARD(in_dev))
1611 RT_CACHE_STAT_INC(in_slow_tot);
1613 #ifdef CONFIG_IP_ROUTE_NAT
	/* The policy is applied before mapping the destination,
	   but rerouting after the mapping must be done with the old source.
	 */
1619 u32 src_map = saddr;
1621 src_map = fib_rules_policy(saddr, &res, &flags);
1623 if (res.type == RTN_NAT) {
1624 fl.fl4_dst = fib_rules_map_destination(daddr, &res);
1627 if (fib_lookup(&fl, &res))
1630 if (res.type != RTN_UNICAST)
1634 fl.fl4_src = src_map;
1638 if (res.type == RTN_BROADCAST)
1641 if (res.type == RTN_LOCAL) {
1643 result = fib_validate_source(saddr, daddr, tos,
1644 loopback_dev.ifindex,
1645 dev, &spec_dst, &itag);
1647 goto martian_source;
1649 flags |= RTCF_DIRECTSRC;
1654 if (!IN_DEV_FORWARD(in_dev))
1656 if (res.type != RTN_UNICAST)
1657 goto martian_destination;
1659 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1660 if (res.fi->fib_nhs > 1 && fl.oif == 0)
1661 fib_select_multipath(&fl, &res);
1663 out_dev = in_dev_get(FIB_RES_DEV(res));
1664 if (out_dev == NULL) {
1665 if (net_ratelimit())
1666 printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1667 "Please, report\n");
1671 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1674 goto martian_source;
1677 flags |= RTCF_DIRECTSRC;
1679 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1680 (IN_DEV_SHARED_MEDIA(out_dev) ||
1681 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1682 flags |= RTCF_DOREDIRECT;
1684 if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy ARP. DNAT routes are always valid.
		 */
1688 if (out_dev == in_dev && !(flags & RTCF_DNAT))
1692 rth = dst_alloc(&ipv4_dst_ops);
1696 atomic_set(&rth->u.dst.__refcnt, 1);
1697 rth->u.dst.flags= DST_HOST;
1698 if (in_dev->cnf.no_policy)
1699 rth->u.dst.flags |= DST_NOPOLICY;
1700 if (in_dev->cnf.no_xfrm)
1701 rth->u.dst.flags |= DST_NOXFRM;
1702 rth->fl.fl4_dst = daddr;
1703 rth->rt_dst = daddr;
1704 rth->fl.fl4_tos = tos;
1705 #ifdef CONFIG_IP_ROUTE_FWMARK
1706 rth->fl.fl4_fwmark= skb->nfmark;
1708 rth->fl.fl4_src = saddr;
1709 rth->rt_src = saddr;
1710 rth->rt_gateway = daddr;
1711 #ifdef CONFIG_IP_ROUTE_NAT
1712 rth->rt_src_map = fl.fl4_src;
1713 rth->rt_dst_map = fl.fl4_dst;
1714 if (flags&RTCF_DNAT)
1715 rth->rt_gateway = fl.fl4_dst;
1718 rth->fl.iif = dev->ifindex;
1719 rth->u.dst.dev = out_dev->dev;
1720 dev_hold(rth->u.dst.dev);
1721 rth->idev = in_dev_get(rth->u.dst.dev);
1723 rth->rt_spec_dst= spec_dst;
1725 rth->u.dst.input = ip_forward;
1726 rth->u.dst.output = ip_output;
1728 rt_set_nexthop(rth, &res, itag);
1730 rth->rt_flags = flags;
1733 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1737 in_dev_put(out_dev);
1743 if (skb->protocol != htons(ETH_P_IP))
1747 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1749 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1752 goto martian_source;
1754 flags |= RTCF_DIRECTSRC;
1756 flags |= RTCF_BROADCAST;
1757 res.type = RTN_BROADCAST;
1758 RT_CACHE_STAT_INC(in_brd);
1761 rth = dst_alloc(&ipv4_dst_ops);
1765 rth->u.dst.output= ip_rt_bug;
1767 atomic_set(&rth->u.dst.__refcnt, 1);
1768 rth->u.dst.flags= DST_HOST;
1769 if (in_dev->cnf.no_policy)
1770 rth->u.dst.flags |= DST_NOPOLICY;
1771 rth->fl.fl4_dst = daddr;
1772 rth->rt_dst = daddr;
1773 rth->fl.fl4_tos = tos;
1774 #ifdef CONFIG_IP_ROUTE_FWMARK
1775 rth->fl.fl4_fwmark= skb->nfmark;
1777 rth->fl.fl4_src = saddr;
1778 rth->rt_src = saddr;
1779 #ifdef CONFIG_IP_ROUTE_NAT
1780 rth->rt_dst_map = fl.fl4_dst;
1781 rth->rt_src_map = fl.fl4_src;
1783 #ifdef CONFIG_NET_CLS_ROUTE
1784 rth->u.dst.tclassid = itag;
1787 rth->fl.iif = dev->ifindex;
1788 rth->u.dst.dev = &loopback_dev;
1789 dev_hold(rth->u.dst.dev);
1790 rth->idev = in_dev_get(rth->u.dst.dev);
1791 rth->rt_gateway = daddr;
1792 rth->rt_spec_dst= spec_dst;
1793 rth->u.dst.input= ip_local_deliver;
1794 rth->rt_flags = flags|RTCF_LOCAL;
1795 if (res.type == RTN_UNREACHABLE) {
1796 rth->u.dst.input= ip_error;
1797 rth->u.dst.error= -err;
1798 rth->rt_flags &= ~RTCF_LOCAL;
1800 rth->rt_type = res.type;
1804 RT_CACHE_STAT_INC(in_no_route);
1805 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1806 res.type = RTN_UNREACHABLE;
1810 * Do not cache martian addresses: they should be logged (RFC1812)
1812 martian_destination:
1813 RT_CACHE_STAT_INC(in_martian_dst);
1814 #ifdef CONFIG_IP_ROUTE_VERBOSE
1815 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1816 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1817 "%u.%u.%u.%u, dev %s\n",
1818 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1830 RT_CACHE_STAT_INC(in_martian_src);
1831 #ifdef CONFIG_IP_ROUTE_VERBOSE
1832 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1834 * RFC1812 recommendation, if source is martian,
1835 * the only hint is MAC header.
1837 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1838 "%u.%u.%u.%u, on dev %s\n",
1839 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1840 if (dev->hard_header_len) {
1842 unsigned char *p = skb->mac.raw;
1843 printk(KERN_WARNING "ll header: ");
1844 for (i = 0; i < dev->hard_header_len; i++, p++) {
1846 if (i < (dev->hard_header_len - 1))
1856 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1857 u8 tos, struct net_device *dev)
1859 struct rtable * rth;
1861 int iif = dev->ifindex;
1863 tos &= IPTOS_RT_MASK;
1864 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1867 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1868 smp_read_barrier_depends();
1869 if (rth->fl.fl4_dst == daddr &&
1870 rth->fl.fl4_src == saddr &&
1871 rth->fl.iif == iif &&
1873 #ifdef CONFIG_IP_ROUTE_FWMARK
1874 rth->fl.fl4_fwmark == skb->nfmark &&
1876 rth->fl.fl4_tos == tos) {
1877 rth->u.dst.lastuse = jiffies;
1878 dst_hold(&rth->u.dst);
1880 RT_CACHE_STAT_INC(in_hit);
1882 skb->dst = (struct dst_entry*)rth;
1885 RT_CACHE_STAT_INC(in_hlist_search);
	/* Multicast recognition logic is moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network may acquire a lot of useless route cache entries, e.g. from
	   SDR messages from all over the world. Now we try to get rid of
	   them. Really, provided the software IP multicast filter is
	   organized reasonably (at least, hashed), it does not result in a
	   slowdown compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
1900 if (MULTICAST(daddr)) {
1901 struct in_device *in_dev;
1903 read_lock(&inetdev_lock);
1904 if ((in_dev = __in_dev_get(dev)) != NULL) {
1905 int our = ip_check_mc(in_dev, daddr, saddr,
1906 skb->nh.iph->protocol);
1908 #ifdef CONFIG_IP_MROUTE
1909 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1912 read_unlock(&inetdev_lock);
1913 return ip_route_input_mc(skb, daddr, saddr,
1917 read_unlock(&inetdev_lock);
1920 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1924 * Major route resolver routine.
1927 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
1929 u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
1930 struct flowi fl = { .nl_u = { .ip4_u =
1931 { .daddr = oldflp->fl4_dst,
1932 .saddr = oldflp->fl4_src,
1933 .tos = tos & IPTOS_RT_MASK,
1934 .scope = ((tos & RTO_ONLINK) ?
1937 #ifdef CONFIG_IP_ROUTE_FWMARK
1938 .fwmark = oldflp->fl4_fwmark
1941 .iif = loopback_dev.ifindex,
1942 .oif = oldflp->oif };
1943 struct fib_result res;
1946 struct net_device *dev_out = NULL;
1947 struct in_device *in_dev = NULL;
1953 #ifdef CONFIG_IP_MULTIPLE_TABLES
1957 if (oldflp->fl4_src) {
1959 if (MULTICAST(oldflp->fl4_src) ||
1960 BADCLASS(oldflp->fl4_src) ||
1961 ZERONET(oldflp->fl4_src))
1964 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1965 dev_out = ip_dev_find(oldflp->fl4_src);
1966 if (dev_out == NULL)
		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(saddr) can return the wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */
1977 if (oldflp->oif == 0
1978 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
			/* Special hack: the user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source address
			   (look: the routing cache cannot know that ttl is
			   zero, and hence that the packet will not leave this
			   host and the route is valid).
			   Luckily, this hack is a good workaround.
			 */
1994 fl.oif = dev_out->ifindex;
2002 dev_out = dev_get_by_index(oldflp->oif);
2004 if (dev_out == NULL)
2006 if (__in_dev_get(dev_out) == NULL) {
2008 goto out; /* Wrong error code */
2011 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2013 fl.fl4_src = inet_select_addr(dev_out, 0,
2018 if (MULTICAST(oldflp->fl4_dst))
2019 fl.fl4_src = inet_select_addr(dev_out, 0,
2021 else if (!oldflp->fl4_dst)
2022 fl.fl4_src = inet_select_addr(dev_out, 0,
2028 fl.fl4_dst = fl.fl4_src;
2030 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2033 dev_out = &loopback_dev;
2035 fl.oif = loopback_dev.ifindex;
2036 res.type = RTN_LOCAL;
2037 flags |= RTCF_LOCAL;
2041 if (fib_lookup(&fl, &res)) {
		/* Apparently, routing tables are wrong. Assume
		   that the destination is on link.

		   WHY? DW.
		   Because we are allowed to send to an iface
		   even if it has NO routes and NO assigned
		   addresses. When oif is specified, routing
		   tables are looked up with only one purpose:
		   to catch whether the destination is gatewayed, rather
		   than direct. Moreover, if MSG_DONTROUTE is set,
		   we send a packet, ignoring both routing tables
		   and ifaddr state. --ANK

		   We could make it even if oif is unknown,
		   likely IPv6, but we do not.
		 */
2062 if (fl.fl4_src == 0)
2063 fl.fl4_src = inet_select_addr(dev_out, 0,
2065 res.type = RTN_UNICAST;
2075 if (res.type == RTN_NAT)
2078 if (res.type == RTN_LOCAL) {
2080 fl.fl4_src = fl.fl4_dst;
2083 dev_out = &loopback_dev;
2085 fl.oif = dev_out->ifindex;
2087 fib_info_put(res.fi);
2089 flags |= RTCF_LOCAL;
2093 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2094 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2095 fib_select_multipath(&fl, &res);
2098 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2099 fib_select_default(&fl, &res);
2102 fl.fl4_src = FIB_RES_PREFSRC(res);
2106 dev_out = FIB_RES_DEV(res);
2108 fl.oif = dev_out->ifindex;
2111 if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2114 if (fl.fl4_dst == 0xFFFFFFFF)
2115 res.type = RTN_BROADCAST;
2116 else if (MULTICAST(fl.fl4_dst))
2117 res.type = RTN_MULTICAST;
2118 else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
2121 if (dev_out->flags & IFF_LOOPBACK)
2122 flags |= RTCF_LOCAL;
2124 in_dev = in_dev_get(dev_out);
2128 if (res.type == RTN_BROADCAST) {
2129 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2131 fib_info_put(res.fi);
2134 } else if (res.type == RTN_MULTICAST) {
2135 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2136 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
2137 flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use
		   the default one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
2142 if (res.fi && res.prefixlen < 4) {
2143 fib_info_put(res.fi);
2148 rth = dst_alloc(&ipv4_dst_ops);
2152 atomic_set(&rth->u.dst.__refcnt, 1);
2153 rth->u.dst.flags= DST_HOST;
2154 if (in_dev->cnf.no_xfrm)
2155 rth->u.dst.flags |= DST_NOXFRM;
2156 if (in_dev->cnf.no_policy)
2157 rth->u.dst.flags |= DST_NOPOLICY;
2158 rth->fl.fl4_dst = oldflp->fl4_dst;
2159 rth->fl.fl4_tos = tos;
2160 rth->fl.fl4_src = oldflp->fl4_src;
2161 rth->fl.oif = oldflp->oif;
2162 #ifdef CONFIG_IP_ROUTE_FWMARK
2163 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2165 rth->rt_dst = fl.fl4_dst;
2166 rth->rt_src = fl.fl4_src;
2167 #ifdef CONFIG_IP_ROUTE_NAT
2168 rth->rt_dst_map = fl.fl4_dst;
2169 rth->rt_src_map = fl.fl4_src;
2171 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2172 rth->u.dst.dev = dev_out;
2174 rth->idev = in_dev_get(dev_out);
2175 rth->rt_gateway = fl.fl4_dst;
2176 rth->rt_spec_dst= fl.fl4_src;
2178 rth->u.dst.output=ip_output;
2180 RT_CACHE_STAT_INC(out_slow_tot);
2182 if (flags & RTCF_LOCAL) {
2183 rth->u.dst.input = ip_local_deliver;
2184 rth->rt_spec_dst = fl.fl4_dst;
2186 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2187 rth->rt_spec_dst = fl.fl4_src;
2188 if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2189 rth->u.dst.output = ip_mc_output;
2190 RT_CACHE_STAT_INC(out_slow_mc);
2192 #ifdef CONFIG_IP_MROUTE
2193 if (res.type == RTN_MULTICAST) {
2194 if (IN_DEV_MFORWARD(in_dev) &&
2195 !LOCAL_MCAST(oldflp->fl4_dst)) {
2196 rth->u.dst.input = ip_mr_input;
2197 rth->u.dst.output = ip_mc_output;
2203 rt_set_nexthop(rth, &res, 0);
2206 rth->rt_flags = flags;
2208 hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2209 err = rt_intern_hash(hash, rth, rp);
2227 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2232 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2235 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
2236 smp_read_barrier_depends();
2237 if (rth->fl.fl4_dst == flp->fl4_dst &&
2238 rth->fl.fl4_src == flp->fl4_src &&
2240 rth->fl.oif == flp->oif &&
2241 #ifdef CONFIG_IP_ROUTE_FWMARK
2242 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2244 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2245 (IPTOS_RT_MASK | RTO_ONLINK))) {
2246 rth->u.dst.lastuse = jiffies;
2247 dst_hold(&rth->u.dst);
2249 RT_CACHE_STAT_INC(out_hit);
2254 RT_CACHE_STAT_INC(out_hlist_search);
2258 return ip_route_output_slow(rp, flp);
2261 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2265 if ((err = __ip_route_output_key(rp, flp)) != 0)
2267 return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, NULL, 0) : 0;
2270 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2274 if ((err = __ip_route_output_key(rp, flp)) != 0)
2276 return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, sk, flags) : 0;
2279 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2282 struct rtable *rt = (struct rtable*)skb->dst;
2284 struct nlmsghdr *nlh;
2285 unsigned char *b = skb->tail;
2286 struct rta_cacheinfo ci;
2287 #ifdef CONFIG_IP_MROUTE
2288 struct rtattr *eptr;
2290 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2291 r = NLMSG_DATA(nlh);
2292 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2293 r->rtm_family = AF_INET;
2294 r->rtm_dst_len = 32;
2296 r->rtm_tos = rt->fl.fl4_tos;
2297 r->rtm_table = RT_TABLE_MAIN;
2298 r->rtm_type = rt->rt_type;
2299 r->rtm_scope = RT_SCOPE_UNIVERSE;
2300 r->rtm_protocol = RTPROT_UNSPEC;
2301 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2302 if (rt->rt_flags & RTCF_NOTIFY)
2303 r->rtm_flags |= RTM_F_NOTIFY;
2304 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2305 if (rt->fl.fl4_src) {
2306 r->rtm_src_len = 32;
2307 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2310 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2311 #ifdef CONFIG_NET_CLS_ROUTE
2312 if (rt->u.dst.tclassid)
2313 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2316 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2317 else if (rt->rt_src != rt->fl.fl4_src)
2318 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2319 if (rt->rt_dst != rt->rt_gateway)
2320 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2321 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2322 goto rtattr_failure;
2323 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2324 ci.rta_used = rt->u.dst.__use;
2325 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2326 if (rt->u.dst.expires)
2327 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2330 ci.rta_error = rt->u.dst.error;
2331 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2333 ci.rta_id = rt->peer->ip_id_count;
2334 if (rt->peer->tcp_ts_stamp) {
2335 ci.rta_ts = rt->peer->tcp_ts;
2336 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2339 #ifdef CONFIG_IP_MROUTE
2340 eptr = (struct rtattr*)skb->tail;
2342 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2344 #ifdef CONFIG_IP_MROUTE
2345 u32 dst = rt->rt_dst;
2347 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2348 ipv4_devconf.mc_forwarding) {
2349 int err = ipmr_get_route(skb, r, nowait);
2356 if (err == -EMSGSIZE)
2358 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2363 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2366 nlh->nlmsg_len = skb->tail - b;
2371 skb_trim(skb, b - skb->data);
2375 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2377 struct rtattr **rta = arg;
2378 struct rtmsg *rtm = NLMSG_DATA(nlh);
2379 struct rtable *rt = NULL;
2384 struct sk_buff *skb;
2386 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
2393 skb->mac.raw = skb->data;
2394 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2396 if (rta[RTA_SRC - 1])
2397 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2398 if (rta[RTA_DST - 1])
2399 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2400 if (rta[RTA_IIF - 1])
2401 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2404 struct net_device *dev = __dev_get_by_index(iif);
2408 skb->protocol = htons(ETH_P_IP);
2411 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2413 rt = (struct rtable*)skb->dst;
2414 if (!err && rt->u.dst.error)
2415 err = -rt->u.dst.error;
2417 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2419 .tos = rtm->rtm_tos } } };
2421 if (rta[RTA_OIF - 1])
2422 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2424 err = ip_route_output_key(&rt, &fl);
2429 skb->dst = &rt->u.dst;
2430 if (rtm->rtm_flags & RTM_F_NOTIFY)
2431 rt->rt_flags |= RTCF_NOTIFY;
2433 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2435 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2444 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2454 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2461 s_idx = idx = cb->args[1];
2462 for (h = 0; h <= rt_hash_mask; h++) {
2463 if (h < s_h) continue;
2467 for (rt = rt_hash_table[h].chain, idx = 0; rt;
2468 rt = rt->u.rt_next, idx++) {
2469 smp_read_barrier_depends();
2472 skb->dst = dst_clone(&rt->u.dst);
2473 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2475 RTM_NEWROUTE, 1) <= 0) {
2476 dst_release(xchg(&skb->dst, NULL));
2480 dst_release(xchg(&skb->dst, NULL));
2491 void ip_rt_multicast_event(struct in_device *in_dev)
2496 #ifdef CONFIG_SYSCTL
2497 static int flush_delay;
2499 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2500 struct file *filp, void __user *buffer,
2501 size_t *lenp, loff_t *ppos)
2504 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2505 rt_cache_flush(flush_delay);
2512 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2515 void __user *oldval,
2516 size_t __user *oldlenp,
2517 void __user *newval,
2522 if (newlen != sizeof(int))
2524 if (get_user(delay, (int __user *)newval))
2526 rt_cache_flush(delay);
2530 ctl_table ipv4_route_table[] = {
2532 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2533 .procname = "flush",
2534 .data = &flush_delay,
2535 .maxlen = sizeof(int),
2537 .proc_handler = &ipv4_sysctl_rtcache_flush,
2538 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2541 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2542 .procname = "min_delay",
2543 .data = &ip_rt_min_delay,
2544 .maxlen = sizeof(int),
2546 .proc_handler = &proc_dointvec_jiffies,
2547 .strategy = &sysctl_jiffies,
2550 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2551 .procname = "max_delay",
2552 .data = &ip_rt_max_delay,
2553 .maxlen = sizeof(int),
2555 .proc_handler = &proc_dointvec_jiffies,
2556 .strategy = &sysctl_jiffies,
2559 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2560 .procname = "gc_thresh",
2561 .data = &ipv4_dst_ops.gc_thresh,
2562 .maxlen = sizeof(int),
2564 .proc_handler = &proc_dointvec,
2567 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2568 .procname = "max_size",
2569 .data = &ip_rt_max_size,
2570 .maxlen = sizeof(int),
2572 .proc_handler = &proc_dointvec,
2575 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2576 .procname = "gc_min_interval",
2577 .data = &ip_rt_gc_min_interval,
2578 .maxlen = sizeof(int),
2580 .proc_handler = &proc_dointvec_jiffies,
2581 .strategy = &sysctl_jiffies,
2584 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2585 .procname = "gc_timeout",
2586 .data = &ip_rt_gc_timeout,
2587 .maxlen = sizeof(int),
2589 .proc_handler = &proc_dointvec_jiffies,
2590 .strategy = &sysctl_jiffies,
2593 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2594 .procname = "gc_interval",
2595 .data = &ip_rt_gc_interval,
2596 .maxlen = sizeof(int),
2598 .proc_handler = &proc_dointvec_jiffies,
2599 .strategy = &sysctl_jiffies,
2602 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2603 .procname = "redirect_load",
2604 .data = &ip_rt_redirect_load,
2605 .maxlen = sizeof(int),
2607 .proc_handler = &proc_dointvec,
2610 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2611 .procname = "redirect_number",
2612 .data = &ip_rt_redirect_number,
2613 .maxlen = sizeof(int),
2615 .proc_handler = &proc_dointvec,
2618 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2619 .procname = "redirect_silence",
2620 .data = &ip_rt_redirect_silence,
2621 .maxlen = sizeof(int),
2623 .proc_handler = &proc_dointvec,
2626 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2627 .procname = "error_cost",
2628 .data = &ip_rt_error_cost,
2629 .maxlen = sizeof(int),
2631 .proc_handler = &proc_dointvec,
2634 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2635 .procname = "error_burst",
2636 .data = &ip_rt_error_burst,
2637 .maxlen = sizeof(int),
2639 .proc_handler = &proc_dointvec,
2642 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2643 .procname = "gc_elasticity",
2644 .data = &ip_rt_gc_elasticity,
2645 .maxlen = sizeof(int),
2647 .proc_handler = &proc_dointvec,
2650 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2651 .procname = "mtu_expires",
2652 .data = &ip_rt_mtu_expires,
2653 .maxlen = sizeof(int),
2655 .proc_handler = &proc_dointvec_jiffies,
2656 .strategy = &sysctl_jiffies,
2659 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2660 .procname = "min_pmtu",
2661 .data = &ip_rt_min_pmtu,
2662 .maxlen = sizeof(int),
2664 .proc_handler = &proc_dointvec,
2667 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2668 .procname = "min_adv_mss",
2669 .data = &ip_rt_min_advmss,
2670 .maxlen = sizeof(int),
2672 .proc_handler = &proc_dointvec,
2675 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2676 .procname = "secret_interval",
2677 .data = &ip_rt_secret_interval,
2678 .maxlen = sizeof(int),
2680 .proc_handler = &proc_dointvec_jiffies,
2681 .strategy = &sysctl_jiffies,
2687 #ifdef CONFIG_NET_CLS_ROUTE
2688 struct ip_rt_acct *ip_rt_acct;
2690 /* This code sucks. But you should have seen it before! --RR */
2692 /* IP route accounting ptr for this logical cpu number. */
2693 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
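
/*
 * Layout note with a tiny example (not original text): ip_rt_acct is one
 * flat allocation of NR_CPUS blocks of 256 counters, so IP_RT_ACCT_CPU(2)
 * is simply ip_rt_acct + 512; the proc reader below copies cpu 0's slice
 * and then adds every other cpu's values at the same offsets, one word
 * at a time.
 */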
2695 #ifdef CONFIG_PROC_FS
2696 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2697 int length, int *eof, void *data)
2701 if ((offset & 3) || (length & 3))
2704 if (offset >= sizeof(struct ip_rt_acct) * 256) {
2709 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2710 length = sizeof(struct ip_rt_acct) * 256 - offset;
2714 offset /= sizeof(u32);
2717 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2718 u32 *dst = (u32 *) buffer;
2720 /* Copy first cpu. */
2722 memcpy(dst, src, length);
2724 /* Add the other cpus in, one int at a time */
2728 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2730 for (j = 0; j < length/4; j++)
2736 #endif /* CONFIG_PROC_FS */
2737 #endif /* CONFIG_NET_CLS_ROUTE */
2739 static __initdata unsigned long rhash_entries;
2740 static int __init set_rhash_entries(char *str)
2744 rhash_entries = simple_strtoul(str, &str, 0);
2747 __setup("rhash_entries=", set_rhash_entries);
2749 int __init ip_rt_init(void)
2751 int i, order, goal, rc = 0;
2753 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2754 (jiffies ^ (jiffies >> 7)));
2756 #ifdef CONFIG_NET_CLS_ROUTE
2758 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2760 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2762 panic("IP: failed to allocate ip_rt_acct\n");
2763 memset(ip_rt_acct, 0, PAGE_SIZE << order);
2766 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2767 sizeof(struct rtable),
2768 0, SLAB_HWCACHE_ALIGN,
2771 if (!ipv4_dst_ops.kmem_cachep)
2772 panic("IP: failed to allocate ip_dst_cache\n");
2774 goal = num_physpages >> (26 - PAGE_SHIFT);
2776 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
2777 for (order = 0; (1UL << order) < goal; order++)
2781 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2782 sizeof(struct rt_hash_bucket);
2783 while (rt_hash_mask & (rt_hash_mask - 1))
2785 rt_hash_table = (struct rt_hash_bucket *)
2786 __get_free_pages(GFP_ATOMIC, order);
2787 } while (rt_hash_table == NULL && --order > 0);
2790 panic("Failed to allocate IP route cache hash table\n");
2792 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2794 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2796 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2800 for (i = 0; i <= rt_hash_mask; i++) {
2801 rt_hash_table[i].lock = SPIN_LOCK_UNLOCKED;
2802 rt_hash_table[i].chain = NULL;
2805 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2806 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2808 rt_cache_stat = alloc_percpu(struct rt_cache_stat);
2815 init_timer(&rt_flush_timer);
2816 rt_flush_timer.function = rt_run_flush;
2817 init_timer(&rt_periodic_timer);
2818 rt_periodic_timer.function = rt_check_expire;
2819 init_timer(&rt_secret_timer);
2820 rt_secret_timer.function = rt_secret_rebuild;
	/* All the timers, started at system startup, tend
	   to synchronize. Perturb them a bit.
	 */
	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
					ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);
2829 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2830 ip_rt_secret_interval;
2831 add_timer(&rt_secret_timer);
2833 #ifdef CONFIG_PROC_FS
2834 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2835 !proc_net_fops_create("rt_cache_stat", S_IRUGO, &rt_cpu_seq_fops)) {
2836 free_percpu(rt_cache_stat);
2840 #ifdef CONFIG_NET_CLS_ROUTE
2841 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
2851 EXPORT_SYMBOL(__ip_select_ident);
2852 EXPORT_SYMBOL(ip_route_input);
2853 EXPORT_SYMBOL(ip_route_output_key);