2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * IPv4 Forwarding Information Base: semantics.
8 * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 #include <linux/config.h>
19 #include <asm/uaccess.h>
20 #include <asm/system.h>
21 #include <linux/bitops.h>
22 #include <linux/types.h>
23 #include <linux/kernel.h>
24 #include <linux/jiffies.h>
26 #include <linux/string.h>
27 #include <linux/socket.h>
28 #include <linux/sockios.h>
29 #include <linux/errno.h>
31 #include <linux/inet.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/netlink.h>
37 #include <linux/init.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
44 #include <net/ip_fib.h>
46 #include "fib_lookup.h"
/* Debug printout hook; compiled out (expands to nothing). */
#define FSprintk(a...)

/* Protects fib_info_hash, fib_info_laddrhash and fib_info_devhash. */
static rwlock_t fib_info_lock = RW_LOCK_UNLOCKED;
/* Hash of all fib_info records, keyed by fib_info_hashfn(). */
static struct hlist_head *fib_info_hash;
/* Hash of fib_info records that carry a preferred source address,
 * keyed by fib_laddr_hashfn(fib_prefsrc). */
static struct hlist_head *fib_info_laddrhash;
/* Current bucket count of the two hashes above (power of two). */
static unsigned int fib_hash_size;
/* Number of live fib_info records; compared against fib_hash_size to
 * decide when to grow the hashes (see fib_create_info). */
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
/* Fixed-size hash of nexthops keyed by device ifindex
 * (fib_devindex_hashfn); used by ip_fib_check_default and the
 * fib_sync_* device event handlers. */
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Serializes updates to the per-nexthop power/weight state used by
 * multipath selection (see fib_select_multipath, fib_sync_down). */
static spinlock_t fib_multipath_lock = SPIN_LOCK_UNLOCKED;

/* Iterate over every nexthop of a fib_info.  Opens a block declaring
 * 'nhsel' (index) and 'nh' (const pointer to the current nexthop);
 * must be closed with endfor_nexthops(). */
#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

/* Same as for_nexthops(), but 'nh' is non-const so the loop body may
 * modify the nexthop. */
#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Single-nexthop build: hope that gcc will optimize away the dummy
 * one-iteration loop. */

#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the block opened by for_nexthops()/change_nexthops(). */
#define endfor_nexthops(fi) }
/* Per-route-type properties: the narrowest scope allowed for routes of
 * that type (checked in fib_create_info and fib_semantic_match) and,
 * for unreachable-style types, the error a lookup returns.
 * NOTE(review): the struct declaration and most .error initializers
 * fall outside this view; entry order follows the RTN_* enum. */
} fib_props[RTA_MAX + 1] = {
		.scope = RT_SCOPE_NOWHERE,
		.scope = RT_SCOPE_UNIVERSE,
		.scope = RT_SCOPE_HOST,
		.scope = RT_SCOPE_LINK,
	}, /* RTN_BROADCAST */
		.scope = RT_SCOPE_LINK,
		.scope = RT_SCOPE_UNIVERSE,
	}, /* RTN_MULTICAST */
		.scope = RT_SCOPE_UNIVERSE,
	}, /* RTN_BLACKHOLE */
		.error = -EHOSTUNREACH,
		.scope = RT_SCOPE_UNIVERSE,
	}, /* RTN_UNREACHABLE */
		.scope = RT_SCOPE_UNIVERSE,
	}, /* RTN_PROHIBIT */
		.scope = RT_SCOPE_UNIVERSE,
		.scope = RT_SCOPE_NOWHERE,
		.scope = RT_SCOPE_NOWHERE,
	}, /* RTN_XRESOLVE */
/* Release a nexthop info record */

/* Final destructor for a fib_info, to be called only once the record
 * has been marked dead.  Drops per-nexthop device references and frees
 * the record.  NOTE(review): the early return, dev_put() calls and the
 * kfree() are not visible in this view — confirm against full source. */
void free_fib_info(struct fib_info *fi)
	/* Refuse to free a record that was never marked dead. */
	if (fi->fib_dead == 0) {
		printk("Freeing alive fib_info %p\n", fi);
	change_nexthops(fi) {
	} endfor_nexthops(fi);
/* Drop one tree reference on @fi; when the last one goes, unlink the
 * record from the info hash, the local-address hash and the per-device
 * nexthop hash, all under fib_info_lock.
 * NOTE(review): the guards around the lhash/nh_hash deletions (and the
 * fib_dead marking) are not visible in this view. */
void fib_release_info(struct fib_info *fi)
	write_lock(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			hlist_del(&nh->nh_hash);
		} endfor_nexthops(fi)
	write_unlock(&fib_info_lock);
/* Compare the nexthop vectors of two fib_infos.  Used by
 * fib_find_info(); a result of 0 means every nexthop matches on oif,
 * gateway, scope and (when configured) weight and traffic class id.
 * Flags are compared with the RTNH_F_DEAD bit masked out, so liveness
 * does not affect equality. */
static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
	/* Walk both nexthop arrays in lockstep. */
	const struct fib_nh *onh = ofi->fib_nh;

	if (nh->nh_oif != onh->nh_oif ||
	    nh->nh_gw != onh->nh_gw ||
	    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	    nh->nh_weight != onh->nh_weight ||
#ifdef CONFIG_NET_CLS_ROUTE
	    nh->nh_tclassid != onh->nh_tclassid ||
	    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
} endfor_nexthops(fi);
/* Hash a fib_info into fib_info_hash: XOR-mix the nexthop count,
 * protocol, preferred source and priority, fold the high bits down,
 * and mask to the current (power-of-two) table size. */
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
	val ^= fi->fib_prefsrc;
	val ^= fi->fib_priority;

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
/* Search fib_info_hash for an existing record equivalent to @nfi.
 * Two records are equivalent when the nexthop counts, protocol,
 * preferred source, priority and metrics all match, flags differ at
 * most in RTNH_F_DEAD, and every nexthop compares equal (nh_comp).
 * Used by fib_create_info() to share records instead of duplicating. */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
	struct hlist_head *head;
	struct hlist_node *node;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		/* Cheap reject first: different nexthop counts. */
		if (fi->fib_nhs != nfi->fib_nhs)
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
/* Hash a device ifindex into the fixed-size fib_info_devhash by
 * folding the value down by DEVINDEX_HASHBITS twice and masking. */
static inline unsigned int fib_devindex_hashfn(unsigned int val)
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
/* Check that the gateway is already configured.
   Used only by the redirect accept routine.
 */

/* Walks the per-device nexthop hash under fib_info_lock looking for an
 * alive (not RTNH_F_DEAD) nexthop on @dev.  NOTE(review): the gateway
 * comparison against @gw and the return statements are not visible in
 * this view — presumably returns 0 on a match, nonzero otherwise. */
int ip_fib_check_default(u32 gw, struct net_device *dev)
	struct hlist_head *head;
	struct hlist_node *node;

	read_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    !(nh->nh_flags&RTNH_F_DEAD)) {
			read_unlock(&fib_info_lock);

	read_unlock(&fib_info_lock);
273 #ifdef CONFIG_IP_ROUTE_MULTIPATH
/* Scan a nested rtattr block for the first attribute of @type and
 * return its payload as a u32.  Callers (fib_get_nhs, fib_nh_match)
 * treat 0 as "attribute absent"; the final return statement itself is
 * not visible in this view. */
static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
	while (RTA_OK(attr,attrlen)) {
		if (attr->rta_type == type)
			return *(u32*)RTA_DATA(attr);
		attr = RTA_NEXT(attr, attrlen);
/* Count the rtnexthop entries packed inside an RTA_MULTIPATH
 * attribute, walking the payload entry by entry.  A running length
 * going negative means the attribute is malformed.  NOTE(review): the
 * counter increment and return statements are outside this view. */
fib_count_nexthops(struct rtattr *rta)
	struct rtnexthop *nhp = RTA_DATA(rta);
	int nhlen = RTA_PAYLOAD(rta);

	while (nhlen >= (int)sizeof(struct rtnexthop)) {
		if ((nhlen -= nhp->rtnh_len) < 0)
		nhp = RTNH_NEXT(nhp);
/* Fill the nexthop array of @fi from an RTA_MULTIPATH attribute.
 * For each rtnexthop entry: flags combine the non-low bits of
 * rtm_flags with the per-hop flags, the weight is rtnh_hops + 1, and
 * the optional per-hop RTA_GATEWAY / RTA_FLOW attributes supply the
 * gateway address and the traffic class id. */
fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
	struct rtnexthop *nhp = RTA_DATA(rta);
	int nhlen = RTA_PAYLOAD(rta);

	change_nexthops(fi) {
		int attrlen = nhlen - sizeof(struct rtnexthop);
		/* Bail out on a truncated or malformed entry. */
		if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
		nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
		nh->nh_oif = nhp->rtnh_ifindex;
		nh->nh_weight = nhp->rtnh_hops + 1;

		nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
#ifdef CONFIG_NET_CLS_ROUTE
		nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);

		nhp = RTNH_NEXT(nhp);
	} endfor_nexthops(fi);
/* Decide whether the route request described by @r/@rta matches the
 * nexthop configuration of @fi: priority first, then either the single
 * oif/gateway pair, or — for a multipath request — every rtnexthop
 * entry compared on ifindex, gateway and (when configured) class id.
 * NOTE(review): the return statements are not visible in this view;
 * by convention 0 would mean "matches". */
int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *nhp;

	if (rta->rta_priority &&
	    *rta->rta_priority != fi->fib_priority)

	/* Single-nexthop comparison: oif and/or gateway, if given. */
	if (rta->rta_oif || rta->rta_gw) {
		if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
		    (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (rta->rta_mp == NULL)
	nhp = RTA_DATA(rta->rta_mp);
	nhlen = RTA_PAYLOAD(rta->rta_mp);

		/* Compare each requested hop against the stored one. */
		int attrlen = nhlen - sizeof(struct rtnexthop);

		if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
		if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)

		gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
		if (gw && gw != nh->nh_gw)
#ifdef CONFIG_NET_CLS_ROUTE
		gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
		if (gw && gw != nh->nh_tclassid)

		nhp = RTNH_NEXT(nhp);
	} endfor_nexthops(fi);
381 Semantics of nexthop is very messy by historical reasons.
382 We have to take into account, that:
383 a) gateway can be actually local interface address,
384 so that gatewayed route is direct.
385 b) gateway must be on-link address, possibly
386 described not by an ifaddr, but also by a direct route.
387 c) If both gateway and interface are specified, they should not
389 d) If we use tunnel routes, gateway could be not on-link.
391 Attempt to reconcile all of these (alas, self-contradictory) conditions
392 results in pretty ugly and hairy code with obscure logic.
   I chose to generalize it instead, so that the size
395 of code does not increase practically, but it becomes
397 Every prefix is assigned a "scope" value: "host" is local address,
398 "link" is direct route,
399 [ ... "site" ... "interior" ... ]
400 and "universe" is true gateway route with global meaning.
402 Every prefix refers to a set of "nexthop"s (gw, oif),
403 where gw must have narrower scope. This recursion stops
404 when gw has LOCAL scope or if "nexthop" is declared ONLINK,
405 which means that gw is forced to be on link.
407 Code is still hairy, but now it is apparently logically
408 consistent and very flexible. F.e. as by-product it allows
   to co-exist in peace independent exterior and interior
412 Normally it looks as following.
414 {universe prefix} -> (gw, oif) [scope link]
416 |-> {link prefix} -> (gw, oif) [scope local]
418 |-> {local prefix} (terminal node)
/* Resolve and validate one nexthop of a route being created:
 *  - RTNH_F_PERVASIVE hops skip validation (CONFIG_IP_ROUTE_PERVASIVE);
 *  - RTNH_F_ONLINK hops require a unicast gateway and an UP device
 *    found by nh_oif, and are assigned RT_SCOPE_LINK;
 *  - gatewayed hops are resolved with fib_lookup() on the gateway at a
 *    strictly narrower scope; the result must be unicast or local, and
 *    its scope, oif and device (reference taken) are recorded;
 *  - direct (no-gateway) hops take their device from nh_oif via
 *    inetdev_by_index() and are assigned RT_SCOPE_HOST.
 * NOTE(review): many error returns in this function fall outside this
 * view. */
static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
	struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
	if (nh->nh_flags&RTNH_F_PERVASIVE)
	if (nh->nh_flags&RTNH_F_ONLINK) {
		struct net_device *dev;

		/* ONLINK only makes sense for scopes wider than link. */
		if (r->rtm_scope >= RT_SCOPE_LINK)
		if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
		if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
		if (!(dev->flags&IFF_UP))
		nh->nh_scope = RT_SCOPE_LINK;
		/* Recursive gateway resolution: look the gateway up at a
		 * narrower scope than the route itself. */
		struct flowi fl = { .nl_u = { .ip4_u =
				    { .daddr = nh->nh_gw,
				      .scope = r->rtm_scope + 1 } },

		/* It is not necessary, but requires a bit of thinking */
		if (fl.fl4_scope < RT_SCOPE_LINK)
			fl.fl4_scope = RT_SCOPE_LINK;
		if ((err = fib_lookup(&fl, &res)) != 0)
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
		dev_hold(nh->nh_dev);
		if (!(nh->nh_dev->flags & IFF_UP))
		/* Direct route: no gateway, device named by nh_oif. */
		struct in_device *in_dev;

		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))

		in_dev = inetdev_by_index(nh->nh_oif);
		if (!(in_dev->dev->flags&IFF_UP)) {
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
/* Hash a local (preferred source) address into fib_info_laddrhash,
 * masking to the current power-of-two table size. */
static inline unsigned int fib_laddr_hashfn(u32 val)
	unsigned int mask = (fib_hash_size - 1);

	return (val ^ (val >> 7) ^ (val >> 14)) & mask;
/* Allocate @bytes bytes for a hash table: kmalloc for up to one page,
 * whole pages from the buddy allocator beyond that.  Must be released
 * with the matching fib_hash_free(). */
static struct hlist_head *fib_hash_alloc(int bytes)
	if (bytes <= PAGE_SIZE)
		return kmalloc(bytes, GFP_KERNEL);
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(bytes));
/* Release a table from fib_hash_alloc(), choosing kfree or
 * free_pages to mirror how it was allocated.  NOTE(review): the NULL
 * check and the kfree branch are not visible in this view. */
static void fib_hash_free(struct hlist_head *hash, int bytes)
	if (bytes <= PAGE_SIZE)
		free_pages((unsigned long) hash, get_order(bytes));
/* Rehash every fib_info from the current info and local-address tables
 * into the freshly allocated larger tables and install them, all under
 * fib_info_lock.  Called from fib_create_info() when the tables need
 * to grow.  NOTE(review): freeing of the old tables is not visible in
 * this view. */
static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
	unsigned int old_size = fib_hash_size;

	write_lock(&fib_info_lock);
	/* New size must be published before rehashing so the hash
	 * functions mask with the new table size. */
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);

	fib_info_hash = new_info_hash;

	/* Same dance for the preferred-source-address hash. */
	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);

	fib_info_laddrhash = new_laddrhash;

	write_unlock(&fib_info_lock);
/* Build (or reuse) a fib_info from a netlink route request.
 * Steps, as visible here:
 *  1. reject requests whose scope is narrower than the route type
 *     allows (fib_props);
 *  2. for multipath requests, count the nexthops;
 *  3. grow the info/laddr hash tables when fib_info_cnt has reached
 *     fib_hash_size;
 *  4. allocate and zero fi with nhs trailing fib_nh slots, then copy
 *     protocol, flags, priority, metrics and preferred source;
 *  5. fill the nexthop(s) from RTA_MULTIPATH (fib_get_nhs) or from the
 *     plain oif/gw/flow attributes, cross-checking them against each
 *     other;
 *  6. validate: error-type routes must carry no gw/oif/mp; host-scope
 *     routes must have exactly one gateway-less nexthop; all other
 *     nexthops go through fib_check_nh();
 *  7. a preferred source must be a local address (unless it equals the
 *     RTN_LOCAL destination);
 *  8. if an equivalent record already exists (fib_find_info) reuse it,
 *     otherwise take a clntref and link the new record into the info,
 *     local-address and per-device hashes under fib_info_lock.
 * NOTE(review): error/cleanup paths and several returns are not
 * visible in this view; *errp is presumably set on failure. */
fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
		const struct nlmsghdr *nlh, int *errp)
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
#ifdef CONFIG_IP_ROUTE_MULTIPATH

	/* Fast check to catch the most weird cases */
	if (fib_props[r->rtm_type].scope > r->rtm_scope)

#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nhs = fib_count_nexthops(rta->rta_mp);

	/* Grow the hash tables before they get overloaded. */
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;

		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
			memset(new_info_hash, 0, bytes);
			memset(new_laddrhash, 0, bytes);

			fib_hash_move(new_info_hash, new_laddrhash, new_size);

	/* fib_nh slots live in the same allocation, after the struct. */
	fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));

	fi->fib_protocol = r->rtm_protocol;

	change_nexthops(fi) {
	} endfor_nexthops(fi)

	fi->fib_flags = r->rtm_flags;
	if (rta->rta_priority)
		fi->fib_priority = *rta->rta_priority;
		/* Copy RTA_METRICS sub-attributes into fib_metrics[]. */
		int attrlen = RTA_PAYLOAD(rta->rta_mx);
		struct rtattr *attr = RTA_DATA(rta->rta_mx);

		while (RTA_OK(attr, attrlen)) {
			unsigned flavor = attr->rta_type;
				if (flavor > RTAX_MAX)
				fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);

			attr = RTA_NEXT(attr, attrlen);

	if (rta->rta_prefsrc)
		memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);

#ifdef CONFIG_IP_ROUTE_MULTIPATH
		if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
		/* Plain attributes must agree with the multipath data. */
		if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
		if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
#ifdef CONFIG_NET_CLS_ROUTE
		if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
		/* Single-nexthop route: fill from plain attributes. */
		struct fib_nh *nh = fi->fib_nh;
			nh->nh_oif = *rta->rta_oif;
			memcpy(&nh->nh_gw, rta->rta_gw, 4);
#ifdef CONFIG_NET_CLS_ROUTE
			memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
		nh->nh_flags = r->rtm_flags;
#ifdef CONFIG_IP_ROUTE_MULTIPATH

	if (fib_props[r->rtm_type].error) {
		/* Error routes (unreachable etc.) take no nexthop data. */
		if (rta->rta_gw || rta->rta_oif || rta->rta_mp)

	if (r->rtm_scope > RT_SCOPE_HOST)

	if (r->rtm_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);

		if (nh->nh_dev == NULL)
		change_nexthops(fi) {
			if ((err = fib_check_nh(r, fi, nh)) != 0)
		} endfor_nexthops(fi)

	if (fi->fib_prefsrc) {
		if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
		    memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
			if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)

	if ((ofi = fib_find_info(fi)) != NULL) {

	atomic_inc(&fi->fib_clntref);
	write_lock(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);

	change_nexthops(fi) {
		struct hlist_head *head;

		hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nh->nh_hash, head);
	} endfor_nexthops(fi)
	write_unlock(&fib_info_lock);
/* Walk the alias list of a matched prefix and fill @res with the first
 * alias that agrees with the flow's TOS and scope.  Routes whose
 * fib_info is flagged dead are skipped; for route types with nexthops,
 * the first alive nexthop honouring flp->oif is chosen.  The matched
 * alias is flagged FA_S_ACCESSED and a reference is taken on the
 * fib_info stored in @res.  NOTE(review): return statements and some
 * branches are not visible in this view. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, int prefixlen)
	struct fib_alias *fa;

	list_for_each_entry(fa, head, fa_list) {
		/* TOS must match exactly (when the alias specifies one). */
		    fa->fa_tos != flp->fl4_tos)
		/* Alias scope must be at least as wide as requested. */
		if (fa->fa_scope < flp->fl4_scope)

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)

			switch (fa->fa_type) {
					if (nh->nh_flags&RTNH_F_DEAD)
					if (!flp->oif || flp->oif == nh->nh_oif)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
				/* Unknown route type: should not happen. */
				printk(KERN_DEBUG "impossible 102\n");

	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
	atomic_inc(&res->fi->fib_clntref);
/* Find appropriate source address to this destination */

/* Pick a source address for @res by asking the result's output device
 * for an address whose scope fits the result (inet_select_addr),
 * using the gateway as the "about" address. */
u32 __fib_res_prefsrc(struct fib_result *res)
	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
/* Build one rtnetlink route message into @skb: the rtmsg header, then
 * RTA_DST, RTA_PRIORITY, RTA_FLOW, metrics and RTA_PREFSRC as
 * applicable; a plain RTA_GATEWAY/RTA_OIF pair for single-nexthop
 * routes, or a nested RTA_MULTIPATH block listing every nexthop
 * otherwise.  On buffer overflow the message is trimmed back to the
 * start (@b) — the nlmsg_failure/rtattr_failure labels targeted by
 * NLMSG_PUT/RTA_PUT are not visible in this view. */
fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
	      u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
	rtm = NLMSG_DATA(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_table = tb_id;
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	if (rtm->rtm_dst_len)
		RTA_PUT(skb, RTA_DST, 4, dst);
	rtm->rtm_protocol = fi->fib_protocol;
	if (fi->fib_priority)
		RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
#ifdef CONFIG_NET_CLS_ROUTE
	if (fi->fib_nh[0].nh_tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
	if (fi->fib_nhs == 1) {
		/* Single nexthop: flat gateway/oif attributes. */
		if (fi->fib_nh->nh_gw)
			RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
		if (fi->fib_nh->nh_oif)
			RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		/* Multipath: nested RTA_MULTIPATH with one rtnexthop
		 * (plus optional per-hop gateway) per nexthop.  Tailroom
		 * is checked by hand because the header length is only
		 * known once the hop is fully appended. */
		struct rtnexthop *nhp;
		struct rtattr *mp_head;
		if (skb_tailroom(skb) <= RTA_SPACE(0))
		mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));

			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
			nhp->rtnh_flags = nh->nh_flags & 0xFF;
			nhp->rtnh_hops = nh->nh_weight-1;
			nhp->rtnh_ifindex = nh->nh_oif;
				RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
			/* Hop length is whatever was appended since nhp. */
			nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
		} endfor_nexthops(fi);
		mp_head->rta_type = RTA_MULTIPATH;
		mp_head->rta_len = skb->tail - (u8*)mp_head;

	nlh->nlmsg_len = skb->tail - b;
	/* Overflow path: drop everything written for this message. */
	skb_trim(skb, b - skb->data);
911 #ifndef CONFIG_IP_NOSIOCRT
/* Translate a legacy SIOCADDRT/SIOCDELRT struct rtentry into the
 * netlink representation (rtmsg header + kern_rta attribute pointers)
 * so the ioctl path can share the netlink machinery.  Visible steps:
 * validate family and netmask, derive the prefix length, fill in the
 * nlmsghdr/rtmsg for add or delete, map metric/device/preferred
 * source, classify gateway vs. direct routes by scope, and convert the
 * legacy MTU/WINDOW/IRTT values into an RTA_METRICS block.
 * NOTE(review): numerous error returns and assignments fall outside
 * this view. */
fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
		    struct kern_rta *rta, struct rtentry *r)
	memset(rtm, 0, sizeof(*rtm));
	memset(rta, 0, sizeof(*rta));

	if (r->rt_dst.sa_family != AF_INET)
		return -EAFNOSUPPORT;

	/* Check mask for validity:
	   a) it must be contiguous.
	   b) destination must have all host bits clear.
	   c) if application forgot to set correct family (AF_INET),
	      reject request unless it is absolutely clear i.e.
	      both family and mask are zero.
	 */
	ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
	if (!(r->rt_flags&RTF_HOST)) {
		u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
		if (r->rt_genmask.sa_family != AF_INET) {
			if (mask || r->rt_genmask.sa_family)
				return -EAFNOSUPPORT;
		if (bad_mask(mask, *ptr))
		plen = inet_mask_len(mask);

	nl->nlmsg_flags = NLM_F_REQUEST;
	nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
	if (cmd == SIOCDELRT) {
		nl->nlmsg_type = RTM_DELROUTE;
		nl->nlmsg_type = RTM_NEWROUTE;
		nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
		rtm->rtm_protocol = RTPROT_BOOT;

	rtm->rtm_dst_len = plen;

		/* Legacy metric is 1-based; stash the 0-based priority in
		 * the rtentry padding so rta_priority can point at it. */
		*(u32*)&r->rt_pad3 = r->rt_metric - 1;
		rta->rta_priority = (u32*)&r->rt_pad3;

	if (r->rt_flags&RTF_REJECT) {
		rtm->rtm_scope = RT_SCOPE_HOST;
		rtm->rtm_type = RTN_UNREACHABLE;

	rtm->rtm_scope = RT_SCOPE_NOWHERE;
	rtm->rtm_type = RTN_UNICAST;

		struct net_device *dev;
		char devname[IFNAMSIZ];

		if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
		devname[IFNAMSIZ-1] = 0;
		/* "ethX:N" names an aliased address on the device. */
		colon = strchr(devname, ':');
		dev = __dev_get_by_name(devname);
		rta->rta_oif = &dev->ifindex;
			struct in_ifaddr *ifa;
			struct in_device *in_dev = __in_dev_get(dev);
			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
				if (strcmp(ifa->ifa_label, devname) == 0)
			rta->rta_prefsrc = &ifa->ifa_local;

	ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
	if (r->rt_gateway.sa_family == AF_INET && *ptr) {
		/* A real (unicast) gateway makes this a universe-scope
		 * route; otherwise scope stays narrower. */
		if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
			rtm->rtm_scope = RT_SCOPE_UNIVERSE;

	if (cmd == SIOCDELRT)

	if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)

	if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
		rtm->rtm_scope = RT_SCOPE_LINK;

	if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
		/* Build an RTA_METRICS attribute holding up to three
		 * sub-attributes converted from the legacy fields. */
		struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);

		mx->rta_type = RTA_METRICS;
		mx->rta_len = RTA_LENGTH(0);
		if (r->rt_flags&RTF_MTU) {
			rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
			rec->rta_type = RTAX_ADVMSS;
			rec->rta_len = RTA_LENGTH(4);
			mx->rta_len += RTA_LENGTH(4);
			/* MTU is carried as advertised MSS: MTU minus 40
			 * bytes of IPv4+TCP headers. */
			*(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
		if (r->rt_flags&RTF_WINDOW) {
			rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
			rec->rta_type = RTAX_WINDOW;
			rec->rta_len = RTA_LENGTH(4);
			mx->rta_len += RTA_LENGTH(4);
			*(u32*)RTA_DATA(rec) = r->rt_window;
		if (r->rt_flags&RTF_IRTT) {
			rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
			rec->rta_type = RTAX_RTT;
			rec->rta_len = RTA_LENGTH(4);
			mx->rta_len += RTA_LENGTH(4);
			/* Legacy initial RTT is in a different unit; the
			 * <<3 scaling converts it for RTAX_RTT. */
			*(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
/* Route event handling:
   - local address disappeared -> we must delete all the entries
     referring to it;
   - device went down -> we must shutdown all nexthops going via it.
 */
int fib_sync_down(u32 local, struct net_device *dev, int force)
	int scope = RT_SCOPE_NOWHERE;

	/* Pass 1: kill every fib_info whose preferred source is the
	 * vanished local address. */
	if (local && fib_info_laddrhash) {
		unsigned int hash = fib_laddr_hashfn(local);
		struct hlist_head *head = &fib_info_laddrhash[hash];
		struct hlist_node *node;
		struct fib_info *fi;

		hlist_for_each_entry(fi, node, head, fib_lhash) {
			if (fi->fib_prefsrc == local) {
				fi->fib_flags |= RTNH_F_DEAD;

		/* Pass 2: kill nexthops that go through @dev. */
		struct fib_info *prev_fi = NULL;
		unsigned int hash = fib_devindex_hashfn(dev->ifindex);
		struct hlist_head *head = &fib_info_devhash[hash];
		struct hlist_node *node;

		hlist_for_each_entry(nh, node, head, nh_hash) {
			struct fib_info *fi = nh->nh_parent;

			BUG_ON(!fi->fib_nhs);
			/* Skip hash collisions and fib_infos already
			 * handled via a previous nexthop. */
			if (nh->nh_dev != dev || fi == prev_fi)

			change_nexthops(fi) {
				if (nh->nh_flags&RTNH_F_DEAD)
				else if (nh->nh_dev == dev &&
					 nh->nh_scope != scope) {
					nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
					/* Remove the dead hop's share from
					 * the multipath power budget. */
					spin_lock_bh(&fib_multipath_lock);
					fi->fib_power -= nh->nh_power;
					spin_unlock_bh(&fib_multipath_lock);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (force > 1 && nh->nh_dev == dev) {
			} endfor_nexthops(fi)
			/* All nexthops dead -> the whole route is dead. */
			if (dead == fi->fib_nhs) {
				fi->fib_flags |= RTNH_F_DEAD;
1131 #ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
   Dead device goes up. We wake up dead nexthops.
   It makes sense only on multipath routes.
 */
int fib_sync_up(struct net_device *dev)
	struct fib_info *prev_fi;
	struct hlist_head *head;
	struct hlist_node *node;

	if (!(dev->flags&IFF_UP))

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;

		BUG_ON(!fi->fib_nhs);
		/* Skip hash collisions and already-visited fib_infos. */
		if (nh->nh_dev != dev || fi == prev_fi)

		change_nexthops(fi) {
			/* Already alive: nothing to revive. */
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
			/* Only revive hops on @dev, and only if it still
			 * has IPv4 configuration. */
			if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)

			spin_lock_bh(&fib_multipath_lock);
			nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

			fi->fib_flags &= ~RTNH_F_DEAD;
/*
   The algorithm is suboptimal, but it provides really
   fair weighted route distribution.
 */
void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
	struct fib_info *fi = res->fi;

	spin_lock_bh(&fib_multipath_lock);
	/* Power budget exhausted: recharge every alive nexthop with its
	 * configured weight. */
	if (fi->fib_power <= 0) {
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				power += nh->nh_weight;
				nh->nh_power = nh->nh_weight;
		} endfor_nexthops(fi);
		fi->fib_power = power;
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */

	/* w should be random number [0..fi->fib_power-1],
	   it is pretty bad approximation.
	 */
	w = jiffies % fi->fib_power;

	/* Walk the alive nexthops, spending each hop's remaining power
	 * until w is used up; the hop that crosses zero is selected. */
	change_nexthops(fi) {
		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
			if ((w -= nh->nh_power) <= 0) {
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	spin_unlock_bh(&fib_multipath_lock);