2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * IPv4 Forwarding Information Base: semantics.
8 * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 #include <linux/config.h>
19 #include <asm/uaccess.h>
20 #include <asm/system.h>
21 #include <asm/bitops.h>
22 #include <linux/types.h>
23 #include <linux/kernel.h>
24 #include <linux/jiffies.h>
26 #include <linux/string.h>
27 #include <linux/socket.h>
28 #include <linux/sockios.h>
29 #include <linux/errno.h>
31 #include <linux/inet.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/netlink.h>
37 #include <linux/init.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
44 #include <net/ip_fib.h>
46 #include "fib_lookup.h"
/* Debug printk hook for FIB semantics; expands to nothing by default. */
48 #define FSprintk(a...)

/* fib_info_lock serializes writers of the hash tables declared below. */
50 static rwlock_t fib_info_lock = RW_LOCK_UNLOCKED;
/* fib_info objects hashed by fib_info_hashfn(). */
51 static struct hlist_head *fib_info_hash;
/* fib_info objects that carry a preferred source address, hashed by
 * fib_laddr_hashfn(). */
52 static struct hlist_head *fib_info_laddrhash;
/* Current bucket count of the two tables above; used as (size - 1)
 * masks in the hash functions and doubled in fib_create_info(). */
53 static unsigned int fib_hash_size;
/* Number of fib_info objects currently allocated. */
54 static unsigned int fib_info_cnt;

/* Fixed-size table of nexthops hashed by output-device ifindex. */
56 #define DEVINDEX_HASHBITS 8
57 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
58 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
60 #ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Protects the fib_power/nh_power accounting used by multipath
 * selection (see fib_select_multipath / fib_sync_down). */
62 static spinlock_t fib_multipath_lock = SPIN_LOCK_UNLOCKED;

/* Iterate over every nexthop of (fi) with a read-only cursor 'nh' and
 * index 'nhsel'.  Opens a block that endfor_nexthops() must close. */
64 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
65 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

/* Same iteration, but with a writable 'nh' cursor. */
67 #define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
68 for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

70 #else /* CONFIG_IP_ROUTE_MULTIPATH */

72 /* Hope, that gcc will optimize it to get rid of dummy loop */

/* Single-nexthop variants: the "loop" runs exactly once. */
74 #define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
75 for (nhsel=0; nhsel < 1; nhsel++)

77 #define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
78 for (nhsel=0; nhsel < 1; nhsel++)

80 #endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the block opened by for_nexthops()/change_nexthops(). */
82 #define endfor_nexthops(fi) }
/* Per-route-type properties indexed by RTN_* type: the error code to
 * return for special route types and the most specific scope permitted.
 * NOTE(review): this extract is incomplete — the struct declaration and
 * several initializer lines are missing; entries below are anchored by
 * their trailing RTN_* comments. */
89 } fib_props[RTA_MAX + 1] = {
92 .scope = RT_SCOPE_NOWHERE,
96 .scope = RT_SCOPE_UNIVERSE,
100 .scope = RT_SCOPE_HOST,
104 .scope = RT_SCOPE_LINK,
105 }, /* RTN_BROADCAST */
108 .scope = RT_SCOPE_LINK,
112 .scope = RT_SCOPE_UNIVERSE,
113 }, /* RTN_MULTICAST */
116 .scope = RT_SCOPE_UNIVERSE,
117 }, /* RTN_BLACKHOLE */
119 .error = -EHOSTUNREACH,
120 .scope = RT_SCOPE_UNIVERSE,
121 }, /* RTN_UNREACHABLE */
124 .scope = RT_SCOPE_UNIVERSE,
125 }, /* RTN_PROHIBIT */
128 .scope = RT_SCOPE_UNIVERSE,
132 .scope = RT_SCOPE_NOWHERE,
136 .scope = RT_SCOPE_NOWHERE,
137 }, /* RTN_XRESOLVE */
141 /* Release a nexthop info record */
143 void free_fib_info(struct fib_info *fi)
145 if (fi->fib_dead == 0) {
146 printk("Freeing alive fib_info %p\n", fi);
149 change_nexthops(fi) {
153 } endfor_nexthops(fi);
158 void fib_release_info(struct fib_info *fi)
160 write_lock(&fib_info_lock);
161 if (fi && --fi->fib_treeref == 0) {
162 hlist_del(&fi->fib_hash);
164 hlist_del(&fi->fib_lhash);
165 change_nexthops(fi) {
166 hlist_del(&nh->nh_hash);
167 } endfor_nexthops(fi)
171 write_unlock(&fib_info_lock);
174 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
176 const struct fib_nh *onh = ofi->fib_nh;
179 if (nh->nh_oif != onh->nh_oif ||
180 nh->nh_gw != onh->nh_gw ||
181 nh->nh_scope != onh->nh_scope ||
182 #ifdef CONFIG_IP_ROUTE_MULTIPATH
183 nh->nh_weight != onh->nh_weight ||
185 #ifdef CONFIG_NET_CLS_ROUTE
186 nh->nh_tclassid != onh->nh_tclassid ||
188 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
191 } endfor_nexthops(fi);
195 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
197 unsigned int mask = (fib_hash_size - 1);
198 unsigned int val = fi->fib_nhs;
200 val ^= fi->fib_protocol;
201 val ^= fi->fib_prefsrc;
202 val ^= fi->fib_priority;
204 return (val ^ (val >> 7) ^ (val >> 12)) & mask;
207 static struct fib_info *fib_find_info(const struct fib_info *nfi)
209 struct hlist_head *head;
210 struct hlist_node *node;
214 hash = fib_info_hashfn(nfi);
215 head = &fib_info_hash[hash];
217 hlist_for_each_entry(fi, node, head, fib_hash) {
218 if (fi->fib_nhs != nfi->fib_nhs)
220 if (nfi->fib_protocol == fi->fib_protocol &&
221 nfi->fib_prefsrc == fi->fib_prefsrc &&
222 nfi->fib_priority == fi->fib_priority &&
223 memcmp(nfi->fib_metrics, fi->fib_metrics,
224 sizeof(fi->fib_metrics)) == 0 &&
225 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
226 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
233 static inline unsigned int fib_devindex_hashfn(unsigned int val)
235 unsigned int mask = DEVINDEX_HASHSIZE - 1;
238 (val >> DEVINDEX_HASHBITS) ^
239 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
242 /* Check, that the gateway is already configured.
243 Used only by redirect accept routine.
246 int ip_fib_check_default(u32 gw, struct net_device *dev)
248 struct hlist_head *head;
249 struct hlist_node *node;
253 read_lock(&fib_info_lock);
255 hash = fib_devindex_hashfn(dev->ifindex);
256 head = &fib_info_devhash[hash];
257 hlist_for_each_entry(nh, node, head, nh_hash) {
258 if (nh->nh_dev == dev &&
260 !(nh->nh_flags&RTNH_F_DEAD)) {
261 read_unlock(&fib_info_lock);
266 read_unlock(&fib_info_lock);
271 #ifdef CONFIG_IP_ROUTE_MULTIPATH
273 static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
275 while (RTA_OK(attr,attrlen)) {
276 if (attr->rta_type == type)
277 return *(u32*)RTA_DATA(attr);
278 attr = RTA_NEXT(attr, attrlen);
284 fib_count_nexthops(struct rtattr *rta)
287 struct rtnexthop *nhp = RTA_DATA(rta);
288 int nhlen = RTA_PAYLOAD(rta);
290 while (nhlen >= (int)sizeof(struct rtnexthop)) {
291 if ((nhlen -= nhp->rtnh_len) < 0)
294 nhp = RTNH_NEXT(nhp);
300 fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
302 struct rtnexthop *nhp = RTA_DATA(rta);
303 int nhlen = RTA_PAYLOAD(rta);
305 change_nexthops(fi) {
306 int attrlen = nhlen - sizeof(struct rtnexthop);
307 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
309 nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
310 nh->nh_oif = nhp->rtnh_ifindex;
311 nh->nh_weight = nhp->rtnh_hops + 1;
313 nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
314 #ifdef CONFIG_NET_CLS_ROUTE
315 nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
318 nhp = RTNH_NEXT(nhp);
319 } endfor_nexthops(fi);
/* Check whether a route change request (r/rta) matches the existing
 * fib_info 'fi': first by priority, then by the single-nexthop
 * oif/gateway, then (with multipath) by walking every nexthop in the
 * RTA_MULTIPATH attribute and comparing ifindex, gateway and
 * (optionally) tclassid.
 * NOTE(review): this extract is missing lines (return statements,
 * braces, the nhlen declaration); do not infer full control flow
 * from what is visible here. */
325 int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
328 #ifdef CONFIG_IP_ROUTE_MULTIPATH
329 struct rtnexthop *nhp;
333 if (rta->rta_priority &&
334 *rta->rta_priority != fi->fib_priority)
337 if (rta->rta_oif || rta->rta_gw) {
338 if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
339 (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
344 #ifdef CONFIG_IP_ROUTE_MULTIPATH
345 if (rta->rta_mp == NULL)
347 nhp = RTA_DATA(rta->rta_mp);
348 nhlen = RTA_PAYLOAD(rta->rta_mp);
351 int attrlen = nhlen - sizeof(struct rtnexthop);
354 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
356 if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
359 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
360 if (gw && gw != nh->nh_gw)
362 #ifdef CONFIG_NET_CLS_ROUTE
363 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
364 if (gw && gw != nh->nh_tclassid)
368 nhp = RTNH_NEXT(nhp);
369 } endfor_nexthops(fi);
379 Semantics of nexthop is very messy for historical reasons.
380 We have to take into account, that:
381 a) gateway can be actually local interface address,
382 so that gatewayed route is direct.
383 b) gateway must be on-link address, possibly
384 described not by an ifaddr, but also by a direct route.
385 c) If both gateway and interface are specified, they should not
387 d) If we use tunnel routes, gateway could be not on-link.
389 Attempt to reconcile all of these (alas, self-contradictory) conditions
390 results in pretty ugly and hairy code with obscure logic.
392 I chose to generalize it instead, so that the size
393 of code does not increase practically, but it becomes
395 Every prefix is assigned a "scope" value: "host" is local address,
396 "link" is direct route,
397 [ ... "site" ... "interior" ... ]
398 and "universe" is true gateway route with global meaning.
400 Every prefix refers to a set of "nexthop"s (gw, oif),
401 where gw must have narrower scope. This recursion stops
402 when gw has LOCAL scope or if "nexthop" is declared ONLINK,
403 which means that gw is forced to be on link.
405 Code is still hairy, but now it is apparently logically
406 consistent and very flexible. F.e. as by-product it allows
407 to co-exist in peace with independent exterior and interior
410 Normally it looks as following.
412 {universe prefix} -> (gw, oif) [scope link]
414 |-> {link prefix} -> (gw, oif) [scope local]
416 |-> {local prefix} (terminal node)
/* Validate and resolve one nexthop of a route being added.
 * With a gateway: either handle the RTNH_F_ONLINK case directly
 * (gateway forced on-link via the given oif) or fib_lookup() the
 * gateway with a narrower scope to derive the output device and the
 * nexthop scope.  Without a gateway: bind to the interface named by
 * nh_oif and use host scope.  A device reference is taken on the
 * resolved nh_dev.
 * NOTE(review): this extract is missing lines (error returns, braces,
 * the else-branch header); comments describe only what is visible. */
419 static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
424 struct fib_result res;
426 #ifdef CONFIG_IP_ROUTE_PERVASIVE
427 if (nh->nh_flags&RTNH_F_PERVASIVE)
430 if (nh->nh_flags&RTNH_F_ONLINK) {
431 struct net_device *dev;
433 if (r->rtm_scope >= RT_SCOPE_LINK)
435 if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
437 if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
439 if (!(dev->flags&IFF_UP))
443 nh->nh_scope = RT_SCOPE_LINK;
447 struct flowi fl = { .nl_u = { .ip4_u =
448 { .daddr = nh->nh_gw,
449 .scope = r->rtm_scope + 1 } },
452 /* It is not necessary, but requires a bit of thinking */
453 if (fl.fl4_scope < RT_SCOPE_LINK)
454 fl.fl4_scope = RT_SCOPE_LINK;
455 if ((err = fib_lookup(&fl, &res)) != 0)
459 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
461 nh->nh_scope = res.scope;
462 nh->nh_oif = FIB_RES_OIF(res);
463 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
465 dev_hold(nh->nh_dev);
467 if (!(nh->nh_dev->flags & IFF_UP))
474 struct in_device *in_dev;
476 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
479 in_dev = inetdev_by_index(nh->nh_oif);
482 if (!(in_dev->dev->flags&IFF_UP)) {
486 nh->nh_dev = in_dev->dev;
487 dev_hold(nh->nh_dev);
488 nh->nh_scope = RT_SCOPE_HOST;
494 static inline unsigned int fib_laddr_hashfn(u32 val)
496 unsigned int mask = (fib_hash_size - 1);
498 return (val ^ (val >> 7) ^ (val >> 14)) & mask;
501 static struct hlist_head *fib_hash_alloc(int bytes)
503 if (bytes <= PAGE_SIZE)
504 return kmalloc(bytes, GFP_KERNEL);
506 return (struct hlist_head *)
507 __get_free_pages(GFP_KERNEL, get_order(bytes));
510 static void fib_hash_free(struct hlist_head *hash, int bytes)
515 if (bytes <= PAGE_SIZE)
518 free_pages((unsigned long) hash, get_order(bytes));
/* Rehash every fib_info from the current info and laddr tables into
 * the newly allocated (larger) tables and publish the new tables and
 * size, all under fib_info_lock held for writing.
 * NOTE(review): this extract is missing lines (the old-table locals,
 * loop-closing braces); comments describe only what is visible. */
521 static void fib_hash_move(struct hlist_head *new_info_hash,
522 struct hlist_head *new_laddrhash,
523 unsigned int new_size)
525 unsigned int old_size = fib_hash_size;
528 write_lock(&fib_info_lock);
529 fib_hash_size = new_size;
531 for (i = 0; i < old_size; i++) {
532 struct hlist_head *head = &fib_info_hash[i];
533 struct hlist_node *node, *n;
536 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
537 struct hlist_head *dest;
538 unsigned int new_hash;
540 hlist_del(&fi->fib_hash);
542 new_hash = fib_info_hashfn(fi);
543 dest = &new_info_hash[new_hash];
544 hlist_add_head(&fi->fib_hash, dest);
547 fib_info_hash = new_info_hash;
549 for (i = 0; i < old_size; i++) {
550 struct hlist_head *lhead = &fib_info_laddrhash[i];
551 struct hlist_node *node, *n;
554 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
555 struct hlist_head *ldest;
556 unsigned int new_hash;
558 hlist_del(&fi->fib_lhash);
560 new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
561 ldest = &new_laddrhash[new_hash];
562 hlist_add_head(&fi->fib_lhash, ldest);
565 fib_info_laddrhash = new_laddrhash;
567 write_unlock(&fib_info_lock);
/* Build (or find and reuse) a fib_info from a netlink route request:
 * grow the hash tables when the object count reaches the table size,
 * allocate fib_info plus its nexthop array in one kmalloc, fill in
 * protocol/flags/priority/metrics/prefsrc, populate the nexthops
 * (multipath via fib_get_nhs(), otherwise from rta directly), validate
 * them via fib_check_nh() or the RT_SCOPE_HOST local-address path,
 * then either return an identical existing fib_info found by
 * fib_find_info() or insert the new one into the info/laddr/dev hash
 * tables under fib_info_lock.  On failure *errp receives the error.
 * NOTE(review): this extract is missing many lines (error labels,
 * refcounting, closing braces); do not infer full control flow here. */
571 fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
572 const struct nlmsghdr *nlh, int *errp)
575 struct fib_info *fi = NULL;
576 struct fib_info *ofi;
577 #ifdef CONFIG_IP_ROUTE_MULTIPATH
583 /* Fast check to catch the most weird cases */
584 if (fib_props[r->rtm_type].scope > r->rtm_scope)
587 #ifdef CONFIG_IP_ROUTE_MULTIPATH
589 nhs = fib_count_nexthops(rta->rta_mp);
596 if (fib_info_cnt >= fib_hash_size) {
597 unsigned int new_size = fib_hash_size << 1;
598 struct hlist_head *new_info_hash;
599 struct hlist_head *new_laddrhash;
604 bytes = new_size * sizeof(struct hlist_head *);
605 new_info_hash = fib_hash_alloc(bytes);
606 new_laddrhash = fib_hash_alloc(bytes);
607 if (!new_info_hash || !new_laddrhash) {
608 fib_hash_free(new_info_hash, bytes);
609 fib_hash_free(new_laddrhash, bytes);
611 memset(new_info_hash, 0, bytes);
612 memset(new_laddrhash, 0, bytes);
614 fib_hash_move(new_info_hash, new_laddrhash, new_size);
621 fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
625 memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));
627 fi->fib_protocol = r->rtm_protocol;
630 change_nexthops(fi) {
632 } endfor_nexthops(fi)
634 fi->fib_flags = r->rtm_flags;
635 if (rta->rta_priority)
636 fi->fib_priority = *rta->rta_priority;
638 int attrlen = RTA_PAYLOAD(rta->rta_mx);
639 struct rtattr *attr = RTA_DATA(rta->rta_mx);
641 while (RTA_OK(attr, attrlen)) {
642 unsigned flavor = attr->rta_type;
644 if (flavor > RTAX_MAX)
646 fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
648 attr = RTA_NEXT(attr, attrlen);
651 if (rta->rta_prefsrc)
652 memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
655 #ifdef CONFIG_IP_ROUTE_MULTIPATH
656 if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
658 if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
660 if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
662 #ifdef CONFIG_NET_CLS_ROUTE
663 if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
670 struct fib_nh *nh = fi->fib_nh;
672 nh->nh_oif = *rta->rta_oif;
674 memcpy(&nh->nh_gw, rta->rta_gw, 4);
675 #ifdef CONFIG_NET_CLS_ROUTE
677 memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
679 nh->nh_flags = r->rtm_flags;
680 #ifdef CONFIG_IP_ROUTE_MULTIPATH
685 if (fib_props[r->rtm_type].error) {
686 if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
691 if (r->rtm_scope > RT_SCOPE_HOST)
694 if (r->rtm_scope == RT_SCOPE_HOST) {
695 struct fib_nh *nh = fi->fib_nh;
697 /* Local address is added. */
698 if (nhs != 1 || nh->nh_gw)
700 nh->nh_scope = RT_SCOPE_NOWHERE;
701 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
703 if (nh->nh_dev == NULL)
706 change_nexthops(fi) {
707 if ((err = fib_check_nh(r, fi, nh)) != 0)
709 } endfor_nexthops(fi)
712 if (fi->fib_prefsrc) {
713 if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
714 memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
715 if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
720 if ((ofi = fib_find_info(fi)) != NULL) {
728 atomic_inc(&fi->fib_clntref);
729 write_lock(&fib_info_lock);
730 hlist_add_head(&fi->fib_hash,
731 &fib_info_hash[fib_info_hashfn(fi)]);
732 if (fi->fib_prefsrc) {
733 struct hlist_head *head;
735 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
736 hlist_add_head(&fi->fib_lhash, head);
738 change_nexthops(fi) {
739 struct hlist_head *head;
744 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
745 head = &fib_info_devhash[hash];
746 hlist_add_head(&nh->nh_hash, head);
747 } endfor_nexthops(fi)
748 write_unlock(&fib_info_lock);
/* Walk a list of fib_aliases and select the first entry matching the
 * flow's TOS and scope, skipping dead routes and dead nexthops (and,
 * for the single-nexthop case, honouring a requested output
 * interface).  On success fills in *res (prefixlen, nexthop index,
 * type, scope, fib_info) and takes a client reference on the fib_info.
 * NOTE(review): this extract is missing lines (continue/return paths,
 * switch cases, closing braces); comments describe only what is
 * visible. */
763 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
764 struct fib_result *res, int prefixlen)
766 struct fib_alias *fa;
769 list_for_each_entry(fa, head, fa_list) {
773 fa->fa_tos != flp->fl4_tos)
776 if (fa->fa_scope < flp->fl4_scope)
779 fa->fa_state |= FA_S_ACCESSED;
781 err = fib_props[fa->fa_type].error;
783 struct fib_info *fi = fa->fa_info;
785 if (fi->fib_flags & RTNH_F_DEAD)
788 switch (fa->fa_type) {
795 if (nh->nh_flags&RTNH_F_DEAD)
797 if (!flp->oif || flp->oif == nh->nh_oif)
800 #ifdef CONFIG_IP_ROUTE_MULTIPATH
801 if (nhsel < fi->fib_nhs) {
814 printk(KERN_DEBUG "impossible 102\n");
823 res->prefixlen = prefixlen;
824 res->nh_sel = nh_sel;
825 res->type = fa->fa_type;
826 res->scope = fa->fa_scope;
827 res->fi = fa->fa_info;
828 atomic_inc(&res->fi->fib_clntref);
832 /* Find appropriate source address to this destination */
834 u32 __fib_res_prefsrc(struct fib_result *res)
836 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
/* Serialize one route into a netlink message on 'skb': the rtmsg
 * header, standard attributes (RTA_DST, RTA_PRIORITY, RTA_FLOW when
 * classid routing is on, metrics, RTA_PREFSRC), then either the
 * single-nexthop RTA_GATEWAY/RTA_OIF attributes or a full
 * RTA_MULTIPATH block with one rtnexthop per nexthop.  On overflow
 * the failure path trims the skb back to its starting length 'b'.
 * NOTE(review): this extract is missing lines (parameter tail, labels,
 * returns, closing braces); comments describe only what is visible. */
840 fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
841 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
845 struct nlmsghdr *nlh;
846 unsigned char *b = skb->tail;
848 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
849 rtm = NLMSG_DATA(nlh);
850 rtm->rtm_family = AF_INET;
851 rtm->rtm_dst_len = dst_len;
852 rtm->rtm_src_len = 0;
854 rtm->rtm_table = tb_id;
855 rtm->rtm_type = type;
856 rtm->rtm_flags = fi->fib_flags;
857 rtm->rtm_scope = scope;
858 if (rtm->rtm_dst_len)
859 RTA_PUT(skb, RTA_DST, 4, dst);
860 rtm->rtm_protocol = fi->fib_protocol;
861 if (fi->fib_priority)
862 RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
863 #ifdef CONFIG_NET_CLS_ROUTE
864 if (fi->fib_nh[0].nh_tclassid)
865 RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
867 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
870 RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
871 if (fi->fib_nhs == 1) {
872 if (fi->fib_nh->nh_gw)
873 RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
874 if (fi->fib_nh->nh_oif)
875 RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
877 #ifdef CONFIG_IP_ROUTE_MULTIPATH
878 if (fi->fib_nhs > 1) {
879 struct rtnexthop *nhp;
880 struct rtattr *mp_head;
881 if (skb_tailroom(skb) <= RTA_SPACE(0))
883 mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
886 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
888 nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
889 nhp->rtnh_flags = nh->nh_flags & 0xFF;
890 nhp->rtnh_hops = nh->nh_weight-1;
891 nhp->rtnh_ifindex = nh->nh_oif;
893 RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
894 nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
895 } endfor_nexthops(fi);
896 mp_head->rta_type = RTA_MULTIPATH;
897 mp_head->rta_len = skb->tail - (u8*)mp_head;
900 nlh->nlmsg_len = skb->tail - b;
905 skb_trim(skb, b - skb->data);
909 #ifndef CONFIG_IP_NOSIOCRT

/* Convert an old-style SIOCADDRT/SIOCDELRT struct rtentry into the
 * netlink rtmsg/kern_rta representation used by the new FIB code:
 * validates address family and netmask, translates metric, flags,
 * device name (including "dev:label" alias prefsrc lookup) and
 * gateway, and for add requests with RTF_MTU/RTF_WINDOW/RTF_IRTT
 * builds a kmalloc'd RTA_METRICS attribute block.
 * NOTE(review): this extract is missing many lines (returns, braces,
 * several declarations); comments describe only what is visible. */
912 fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
913 struct kern_rta *rta, struct rtentry *r)
918 memset(rtm, 0, sizeof(*rtm));
919 memset(rta, 0, sizeof(*rta));
921 if (r->rt_dst.sa_family != AF_INET)
922 return -EAFNOSUPPORT;
924 /* Check mask for validity:
925 a) it must be contiguous.
926 b) destination must have all host bits clear.
927 c) if application forgot to set correct family (AF_INET),
928 reject request unless it is absolutely clear i.e.
929 both family and mask are zero.
932 ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
933 if (!(r->rt_flags&RTF_HOST)) {
934 u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
935 if (r->rt_genmask.sa_family != AF_INET) {
936 if (mask || r->rt_genmask.sa_family)
937 return -EAFNOSUPPORT;
939 if (bad_mask(mask, *ptr))
941 plen = inet_mask_len(mask);
944 nl->nlmsg_flags = NLM_F_REQUEST;
947 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
948 if (cmd == SIOCDELRT) {
949 nl->nlmsg_type = RTM_DELROUTE;
952 nl->nlmsg_type = RTM_NEWROUTE;
953 nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
954 rtm->rtm_protocol = RTPROT_BOOT;
957 rtm->rtm_dst_len = plen;
961 *(u32*)&r->rt_pad3 = r->rt_metric - 1;
962 rta->rta_priority = (u32*)&r->rt_pad3;
964 if (r->rt_flags&RTF_REJECT) {
965 rtm->rtm_scope = RT_SCOPE_HOST;
966 rtm->rtm_type = RTN_UNREACHABLE;
969 rtm->rtm_scope = RT_SCOPE_NOWHERE;
970 rtm->rtm_type = RTN_UNICAST;
974 struct net_device *dev;
975 char devname[IFNAMSIZ];
977 if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
979 devname[IFNAMSIZ-1] = 0;
980 colon = strchr(devname, ':');
983 dev = __dev_get_by_name(devname);
986 rta->rta_oif = &dev->ifindex;
988 struct in_ifaddr *ifa;
989 struct in_device *in_dev = __in_dev_get(dev);
993 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
994 if (strcmp(ifa->ifa_label, devname) == 0)
998 rta->rta_prefsrc = &ifa->ifa_local;
1002 ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
1003 if (r->rt_gateway.sa_family == AF_INET && *ptr) {
1005 if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
1006 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1009 if (cmd == SIOCDELRT)
1012 if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
1015 if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
1016 rtm->rtm_scope = RT_SCOPE_LINK;
1018 if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
1020 struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
1024 mx->rta_type = RTA_METRICS;
1025 mx->rta_len = RTA_LENGTH(0);
1026 if (r->rt_flags&RTF_MTU) {
1027 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1028 rec->rta_type = RTAX_ADVMSS;
1029 rec->rta_len = RTA_LENGTH(4);
1030 mx->rta_len += RTA_LENGTH(4);
1031 *(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
1033 if (r->rt_flags&RTF_WINDOW) {
1034 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1035 rec->rta_type = RTAX_WINDOW;
1036 rec->rta_len = RTA_LENGTH(4);
1037 mx->rta_len += RTA_LENGTH(4);
1038 *(u32*)RTA_DATA(rec) = r->rt_window;
1040 if (r->rt_flags&RTF_IRTT) {
1041 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1042 rec->rta_type = RTAX_RTT;
1043 rec->rta_len = RTA_LENGTH(4);
1044 mx->rta_len += RTA_LENGTH(4);
1045 *(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
/* Event handler, two cases:
1055 - local address disappeared -> we must delete all the entries
1057 - device went down -> we must shutdown all nexthops going via it.
 *
 * Marks the matching fib_infos / nexthops RTNH_F_DEAD and, with
 * multipath, removes the dead nexthop's weight from fib_power under
 * fib_multipath_lock.  The prev_fi pointer skips consecutive nexthops
 * belonging to the same fib_info.
 * NOTE(review): this extract is missing lines (return count, braces,
 * locals such as 'ret' and 'dead'); comments describe only what is
 * visible. */
1060 int fib_sync_down(u32 local, struct net_device *dev, int force)
1063 int scope = RT_SCOPE_NOWHERE;
1068 if (local && fib_info_laddrhash) {
1069 unsigned int hash = fib_laddr_hashfn(local);
1070 struct hlist_head *head = &fib_info_laddrhash[hash];
1071 struct hlist_node *node;
1072 struct fib_info *fi;
1074 hlist_for_each_entry(fi, node, head, fib_lhash) {
1075 if (fi->fib_prefsrc == local) {
1076 fi->fib_flags |= RTNH_F_DEAD;
1083 struct fib_info *prev_fi = NULL;
1084 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1085 struct hlist_head *head = &fib_info_devhash[hash];
1086 struct hlist_node *node;
1089 hlist_for_each_entry(nh, node, head, nh_hash) {
1090 struct fib_info *fi = nh->nh_parent;
1093 BUG_ON(!fi->fib_nhs);
1094 if (nh->nh_dev != dev || fi == prev_fi)
1098 change_nexthops(fi) {
1099 if (nh->nh_flags&RTNH_F_DEAD)
1101 else if (nh->nh_dev == dev &&
1102 nh->nh_scope != scope) {
1103 nh->nh_flags |= RTNH_F_DEAD;
1104 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1105 spin_lock_bh(&fib_multipath_lock);
1106 fi->fib_power -= nh->nh_power;
1108 spin_unlock_bh(&fib_multipath_lock);
1112 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1113 if (force > 1 && nh->nh_dev == dev) {
1118 } endfor_nexthops(fi)
1119 if (dead == fi->fib_nhs) {
1120 fi->fib_flags |= RTNH_F_DEAD;
1129 #ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
1132 Dead device goes up. We wake up dead nexthops.
1133 It takes sense only on multipath routes.
 *
 * Clears RTNH_F_DEAD on nexthops bound to the revived device (under
 * fib_multipath_lock) and on their parent fib_infos; prev_fi skips
 * repeated nexthops of the same fib_info.
 * NOTE(review): this extract is missing lines (return value, several
 * locals, continue paths, braces); comments describe only what is
 * visible. */
1136 int fib_sync_up(struct net_device *dev)
1138 struct fib_info *prev_fi;
1140 struct hlist_head *head;
1141 struct hlist_node *node;
1145 if (!(dev->flags&IFF_UP))
1149 hash = fib_devindex_hashfn(dev->ifindex);
1150 head = &fib_info_devhash[hash];
1153 hlist_for_each_entry(nh, node, head, nh_hash) {
1154 struct fib_info *fi = nh->nh_parent;
1157 BUG_ON(!fi->fib_nhs);
1158 if (nh->nh_dev != dev || fi == prev_fi)
1163 change_nexthops(fi) {
1164 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1168 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1170 if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
1173 spin_lock_bh(&fib_multipath_lock);
1175 nh->nh_flags &= ~RTNH_F_DEAD;
1176 spin_unlock_bh(&fib_multipath_lock);
1177 } endfor_nexthops(fi)
1180 fi->fib_flags &= ~RTNH_F_DEAD;
/* Weighted nexthop selection for a multipath route.
1189 The algorithm is suboptimal, but it provides really
1190 fair weighted route distribution.
 *
 * When the remaining fib_power budget is exhausted it is refilled from
 * the live nexthops' weights; a pseudo-random draw (jiffies modulo
 * fib_power) then walks the nexthops subtracting nh_power until one is
 * chosen, whose index is stored in res->nh_sel.  All power accounting
 * happens under fib_multipath_lock.
 * NOTE(review): this extract is missing lines (the dead-route early
 * returns, power decrements, braces); comments describe only what is
 * visible. */
1193 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1195 struct fib_info *fi = res->fi;
1198 spin_lock_bh(&fib_multipath_lock);
1199 if (fi->fib_power <= 0) {
1201 change_nexthops(fi) {
1202 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1203 power += nh->nh_weight;
1204 nh->nh_power = nh->nh_weight;
1206 } endfor_nexthops(fi);
1207 fi->fib_power = power;
1209 spin_unlock_bh(&fib_multipath_lock);
1210 /* Race condition: route has just become dead. */
1217 /* w should be random number [0..fi->fib_power-1],
1218 it is pretty bad approximation.
1221 w = jiffies % fi->fib_power;
1223 change_nexthops(fi) {
1224 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1225 if ((w -= nh->nh_power) <= 0) {
1228 res->nh_sel = nhsel;
1229 spin_unlock_bh(&fib_multipath_lock);
1233 } endfor_nexthops(fi);
1235 /* Race condition: route has just become dead. */
1237 spin_unlock_bh(&fib_multipath_lock);