2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * IPv4 Forwarding Information Base: semantics.
8 * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 #include <linux/config.h>
19 #include <asm/uaccess.h>
20 #include <asm/system.h>
21 #include <linux/bitops.h>
22 #include <linux/types.h>
23 #include <linux/kernel.h>
24 #include <linux/jiffies.h>
26 #include <linux/string.h>
27 #include <linux/socket.h>
28 #include <linux/sockios.h>
29 #include <linux/errno.h>
31 #include <linux/inet.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/netlink.h>
37 #include <linux/init.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
44 #include <net/ip_fib.h>
46 #include "fib_lookup.h"
/* Debug trace stub; compiled out. */
48 #define FSprintk(a...)
/* Protects the fib_info hash tables declared below. */
50 static DEFINE_RWLOCK(fib_info_lock);
/* All fib_info records, hashed by content (see fib_info_hashfn). */
51 static struct hlist_head *fib_info_hash;
/* fib_info records carrying a preferred source address (fib_laddr_hashfn). */
52 static struct hlist_head *fib_info_laddrhash;
/* Bucket count of the two resizable hashes above. */
53 static unsigned int fib_hash_size;
/* Number of fib_info records currently in existence. */
54 static unsigned int fib_info_cnt;
/* Fixed-size hash of nexthops keyed by device ifindex. */
56 #define DEVINDEX_HASHBITS 8
57 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
58 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
60 #ifdef CONFIG_IP_ROUTE_MULTIPATH
/* Serializes updates to the multipath weighting state (fib_power etc.). */
62 static DEFINE_SPINLOCK(fib_multipath_lock);
/* Iterate every nexthop of a fib_info with read-only cursor 'nh' and
 * index 'nhsel'; each use must be closed with endfor_nexthops(). */
64 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
65 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
/* Same iteration, but 'nh' is writable. */
67 #define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
68 for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
70 #else /* CONFIG_IP_ROUTE_MULTIPATH */
72 /* Hope that gcc will optimize away the dummy one-trip loop. */
/* Single-nexthop variants: exactly one trip through the loop body. */
74 #define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
75 for (nhsel=0; nhsel < 1; nhsel++)
77 #define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
78 for (nhsel=0; nhsel < 1; nhsel++)
80 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
/* Closes the block opened by for_nexthops()/change_nexthops(). */
82 #define endfor_nexthops(fi) }
/* Per-route-type properties, indexed by RTN_* type: the error code a
 * matching route returns and the most specific scope the type allows
 * (consulted by fib_create_info and fib_semantic_match).
 * NOTE(review): the struct declaration and several initializer fields
 * are not visible in this excerpt. */
89 } fib_props[RTA_MAX + 1] = {
92 .scope = RT_SCOPE_NOWHERE,
96 .scope = RT_SCOPE_UNIVERSE,
100 .scope = RT_SCOPE_HOST,
104 .scope = RT_SCOPE_LINK,
105 }, /* RTN_BROADCAST */
108 .scope = RT_SCOPE_LINK,
112 .scope = RT_SCOPE_UNIVERSE,
113 }, /* RTN_MULTICAST */
116 .scope = RT_SCOPE_UNIVERSE,
117 }, /* RTN_BLACKHOLE */
119 .error = -EHOSTUNREACH,
120 .scope = RT_SCOPE_UNIVERSE,
121 }, /* RTN_UNREACHABLE */
124 .scope = RT_SCOPE_UNIVERSE,
125 }, /* RTN_PROHIBIT */
128 .scope = RT_SCOPE_UNIVERSE,
132 .scope = RT_SCOPE_NOWHERE,
136 .scope = RT_SCOPE_NOWHERE,
137 }, /* RTN_XRESOLVE */
141 /* Release a nexthop info record */
/* Final destructor for a fib_info.  The record is expected to have
 * been marked dead already; otherwise a diagnostic is printed.  The
 * nexthop walk presumably drops per-nexthop resources (e.g. device
 * references) — those statements are not visible in this excerpt. */
143 void free_fib_info(struct fib_info *fi)
145 if (fi->fib_dead == 0) {
146 printk("Freeing alive fib_info %p\n", fi);
/* Per-nexthop teardown. */
149 change_nexthops(fi) {
153 } endfor_nexthops(fi);
/* Drop one tree reference on @fi; when the last one goes, unlink the
 * record and its nexthops from all hash tables (under fib_info_lock)
 * so lookups can no longer find it. */
158 void fib_release_info(struct fib_info *fi)
160 write_lock(&fib_info_lock);
161 if (fi && --fi->fib_treeref == 0) {
162 hlist_del(&fi->fib_hash);
/* NOTE(review): guards around the lhash/nh_hash deletions (likely on
 * fib_prefsrc / nh_dev) appear to be elided in this excerpt. */
164 hlist_del(&fi->fib_lhash);
165 change_nexthops(fi) {
168 hlist_del(&nh->nh_hash);
169 } endfor_nexthops(fi)
173 write_unlock(&fib_info_lock);
/* Compare the nexthop vectors of two fib_infos field by field (oif,
 * gateway, scope, and — config-dependent — weight and tclassid).  The
 * flag comparison masks out RTNH_F_DEAD.  Callers (fib_find_info)
 * treat a 0 return as "equivalent" and guarantee equal fib_nhs. */
176 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
178 const struct fib_nh *onh = ofi->fib_nh;
181 if (nh->nh_oif != onh->nh_oif ||
182 nh->nh_gw != onh->nh_gw ||
183 nh->nh_scope != onh->nh_scope ||
184 #ifdef CONFIG_IP_ROUTE_MULTIPATH
185 nh->nh_weight != onh->nh_weight ||
187 #ifdef CONFIG_NET_CLS_ROUTE
188 nh->nh_tclassid != onh->nh_tclassid ||
190 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
193 } endfor_nexthops(fi);
/* Content hash of a fib_info: mixes nexthop count, protocol,
 * preferred source and priority, then folds down to the current
 * table mask.  Must stay in sync with the equality test in
 * fib_find_info(). */
197 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
199 unsigned int mask = (fib_hash_size - 1);
200 unsigned int val = fi->fib_nhs;
202 val ^= fi->fib_protocol;
203 val ^= fi->fib_prefsrc;
204 val ^= fi->fib_priority;
206 return (val ^ (val >> 7) ^ (val >> 12)) & mask;
/* Look up an existing fib_info equivalent to @nfi: same protocol,
 * prefsrc, priority, metrics, flags (modulo RTNH_F_DEAD) and nexthop
 * vector.  Used by fib_create_info() to share records.  The tail of
 * the function (return of the match / NULL) is not visible here. */
209 static struct fib_info *fib_find_info(const struct fib_info *nfi)
211 struct hlist_head *head;
212 struct hlist_node *node;
216 hash = fib_info_hashfn(nfi);
217 head = &fib_info_hash[hash];
219 hlist_for_each_entry(fi, node, head, fib_hash) {
/* Cheap rejection before the full field comparison. */
220 if (fi->fib_nhs != nfi->fib_nhs)
222 if (nfi->fib_protocol == fi->fib_protocol &&
223 nfi->fib_prefsrc == fi->fib_prefsrc &&
224 nfi->fib_priority == fi->fib_priority &&
225 memcmp(nfi->fib_metrics, fi->fib_metrics,
226 sizeof(fi->fib_metrics)) == 0 &&
227 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
228 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
/* Hash a device ifindex into the fixed-size fib_info_devhash table. */
235 static inline unsigned int fib_devindex_hashfn(unsigned int val)
237 unsigned int mask = DEVINDEX_HASHSIZE - 1;
240 (val >> DEVINDEX_HASHBITS) ^
241 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
244 /* Check, that the gateway is already configured.
245 Used only by redirect accept routine.
/* Scans the per-device nexthop hash under the read lock for a live
 * (non-RTNH_F_DEAD) nexthop on @dev; the gateway comparison against
 * @gw is presumably in the elided part of the condition.  Returns
 * from inside the loop on a match (return statements not visible). */
248 int ip_fib_check_default(u32 gw, struct net_device *dev)
250 struct hlist_head *head;
251 struct hlist_node *node;
255 read_lock(&fib_info_lock);
257 hash = fib_devindex_hashfn(dev->ifindex);
258 head = &fib_info_devhash[hash];
259 hlist_for_each_entry(nh, node, head, nh_hash) {
260 if (nh->nh_dev == dev &&
262 !(nh->nh_flags&RTNH_F_DEAD)) {
263 read_unlock(&fib_info_lock);
268 read_unlock(&fib_info_lock);
/* Broadcast a routing netlink notification (RTM_NEWROUTE/DELROUTE
 * style @event) describing alias @fa/@key to the RTMGRP_IPV4_ROUTE
 * multicast group; if the request asked for NLM_F_ECHO, also unicast
 * a copy back to the requester (hence the extra skb reference). */
273 void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
275 struct nlmsghdr *n, struct netlink_skb_parms *req)
278 u32 pid = req ? req->pid : 0;
/* Room for the rtmsg header plus up to 256 bytes of attributes. */
279 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
281 skb = alloc_skb(size, GFP_KERNEL);
285 if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
286 fa->fa_type, fa->fa_scope, &key, z,
292 NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
/* Hold an extra reference: broadcast consumes the skb, but we still
 * need it for the echo unicast below. */
293 if (n->nlmsg_flags&NLM_F_ECHO)
294 atomic_inc(&skb->users);
295 netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
296 if (n->nlmsg_flags&NLM_F_ECHO)
297 netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
300 /* Return the first fib alias matching TOS with
301 * priority less than or equal to PRIO.
/* The alias list is kept ordered, so the scan can stop at the first
 * entry whose TOS is already past @tos; the second test picks the
 * first candidate at or beyond @prio (tail of condition elided). */
303 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
306 struct fib_alias *fa;
307 list_for_each_entry(fa, fah, fa_list) {
308 if (fa->fa_tos > tos)
310 if (fa->fa_info->fib_priority >= prio ||
/* Grade a default route candidate by the ARP neighbour state of its
 * first nexthop gateway: NUD_REACHABLE is best, any NUD_VALID state
 * next; otherwise the route may be remembered as a last resort via
 * *last_resort/*last_idx.  Return statements are elided here. */
318 int fib_detect_death(struct fib_info *fi, int order,
319 struct fib_info **last_resort, int *last_idx, int *dflt)
322 int state = NUD_NONE;
324 n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
326 state = n->nud_state;
329 if (state==NUD_REACHABLE)
331 if ((state&NUD_VALID) && order != *dflt)
333 if ((state&NUD_VALID) ||
334 (*last_idx<0 && order > *dflt)) {
341 #ifdef CONFIG_IP_ROUTE_MULTIPATH
/* Scan an rtattr chain for attribute @type and return its 32-bit
 * payload; the not-found return (presumably 0) is elided here. */
343 static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
345 while (RTA_OK(attr,attrlen)) {
346 if (attr->rta_type == type)
347 return *(u32*)RTA_DATA(attr);
348 attr = RTA_NEXT(attr, attrlen);
/* Count the rtnexthop entries packed into an RTA_MULTIPATH attribute,
 * bailing out if an entry length overruns the payload.  The counter
 * increment and return statements are not visible in this excerpt. */
354 fib_count_nexthops(struct rtattr *rta)
357 struct rtnexthop *nhp = RTA_DATA(rta);
358 int nhlen = RTA_PAYLOAD(rta);
360 while (nhlen >= (int)sizeof(struct rtnexthop)) {
/* Malformed entry: claimed length exceeds remaining payload. */
361 if ((nhlen -= nhp->rtnh_len) < 0)
364 nhp = RTNH_NEXT(nhp);
/* Fill fi->fib_nh[] from the rtnexthop entries of an RTA_MULTIPATH
 * attribute: flags (request flags above 0xFF merged with per-hop
 * flags), output interface, weight (hops+1), gateway and classid. */
370 fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
372 struct rtnexthop *nhp = RTA_DATA(rta);
373 int nhlen = RTA_PAYLOAD(rta);
375 change_nexthops(fi) {
376 int attrlen = nhlen - sizeof(struct rtnexthop);
/* Reject truncated or overlong rtnexthop entries. */
377 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
379 nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
380 nh->nh_oif = nhp->rtnh_ifindex;
/* rtnh_hops is weight-1 on the wire; see the inverse in fib_dump_info. */
381 nh->nh_weight = nhp->rtnh_hops + 1;
383 nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
384 #ifdef CONFIG_NET_CLS_ROUTE
385 nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
388 nhp = RTNH_NEXT(nhp);
389 } endfor_nexthops(fi);
/* Decide whether an existing fib_info @fi matches the route described
 * by a delete/replace request (@r/@rta): priority, then either the
 * single oif/gw pair or, for multipath requests, every rtnexthop
 * entry (ifindex, gateway, classid).  Return statements are elided. */
395 int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
398 #ifdef CONFIG_IP_ROUTE_MULTIPATH
399 struct rtnexthop *nhp;
403 if (rta->rta_priority &&
404 *rta->rta_priority != fi->fib_priority)
/* Single-nexthop request: absent attributes match anything. */
407 if (rta->rta_oif || rta->rta_gw) {
408 if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
409 (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
414 #ifdef CONFIG_IP_ROUTE_MULTIPATH
415 if (rta->rta_mp == NULL)
417 nhp = RTA_DATA(rta->rta_mp);
418 nhlen = RTA_PAYLOAD(rta->rta_mp);
/* Walk request nexthops in lockstep with fi's nexthop array. */
421 int attrlen = nhlen - sizeof(struct rtnexthop);
424 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
426 if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
429 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
430 if (gw && gw != nh->nh_gw)
/* 'gw' is reused here to hold the RTA_FLOW classid. */
432 #ifdef CONFIG_NET_CLS_ROUTE
433 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
434 if (gw && gw != nh->nh_tclassid)
438 nhp = RTNH_NEXT(nhp);
439 } endfor_nexthops(fi);
449 Semantics of nexthop is very messy by historical reasons.
450 We have to take into account, that:
451 a) gateway can be actually local interface address,
452 so that gatewayed route is direct.
453 b) gateway must be on-link address, possibly
454 described not by an ifaddr, but also by a direct route.
455 c) If both gateway and interface are specified, they should not
457 d) If we use tunnel routes, gateway could be not on-link.
459 Attempt to reconcile all of these (alas, self-contradictory) conditions
460 results in pretty ugly and hairy code with obscure logic.
462 I chose to generalize it instead, so that the size
463 of code does not increase practically, but it becomes
465 Every prefix is assigned a "scope" value: "host" is local address,
466 "link" is direct route,
467 [ ... "site" ... "interior" ... ]
468 and "universe" is true gateway route with global meaning.
470 Every prefix refers to a set of "nexthop"s (gw, oif),
471 where gw must have narrower scope. This recursion stops
472 when gw has LOCAL scope or if "nexthop" is declared ONLINK,
473 which means that gw is forced to be on link.
475 Code is still hairy, but now it is apparently logically
476 consistent and very flexible. F.e. as by-product it allows
477 to co-exist in peace independent exterior and interior
480 Normally it looks as following.
482 {universe prefix} -> (gw, oif) [scope link]
484 |-> {link prefix} -> (gw, oif) [scope local]
486 |-> {local prefix} (terminal node)
/* Validate and resolve one nexthop of a new route, per the long
 * design comment above: ONLINK nexthops only need an UP device and a
 * unicast gateway; otherwise the gateway is resolved with a recursive
 * fib_lookup() at a strictly narrower scope, and the nexthop inherits
 * scope/oif/device from the result.  Gateway-less nexthops just bind
 * to the named interface with host scope. */
489 static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
494 struct fib_result res;
496 #ifdef CONFIG_IP_ROUTE_PERVASIVE
497 if (nh->nh_flags&RTNH_F_PERVASIVE)
500 if (nh->nh_flags&RTNH_F_ONLINK) {
501 struct net_device *dev;
/* ONLINK makes no sense for link-or-narrower scoped routes. */
503 if (r->rtm_scope >= RT_SCOPE_LINK)
505 if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
507 if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
509 if (!(dev->flags&IFF_UP))
513 nh->nh_scope = RT_SCOPE_LINK;
/* Gatewayed nexthop: resolve the gateway with a narrower-scope lookup. */
517 struct flowi fl = { .nl_u = { .ip4_u =
518 { .daddr = nh->nh_gw,
519 .scope = r->rtm_scope + 1 } },
522 /* It is not necessary, but requires a bit of thinking */
523 if (fl.fl4_scope < RT_SCOPE_LINK)
524 fl.fl4_scope = RT_SCOPE_LINK;
525 if ((err = fib_lookup(&fl, &res)) != 0)
529 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
/* Inherit the resolved scope, interface and device. */
531 nh->nh_scope = res.scope;
532 nh->nh_oif = FIB_RES_OIF(res);
533 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
535 dev_hold(nh->nh_dev);
537 if (!(nh->nh_dev->flags & IFF_UP))
/* No gateway: direct route out of the interface named by nh_oif. */
544 struct in_device *in_dev;
546 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
549 in_dev = inetdev_by_index(nh->nh_oif);
552 if (!(in_dev->dev->flags&IFF_UP)) {
556 nh->nh_dev = in_dev->dev;
557 dev_hold(nh->nh_dev);
558 nh->nh_scope = RT_SCOPE_HOST;
/* Hash a preferred source address into fib_info_laddrhash. */
564 static inline unsigned int fib_laddr_hashfn(u32 val)
566 unsigned int mask = (fib_hash_size - 1);
568 return (val ^ (val >> 7) ^ (val >> 14)) & mask;
/* Allocate a hash table of @bytes bytes: kmalloc for a page or less,
 * whole pages otherwise.  Must be paired with fib_hash_free(). */
571 static struct hlist_head *fib_hash_alloc(int bytes)
573 if (bytes <= PAGE_SIZE)
574 return kmalloc(bytes, GFP_KERNEL);
576 return (struct hlist_head *)
577 __get_free_pages(GFP_KERNEL, get_order(bytes));
/* Free a table obtained from fib_hash_alloc(); @bytes selects the
 * matching release path (the kfree branch is elided in this excerpt). */
580 static void fib_hash_free(struct hlist_head *hash, int bytes)
585 if (bytes <= PAGE_SIZE)
588 free_pages((unsigned long) hash, get_order(bytes));
/* Rehash every fib_info from the current info/laddr tables into the
 * freshly allocated @new_info_hash/@new_laddrhash of @new_size
 * buckets.  Runs entirely under the write lock; the hash functions
 * pick up the new size via fib_hash_size, so it is set first. */
591 static void fib_hash_move(struct hlist_head *new_info_hash,
592 struct hlist_head *new_laddrhash,
593 unsigned int new_size)
595 unsigned int old_size = fib_hash_size;
598 write_lock(&fib_info_lock);
/* Set before rehashing: fib_info_hashfn/fib_laddr_hashfn mask with it. */
599 fib_hash_size = new_size;
601 for (i = 0; i < old_size; i++) {
602 struct hlist_head *head = &fib_info_hash[i];
603 struct hlist_node *node, *n;
606 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
607 struct hlist_head *dest;
608 unsigned int new_hash;
610 hlist_del(&fi->fib_hash);
612 new_hash = fib_info_hashfn(fi);
613 dest = &new_info_hash[new_hash];
614 hlist_add_head(&fi->fib_hash, dest);
617 fib_info_hash = new_info_hash;
/* Same dance for the preferred-source-address hash. */
619 for (i = 0; i < old_size; i++) {
620 struct hlist_head *lhead = &fib_info_laddrhash[i];
621 struct hlist_node *node, *n;
624 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
625 struct hlist_head *ldest;
626 unsigned int new_hash;
628 hlist_del(&fi->fib_lhash);
630 new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
631 ldest = &new_laddrhash[new_hash];
632 hlist_add_head(&fi->fib_lhash, ldest);
635 fib_info_laddrhash = new_laddrhash;
637 write_unlock(&fib_info_lock);
/* Build a fib_info from a routing request: validate type/scope, grow
 * the hash tables if needed, allocate the record with its trailing
 * nexthop array, parse attributes and nexthops, validate them via
 * fib_check_nh(), reuse an existing equivalent record if one exists,
 * and finally link the new record into all hash tables.  Errors are
 * reported through *errp (error paths largely elided here). */
641 fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
642 const struct nlmsghdr *nlh, int *errp)
645 struct fib_info *fi = NULL;
646 struct fib_info *ofi;
647 #ifdef CONFIG_IP_ROUTE_MULTIPATH
653 /* Fast check to catch the most weird cases */
654 if (fib_props[r->rtm_type].scope > r->rtm_scope)
657 #ifdef CONFIG_IP_ROUTE_MULTIPATH
659 nhs = fib_count_nexthops(rta->rta_mp);
/* Double the hash tables once the record count reaches table size. */
666 if (fib_info_cnt >= fib_hash_size) {
667 unsigned int new_size = fib_hash_size << 1;
668 struct hlist_head *new_info_hash;
669 struct hlist_head *new_laddrhash;
/* NOTE(review): sized with sizeof(struct hlist_head *); presumably
 * harmless only because hlist_head is a single pointer — confirm,
 * and prefer sizeof(struct hlist_head) for clarity. */
674 bytes = new_size * sizeof(struct hlist_head *);
675 new_info_hash = fib_hash_alloc(bytes);
676 new_laddrhash = fib_hash_alloc(bytes);
677 if (!new_info_hash || !new_laddrhash) {
678 fib_hash_free(new_info_hash, bytes);
679 fib_hash_free(new_laddrhash, bytes);
681 memset(new_info_hash, 0, bytes);
682 memset(new_laddrhash, 0, bytes);
684 fib_hash_move(new_info_hash, new_laddrhash, new_size);
/* fib_info carries its nexthop array inline at the end. */
691 fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
695 memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));
697 fi->fib_protocol = r->rtm_protocol;
700 change_nexthops(fi) {
702 } endfor_nexthops(fi)
704 fi->fib_flags = r->rtm_flags;
705 if (rta->rta_priority)
706 fi->fib_priority = *rta->rta_priority;
/* Copy route metrics (RTAX_*) out of the RTA_METRICS attribute. */
708 int attrlen = RTA_PAYLOAD(rta->rta_mx);
709 struct rtattr *attr = RTA_DATA(rta->rta_mx);
711 while (RTA_OK(attr, attrlen)) {
712 unsigned flavor = attr->rta_type;
714 if (flavor > RTAX_MAX)
716 fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
718 attr = RTA_NEXT(attr, attrlen);
721 if (rta->rta_prefsrc)
722 memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
/* Multipath request: parse all nexthops, then cross-check the first
 * one against any top-level oif/gw/classid attributes. */
725 #ifdef CONFIG_IP_ROUTE_MULTIPATH
726 if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
728 if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
730 if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
732 #ifdef CONFIG_NET_CLS_ROUTE
733 if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
/* Single-nexthop request: fill the one nexthop from the attributes. */
740 struct fib_nh *nh = fi->fib_nh;
742 nh->nh_oif = *rta->rta_oif;
744 memcpy(&nh->nh_gw, rta->rta_gw, 4);
745 #ifdef CONFIG_NET_CLS_ROUTE
747 memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
749 nh->nh_flags = r->rtm_flags;
750 #ifdef CONFIG_IP_ROUTE_MULTIPATH
/* Error route types (see fib_props) must not carry nexthop data. */
755 if (fib_props[r->rtm_type].error) {
756 if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
761 if (r->rtm_scope > RT_SCOPE_HOST)
764 if (r->rtm_scope == RT_SCOPE_HOST) {
765 struct fib_nh *nh = fi->fib_nh;
767 /* Local address is added. */
768 if (nhs != 1 || nh->nh_gw)
770 nh->nh_scope = RT_SCOPE_NOWHERE;
771 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
773 if (nh->nh_dev == NULL)
/* Resolve and validate every nexthop. */
776 change_nexthops(fi) {
777 if ((err = fib_check_nh(r, fi, nh)) != 0)
779 } endfor_nexthops(fi)
/* A preferred source must be a local address (unless it is the
 * destination of an RTN_LOCAL route itself). */
782 if (fi->fib_prefsrc) {
783 if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
784 memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
785 if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
/* Share an already existing equivalent record if there is one. */
790 if ((ofi = fib_find_info(fi)) != NULL) {
/* New record: take a client ref and link into all hash tables. */
798 atomic_inc(&fi->fib_clntref);
799 write_lock(&fib_info_lock);
800 hlist_add_head(&fi->fib_hash,
801 &fib_info_hash[fib_info_hashfn(fi)]);
802 if (fi->fib_prefsrc) {
803 struct hlist_head *head;
805 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
806 hlist_add_head(&fi->fib_lhash, head);
808 change_nexthops(fi) {
809 struct hlist_head *head;
814 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
815 head = &fib_info_devhash[hash];
816 hlist_add_head(&nh->nh_hash, head);
817 } endfor_nexthops(fi)
818 write_unlock(&fib_info_lock);
/* Walk the alias list of one matched prefix and fill @res from the
 * first alias compatible with the flow's TOS and scope, skipping
 * dead fib_infos/nexthops and honouring flp->oif when set.  Error
 * route types yield fib_props[type].error instead of a result. */
833 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
834 struct fib_result *res, int prefixlen)
836 struct fib_alias *fa;
839 list_for_each_entry(fa, head, fa_list) {
843 fa->fa_tos != flp->fl4_tos)
846 if (fa->fa_scope < flp->fl4_scope)
/* Remember that this alias has been used for a lookup. */
849 fa->fa_state |= FA_S_ACCESSED;
851 err = fib_props[fa->fa_type].error;
853 struct fib_info *fi = fa->fa_info;
855 if (fi->fib_flags & RTNH_F_DEAD)
858 switch (fa->fa_type) {
/* Pick the first live nexthop compatible with the requested oif. */
865 if (nh->nh_flags&RTNH_F_DEAD)
867 if (!flp->oif || flp->oif == nh->nh_oif)
870 #ifdef CONFIG_IP_ROUTE_MULTIPATH
871 if (nhsel < fi->fib_nhs) {
/* Unknown fa_type: should be unreachable. */
884 printk(KERN_DEBUG "impossible 102\n");
/* Fill the result and take a client reference on the fib_info. */
893 res->prefixlen = prefixlen;
894 res->nh_sel = nh_sel;
895 res->type = fa->fa_type;
896 res->scope = fa->fa_scope;
897 res->fi = fa->fa_info;
898 atomic_inc(&res->fi->fib_clntref);
902 /* Find appropriate source address to this destination */
/* Pick a local address on the result's output device whose scope
 * fits the route, relative to the gateway. */
904 u32 __fib_res_prefsrc(struct fib_result *res)
906 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
/* Serialize one route (@fi plus the passed type/scope/dst/...) into a
 * netlink message appended to @skb.  On attribute overflow the
 * RTA_PUT/NLMSG_PUT macros jump to the failure label, where the skb
 * is trimmed back to its pre-call tail (skb_trim at the bottom). */
910 fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
911 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
915 struct nlmsghdr *nlh;
/* Remember the old tail so a failed dump can be rolled back. */
916 unsigned char *b = skb->tail;
918 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
919 rtm = NLMSG_DATA(nlh);
920 rtm->rtm_family = AF_INET;
921 rtm->rtm_dst_len = dst_len;
922 rtm->rtm_src_len = 0;
924 rtm->rtm_table = tb_id;
925 rtm->rtm_type = type;
926 rtm->rtm_flags = fi->fib_flags;
927 rtm->rtm_scope = scope;
/* A zero-length destination (default route) carries no RTA_DST. */
928 if (rtm->rtm_dst_len)
929 RTA_PUT(skb, RTA_DST, 4, dst);
930 rtm->rtm_protocol = fi->fib_protocol;
931 if (fi->fib_priority)
932 RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
933 #ifdef CONFIG_NET_CLS_ROUTE
934 if (fi->fib_nh[0].nh_tclassid)
935 RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
937 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
940 RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
/* Single nexthop: flat RTA_GATEWAY/RTA_OIF attributes. */
941 if (fi->fib_nhs == 1) {
942 if (fi->fib_nh->nh_gw)
943 RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
944 if (fi->fib_nh->nh_oif)
945 RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
/* Multipath: nested RTA_MULTIPATH of rtnexthop records. */
947 #ifdef CONFIG_IP_ROUTE_MULTIPATH
948 if (fi->fib_nhs > 1) {
949 struct rtnexthop *nhp;
950 struct rtattr *mp_head;
951 if (skb_tailroom(skb) <= RTA_SPACE(0))
953 mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
956 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
958 nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
959 nhp->rtnh_flags = nh->nh_flags & 0xFF;
/* On the wire, hops is weight-1 (see fib_get_nhs). */
960 nhp->rtnh_hops = nh->nh_weight-1;
961 nhp->rtnh_ifindex = nh->nh_oif;
963 RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
/* Per-entry length is only known once its attributes are in. */
964 nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
965 } endfor_nexthops(fi);
966 mp_head->rta_type = RTA_MULTIPATH;
967 mp_head->rta_len = skb->tail - (u8*)mp_head;
970 nlh->nlmsg_len = skb->tail - b;
/* Failure path: roll the skb back to where this message started. */
975 skb_trim(skb, b - skb->data);
979 #ifndef CONFIG_IP_NOSIOCRT
/* Translate a legacy SIOCADDRT/SIOCDELRT rtentry ioctl request into
 * the equivalent netlink message pieces: an nlmsghdr, an rtmsg and a
 * kern_rta attribute set pointing back into @r.  @cmd selects
 * RTM_NEWROUTE vs RTM_DELROUTE. */
982 fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
983 struct kern_rta *rta, struct rtentry *r)
988 memset(rtm, 0, sizeof(*rtm));
989 memset(rta, 0, sizeof(*rta));
991 if (r->rt_dst.sa_family != AF_INET)
992 return -EAFNOSUPPORT;
994 /* Check mask for validity:
995 a) it must be contiguous.
996 b) destination must have all host bits clear.
997 c) if application forgot to set correct family (AF_INET),
998 reject request unless it is absolutely clear i.e.
999 both family and mask are zero.
1002 ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
1003 if (!(r->rt_flags&RTF_HOST)) {
1004 u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
1005 if (r->rt_genmask.sa_family != AF_INET) {
1006 if (mask || r->rt_genmask.sa_family)
1007 return -EAFNOSUPPORT;
1009 if (bad_mask(mask, *ptr))
1011 plen = inet_mask_len(mask);
/* Build the netlink header for the synthesized request. */
1014 nl->nlmsg_flags = NLM_F_REQUEST;
1017 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
1018 if (cmd == SIOCDELRT) {
1019 nl->nlmsg_type = RTM_DELROUTE;
1020 nl->nlmsg_flags = 0;
1022 nl->nlmsg_type = RTM_NEWROUTE;
1023 nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
1024 rtm->rtm_protocol = RTPROT_BOOT;
1027 rtm->rtm_dst_len = plen;
/* Stash the priority in the rtentry's padding so rta_priority can
 * point at storage that outlives this function's locals. */
1031 *(u32*)&r->rt_pad3 = r->rt_metric - 1;
1032 rta->rta_priority = (u32*)&r->rt_pad3;
1034 if (r->rt_flags&RTF_REJECT) {
1035 rtm->rtm_scope = RT_SCOPE_HOST;
1036 rtm->rtm_type = RTN_UNREACHABLE;
1039 rtm->rtm_scope = RT_SCOPE_NOWHERE;
1040 rtm->rtm_type = RTN_UNICAST;
/* Resolve the device name; "dev:label" selects an ifaddr alias. */
1044 struct net_device *dev;
1045 char devname[IFNAMSIZ];
1047 if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
1049 devname[IFNAMSIZ-1] = 0;
1050 colon = strchr(devname, ':');
1053 dev = __dev_get_by_name(devname);
1056 rta->rta_oif = &dev->ifindex;
/* Aliased device: use the label's address as preferred source. */
1058 struct in_ifaddr *ifa;
1059 struct in_device *in_dev = __in_dev_get(dev);
1063 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
1064 if (strcmp(ifa->ifa_label, devname) == 0)
1068 rta->rta_prefsrc = &ifa->ifa_local;
/* Derive the scope from the gateway, when one is given. */
1072 ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
1073 if (r->rt_gateway.sa_family == AF_INET && *ptr) {
1075 if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
1076 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1079 if (cmd == SIOCDELRT)
/* RTF_GATEWAY without a usable gateway address is invalid. */
1082 if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
1085 if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
1086 rtm->rtm_scope = RT_SCOPE_LINK;
/* Legacy per-route metrics become an RTA_METRICS attribute block. */
1088 if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
1090 struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
1094 mx->rta_type = RTA_METRICS;
1095 mx->rta_len = RTA_LENGTH(0);
1096 if (r->rt_flags&RTF_MTU) {
1097 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
/* Legacy MTU is exported as ADVMSS; the 40 presumably accounts
 * for IP+TCP headers — confirm against route.c usage. */
1098 rec->rta_type = RTAX_ADVMSS;
1099 rec->rta_len = RTA_LENGTH(4);
1100 mx->rta_len += RTA_LENGTH(4);
1101 *(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
1103 if (r->rt_flags&RTF_WINDOW) {
1104 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1105 rec->rta_type = RTAX_WINDOW;
1106 rec->rta_len = RTA_LENGTH(4);
1107 mx->rta_len += RTA_LENGTH(4);
1108 *(u32*)RTA_DATA(rec) = r->rt_window;
1110 if (r->rt_flags&RTF_IRTT) {
1111 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1112 rec->rta_type = RTAX_RTT;
1113 rec->rta_len = RTA_LENGTH(4);
1114 mx->rta_len += RTA_LENGTH(4);
/* irtt is scaled (<<3) on conversion; units not visible here. */
1115 *(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
1125 - local address disappeared -> we must delete all the entries
1127 - device went down -> we must shutdown all nexthops going via it.
/* Mark routes dead after an address or device event: fib_infos whose
 * preferred source was @local get RTNH_F_DEAD directly; nexthops via
 * @dev get RTNH_F_DEAD, and a fib_info whose nexthops are all dead is
 * marked dead as a whole.  Return of the dead count is elided. */
1130 int fib_sync_down(u32 local, struct net_device *dev, int force)
1133 int scope = RT_SCOPE_NOWHERE;
1138 if (local && fib_info_laddrhash) {
1139 unsigned int hash = fib_laddr_hashfn(local);
1140 struct hlist_head *head = &fib_info_laddrhash[hash];
1141 struct hlist_node *node;
1142 struct fib_info *fi;
1144 hlist_for_each_entry(fi, node, head, fib_lhash) {
1145 if (fi->fib_prefsrc == local) {
1146 fi->fib_flags |= RTNH_F_DEAD;
/* Device case: walk every nexthop hashed under dev's ifindex. */
1153 struct fib_info *prev_fi = NULL;
1154 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1155 struct hlist_head *head = &fib_info_devhash[hash];
1156 struct hlist_node *node;
1159 hlist_for_each_entry(nh, node, head, nh_hash) {
1160 struct fib_info *fi = nh->nh_parent;
1163 BUG_ON(!fi->fib_nhs);
/* prev_fi avoids reprocessing a fib_info reached via several
 * of its nexthops in the same bucket. */
1164 if (nh->nh_dev != dev || fi == prev_fi)
1168 change_nexthops(fi) {
1169 if (nh->nh_flags&RTNH_F_DEAD)
1171 else if (nh->nh_dev == dev &&
1172 nh->nh_scope != scope) {
1173 nh->nh_flags |= RTNH_F_DEAD;
/* Withdraw this nexthop's share of multipath power. */
1174 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1175 spin_lock_bh(&fib_multipath_lock);
1176 fi->fib_power -= nh->nh_power;
1178 spin_unlock_bh(&fib_multipath_lock);
1182 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1183 if (force > 1 && nh->nh_dev == dev) {
1188 } endfor_nexthops(fi)
/* Every nexthop dead => the whole fib_info is dead. */
1189 if (dead == fi->fib_nhs) {
1190 fi->fib_flags |= RTNH_F_DEAD;
1199 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1202 Dead device goes up. We wake up dead nexthops.
1203 It makes sense only on multipath routes.
/* Clear RTNH_F_DEAD on nexthops bound to @dev (which must be IFF_UP
 * and have an in_device) and revive their parent fib_infos.  The
 * return value (count of revived nexthops) is elided here. */
1206 int fib_sync_up(struct net_device *dev)
1208 struct fib_info *prev_fi;
1210 struct hlist_head *head;
1211 struct hlist_node *node;
1215 if (!(dev->flags&IFF_UP))
1219 hash = fib_devindex_hashfn(dev->ifindex);
1220 head = &fib_info_devhash[hash];
1223 hlist_for_each_entry(nh, node, head, nh_hash) {
1224 struct fib_info *fi = nh->nh_parent;
1227 BUG_ON(!fi->fib_nhs);
/* Skip foreign devices and fib_infos already handled this pass. */
1228 if (nh->nh_dev != dev || fi == prev_fi)
1233 change_nexthops(fi) {
1234 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1238 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1240 if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
/* Revive the nexthop under the multipath lock. */
1243 spin_lock_bh(&fib_multipath_lock);
1245 nh->nh_flags &= ~RTNH_F_DEAD;
1246 spin_unlock_bh(&fib_multipath_lock);
1247 } endfor_nexthops(fi)
1250 fi->fib_flags &= ~RTNH_F_DEAD;
1259 The algorithm is suboptimal, but it provides really
1260 fair weighted route distribution.
/* Pick one live nexthop for this lookup, weighted by nh_weight.
 * Each nexthop holds "power" that is spent as it is selected; when
 * the route's total power is exhausted it is refilled from the
 * weights.  The chosen index is stored in res->nh_sel. */
1263 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1265 struct fib_info *fi = res->fi;
1268 spin_lock_bh(&fib_multipath_lock);
/* Refill: reset every live nexthop's power to its weight. */
1269 if (fi->fib_power <= 0) {
1271 change_nexthops(fi) {
1272 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1273 power += nh->nh_weight;
1274 nh->nh_power = nh->nh_weight;
1276 } endfor_nexthops(fi);
1277 fi->fib_power = power;
1279 spin_unlock_bh(&fib_multipath_lock);
1280 /* Race condition: route has just become dead. */
1287 /* w should be random number [0..fi->fib_power-1],
1288 it is pretty bad approximation.
1291 w = jiffies % fi->fib_power;
/* Spend power: the nexthop whose share covers w wins. */
1293 change_nexthops(fi) {
1294 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1295 if ((w -= nh->nh_power) <= 0) {
1298 res->nh_sel = nhsel;
1299 spin_unlock_bh(&fib_multipath_lock);
1303 } endfor_nexthops(fi);
1305 /* Race condition: route has just become dead. */
1307 spin_unlock_bh(&fib_multipath_lock);