/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              IPv4 Forwarding Information Base: semantics.
 *
 * Version:     $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>

#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>

#include "fib_lookup.h"

#define FSprintk(a...)

static rwlock_t fib_info_lock = RW_LOCK_UNLOCKED;
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_hash_size;
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static spinlock_t fib_multipath_lock = SPIN_LOCK_UNLOCKED;

#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hopefully gcc will optimize this to get rid of the dummy loop */

#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }

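/*
 * Editor's note: the sketch below is an illustrative, editor-added example and
 * is not part of the original file.  It shows how the for_nexthops() /
 * endfor_nexthops() iterator pair above is intended to be used; the helper
 * name is hypothetical.
 */
static inline int example_count_alive_nexthops(const struct fib_info *fi)
{
        int alive = 0;

        for_nexthops(fi) {
                if (!(nh->nh_flags & RTNH_F_DEAD))
                        alive++;
        } endfor_nexthops(fi);
        return alive;
}
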
static struct
{
        int     error;
        u8      scope;
} fib_props[RTA_MAX + 1] = {
        {
                .error  = 0,
                .scope  = RT_SCOPE_NOWHERE,
        },      /* RTN_UNSPEC */
        {
                .error  = 0,
                .scope  = RT_SCOPE_UNIVERSE,
        },      /* RTN_UNICAST */
        {
                .error  = 0,
                .scope  = RT_SCOPE_HOST,
        },      /* RTN_LOCAL */
        {
                .error  = 0,
                .scope  = RT_SCOPE_LINK,
        },      /* RTN_BROADCAST */
        {
                .error  = 0,
                .scope  = RT_SCOPE_LINK,
        },      /* RTN_ANYCAST */
        {
                .error  = 0,
                .scope  = RT_SCOPE_UNIVERSE,
        },      /* RTN_MULTICAST */
        {
                .error  = -EINVAL,
                .scope  = RT_SCOPE_UNIVERSE,
        },      /* RTN_BLACKHOLE */
        {
                .error  = -EHOSTUNREACH,
                .scope  = RT_SCOPE_UNIVERSE,
        },      /* RTN_UNREACHABLE */
        {
                .error  = -EACCES,
                .scope  = RT_SCOPE_UNIVERSE,
        },      /* RTN_PROHIBIT */
        {
                .error  = -EAGAIN,
                .scope  = RT_SCOPE_UNIVERSE,
        },      /* RTN_THROW */
        {
                .error  = -EINVAL,
                .scope  = RT_SCOPE_NOWHERE,
        },      /* RTN_NAT */
        {
                .error  = -EINVAL,
                .scope  = RT_SCOPE_NOWHERE,
        },      /* RTN_XRESOLVE */
};

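/*
 * Editor's note: illustrative, editor-added sketch, not part of the original
 * file.  fib_props is indexed by route type (RTN_*), so semantic code can map
 * a type directly to its synthetic error code and required scope.  The helper
 * name is hypothetical.
 */
static inline int example_fib_type_error(u8 type)
{
        if (type > RTA_MAX)     /* table above is sized RTA_MAX + 1 */
                return -EINVAL;
        return fib_props[type].error;
}
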

/* Release a nexthop info record */

void free_fib_info(struct fib_info *fi)
{
        if (fi->fib_dead == 0) {
                printk("Freeing alive fib_info %p\n", fi);
                return;
        }
        change_nexthops(fi) {
                if (nh->nh_dev)
                        dev_put(nh->nh_dev);
                nh->nh_dev = NULL;
        } endfor_nexthops(fi);
        fib_info_cnt--;
        kfree(fi);
}

void fib_release_info(struct fib_info *fi)
{
        write_lock(&fib_info_lock);
        if (fi && --fi->fib_treeref == 0) {
                hlist_del(&fi->fib_hash);
                if (fi->fib_prefsrc)
                        hlist_del(&fi->fib_lhash);
                change_nexthops(fi) {
                        if (!nh->nh_dev)
                                continue;
                        hlist_del(&nh->nh_hash);
                } endfor_nexthops(fi)
                fi->fib_dead = 1;
                fib_info_put(fi);
        }
        write_unlock(&fib_info_lock);
}

static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
        const struct fib_nh *onh = ofi->fib_nh;

        for_nexthops(fi) {
                if (nh->nh_oif != onh->nh_oif ||
                    nh->nh_gw  != onh->nh_gw ||
                    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
                    nh->nh_tclassid != onh->nh_tclassid ||
#endif
                    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
                        return -1;
                onh++;
        } endfor_nexthops(fi);
        return 0;
}

static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
        unsigned int mask = (fib_hash_size - 1);
        unsigned int val = fi->fib_nhs;

        val ^= fi->fib_protocol;
        val ^= fi->fib_prefsrc;
        val ^= fi->fib_priority;

        return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
        struct hlist_head *head;
        struct hlist_node *node;
        struct fib_info *fi;
        unsigned int hash;

        hash = fib_info_hashfn(nfi);
        head = &fib_info_hash[hash];

        hlist_for_each_entry(fi, node, head, fib_hash) {
                if (fi->fib_nhs != nfi->fib_nhs)
                        continue;
                if (nfi->fib_protocol == fi->fib_protocol &&
                    nfi->fib_prefsrc == fi->fib_prefsrc &&
                    nfi->fib_priority == fi->fib_priority &&
                    memcmp(nfi->fib_metrics, fi->fib_metrics,
                           sizeof(fi->fib_metrics)) == 0 &&
                    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
                    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
                        return fi;
        }

        return NULL;
}

static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
        unsigned int mask = DEVINDEX_HASHSIZE - 1;

        return (val ^
                (val >> DEVINDEX_HASHBITS) ^
                (val >> (DEVINDEX_HASHBITS * 2))) & mask;
}

/* Check that the gateway is already configured.
   Used only by the redirect-accept routine.
 */

int ip_fib_check_default(u32 gw, struct net_device *dev)
{
        struct hlist_head *head;
        struct hlist_node *node;
        struct fib_nh *nh;
        unsigned int hash;

        read_lock(&fib_info_lock);

        hash = fib_devindex_hashfn(dev->ifindex);
        head = &fib_info_devhash[hash];
        hlist_for_each_entry(nh, node, head, nh_hash) {
                if (nh->nh_dev == dev &&
                    nh->nh_gw == gw &&
                    !(nh->nh_flags&RTNH_F_DEAD)) {
                        read_unlock(&fib_info_lock);
                        return 0;
                }
        }

        read_unlock(&fib_info_lock);

        return -1;
}

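/*
 * Editor's note: illustrative, editor-added sketch, not part of the original
 * file.  A hypothetical caller in the ICMP-redirect path would consult
 * ip_fib_check_default() roughly like this before trusting a new gateway;
 * the helper name is invented for illustration.
 */
static inline int example_redirect_gw_acceptable(u32 new_gw,
                                                 struct net_device *dev)
{
        /* Accept the redirect only if new_gw is already configured as a
         * default-route gateway reachable through dev. */
        return ip_fib_check_default(new_gw, dev) == 0;
}
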
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
{
        while (RTA_OK(attr,attrlen)) {
                if (attr->rta_type == type)
                        return *(u32*)RTA_DATA(attr);
                attr = RTA_NEXT(attr, attrlen);
        }
        return 0;
}

static int
fib_count_nexthops(struct rtattr *rta)
{
        int nhs = 0;
        struct rtnexthop *nhp = RTA_DATA(rta);
        int nhlen = RTA_PAYLOAD(rta);

        while (nhlen >= (int)sizeof(struct rtnexthop)) {
                if ((nhlen -= nhp->rtnh_len) < 0)
                        return 0;
                nhs++;
                nhp = RTNH_NEXT(nhp);
        };
        return nhs;
}

static int
fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
{
        struct rtnexthop *nhp = RTA_DATA(rta);
        int nhlen = RTA_PAYLOAD(rta);

        change_nexthops(fi) {
                int attrlen = nhlen - sizeof(struct rtnexthop);
                if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
                        return -EINVAL;
                nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
                nh->nh_oif = nhp->rtnh_ifindex;
                nh->nh_weight = nhp->rtnh_hops + 1;
                if (attrlen) {
                        nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
#ifdef CONFIG_NET_CLS_ROUTE
                        nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
#endif
                }
                nhp = RTNH_NEXT(nhp);
        } endfor_nexthops(fi);
        return 0;
}

#endif

int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
                 struct fib_info *fi)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        struct rtnexthop *nhp;
        int nhlen;
#endif

        if (rta->rta_priority &&
            *rta->rta_priority != fi->fib_priority)
                return 1;

        if (rta->rta_oif || rta->rta_gw) {
                if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
                    (!rta->rta_gw  || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
                        return 0;
                return 1;
        }

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (rta->rta_mp == NULL)
                return 0;
        nhp = RTA_DATA(rta->rta_mp);
        nhlen = RTA_PAYLOAD(rta->rta_mp);

        for_nexthops(fi) {
                int attrlen = nhlen - sizeof(struct rtnexthop);
                u32 gw;

                if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
                        return -EINVAL;
                if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
                        return 1;
                if (attrlen) {
                        gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
                        if (gw && gw != nh->nh_gw)
                                return 1;
#ifdef CONFIG_NET_CLS_ROUTE
                        gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
                        if (gw && gw != nh->nh_tclassid)
                                return 1;
#endif
                }
                nhp = RTNH_NEXT(nhp);
        } endfor_nexthops(fi);
#endif
        return 0;
}


/*
   Picture
   -------

   Nexthop semantics are very messy for historical reasons.
   We have to take into account that:
   a) the gateway can actually be a local interface address,
      so that a gatewayed route is direct.
   b) the gateway must be an on-link address, possibly
      described not by an ifaddr but by a direct route.
   c) if both gateway and interface are specified, they must not
      contradict each other.
   d) if we use tunnel routes, the gateway could be off-link.

   Attempting to reconcile all of these (alas, self-contradictory) conditions
   results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the amount
   of code barely increases while it becomes
   much more general.
   Every prefix is assigned a "scope" value: "host" is a local address,
   "link" is a direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is a true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where the gw must have a narrower scope. The recursion stops
   when the gw has LOCAL scope or when the nexthop is declared ONLINK,
   which forces the gw to be on-link.

   The code is still hairy, but now it is apparently logically
   consistent and very flexible. For example, as a by-product it allows
   independent exterior and interior routing processes to
   coexist in peace.

   Normally it looks like the following.

   {universe prefix}  -> (gw, oif) [scope link]
                          |
                          |-> {link prefix} -> (gw, oif) [scope local]
                                                |
                                                |-> {local prefix} (terminal node)
 */

static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
{
        int err;

        if (nh->nh_gw) {
                struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
                if (nh->nh_flags&RTNH_F_PERVASIVE)
                        return 0;
#endif
                if (nh->nh_flags&RTNH_F_ONLINK) {
                        struct net_device *dev;

                        if (r->rtm_scope >= RT_SCOPE_LINK)
                                return -EINVAL;
                        if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
                                return -EINVAL;
                        if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
                                return -ENODEV;
                        if (!(dev->flags&IFF_UP))
                                return -ENETDOWN;
                        nh->nh_dev = dev;
                        dev_hold(dev);
                        nh->nh_scope = RT_SCOPE_LINK;
                        return 0;
                }
                {
                        struct flowi fl = { .nl_u = { .ip4_u =
                                                      { .daddr = nh->nh_gw,
                                                        .scope = r->rtm_scope + 1 } },
                                            .oif = nh->nh_oif };

                        /* It is not necessary, but requires a bit of thinking */
                        if (fl.fl4_scope < RT_SCOPE_LINK)
                                fl.fl4_scope = RT_SCOPE_LINK;
                        if ((err = fib_lookup(&fl, &res)) != 0)
                                return err;
                }
                err = -EINVAL;
                if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
                        goto out;
                nh->nh_scope = res.scope;
                nh->nh_oif = FIB_RES_OIF(res);
                if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
                        goto out;
                dev_hold(nh->nh_dev);
                err = -ENETDOWN;
                if (!(nh->nh_dev->flags & IFF_UP))
                        goto out;
                err = 0;
out:
                fib_res_put(&res);
                return err;
        } else {
                struct in_device *in_dev;

                if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
                        return -EINVAL;

                in_dev = inetdev_by_index(nh->nh_oif);
                if (in_dev == NULL)
                        return -ENODEV;
                if (!(in_dev->dev->flags&IFF_UP)) {
                        in_dev_put(in_dev);
                        return -ENETDOWN;
                }
                nh->nh_dev = in_dev->dev;
                dev_hold(nh->nh_dev);
                nh->nh_scope = RT_SCOPE_HOST;
                in_dev_put(in_dev);
        }
        return 0;
}

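/*
 * Editor's note: illustrative, editor-added sketch, not part of the original
 * file.  "Narrower scope" in the comment above maps to a numerically larger
 * RT_SCOPE_* value (RT_SCOPE_UNIVERSE = 0 ... RT_SCOPE_NOWHERE = 255), which
 * is why fib_check_nh() looks the gateway up with scope r->rtm_scope + 1.
 * The helper name is hypothetical.
 */
static inline int example_scope_is_narrower(unsigned char gw_scope,
                                            unsigned char route_scope)
{
        /* e.g. a RT_SCOPE_UNIVERSE route may use a RT_SCOPE_LINK gateway,
         * but not another RT_SCOPE_UNIVERSE one. */
        return gw_scope > route_scope;
}
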
static inline unsigned int fib_laddr_hashfn(u32 val)
{
        unsigned int mask = (fib_hash_size - 1);

        return (val ^ (val >> 7) ^ (val >> 14)) & mask;
}

static struct hlist_head *fib_hash_alloc(int bytes)
{
        if (bytes <= PAGE_SIZE)
                return kmalloc(bytes, GFP_KERNEL);
        else
                return (struct hlist_head *)
                        __get_free_pages(GFP_KERNEL, get_order(bytes));
}

static void fib_hash_free(struct hlist_head *hash, int bytes)
{
        if (!hash)
                return;

        if (bytes <= PAGE_SIZE)
                kfree(hash);
        else
                free_pages((unsigned long) hash, get_order(bytes));
}

static void fib_hash_move(struct hlist_head *new_info_hash,
                          struct hlist_head *new_laddrhash,
                          unsigned int new_size)
{
        unsigned int old_size = fib_hash_size;
        unsigned int i;

        write_lock(&fib_info_lock);
        fib_hash_size = new_size;

        for (i = 0; i < old_size; i++) {
                struct hlist_head *head = &fib_info_hash[i];
                struct hlist_node *node, *n;
                struct fib_info *fi;

                hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
                        struct hlist_head *dest;
                        unsigned int new_hash;

                        hlist_del(&fi->fib_hash);

                        new_hash = fib_info_hashfn(fi);
                        dest = &new_info_hash[new_hash];
                        hlist_add_head(&fi->fib_hash, dest);
                }
        }
        fib_info_hash = new_info_hash;

        for (i = 0; i < old_size; i++) {
                struct hlist_head *lhead = &fib_info_laddrhash[i];
                struct hlist_node *node, *n;
                struct fib_info *fi;

                hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
                        struct hlist_head *ldest;
                        unsigned int new_hash;

                        hlist_del(&fi->fib_lhash);

                        new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
                        ldest = &new_laddrhash[new_hash];
                        hlist_add_head(&fi->fib_lhash, ldest);
                }
        }
        fib_info_laddrhash = new_laddrhash;

        write_unlock(&fib_info_lock);
}

struct fib_info *
fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
                const struct nlmsghdr *nlh, int *errp)
{
        int err;
        struct fib_info *fi = NULL;
        struct fib_info *ofi;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        int nhs = 1;
#else
        const int nhs = 1;
#endif

        /* Fast check to catch the most weird cases */
        if (fib_props[r->rtm_type].scope > r->rtm_scope)
                goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (rta->rta_mp) {
                nhs = fib_count_nexthops(rta->rta_mp);
                if (nhs == 0)
                        goto err_inval;
        }
#endif

        err = -ENOBUFS;
        if (fib_info_cnt >= fib_hash_size) {
                unsigned int new_size = fib_hash_size << 1;
                struct hlist_head *new_info_hash;
                struct hlist_head *new_laddrhash;
                unsigned int bytes;

                if (!new_size)
                        new_size = 1;
                bytes = new_size * sizeof(struct hlist_head *);
                new_info_hash = fib_hash_alloc(bytes);
                new_laddrhash = fib_hash_alloc(bytes);
                if (!new_info_hash || !new_laddrhash) {
                        fib_hash_free(new_info_hash, bytes);
                        fib_hash_free(new_laddrhash, bytes);
                } else {
                        memset(new_info_hash, 0, bytes);
                        memset(new_laddrhash, 0, bytes);

                        fib_hash_move(new_info_hash, new_laddrhash, new_size);
                }

                if (!fib_hash_size)
                        goto failure;
        }

        fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
        if (fi == NULL)
                goto failure;
        fib_info_cnt++;
        memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));

        fi->fib_protocol = r->rtm_protocol;

        fi->fib_nhs = nhs;
        change_nexthops(fi) {
                nh->nh_parent = fi;
        } endfor_nexthops(fi)

        fi->fib_flags = r->rtm_flags;
        if (rta->rta_priority)
                fi->fib_priority = *rta->rta_priority;
        if (rta->rta_mx) {
                int attrlen = RTA_PAYLOAD(rta->rta_mx);
                struct rtattr *attr = RTA_DATA(rta->rta_mx);

                while (RTA_OK(attr, attrlen)) {
                        unsigned flavor = attr->rta_type;
                        if (flavor) {
                                if (flavor > RTAX_MAX)
                                        goto err_inval;
                                fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
                        }
                        attr = RTA_NEXT(attr, attrlen);
                }
        }
        if (rta->rta_prefsrc)
                memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);

        if (rta->rta_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
                        goto failure;
                if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
                        goto err_inval;
                if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
                        goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
                if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
                        goto err_inval;
#endif
#else
                goto err_inval;
#endif
        } else {
                struct fib_nh *nh = fi->fib_nh;
                if (rta->rta_oif)
                        nh->nh_oif = *rta->rta_oif;
                if (rta->rta_gw)
                        memcpy(&nh->nh_gw, rta->rta_gw, 4);
#ifdef CONFIG_NET_CLS_ROUTE
                if (rta->rta_flow)
                        memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
#endif
                nh->nh_flags = r->rtm_flags;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                nh->nh_weight = 1;
#endif
        }

        if (fib_props[r->rtm_type].error) {
                if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
                        goto err_inval;
                goto link_it;
        }

        if (r->rtm_scope > RT_SCOPE_HOST)
                goto err_inval;

        if (r->rtm_scope == RT_SCOPE_HOST) {
                struct fib_nh *nh = fi->fib_nh;

                /* Local address is added. */
                if (nhs != 1 || nh->nh_gw)
                        goto err_inval;
                nh->nh_scope = RT_SCOPE_NOWHERE;
                nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
                err = -ENODEV;
                if (nh->nh_dev == NULL)
                        goto failure;
        } else {
                change_nexthops(fi) {
                        if ((err = fib_check_nh(r, fi, nh)) != 0)
                                goto failure;
                } endfor_nexthops(fi)
        }

        if (fi->fib_prefsrc) {
                if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
                    memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
                        if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
                                goto err_inval;
        }

link_it:
        if ((ofi = fib_find_info(fi)) != NULL) {
                fi->fib_dead = 1;
                free_fib_info(fi);
                ofi->fib_treeref++;
                return ofi;
        }

        fi->fib_treeref++;
        atomic_inc(&fi->fib_clntref);
        write_lock(&fib_info_lock);
        hlist_add_head(&fi->fib_hash,
                       &fib_info_hash[fib_info_hashfn(fi)]);
        if (fi->fib_prefsrc) {
                struct hlist_head *head;

                head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
                hlist_add_head(&fi->fib_lhash, head);
        }
        change_nexthops(fi) {
                struct hlist_head *head;
                unsigned int hash;

                if (!nh->nh_dev)
                        continue;
                hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
                head = &fib_info_devhash[hash];
                hlist_add_head(&nh->nh_hash, head);
        } endfor_nexthops(fi)
        write_unlock(&fib_info_lock);
        return fi;

err_inval:
        err = -EINVAL;

failure:
        *errp = err;
        if (fi) {
                fi->fib_dead = 1;
                free_fib_info(fi);
        }
        return NULL;
}

int fib_semantic_match(struct list_head *head, const struct flowi *flp,
                       struct fib_result *res, int prefixlen)
{
        struct fib_alias *fa;
        int nh_sel = 0;

        list_for_each_entry(fa, head, fa_list) {
                int err;

                if (fa->fa_tos &&
                    fa->fa_tos != flp->fl4_tos)
                        continue;

                if (fa->fa_scope < flp->fl4_scope)
                        continue;

                fa->fa_state |= FA_S_ACCESSED;

                err = fib_props[fa->fa_type].error;
                if (err == 0) {
                        struct fib_info *fi = fa->fa_info;

                        if (fi->fib_flags & RTNH_F_DEAD)
                                continue;

                        switch (fa->fa_type) {
                        case RTN_UNICAST:
                        case RTN_LOCAL:
                        case RTN_BROADCAST:
                        case RTN_ANYCAST:
                        case RTN_MULTICAST:
                                for_nexthops(fi) {
                                        if (nh->nh_flags&RTNH_F_DEAD)
                                                continue;
                                        if (!flp->oif || flp->oif == nh->nh_oif)
                                                break;
                                }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                                if (nhsel < fi->fib_nhs) {
                                        nh_sel = nhsel;
                                        goto out_fill_res;
                                }
#else
                                if (nhsel < 1) {
                                        goto out_fill_res;
                                }
#endif
                                endfor_nexthops(fi);
                                continue;

                        default:
                                printk(KERN_DEBUG "impossible 102\n");
                                return -EINVAL;
                        };
                }
                return err;
        }
        return 1;

out_fill_res:
        res->prefixlen = prefixlen;
        res->nh_sel = nh_sel;
        res->type = fa->fa_type;
        res->scope = fa->fa_scope;
        res->fi = fa->fa_info;
        atomic_inc(&res->fi->fib_clntref);
        return 0;
}

/* Find an appropriate source address for this destination */

u32 __fib_res_prefsrc(struct fib_result *res)
{
        return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}

int
fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
              u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
              struct fib_info *fi)
{
        struct rtmsg *rtm;
        struct nlmsghdr  *nlh;
        unsigned char    *b = skb->tail;

        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
        rtm = NLMSG_DATA(nlh);
        rtm->rtm_family = AF_INET;
        rtm->rtm_dst_len = dst_len;
        rtm->rtm_src_len = 0;
        rtm->rtm_tos = tos;
        rtm->rtm_table = tb_id;
        rtm->rtm_type = type;
        rtm->rtm_flags = fi->fib_flags;
        rtm->rtm_scope = scope;
        if (rtm->rtm_dst_len)
                RTA_PUT(skb, RTA_DST, 4, dst);
        rtm->rtm_protocol = fi->fib_protocol;
        if (fi->fib_priority)
                RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
#ifdef CONFIG_NET_CLS_ROUTE
        if (fi->fib_nh[0].nh_tclassid)
                RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
#endif
        if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
                goto rtattr_failure;
        if (fi->fib_prefsrc)
                RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
        if (fi->fib_nhs == 1) {
                if (fi->fib_nh->nh_gw)
                        RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
                if (fi->fib_nh->nh_oif)
                        RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
        }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (fi->fib_nhs > 1) {
                struct rtnexthop *nhp;
                struct rtattr *mp_head;
                if (skb_tailroom(skb) <= RTA_SPACE(0))
                        goto rtattr_failure;
                mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));

                for_nexthops(fi) {
                        if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
                                goto rtattr_failure;
                        nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
                        nhp->rtnh_flags = nh->nh_flags & 0xFF;
                        nhp->rtnh_hops = nh->nh_weight-1;
                        nhp->rtnh_ifindex = nh->nh_oif;
                        if (nh->nh_gw)
                                RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
                        nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
                } endfor_nexthops(fi);
                mp_head->rta_type = RTA_MULTIPATH;
                mp_head->rta_len = skb->tail - (u8*)mp_head;
        }
#endif
        nlh->nlmsg_len = skb->tail - b;
        return skb->len;

nlmsg_failure:
rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}

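/*
 * Editor's note: illustrative, editor-added sketch, not part of the original
 * file.  It shows roughly how a table-dump callback would emit one route via
 * fib_dump_info(); every name except fib_dump_info() itself is invented for
 * illustration.
 */
static inline int example_dump_one_route(struct sk_buff *skb,
                                         struct netlink_callback *cb,
                                         u8 tb_id, u32 prefix, int plen,
                                         struct fib_alias *fa)
{
        return fib_dump_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
                             RTM_NEWROUTE, tb_id, fa->fa_type, fa->fa_scope,
                             &prefix, plen, fa->fa_tos, fa->fa_info);
}
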
#ifndef CONFIG_IP_NOSIOCRT

int
fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
                    struct kern_rta *rta, struct rtentry *r)
{
        int    plen;
        u32    *ptr;

        memset(rtm, 0, sizeof(*rtm));
        memset(rta, 0, sizeof(*rta));

        if (r->rt_dst.sa_family != AF_INET)
                return -EAFNOSUPPORT;

        /* Check the mask for validity:
           a) it must be contiguous.
           b) the destination must have all host bits clear.
           c) if the application forgot to set the correct family (AF_INET),
              reject the request unless it is absolutely clear, i.e.
              both family and mask are zero.
         */
        plen = 32;
        ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
        if (!(r->rt_flags&RTF_HOST)) {
                u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
                if (r->rt_genmask.sa_family != AF_INET) {
                        if (mask || r->rt_genmask.sa_family)
                                return -EAFNOSUPPORT;
                }
                if (bad_mask(mask, *ptr))
                        return -EINVAL;
                plen = inet_mask_len(mask);
        }

        nl->nlmsg_flags = NLM_F_REQUEST;
        nl->nlmsg_pid = 0;
        nl->nlmsg_seq = 0;
        nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
        if (cmd == SIOCDELRT) {
                nl->nlmsg_type = RTM_DELROUTE;
                nl->nlmsg_flags = 0;
        } else {
                nl->nlmsg_type = RTM_NEWROUTE;
                nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
                rtm->rtm_protocol = RTPROT_BOOT;
        }

        rtm->rtm_dst_len = plen;
        rta->rta_dst = ptr;

        if (r->rt_metric) {
                *(u32*)&r->rt_pad3 = r->rt_metric - 1;
                rta->rta_priority = (u32*)&r->rt_pad3;
        }
        if (r->rt_flags&RTF_REJECT) {
                rtm->rtm_scope = RT_SCOPE_HOST;
                rtm->rtm_type = RTN_UNREACHABLE;
                return 0;
        }
        rtm->rtm_scope = RT_SCOPE_NOWHERE;
        rtm->rtm_type = RTN_UNICAST;

        if (r->rt_dev) {
                char *colon;
                struct net_device *dev;
                char   devname[IFNAMSIZ];

                if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
                        return -EFAULT;
                devname[IFNAMSIZ-1] = 0;
                colon = strchr(devname, ':');
                if (colon)
                        *colon = 0;
                dev = __dev_get_by_name(devname);
                if (!dev)
                        return -ENODEV;
                rta->rta_oif = &dev->ifindex;
                if (colon) {
                        struct in_ifaddr *ifa;
                        struct in_device *in_dev = __in_dev_get(dev);
                        if (!in_dev)
                                return -ENODEV;
                        *colon = ':';
                        for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
                                if (strcmp(ifa->ifa_label, devname) == 0)
                                        break;
                        if (ifa == NULL)
                                return -ENODEV;
                        rta->rta_prefsrc = &ifa->ifa_local;
                }
        }

        ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
        if (r->rt_gateway.sa_family == AF_INET && *ptr) {
                rta->rta_gw = ptr;
                if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
                        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        }

        if (cmd == SIOCDELRT)
                return 0;

        if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
                return -EINVAL;

        if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
                rtm->rtm_scope = RT_SCOPE_LINK;

        if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
                struct rtattr *rec;
                struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
                if (mx == NULL)
                        return -ENOMEM;
                rta->rta_mx = mx;
                mx->rta_type = RTA_METRICS;
                mx->rta_len  = RTA_LENGTH(0);
                if (r->rt_flags&RTF_MTU) {
                        rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
                        rec->rta_type = RTAX_ADVMSS;
                        rec->rta_len = RTA_LENGTH(4);
                        mx->rta_len += RTA_LENGTH(4);
                        *(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
                }
                if (r->rt_flags&RTF_WINDOW) {
                        rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
                        rec->rta_type = RTAX_WINDOW;
                        rec->rta_len = RTA_LENGTH(4);
                        mx->rta_len += RTA_LENGTH(4);
                        *(u32*)RTA_DATA(rec) = r->rt_window;
                }
                if (r->rt_flags&RTF_IRTT) {
                        rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
                        rec->rta_type = RTAX_RTT;
                        rec->rta_len = RTA_LENGTH(4);
                        mx->rta_len += RTA_LENGTH(4);
                        *(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
                }
        }
        return 0;
}

#endif

/*
   Update the FIB if:
   - a local address disappeared -> we must delete all the entries
     referring to it.
   - a device went down -> we must shut down all nexthops going via it.
 */

int fib_sync_down(u32 local, struct net_device *dev, int force)
{
        int ret = 0;
        int scope = RT_SCOPE_NOWHERE;

        if (force)
                scope = -1;

        if (local && fib_info_laddrhash) {
                unsigned int hash = fib_laddr_hashfn(local);
                struct hlist_head *head = &fib_info_laddrhash[hash];
                struct hlist_node *node;
                struct fib_info *fi;

                hlist_for_each_entry(fi, node, head, fib_lhash) {
                        if (fi->fib_prefsrc == local) {
                                fi->fib_flags |= RTNH_F_DEAD;
                                ret++;
                        }
                }
        }

        if (dev) {
                struct fib_info *prev_fi = NULL;
                unsigned int hash = fib_devindex_hashfn(dev->ifindex);
                struct hlist_head *head = &fib_info_devhash[hash];
                struct hlist_node *node;
                struct fib_nh *nh;

                hlist_for_each_entry(nh, node, head, nh_hash) {
                        struct fib_info *fi = nh->nh_parent;
                        int dead;

                        BUG_ON(!fi->fib_nhs);
                        if (nh->nh_dev != dev || fi == prev_fi)
                                continue;
                        prev_fi = fi;
                        dead = 0;
                        change_nexthops(fi) {
                                if (nh->nh_flags&RTNH_F_DEAD)
                                        dead++;
                                else if (nh->nh_dev == dev &&
                                         nh->nh_scope != scope) {
                                        nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                                        spin_lock_bh(&fib_multipath_lock);
                                        fi->fib_power -= nh->nh_power;
                                        nh->nh_power = 0;
                                        spin_unlock_bh(&fib_multipath_lock);
#endif
                                        dead++;
                                }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                                if (force > 1 && nh->nh_dev == dev) {
                                        dead = fi->fib_nhs;
                                        break;
                                }
#endif
                        } endfor_nexthops(fi)
                        if (dead == fi->fib_nhs) {
                                fi->fib_flags |= RTNH_F_DEAD;
                                ret++;
                        }
                }
        }

        return ret;
}

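/*
 * Editor's note: illustrative, editor-added sketch, not part of the original
 * file.  fib_sync_down() is driven from device/address event handlers; a
 * hypothetical handler shape, for illustration only:
 */
static inline int example_handle_dev_down(struct net_device *dev)
{
        /* Mark every nexthop routed via dev as dead.  A non-zero return is
         * the number of fib_info records that died, which would prompt the
         * real caller to flush its routing cache. */
        return fib_sync_down(0, dev, 0);
}
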
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
   A dead device goes up: wake up its dead nexthops.
   This makes sense only for multipath routes.
 */

int fib_sync_up(struct net_device *dev)
{
        struct fib_info *prev_fi;
        unsigned int hash;
        struct hlist_head *head;
        struct hlist_node *node;
        struct fib_nh *nh;
        int ret;

        if (!(dev->flags&IFF_UP))
                return 0;

        prev_fi = NULL;
        hash = fib_devindex_hashfn(dev->ifindex);
        head = &fib_info_devhash[hash];
        ret = 0;

        hlist_for_each_entry(nh, node, head, nh_hash) {
                struct fib_info *fi = nh->nh_parent;
                int alive;

                BUG_ON(!fi->fib_nhs);
                if (nh->nh_dev != dev || fi == prev_fi)
                        continue;

                prev_fi = fi;
                alive = 0;
                change_nexthops(fi) {
                        if (!(nh->nh_flags&RTNH_F_DEAD)) {
                                alive++;
                                continue;
                        }
                        if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
                                continue;
                        if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
                                continue;
                        alive++;
                        spin_lock_bh(&fib_multipath_lock);
                        nh->nh_power = 0;
                        nh->nh_flags &= ~RTNH_F_DEAD;
                        spin_unlock_bh(&fib_multipath_lock);
                } endfor_nexthops(fi)

                if (alive > 0) {
                        fi->fib_flags &= ~RTNH_F_DEAD;
                        ret++;
                }
        }

        return ret;
}

/*
   The algorithm is suboptimal, but it provides really
   fair weighted route distribution.
 */

void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
        struct fib_info *fi = res->fi;
        int w;

        spin_lock_bh(&fib_multipath_lock);
        if (fi->fib_power <= 0) {
                int power = 0;
                change_nexthops(fi) {
                        if (!(nh->nh_flags&RTNH_F_DEAD)) {
                                power += nh->nh_weight;
                                nh->nh_power = nh->nh_weight;
                        }
                } endfor_nexthops(fi);
                fi->fib_power = power;
                if (power <= 0) {
                        spin_unlock_bh(&fib_multipath_lock);
                        /* Race condition: route has just become dead. */
                        res->nh_sel = 0;
                        return;
                }
        }


        /* w should be a random number in [0..fi->fib_power-1];
           jiffies is a pretty bad approximation of one.
         */

        w = jiffies % fi->fib_power;

        change_nexthops(fi) {
                if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
                        if ((w -= nh->nh_power) <= 0) {
                                nh->nh_power--;
                                fi->fib_power--;
                                res->nh_sel = nhsel;
                                spin_unlock_bh(&fib_multipath_lock);
                                return;
                        }
                }
        } endfor_nexthops(fi);

        /* Race condition: route has just become dead. */
        res->nh_sel = 0;
        spin_unlock_bh(&fib_multipath_lock);
}
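
/*
 * Editor's note: illustrative, editor-added sketch, not part of the original
 * file.  A hypothetical caller pattern: after a lookup returns a multipath
 * route and no output device was forced, the nexthop is chosen roughly like
 * this.  FIB_RES_GW() comes from net/ip_fib.h; the helper name is invented.
 */
static inline u32 example_pick_multipath_gw(const struct flowi *flp,
                                            struct fib_result *res)
{
        if (res->fi && res->fi->fib_nhs > 1 && flp->oif == 0)
                fib_select_multipath(flp, res);
        return FIB_RES_GW(*res);        /* gateway of the chosen nexthop */
}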
#endif