vserver 1.9.3
[linux-2.6.git] / net / ipv4 / fib_semantics.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Version:     $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9  *
10  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  */
17
18 #include <linux/config.h>
19 #include <asm/uaccess.h>
20 #include <asm/system.h>
21 #include <asm/bitops.h>
22 #include <linux/types.h>
23 #include <linux/kernel.h>
24 #include <linux/jiffies.h>
25 #include <linux/mm.h>
26 #include <linux/string.h>
27 #include <linux/socket.h>
28 #include <linux/sockios.h>
29 #include <linux/errno.h>
30 #include <linux/in.h>
31 #include <linux/inet.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/netlink.h>
37 #include <linux/init.h>
38
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/ip_fib.h>
45
46 #include "fib_lookup.h"
47
/* Debug trace hook; expands to nothing, so FSprintk() calls compile away. */
#define FSprintk(a...)

/*
 * fib_info_lock guards the three hash tables below.  In this file it is
 * taken for writing when entries are linked, unlinked or rehashed, and for
 * reading in ip_fib_check_default().
 */
static rwlock_t fib_info_lock = RW_LOCK_UNLOCKED;
/* All fib_info records, bucketed by fib_info_hashfn(). */
static struct hlist_head *fib_info_hash;
/* fib_info records that have a preferred source, bucketed by that address. */
static struct hlist_head *fib_info_laddrhash;
/*
 * Number of buckets in each of the two tables above.  The hash functions
 * mask with (fib_hash_size - 1); fib_create_info() grows it by doubling.
 */
static unsigned int fib_hash_size;
/* Count of allocated fib_info records (bumped on alloc, dropped on free). */
static unsigned int fib_info_cnt;

/* Fixed-size table of nexthops, bucketed by the ifindex of their device. */
#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) / change_nexthops(fi) open a brace, declare the loop
 * variables "nhsel" (index) and "nh" (current nexthop, const for
 * for_nexthops, writable for change_nexthops) and start a for loop whose
 * body follows the macro.  endfor_nexthops(fi) closes the brace opened by
 * the macro, so every use must be paired with it.  Because "nhsel" and "nh"
 * live in that outer brace they remain visible after the loop body until
 * endfor_nexthops().
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* NOTE(review): not referenced in the part of the file visible here. */
static spinlock_t fib_multipath_lock = SPIN_LOCK_UNLOCKED;

#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope, that gcc will optimize it to get rid of dummy loop */

/* Without multipath the loop runs exactly once, on the first nexthop. */
#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
83
84
85 static struct 
86 {
87         int     error;
88         u8      scope;
89 } fib_props[RTA_MAX + 1] = {
90         {
91                 .error  = 0,
92                 .scope  = RT_SCOPE_NOWHERE,
93         },      /* RTN_UNSPEC */
94         {
95                 .error  = 0,
96                 .scope  = RT_SCOPE_UNIVERSE,
97         },      /* RTN_UNICAST */
98         {
99                 .error  = 0,
100                 .scope  = RT_SCOPE_HOST,
101         },      /* RTN_LOCAL */
102         {
103                 .error  = 0,
104                 .scope  = RT_SCOPE_LINK,
105         },      /* RTN_BROADCAST */
106         {
107                 .error  = 0,
108                 .scope  = RT_SCOPE_LINK,
109         },      /* RTN_ANYCAST */
110         {
111                 .error  = 0,
112                 .scope  = RT_SCOPE_UNIVERSE,
113         },      /* RTN_MULTICAST */
114         {
115                 .error  = -EINVAL,
116                 .scope  = RT_SCOPE_UNIVERSE,
117         },      /* RTN_BLACKHOLE */
118         {
119                 .error  = -EHOSTUNREACH,
120                 .scope  = RT_SCOPE_UNIVERSE,
121         },      /* RTN_UNREACHABLE */
122         {
123                 .error  = -EACCES,
124                 .scope  = RT_SCOPE_UNIVERSE,
125         },      /* RTN_PROHIBIT */
126         {
127                 .error  = -EAGAIN,
128                 .scope  = RT_SCOPE_UNIVERSE,
129         },      /* RTN_THROW */
130         {
131                 .error  = -EINVAL,
132                 .scope  = RT_SCOPE_NOWHERE,
133         },      /* RTN_NAT */
134         {
135                 .error  = -EINVAL,
136                 .scope  = RT_SCOPE_NOWHERE,
137         },      /* RTN_XRESOLVE */
138 };
139
140
141 /* Release a nexthop info record */
142
143 void free_fib_info(struct fib_info *fi)
144 {
145         if (fi->fib_dead == 0) {
146                 printk("Freeing alive fib_info %p\n", fi);
147                 return;
148         }
149         change_nexthops(fi) {
150                 if (nh->nh_dev)
151                         dev_put(nh->nh_dev);
152                 nh->nh_dev = NULL;
153         } endfor_nexthops(fi);
154         fib_info_cnt--;
155         kfree(fi);
156 }
157
158 void fib_release_info(struct fib_info *fi)
159 {
160         write_lock(&fib_info_lock);
161         if (fi && --fi->fib_treeref == 0) {
162                 hlist_del(&fi->fib_hash);
163                 if (fi->fib_prefsrc)
164                         hlist_del(&fi->fib_lhash);
165                 change_nexthops(fi) {
166                         hlist_del(&nh->nh_hash);
167                 } endfor_nexthops(fi)
168                 fi->fib_dead = 1;
169                 fib_info_put(fi);
170         }
171         write_unlock(&fib_info_lock);
172 }
173
174 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
175 {
176         const struct fib_nh *onh = ofi->fib_nh;
177
178         for_nexthops(fi) {
179                 if (nh->nh_oif != onh->nh_oif ||
180                     nh->nh_gw  != onh->nh_gw ||
181                     nh->nh_scope != onh->nh_scope ||
182 #ifdef CONFIG_IP_ROUTE_MULTIPATH
183                     nh->nh_weight != onh->nh_weight ||
184 #endif
185 #ifdef CONFIG_NET_CLS_ROUTE
186                     nh->nh_tclassid != onh->nh_tclassid ||
187 #endif
188                     ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
189                         return -1;
190                 onh++;
191         } endfor_nexthops(fi);
192         return 0;
193 }
194
195 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
196 {
197         unsigned int mask = (fib_hash_size - 1);
198         unsigned int val = fi->fib_nhs;
199
200         val ^= fi->fib_protocol;
201         val ^= fi->fib_prefsrc;
202         val ^= fi->fib_priority;
203
204         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
205 }
206
207 static struct fib_info *fib_find_info(const struct fib_info *nfi)
208 {
209         struct hlist_head *head;
210         struct hlist_node *node;
211         struct fib_info *fi;
212         unsigned int hash;
213
214         hash = fib_info_hashfn(nfi);
215         head = &fib_info_hash[hash];
216
217         hlist_for_each_entry(fi, node, head, fib_hash) {
218                 if (fi->fib_nhs != nfi->fib_nhs)
219                         continue;
220                 if (nfi->fib_protocol == fi->fib_protocol &&
221                     nfi->fib_prefsrc == fi->fib_prefsrc &&
222                     nfi->fib_priority == fi->fib_priority &&
223                     memcmp(nfi->fib_metrics, fi->fib_metrics,
224                            sizeof(fi->fib_metrics)) == 0 &&
225                     ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
226                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
227                         return fi;
228         }
229
230         return NULL;
231 }
232
233 static inline unsigned int fib_devindex_hashfn(unsigned int val)
234 {
235         unsigned int mask = DEVINDEX_HASHSIZE - 1;
236
237         return (val ^
238                 (val >> DEVINDEX_HASHBITS) ^
239                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
240 }
241
242 /* Check, that the gateway is already configured.
243    Used only by redirect accept routine.
244  */
245
246 int ip_fib_check_default(u32 gw, struct net_device *dev)
247 {
248         struct hlist_head *head;
249         struct hlist_node *node;
250         struct fib_nh *nh;
251         unsigned int hash;
252
253         read_lock(&fib_info_lock);
254
255         hash = fib_devindex_hashfn(dev->ifindex);
256         head = &fib_info_devhash[hash];
257         hlist_for_each_entry(nh, node, head, nh_hash) {
258                 if (nh->nh_dev == dev &&
259                     nh->nh_gw == gw &&
260                     !(nh->nh_flags&RTNH_F_DEAD)) {
261                         read_unlock(&fib_info_lock);
262                         return 0;
263                 }
264         }
265
266         read_unlock(&fib_info_lock);
267
268         return -1;
269 }
270
271 #ifdef CONFIG_IP_ROUTE_MULTIPATH
272
273 static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
274 {
275         while (RTA_OK(attr,attrlen)) {
276                 if (attr->rta_type == type)
277                         return *(u32*)RTA_DATA(attr);
278                 attr = RTA_NEXT(attr, attrlen);
279         }
280         return 0;
281 }
282
283 static int
284 fib_count_nexthops(struct rtattr *rta)
285 {
286         int nhs = 0;
287         struct rtnexthop *nhp = RTA_DATA(rta);
288         int nhlen = RTA_PAYLOAD(rta);
289
290         while (nhlen >= (int)sizeof(struct rtnexthop)) {
291                 if ((nhlen -= nhp->rtnh_len) < 0)
292                         return 0;
293                 nhs++;
294                 nhp = RTNH_NEXT(nhp);
295         };
296         return nhs;
297 }
298
299 static int
300 fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
301 {
302         struct rtnexthop *nhp = RTA_DATA(rta);
303         int nhlen = RTA_PAYLOAD(rta);
304
305         change_nexthops(fi) {
306                 int attrlen = nhlen - sizeof(struct rtnexthop);
307                 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
308                         return -EINVAL;
309                 nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
310                 nh->nh_oif = nhp->rtnh_ifindex;
311                 nh->nh_weight = nhp->rtnh_hops + 1;
312                 if (attrlen) {
313                         nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
314 #ifdef CONFIG_NET_CLS_ROUTE
315                         nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
316 #endif
317                 }
318                 nhp = RTNH_NEXT(nhp);
319         } endfor_nexthops(fi);
320         return 0;
321 }
322
323 #endif
324
325 int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
326                  struct fib_info *fi)
327 {
328 #ifdef CONFIG_IP_ROUTE_MULTIPATH
329         struct rtnexthop *nhp;
330         int nhlen;
331 #endif
332
333         if (rta->rta_priority &&
334             *rta->rta_priority != fi->fib_priority)
335                 return 1;
336
337         if (rta->rta_oif || rta->rta_gw) {
338                 if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
339                     (!rta->rta_gw  || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
340                         return 0;
341                 return 1;
342         }
343
344 #ifdef CONFIG_IP_ROUTE_MULTIPATH
345         if (rta->rta_mp == NULL)
346                 return 0;
347         nhp = RTA_DATA(rta->rta_mp);
348         nhlen = RTA_PAYLOAD(rta->rta_mp);
349         
350         for_nexthops(fi) {
351                 int attrlen = nhlen - sizeof(struct rtnexthop);
352                 u32 gw;
353
354                 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
355                         return -EINVAL;
356                 if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
357                         return 1;
358                 if (attrlen) {
359                         gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
360                         if (gw && gw != nh->nh_gw)
361                                 return 1;
362 #ifdef CONFIG_NET_CLS_ROUTE
363                         gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
364                         if (gw && gw != nh->nh_tclassid)
365                                 return 1;
366 #endif
367                 }
368                 nhp = RTNH_NEXT(nhp);
369         } endfor_nexthops(fi);
370 #endif
371         return 0;
372 }
373
374
375 /*
376    Picture
377    -------
378
379    Semantics of nexthop is very messy by historical reasons.
380    We have to take into account, that:
381    a) gateway can be actually local interface address,
382       so that gatewayed route is direct.
383    b) gateway must be on-link address, possibly
384       described not by an ifaddr, but also by a direct route.
385    c) If both gateway and interface are specified, they should not
386       contradict.
387    d) If we use tunnel routes, gateway could be not on-link.
388
389    Attempt to reconcile all of these (alas, self-contradictory) conditions
390    results in pretty ugly and hairy code with obscure logic.
391
392    I chose to generalize it instead, so that the size
393    of the code practically does not increase, but it becomes
394    much more general.
395    Every prefix is assigned a "scope" value: "host" is local address,
396    "link" is direct route,
397    [ ... "site" ... "interior" ... ]
398    and "universe" is true gateway route with global meaning.
399
400    Every prefix refers to a set of "nexthop"s (gw, oif),
401    where gw must have narrower scope. This recursion stops
402    when gw has LOCAL scope or if "nexthop" is declared ONLINK,
403    which means that gw is forced to be on link.
404
405    The code is still hairy, but now it is apparently logically
406    consistent and very flexible. E.g., as a by-product, it allows
407    independent exterior and interior routing processes to
408    co-exist in peace.
409
410    Normally it looks as following.
411
412    {universe prefix}  -> (gw, oif) [scope link]
413                           |
414                           |-> {link prefix} -> (gw, oif) [scope local]
415                                                 |
416                                                 |-> {local prefix} (terminal node)
417  */
418
/*
 * Validate one nexthop @nh of the route being created (@r) and resolve its
 * device and scope.
 *
 * Returns 0 on success or a negative errno.  On success nh->nh_dev holds a
 * device reference (except on the PERVASIVE shortcut, which returns before
 * any device is looked up).  Note that the gatewayed lookup path can return
 * -ENETDOWN with nh->nh_dev already set and held; that reference is dropped
 * later when the owning fib_info goes through free_fib_info().
 *
 * @fi is not used in this function.
 */
static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
{
        int err;

        if (nh->nh_gw) {
                struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
                if (nh->nh_flags&RTNH_F_PERVASIVE)
                        return 0;
#endif
                if (nh->nh_flags&RTNH_F_ONLINK) {
                        /*
                         * Gateway is declared on-link: no route lookup, only
                         * sanity checks on the route scope, the gateway
                         * address type and the output device.
                         */
                        struct net_device *dev;

                        if (r->rtm_scope >= RT_SCOPE_LINK)
                                return -EINVAL;
                        if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
                                return -EINVAL;
                        if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
                                return -ENODEV;
                        if (!(dev->flags&IFF_UP))
                                return -ENETDOWN;
                        nh->nh_dev = dev;
                        dev_hold(dev);
                        nh->nh_scope = RT_SCOPE_LINK;
                        return 0;
                }
                {
                        /*
                         * Resolve the gateway through the FIB itself, asking
                         * for a scope one step above the route's own scope
                         * (and never below RT_SCOPE_LINK), optionally bound
                         * to the requested output interface.
                         */
                        struct flowi fl = { .nl_u = { .ip4_u =
                                                      { .daddr = nh->nh_gw,
                                                        .scope = r->rtm_scope + 1 } },
                                            .oif = nh->nh_oif };

                        /* It is not necessary, but requires a bit of thinking */
                        if (fl.fl4_scope < RT_SCOPE_LINK)
                                fl.fl4_scope = RT_SCOPE_LINK;
                        if ((err = fib_lookup(&fl, &res)) != 0)
                                return err;
                }
                /* The gateway must resolve to a unicast or local route. */
                err = -EINVAL;
                if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
                        goto out;
                /* Inherit scope, interface and device from the lookup result. */
                nh->nh_scope = res.scope;
                nh->nh_oif = FIB_RES_OIF(res);
                if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
                        goto out;
                dev_hold(nh->nh_dev);
                err = -ENETDOWN;
                if (!(nh->nh_dev->flags & IFF_UP))
                        goto out;
                err = 0;
out:
                /* Always release the lookup result obtained above. */
                fib_res_put(&res);
                return err;
        } else {
                /* No gateway: a direct route through nh_oif. */
                struct in_device *in_dev;

                if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
                        return -EINVAL;

                in_dev = inetdev_by_index(nh->nh_oif);
                if (in_dev == NULL)
                        return -ENODEV;
                if (!(in_dev->dev->flags&IFF_UP)) {
                        in_dev_put(in_dev);
                        return -ENETDOWN;
                }
                /* Keep a reference on the device, not on the in_device. */
                nh->nh_dev = in_dev->dev;
                dev_hold(nh->nh_dev);
                nh->nh_scope = RT_SCOPE_HOST;
                in_dev_put(in_dev);
        }
        return 0;
}
493
494 static inline unsigned int fib_laddr_hashfn(u32 val)
495 {
496         unsigned int mask = (fib_hash_size - 1);
497
498         return (val ^ (val >> 7) ^ (val >> 14)) & mask;
499 }
500
501 static struct hlist_head *fib_hash_alloc(int bytes)
502 {
503         if (bytes <= PAGE_SIZE)
504                 return kmalloc(bytes, GFP_KERNEL);
505         else
506                 return (struct hlist_head *)
507                         __get_free_pages(GFP_KERNEL, get_order(bytes));
508 }
509
510 static void fib_hash_free(struct hlist_head *hash, int bytes)
511 {
512         if (!hash)
513                 return;
514
515         if (bytes <= PAGE_SIZE)
516                 kfree(hash);
517         else
518                 free_pages((unsigned long) hash, get_order(bytes));
519 }
520
521 static void fib_hash_move(struct hlist_head *new_info_hash,
522                           struct hlist_head *new_laddrhash,
523                           unsigned int new_size)
524 {
525         unsigned int old_size = fib_hash_size;
526         unsigned int i;
527
528         write_lock(&fib_info_lock);
529         fib_hash_size = new_size;
530
531         for (i = 0; i < old_size; i++) {
532                 struct hlist_head *head = &fib_info_hash[i];
533                 struct hlist_node *node, *n;
534                 struct fib_info *fi;
535
536                 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
537                         struct hlist_head *dest;
538                         unsigned int new_hash;
539
540                         hlist_del(&fi->fib_hash);
541
542                         new_hash = fib_info_hashfn(fi);
543                         dest = &new_info_hash[new_hash];
544                         hlist_add_head(&fi->fib_hash, dest);
545                 }
546         }
547         fib_info_hash = new_info_hash;
548
549         for (i = 0; i < old_size; i++) {
550                 struct hlist_head *lhead = &fib_info_laddrhash[i];
551                 struct hlist_node *node, *n;
552                 struct fib_info *fi;
553
554                 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
555                         struct hlist_head *ldest;
556                         unsigned int new_hash;
557
558                         hlist_del(&fi->fib_lhash);
559
560                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
561                         ldest = &new_laddrhash[new_hash];
562                         hlist_add_head(&fi->fib_lhash, ldest);
563                 }
564         }
565         fib_info_laddrhash = new_laddrhash;
566
567         write_unlock(&fib_info_lock);
568 }
569
570 struct fib_info *
571 fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
572                 const struct nlmsghdr *nlh, int *errp)
573 {
574         int err;
575         struct fib_info *fi = NULL;
576         struct fib_info *ofi;
577 #ifdef CONFIG_IP_ROUTE_MULTIPATH
578         int nhs = 1;
579 #else
580         const int nhs = 1;
581 #endif
582
583         /* Fast check to catch the most weird cases */
584         if (fib_props[r->rtm_type].scope > r->rtm_scope)
585                 goto err_inval;
586
587 #ifdef CONFIG_IP_ROUTE_MULTIPATH
588         if (rta->rta_mp) {
589                 nhs = fib_count_nexthops(rta->rta_mp);
590                 if (nhs == 0)
591                         goto err_inval;
592         }
593 #endif
594
595         err = -ENOBUFS;
596         if (fib_info_cnt >= fib_hash_size) {
597                 unsigned int new_size = fib_hash_size << 1;
598                 struct hlist_head *new_info_hash;
599                 struct hlist_head *new_laddrhash;
600                 unsigned int bytes;
601
602                 if (!new_size)
603                         new_size = 1;
604                 bytes = new_size * sizeof(struct hlist_head *);
605                 new_info_hash = fib_hash_alloc(bytes);
606                 new_laddrhash = fib_hash_alloc(bytes);
607                 if (!new_info_hash || !new_laddrhash) {
608                         fib_hash_free(new_info_hash, bytes);
609                         fib_hash_free(new_laddrhash, bytes);
610                 } else {
611                         memset(new_info_hash, 0, bytes);
612                         memset(new_laddrhash, 0, bytes);
613
614                         fib_hash_move(new_info_hash, new_laddrhash, new_size);
615                 }
616
617                 if (!fib_hash_size)
618                         goto failure;
619         }
620
621         fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
622         if (fi == NULL)
623                 goto failure;
624         fib_info_cnt++;
625         memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));
626
627         fi->fib_protocol = r->rtm_protocol;
628
629         fi->fib_nhs = nhs;
630         change_nexthops(fi) {
631                 nh->nh_parent = fi;
632         } endfor_nexthops(fi)
633
634         fi->fib_flags = r->rtm_flags;
635         if (rta->rta_priority)
636                 fi->fib_priority = *rta->rta_priority;
637         if (rta->rta_mx) {
638                 int attrlen = RTA_PAYLOAD(rta->rta_mx);
639                 struct rtattr *attr = RTA_DATA(rta->rta_mx);
640
641                 while (RTA_OK(attr, attrlen)) {
642                         unsigned flavor = attr->rta_type;
643                         if (flavor) {
644                                 if (flavor > RTAX_MAX)
645                                         goto err_inval;
646                                 fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
647                         }
648                         attr = RTA_NEXT(attr, attrlen);
649                 }
650         }
651         if (rta->rta_prefsrc)
652                 memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
653
654         if (rta->rta_mp) {
655 #ifdef CONFIG_IP_ROUTE_MULTIPATH
656                 if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
657                         goto failure;
658                 if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
659                         goto err_inval;
660                 if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
661                         goto err_inval;
662 #ifdef CONFIG_NET_CLS_ROUTE
663                 if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
664                         goto err_inval;
665 #endif
666 #else
667                 goto err_inval;
668 #endif
669         } else {
670                 struct fib_nh *nh = fi->fib_nh;
671                 if (rta->rta_oif)
672                         nh->nh_oif = *rta->rta_oif;
673                 if (rta->rta_gw)
674                         memcpy(&nh->nh_gw, rta->rta_gw, 4);
675 #ifdef CONFIG_NET_CLS_ROUTE
676                 if (rta->rta_flow)
677                         memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
678 #endif
679                 nh->nh_flags = r->rtm_flags;
680 #ifdef CONFIG_IP_ROUTE_MULTIPATH
681                 nh->nh_weight = 1;
682 #endif
683         }
684
685         if (fib_props[r->rtm_type].error) {
686                 if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
687                         goto err_inval;
688                 goto link_it;
689         }
690
691         if (r->rtm_scope > RT_SCOPE_HOST)
692                 goto err_inval;
693
694         if (r->rtm_scope == RT_SCOPE_HOST) {
695                 struct fib_nh *nh = fi->fib_nh;
696
697                 /* Local address is added. */
698                 if (nhs != 1 || nh->nh_gw)
699                         goto err_inval;
700                 nh->nh_scope = RT_SCOPE_NOWHERE;
701                 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
702                 err = -ENODEV;
703                 if (nh->nh_dev == NULL)
704                         goto failure;
705         } else {
706                 change_nexthops(fi) {
707                         if ((err = fib_check_nh(r, fi, nh)) != 0)
708                                 goto failure;
709                 } endfor_nexthops(fi)
710         }
711
712         if (fi->fib_prefsrc) {
713                 if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
714                     memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
715                         if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
716                                 goto err_inval;
717         }
718
719 link_it:
720         if ((ofi = fib_find_info(fi)) != NULL) {
721                 fi->fib_dead = 1;
722                 free_fib_info(fi);
723                 ofi->fib_treeref++;
724                 return ofi;
725         }
726
727         fi->fib_treeref++;
728         atomic_inc(&fi->fib_clntref);
729         write_lock(&fib_info_lock);
730         hlist_add_head(&fi->fib_hash,
731                        &fib_info_hash[fib_info_hashfn(fi)]);
732         if (fi->fib_prefsrc) {
733                 struct hlist_head *head;
734
735                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
736                 hlist_add_head(&fi->fib_lhash, head);
737         }
738         change_nexthops(fi) {
739                 struct hlist_head *head;
740                 unsigned int hash;
741
742                 if (!nh->nh_dev)
743                         continue;
744                 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
745                 head = &fib_info_devhash[hash];
746                 hlist_add_head(&nh->nh_hash, head);
747         } endfor_nexthops(fi)
748         write_unlock(&fib_info_lock);
749         return fi;
750
751 err_inval:
752         err = -EINVAL;
753
754 failure:
755         *errp = err;
756         if (fi) {
757                 fi->fib_dead = 1;
758                 free_fib_info(fi);
759         }
760         return NULL;
761 }
762
/*
 * Select, among the aliases on @head, the first route usable for flow @flp
 * and describe it in @res.
 *
 * Returns:
 *   0        a usable nexthop was found; @res is filled in and a client
 *            reference is taken on res->fi
 *   1        no alias on the list matched
 *   < 0      the matching alias is an error-type route (value taken from
 *            fib_props[]), or -EINVAL for an unexpected route type
 *
 * Aliases are skipped when their TOS is set and differs from the flow's,
 * when their scope is below the requested scope, when their fib_info is
 * flagged dead, or when none of their nexthops is alive and compatible
 * with the flow's output interface.
 */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
                       struct fib_result *res, int prefixlen)
{
        struct fib_alias *fa;
        int nh_sel = 0;

        list_for_each_entry(fa, head, fa_list) {
                int err;

                if (fa->fa_tos &&
                    fa->fa_tos != flp->fl4_tos)
                        continue;

                if (fa->fa_scope < flp->fl4_scope)
                        continue;

                /* Record that a lookup got as far as this alias. */
                fa->fa_state |= FA_S_ACCESSED;

                err = fib_props[fa->fa_type].error;
                if (err == 0) {
                        struct fib_info *fi = fa->fa_info;

                        if (fi->fib_flags & RTNH_F_DEAD)
                                continue;

                        switch (fa->fa_type) {
                        case RTN_UNICAST:
                        case RTN_LOCAL:
                        case RTN_BROADCAST:
                        case RTN_ANYCAST:
                        case RTN_MULTICAST:
                                /*
                                 * for_nexthops() opens an extra brace that
                                 * stays open until endfor_nexthops() below,
                                 * so "nhsel" is still in scope after the
                                 * loop body.  The "continue" inside the body
                                 * advances the nexthop loop; "break" leaves
                                 * it with nhsel at the chosen nexthop.
                                 */
                                for_nexthops(fi) {
                                        if (nh->nh_flags&RTNH_F_DEAD)
                                                continue;
                                        if (!flp->oif || flp->oif == nh->nh_oif)
                                                break;
                                }
                                /* nhsel below the limit means the loop hit "break". */
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                                if (nhsel < fi->fib_nhs) {
                                        nh_sel = nhsel;
                                        goto out_fill_res;
                                }
#else
                                if (nhsel < 1) {
                                        goto out_fill_res;
                                }
#endif
                                endfor_nexthops(fi);
                                /* No usable nexthop: try the next alias. */
                                continue;

                        default:
                                printk(KERN_DEBUG "impossible 102\n");
                                return -EINVAL;
                        };
                }
                return err;
        }
        return 1;

out_fill_res:
        /* "fa" still points at the alias that was selected in the loop. */
        res->prefixlen = prefixlen;
        res->nh_sel = nh_sel;
        res->type = fa->fa_type;
        res->scope = fa->fa_scope;
        res->fi = fa->fa_info;
        atomic_inc(&res->fi->fib_clntref);
        return 0;
}
831
832 /* Find appropriate source address to this destination */
833
834 u32 __fib_res_prefsrc(struct fib_result *res)
835 {
836         return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
837 }
838
/*
 * Append to @skb a netlink route message describing one route.
 *
 * @pid, @seq, @event   values for the netlink message header
 * @tb_id               routing table id
 * @type, @scope, @tos  route type, scope and TOS
 * @dst, @dst_len       destination prefix (4 bytes) and its length in bits;
 *                      the prefix attribute is omitted when @dst_len is 0
 * @fi                  nexthop/metrics record of the route
 *
 * Returns skb->len on success.  On failure the skb is trimmed back to the
 * length it had on entry and -1 is returned.
 *
 * NOTE(review): NLMSG_PUT() and RTA_PUT() presumably jump to the
 * nlmsg_failure / rtattr_failure labels when the skb has no room (the
 * labels are not otherwise referenced for those calls) - confirm against
 * the macro definitions.
 */
int
fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
              u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
              struct fib_info *fi)
{
        struct rtmsg *rtm;
        struct nlmsghdr  *nlh;
        /* Remember where the message starts so it can be undone on error. */
        unsigned char    *b = skb->tail;

        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
        rtm = NLMSG_DATA(nlh);
        rtm->rtm_family = AF_INET;
        rtm->rtm_dst_len = dst_len;
        rtm->rtm_src_len = 0;
        rtm->rtm_tos = tos;
        rtm->rtm_table = tb_id;
        rtm->rtm_type = type;
        rtm->rtm_flags = fi->fib_flags;
        rtm->rtm_scope = scope;
        if (rtm->rtm_dst_len)
                RTA_PUT(skb, RTA_DST, 4, dst);
        rtm->rtm_protocol = fi->fib_protocol;
        if (fi->fib_priority)
                RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
#ifdef CONFIG_NET_CLS_ROUTE
        /* Only the first nexthop's traffic class is reported. */
        if (fi->fib_nh[0].nh_tclassid)
                RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
#endif
        if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
                goto rtattr_failure;
        if (fi->fib_prefsrc)
                RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
        /* A single nexthop is reported as plain gateway/oif attributes. */
        if (fi->fib_nhs == 1) {
                if (fi->fib_nh->nh_gw)
                        RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
                if (fi->fib_nh->nh_oif)
                        RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
        }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        /* Several nexthops are packed into one RTA_MULTIPATH attribute. */
        if (fi->fib_nhs > 1) {
                struct rtnexthop *nhp;
                struct rtattr *mp_head;
                if (skb_tailroom(skb) <= RTA_SPACE(0))
                        goto rtattr_failure;
                /* Reserve the attribute header; type and length are set last. */
                mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));

                for_nexthops(fi) {
                        if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
                                goto rtattr_failure;
                        nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
                        nhp->rtnh_flags = nh->nh_flags & 0xFF;
                        nhp->rtnh_hops = nh->nh_weight-1;
                        nhp->rtnh_ifindex = nh->nh_oif;
                        if (nh->nh_gw)
                                RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
                        /* Record length covers the header plus what was appended. */
                        nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
                } endfor_nexthops(fi);
                mp_head->rta_type = RTA_MULTIPATH;
                mp_head->rta_len = skb->tail - (u8*)mp_head;
        }
#endif
        nlh->nlmsg_len = skb->tail - b;
        return skb->len;

nlmsg_failure:
rtattr_failure:
        /* Drop everything this call appended. */
        skb_trim(skb, b - skb->data);
        return -1;
}
908
909 #ifndef CONFIG_IP_NOSIOCRT
910
911 int
912 fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
913                     struct kern_rta *rta, struct rtentry *r)
914 {
915         int    plen;
916         u32    *ptr;
917
918         memset(rtm, 0, sizeof(*rtm));
919         memset(rta, 0, sizeof(*rta));
920
921         if (r->rt_dst.sa_family != AF_INET)
922                 return -EAFNOSUPPORT;
923
924         /* Check mask for validity:
925            a) it must be contiguous.
926            b) destination must have all host bits clear.
927            c) if application forgot to set correct family (AF_INET),
928               reject request unless it is absolutely clear i.e.
929               both family and mask are zero.
930          */
931         plen = 32;
932         ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
933         if (!(r->rt_flags&RTF_HOST)) {
934                 u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
935                 if (r->rt_genmask.sa_family != AF_INET) {
936                         if (mask || r->rt_genmask.sa_family)
937                                 return -EAFNOSUPPORT;
938                 }
939                 if (bad_mask(mask, *ptr))
940                         return -EINVAL;
941                 plen = inet_mask_len(mask);
942         }
943
944         nl->nlmsg_flags = NLM_F_REQUEST;
945         nl->nlmsg_pid = 0;
946         nl->nlmsg_seq = 0;
947         nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
948         if (cmd == SIOCDELRT) {
949                 nl->nlmsg_type = RTM_DELROUTE;
950                 nl->nlmsg_flags = 0;
951         } else {
952                 nl->nlmsg_type = RTM_NEWROUTE;
953                 nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
954                 rtm->rtm_protocol = RTPROT_BOOT;
955         }
956
957         rtm->rtm_dst_len = plen;
958         rta->rta_dst = ptr;
959
960         if (r->rt_metric) {
961                 *(u32*)&r->rt_pad3 = r->rt_metric - 1;
962                 rta->rta_priority = (u32*)&r->rt_pad3;
963         }
964         if (r->rt_flags&RTF_REJECT) {
965                 rtm->rtm_scope = RT_SCOPE_HOST;
966                 rtm->rtm_type = RTN_UNREACHABLE;
967                 return 0;
968         }
969         rtm->rtm_scope = RT_SCOPE_NOWHERE;
970         rtm->rtm_type = RTN_UNICAST;
971
972         if (r->rt_dev) {
973                 char *colon;
974                 struct net_device *dev;
975                 char   devname[IFNAMSIZ];
976
977                 if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
978                         return -EFAULT;
979                 devname[IFNAMSIZ-1] = 0;
980                 colon = strchr(devname, ':');
981                 if (colon)
982                         *colon = 0;
983                 dev = __dev_get_by_name(devname);
984                 if (!dev)
985                         return -ENODEV;
986                 rta->rta_oif = &dev->ifindex;
987                 if (colon) {
988                         struct in_ifaddr *ifa;
989                         struct in_device *in_dev = __in_dev_get(dev);
990                         if (!in_dev)
991                                 return -ENODEV;
992                         *colon = ':';
993                         for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
994                                 if (strcmp(ifa->ifa_label, devname) == 0)
995                                         break;
996                         if (ifa == NULL)
997                                 return -ENODEV;
998                         rta->rta_prefsrc = &ifa->ifa_local;
999                 }
1000         }
1001
1002         ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
1003         if (r->rt_gateway.sa_family == AF_INET && *ptr) {
1004                 rta->rta_gw = ptr;
1005                 if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
1006                         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1007         }
1008
1009         if (cmd == SIOCDELRT)
1010                 return 0;
1011
1012         if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
1013                 return -EINVAL;
1014
1015         if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
1016                 rtm->rtm_scope = RT_SCOPE_LINK;
1017
1018         if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
1019                 struct rtattr *rec;
1020                 struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
1021                 if (mx == NULL)
1022                         return -ENOMEM;
1023                 rta->rta_mx = mx;
1024                 mx->rta_type = RTA_METRICS;
1025                 mx->rta_len  = RTA_LENGTH(0);
1026                 if (r->rt_flags&RTF_MTU) {
1027                         rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1028                         rec->rta_type = RTAX_ADVMSS;
1029                         rec->rta_len = RTA_LENGTH(4);
1030                         mx->rta_len += RTA_LENGTH(4);
1031                         *(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
1032                 }
1033                 if (r->rt_flags&RTF_WINDOW) {
1034                         rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1035                         rec->rta_type = RTAX_WINDOW;
1036                         rec->rta_len = RTA_LENGTH(4);
1037                         mx->rta_len += RTA_LENGTH(4);
1038                         *(u32*)RTA_DATA(rec) = r->rt_window;
1039                 }
1040                 if (r->rt_flags&RTF_IRTT) {
1041                         rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1042                         rec->rta_type = RTAX_RTT;
1043                         rec->rta_len = RTA_LENGTH(4);
1044                         mx->rta_len += RTA_LENGTH(4);
1045                         *(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
1046                 }
1047         }
1048         return 0;
1049 }
1050
1051 #endif
1052
1053 /*
1054    Update FIB if:
1055    - local address disappeared -> we must delete all the entries
1056      referring to it.
1057    - device went down -> we must shutdown all nexthops going via it.
1058  */
1059
1060 int fib_sync_down(u32 local, struct net_device *dev, int force)
1061 {
1062         int ret = 0;
1063         int scope = RT_SCOPE_NOWHERE;
1064         
1065         if (force)
1066                 scope = -1;
1067
1068         if (local && fib_info_laddrhash) {
1069                 unsigned int hash = fib_laddr_hashfn(local);
1070                 struct hlist_head *head = &fib_info_laddrhash[hash];
1071                 struct hlist_node *node;
1072                 struct fib_info *fi;
1073
1074                 hlist_for_each_entry(fi, node, head, fib_lhash) {
1075                         if (fi->fib_prefsrc == local) {
1076                                 fi->fib_flags |= RTNH_F_DEAD;
1077                                 ret++;
1078                         }
1079                 }
1080         }
1081
1082         if (dev) {
1083                 struct fib_info *prev_fi = NULL;
1084                 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1085                 struct hlist_head *head = &fib_info_devhash[hash];
1086                 struct hlist_node *node;
1087                 struct fib_nh *nh;
1088
1089                 hlist_for_each_entry(nh, node, head, nh_hash) {
1090                         struct fib_info *fi = nh->nh_parent;
1091                         int dead;
1092
1093                         BUG_ON(!fi->fib_nhs);
1094                         if (nh->nh_dev != dev || fi == prev_fi)
1095                                 continue;
1096                         prev_fi = fi;
1097                         dead = 0;
1098                         change_nexthops(fi) {
1099                                 if (nh->nh_flags&RTNH_F_DEAD)
1100                                         dead++;
1101                                 else if (nh->nh_dev == dev &&
1102                                          nh->nh_scope != scope) {
1103                                         nh->nh_flags |= RTNH_F_DEAD;
1104 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1105                                         spin_lock_bh(&fib_multipath_lock);
1106                                         fi->fib_power -= nh->nh_power;
1107                                         nh->nh_power = 0;
1108                                         spin_unlock_bh(&fib_multipath_lock);
1109 #endif
1110                                         dead++;
1111                                 }
1112 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1113                                 if (force > 1 && nh->nh_dev == dev) {
1114                                         dead = fi->fib_nhs;
1115                                         break;
1116                                 }
1117 #endif
1118                         } endfor_nexthops(fi)
1119                         if (dead == fi->fib_nhs) {
1120                                 fi->fib_flags |= RTNH_F_DEAD;
1121                                 ret++;
1122                         }
1123                 }
1124         }
1125
1126         return ret;
1127 }
1128
1129 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1130
1131 /*
1132    Dead device goes up. We wake up dead nexthops.
1133    It takes sense only on multipath routes.
1134  */
1135
1136 int fib_sync_up(struct net_device *dev)
1137 {
1138         struct fib_info *prev_fi;
1139         unsigned int hash;
1140         struct hlist_head *head;
1141         struct hlist_node *node;
1142         struct fib_nh *nh;
1143         int ret;
1144
1145         if (!(dev->flags&IFF_UP))
1146                 return 0;
1147
1148         prev_fi = NULL;
1149         hash = fib_devindex_hashfn(dev->ifindex);
1150         head = &fib_info_devhash[hash];
1151         ret = 0;
1152
1153         hlist_for_each_entry(nh, node, head, nh_hash) {
1154                 struct fib_info *fi = nh->nh_parent;
1155                 int alive;
1156
1157                 BUG_ON(!fi->fib_nhs);
1158                 if (nh->nh_dev != dev || fi == prev_fi)
1159                         continue;
1160
1161                 prev_fi = fi;
1162                 alive = 0;
1163                 change_nexthops(fi) {
1164                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1165                                 alive++;
1166                                 continue;
1167                         }
1168                         if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1169                                 continue;
1170                         if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
1171                                 continue;
1172                         alive++;
1173                         spin_lock_bh(&fib_multipath_lock);
1174                         nh->nh_power = 0;
1175                         nh->nh_flags &= ~RTNH_F_DEAD;
1176                         spin_unlock_bh(&fib_multipath_lock);
1177                 } endfor_nexthops(fi)
1178
1179                 if (alive > 0) {
1180                         fi->fib_flags &= ~RTNH_F_DEAD;
1181                         ret++;
1182                 }
1183         }
1184
1185         return ret;
1186 }
1187
1188 /*
1189    The algorithm is suboptimal, but it provides really
1190    fair weighted route distribution.
1191  */
1192
1193 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1194 {
1195         struct fib_info *fi = res->fi;
1196         int w;
1197
1198         spin_lock_bh(&fib_multipath_lock);
1199         if (fi->fib_power <= 0) {
1200                 int power = 0;
1201                 change_nexthops(fi) {
1202                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1203                                 power += nh->nh_weight;
1204                                 nh->nh_power = nh->nh_weight;
1205                         }
1206                 } endfor_nexthops(fi);
1207                 fi->fib_power = power;
1208                 if (power <= 0) {
1209                         spin_unlock_bh(&fib_multipath_lock);
1210                         /* Race condition: route has just become dead. */
1211                         res->nh_sel = 0;
1212                         return;
1213                 }
1214         }
1215
1216
1217         /* w should be random number [0..fi->fib_power-1],
1218            it is pretty bad approximation.
1219          */
1220
1221         w = jiffies % fi->fib_power;
1222
1223         change_nexthops(fi) {
1224                 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1225                         if ((w -= nh->nh_power) <= 0) {
1226                                 nh->nh_power--;
1227                                 fi->fib_power--;
1228                                 res->nh_sel = nhsel;
1229                                 spin_unlock_bh(&fib_multipath_lock);
1230                                 return;
1231                         }
1232                 }
1233         } endfor_nexthops(fi);
1234
1235         /* Race condition: route has just become dead. */
1236         res->nh_sel = 0;
1237         spin_unlock_bh(&fib_multipath_lock);
1238 }
1239 #endif