vserver 1.9.3
[linux-2.6.git] / net / ipv4 / fib_hash.c
index 3b4f8e3..25535ff 100644 (file)
 #include <net/sock.h>
 #include <net/ip_fib.h>
 
-#define FTprint(a...)
-/*
-   printk(KERN_DEBUG a)
- */
-
-static kmem_cache_t * fn_hash_kmem;
-
-/*
-   These bizarre types are just to force strict type checking.
-   When I reversed order of bytes and changed to natural mask lengths,
-   I forgot to make fixes in several places. Now I am lazy to return
-   it back.
- */
+#include "fib_lookup.h"
 
-typedef struct {
-       u32     datum;
-} fn_key_t;
+static kmem_cache_t *fn_hash_kmem;
+static kmem_cache_t *fn_alias_kmem;
 
-typedef struct {
-       u32     datum;
-} fn_hash_idx_t;
-
-struct fib_node
-{
-       struct fib_node         *fn_next;
-       struct fib_info         *fn_info;
-#define FIB_INFO(f)    ((f)->fn_info)
-       fn_key_t                fn_key;
-       u8                      fn_tos;
-       u8                      fn_type;
-       u8                      fn_scope;
-       u8                      fn_state;
+struct fib_node {
+       struct hlist_node       fn_hash;
+       struct list_head        fn_alias;
+       u32                     fn_key;
 };
 
-#define FN_S_ZOMBIE    1
-#define FN_S_ACCESSED  2
-
-static int fib_hash_zombies;
-
-struct fn_zone
-{
-       struct fn_zone  *fz_next;       /* Next not empty zone  */
-       struct fib_node **fz_hash;      /* Hash table pointer   */
-       int             fz_nent;        /* Number of entries    */
+struct fn_zone {
+       struct fn_zone          *fz_next;       /* Next not empty zone  */
+       struct hlist_head       *fz_hash;       /* Hash table pointer   */
+       int                     fz_nent;        /* Number of entries    */
 
-       int             fz_divisor;     /* Hash divisor         */
-       u32             fz_hashmask;    /* (fz_divisor - 1)     */
-#define FZ_HASHMASK(fz)        ((fz)->fz_hashmask)
+       int                     fz_divisor;     /* Hash divisor         */
+       u32                     fz_hashmask;    /* (fz_divisor - 1)     */
+#define FZ_HASHMASK(fz)                ((fz)->fz_hashmask)
 
-       int             fz_order;       /* Zone order           */
-       u32             fz_mask;
-#define FZ_MASK(fz)    ((fz)->fz_mask)
+       int                     fz_order;       /* Zone order           */
+       u32                     fz_mask;
+#define FZ_MASK(fz)            ((fz)->fz_mask)
 };
 
 /* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
  can be cheaper than memory lookup, so that FZ_* macros are used.
* can be cheaper than memory lookup, so that FZ_* macros are used.
  */
 
-struct fn_hash
-{
+struct fn_hash {
        struct fn_zone  *fn_zones[33];
        struct fn_zone  *fn_zone_list;
 };
 
-static __inline__ fn_hash_idx_t fn_hash(fn_key_t key, struct fn_zone *fz)
+static inline u32 fn_hash(u32 key, struct fn_zone *fz)
 {
-       u32 h = ntohl(key.datum)>>(32 - fz->fz_order);
+       u32 h = ntohl(key)>>(32 - fz->fz_order);
        h ^= (h>>20);
        h ^= (h>>10);
        h ^= (h>>5);
        h &= FZ_HASHMASK(fz);
-       return *(fn_hash_idx_t*)&h;
-}
-
-#define fz_key_0(key)          ((key).datum = 0)
-#define fz_prefix(key,fz)      ((key).datum)
-
-static __inline__ fn_key_t fz_key(u32 dst, struct fn_zone *fz)
-{
-       fn_key_t k;
-       k.datum = dst & FZ_MASK(fz);
-       return k;
-}
-
-static __inline__ struct fib_node ** fz_chain_p(fn_key_t key, struct fn_zone *fz)
-{
-       return &fz->fz_hash[fn_hash(key, fz).datum];
-}
-
-static __inline__ struct fib_node * fz_chain(fn_key_t key, struct fn_zone *fz)
-{
-       return fz->fz_hash[fn_hash(key, fz).datum];
+       return h;
 }
 
-static __inline__ int fn_key_eq(fn_key_t a, fn_key_t b)
+static inline u32 fz_key(u32 dst, struct fn_zone *fz)
 {
-       return a.datum == b.datum;
-}
-
-static __inline__ int fn_key_leq(fn_key_t a, fn_key_t b)
-{
-       return a.datum <= b.datum;
+       return dst & FZ_MASK(fz);
 }
 
 static rwlock_t fib_hash_lock = RW_LOCK_UNLOCKED;
 
-#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct fib_node *))
+#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
 
-static struct fib_node **fz_hash_alloc(int divisor)
+static struct hlist_head *fz_hash_alloc(int divisor)
 {
-       unsigned long size = divisor * sizeof(struct fib_node *);
+       unsigned long size = divisor * sizeof(struct hlist_head);
 
-       if (divisor <= 1024) {
+       if (size <= PAGE_SIZE) {
                return kmalloc(size, GFP_KERNEL);
        } else {
-               return (struct fib_node **)
+               return (struct hlist_head *)
                        __get_free_pages(GFP_KERNEL, get_order(size));
        }
 }
 
 /* The fib hash lock must be held when this is called. */
-static __inline__ void fn_rebuild_zone(struct fn_zone *fz,
-                                      struct fib_node **old_ht,
-                                      int old_divisor)
+static inline void fn_rebuild_zone(struct fn_zone *fz,
+                                  struct hlist_head *old_ht,
+                                  int old_divisor)
 {
        int i;
-       struct fib_node *f, **fp, *next;
-
-       for (i=0; i<old_divisor; i++) {
-               for (f=old_ht[i]; f; f=next) {
-                       next = f->fn_next;
-                       for (fp = fz_chain_p(f->fn_key, fz);
-                            *fp && fn_key_leq((*fp)->fn_key, f->fn_key);
-                            fp = &(*fp)->fn_next)
-                               /* NONE */;
-                       f->fn_next = *fp;
-                       *fp = f;
+
+       for (i = 0; i < old_divisor; i++) {
+               struct hlist_node *node, *n;
+               struct fib_node *f;
+
+               hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
+                       struct hlist_head *new_head;
+
+                       hlist_del(&f->fn_hash);
+
+                       new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
+                       hlist_add_head(&f->fn_hash, new_head);
                }
        }
 }
 
-static void fz_hash_free(struct fib_node **hash, int divisor)
+static void fz_hash_free(struct hlist_head *hash, int divisor)
 {
-       if (divisor <= 1024)
+       unsigned long size = divisor * sizeof(struct hlist_head);
+
+       if (size <= PAGE_SIZE)
                kfree(hash);
        else
-               free_pages((unsigned long) hash,
-                          get_order(divisor * sizeof(struct fib_node *)));
+               free_pages((unsigned long)hash, get_order(size));
 }
 
 static void fn_rehash_zone(struct fn_zone *fz)
 {
-       struct fib_node **ht, **old_ht;
+       struct hlist_head *ht, *old_ht;
        int old_divisor, new_divisor;
        u32 new_hashmask;
                
@@ -226,7 +173,7 @@ static void fn_rehash_zone(struct fn_zone *fz)
        ht = fz_hash_alloc(new_divisor);
 
        if (ht) {
-               memset(ht, 0, new_divisor*sizeof(struct fib_node*));
+               memset(ht, 0, new_divisor * sizeof(struct hlist_head));
 
                write_lock_bh(&fib_hash_lock);
                old_ht = fz->fz_hash;
@@ -240,12 +187,16 @@ static void fn_rehash_zone(struct fn_zone *fz)
        }
 }
 
-static void fn_free_node(struct fib_node * f)
+static inline void fn_free_node(struct fib_node * f)
 {
-       fib_release_info(FIB_INFO(f));
        kmem_cache_free(fn_hash_kmem, f);
 }
 
+static inline void fn_free_alias(struct fib_alias *fa)
+{
+       fib_release_info(fa->fa_info);
+       kmem_cache_free(fn_alias_kmem, fa);
+}
 
 static struct fn_zone *
 fn_new_zone(struct fn_hash *table, int z)
@@ -267,7 +218,7 @@ fn_new_zone(struct fn_hash *table, int z)
                kfree(fz);
                return NULL;
        }
-       memset(fz->fz_hash, 0, fz->fz_divisor*sizeof(struct fib_node*));
+       memset(fz->fz_hash, 0, fz->fz_divisor * sizeof(struct hlist_head *));
        fz->fz_order = z;
        fz->fz_mask = inet_make_mask(z);
 
@@ -298,35 +249,20 @@ fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
 
        read_lock(&fib_hash_lock);
        for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
+               struct hlist_head *head;
+               struct hlist_node *node;
                struct fib_node *f;
-               fn_key_t k = fz_key(flp->fl4_dst, fz);
+               u32 k = fz_key(flp->fl4_dst, fz);
 
-               for (f = fz_chain(k, fz); f; f = f->fn_next) {
-                       if (!fn_key_eq(k, f->fn_key)) {
-                               if (fn_key_leq(k, f->fn_key))
-                                       break;
-                               else
-                                       continue;
-                       }
-#ifdef CONFIG_IP_ROUTE_TOS
-                       if (f->fn_tos && f->fn_tos != flp->fl4_tos)
-                               continue;
-#endif
-                       f->fn_state |= FN_S_ACCESSED;
-
-                       if (f->fn_state&FN_S_ZOMBIE)
-                               continue;
-                       if (f->fn_scope < flp->fl4_scope)
+               head = &fz->fz_hash[fn_hash(k, fz)];
+               hlist_for_each_entry(f, node, head, fn_hash) {
+                       if (f->fn_key != k)
                                continue;
 
-                       err = fib_semantic_match(f->fn_type, FIB_INFO(f), flp, res);
-                       if (err == 0) {
-                               res->type = f->fn_type;
-                               res->scope = f->fn_scope;
-                               res->prefixlen = fz->fz_order;
-                               goto out;
-                       }
-                       if (err < 0)
+                       err = fib_semantic_match(&f->fn_alias,
+                                                flp, res,
+                                                fz->fz_order);
+                       if (err <= 0)
                                goto out;
                }
        }
@@ -365,6 +301,7 @@ static void
 fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
 {
        int order, last_idx;
+       struct hlist_node *node;
        struct fib_node *f;
        struct fib_info *fi = NULL;
        struct fib_info *last_resort;
@@ -379,36 +316,41 @@ fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
        order = -1;
 
        read_lock(&fib_hash_lock);
-       for (f = fz->fz_hash[0]; f; f = f->fn_next) {
-               struct fib_info *next_fi = FIB_INFO(f);
+       hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
+               struct fib_alias *fa;
 
-               if ((f->fn_state&FN_S_ZOMBIE) ||
-                   f->fn_scope != res->scope ||
-                   f->fn_type != RTN_UNICAST)
-                       continue;
+               list_for_each_entry(fa, &f->fn_alias, fa_list) {
+                       struct fib_info *next_fi = fa->fa_info;
 
-               if (next_fi->fib_priority > res->fi->fib_priority)
-                       break;
-               if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
-                       continue;
-               f->fn_state |= FN_S_ACCESSED;
+                       if (fa->fa_scope != res->scope ||
+                           fa->fa_type != RTN_UNICAST)
+                               continue;
 
-               if (fi == NULL) {
-                       if (next_fi != res->fi)
+                       if (next_fi->fib_priority > res->fi->fib_priority)
                                break;
-               } else if (!fib_detect_death(fi, order, &last_resort, &last_idx)) {
-                       if (res->fi)
-                               fib_info_put(res->fi);
-                       res->fi = fi;
-                       atomic_inc(&fi->fib_clntref);
-                       fn_hash_last_dflt = order;
-                       goto out;
+                       if (!next_fi->fib_nh[0].nh_gw ||
+                           next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+                               continue;
+                       fa->fa_state |= FA_S_ACCESSED;
+
+                       if (fi == NULL) {
+                               if (next_fi != res->fi)
+                                       break;
+                       } else if (!fib_detect_death(fi, order, &last_resort,
+                                                    &last_idx)) {
+                               if (res->fi)
+                                       fib_info_put(res->fi);
+                               res->fi = fi;
+                               atomic_inc(&fi->fib_clntref);
+                               fn_hash_last_dflt = order;
+                               goto out;
+                       }
+                       fi = next_fi;
+                       order++;
                }
-               fi = next_fi;
-               order++;
        }
 
-       if (order<=0 || fi==NULL) {
+       if (order <= 0 || fi == NULL) {
                fn_hash_last_dflt = -1;
                goto out;
        }
@@ -434,52 +376,76 @@ out:
        read_unlock(&fib_hash_lock);
 }
 
-#define FIB_SCAN(f, fp) \
-for ( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next)
+static void rtmsg_fib(int, struct fib_node *, struct fib_alias *,
+                     int, int,
+                     struct nlmsghdr *n,
+                     struct netlink_skb_parms *);
 
-#define FIB_SCAN_KEY(f, fp, key) \
-for ( ; ((f) = *(fp)) != NULL && fn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next)
+/* Insert node F to FZ. */
+static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
+{
+       struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
 
-#ifndef CONFIG_IP_ROUTE_TOS
-#define FIB_SCAN_TOS(f, fp, key, tos) FIB_SCAN_KEY(f, fp, key)
-#else
-#define FIB_SCAN_TOS(f, fp, key, tos) \
-for ( ; ((f) = *(fp)) != NULL && fn_key_eq((f)->fn_key, (key)) && \
-     (f)->fn_tos == (tos) ; (fp) = &(f)->fn_next)
-#endif
+       hlist_add_head(&f->fn_hash, head);
+}
 
+/* Return the node in FZ matching KEY. */
+static struct fib_node *fib_find_node(struct fn_zone *fz, u32 key)
+{
+       struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)];
+       struct hlist_node *node;
+       struct fib_node *f;
 
-static void rtmsg_fib(int, struct fib_node*, int, int,
-                     struct nlmsghdr *n,
-                     struct netlink_skb_parms *);
+       hlist_for_each_entry(f, node, head, fn_hash) {
+               if (f->fn_key == key)
+                       return f;
+       }
+
+       return NULL;
+}
+
+/* Return the first fib alias matching TOS with
+ * priority less than or equal to PRIO.
+ */
+static struct fib_alias *fib_find_alias(struct fib_node *fn, u8 tos, u32 prio)
+{
+       if (fn) {
+               struct list_head *head = &fn->fn_alias;
+               struct fib_alias *fa;
+
+               list_for_each_entry(fa, head, fa_list) {
+                       if (fa->fa_tos > tos)
+                               continue;
+                       if (fa->fa_info->fib_priority >= prio ||
+                           fa->fa_tos < tos)
+                               return fa;
+               }
+       }
+       return NULL;
+}
 
 static int
 fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
-               struct nlmsghdr *n, struct netlink_skb_parms *req)
+              struct nlmsghdr *n, struct netlink_skb_parms *req)
 {
-       struct fn_hash *table = (struct fn_hash*)tb->tb_data;
-       struct fib_node *new_f, *f, **fp, **del_fp;
+       struct fn_hash *table = (struct fn_hash *) tb->tb_data;
+       struct fib_node *new_f, *f;
+       struct fib_alias *fa, *new_fa;
        struct fn_zone *fz;
        struct fib_info *fi;
-
        int z = r->rtm_dst_len;
        int type = r->rtm_type;
-#ifdef CONFIG_IP_ROUTE_TOS
        u8 tos = r->rtm_tos;
-#endif
-       fn_key_t key;
+       u32 key;
        int err;
 
-FTprint("tb(%d)_insert: %d %08x/%d %d %08x\n", tb->tb_id, r->rtm_type, rta->rta_dst ?
-*(u32*)rta->rta_dst : 0, z, rta->rta_oif ? *rta->rta_oif : -1,
-rta->rta_prefsrc ? *(u32*)rta->rta_prefsrc : 0);
        if (z > 32)
                return -EINVAL;
        fz = table->fn_zones[z];
        if (!fz && !(fz = fn_new_zone(table, z)))
                return -ENOBUFS;
 
-       fz_key_0(key);
+       key = 0;
        if (rta->rta_dst) {
                u32 dst;
                memcpy(&dst, rta->rta_dst, 4);
@@ -496,136 +462,114 @@ rta->rta_prefsrc ? *(u32*)rta->rta_prefsrc : 0);
            (z==32 || (1<<z) > fz->fz_divisor))
                fn_rehash_zone(fz);
 
-       fp = fz_chain_p(key, fz);
-
-
-       /*
-        * Scan list to find the first route with the same destination
+       f = fib_find_node(fz, key);
+       fa = fib_find_alias(f, tos, fi->fib_priority);
+
+       /* Now fa, if non-NULL, points to the first fib alias
+        * with the same keys [prefix,tos,priority], if such key already
+        * exists or to the node before which we will insert new one.
+        *
+        * If fa is NULL, we will need to allocate a new one and
+        * insert to the head of f.
+        *
+        * If f is NULL, no fib node matched the destination key
+        * and we need to allocate a new one of those as well.
         */
-       FIB_SCAN(f, fp) {
-               if (fn_key_leq(key,f->fn_key))
-                       break;
-       }
 
-#ifdef CONFIG_IP_ROUTE_TOS
-       /*
-        * Find route with the same destination and tos.
-        */
-       FIB_SCAN_KEY(f, fp, key) {
-               if (f->fn_tos <= tos)
-                       break;
-       }
-#endif
-
-       del_fp = NULL;
-
-       if (f && (f->fn_state&FN_S_ZOMBIE) &&
-#ifdef CONFIG_IP_ROUTE_TOS
-           f->fn_tos == tos &&
-#endif
-           fn_key_eq(f->fn_key, key)) {
-               del_fp = fp;
-               fp = &f->fn_next;
-               f = *fp;
-               goto create;
-       }
-
-       FIB_SCAN_TOS(f, fp, key, tos) {
-               if (fi->fib_priority <= FIB_INFO(f)->fib_priority)
-                       break;
-       }
-
-       /* Now f==*fp points to the first node with the same
-          keys [prefix,tos,priority], if such key already
-          exists or to the node, before which we will insert new one.
-        */
-
-       if (f && 
-#ifdef CONFIG_IP_ROUTE_TOS
-           f->fn_tos == tos &&
-#endif
-           fn_key_eq(f->fn_key, key) &&
-           fi->fib_priority == FIB_INFO(f)->fib_priority) {
-               struct fib_node **ins_fp;
+       if (fa && fa->fa_tos == tos &&
+           fa->fa_info->fib_priority == fi->fib_priority) {
+               struct fib_alias *fa_orig;
 
                err = -EEXIST;
-               if (n->nlmsg_flags&NLM_F_EXCL)
+               if (n->nlmsg_flags & NLM_F_EXCL)
                        goto out;
 
-               if (n->nlmsg_flags&NLM_F_REPLACE) {
-                       del_fp = fp;
-                       fp = &f->fn_next;
-                       f = *fp;
-                       goto replace;
-               }
+               if (n->nlmsg_flags & NLM_F_REPLACE) {
+                       struct fib_info *fi_drop;
+                       u8 state;
 
-               ins_fp = fp;
-               err = -EEXIST;
+                       write_lock_bh(&fib_hash_lock);
+                       fi_drop = fa->fa_info;
+                       fa->fa_info = fi;
+                       fa->fa_type = type;
+                       fa->fa_scope = r->rtm_scope;
+                       state = fa->fa_state;
+                       fa->fa_state &= ~FA_S_ACCESSED;
+                       write_unlock_bh(&fib_hash_lock);
 
-               FIB_SCAN_TOS(f, fp, key, tos) {
-                       if (fi->fib_priority != FIB_INFO(f)->fib_priority)
-                               break;
-                       if (f->fn_type == type && f->fn_scope == r->rtm_scope
-                           && FIB_INFO(f) == fi)
-                               goto out;
+                       fib_release_info(fi_drop);
+                       if (state & FA_S_ACCESSED)
+                               rt_cache_flush(-1);
+                       return 0;
                }
 
-               if (!(n->nlmsg_flags&NLM_F_APPEND)) {
-                       fp = ins_fp;
-                       f = *fp;
+               /* Error if we find a perfect match which
+                * uses the same scope, type, and nexthop
+                * information.
+                */
+               fa_orig = fa;
+               fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+               list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
+                       if (fa->fa_tos != tos)
+                               break;
+                       if (fa->fa_info->fib_priority != fi->fib_priority)
+                               break;
+                       if (fa->fa_type == type &&
+                           fa->fa_scope == r->rtm_scope &&
+                           fa->fa_info == fi)
+                               goto out;
                }
+               if (!(n->nlmsg_flags & NLM_F_APPEND))
+                       fa = fa_orig;
        }
 
-create:
        err = -ENOENT;
        if (!(n->nlmsg_flags&NLM_F_CREATE))
                goto out;
 
-replace:
        err = -ENOBUFS;
-       new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL);
-       if (new_f == NULL)
+       new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+       if (new_fa == NULL)
                goto out;
 
-       memset(new_f, 0, sizeof(struct fib_node));
+       new_f = NULL;
+       if (!f) {
+               new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL);
+               if (new_f == NULL)
+                       goto out_free_new_fa;
 
-       new_f->fn_key = key;
-#ifdef CONFIG_IP_ROUTE_TOS
-       new_f->fn_tos = tos;
-#endif
-       new_f->fn_type = type;
-       new_f->fn_scope = r->rtm_scope;
-       FIB_INFO(new_f) = fi;
+               INIT_HLIST_NODE(&new_f->fn_hash);
+               INIT_LIST_HEAD(&new_f->fn_alias);
+               new_f->fn_key = key;
+               f = new_f;
+       }
+
+       new_fa->fa_info = fi;
+       new_fa->fa_tos = tos;
+       new_fa->fa_type = type;
+       new_fa->fa_scope = r->rtm_scope;
+       new_fa->fa_state = 0;
 
        /*
         * Insert new entry to the list.
         */
 
-       new_f->fn_next = f;
        write_lock_bh(&fib_hash_lock);
-       *fp = new_f;
+       if (new_f)
+               fib_insert_node(fz, new_f);
+       list_add_tail(&new_fa->fa_list,
+                (fa ? &fa->fa_list : &f->fn_alias));
        write_unlock_bh(&fib_hash_lock);
-       fz->fz_nent++;
 
-       if (del_fp) {
-               f = *del_fp;
-               /* Unlink replaced node */
-               write_lock_bh(&fib_hash_lock);
-               *del_fp = f->fn_next;
-               write_unlock_bh(&fib_hash_lock);
+       if (new_f)
+               fz->fz_nent++;
+       rt_cache_flush(-1);
 
-               if (!(f->fn_state&FN_S_ZOMBIE))
-                       rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req);
-               if (f->fn_state&FN_S_ACCESSED)
-                       rt_cache_flush(-1);
-               fn_free_node(f);
-               fz->fz_nent--;
-       } else {
-               rt_cache_flush(-1);
-       }
-       rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->tb_id, n, req);
+       rtmsg_fib(RTM_NEWROUTE, f, new_fa, z, tb->tb_id, n, req);
        return 0;
 
+out_free_new_fa:
+       kmem_cache_free(fn_alias_kmem, new_fa);
 out:
        fib_release_info(fi);
        return err;
@@ -634,26 +578,22 @@ out:
 
 static int
 fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
-               struct nlmsghdr *n, struct netlink_skb_parms *req)
+              struct nlmsghdr *n, struct netlink_skb_parms *req)
 {
        struct fn_hash *table = (struct fn_hash*)tb->tb_data;
-       struct fib_node **fp, **del_fp, *f;
+       struct fib_node *f;
+       struct fib_alias *fa, *fa_to_delete;
        int z = r->rtm_dst_len;
        struct fn_zone *fz;
-       fn_key_t key;
-       int matched;
-#ifdef CONFIG_IP_ROUTE_TOS
+       u32 key;
        u8 tos = r->rtm_tos;
-#endif
 
-FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ?
-       *(u32*)rta->rta_dst : 0, z, rta->rta_oif ? *rta->rta_oif : -1);
        if (z > 32)
                return -EINVAL;
        if ((fz  = table->fn_zones[z]) == NULL)
                return -ESRCH;
 
-       fz_key_0(key);
+       key = 0;
        if (rta->rta_dst) {
                u32 dst;
                memcpy(&dst, rta->rta_dst, 4);
@@ -662,62 +602,52 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ?
                key = fz_key(dst, fz);
        }
 
-       fp = fz_chain_p(key, fz);
+       f = fib_find_node(fz, key);
+       fa = fib_find_alias(f, tos, 0);
+       if (!fa)
+               return -ESRCH;
 
+       fa_to_delete = NULL;
+       fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+       list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
+               struct fib_info *fi = fa->fa_info;
 
-       FIB_SCAN(f, fp) {
-               if (fn_key_eq(f->fn_key, key))
+               if (fa->fa_tos != tos)
                        break;
-               if (fn_key_leq(key, f->fn_key)) {
-                       return -ESRCH;
-               }
-       }
-#ifdef CONFIG_IP_ROUTE_TOS
-       FIB_SCAN_KEY(f, fp, key) {
-               if (f->fn_tos == tos)
-                       break;
-       }
-#endif
-
-       matched = 0;
-       del_fp = NULL;
-       FIB_SCAN_TOS(f, fp, key, tos) {
-               struct fib_info * fi = FIB_INFO(f);
 
-               if (f->fn_state&FN_S_ZOMBIE) {
-                       return -ESRCH;
+               if ((!r->rtm_type ||
+                    fa->fa_type == r->rtm_type) &&
+                   (r->rtm_scope == RT_SCOPE_NOWHERE ||
+                    fa->fa_scope == r->rtm_scope) &&
+                   (!r->rtm_protocol ||
+                    fi->fib_protocol == r->rtm_protocol) &&
+                   fib_nh_match(r, n, rta, fi) == 0) {
+                       fa_to_delete = fa;
+                       break;
                }
-               matched++;
-
-               if (del_fp == NULL &&
-                   (!r->rtm_type || f->fn_type == r->rtm_type) &&
-                   (r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) &&
-                   (!r->rtm_protocol || fi->fib_protocol == r->rtm_protocol) &&
-                   fib_nh_match(r, n, rta, fi) == 0)
-                       del_fp = fp;
        }
 
-       if (del_fp) {
-               f = *del_fp;
-               rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req);
+       if (fa_to_delete) {
+               int kill_fn;
 
-               if (matched != 1) {
-                       write_lock_bh(&fib_hash_lock);
-                       *del_fp = f->fn_next;
-                       write_unlock_bh(&fib_hash_lock);
+               fa = fa_to_delete;
+               rtmsg_fib(RTM_DELROUTE, f, fa, z, tb->tb_id, n, req);
 
-                       if (f->fn_state&FN_S_ACCESSED)
-                               rt_cache_flush(-1);
+               kill_fn = 0;
+               write_lock_bh(&fib_hash_lock);
+               list_del(&fa->fa_list);
+               if (list_empty(&f->fn_alias)) {
+                       hlist_del(&f->fn_hash);
+                       kill_fn = 1;
+               }
+               write_unlock_bh(&fib_hash_lock);
+
+               if (fa->fa_state & FA_S_ACCESSED)
+                       rt_cache_flush(-1);
+               fn_free_alias(fa);
+               if (kill_fn) {
                        fn_free_node(f);
                        fz->fz_nent--;
-               } else {
-                       f->fn_state |= FN_S_ZOMBIE;
-                       if (f->fn_state&FN_S_ACCESSED) {
-                               f->fn_state &= ~FN_S_ACCESSED;
-                               rt_cache_flush(-1);
-                       }
-                       if (++fib_hash_zombies > 128)
-                               fib_flush();
                }
 
                return 0;
@@ -725,74 +655,99 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ?
        return -ESRCH;
 }
 
-static __inline__ int
-fn_flush_list(struct fib_node ** fp, int z, struct fn_hash *table)
+static int fn_flush_list(struct fn_zone *fz, int idx)
 {
-       int found = 0;
+       struct hlist_head *head = &fz->fz_hash[idx];
+       struct hlist_node *node, *n;
        struct fib_node *f;
+       int found = 0;
 
-       while ((f = *fp) != NULL) {
-               struct fib_info *fi = FIB_INFO(f);
-
-               if (fi && ((f->fn_state&FN_S_ZOMBIE) || (fi->fib_flags&RTNH_F_DEAD))) {
-                       write_lock_bh(&fib_hash_lock);
-                       *fp = f->fn_next;
-                       write_unlock_bh(&fib_hash_lock);
-
+       hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
+               struct fib_alias *fa, *fa_node;
+               int kill_f;
+
+               kill_f = 0;
+               list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
+                       struct fib_info *fi = fa->fa_info;
+
+                       if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
+                               write_lock_bh(&fib_hash_lock);
+                               list_del(&fa->fa_list);
+                               if (list_empty(&f->fn_alias)) {
+                                       hlist_del(&f->fn_hash);
+                                       kill_f = 1;
+                               }
+                               write_unlock_bh(&fib_hash_lock);
+
+                               fn_free_alias(fa);
+                               found++;
+                       }
+               }
+               if (kill_f) {
                        fn_free_node(f);
-                       found++;
-                       continue;
+                       fz->fz_nent--;
                }
-               fp = &f->fn_next;
        }
        return found;
 }
 
 static int fn_hash_flush(struct fib_table *tb)
 {
-       struct fn_hash *table = (struct fn_hash*)tb->tb_data;
+       struct fn_hash *table = (struct fn_hash *) tb->tb_data;
        struct fn_zone *fz;
        int found = 0;
 
-       fib_hash_zombies = 0;
        for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
                int i;
-               int tmp = 0;
-               for (i=fz->fz_divisor-1; i>=0; i--)
-                       tmp += fn_flush_list(&fz->fz_hash[i], fz->fz_order, table);
-               fz->fz_nent -= tmp;
-               found += tmp;
+
+               for (i = fz->fz_divisor - 1; i >= 0; i--)
+                       found += fn_flush_list(fz, i);
        }
        return found;
 }
 
 
-static __inline__ int
+static inline int
 fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
                     struct fib_table *tb,
                     struct fn_zone *fz,
-                    struct fib_node *f)
+                    struct hlist_head *head)
 {
+       struct hlist_node *node;
+       struct fib_node *f;
        int i, s_i;
 
        s_i = cb->args[3];
-       for (i=0; f; i++, f=f->fn_next) {
-               if (i < s_i) continue;
-               if (f->fn_state&FN_S_ZOMBIE) continue;
-               if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
-                                 RTM_NEWROUTE,
-                                 tb->tb_id, (f->fn_state&FN_S_ZOMBIE) ? 0 : f->fn_type, f->fn_scope,
-                                 &f->fn_key, fz->fz_order, f->fn_tos,
-                                 f->fn_info) < 0) {
-                       cb->args[3] = i;
-                       return -1;
+       i = 0;
+       hlist_for_each_entry(f, node, head, fn_hash) {
+               struct fib_alias *fa;
+
+               list_for_each_entry(fa, &f->fn_alias, fa_list) {
+                       if (i < s_i)
+                               continue;
+
+                       if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+                                         cb->nlh->nlmsg_seq,
+                                         RTM_NEWROUTE,
+                                         tb->tb_id,
+                                         fa->fa_type,
+                                         fa->fa_scope,
+                                         &f->fn_key,
+                                         fz->fz_order,
+                                         fa->fa_tos,
+                                         fa->fa_info) < 0) {
+                               cb->args[3] = i;
+                               return -1;
+                       }
+
+                       i++;
                }
        }
        cb->args[3] = i;
        return skb->len;
 }
 
-static __inline__ int
+static inline int
 fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
                   struct fib_table *tb,
                   struct fn_zone *fz)
@@ -803,10 +758,12 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
        for (h=0; h < fz->fz_divisor; h++) {
                if (h < s_h) continue;
                if (h > s_h)
-                       memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0]));
-               if (fz->fz_hash == NULL || fz->fz_hash[h] == NULL)
+                       memset(&cb->args[3], 0,
+                              sizeof(cb->args) - 3*sizeof(cb->args[0]));
+               if (fz->fz_hash == NULL ||
+                   hlist_empty(&fz->fz_hash[h]))
                        continue;
-               if (fn_hash_dump_bucket(skb, cb, tb, fz, fz->fz_hash[h]) < 0) {
+               if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h])<0) {
                        cb->args[2] = h;
                        return -1;
                }
@@ -826,7 +783,8 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
        for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
                if (m < s_m) continue;
                if (m > s_m)
-                       memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0]));
+                       memset(&cb->args[2], 0,
+                              sizeof(cb->args) - 2*sizeof(cb->args[0]));
                if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
                        cb->args[1] = m;
                        read_unlock(&fib_hash_lock);
@@ -838,7 +796,8 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
        return skb->len;
 }
 
-static void rtmsg_fib(int event, struct fib_node* f, int z, int tb_id,
+static void rtmsg_fib(int event, struct fib_node *f, struct fib_alias *fa,
+                     int z, int tb_id,
                      struct nlmsghdr *n, struct netlink_skb_parms *req)
 {
        struct sk_buff *skb;
@@ -850,8 +809,9 @@ static void rtmsg_fib(int event, struct fib_node* f, int z, int tb_id,
                return;
 
        if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
-                         f->fn_type, f->fn_scope, &f->fn_key, z, f->fn_tos,
-                         FIB_INFO(f)) < 0) {
+                         fa->fa_type, fa->fa_scope, &f->fn_key, z,
+                         fa->fa_tos,
+                         fa->fa_info) < 0) {
                kfree_skb(skb);
                return;
        }
@@ -877,7 +837,14 @@ struct fib_table * __init fib_hash_init(int id)
                                                 0, SLAB_HWCACHE_ALIGN,
                                                 NULL, NULL);
 
-       tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), GFP_KERNEL);
+       if (fn_alias_kmem == NULL)
+               fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+                                                 sizeof(struct fib_alias),
+                                                 0, SLAB_HWCACHE_ALIGN,
+                                                 NULL, NULL);
+
+       tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
+                    GFP_KERNEL);
        if (tb == NULL)
                return NULL;
 
@@ -898,64 +865,105 @@ struct fib_table * __init fib_hash_init(int id)
 struct fib_iter_state {
        struct fn_zone  *zone;
        int             bucket;
-       struct fib_node **hash;
-       struct fib_node *node;
+       struct hlist_head *hash_head;
+       struct fib_node *fn;
+       struct fib_alias *fa;
 };
 
-static __inline__ struct fib_node *fib_get_first(struct seq_file *seq)
+static struct fib_alias *fib_get_first(struct seq_file *seq)
 {
-       struct fib_iter_stateiter = seq->private;
-       struct fn_hash *table = (struct fn_hash *)ip_fib_main_table->tb_data;
+       struct fib_iter_state *iter = seq->private;
+       struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data;
 
-       iter->bucket = 0;
-       iter->hash   = NULL;
-       iter->node   = NULL;
+       iter->bucket    = 0;
+       iter->hash_head = NULL;
+       iter->fn        = NULL;
+       iter->fa        = NULL;
 
        for (iter->zone = table->fn_zone_list; iter->zone;
             iter->zone = iter->zone->fz_next) {
                int maxslot;
 
-               if (!iter->zone->fz_next)
+               if (!iter->zone->fz_nent)
                        continue;
 
-               iter->hash = iter->zone->fz_hash;
+               iter->hash_head = iter->zone->fz_hash;
                maxslot = iter->zone->fz_divisor;
 
                for (iter->bucket = 0; iter->bucket < maxslot;
-                    ++iter->bucket, ++iter->hash) {
-                       iter->node = *iter->hash;
-
-                       if (iter->node)
-                               goto out;
+                    ++iter->bucket, ++iter->hash_head) {
+                       struct hlist_node *node;
+                       struct fib_node *fn;
+
+                       hlist_for_each_entry(fn,node,iter->hash_head,fn_hash) {
+                               struct fib_alias *fa;
+
+                               list_for_each_entry(fa,&fn->fn_alias,fa_list) {
+                                       iter->fn = fn;
+                                       iter->fa = fa;
+                                       goto out;
+                               }
+                       }
                }
        }
 out:
-       return iter->node;
+       return iter->fa;
 }
 
-static __inline__ struct fib_node *fib_get_next(struct seq_file *seq)
+static struct fib_alias *fib_get_next(struct seq_file *seq)
 {
-       struct fib_iter_state* iter = seq->private;
+       struct fib_iter_state *iter = seq->private;
+       struct fib_node *fn;
+       struct fib_alias *fa;
+
+       /* Advance FA, if any. */
+       fn = iter->fn;
+       fa = iter->fa;
+       if (fa) {
+               BUG_ON(!fn);
+               list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
+                       iter->fa = fa;
+                       goto out;
+               }
+       }
 
-       if (iter->node)
-               iter->node = iter->node->fn_next;
+       fa = iter->fa = NULL;
 
-       if (iter->node)
-               goto out;
+       /* Advance FN. */
+       if (fn) {
+               struct hlist_node *node = &fn->fn_hash;
+               hlist_for_each_entry_continue(fn, node, fn_hash) {
+                       iter->fn = fn;
 
+                       list_for_each_entry(fa, &fn->fn_alias, fa_list) {
+                               iter->fa = fa;
+                               goto out;
+                       }
+               }
+       }
+
+       fn = iter->fn = NULL;
+
+       /* Advance hash chain. */
        if (!iter->zone)
                goto out;
 
        for (;;) {
+               struct hlist_node *node;
                int maxslot;
 
                maxslot = iter->zone->fz_divisor;
 
                while (++iter->bucket < maxslot) {
-                       iter->node = *++iter->hash;
-
-                       if (iter->node)
-                               goto out;
+                       iter->hash_head++;
+
+                       hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
+                               list_for_each_entry(fa, &fn->fn_alias, fa_list) {
+                                       iter->fn = fn;
+                                       iter->fa = fa;
+                                       goto out;
+                               }
+                       }
                }
 
                iter->zone = iter->zone->fz_next;
@@ -963,14 +971,19 @@ static __inline__ struct fib_node *fib_get_next(struct seq_file *seq)
                if (!iter->zone)
                        goto out;
                
-               iter->hash = iter->zone->fz_hash;
                iter->bucket = 0;
-               iter->node = *iter->hash;
-               if (iter->node)
-                       break;
+               iter->hash_head = iter->zone->fz_hash;
+
+               hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
+                       list_for_each_entry(fa, &fn->fn_alias, fa_list) {
+                               iter->fn = fn;
+                               iter->fa = fa;
+                               goto out;
+                       }
+               }
        }
 out:
-       return iter->node;
+       return fa;
 }
 
 static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
@@ -994,7 +1007,7 @@ static void fib_seq_stop(struct seq_file *seq, void *v)
        read_unlock(&fib_hash_lock);
 }
 
-static unsigned fib_flag_trans(int type, int dead, u32 mask, struct fib_info *fi)
+static unsigned fib_flag_trans(int type, u32 mask, struct fib_info *fi)
 {
        static unsigned type2flags[RTN_MAX + 1] = {
                [7] = RTF_REJECT, [8] = RTF_REJECT,
@@ -1005,8 +1018,7 @@ static unsigned fib_flag_trans(int type, int dead, u32 mask, struct fib_info *fi
                flags |= RTF_GATEWAY;
        if (mask == 0xFFFFFFFF)
                flags |= RTF_HOST;
-       if (!dead)
-               flags |= RTF_UP;
+       flags |= RTF_UP;
        return flags;
 }
 
@@ -1020,11 +1032,12 @@ extern int dev_in_nx_info(struct net_device *, struct nx_info *);
  */
 static int fib_seq_show(struct seq_file *seq, void *v)
 {
-       struct fib_iter_stateiter;
+       struct fib_iter_state *iter;
        char bf[128];
        u32 prefix, mask;
        unsigned flags;
        struct fib_node *f;
+       struct fib_alias *fa;
        struct fib_info *fi;
 
        if (v == SEQ_START_TOKEN) {
@@ -1034,14 +1047,15 @@ static int fib_seq_show(struct seq_file *seq, void *v)
                goto out;
        }
 
-       f       = v;
-       fi      = FIB_INFO(f);
        iter    = seq->private;
-       prefix  = fz_prefix(f->fn_key, iter->zone);
+       f       = iter->fn;
+       fa      = iter->fa;
+       fi      = fa->fa_info;
+       prefix  = f->fn_key;
        mask    = FZ_MASK(iter->zone);
-       flags   = fib_flag_trans(f->fn_type, f->fn_state & FN_S_ZOMBIE,
-                                mask, fi);
-       if (fi && dev_in_nx_info(fi->fib_dev, current->nx_info))
+       flags   = fib_flag_trans(fa->fa_type, mask, fi);
+       if (fi && (!vx_flags(VXF_HIDE_NETIF, 0) ||
+               dev_in_nx_info(fi->fib_dev, current->nx_info)))
                snprintf(bf, sizeof(bf),
                         "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
                         fi->fib_dev ? fi->fib_dev->name : "*", prefix,