vserver 1.9.5.x5
[linux-2.6.git] / net / ipv4 / netfilter / ip_conntrack_core.c
index 00a89f4..28d9425 100644 (file)
@@ -34,8 +34,9 @@
 #include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/jhash.h>
-/* For ERR_PTR().  Yeah, I know... --RR */
-#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/moduleparam.h>
 
 /* This rwlock protects the main hash table, protocol/helper/expected
    registrations, conntrack timers*/
 #endif
 
 DECLARE_RWLOCK(ip_conntrack_lock);
-DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
+
+/* ip_conntrack_standalone needs this */
+atomic_t ip_conntrack_count = ATOMIC_INIT(0);
 
 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
 LIST_HEAD(ip_conntrack_expect_list);
-LIST_HEAD(protocol_list);
+struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
 static LIST_HEAD(helpers);
 unsigned int ip_conntrack_htable_size = 0;
 int ip_conntrack_max;
-static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
 struct list_head *ip_conntrack_hash;
 static kmem_cache_t *ip_conntrack_cachep;
+static kmem_cache_t *ip_conntrack_expect_cachep;
 struct ip_conntrack ip_conntrack_untracked;
+unsigned int ip_ct_log_invalid;
+static LIST_HEAD(unconfirmed);
+static int ip_conntrack_vmalloc;
 
-extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
-
-static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
-                             u_int8_t protocol)
-{
-       return protocol == curr->proto;
-}
-
-struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
-{
-       struct ip_conntrack_protocol *p;
-
-       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-       p = LIST_FIND(&protocol_list, proto_cmpfn,
-                     struct ip_conntrack_protocol *, protocol);
-       if (!p)
-               p = &ip_conntrack_generic_protocol;
+DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
 
-       return p;
-}
-
-struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
-{
-       struct ip_conntrack_protocol *p;
-
-       READ_LOCK(&ip_conntrack_lock);
-       p = __ip_ct_find_proto(protocol);
-       READ_UNLOCK(&ip_conntrack_lock);
-       return p;
-}
-
-inline void 
+void 
 ip_conntrack_put(struct ip_conntrack *ct)
 {
        IP_NF_ASSERT(ct);
-       IP_NF_ASSERT(ct->infos[0].master);
-       /* nf_conntrack_put wants to go via an info struct, so feed it
-           one at random. */
-       nf_conntrack_put(&ct->infos[0]);
+       nf_conntrack_put(&ct->ct_general);
 }
 
 static int ip_conntrack_hash_rnd_initted;
@@ -127,11 +101,11 @@ hash_conntrack(const struct ip_conntrack_tuple *tuple)
 }
 
 int
-get_tuple(const struct iphdr *iph,
-         const struct sk_buff *skb,
-         unsigned int dataoff,
-         struct ip_conntrack_tuple *tuple,
-         const struct ip_conntrack_protocol *protocol)
+ip_ct_get_tuple(const struct iphdr *iph,
+               const struct sk_buff *skb,
+               unsigned int dataoff,
+               struct ip_conntrack_tuple *tuple,
+               const struct ip_conntrack_protocol *protocol)
 {
        /* Never happen */
        if (iph->frag_off & htons(IP_OFFSET)) {
@@ -143,146 +117,89 @@ get_tuple(const struct iphdr *iph,
        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;
+       tuple->dst.dir = IP_CT_DIR_ORIGINAL;
 
        return protocol->pkt_to_tuple(skb, dataoff, tuple);
 }
 
-static int
-invert_tuple(struct ip_conntrack_tuple *inverse,
-            const struct ip_conntrack_tuple *orig,
-            const struct ip_conntrack_protocol *protocol)
+int
+ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
+                  const struct ip_conntrack_tuple *orig,
+                  const struct ip_conntrack_protocol *protocol)
 {
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;
+       inverse->dst.dir = !orig->dst.dir;
 
        return protocol->invert_tuple(inverse, orig);
 }
 
 
 /* ip_conntrack_expect helper functions */
-
-/* Compare tuple parts depending on mask. */
-static inline int expect_cmp(const struct ip_conntrack_expect *i,
-                            const struct ip_conntrack_tuple *tuple)
-{
-       MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
-       return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
-}
-
-static void
-destroy_expect(struct ip_conntrack_expect *exp)
+static void destroy_expect(struct ip_conntrack_expect *exp)
 {
-       DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
-       IP_NF_ASSERT(atomic_read(&exp->use) == 0);
+       ip_conntrack_put(exp->master);
        IP_NF_ASSERT(!timer_pending(&exp->timeout));
-
-       kfree(exp);
+       kmem_cache_free(ip_conntrack_expect_cachep, exp);
+       CONNTRACK_STAT_INC(expect_delete);
 }
 
-inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
+static void unlink_expect(struct ip_conntrack_expect *exp)
 {
-       IP_NF_ASSERT(exp);
-
-       if (atomic_dec_and_test(&exp->use)) {
-               /* usage count dropped to zero */
-               destroy_expect(exp);
-       }
-}
-
-static inline struct ip_conntrack_expect *
-__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
-{
-       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-       MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
-       return LIST_FIND(&ip_conntrack_expect_list, expect_cmp, 
-                        struct ip_conntrack_expect *, tuple);
-}
-
-/* Find a expectation corresponding to a tuple. */
-struct ip_conntrack_expect *
-ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
-{
-       struct ip_conntrack_expect *exp;
-
-       READ_LOCK(&ip_conntrack_lock);
-       READ_LOCK(&ip_conntrack_expect_tuple_lock);
-       exp = __ip_ct_expect_find(tuple);
-       if (exp)
-               atomic_inc(&exp->use);
-       READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
-       READ_UNLOCK(&ip_conntrack_lock);
-
-       return exp;
+       MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+       list_del(&exp->list);
+       /* Logically in destroy_expect, but we hold the lock here. */
+       exp->master->expecting--;
 }
 
-/* remove one specific expectation from all lists and drop refcount,
- * does _NOT_ delete the timer. */
-static void __unexpect_related(struct ip_conntrack_expect *expect)
+static void expectation_timed_out(unsigned long ul_expect)
 {
-       DEBUGP("unexpect_related(%p)\n", expect);
-       MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
-
-       /* we're not allowed to unexpect a confirmed expectation! */
-       IP_NF_ASSERT(!expect->sibling);
+       struct ip_conntrack_expect *exp = (void *)ul_expect;
 
-       /* delete from global and local lists */
-       list_del(&expect->list);
-       list_del(&expect->expected_list);
-
-       /* decrement expect-count of master conntrack */
-       if (expect->expectant)
-               expect->expectant->expecting--;
-
-       ip_conntrack_expect_put(expect);
+       WRITE_LOCK(&ip_conntrack_lock);
+       unlink_expect(exp);
+       WRITE_UNLOCK(&ip_conntrack_lock);
+       destroy_expect(exp);
 }
 
-/* remove one specific expecatation from all lists, drop refcount
- * and expire timer. 
- * This function can _NOT_ be called for confirmed expects! */
-static void unexpect_related(struct ip_conntrack_expect *expect)
+/* If an expectation for this connection is found, it gets deleted from
+ * the global list, then returned. */
+static struct ip_conntrack_expect *
+find_expectation(const struct ip_conntrack_tuple *tuple)
 {
-       IP_NF_ASSERT(expect->expectant);
-       IP_NF_ASSERT(expect->expectant->helper);
-       /* if we are supposed to have a timer, but we can't delete
-        * it: race condition.  __unexpect_related will
-        * be calledd by timeout function */
-       if (expect->expectant->helper->timeout
-           && !del_timer(&expect->timeout))
-               return;
-
-       __unexpect_related(expect);
+       struct ip_conntrack_expect *i;
+
+       list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+               /* If master is not in hash table yet (ie. packet hasn't left
+                  this machine yet), how can other end know about expected?
+                  Hence these are not the droids you are looking for (if
+                  master ct never got confirmed, we'd hold a reference to it
+                  and weird things would happen to future packets). */
+               if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
+                   && is_confirmed(i->master)
+                   && del_timer(&i->timeout)) {
+                       unlink_expect(i);
+                       return i;
+               }
+       }
+       return NULL;
 }
 
-/* delete all unconfirmed expectations for this conntrack */
-static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
+/* delete all expectations for this conntrack */
+static void remove_expectations(struct ip_conntrack *ct)
 {
-       struct list_head *exp_entry, *next;
-       struct ip_conntrack_expect *exp;
-
-       DEBUGP("remove_expectations(%p)\n", ct);
+       struct ip_conntrack_expect *i, *tmp;
 
-       list_for_each_safe(exp_entry, next, &ct->sibling_list) {
-               exp = list_entry(exp_entry, struct ip_conntrack_expect,
-                                expected_list);
+       /* Optimization: most connections never expect any others. */
+       if (ct->expecting == 0)
+               return;
 
-               /* we skip established expectations, as we want to delete
-                * the un-established ones only */
-               if (exp->sibling) {
-                       DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
-                       if (drop_refcount) {
-                               /* Indicate that this expectations parent is dead */
-                               ip_conntrack_put(exp->expectant);
-                               exp->expectant = NULL;
-                       }
-                       continue;
+       list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
+               if (i->master == ct && del_timer(&i->timeout)) {
+                       unlink_expect(i);
+                       destroy_expect(i);
                }
-
-               IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
-               IP_NF_ASSERT(exp->expectant == ct);
-
-               /* delete expectation from global and private lists */
-               unexpect_related(exp);
        }
 }
 
@@ -299,14 +216,14 @@ clean_from_lists(struct ip_conntrack *ct)
        LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
        LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
 
-       /* Destroy all un-established, pending expectations */
-       remove_expectations(ct, 1);
+       /* Destroy all pending expectations */
+       remove_expectations(ct);
 }
 
 static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
-       struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
+       struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
        struct ip_conntrack_protocol *proto;
 
        DEBUGP("destroy_conntrack(%p)\n", ct);
@@ -324,24 +241,23 @@ destroy_conntrack(struct nf_conntrack *nfct)
                ip_conntrack_destroyed(ct);
 
        WRITE_LOCK(&ip_conntrack_lock);
-       /* Make sure don't leave any orphaned expectations lying around */
-       if (ct->expecting)
-               remove_expectations(ct, 1);
-
-       /* Delete our master expectation */
-       if (ct->master) {
-               if (ct->master->expectant) {
-                       /* can't call __unexpect_related here,
-                        * since it would screw up expect_list */
-                       list_del(&ct->master->expected_list);
-                       master = ct->master->expectant;
-               }
-               kfree(ct->master);
+       /* Expectations will have been removed in clean_from_lists,
+        * except TFTP can create an expectation on the first packet,
+        * before connection is in the list, so we need to clean here,
+        * too. */
+       remove_expectations(ct);
+
+       /* We overload first tuple to link into unconfirmed list. */
+       if (!is_confirmed(ct)) {
+               BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
+               list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
        }
+
+       CONNTRACK_STAT_INC(delete);
        WRITE_UNLOCK(&ip_conntrack_lock);
 
-       if (master)
-               ip_conntrack_put(master);
+       if (ct->master)
+               ip_conntrack_put(ct->master);
 
        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        kmem_cache_free(ip_conntrack_cachep, ct);
@@ -353,6 +269,9 @@ static void death_by_timeout(unsigned long ul_conntrack)
        struct ip_conntrack *ct = (void *)ul_conntrack;
 
        WRITE_LOCK(&ip_conntrack_lock);
+       /* Inside lock so preempt is disabled on module removal path.
+        * Otherwise we can get spurious warnings. */
+       CONNTRACK_STAT_INC(delete_list);
        clean_from_lists(ct);
        WRITE_UNLOCK(&ip_conntrack_lock);
        ip_conntrack_put(ct);
@@ -364,7 +283,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
                    const struct ip_conntrack *ignored_conntrack)
 {
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-       return i->ctrack != ignored_conntrack
+       return tuplehash_to_ctrack(i) != ignored_conntrack
                && ip_ct_tuple_equal(tuple, &i->tuple);
 }
 
@@ -376,11 +295,15 @@ __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
        unsigned int hash = hash_conntrack(tuple);
 
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-       h = LIST_FIND(&ip_conntrack_hash[hash],
-                     conntrack_tuple_cmp,
-                     struct ip_conntrack_tuple_hash *,
-                     tuple, ignored_conntrack);
-       return h;
+       list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
+               if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
+                       CONNTRACK_STAT_INC(found);
+                       return h;
+               }
+               CONNTRACK_STAT_INC(searched);
+       }
+
+       return NULL;
 }
 
 /* Find a connection corresponding to a tuple. */
@@ -393,42 +316,21 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
-               atomic_inc(&h->ctrack->ct_general.use);
+               atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);
 
        return h;
 }
 
-static inline struct ip_conntrack *
-__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
-{
-       struct ip_conntrack *ct
-               = (struct ip_conntrack *)nfct->master;
-
-       /* ctinfo is the index of the nfct inside the conntrack */
-       *ctinfo = nfct - ct->infos;
-       IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
-       return ct;
-}
-
-/* Return conntrack and conntrack_info given skb->nfct->master */
-struct ip_conntrack *
-ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
-{
-       if (skb->nfct) 
-               return __ip_conntrack_get(skb->nfct, ctinfo);
-       return NULL;
-}
-
-/* Confirm a connection given skb->nfct; places it in hash table */
+/* Confirm a connection given skb; places it in hash table */
 int
-__ip_conntrack_confirm(struct nf_ct_info *nfct)
+__ip_conntrack_confirm(struct sk_buff **pskb)
 {
        unsigned int hash, repl_hash;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
 
-       ct = __ip_conntrack_get(nfct, &ctinfo);
+       ct = ip_conntrack_get(*pskb, &ctinfo);
 
        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in other direction.  Actual packet
@@ -451,6 +353,7 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct)
        DEBUGP("Confirming conntrack %p\n", ct);
 
        WRITE_LOCK(&ip_conntrack_lock);
+
        /* See if there's one in the list already, including reverse:
            NAT could have grabbed it without realizing, since we're
            not in the hash.  If there is, we lost race. */
@@ -462,6 +365,9 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct)
                          conntrack_tuple_cmp,
                          struct ip_conntrack_tuple_hash *,
                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
+               /* Remove from unconfirmed list */
+               list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+
                list_prepend(&ip_conntrack_hash[hash],
                             &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
                list_prepend(&ip_conntrack_hash[repl_hash],
@@ -473,11 +379,14 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct)
                add_timer(&ct->timeout);
                atomic_inc(&ct->ct_general.use);
                set_bit(IPS_CONFIRMED_BIT, &ct->status);
+               CONNTRACK_STAT_INC(insert);
                WRITE_UNLOCK(&ip_conntrack_lock);
                return NF_ACCEPT;
        }
 
+       CONNTRACK_STAT_INC(insert_failed);
        WRITE_UNLOCK(&ip_conntrack_lock);
+
        return NF_DROP;
 }
 
@@ -496,110 +405,37 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
        return h != NULL;
 }
 
-/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
-struct ip_conntrack *
-icmp_error_track(struct sk_buff *skb,
-                enum ip_conntrack_info *ctinfo,
-                unsigned int hooknum)
-{
-       struct ip_conntrack_tuple innertuple, origtuple;
-       struct {
-               struct icmphdr icmp;
-               struct iphdr ip;
-       } inside;
-       struct ip_conntrack_protocol *innerproto;
-       struct ip_conntrack_tuple_hash *h;
-       int dataoff;
-
-       IP_NF_ASSERT(skb->nfct == NULL);
-
-       /* Not enough header? */
-       if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
-               return NULL;
-
-       if (inside.icmp.type != ICMP_DEST_UNREACH
-           && inside.icmp.type != ICMP_SOURCE_QUENCH
-           && inside.icmp.type != ICMP_TIME_EXCEEDED
-           && inside.icmp.type != ICMP_PARAMETERPROB
-           && inside.icmp.type != ICMP_REDIRECT)
-               return NULL;
-
-       /* Ignore ICMP's containing fragments (shouldn't happen) */
-       if (inside.ip.frag_off & htons(IP_OFFSET)) {
-               DEBUGP("icmp_error_track: fragment of proto %u\n",
-                      inside.ip.protocol);
-               return NULL;
-       }
-
-       innerproto = ip_ct_find_proto(inside.ip.protocol);
-       dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
-       /* Are they talking about one of our connections? */
-       if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
-               DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol);
-               return NULL;
-       }
-
-       /* Ordinarily, we'd expect the inverted tupleproto, but it's
-          been preserved inside the ICMP. */
-       if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
-               DEBUGP("icmp_error_track: Can't invert tuple\n");
-               return NULL;
-       }
-
-       *ctinfo = IP_CT_RELATED;
-
-       h = ip_conntrack_find_get(&innertuple, NULL);
-       if (!h) {
-               /* Locally generated ICMPs will match inverted if they
-                  haven't been SNAT'ed yet */
-               /* FIXME: NAT code has to handle half-done double NAT --RR */
-               if (hooknum == NF_IP_LOCAL_OUT)
-                       h = ip_conntrack_find_get(&origtuple, NULL);
-
-               if (!h) {
-                       DEBUGP("icmp_error_track: no match\n");
-                       return NULL;
-               }
-               /* Reverse direction from that found */
-               if (DIRECTION(h) != IP_CT_DIR_REPLY)
-                       *ctinfo += IP_CT_IS_REPLY;
-       } else {
-               if (DIRECTION(h) == IP_CT_DIR_REPLY)
-                       *ctinfo += IP_CT_IS_REPLY;
-       }
-
-       /* Update skb to refer to this connection */
-       skb->nfct = &h->ctrack->infos[*ctinfo];
-       return h->ctrack;
-}
-
 /* There's a small race here where we may free a just-assured
    connection.  Too bad: we're in trouble anyway. */
 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
 {
-       return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
+       return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
 }
 
 static int early_drop(struct list_head *chain)
 {
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
+       struct ip_conntrack *ct = NULL;
        int dropped = 0;
 
        READ_LOCK(&ip_conntrack_lock);
        h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
-       if (h)
-               atomic_inc(&h->ctrack->ct_general.use);
+       if (h) {
+               ct = tuplehash_to_ctrack(h);
+               atomic_inc(&ct->ct_general.use);
+       }
        READ_UNLOCK(&ip_conntrack_lock);
 
-       if (!h)
+       if (!ct)
                return dropped;
 
-       if (del_timer(&h->ctrack->timeout)) {
-               death_by_timeout((unsigned long)h->ctrack);
+       if (del_timer(&ct->timeout)) {
+               death_by_timeout((unsigned long)ct);
                dropped = 1;
+               CONNTRACK_STAT_INC(early_drop);
        }
-       ip_conntrack_put(h->ctrack);
+       ip_conntrack_put(ct);
        return dropped;
 }
 
@@ -609,7 +445,7 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i,
        return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
 }
 
-struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
 {
        return LIST_FIND(&helpers, helper_cmp,
                         struct ip_conntrack_helper *,
@@ -626,9 +462,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        size_t hash;
-       struct ip_conntrack_expect *expected;
-       int i;
-       static unsigned int drop_next;
+       struct ip_conntrack_expect *exp;
 
        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
@@ -637,15 +471,10 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 
        hash = hash_conntrack(tuple);
 
-       if (ip_conntrack_max &&
-           atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
-               /* Try dropping from random chain, or else from the
-                   chain about to put into (in case they're trying to
-                   bomb one hash chain). */
-               unsigned int next = (drop_next++)%ip_conntrack_htable_size;
-
-               if (!early_drop(&ip_conntrack_hash[next])
-                   && !early_drop(&ip_conntrack_hash[hash])) {
+       if (ip_conntrack_max
+           && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
+               /* Try dropping from this hash chain. */
+               if (!early_drop(&ip_conntrack_hash[hash])) {
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
@@ -654,7 +483,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
                }
        }
 
-       if (!invert_tuple(&repl_tuple, tuple, protocol)) {
+       if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }
@@ -669,12 +498,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
-       conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
-       conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
-       for (i=0; i < IP_CT_NUMBER; i++)
-               conntrack->infos[i].master = &conntrack->ct_general;
-
        if (!protocol->new(conntrack, skb)) {
                kmem_cache_free(ip_conntrack_cachep, conntrack);
                return NULL;
@@ -684,49 +508,38 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;
 
-       INIT_LIST_HEAD(&conntrack->sibling_list);
-
        WRITE_LOCK(&ip_conntrack_lock);
-       /* Need finding and deleting of expected ONLY if we win race */
-       READ_LOCK(&ip_conntrack_expect_tuple_lock);
-       expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
-                            struct ip_conntrack_expect *, tuple);
-       READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
-
-       /* If master is not in hash table yet (ie. packet hasn't left
-          this machine yet), how can other end know about expected?
-          Hence these are not the droids you are looking for (if
-          master ct never got confirmed, we'd hold a reference to it
-          and weird things would happen to future packets). */
-       if (expected && !is_confirmed(expected->expectant))
-               expected = NULL;
-
-       /* Look up the conntrack helper for master connections only */
-       if (!expected)
-               conntrack->helper = ip_ct_find_helper(&repl_tuple);
+       exp = find_expectation(tuple);
 
-       /* If the expectation is dying, then this is a loser. */
-       if (expected
-           && expected->expectant->helper->timeout
-           && ! del_timer(&expected->timeout))
-               expected = NULL;
-
-       if (expected) {
+       if (exp) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
-                       conntrack, expected);
+                       conntrack, exp);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
-               conntrack->master = expected;
-               expected->sibling = conntrack;
-               LIST_DELETE(&ip_conntrack_expect_list, expected);
-               expected->expectant->expecting--;
-               nf_conntrack_get(&master_ct(conntrack)->infos[0]);
+               conntrack->master = exp->master;
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
+               conntrack->mark = exp->master->mark;
+#endif
+               nf_conntrack_get(&conntrack->master->ct_general);
+               CONNTRACK_STAT_INC(expect_new);
+       } else {
+               conntrack->helper = ip_ct_find_helper(&repl_tuple);
+
+               CONNTRACK_STAT_INC(new);
        }
+
+       /* Overload tuple linked list to put us in unconfirmed list. */
+       list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
+
        atomic_inc(&ip_conntrack_count);
        WRITE_UNLOCK(&ip_conntrack_lock);
 
-       if (expected && expected->expectfn)
-               expected->expectfn(conntrack);
+       if (exp) {
+               if (exp->expectfn)
+                       exp->expectfn(conntrack, exp);
+               destroy_expect(exp);
+       }
+
        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
 }
 
@@ -740,10 +553,12 @@ resolve_normal_ct(struct sk_buff *skb,
 {
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;
+       struct ip_conntrack *ct;
 
        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
 
-       if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
+       if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
+                               &tuple,proto))
                return NULL;
 
        /* look for tuple match */
@@ -755,6 +570,7 @@ resolve_normal_ct(struct sk_buff *skb,
                if (IS_ERR(h))
                        return (void *)h;
        }
+       ct = tuplehash_to_ctrack(h);
 
        /* It exists; we have (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
@@ -763,23 +579,24 @@ resolve_normal_ct(struct sk_buff *skb,
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
-               if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
+               if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
-                              h->ctrack);
+                              ct);
                        *ctinfo = IP_CT_ESTABLISHED;
-               } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
+               } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
-                              h->ctrack);
+                              ct);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
-                              h->ctrack);
+                              ct);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
-       skb->nfct = &h->ctrack->infos[*ctinfo];
-       return h->ctrack;
+       skb->nfct = &ct->ct_general;
+       skb->nfctinfo = *ctinfo;
+       return ct;
 }
 
 /* Netfilter hook itself. */
@@ -795,6 +612,12 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
        int set_reply;
        int ret;
 
+       /* Previously seen (loopback or untracked)?  Ignore. */
+       if ((*pskb)->nfct) {
+               CONNTRACK_STAT_INC(ignore);
+               return NF_ACCEPT;
+       }
+
        /* Never happen */
        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit()) {
@@ -822,44 +645,42 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
        }
 #endif
 
-       /* Previously seen (loopback or untracked)?  Ignore. */
-       if ((*pskb)->nfct)
-               return NF_ACCEPT;
-
        proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
 
-       /* It may be an icmp error... */
-       if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP 
-           && icmp_error_track(*pskb, &ctinfo, hooknum))
-               return NF_ACCEPT;
+       /* It may be a special packet, error, unclean...
+        * inverse of the return code tells to the netfilter
+        * core what to do with the packet. */
+       if (proto->error != NULL 
+           && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
+               CONNTRACK_STAT_INC(error);
+               CONNTRACK_STAT_INC(invalid);
+               return -ret;
+       }
 
-       if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
+       if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
                /* Not valid part of a connection */
+               CONNTRACK_STAT_INC(invalid);
                return NF_ACCEPT;
+       }
 
-       if (IS_ERR(ct))
+       if (IS_ERR(ct)) {
                /* Too stressed to deal. */
+               CONNTRACK_STAT_INC(drop);
                return NF_DROP;
+       }
 
        IP_NF_ASSERT((*pskb)->nfct);
 
        ret = proto->packet(ct, *pskb, ctinfo);
-       if (ret == -1) {
-               /* Invalid */
+       if (ret < 0) {
+               /* Invalid: inverse of the return code tells
+                * the netfilter core what to do */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
-               return NF_ACCEPT;
+               CONNTRACK_STAT_INC(invalid);
+               return -ret;
        }
 
-       if (ret != NF_DROP && ct->helper) {
-               ret = ct->helper->help(*pskb, ct, ctinfo);
-               if (ret == -1) {
-                       /* Invalid */
-                       nf_conntrack_put((*pskb)->nfct);
-                       (*pskb)->nfct = NULL;
-                       return NF_ACCEPT;
-               }
-       }
        if (set_reply)
                set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
 
@@ -869,248 +690,158 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
 {
-       return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
-}
-
-static inline int resent_expect(const struct ip_conntrack_expect *i,
-                               const struct ip_conntrack_tuple *tuple,
-                               const struct ip_conntrack_tuple *mask)
-{
-       DEBUGP("resent_expect\n");
-       DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
-       DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
-       DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
-       return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
-                || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
-               && ip_ct_tuple_equal(&i->mask, mask));
+       return ip_ct_invert_tuple(inverse, orig, 
+                                 ip_ct_find_proto(orig->dst.protonum));
 }
 
 /* Would two expected things clash? */
-static inline int expect_clash(const struct ip_conntrack_expect *i,
-                              const struct ip_conntrack_tuple *tuple,
-                              const struct ip_conntrack_tuple *mask)
+static inline int expect_clash(const struct ip_conntrack_expect *a,
+                              const struct ip_conntrack_expect *b)
 {
        /* Part covered by intersection of masks must be unequal,
            otherwise they clash */
        struct ip_conntrack_tuple intersect_mask
-               = { { i->mask.src.ip & mask->src.ip,
-                     { i->mask.src.u.all & mask->src.u.all } },
-                   { i->mask.dst.ip & mask->dst.ip,
-                     { i->mask.dst.u.all & mask->dst.u.all },
-                     i->mask.dst.protonum & mask->dst.protonum } };
+               = { { a->mask.src.ip & b->mask.src.ip,
+                     { a->mask.src.u.all & b->mask.src.u.all } },
+                   { a->mask.dst.ip & b->mask.dst.ip,
+                     { a->mask.dst.u.all & b->mask.dst.u.all },
+                     a->mask.dst.protonum & b->mask.dst.protonum } };
 
-       return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
+       return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
 }
 
-inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
+static inline int expect_matches(const struct ip_conntrack_expect *a,
+                                const struct ip_conntrack_expect *b)
 {
-       WRITE_LOCK(&ip_conntrack_lock);
-       unexpect_related(expect);
-       WRITE_UNLOCK(&ip_conntrack_lock);
+       return a->master == b->master
+               && ip_ct_tuple_equal(&a->tuple, &b->tuple)
+               && ip_ct_tuple_equal(&a->mask, &b->mask);
 }
-       
-static void expectation_timed_out(unsigned long ul_expect)
+
+/* Generally a bad idea to call this: could have matched already. */
+void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
 {
-       struct ip_conntrack_expect *expect = (void *) ul_expect;
+       struct ip_conntrack_expect *i;
 
-       DEBUGP("expectation %p timed out\n", expect);   
        WRITE_LOCK(&ip_conntrack_lock);
-       __unexpect_related(expect);
+       /* choose the oldest expectation to evict */
+       list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+               if (expect_matches(i, exp) && del_timer(&i->timeout)) {
+                       unlink_expect(i);
+                       WRITE_UNLOCK(&ip_conntrack_lock);
+                       destroy_expect(i);
+                       return;
+               }
+       }
        WRITE_UNLOCK(&ip_conntrack_lock);
 }
 
-struct ip_conntrack_expect *
-ip_conntrack_expect_alloc(void)
+struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
 {
        struct ip_conntrack_expect *new;
-       
-       new = (struct ip_conntrack_expect *)
-               kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
+
+       new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }
+       new->master = NULL;
+       return new;
+}
 
-       /* tuple_cmp compares whole union, we have to initialized cleanly */
-       memset(new, 0, sizeof(struct ip_conntrack_expect));
+void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
+{
+       kmem_cache_free(ip_conntrack_expect_cachep, expect);
+}
 
-       return new;
+static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
+{
+       atomic_inc(&exp->master->ct_general.use);
+       exp->master->expecting++;
+       list_add(&exp->list, &ip_conntrack_expect_list);
+
+       if (exp->master->helper->timeout) {
+               init_timer(&exp->timeout);
+               exp->timeout.data = (unsigned long)exp;
+               exp->timeout.function = expectation_timed_out;
+               exp->timeout.expires
+                       = jiffies + exp->master->helper->timeout * HZ;
+               add_timer(&exp->timeout);
+       } else
+               exp->timeout.function = NULL;
+
+       CONNTRACK_STAT_INC(expect_create);
 }
 
-static void
-ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
-                          struct ip_conntrack *related_to)
+/* Race with expectations being used means we could have none to find; OK. */
+static void evict_oldest_expect(struct ip_conntrack *master)
 {
-       DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
-       new->expectant = related_to;
-       new->sibling = NULL;
-       atomic_set(&new->use, 1);
-
-       /* add to expected list for this connection */
-       list_add_tail(&new->expected_list, &related_to->sibling_list);
-       /* add to global list of expectations */
-       list_prepend(&ip_conntrack_expect_list, &new->list);
-       /* add and start timer if required */
-       if (related_to->helper->timeout) {
-               init_timer(&new->timeout);
-               new->timeout.data = (unsigned long)new;
-               new->timeout.function = expectation_timed_out;
-               new->timeout.expires = jiffies +
-                                       related_to->helper->timeout * HZ;
-               add_timer(&new->timeout);
+       struct ip_conntrack_expect *i;
+
+       list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+               if (i->master == master) {
+                       if (del_timer(&i->timeout)) {
+                               unlink_expect(i);
+                               destroy_expect(i);
+                       }
+                       break;
+               }
        }
-       related_to->expecting++;
 }
 
-/* Add a related connection. */
-int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
-                               struct ip_conntrack *related_to)
+static inline int refresh_timer(struct ip_conntrack_expect *i)
 {
-       struct ip_conntrack_expect *old;
-       int ret = 0;
+       if (!del_timer(&i->timeout))
+               return 0;
 
-       WRITE_LOCK(&ip_conntrack_lock);
-       /* Because of the write lock, no reader can walk the lists,
-        * so there is no need to use the tuple lock too */
+       i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
+       add_timer(&i->timeout);
+       return 1;
+}
+
+int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
+{
+       struct ip_conntrack_expect *i;
+       int ret;
 
        DEBUGP("ip_conntrack_expect_related %p\n", related_to);
        DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
 
-       old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
-                       struct ip_conntrack_expect *, &expect->tuple, 
-                       &expect->mask);
-       if (old) {
-               /* Helper private data may contain offsets but no pointers
-                  pointing into the payload - otherwise we should have to copy 
-                  the data filled out by the helper over the old one */
-               DEBUGP("expect_related: resent packet\n");
-               if (related_to->helper->timeout) {
-                       if (!del_timer(&old->timeout)) {
-                               /* expectation is dying. Fall through */
+       WRITE_LOCK(&ip_conntrack_lock);
+       list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+               if (expect_matches(i, expect)) {
+                       /* Refresh timer: if it's dying, ignore.. */
+                       if (refresh_timer(i)) {
+                               ret = 0;
+                               /* We don't need the one they've given us. */
+                               ip_conntrack_expect_free(expect);
                                goto out;
-                       } else {
-                               old->timeout.expires = jiffies + 
-                                       related_to->helper->timeout * HZ;
-                               add_timer(&old->timeout);
                        }
+               } else if (expect_clash(i, expect)) {
+                       ret = -EBUSY;
+                       goto out;
                }
-
-               WRITE_UNLOCK(&ip_conntrack_lock);
-               kfree(expect);
-               return -EEXIST;
-
-       } else if (related_to->helper->max_expected && 
-                  related_to->expecting >= related_to->helper->max_expected) {
-               /* old == NULL */
-               if (!(related_to->helper->flags & 
-                     IP_CT_HELPER_F_REUSE_EXPECT)) {
-                       WRITE_UNLOCK(&ip_conntrack_lock);
-                       if (net_ratelimit())
-                               printk(KERN_WARNING
-                                      "ip_conntrack: max number of expected "
-                                      "connections %i of %s reached for "
-                                      "%u.%u.%u.%u->%u.%u.%u.%u\n",
-                                      related_to->helper->max_expected,
-                                      related_to->helper->name,
-                                      NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
-                                      NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
-                       kfree(expect);
-                       return -EPERM;
-               }
-               DEBUGP("ip_conntrack: max number of expected "
-                      "connections %i of %s reached for "
-                      "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
-                      related_to->helper->max_expected,
-                      related_to->helper->name,
-                      NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
-                      NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
-               /* choose the the oldest expectation to evict */
-               list_for_each_entry(old, &related_to->sibling_list, 
-                                                     expected_list)
-                       if (old->sibling == NULL)
-                               break;
-
-               /* We cannot fail since related_to->expecting is the number
-                * of unconfirmed expectations */
-               IP_NF_ASSERT(old && old->sibling == NULL);
-
-               /* newnat14 does not reuse the real allocated memory
-                * structures but rather unexpects the old and
-                * allocates a new.  unexpect_related will decrement
-                * related_to->expecting. 
-                */
-               unexpect_related(old);
-               ret = -EPERM;
-       } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
-                            struct ip_conntrack_expect *, &expect->tuple, 
-                            &expect->mask)) {
-               WRITE_UNLOCK(&ip_conntrack_lock);
-               DEBUGP("expect_related: busy!\n");
-
-               kfree(expect);
-               return -EBUSY;
        }
 
-out:   ip_conntrack_expect_insert(expect, related_to);
+       /* Will be over limit? */
+       if (expect->master->helper->max_expected && 
+           expect->master->expecting >= expect->master->helper->max_expected)
+               evict_oldest_expect(expect->master);
 
+       ip_conntrack_expect_insert(expect);
+       ret = 0;
+out:
        WRITE_UNLOCK(&ip_conntrack_lock);
-
-       return ret;
-}
-
-/* Change tuple in an existing expectation */
-int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
-                              struct ip_conntrack_tuple *newtuple)
-{
-       int ret;
-
-       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-       WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
-
-       DEBUGP("change_expect:\n");
-       DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
-       DEBUGP("exp mask:  "); DUMP_TUPLE(&expect->mask);
-       DEBUGP("newtuple:  "); DUMP_TUPLE(newtuple);
-       if (expect->ct_tuple.dst.protonum == 0) {
-               /* Never seen before */
-               DEBUGP("change expect: never seen before\n");
-               if (!ip_ct_tuple_equal(&expect->tuple, newtuple) 
-                   && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
-                                struct ip_conntrack_expect *, newtuple, &expect->mask)) {
-                       /* Force NAT to find an unused tuple */
-                       ret = -1;
-               } else {
-                       memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
-                       memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
-                       ret = 0;
-               }
-       } else {
-               /* Resent packet */
-               DEBUGP("change expect: resent packet\n");
-               if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
-                       ret = 0;
-               } else {
-                       /* Force NAT to choose again the same port */
-                       ret = -1;
-               }
-       }
-       WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
-       
-       return ret;
+       return ret;
 }
 
-/* Alter reply tuple (maybe alter helper).  If it's already taken,
-   return 0 and don't do alteration. */
-int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
-                            const struct ip_conntrack_tuple *newreply)
+/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
+   implicitly racy: see __ip_conntrack_confirm */
+void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
+                             const struct ip_conntrack_tuple *newreply)
 {
        WRITE_LOCK(&ip_conntrack_lock);
-       if (__ip_conntrack_find(newreply, conntrack)) {
-               WRITE_UNLOCK(&ip_conntrack_lock);
-               return 0;
-       }
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));
 
@@ -1118,15 +849,14 @@ int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
        DUMP_TUPLE(newreply);
 
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
-       if (!conntrack->master && list_empty(&conntrack->sibling_list))
+       if (!conntrack->master && conntrack->expecting == 0)
                conntrack->helper = ip_ct_find_helper(newreply);
        WRITE_UNLOCK(&ip_conntrack_lock);
-
-       return 1;
 }
 
 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
 {
+       BUG_ON(me->timeout == 0);
        WRITE_LOCK(&ip_conntrack_lock);
        list_prepend(&helpers, me);
        WRITE_UNLOCK(&ip_conntrack_lock);
@@ -1137,24 +867,29 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
                         const struct ip_conntrack_helper *me)
 {
-       if (i->ctrack->helper == me) {
-               /* Get rid of any expected. */
-               remove_expectations(i->ctrack, 0);
-               /* And *then* set helper to NULL */
-               i->ctrack->helper = NULL;
-       }
+       if (tuplehash_to_ctrack(i)->helper == me)
+               tuplehash_to_ctrack(i)->helper = NULL;
        return 0;
 }
 
 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
 {
        unsigned int i;
+       struct ip_conntrack_expect *exp, *tmp;
 
        /* Need write lock here, to delete helper. */
        WRITE_LOCK(&ip_conntrack_lock);
        LIST_DELETE(&helpers, me);
 
+       /* Get rid of expectations */
+       list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
+               if (exp->master->helper == me && del_timer(&exp->timeout)) {
+                       unlink_expect(exp);
+                       destroy_expect(exp);
+               }
+       }
        /* Get rid of expecteds, set helpers to NULL. */
+       LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
        for (i = 0; i < ip_conntrack_htable_size; i++)
                LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
                            struct ip_conntrack_tuple_hash *, me);
@@ -1164,40 +899,59 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
        synchronize_net();
 }
 
-/* Refresh conntrack for this many jiffies. */
-void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
+static inline void ct_add_counters(struct ip_conntrack *ct,
+                                  enum ip_conntrack_info ctinfo,
+                                  const struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_NF_CT_ACCT
+       if (skb) {
+               ct->counters[CTINFO2DIR(ctinfo)].packets++;
+               ct->counters[CTINFO2DIR(ctinfo)].bytes += 
+                                       ntohs(skb->nh.iph->tot_len);
+       }
+#endif
+}
+
+/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
+void ip_ct_refresh_acct(struct ip_conntrack *ct, 
+                       enum ip_conntrack_info ctinfo,
+                       const struct sk_buff *skb,
+                       unsigned long extra_jiffies)
 {
        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
 
        /* If not in hash table, timer will not be active yet */
-       if (!is_confirmed(ct))
+       if (!is_confirmed(ct)) {
                ct->timeout.expires = extra_jiffies;
-       else {
+               ct_add_counters(ct, ctinfo, skb);
+       } else {
                WRITE_LOCK(&ip_conntrack_lock);
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                }
+               ct_add_counters(ct, ctinfo, skb);
                WRITE_UNLOCK(&ip_conntrack_lock);
        }
 }
 
 /* Returns new sk_buff, or NULL */
 struct sk_buff *
-ip_ct_gather_frags(struct sk_buff *skb)
+ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
 {
        struct sock *sk = skb->sk;
 #ifdef CONFIG_NETFILTER_DEBUG
        unsigned int olddebug = skb->nf_debug;
 #endif
+
        if (sk) {
                sock_hold(sk);
                skb_orphan(skb);
        }
 
        local_bh_disable(); 
-       skb = ip_defrag(skb);
+       skb = ip_defrag(skb, user);
        local_bh_enable();
 
        if (!skb) {
@@ -1221,66 +975,71 @@ ip_ct_gather_frags(struct sk_buff *skb)
 }
 
 /* Used by ipt_REJECT. */
-static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
+static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
 {
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
 
-       ct = __ip_conntrack_get(nfct, &ctinfo);
-
-       /* This ICMP is in reverse direction to the packet which
-           caused it */
+       /* This ICMP is in reverse direction to the packet which caused it */
+       ct = ip_conntrack_get(skb, &ctinfo);
+       
        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;
 
-       /* Attach new skbuff, and increment count */
-       nskb->nfct = &ct->infos[ctinfo];
-       atomic_inc(&ct->ct_general.use);
+       /* Attach to new skbuff, and increment count */
+       nskb->nfct = &ct->ct_general;
+       nskb->nfctinfo = ctinfo;
+       nf_conntrack_get(nskb->nfct);
 }
 
 static inline int
-do_kill(const struct ip_conntrack_tuple_hash *i,
-       int (*kill)(const struct ip_conntrack *i, void *data),
+do_iter(const struct ip_conntrack_tuple_hash *i,
+       int (*iter)(struct ip_conntrack *i, void *data),
        void *data)
 {
-       return kill(i->ctrack, data);
+       return iter(tuplehash_to_ctrack(i), data);
 }
 
 /* Bring out ya dead! */
 static struct ip_conntrack_tuple_hash *
-get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
+get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
                void *data, unsigned int *bucket)
 {
        struct ip_conntrack_tuple_hash *h = NULL;
 
-       READ_LOCK(&ip_conntrack_lock);
-       for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
-               h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
-                             struct ip_conntrack_tuple_hash *, kill, data);
+       WRITE_LOCK(&ip_conntrack_lock);
+       for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
+               h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
+                               struct ip_conntrack_tuple_hash *, iter, data);
+               if (h)
+                       break;
        }
+       if (!h)
+               h = LIST_FIND_W(&unconfirmed, do_iter,
+                               struct ip_conntrack_tuple_hash *, iter, data);
        if (h)
-               atomic_inc(&h->ctrack->ct_general.use);
-       READ_UNLOCK(&ip_conntrack_lock);
+               atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
+       WRITE_UNLOCK(&ip_conntrack_lock);
 
        return h;
 }
 
 void
-ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
-                       void *data)
+ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
 {
        struct ip_conntrack_tuple_hash *h;
        unsigned int bucket = 0;
 
-       while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
+       while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
+               struct ip_conntrack *ct = tuplehash_to_ctrack(h);
                /* Time to push up daises... */
-               if (del_timer(&h->ctrack->timeout))
-                       death_by_timeout((unsigned long)h->ctrack);
+               if (del_timer(&ct->timeout))
+                       death_by_timeout((unsigned long)ct);
                /* ... else the timer will get him soon. */
 
-               ip_conntrack_put(h->ctrack);
+               ip_conntrack_put(ct);
        }
 }
 
@@ -1291,7 +1050,7 @@ ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
 static int
 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 {
-       struct inet_opt *inet = inet_sk(sk);
+       struct inet_sock *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;
        
@@ -1317,16 +1076,17 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;
+               struct ip_conntrack *ct = tuplehash_to_ctrack(h);
 
                sin.sin_family = AF_INET;
-               sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+               sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
-               sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+               sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;
 
                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
-               ip_conntrack_put(h->ctrack);
+               ip_conntrack_put(ct);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
@@ -1345,11 +1105,21 @@ static struct nf_sockopt_ops so_getorigdst = {
        .get            = &getorigdst,
 };
 
-static int kill_all(const struct ip_conntrack *i, void *data)
+static int kill_all(struct ip_conntrack *i, void *data)
 {
        return 1;
 }
 
+static void free_conntrack_hash(void)
+{
+       if (ip_conntrack_vmalloc)
+               vfree(ip_conntrack_hash);
+       else
+               free_pages((unsigned long)ip_conntrack_hash, 
+                          get_order(sizeof(struct list_head)
+                                    * ip_conntrack_htable_size));
+}
+
 /* Mishearing the voices in his head, our hero wonders how he's
    supposed to kill the mall. */
 void ip_conntrack_cleanup(void)
@@ -1361,19 +1131,20 @@ void ip_conntrack_cleanup(void)
        synchronize_net();
  
  i_see_dead_people:
-       ip_ct_selective_cleanup(kill_all, NULL);
+       ip_ct_iterate_cleanup(kill_all, NULL);
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }
 
        kmem_cache_destroy(ip_conntrack_cachep);
-       vfree(ip_conntrack_hash);
+       kmem_cache_destroy(ip_conntrack_expect_cachep);
+       free_conntrack_hash();
        nf_unregister_sockopt(&so_getorigdst);
 }
 
 static int hashsize;
-MODULE_PARM(hashsize, "i");
+module_param(hashsize, int, 0400);
 
 int __init ip_conntrack_init(void)
 {
@@ -1406,8 +1177,20 @@ int __init ip_conntrack_init(void)
                return ret;
        }
 
-       ip_conntrack_hash = vmalloc(sizeof(struct list_head)
-                                   * ip_conntrack_htable_size);
+       /* AK: the hash table is twice as big as needed because it
+          uses list_head.  It would be much nicer for caches to use a
+          single pointer list head here. */
+       ip_conntrack_vmalloc = 0; 
+       ip_conntrack_hash 
+               =(void*)__get_free_pages(GFP_KERNEL, 
+                                        get_order(sizeof(struct list_head)
+                                                  *ip_conntrack_htable_size));
+       if (!ip_conntrack_hash) { 
+               ip_conntrack_vmalloc = 1;
+               printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
+               ip_conntrack_hash = vmalloc(sizeof(struct list_head)
+                                           * ip_conntrack_htable_size);
+       }
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
@@ -1415,17 +1198,28 @@ int __init ip_conntrack_init(void)
 
        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
-                                               SLAB_HWCACHE_ALIGN, NULL, NULL);
+                                               0, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }
+
+       ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
+                                       sizeof(struct ip_conntrack_expect),
+                                       0, 0, NULL, NULL);
+       if (!ip_conntrack_expect_cachep) {
+               printk(KERN_ERR "Unable to create ip_expect slab cache\n");
+               goto err_free_conntrack_slab;
+       }
+
        /* Don't NEED lock here, but good form anyway. */
        WRITE_LOCK(&ip_conntrack_lock);
+       for (i = 0; i < MAX_IP_CT_PROTO; i++)
+               ip_ct_protos[i] = &ip_conntrack_generic_protocol;
        /* Sew in builtin protocols. */
-       list_append(&protocol_list, &ip_conntrack_protocol_tcp);
-       list_append(&protocol_list, &ip_conntrack_protocol_udp);
-       list_append(&protocol_list, &ip_conntrack_protocol_icmp);
+       ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
+       ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
+       ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
        WRITE_UNLOCK(&ip_conntrack_lock);
 
        for (i = 0; i < ip_conntrack_htable_size; i++)
@@ -1439,16 +1233,13 @@ int __init ip_conntrack_init(void)
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and look it like as a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
-       /*  - and prepare the ctinfo field for REJECT & NAT. */
-       ip_conntrack_untracked.infos[IP_CT_NEW].master =
-       ip_conntrack_untracked.infos[IP_CT_RELATED].master =
-       ip_conntrack_untracked.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master = 
-                       &ip_conntrack_untracked.ct_general;
 
        return ret;
 
+err_free_conntrack_slab:
+       kmem_cache_destroy(ip_conntrack_cachep);
 err_free_hash:
-       vfree(ip_conntrack_hash);
+       free_conntrack_hash();
 err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);