Fedora kernel-2.6.17-1.2142_FC4 patched with stable patch-2.6.17.4-vs2.0.2-rc26.diff
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 00a89f4..a297da7 100644
 #include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/jhash.h>
-/* For ERR_PTR().  Yeah, I know... --RR */
-#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
 
-/* This rwlock protects the main hash table, protocol/helper/expected
+/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
    registrations, conntrack timers */
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
 
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -48,7 +50,7 @@
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 #include <linux/netfilter_ipv4/listhelp.h>
 
-#define IP_CONNTRACK_VERSION   "2.1"
+#define IP_CONNTRACK_VERSION   "2.4"
 
 #if 0
 #define DEBUGP printk
 #else
 #define DEBUGP(format, args...)
 #endif
 
-DECLARE_RWLOCK(ip_conntrack_lock);
-DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
+DEFINE_RWLOCK(ip_conntrack_lock);
+
+/* ip_conntrack_standalone needs this */
+atomic_t ip_conntrack_count = ATOMIC_INIT(0);
 
 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
 LIST_HEAD(ip_conntrack_expect_list);
-LIST_HEAD(protocol_list);
+struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
 static LIST_HEAD(helpers);
 unsigned int ip_conntrack_htable_size = 0;
 int ip_conntrack_max;
-static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
 struct list_head *ip_conntrack_hash;
-static kmem_cache_t *ip_conntrack_cachep;
+static kmem_cache_t *ip_conntrack_cachep __read_mostly;
+static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
 struct ip_conntrack ip_conntrack_untracked;
-
-extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
-
-static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
-                             u_int8_t protocol)
+unsigned int ip_ct_log_invalid;
+static LIST_HEAD(unconfirmed);
+static int ip_conntrack_vmalloc;
+
+static unsigned int ip_conntrack_next_id;
+static unsigned int ip_conntrack_expect_next_id;
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
+ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
+
+DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
+
+/* deliver cached events and clear cache entry - must be called with locally
+ * disabled softirqs */
+static inline void
+__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
 {
-       return protocol == curr->proto;
+       DEBUGP("ecache: delivering events for %p\n", ecache->ct);
+       if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
+               atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
+                                   ecache->ct);
+       ecache->events = 0;
+       ip_conntrack_put(ecache->ct);
+       ecache->ct = NULL;
 }
 
-struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
+/* Deliver all cached events for a particular conntrack. This is called
+ * by code prior to async packet handling or freeing the skb */
+void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
 {
-       struct ip_conntrack_protocol *p;
-
-       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-       p = LIST_FIND(&protocol_list, proto_cmpfn,
-                     struct ip_conntrack_protocol *, protocol);
-       if (!p)
-               p = &ip_conntrack_generic_protocol;
-
-       return p;
+       struct ip_conntrack_ecache *ecache;
+       
+       local_bh_disable();
+       ecache = &__get_cpu_var(ip_conntrack_ecache);
+       if (ecache->ct == ct)
+               __ip_ct_deliver_cached_events(ecache);
+       local_bh_enable();
 }
 
-struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
+void __ip_ct_event_cache_init(struct ip_conntrack *ct)
 {
-       struct ip_conntrack_protocol *p;
-
-       READ_LOCK(&ip_conntrack_lock);
-       p = __ip_ct_find_proto(protocol);
-       READ_UNLOCK(&ip_conntrack_lock);
-       return p;
+       struct ip_conntrack_ecache *ecache;
+
+       /* take care of delivering potentially old events */
+       ecache = &__get_cpu_var(ip_conntrack_ecache);
+       BUG_ON(ecache->ct == ct);
+       if (ecache->ct)
+               __ip_ct_deliver_cached_events(ecache);
+       /* initialize for this conntrack/packet */
+       ecache->ct = ct;
+       nf_conntrack_get(&ct->ct_general);
 }
 
-inline void 
-ip_conntrack_put(struct ip_conntrack *ct)
+/* flush the event cache - touches other CPUs' data and must not be called
+ * while packets are still passing through the code */
+static void ip_ct_event_cache_flush(void)
 {
-       IP_NF_ASSERT(ct);
-       IP_NF_ASSERT(ct->infos[0].master);
-       /* nf_conntrack_put wants to go via an info struct, so feed it
-           one at random. */
-       nf_conntrack_put(&ct->infos[0]);
+       struct ip_conntrack_ecache *ecache;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               ecache = &per_cpu(ip_conntrack_ecache, cpu);
+               if (ecache->ct)
+                       ip_conntrack_put(ecache->ct);
+       }
 }
+#else
+static inline void ip_ct_event_cache_flush(void) {}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
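The #ifdef block above implements conntrack event coalescing: status changes are OR'ed into a per-CPU cache and delivered through the notifier chain in one batch when the cache is reassigned to a different conntrack or flushed. A minimal userspace sketch of the coalescing idea (illustrative names, a single cache standing in for the per-CPU copies, printf standing in for the notifier chain):

    #include <stdio.h>

    struct ecache { void *owner; unsigned int events; };

    static struct ecache cache;             /* one per CPU in the kernel */

    static void deliver(struct ecache *e)
    {
            if (e->owner && e->events)
                    printf("deliver events 0x%x for %p\n", e->events, e->owner);
            e->events = 0;
            e->owner = NULL;
    }

    static void cache_event(void *owner, unsigned int ev)
    {
            if (cache.owner != owner) {     /* new owner: flush old events */
                    deliver(&cache);
                    cache.owner = owner;
            }
            cache.events |= ev;             /* coalesce */
    }

    int main(void)
    {
            int a, b;
            cache_event(&a, 0x1);
            cache_event(&a, 0x4);           /* merged with 0x1, not delivered yet */
            cache_event(&b, 0x2);           /* flushes a's events (0x5) first */
            deliver(&cache);                /* final flush delivers b's 0x2 */
            return 0;
    }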
+
+DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
 
 static int ip_conntrack_hash_rnd_initted;
 static unsigned int ip_conntrack_hash_rnd;
 
-static u_int32_t
-hash_conntrack(const struct ip_conntrack_tuple *tuple)
+static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
+                           unsigned int size, unsigned int rnd)
 {
-#if 0
-       dump_tuple(tuple);
-#endif
        return (jhash_3words(tuple->src.ip,
                             (tuple->dst.ip ^ tuple->dst.protonum),
                             (tuple->src.u.all | (tuple->dst.u.all << 16)),
-                            ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
+                            rnd) % size);
+}
+
+static u_int32_t
+hash_conntrack(const struct ip_conntrack_tuple *tuple)
+{
+       return __hash_conntrack(tuple, ip_conntrack_htable_size,
+                               ip_conntrack_hash_rnd);
 }
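The bucket selection above folds the source address, the destination address XOR'ed with the protocol number, and both ports into jhash_3words(), seeded with ip_conntrack_hash_rnd so remote hosts cannot precompute hash-chain collisions. A userspace sketch of the same structure; jhash is kernel code, so a trivial stand-in mixer is used here and the bucket values will differ from the kernel's:

    #include <stdint.h>
    #include <stdio.h>

    /* stand-in for jhash_3words(): three words mixed with a random seed */
    static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
    {
            uint32_t h = seed;
            h = (h ^ a) * 0x9e3779b1;
            h = (h ^ b) * 0x9e3779b1;
            h = (h ^ c) * 0x9e3779b1;
            return h ^ (h >> 16);
    }

    int main(void)
    {
            uint32_t src = 0xc0a80001, dst = 0x0a000001;  /* 192.168.0.1 -> 10.0.0.1 */
            uint16_t sport = 1025, dport = 80;
            uint8_t  proto = 6;                           /* TCP */
            uint32_t rnd = 0x12345678;                    /* ip_conntrack_hash_rnd */
            unsigned int size = 8192;                     /* ip_conntrack_htable_size */

            unsigned int bucket = mix3(src, dst ^ proto,
                                       sport | ((uint32_t)dport << 16), rnd) % size;
            printf("bucket = %u\n", bucket);
            return 0;
    }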
 
 int
-get_tuple(const struct iphdr *iph,
-         const struct sk_buff *skb,
-         unsigned int dataoff,
-         struct ip_conntrack_tuple *tuple,
-         const struct ip_conntrack_protocol *protocol)
+ip_ct_get_tuple(const struct iphdr *iph,
+               const struct sk_buff *skb,
+               unsigned int dataoff,
+               struct ip_conntrack_tuple *tuple,
+               const struct ip_conntrack_protocol *protocol)
 {
        /* Never happens */
        if (iph->frag_off & htons(IP_OFFSET)) {
@@ -143,146 +181,114 @@ get_tuple(const struct iphdr *iph,
        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;
+       tuple->dst.dir = IP_CT_DIR_ORIGINAL;
 
        return protocol->pkt_to_tuple(skb, dataoff, tuple);
 }
 
-static int
-invert_tuple(struct ip_conntrack_tuple *inverse,
-            const struct ip_conntrack_tuple *orig,
-            const struct ip_conntrack_protocol *protocol)
+int
+ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
+                  const struct ip_conntrack_tuple *orig,
+                  const struct ip_conntrack_protocol *protocol)
 {
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;
+       inverse->dst.dir = !orig->dst.dir;
 
        return protocol->invert_tuple(inverse, orig);
 }
 
 
 /* ip_conntrack_expect helper functions */
-
-/* Compare tuple parts depending on mask. */
-static inline int expect_cmp(const struct ip_conntrack_expect *i,
-                            const struct ip_conntrack_tuple *tuple)
+void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
 {
-       MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
-       return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
-}
-
-static void
-destroy_expect(struct ip_conntrack_expect *exp)
-{
-       DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
-       IP_NF_ASSERT(atomic_read(&exp->use) == 0);
+       ASSERT_WRITE_LOCK(&ip_conntrack_lock);
        IP_NF_ASSERT(!timer_pending(&exp->timeout));
-
-       kfree(exp);
+       list_del(&exp->list);
+       CONNTRACK_STAT_INC(expect_delete);
+       exp->master->expecting--;
+       ip_conntrack_expect_put(exp);
 }
 
-inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
+static void expectation_timed_out(unsigned long ul_expect)
 {
-       IP_NF_ASSERT(exp);
+       struct ip_conntrack_expect *exp = (void *)ul_expect;
 
-       if (atomic_dec_and_test(&exp->use)) {
-               /* usage count dropped to zero */
-               destroy_expect(exp);
-       }
-}
-
-static inline struct ip_conntrack_expect *
-__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
-{
-       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-       MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
-       return LIST_FIND(&ip_conntrack_expect_list, expect_cmp, 
-                        struct ip_conntrack_expect *, tuple);
+       write_lock_bh(&ip_conntrack_lock);
+       ip_ct_unlink_expect(exp);
+       write_unlock_bh(&ip_conntrack_lock);
+       ip_conntrack_expect_put(exp);
 }
 
-/* Find a expectation corresponding to a tuple. */
 struct ip_conntrack_expect *
-ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
+__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
 {
-       struct ip_conntrack_expect *exp;
-
-       READ_LOCK(&ip_conntrack_lock);
-       READ_LOCK(&ip_conntrack_expect_tuple_lock);
-       exp = __ip_ct_expect_find(tuple);
-       if (exp)
-               atomic_inc(&exp->use);
-       READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
-       READ_UNLOCK(&ip_conntrack_lock);
-
-       return exp;
+       struct ip_conntrack_expect *i;
+       
+       list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+               if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
+                       atomic_inc(&i->use);
+                       return i;
+               }
+       }
+       return NULL;
 }
 
-/* remove one specific expectation from all lists and drop refcount,
- * does _NOT_ delete the timer. */
-static void __unexpect_related(struct ip_conntrack_expect *expect)
+/* Just find an expectation corresponding to a tuple. */
+struct ip_conntrack_expect *
+ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
 {
-       DEBUGP("unexpect_related(%p)\n", expect);
-       MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
-
-       /* we're not allowed to unexpect a confirmed expectation! */
-       IP_NF_ASSERT(!expect->sibling);
-
-       /* delete from global and local lists */
-       list_del(&expect->list);
-       list_del(&expect->expected_list);
-
-       /* decrement expect-count of master conntrack */
-       if (expect->expectant)
-               expect->expectant->expecting--;
+       struct ip_conntrack_expect *i;
+       
+       read_lock_bh(&ip_conntrack_lock);
+       i = __ip_conntrack_expect_find(tuple);
+       read_unlock_bh(&ip_conntrack_lock);
 
-       ip_conntrack_expect_put(expect);
+       return i;
 }
 
-/* remove one specific expecatation from all lists, drop refcount
- * and expire timer. 
- * This function can _NOT_ be called for confirmed expects! */
-static void unexpect_related(struct ip_conntrack_expect *expect)
+/* If an expectation for this connection is found, it is deleted from
+ * the global list, then returned. */
+static struct ip_conntrack_expect *
+find_expectation(const struct ip_conntrack_tuple *tuple)
 {
-       IP_NF_ASSERT(expect->expectant);
-       IP_NF_ASSERT(expect->expectant->helper);
-       /* if we are supposed to have a timer, but we can't delete
-        * it: race condition.  __unexpect_related will
-        * be calledd by timeout function */
-       if (expect->expectant->helper->timeout
-           && !del_timer(&expect->timeout))
-               return;
-
-       __unexpect_related(expect);
+       struct ip_conntrack_expect *i;
+
+       list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+               /* If master is not in hash table yet (ie. packet hasn't left
+                  this machine yet), how can other end know about expected?
+                  Hence these are not the droids you are looking for (if
+                  master ct never got confirmed, we'd hold a reference to it
+                  and weird things would happen to future packets). */
+               if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
+                   && is_confirmed(i->master)) {
+                       if (i->flags & IP_CT_EXPECT_PERMANENT) {
+                               atomic_inc(&i->use);
+                               return i;
+                       } else if (del_timer(&i->timeout)) {
+                               ip_ct_unlink_expect(i);
+                               return i;
+                       }
+               }
+       }
+       return NULL;
 }
 
-/* delete all unconfirmed expectations for this conntrack */
-static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
+/* delete all expectations for this conntrack */
+void ip_ct_remove_expectations(struct ip_conntrack *ct)
 {
-       struct list_head *exp_entry, *next;
-       struct ip_conntrack_expect *exp;
+       struct ip_conntrack_expect *i, *tmp;
 
-       DEBUGP("remove_expectations(%p)\n", ct);
-
-       list_for_each_safe(exp_entry, next, &ct->sibling_list) {
-               exp = list_entry(exp_entry, struct ip_conntrack_expect,
-                                expected_list);
+       /* Optimization: most connections never expect any others. */
+       if (ct->expecting == 0)
+               return;
 
-               /* we skip established expectations, as we want to delete
-                * the un-established ones only */
-               if (exp->sibling) {
-                       DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
-                       if (drop_refcount) {
-                               /* Indicate that this expectations parent is dead */
-                               ip_conntrack_put(exp->expectant);
-                               exp->expectant = NULL;
-                       }
-                       continue;
+       list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
+               if (i->master == ct && del_timer(&i->timeout)) {
+                       ip_ct_unlink_expect(i);
+                       ip_conntrack_expect_put(i);
                }
-
-               IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
-               IP_NF_ASSERT(exp->expectant == ct);
-
-               /* delete expectation from global and private lists */
-               unexpect_related(exp);
        }
 }
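ip_ct_remove_expectations() unlinks entries while walking the list, which is why it uses list_for_each_entry_safe(): the successor is fetched before the current entry can be freed. A self-contained userspace sketch of that delete-while-iterating pattern:

    #include <stdio.h>
    #include <stdlib.h>

    struct node { int val; struct node *next; };

    int main(void)
    {
            struct node *head = NULL, *i, *tmp, **pp;
            for (int v = 0; v < 5; v++) {
                    struct node *n = malloc(sizeof(*n));
                    n->val = v;
                    n->next = head;
                    head = n;
            }
            /* delete even values during the walk: save 'tmp' first, as the
             * _safe variant does, so freeing 'i' cannot break the traversal */
            for (pp = &head, i = head; i; i = tmp) {
                    tmp = i->next;
                    if (i->val % 2 == 0) {
                            *pp = tmp;
                            free(i);
                    } else {
                            pp = &i->next;
                    }
            }
            for (i = head; i; i = i->next)
                    printf("%d\n", i->val);         /* prints 3, then 1 */
            return 0;
    }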
 
@@ -292,69 +298,73 @@ clean_from_lists(struct ip_conntrack *ct)
        unsigned int ho, hr;
        
        DEBUGP("clean_from_lists(%p)\n", ct);
-       MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+       ASSERT_WRITE_LOCK(&ip_conntrack_lock);
 
        ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
        LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
 
-       /* Destroy all un-established, pending expectations */
-       remove_expectations(ct, 1);
+       /* Destroy all pending expectations */
+       ip_ct_remove_expectations(ct);
 }
 
 static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
-       struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
+       struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
        struct ip_conntrack_protocol *proto;
 
        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));
 
+       ip_conntrack_event(IPCT_DESTROY, ct);
+       set_bit(IPS_DYING_BIT, &ct->status);
+
        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
-       proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+       proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);
 
        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);
 
-       WRITE_LOCK(&ip_conntrack_lock);
-       /* Make sure don't leave any orphaned expectations lying around */
-       if (ct->expecting)
-               remove_expectations(ct, 1);
-
-       /* Delete our master expectation */
-       if (ct->master) {
-               if (ct->master->expectant) {
-                       /* can't call __unexpect_related here,
-                        * since it would screw up expect_list */
-                       list_del(&ct->master->expected_list);
-                       master = ct->master->expectant;
-               }
-               kfree(ct->master);
+       write_lock_bh(&ip_conntrack_lock);
+       /* Expectations will have been removed in clean_from_lists,
+        * except TFTP can create an expectation on the first packet,
+        * before connection is in the list, so we need to clean here,
+        * too. */
+       ip_ct_remove_expectations(ct);
+
+       /* We overload first tuple to link into unconfirmed list. */
+       if (!is_confirmed(ct)) {
+               BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
+               list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
        }
-       WRITE_UNLOCK(&ip_conntrack_lock);
 
-       if (master)
-               ip_conntrack_put(master);
+       CONNTRACK_STAT_INC(delete);
+       write_unlock_bh(&ip_conntrack_lock);
+
+       if (ct->master)
+               ip_conntrack_put(ct->master);
 
        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
-       kmem_cache_free(ip_conntrack_cachep, ct);
-       atomic_dec(&ip_conntrack_count);
+       ip_conntrack_free(ct);
 }
 
 static void death_by_timeout(unsigned long ul_conntrack)
 {
        struct ip_conntrack *ct = (void *)ul_conntrack;
 
-       WRITE_LOCK(&ip_conntrack_lock);
+       write_lock_bh(&ip_conntrack_lock);
+       /* Inside lock so preempt is disabled on module removal path.
+        * Otherwise we can get spurious warnings. */
+       CONNTRACK_STAT_INC(delete_list);
        clean_from_lists(ct);
-       WRITE_UNLOCK(&ip_conntrack_lock);
+       write_unlock_bh(&ip_conntrack_lock);
        ip_conntrack_put(ct);
 }
 
@@ -363,24 +373,28 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
                    const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
 {
-       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-       return i->ctrack != ignored_conntrack
+       ASSERT_READ_LOCK(&ip_conntrack_lock);
+       return tuplehash_to_ctrack(i) != ignored_conntrack
                && ip_ct_tuple_equal(tuple, &i->tuple);
 }
 
-static struct ip_conntrack_tuple_hash *
+struct ip_conntrack_tuple_hash *
 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
 {
        struct ip_conntrack_tuple_hash *h;
        unsigned int hash = hash_conntrack(tuple);
 
-       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-       h = LIST_FIND(&ip_conntrack_hash[hash],
-                     conntrack_tuple_cmp,
-                     struct ip_conntrack_tuple_hash *,
-                     tuple, ignored_conntrack);
-       return h;
+       ASSERT_READ_LOCK(&ip_conntrack_lock);
+       list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
+               if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
+                       CONNTRACK_STAT_INC(found);
+                       return h;
+               }
+               CONNTRACK_STAT_INC(searched);
+       }
+
+       return NULL;
 }
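tuplehash_to_ctrack(), used above and throughout this diff, recovers the enclosing ip_conntrack from a pointer to one of its two embedded tuplehash entries via container_of() pointer arithmetic. A minimal userspace sketch; the kernel keeps the direction in tuple.dst.dir, here it is stored directly in the entry for brevity:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct tuple_hash { int dir; };
    struct conntrack  { int id; struct tuple_hash tuplehash[2]; };

    static struct conntrack *to_ctrack(struct tuple_hash *h)
    {
            /* step back to tuplehash[0], then to the enclosing struct */
            return container_of(h - h->dir, struct conntrack, tuplehash);
    }

    int main(void)
    {
            struct conntrack ct = { .id = 42,
                    .tuplehash = { { .dir = 0 }, { .dir = 1 } } };
            printf("id = %d\n", to_ctrack(&ct.tuplehash[1])->id);   /* 42 */
            return 0;
    }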
 
 /* Find a connection corresponding to a tuple. */
@@ -390,45 +404,47 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
 {
        struct ip_conntrack_tuple_hash *h;
 
-       READ_LOCK(&ip_conntrack_lock);
+       read_lock_bh(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
-               atomic_inc(&h->ctrack->ct_general.use);
-       READ_UNLOCK(&ip_conntrack_lock);
+               atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
+       read_unlock_bh(&ip_conntrack_lock);
 
        return h;
 }
 
-static inline struct ip_conntrack *
-__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
+static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
+                                       unsigned int hash,
+                                       unsigned int repl_hash) 
 {
-       struct ip_conntrack *ct
-               = (struct ip_conntrack *)nfct->master;
-
-       /* ctinfo is the index of the nfct inside the conntrack */
-       *ctinfo = nfct - ct->infos;
-       IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
-       return ct;
+       ct->id = ++ip_conntrack_next_id;
+       list_prepend(&ip_conntrack_hash[hash],
+                    &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+       list_prepend(&ip_conntrack_hash[repl_hash],
+                    &ct->tuplehash[IP_CT_DIR_REPLY].list);
 }
 
-/* Return conntrack and conntrack_info given skb->nfct->master */
-struct ip_conntrack *
-ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
+void ip_conntrack_hash_insert(struct ip_conntrack *ct)
 {
-       if (skb->nfct) 
-               return __ip_conntrack_get(skb->nfct, ctinfo);
-       return NULL;
+       unsigned int hash, repl_hash;
+
+       hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+       repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+       write_lock_bh(&ip_conntrack_lock);
+       __ip_conntrack_hash_insert(ct, hash, repl_hash);
+       write_unlock_bh(&ip_conntrack_lock);
 }
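Each conntrack is inserted twice, once under the hash of its ORIGINAL tuple and once under the hash of its REPLY tuple, so a packet from either direction hashes its own tuple and finds the same object. A toy userspace illustration (single-slot buckets and fixed hash values, for brevity):

    #include <stdio.h>

    #define HTABLE 8
    struct conn { const char *name; };
    static struct conn *bucket[HTABLE];    /* toy table: one slot per bucket */

    int main(void)
    {
            struct conn c = { "10.0.0.1:1025 <-> 10.0.0.2:80" };
            unsigned int hash = 3, repl_hash = 6;  /* pretend hash_conntrack() results */

            bucket[hash] = &c;          /* ORIGINAL-direction tuple */
            bucket[repl_hash] = &c;     /* REPLY-direction tuple */

            /* a reply packet looks up its own tuple's bucket and still
             * reaches the same connection object */
            printf("%s\n", bucket[repl_hash]->name);
            return 0;
    }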
 
-/* Confirm a connection given skb->nfct; places it in hash table */
+/* Confirm a connection given skb; places it in hash table */
 int
-__ip_conntrack_confirm(struct nf_ct_info *nfct)
+__ip_conntrack_confirm(struct sk_buff **pskb)
 {
        unsigned int hash, repl_hash;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
 
-       ct = __ip_conntrack_get(nfct, &ctinfo);
+       ct = ip_conntrack_get(*pskb, &ctinfo);
 
        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in other direction.  Actual packet
@@ -450,7 +466,8 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct)
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);
 
-       WRITE_LOCK(&ip_conntrack_lock);
+       write_lock_bh(&ip_conntrack_lock);
+
        /* See if there's one in the list already, including reverse:
            NAT could have grabbed it without realizing, since we're
            not in the hash.  If there is, we lost race. */
@@ -462,10 +479,10 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct)
                          conntrack_tuple_cmp,
                          struct ip_conntrack_tuple_hash *,
                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
-               list_prepend(&ip_conntrack_hash[hash],
-                            &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
-               list_prepend(&ip_conntrack_hash[repl_hash],
-                            &ct->tuplehash[IP_CT_DIR_REPLY]);
+               /* Remove from unconfirmed list */
+               list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+
+               __ip_conntrack_hash_insert(ct, hash, repl_hash);
                /* Timer relative to confirmation time, not original
                   setting time, otherwise we'd get timer wrap in
                   weird delay cases. */
@@ -473,11 +490,24 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct)
                add_timer(&ct->timeout);
                atomic_inc(&ct->ct_general.use);
                set_bit(IPS_CONFIRMED_BIT, &ct->status);
-               WRITE_UNLOCK(&ip_conntrack_lock);
+               CONNTRACK_STAT_INC(insert);
+               write_unlock_bh(&ip_conntrack_lock);
+               if (ct->helper)
+                       ip_conntrack_event_cache(IPCT_HELPER, *pskb);
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+               if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
+                   test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
+                       ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
+#endif
+               ip_conntrack_event_cache(master_ct(ct) ?
+                                        IPCT_RELATED : IPCT_NEW, *pskb);
+
                return NF_ACCEPT;
        }
 
-       WRITE_UNLOCK(&ip_conntrack_lock);
+       CONNTRACK_STAT_INC(insert_failed);
+       write_unlock_bh(&ip_conntrack_lock);
+
        return NF_DROP;
 }
 
@@ -489,117 +519,44 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
 {
        struct ip_conntrack_tuple_hash *h;
 
-       READ_LOCK(&ip_conntrack_lock);
+       read_lock_bh(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
-       READ_UNLOCK(&ip_conntrack_lock);
+       read_unlock_bh(&ip_conntrack_lock);
 
        return h != NULL;
 }
 
-/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
-struct ip_conntrack *
-icmp_error_track(struct sk_buff *skb,
-                enum ip_conntrack_info *ctinfo,
-                unsigned int hooknum)
-{
-       struct ip_conntrack_tuple innertuple, origtuple;
-       struct {
-               struct icmphdr icmp;
-               struct iphdr ip;
-       } inside;
-       struct ip_conntrack_protocol *innerproto;
-       struct ip_conntrack_tuple_hash *h;
-       int dataoff;
-
-       IP_NF_ASSERT(skb->nfct == NULL);
-
-       /* Not enough header? */
-       if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
-               return NULL;
-
-       if (inside.icmp.type != ICMP_DEST_UNREACH
-           && inside.icmp.type != ICMP_SOURCE_QUENCH
-           && inside.icmp.type != ICMP_TIME_EXCEEDED
-           && inside.icmp.type != ICMP_PARAMETERPROB
-           && inside.icmp.type != ICMP_REDIRECT)
-               return NULL;
-
-       /* Ignore ICMP's containing fragments (shouldn't happen) */
-       if (inside.ip.frag_off & htons(IP_OFFSET)) {
-               DEBUGP("icmp_error_track: fragment of proto %u\n",
-                      inside.ip.protocol);
-               return NULL;
-       }
-
-       innerproto = ip_ct_find_proto(inside.ip.protocol);
-       dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
-       /* Are they talking about one of our connections? */
-       if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
-               DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol);
-               return NULL;
-       }
-
-       /* Ordinarily, we'd expect the inverted tupleproto, but it's
-          been preserved inside the ICMP. */
-       if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
-               DEBUGP("icmp_error_track: Can't invert tuple\n");
-               return NULL;
-       }
-
-       *ctinfo = IP_CT_RELATED;
-
-       h = ip_conntrack_find_get(&innertuple, NULL);
-       if (!h) {
-               /* Locally generated ICMPs will match inverted if they
-                  haven't been SNAT'ed yet */
-               /* FIXME: NAT code has to handle half-done double NAT --RR */
-               if (hooknum == NF_IP_LOCAL_OUT)
-                       h = ip_conntrack_find_get(&origtuple, NULL);
-
-               if (!h) {
-                       DEBUGP("icmp_error_track: no match\n");
-                       return NULL;
-               }
-               /* Reverse direction from that found */
-               if (DIRECTION(h) != IP_CT_DIR_REPLY)
-                       *ctinfo += IP_CT_IS_REPLY;
-       } else {
-               if (DIRECTION(h) == IP_CT_DIR_REPLY)
-                       *ctinfo += IP_CT_IS_REPLY;
-       }
-
-       /* Update skb to refer to this connection */
-       skb->nfct = &h->ctrack->infos[*ctinfo];
-       return h->ctrack;
-}
-
 /* There's a small race here where we may free a just-assured
    connection.  Too bad: we're in trouble anyway. */
 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
 {
-       return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
+       return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
 }
 
 static int early_drop(struct list_head *chain)
 {
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
+       struct ip_conntrack *ct = NULL;
        int dropped = 0;
 
-       READ_LOCK(&ip_conntrack_lock);
+       read_lock_bh(&ip_conntrack_lock);
        h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
-       if (h)
-               atomic_inc(&h->ctrack->ct_general.use);
-       READ_UNLOCK(&ip_conntrack_lock);
+       if (h) {
+               ct = tuplehash_to_ctrack(h);
+               atomic_inc(&ct->ct_general.use);
+       }
+       read_unlock_bh(&ip_conntrack_lock);
 
-       if (!h)
+       if (!ct)
                return dropped;
 
-       if (del_timer(&h->ctrack->timeout)) {
-               death_by_timeout((unsigned long)h->ctrack);
+       if (del_timer(&ct->timeout)) {
+               death_by_timeout((unsigned long)ct);
                dropped = 1;
+               CONNTRACK_STAT_INC(early_drop);
        }
-       ip_conntrack_put(h->ctrack);
+       ip_conntrack_put(ct);
        return dropped;
 }
 
@@ -609,43 +566,86 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i,
        return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
 }
 
-struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+static struct ip_conntrack_helper *
+__ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
 {
        return LIST_FIND(&helpers, helper_cmp,
                         struct ip_conntrack_helper *,
                         tuple);
 }
 
-/* Allocate a new conntrack: we return -ENOMEM if classification
-   failed due to stress.  Otherwise it really is unclassifiable. */
-static struct ip_conntrack_tuple_hash *
-init_conntrack(const struct ip_conntrack_tuple *tuple,
-              struct ip_conntrack_protocol *protocol,
-              struct sk_buff *skb)
+struct ip_conntrack_helper *
+ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
+{
+       struct ip_conntrack_helper *helper;
+
+       /* need ip_conntrack_lock to assure that helper exists until
+        * try_module_get() is called */
+       read_lock_bh(&ip_conntrack_lock);
+
+       helper = __ip_conntrack_helper_find(tuple);
+       if (helper) {
+               /* need to increase module usage count to assure helper will
+                * not go away while the caller is e.g. busy putting a
+                * conntrack in the hash that uses the helper */
+               if (!try_module_get(helper->me))
+                       helper = NULL;
+       }
+
+       read_unlock_bh(&ip_conntrack_lock);
+
+       return helper;
+}
+
+void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
+{
+       module_put(helper->me);
+}
+
+struct ip_conntrack_protocol *
+__ip_conntrack_proto_find(u_int8_t protocol)
+{
+       return ip_ct_protos[protocol];
+}
+
+/* this is guaranteed to always return a valid protocol helper, since
+ * it falls back to generic_protocol */
+struct ip_conntrack_protocol *
+ip_conntrack_proto_find_get(u_int8_t protocol)
+{
+       struct ip_conntrack_protocol *p;
+
+       preempt_disable();
+       p = __ip_conntrack_proto_find(protocol);
+       if (p) {
+               if (!try_module_get(p->me))
+                       p = &ip_conntrack_generic_protocol;
+       }
+       preempt_enable();
+       
+       return p;
+}
+
+void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
+{
+       module_put(p->me);
+}
+
+struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
+                                       struct ip_conntrack_tuple *repl)
 {
        struct ip_conntrack *conntrack;
-       struct ip_conntrack_tuple repl_tuple;
-       size_t hash;
-       struct ip_conntrack_expect *expected;
-       int i;
-       static unsigned int drop_next;
 
        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
                ip_conntrack_hash_rnd_initted = 1;
        }
 
-       hash = hash_conntrack(tuple);
-
-       if (ip_conntrack_max &&
-           atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
-               /* Try dropping from random chain, or else from the
-                   chain about to put into (in case they're trying to
-                   bomb one hash chain). */
-               unsigned int next = (drop_next++)%ip_conntrack_htable_size;
-
-               if (!early_drop(&ip_conntrack_hash[next])
-                   && !early_drop(&ip_conntrack_hash[hash])) {
+       if (ip_conntrack_max
+           && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
+               unsigned int hash = hash_conntrack(orig);
+               /* Try dropping from this hash chain. */
+               if (!early_drop(&ip_conntrack_hash[hash])) {
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
@@ -654,11 +654,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
                }
        }
 
-       if (!invert_tuple(&repl_tuple, tuple, protocol)) {
-               DEBUGP("Can't invert tuple.\n");
-               return NULL;
-       }
-
        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
@@ -668,65 +663,86 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
-       conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
-       conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
-       conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
-       conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
-       for (i=0; i < IP_CT_NUMBER; i++)
-               conntrack->infos[i].master = &conntrack->ct_general;
-
-       if (!protocol->new(conntrack, skb)) {
-               kmem_cache_free(ip_conntrack_cachep, conntrack);
-               return NULL;
-       }
+       conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+       conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;
 
-       INIT_LIST_HEAD(&conntrack->sibling_list);
-
-       WRITE_LOCK(&ip_conntrack_lock);
-       /* Need finding and deleting of expected ONLY if we win race */
-       READ_LOCK(&ip_conntrack_expect_tuple_lock);
-       expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
-                            struct ip_conntrack_expect *, tuple);
-       READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
-
-       /* If master is not in hash table yet (ie. packet hasn't left
-          this machine yet), how can other end know about expected?
-          Hence these are not the droids you are looking for (if
-          master ct never got confirmed, we'd hold a reference to it
-          and weird things would happen to future packets). */
-       if (expected && !is_confirmed(expected->expectant))
-               expected = NULL;
-
-       /* Look up the conntrack helper for master connections only */
-       if (!expected)
-               conntrack->helper = ip_ct_find_helper(&repl_tuple);
-
-       /* If the expectation is dying, then this is a loser. */
-       if (expected
-           && expected->expectant->helper->timeout
-           && ! del_timer(&expected->timeout))
-               expected = NULL;
-
-       if (expected) {
+       atomic_inc(&ip_conntrack_count);
+
+       return conntrack;
+}
+
+void
+ip_conntrack_free(struct ip_conntrack *conntrack)
+{
+       atomic_dec(&ip_conntrack_count);
+       kmem_cache_free(ip_conntrack_cachep, conntrack);
+}
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+ * failed due to stress.   Otherwise it really is unclassifiable */
+static struct ip_conntrack_tuple_hash *
+init_conntrack(struct ip_conntrack_tuple *tuple,
+              struct ip_conntrack_protocol *protocol,
+              struct sk_buff *skb)
+{
+       struct ip_conntrack *conntrack;
+       struct ip_conntrack_tuple repl_tuple;
+       struct ip_conntrack_expect *exp;
+
+       if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
+               DEBUGP("Can't invert tuple.\n");
+               return NULL;
+       }
+
+       conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
+       if (conntrack == NULL || IS_ERR(conntrack))
+               return (struct ip_conntrack_tuple_hash *)conntrack;
+
+       if (!protocol->new(conntrack, skb)) {
+               ip_conntrack_free(conntrack);
+               return NULL;
+       }
+
+       write_lock_bh(&ip_conntrack_lock);
+       exp = find_expectation(tuple);
+
+       if (exp) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
-                       conntrack, expected);
+                       conntrack, exp);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
-               conntrack->master = expected;
-               expected->sibling = conntrack;
-               LIST_DELETE(&ip_conntrack_expect_list, expected);
-               expected->expectant->expecting--;
-               nf_conntrack_get(&master_ct(conntrack)->infos[0]);
+               conntrack->master = exp->master;
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
+               conntrack->mark = exp->master->mark;
+#endif
+#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
+    defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
+               /* this is ugly, but there is no other place to put it */
+               conntrack->nat.masq_index = exp->master->nat.masq_index;
+#endif
+               nf_conntrack_get(&conntrack->master->ct_general);
+               CONNTRACK_STAT_INC(expect_new);
+       } else {
+               conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
+
+               CONNTRACK_STAT_INC(new);
+       }
+
+       /* Overload tuple linked list to put us in unconfirmed list. */
+       list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
+
+       write_unlock_bh(&ip_conntrack_lock);
+
+       if (exp) {
+               if (exp->expectfn)
+                       exp->expectfn(conntrack, exp);
+               ip_conntrack_expect_put(exp);
        }
-       atomic_inc(&ip_conntrack_count);
-       WRITE_UNLOCK(&ip_conntrack_lock);
 
-       if (expected && expected->expectfn)
-               expected->expectfn(conntrack);
        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
 }
 
@@ -740,10 +756,12 @@ resolve_normal_ct(struct sk_buff *skb,
 {
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;
+       struct ip_conntrack *ct;
 
        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
 
-       if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
+       if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
+                            &tuple, proto))
                return NULL;
 
        /* look for tuple match */
@@ -755,6 +773,7 @@ resolve_normal_ct(struct sk_buff *skb,
                if (IS_ERR(h))
                        return (void *)h;
        }
+       ct = tuplehash_to_ctrack(h);
 
        /* It exists; we have (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
@@ -763,23 +782,24 @@ resolve_normal_ct(struct sk_buff *skb,
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
-               if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
+               if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
-                              h->ctrack);
+                              ct);
                        *ctinfo = IP_CT_ESTABLISHED;
-               } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
+               } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
-                              h->ctrack);
+                              ct);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
-                              h->ctrack);
+                              ct);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
-       skb->nfct = &h->ctrack->infos[*ctinfo];
-       return h->ctrack;
+       skb->nfct = &ct->ct_general;
+       skb->nfctinfo = *ctinfo;
+       return ct;
 }
 
 /* Netfilter hook itself. */
@@ -792,9 +812,15 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
-       int set_reply;
+       int set_reply = 0;
        int ret;
 
+       /* Previously seen (loopback or untracked)?  Ignore. */
+       if ((*pskb)->nfct) {
+               CONNTRACK_STAT_INC(ignore);
+               return NF_ACCEPT;
+       }
+
        /* Never happens */
        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit()) {
@@ -804,9 +830,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
                return NF_DROP;
        }
 
-       /* FIXME: Do this right please. --RR */
-       (*pskb)->nfcache |= NFC_UNKNOWN;
-
 /* Doesn't cover locally-generated broadcast, so not worth it. */
 #if 0
        /* Ignore broadcast: no `connection'. */
@@ -822,46 +845,44 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
        }
 #endif
 
-       /* Previously seen (loopback or untracked)?  Ignore. */
-       if ((*pskb)->nfct)
-               return NF_ACCEPT;
+       proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
 
-       proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
-
-       /* It may be an icmp error... */
-       if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP 
-           && icmp_error_track(*pskb, &ctinfo, hooknum))
-               return NF_ACCEPT;
+       /* It may be a special packet, error, unclean...
+        * the inverse of the return code tells the netfilter
+        * core what to do with the packet. */
+       if (proto->error != NULL 
+           && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
+               CONNTRACK_STAT_INC(error);
+               CONNTRACK_STAT_INC(invalid);
+               return -ret;
+       }
 
-       if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
+       if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
                /* Not valid part of a connection */
+               CONNTRACK_STAT_INC(invalid);
                return NF_ACCEPT;
+       }
 
-       if (IS_ERR(ct))
+       if (IS_ERR(ct)) {
                /* Too stressed to deal. */
+               CONNTRACK_STAT_INC(drop);
                return NF_DROP;
+       }
 
        IP_NF_ASSERT((*pskb)->nfct);
 
        ret = proto->packet(ct, *pskb, ctinfo);
-       if (ret == -1) {
-               /* Invalid */
+       if (ret < 0) {
+               /* Invalid: the inverse of the return code tells
+                * the netfilter core what to do */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
-               return NF_ACCEPT;
+               CONNTRACK_STAT_INC(invalid);
+               return -ret;
        }
 
-       if (ret != NF_DROP && ct->helper) {
-               ret = ct->helper->help(*pskb, ct, ctinfo);
-               if (ret == -1) {
-                       /* Invalid */
-                       nf_conntrack_put((*pskb)->nfct);
-                       (*pskb)->nfct = NULL;
-                       return NF_ACCEPT;
-               }
-       }
-       if (set_reply)
-               set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
+       if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+               ip_conntrack_event_cache(IPCT_STATUS, *pskb);
 
        return ret;
 }
@@ -869,248 +890,160 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
 {
-       return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
-}
-
-static inline int resent_expect(const struct ip_conntrack_expect *i,
-                               const struct ip_conntrack_tuple *tuple,
-                               const struct ip_conntrack_tuple *mask)
-{
-       DEBUGP("resent_expect\n");
-       DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
-       DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
-       DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
-       return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
-                || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
-               && ip_ct_tuple_equal(&i->mask, mask));
+       return ip_ct_invert_tuple(inverse, orig, 
+                                 __ip_conntrack_proto_find(orig->dst.protonum));
 }
 
 /* Would two expected things clash? */
-static inline int expect_clash(const struct ip_conntrack_expect *i,
-                              const struct ip_conntrack_tuple *tuple,
-                              const struct ip_conntrack_tuple *mask)
+static inline int expect_clash(const struct ip_conntrack_expect *a,
+                              const struct ip_conntrack_expect *b)
 {
        /* Part covered by intersection of masks must be unequal,
            otherwise they clash */
        struct ip_conntrack_tuple intersect_mask
-               = { { i->mask.src.ip & mask->src.ip,
-                     { i->mask.src.u.all & mask->src.u.all } },
-                   { i->mask.dst.ip & mask->dst.ip,
-                     { i->mask.dst.u.all & mask->dst.u.all },
-                     i->mask.dst.protonum & mask->dst.protonum } };
+               = { { a->mask.src.ip & b->mask.src.ip,
+                     { a->mask.src.u.all & b->mask.src.u.all } },
+                   { a->mask.dst.ip & b->mask.dst.ip,
+                     { a->mask.dst.u.all & b->mask.dst.u.all },
+                     a->mask.dst.protonum & b->mask.dst.protonum } };
 
-       return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
+       return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
 }
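The clash test above declares two expectations in conflict when their tuples agree on every bit covered by both masks. A worked userspace example over just the source address field (illustrative values):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* expectation A matches 10.0.0.0/24, expectation B exactly 10.0.0.5 */
            uint32_t a_ip = 0x0a000000, a_mask = 0xffffff00;
            uint32_t b_ip = 0x0a000005, b_mask = 0xffffffff;

            uint32_t intersect = a_mask & b_mask;               /* 0xffffff00 */
            int clash = (a_ip & intersect) == (b_ip & intersect);
            printf("clash = %d\n", clash);  /* 1: both cover 10.0.0.5, so they clash */
            return 0;
    }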
 
-inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
+static inline int expect_matches(const struct ip_conntrack_expect *a,
+                                const struct ip_conntrack_expect *b)
 {
-       WRITE_LOCK(&ip_conntrack_lock);
-       unexpect_related(expect);
-       WRITE_UNLOCK(&ip_conntrack_lock);
+       return a->master == b->master
+               && ip_ct_tuple_equal(&a->tuple, &b->tuple)
+               && ip_ct_tuple_equal(&a->mask, &b->mask);
 }
-       
-static void expectation_timed_out(unsigned long ul_expect)
-{
-       struct ip_conntrack_expect *expect = (void *) ul_expect;
 
-       DEBUGP("expectation %p timed out\n", expect);   
-       WRITE_LOCK(&ip_conntrack_lock);
-       __unexpect_related(expect);
-       WRITE_UNLOCK(&ip_conntrack_lock);
+/* Generally a bad idea to call this: could have matched already. */
+void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
+{
+       struct ip_conntrack_expect *i;
+
+       write_lock_bh(&ip_conntrack_lock);
+       /* choose the oldest expectation to evict */
+       list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+               if (expect_matches(i, exp) && del_timer(&i->timeout)) {
+                       ip_ct_unlink_expect(i);
+                       write_unlock_bh(&ip_conntrack_lock);
+                       ip_conntrack_expect_put(i);
+                       return;
+               }
+       }
+       write_unlock_bh(&ip_conntrack_lock);
 }
 
-struct ip_conntrack_expect *
-ip_conntrack_expect_alloc(void)
+/* We don't increase the master conntrack refcount for non-fulfilled
+ * expectations. During conntrack destruction, the expectations are
+ * always killed before the conntrack itself */
+struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
 {
        struct ip_conntrack_expect *new;
-       
-       new = (struct ip_conntrack_expect *)
-               kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
+
+       new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }
-
-       /* tuple_cmp compares whole union, we have to initialized cleanly */
-       memset(new, 0, sizeof(struct ip_conntrack_expect));
-
+       new->master = me;
+       atomic_set(&new->use, 1);
        return new;
 }
 
-static void
-ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
-                          struct ip_conntrack *related_to)
+void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
 {
-       DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
-       new->expectant = related_to;
-       new->sibling = NULL;
-       atomic_set(&new->use, 1);
-
-       /* add to expected list for this connection */
-       list_add_tail(&new->expected_list, &related_to->sibling_list);
-       /* add to global list of expectations */
-       list_prepend(&ip_conntrack_expect_list, &new->list);
-       /* add and start timer if required */
-       if (related_to->helper->timeout) {
-               init_timer(&new->timeout);
-               new->timeout.data = (unsigned long)new;
-               new->timeout.function = expectation_timed_out;
-               new->timeout.expires = jiffies +
-                                       related_to->helper->timeout * HZ;
-               add_timer(&new->timeout);
-       }
-       related_to->expecting++;
+       if (atomic_dec_and_test(&exp->use))
+               kmem_cache_free(ip_conntrack_expect_cachep, exp);
 }
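Expectations are reference-counted: the global list, the pending timer, and any in-flight user each hold a reference, and ip_conntrack_expect_put() frees the object only when the use count reaches zero. A userspace sketch of the same pattern using C11 atomics (names illustrative):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct expect { atomic_int use; };

    static struct expect *expect_alloc(void)
    {
            struct expect *e = malloc(sizeof(*e));
            atomic_init(&e->use, 1);                /* caller's reference */
            return e;
    }

    static void expect_get(struct expect *e)
    {
            atomic_fetch_add(&e->use, 1);
    }

    static void expect_put(struct expect *e)
    {
            /* fetch_sub returns the old value: 1 means we were the last user */
            if (atomic_fetch_sub(&e->use, 1) == 1) {
                    printf("freeing %p\n", (void *)e);
                    free(e);
            }
    }

    int main(void)
    {
            struct expect *e = expect_alloc();
            expect_get(e);          /* e.g. the reference the list would hold */
            expect_put(e);          /* list drops its reference */
            expect_put(e);          /* last reference: freed here */
            return 0;
    }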
 
-/* Add a related connection. */
-int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
-                               struct ip_conntrack *related_to)
+static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
 {
-       struct ip_conntrack_expect *old;
-       int ret = 0;
-
-       WRITE_LOCK(&ip_conntrack_lock);
-       /* Because of the write lock, no reader can walk the lists,
-        * so there is no need to use the tuple lock too */
+       atomic_inc(&exp->use);
+       exp->master->expecting++;
+       list_add(&exp->list, &ip_conntrack_expect_list);
+
+       init_timer(&exp->timeout);
+       exp->timeout.data = (unsigned long)exp;
+       exp->timeout.function = expectation_timed_out;
+       exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
+       add_timer(&exp->timeout);
+
+       exp->id = ++ip_conntrack_expect_next_id;
+       atomic_inc(&exp->use);
+       CONNTRACK_STAT_INC(expect_create);
+}
 
-       DEBUGP("ip_conntrack_expect_related %p\n", related_to);
-       DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
-       DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
+/* Race with expectations being used means we could have none to find; OK. */
+static void evict_oldest_expect(struct ip_conntrack *master)
+{
+       struct ip_conntrack_expect *i;
 
-       old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
-                       struct ip_conntrack_expect *, &expect->tuple, 
-                       &expect->mask);
-       if (old) {
-               /* Helper private data may contain offsets but no pointers
-                  pointing into the payload - otherwise we should have to copy 
-                  the data filled out by the helper over the old one */
-               DEBUGP("expect_related: resent packet\n");
-               if (related_to->helper->timeout) {
-                       if (!del_timer(&old->timeout)) {
-                               /* expectation is dying. Fall through */
-                               goto out;
-                       } else {
-                               old->timeout.expires = jiffies + 
-                                       related_to->helper->timeout * HZ;
-                               add_timer(&old->timeout);
+       list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+               if (i->master == master) {
+                       if (del_timer(&i->timeout)) {
+                               ip_ct_unlink_expect(i);
+                               ip_conntrack_expect_put(i);
                        }
+                       break;
                }
-
-               WRITE_UNLOCK(&ip_conntrack_lock);
-               kfree(expect);
-               return -EEXIST;
-
-       } else if (related_to->helper->max_expected && 
-                  related_to->expecting >= related_to->helper->max_expected) {
-               /* old == NULL */
-               if (!(related_to->helper->flags & 
-                     IP_CT_HELPER_F_REUSE_EXPECT)) {
-                       WRITE_UNLOCK(&ip_conntrack_lock);
-                       if (net_ratelimit())
-                               printk(KERN_WARNING
-                                      "ip_conntrack: max number of expected "
-                                      "connections %i of %s reached for "
-                                      "%u.%u.%u.%u->%u.%u.%u.%u\n",
-                                      related_to->helper->max_expected,
-                                      related_to->helper->name,
-                                      NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
-                                      NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
-                       kfree(expect);
-                       return -EPERM;
-               }
-               DEBUGP("ip_conntrack: max number of expected "
-                      "connections %i of %s reached for "
-                      "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
-                      related_to->helper->max_expected,
-                      related_to->helper->name,
-                      NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
-                      NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
-               /* choose the the oldest expectation to evict */
-               list_for_each_entry(old, &related_to->sibling_list, 
-                                                     expected_list)
-                       if (old->sibling == NULL)
-                               break;
-
-               /* We cannot fail since related_to->expecting is the number
-                * of unconfirmed expectations */
-               IP_NF_ASSERT(old && old->sibling == NULL);
-
-               /* newnat14 does not reuse the real allocated memory
-                * structures but rather unexpects the old and
-                * allocates a new.  unexpect_related will decrement
-                * related_to->expecting. 
-                */
-               unexpect_related(old);
-               ret = -EPERM;
-       } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
-                            struct ip_conntrack_expect *, &expect->tuple, 
-                            &expect->mask)) {
-               WRITE_UNLOCK(&ip_conntrack_lock);
-               DEBUGP("expect_related: busy!\n");
-
-               kfree(expect);
-               return -EBUSY;
        }
+}
 
-out:   ip_conntrack_expect_insert(expect, related_to);
-
-       WRITE_UNLOCK(&ip_conntrack_lock);
+static inline int refresh_timer(struct ip_conntrack_expect *i)
+{
+       if (!del_timer(&i->timeout))
+               return 0;
 
-       return ret;
+       i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
+       add_timer(&i->timeout);
+       return 1;
 }
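
The del_timer() return value carries the synchronization here: only the
caller that actually removes the pending timer may re-arm it, and a zero
return means the timeout handler already fired (or is firing) and owns the
teardown. A minimal sketch of the same idiom, with hypothetical names:

	static int rearm_if_alive(struct timer_list *t, unsigned long delta)
	{
		if (!del_timer(t))
			return 0;		/* dying: handler owns it */
		t->expires = jiffies + delta;	/* safe: timer is ours now */
		add_timer(t);
		return 1;
	}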
 
-/* Change tuple in an existing expectation */
-int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
-                              struct ip_conntrack_tuple *newtuple)
+int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
 {
+       struct ip_conntrack_expect *i;
        int ret;
 
-       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-       WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
-
-       DEBUGP("change_expect:\n");
-       DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
-       DEBUGP("exp mask:  "); DUMP_TUPLE(&expect->mask);
-       DEBUGP("newtuple:  "); DUMP_TUPLE(newtuple);
-       if (expect->ct_tuple.dst.protonum == 0) {
-               /* Never seen before */
-               DEBUGP("change expect: never seen before\n");
-               if (!ip_ct_tuple_equal(&expect->tuple, newtuple) 
-                   && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
-                                struct ip_conntrack_expect *, newtuple, &expect->mask)) {
-                       /* Force NAT to find an unused tuple */
-                       ret = -1;
-               } else {
-                       memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
-                       memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
-                       ret = 0;
-               }
-       } else {
-               /* Resent packet */
-               DEBUGP("change expect: resent packet\n");
-               if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
-                       ret = 0;
-               } else {
-                       /* Force NAT to choose again the same port */
-                       ret = -1;
+       DEBUGP("ip_conntrack_expect_related %p\n", expect);
+       DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
+       DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
+
+       write_lock_bh(&ip_conntrack_lock);
+       list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+               if (expect_matches(i, expect)) {
+                       /* Refresh timer: if it's dying, ignore. */
+                       if (refresh_timer(i)) {
+                               ret = 0;
+                               goto out;
+                       }
+               } else if (expect_clash(i, expect)) {
+                       ret = -EBUSY;
+                       goto out;
                }
        }
-       WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
-       
-       return ret;
+
+       /* Will be over limit? */
+       if (expect->master->helper->max_expected && 
+           expect->master->expecting >= expect->master->helper->max_expected)
+               evict_oldest_expect(expect->master);
+
+       ip_conntrack_expect_insert(expect);
+       ip_conntrack_expect_event(IPEXP_NEW, expect);
+       ret = 0;
+out:
+       write_unlock_bh(&ip_conntrack_lock);
+       return ret;
 }
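
For reference, a hypothetical helper would drive ip_conntrack_expect_related()
roughly like this (a sketch only; demo_expect() is invented, and the field
conventions follow the in-tree FTP helper):

	static int demo_expect(struct ip_conntrack *ct, u_int16_t port)
	{
		struct ip_conntrack_expect *exp;
		int ret;

		exp = ip_conntrack_expect_alloc(ct);
		if (exp == NULL)
			return -ENOMEM;

		/* same pair of hosts as the master flow, fixed dst port,
		 * any source port */
		exp->tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
		exp->tuple.src.u.tcp.port = 0;
		exp->tuple.dst.u.tcp.port = htons(port);
		memset(&exp->mask, 0, sizeof(exp->mask));
		exp->mask.src.ip = 0xffffffff;
		exp->mask.dst.ip = 0xffffffff;
		exp->mask.dst.protonum = 0xff;
		exp->mask.dst.u.tcp.port = 0xffff;
		exp->expectfn = NULL;
		exp->flags = 0;

		ret = ip_conntrack_expect_related(exp);
		ip_conntrack_expect_put(exp);	/* drop our reference */
		return ret;
	}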
 
-/* Alter reply tuple (maybe alter helper).  If it's already taken,
-   return 0 and don't do alteration. */
-int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
-                            const struct ip_conntrack_tuple *newreply)
+/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
+   implicitly racy: see __ip_conntrack_confirm */
+void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
+                             const struct ip_conntrack_tuple *newreply)
 {
-       WRITE_LOCK(&ip_conntrack_lock);
-       if (__ip_conntrack_find(newreply, conntrack)) {
-               WRITE_UNLOCK(&ip_conntrack_lock);
-               return 0;
-       }
+       write_lock_bh(&ip_conntrack_lock);
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));
 
@@ -1118,30 +1051,40 @@ int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
        DUMP_TUPLE(newreply);
 
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
-       if (!conntrack->master && list_empty(&conntrack->sibling_list))
-               conntrack->helper = ip_ct_find_helper(newreply);
-       WRITE_UNLOCK(&ip_conntrack_lock);
-
-       return 1;
+       if (!conntrack->master && conntrack->expecting == 0)
+               conntrack->helper = __ip_conntrack_helper_find(newreply);
+       write_unlock_bh(&ip_conntrack_lock);
 }
 
 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
 {
-       WRITE_LOCK(&ip_conntrack_lock);
+       BUG_ON(me->timeout == 0);
+       write_lock_bh(&ip_conntrack_lock);
        list_prepend(&helpers, me);
-       WRITE_UNLOCK(&ip_conntrack_lock);
+       write_unlock_bh(&ip_conntrack_lock);
 
        return 0;
 }
 
+struct ip_conntrack_helper *
+__ip_conntrack_helper_find_byname(const char *name)
+{
+       struct ip_conntrack_helper *h;
+
+       list_for_each_entry(h, &helpers, list) {
+               if (!strcmp(h->name, name))
+                       return h;
+       }
+
+       return NULL;
+}
+
 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
                         const struct ip_conntrack_helper *me)
 {
-       if (i->ctrack->helper == me) {
-               /* Get rid of any expected. */
-               remove_expectations(i->ctrack, 0);
-               /* And *then* set helper to NULL */
-               i->ctrack->helper = NULL;
+       if (tuplehash_to_ctrack(i)->helper == me) {
+               ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
+               tuplehash_to_ctrack(i)->helper = NULL;
        }
        return 0;
 }
@@ -1149,138 +1092,189 @@ static inline int unhelp(struct ip_conntrack_tuple_hash *i,
 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
 {
        unsigned int i;
+       struct ip_conntrack_expect *exp, *tmp;
 
        /* Need write lock here, to delete helper. */
-       WRITE_LOCK(&ip_conntrack_lock);
+       write_lock_bh(&ip_conntrack_lock);
        LIST_DELETE(&helpers, me);
 
+       /* Get rid of expectations */
+       list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
+               if (exp->master->helper == me && del_timer(&exp->timeout)) {
+                       ip_ct_unlink_expect(exp);
+                       ip_conntrack_expect_put(exp);
+               }
+       }
        /* Get rid of expecteds, set helpers to NULL. */
+       LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
        for (i = 0; i < ip_conntrack_htable_size; i++)
                LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
                            struct ip_conntrack_tuple_hash *, me);
-       WRITE_UNLOCK(&ip_conntrack_lock);
+       write_unlock_bh(&ip_conntrack_lock);
 
        /* Someone could still be looking at the helper in a bh. */
        synchronize_net();
 }
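
A minimal helper module built on this register/unregister pair might look as
follows (hypothetical sketch; per the BUG_ON() above, .timeout must be
non-zero, and the tuple/mask conventions mirror the in-tree helpers):

	static int demo_help(struct sk_buff **pskb, struct ip_conntrack *ct,
			     enum ip_conntrack_info ctinfo)
	{
		/* parse payload, possibly ip_conntrack_expect_related() */
		return NF_ACCEPT;
	}

	static struct ip_conntrack_helper demo = {
		.name		= "demo",
		.me		= THIS_MODULE,
		.max_expected	= 1,
		.timeout	= 5 * 60,	/* seconds */
		.tuple		= { .src = { .u = { .tcp =
					{ .port = __constant_htons(4242) } } },
				    .dst = { .protonum = IPPROTO_TCP } },
		.mask		= { .src = { .u = { .tcp = { .port = 0xffff } } },
				    .dst = { .protonum = 0xff } },
		.help		= demo_help,
	};

	static int __init demo_init(void)
	{
		return ip_conntrack_helper_register(&demo);
	}

	static void __exit demo_exit(void)
	{
		ip_conntrack_helper_unregister(&demo);
	}
	module_init(demo_init);
	module_exit(demo_exit);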
 
-/* Refresh conntrack for this many jiffies. */
-void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
+/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
+void __ip_ct_refresh_acct(struct ip_conntrack *ct, 
+                       enum ip_conntrack_info ctinfo,
+                       const struct sk_buff *skb,
+                       unsigned long extra_jiffies,
+                       int do_acct)
 {
+       int event = 0;
+
        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
+       IP_NF_ASSERT(skb);
+
+       write_lock_bh(&ip_conntrack_lock);
 
        /* If not in hash table, timer will not be active yet */
-       if (!is_confirmed(ct))
+       if (!is_confirmed(ct)) {
                ct->timeout.expires = extra_jiffies;
-       else {
-               WRITE_LOCK(&ip_conntrack_lock);
+               event = IPCT_REFRESH;
+       } else {
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
+                       event = IPCT_REFRESH;
                }
-               WRITE_UNLOCK(&ip_conntrack_lock);
        }
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+       if (do_acct) {
+               ct->counters[CTINFO2DIR(ctinfo)].packets++;
+               ct->counters[CTINFO2DIR(ctinfo)].bytes += 
+                                               ntohs(skb->nh.iph->tot_len);
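+               /* counters are 32 bits wide: flag IPCT_COUNTER_FILLING
+                * once a top bit is set, so listeners can harvest and
+                * reset the counters before they wrap */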
+               if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
+                   || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
+                       event |= IPCT_COUNTER_FILLING;
+       }
+#endif
+
+       write_unlock_bh(&ip_conntrack_lock);
+
+       /* must be unlocked when calling event cache */
+       if (event)
+               ip_conntrack_event_cache(event, skb);
 }
 
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+/* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
+ * in ip_conntrack_core, since we don't want the protocols to autoload
+ * or depend on ctnetlink */
+int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
+                              const struct ip_conntrack_tuple *tuple)
+{
+       NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
+               &tuple->src.u.tcp.port);
+       NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
+               &tuple->dst.u.tcp.port);
+       return 0;
+
+nfattr_failure:
+       return -1;
+}
+
+int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
+                              struct ip_conntrack_tuple *t)
+{
+       if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
+               return -EINVAL;
+
+       t->src.u.tcp.port =
+               *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
+       t->dst.u.tcp.port =
+               *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
+
+       return 0;
+}
+#endif
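
(The .tcp spelling above is incidental: src.u and dst.u are unions whose
16-bit port members share storage, which is what lets one pair of
conversion routines serve TCP, UDP, SCTP and the like.)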
+
 /* Returns new sk_buff, or NULL */
 struct sk_buff *
-ip_ct_gather_frags(struct sk_buff *skb)
+ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
 {
-       struct sock *sk = skb->sk;
-#ifdef CONFIG_NETFILTER_DEBUG
-       unsigned int olddebug = skb->nf_debug;
-#endif
-       if (sk) {
-               sock_hold(sk);
-               skb_orphan(skb);
-       }
+       skb_orphan(skb);
 
        local_bh_disable(); 
-       skb = ip_defrag(skb);
+       skb = ip_defrag(skb, user);
        local_bh_enable();
 
-       if (!skb) {
-               if (sk)
-                       sock_put(sk);
-               return skb;
-       }
-
-       if (sk) {
-               skb_set_owner_w(skb, sk);
-               sock_put(sk);
-       }
-
-       ip_send_check(skb->nh.iph);
-       skb->nfcache |= NFC_ALTERED;
-#ifdef CONFIG_NETFILTER_DEBUG
-       /* Packet path as if nothing had happened. */
-       skb->nf_debug = olddebug;
-#endif
+       if (skb)
+               ip_send_check(skb->nh.iph);
        return skb;
 }
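
The new `user' argument keys the fragment queues per defrag user, so
conntrack reassembly at PRE_ROUTING and LOCAL_OUT cannot collide. A sketch
of the expected caller (modelled on ip_conntrack_defrag() in the standalone
module; treat the exact hook test as an assumption):

	if (skb->nh.iph->frag_off & htons(IP_MF | IP_OFFSET)) {
		skb = ip_ct_gather_frags(skb,
					 hooknum == NF_IP_PRE_ROUTING ?
					 IP_DEFRAG_CONNTRACK_IN :
					 IP_DEFRAG_CONNTRACK_OUT);
		if (!skb)
			return NF_STOLEN;	/* held for reassembly */
	}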
 
 /* Used by ipt_REJECT. */
-static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
+static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
 {
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
 
-       ct = __ip_conntrack_get(nfct, &ctinfo);
-
-       /* This ICMP is in reverse direction to the packet which
-           caused it */
+       /* This ICMP is in reverse direction to the packet which caused it */
+       ct = ip_conntrack_get(skb, &ctinfo);
+       
        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;
 
-       /* Attach new skbuff, and increment count */
-       nskb->nfct = &ct->infos[ctinfo];
-       atomic_inc(&ct->ct_general.use);
+       /* Attach to new skbuff, and increment count */
+       nskb->nfct = &ct->ct_general;
+       nskb->nfctinfo = ctinfo;
+       nf_conntrack_get(nskb->nfct);
 }
 
 static inline int
-do_kill(const struct ip_conntrack_tuple_hash *i,
-       int (*kill)(const struct ip_conntrack *i, void *data),
+do_iter(const struct ip_conntrack_tuple_hash *i,
+       int (*iter)(struct ip_conntrack *i, void *data),
        void *data)
 {
-       return kill(i->ctrack, data);
+       return iter(tuplehash_to_ctrack(i), data);
 }
 
 /* Bring out ya dead! */
 static struct ip_conntrack_tuple_hash *
-get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
+get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
                void *data, unsigned int *bucket)
 {
        struct ip_conntrack_tuple_hash *h = NULL;
 
-       READ_LOCK(&ip_conntrack_lock);
-       for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
-               h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
-                             struct ip_conntrack_tuple_hash *, kill, data);
+       write_lock_bh(&ip_conntrack_lock);
+       for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
+               h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
+                               struct ip_conntrack_tuple_hash *, iter, data);
+               if (h)
+                       break;
        }
+       if (!h)
+               h = LIST_FIND_W(&unconfirmed, do_iter,
+                               struct ip_conntrack_tuple_hash *, iter, data);
        if (h)
-               atomic_inc(&h->ctrack->ct_general.use);
-       READ_UNLOCK(&ip_conntrack_lock);
+               atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
+       write_unlock_bh(&ip_conntrack_lock);
 
        return h;
 }
 
 void
-ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
-                       void *data)
+ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
 {
        struct ip_conntrack_tuple_hash *h;
        unsigned int bucket = 0;
 
-       while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
+       while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
+               struct ip_conntrack *ct = tuplehash_to_ctrack(h);
                /* Time to push up daisies... */
-               if (del_timer(&h->ctrack->timeout))
-                       death_by_timeout((unsigned long)h->ctrack);
+               if (del_timer(&ct->timeout))
+                       death_by_timeout((unsigned long)ct);
                /* ... else the timer will get him soon. */
 
-               ip_conntrack_put(h->ctrack);
+               ip_conntrack_put(ct);
        }
 }
 
@@ -1291,7 +1285,7 @@ ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
 static int
 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 {
-       struct inet_opt *inet = inet_sk(sk);
+       struct inet_sock *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;
        
@@ -1317,16 +1311,18 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;
+               struct ip_conntrack *ct = tuplehash_to_ctrack(h);
 
                sin.sin_family = AF_INET;
-               sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+               sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
-               sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+               sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;
+               memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
 
                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
-               ip_conntrack_put(h->ctrack);
+               ip_conntrack_put(ct);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
@@ -1345,35 +1341,126 @@ static struct nf_sockopt_ops so_getorigdst = {
        .get            = &getorigdst,
 };
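
From userspace this surfaces as a getsockopt() on a connected socket; a
minimal (hypothetical) transparent-proxy lookup:

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/netfilter_ipv4.h>	/* SO_ORIGINAL_DST */

	/* fd: a TCP socket whose connection NAT redirected to us */
	static int original_dst(int fd, struct sockaddr_in *sin)
	{
		socklen_t len = sizeof(*sin);

		return getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, sin, &len);
	}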
 
-static int kill_all(const struct ip_conntrack *i, void *data)
+static int kill_all(struct ip_conntrack *i, void *data)
 {
        return 1;
 }
 
+void ip_conntrack_flush(void)
+{
+       ip_ct_iterate_cleanup(kill_all, NULL);
+}
+
+static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
+{
+       if (vmalloced)
+               vfree(hash);
+       else
+               free_pages((unsigned long)hash, 
+                          get_order(sizeof(struct list_head) * size));
+}
+
 /* Mishearing the voices in his head, our hero wonders how he's
    supposed to kill the mall. */
 void ip_conntrack_cleanup(void)
 {
        ip_ct_attach = NULL;
+
        /* This makes sure all current packets have passed through
            netfilter framework.  Roll on, two-stage module
            delete... */
        synchronize_net();
+
+       ip_ct_event_cache_flush();
  i_see_dead_people:
-       ip_ct_selective_cleanup(kill_all, NULL);
+       ip_conntrack_flush();
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }
+       /* wait until all references to ip_conntrack_untracked are dropped */
+       while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
+               schedule();
 
        kmem_cache_destroy(ip_conntrack_cachep);
-       vfree(ip_conntrack_hash);
+       kmem_cache_destroy(ip_conntrack_expect_cachep);
+       free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+                           ip_conntrack_htable_size);
        nf_unregister_sockopt(&so_getorigdst);
 }
 
-static int hashsize;
-MODULE_PARM(hashsize, "i");
+static struct list_head *alloc_hashtable(int size, int *vmalloced)
+{
+       struct list_head *hash;
+       unsigned int i;
+
+       *vmalloced = 0; 
+       hash = (void*)__get_free_pages(GFP_KERNEL, 
+                                      get_order(sizeof(struct list_head)
+                                                * size));
+       if (!hash) { 
+               *vmalloced = 1;
+               printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
+               hash = vmalloc(sizeof(struct list_head) * size);
+       }
+
+       if (hash)
+               for (i = 0; i < size; i++)
+                       INIT_LIST_HEAD(&hash[i]);
+
+       return hash;
+}
+
+static int set_hashsize(const char *val, struct kernel_param *kp)
+{
+       int i, bucket, hashsize, vmalloced;
+       int old_vmalloced, old_size;
+       int rnd;
+       struct list_head *hash, *old_hash;
+       struct ip_conntrack_tuple_hash *h;
+
+       /* On boot, we can set this without any fancy locking. */
+       if (!ip_conntrack_htable_size)
+               return param_set_int(val, kp);
+
+       hashsize = simple_strtol(val, NULL, 0);
+       if (!hashsize)
+               return -EINVAL;
+
+       hash = alloc_hashtable(hashsize, &vmalloced);
+       if (!hash)
+               return -ENOMEM;
+
+       /* We have to rehash for the new table anyway, so we can also
+        * use a new random seed */
+       get_random_bytes(&rnd, 4);
+
+       write_lock_bh(&ip_conntrack_lock);
+       for (i = 0; i < ip_conntrack_htable_size; i++) {
+               while (!list_empty(&ip_conntrack_hash[i])) {
+                       h = list_entry(ip_conntrack_hash[i].next,
+                                      struct ip_conntrack_tuple_hash, list);
+                       list_del(&h->list);
+                       bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
+                       list_add_tail(&h->list, &hash[bucket]);
+               }
+       }
+       old_size = ip_conntrack_htable_size;
+       old_vmalloced = ip_conntrack_vmalloc;
+       old_hash = ip_conntrack_hash;
+
+       ip_conntrack_htable_size = hashsize;
+       ip_conntrack_vmalloc = vmalloced;
+       ip_conntrack_hash = hash;
+       ip_conntrack_hash_rnd = rnd;
+       write_unlock_bh(&ip_conntrack_lock);
+
+       free_conntrack_hash(old_hash, old_vmalloced, old_size);
+       return 0;
+}
+
+module_param_call(hashsize, set_hashsize, param_get_uint,
+                 &ip_conntrack_htable_size, 0600);
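
With mode 0600, module_param_call() exposes this as
/sys/module/ip_conntrack/parameters/hashsize: a write invokes set_hashsize()
to allocate and rehash into the new table under write_lock_bh() before
freeing the old one, while a read reports the live ip_conntrack_htable_size
via param_get_uint.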
 
 int __init ip_conntrack_init(void)
 {
@@ -1382,9 +1469,7 @@ int __init ip_conntrack_init(void)
 
        /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
         * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
-       if (hashsize) {
-               ip_conntrack_htable_size = hashsize;
-       } else {
+       if (!ip_conntrack_htable_size) {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
@@ -1406,8 +1491,8 @@ int __init ip_conntrack_init(void)
                return ret;
        }
 
-       ip_conntrack_hash = vmalloc(sizeof(struct list_head)
-                                   * ip_conntrack_htable_size);
+       ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
+                                           &ip_conntrack_vmalloc);
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
@@ -1415,21 +1500,29 @@ int __init ip_conntrack_init(void)
 
        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
-                                               SLAB_HWCACHE_ALIGN, NULL, NULL);
+                                               0, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }
+
+       ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
+                                       sizeof(struct ip_conntrack_expect),
+                                       0, 0, NULL, NULL);
+       if (!ip_conntrack_expect_cachep) {
+               printk(KERN_ERR "Unable to create ip_expect slab cache\n");
+               goto err_free_conntrack_slab;
+       }
+
        /* Don't NEED lock here, but good form anyway. */
-       WRITE_LOCK(&ip_conntrack_lock);
+       write_lock_bh(&ip_conntrack_lock);
+       for (i = 0; i < MAX_IP_CT_PROTO; i++)
+               ip_ct_protos[i] = &ip_conntrack_generic_protocol;
        /* Sew in builtin protocols. */
-       list_append(&protocol_list, &ip_conntrack_protocol_tcp);
-       list_append(&protocol_list, &ip_conntrack_protocol_udp);
-       list_append(&protocol_list, &ip_conntrack_protocol_icmp);
-       WRITE_UNLOCK(&ip_conntrack_lock);
-
-       for (i = 0; i < ip_conntrack_htable_size; i++)
-               INIT_LIST_HEAD(&ip_conntrack_hash[i]);
+       ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
+       ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
+       ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
+       write_unlock_bh(&ip_conntrack_lock);
 
        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;
@@ -1439,16 +1532,14 @@ int __init ip_conntrack_init(void)
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and look it like as a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
-       /*  - and prepare the ctinfo field for REJECT & NAT. */
-       ip_conntrack_untracked.infos[IP_CT_NEW].master =
-       ip_conntrack_untracked.infos[IP_CT_RELATED].master =
-       ip_conntrack_untracked.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master = 
-                       &ip_conntrack_untracked.ct_general;
 
        return ret;
 
+err_free_conntrack_slab:
+       kmem_cache_destroy(ip_conntrack_cachep);
 err_free_hash:
-       vfree(ip_conntrack_hash);
+       free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+                           ip_conntrack_htable_size);
 err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);