X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fipv4%2Fnetfilter%2Fip_conntrack_core.c;h=a297da7bbef5dc735d1c68d57fe35b1db901d820;hb=43bc926fffd92024b46cafaf7350d669ba9ca884;hp=00a89f4f8d8b5580a838fb9b3a4b38f00edef8d7;hpb=9bf4aaab3e101692164d49b7ca357651eb691cb6;p=linux-2.6.git diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 00a89f4f8..a297da7bb 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -34,13 +34,15 @@ #include #include #include -/* For ERR_PTR(). Yeah, I know... --RR */ -#include +#include +#include +#include +#include -/* This rwlock protects the main hash table, protocol/helper/expected +/* ip_conntrack_lock protects the main hash table, protocol/helper/expected registrations, conntrack timers*/ -#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) -#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) #include #include @@ -48,7 +50,7 @@ #include #include -#define IP_CONNTRACK_VERSION "2.1" +#define IP_CONNTRACK_VERSION "2.4" #if 0 #define DEBUGP printk @@ -56,82 +58,118 @@ #define DEBUGP(format, args...) #endif -DECLARE_RWLOCK(ip_conntrack_lock); -DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock); +DEFINE_RWLOCK(ip_conntrack_lock); + +/* ip_conntrack_standalone needs this */ +atomic_t ip_conntrack_count = ATOMIC_INIT(0); void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; LIST_HEAD(ip_conntrack_expect_list); -LIST_HEAD(protocol_list); +struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; static LIST_HEAD(helpers); unsigned int ip_conntrack_htable_size = 0; int ip_conntrack_max; -static atomic_t ip_conntrack_count = ATOMIC_INIT(0); struct list_head *ip_conntrack_hash; -static kmem_cache_t *ip_conntrack_cachep; +static kmem_cache_t *ip_conntrack_cachep __read_mostly; +static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly; struct ip_conntrack ip_conntrack_untracked; - -extern struct ip_conntrack_protocol ip_conntrack_generic_protocol; - -static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr, - u_int8_t protocol) +unsigned int ip_ct_log_invalid; +static LIST_HEAD(unconfirmed); +static int ip_conntrack_vmalloc; + +static unsigned int ip_conntrack_next_id; +static unsigned int ip_conntrack_expect_next_id; +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain); +ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain); + +DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); + +/* deliver cached events and clear cache entry - must be called with locally + * disabled softirqs */ +static inline void +__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) { - return protocol == curr->proto; + DEBUGP("ecache: delivering events for %p\n", ecache->ct); + if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events) + atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events, + ecache->ct); + ecache->events = 0; + ip_conntrack_put(ecache->ct); + ecache->ct = NULL; } -struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol) +/* Deliver all cached events for a particular conntrack. 
This is called + * by code prior to async packet handling or freeing the skb */ +void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) { - struct ip_conntrack_protocol *p; - - MUST_BE_READ_LOCKED(&ip_conntrack_lock); - p = LIST_FIND(&protocol_list, proto_cmpfn, - struct ip_conntrack_protocol *, protocol); - if (!p) - p = &ip_conntrack_generic_protocol; - - return p; + struct ip_conntrack_ecache *ecache; + + local_bh_disable(); + ecache = &__get_cpu_var(ip_conntrack_ecache); + if (ecache->ct == ct) + __ip_ct_deliver_cached_events(ecache); + local_bh_enable(); } -struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol) +void __ip_ct_event_cache_init(struct ip_conntrack *ct) { - struct ip_conntrack_protocol *p; - - READ_LOCK(&ip_conntrack_lock); - p = __ip_ct_find_proto(protocol); - READ_UNLOCK(&ip_conntrack_lock); - return p; + struct ip_conntrack_ecache *ecache; + + /* take care of delivering potentially old events */ + ecache = &__get_cpu_var(ip_conntrack_ecache); + BUG_ON(ecache->ct == ct); + if (ecache->ct) + __ip_ct_deliver_cached_events(ecache); + /* initialize for this conntrack/packet */ + ecache->ct = ct; + nf_conntrack_get(&ct->ct_general); } -inline void -ip_conntrack_put(struct ip_conntrack *ct) +/* flush the event cache - touches other CPU's data and must not be called while + * packets are still passing through the code */ +static void ip_ct_event_cache_flush(void) { - IP_NF_ASSERT(ct); - IP_NF_ASSERT(ct->infos[0].master); - /* nf_conntrack_put wants to go via an info struct, so feed it - one at random. */ - nf_conntrack_put(&ct->infos[0]); + struct ip_conntrack_ecache *ecache; + int cpu; + + for_each_possible_cpu(cpu) { + ecache = &per_cpu(ip_conntrack_ecache, cpu); + if (ecache->ct) + ip_conntrack_put(ecache->ct); + } } +#else +static inline void ip_ct_event_cache_flush(void) {} +#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + +DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); static int ip_conntrack_hash_rnd_initted; static unsigned int ip_conntrack_hash_rnd; -static u_int32_t -hash_conntrack(const struct ip_conntrack_tuple *tuple) +static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple, + unsigned int size, unsigned int rnd) { -#if 0 - dump_tuple(tuple); -#endif return (jhash_3words(tuple->src.ip, (tuple->dst.ip ^ tuple->dst.protonum), (tuple->src.u.all | (tuple->dst.u.all << 16)), - ip_conntrack_hash_rnd) % ip_conntrack_htable_size); + rnd) % size); +} + +static u_int32_t +hash_conntrack(const struct ip_conntrack_tuple *tuple) +{ + return __hash_conntrack(tuple, ip_conntrack_htable_size, + ip_conntrack_hash_rnd); } int -get_tuple(const struct iphdr *iph, - const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_protocol *protocol) +ip_ct_get_tuple(const struct iphdr *iph, + const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_protocol *protocol) { /* Never happen */ if (iph->frag_off & htons(IP_OFFSET)) { @@ -143,146 +181,114 @@ get_tuple(const struct iphdr *iph, tuple->src.ip = iph->saddr; tuple->dst.ip = iph->daddr; tuple->dst.protonum = iph->protocol; + tuple->dst.dir = IP_CT_DIR_ORIGINAL; return protocol->pkt_to_tuple(skb, dataoff, tuple); } -static int -invert_tuple(struct ip_conntrack_tuple *inverse, - const struct ip_conntrack_tuple *orig, - const struct ip_conntrack_protocol *protocol) +int +ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse, + const struct ip_conntrack_tuple *orig, + const struct 
ip_conntrack_protocol *protocol) { inverse->src.ip = orig->dst.ip; inverse->dst.ip = orig->src.ip; inverse->dst.protonum = orig->dst.protonum; + inverse->dst.dir = !orig->dst.dir; return protocol->invert_tuple(inverse, orig); } /* ip_conntrack_expect helper functions */ - -/* Compare tuple parts depending on mask. */ -static inline int expect_cmp(const struct ip_conntrack_expect *i, - const struct ip_conntrack_tuple *tuple) +void ip_ct_unlink_expect(struct ip_conntrack_expect *exp) { - MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock); - return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask); -} - -static void -destroy_expect(struct ip_conntrack_expect *exp) -{ - DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use)); - IP_NF_ASSERT(atomic_read(&exp->use) == 0); + ASSERT_WRITE_LOCK(&ip_conntrack_lock); IP_NF_ASSERT(!timer_pending(&exp->timeout)); - - kfree(exp); + list_del(&exp->list); + CONNTRACK_STAT_INC(expect_delete); + exp->master->expecting--; + ip_conntrack_expect_put(exp); } -inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp) +static void expectation_timed_out(unsigned long ul_expect) { - IP_NF_ASSERT(exp); + struct ip_conntrack_expect *exp = (void *)ul_expect; - if (atomic_dec_and_test(&exp->use)) { - /* usage count dropped to zero */ - destroy_expect(exp); - } -} - -static inline struct ip_conntrack_expect * -__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple) -{ - MUST_BE_READ_LOCKED(&ip_conntrack_lock); - MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock); - return LIST_FIND(&ip_conntrack_expect_list, expect_cmp, - struct ip_conntrack_expect *, tuple); + write_lock_bh(&ip_conntrack_lock); + ip_ct_unlink_expect(exp); + write_unlock_bh(&ip_conntrack_lock); + ip_conntrack_expect_put(exp); } -/* Find a expectation corresponding to a tuple. */ struct ip_conntrack_expect * -ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple) +__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple) { - struct ip_conntrack_expect *exp; - - READ_LOCK(&ip_conntrack_lock); - READ_LOCK(&ip_conntrack_expect_tuple_lock); - exp = __ip_ct_expect_find(tuple); - if (exp) - atomic_inc(&exp->use); - READ_UNLOCK(&ip_conntrack_expect_tuple_lock); - READ_UNLOCK(&ip_conntrack_lock); - - return exp; + struct ip_conntrack_expect *i; + + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { + atomic_inc(&i->use); + return i; + } + } + return NULL; } -/* remove one specific expectation from all lists and drop refcount, - * does _NOT_ delete the timer. */ -static void __unexpect_related(struct ip_conntrack_expect *expect) +/* Just find a expectation corresponding to a tuple. */ +struct ip_conntrack_expect * +ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple) { - DEBUGP("unexpect_related(%p)\n", expect); - MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); - - /* we're not allowed to unexpect a confirmed expectation! */ - IP_NF_ASSERT(!expect->sibling); - - /* delete from global and local lists */ - list_del(&expect->list); - list_del(&expect->expected_list); - - /* decrement expect-count of master conntrack */ - if (expect->expectant) - expect->expectant->expecting--; + struct ip_conntrack_expect *i; + + read_lock_bh(&ip_conntrack_lock); + i = __ip_conntrack_expect_find(tuple); + read_unlock_bh(&ip_conntrack_lock); - ip_conntrack_expect_put(expect); + return i; } -/* remove one specific expecatation from all lists, drop refcount - * and expire timer. 
- * This function can _NOT_ be called for confirmed expects! */ -static void unexpect_related(struct ip_conntrack_expect *expect) +/* If an expectation for this connection is found, it gets delete from + * global list then returned. */ +static struct ip_conntrack_expect * +find_expectation(const struct ip_conntrack_tuple *tuple) { - IP_NF_ASSERT(expect->expectant); - IP_NF_ASSERT(expect->expectant->helper); - /* if we are supposed to have a timer, but we can't delete - * it: race condition. __unexpect_related will - * be calledd by timeout function */ - if (expect->expectant->helper->timeout - && !del_timer(&expect->timeout)) - return; - - __unexpect_related(expect); + struct ip_conntrack_expect *i; + + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + /* If master is not in hash table yet (ie. packet hasn't left + this machine yet), how can other end know about expected? + Hence these are not the droids you are looking for (if + master ct never got confirmed, we'd hold a reference to it + and weird things would happen to future packets). */ + if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) + && is_confirmed(i->master)) { + if (i->flags & IP_CT_EXPECT_PERMANENT) { + atomic_inc(&i->use); + return i; + } else if (del_timer(&i->timeout)) { + ip_ct_unlink_expect(i); + return i; + } + } + } + return NULL; } -/* delete all unconfirmed expectations for this conntrack */ -static void remove_expectations(struct ip_conntrack *ct, int drop_refcount) +/* delete all expectations for this conntrack */ +void ip_ct_remove_expectations(struct ip_conntrack *ct) { - struct list_head *exp_entry, *next; - struct ip_conntrack_expect *exp; + struct ip_conntrack_expect *i, *tmp; - DEBUGP("remove_expectations(%p)\n", ct); - - list_for_each_safe(exp_entry, next, &ct->sibling_list) { - exp = list_entry(exp_entry, struct ip_conntrack_expect, - expected_list); + /* Optimization: most connection never expect any others. 
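 *
 * The matching step in find_expectation() above is worth seeing in
 * isolation.  Below is a minimal, self-contained userspace model of
 * ip_ct_tuple_mask_cmp(): a packet tuple matches an expectation when
 * the two agree on every bit the expectation's mask selects (an FTP
 * data expectation, for instance, masks out the source port).  All
 * toy_* names are illustrative, not part of this file.
 */

#include <stdint.h>

struct toy_tuple { uint32_t src_ip, dst_ip; uint16_t dst_port; };

static int toy_tuple_mask_cmp(const struct toy_tuple *t,
			      const struct toy_tuple *expect,
			      const struct toy_tuple *mask)
{
	return !((t->src_ip   ^ expect->src_ip)   & mask->src_ip)
	    && !((t->dst_ip   ^ expect->dst_ip)   & mask->dst_ip)
	    && !((t->dst_port ^ expect->dst_port) & mask->dst_port);
}

/*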
*/ + if (ct->expecting == 0) + return; - /* we skip established expectations, as we want to delete - * the un-established ones only */ - if (exp->sibling) { - DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct); - if (drop_refcount) { - /* Indicate that this expectations parent is dead */ - ip_conntrack_put(exp->expectant); - exp->expectant = NULL; - } - continue; + list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { + if (i->master == ct && del_timer(&i->timeout)) { + ip_ct_unlink_expect(i); + ip_conntrack_expect_put(i); } - - IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp)); - IP_NF_ASSERT(exp->expectant == ct); - - /* delete expectation from global and private lists */ - unexpect_related(exp); } } @@ -292,69 +298,73 @@ clean_from_lists(struct ip_conntrack *ct) unsigned int ho, hr; DEBUGP("clean_from_lists(%p)\n", ct); - MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); + ASSERT_WRITE_LOCK(&ip_conntrack_lock); ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); - /* Destroy all un-established, pending expectations */ - remove_expectations(ct, 1); + /* Destroy all pending expectations */ + ip_ct_remove_expectations(ct); } static void destroy_conntrack(struct nf_conntrack *nfct) { - struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL; + struct ip_conntrack *ct = (struct ip_conntrack *)nfct; struct ip_conntrack_protocol *proto; DEBUGP("destroy_conntrack(%p)\n", ct); IP_NF_ASSERT(atomic_read(&nfct->use) == 0); IP_NF_ASSERT(!timer_pending(&ct->timeout)); + ip_conntrack_event(IPCT_DESTROY, ct); + set_bit(IPS_DYING_BIT, &ct->status); + /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to ip_conntrack_lock!!! -HW */ - proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); if (proto && proto->destroy) proto->destroy(ct); if (ip_conntrack_destroyed) ip_conntrack_destroyed(ct); - WRITE_LOCK(&ip_conntrack_lock); - /* Make sure don't leave any orphaned expectations lying around */ - if (ct->expecting) - remove_expectations(ct, 1); - - /* Delete our master expectation */ - if (ct->master) { - if (ct->master->expectant) { - /* can't call __unexpect_related here, - * since it would screw up expect_list */ - list_del(&ct->master->expected_list); - master = ct->master->expectant; - } - kfree(ct->master); + write_lock_bh(&ip_conntrack_lock); + /* Expectations will have been removed in clean_from_lists, + * except TFTP can create an expectation on the first packet, + * before connection is in the list, so we need to clean here, + * too. */ + ip_ct_remove_expectations(ct); + + /* We overload first tuple to link into unconfirmed list. 
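 *
 * The overload works because a list node embedded in a structure can
 * always be mapped back to its container from the node address alone;
 * that is what tuplehash_to_ctrack() does throughout this file.  A
 * hedged, self-contained sketch of the idea (toy_* names are made up):
 */

#include <stddef.h>
#include <stdio.h>

struct toy_link { struct toy_link *next, *prev; };

struct toy_conn {
	int use;
	struct toy_link hash_link;	/* hash chain or unconfirmed list */
};

/* container_of() in miniature: subtract the member offset. */
#define toy_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct toy_conn c = { .use = 1 };
	struct toy_link *l = &c.hash_link;

	printf("%d\n", toy_container_of(l, struct toy_conn, hash_link)->use);
	return 0;
}

/*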
*/ + if (!is_confirmed(ct)) { + BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list)); + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); } - WRITE_UNLOCK(&ip_conntrack_lock); - if (master) - ip_conntrack_put(master); + CONNTRACK_STAT_INC(delete); + write_unlock_bh(&ip_conntrack_lock); + + if (ct->master) + ip_conntrack_put(ct->master); DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); - kmem_cache_free(ip_conntrack_cachep, ct); - atomic_dec(&ip_conntrack_count); + ip_conntrack_free(ct); } static void death_by_timeout(unsigned long ul_conntrack) { struct ip_conntrack *ct = (void *)ul_conntrack; - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); + /* Inside lock so preempt is disabled on module removal path. + * Otherwise we can get spurious warnings. */ + CONNTRACK_STAT_INC(delete_list); clean_from_lists(ct); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); ip_conntrack_put(ct); } @@ -363,24 +373,28 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, const struct ip_conntrack_tuple *tuple, const struct ip_conntrack *ignored_conntrack) { - MUST_BE_READ_LOCKED(&ip_conntrack_lock); - return i->ctrack != ignored_conntrack + ASSERT_READ_LOCK(&ip_conntrack_lock); + return tuplehash_to_ctrack(i) != ignored_conntrack && ip_ct_tuple_equal(tuple, &i->tuple); } -static struct ip_conntrack_tuple_hash * +struct ip_conntrack_tuple_hash * __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, const struct ip_conntrack *ignored_conntrack) { struct ip_conntrack_tuple_hash *h; unsigned int hash = hash_conntrack(tuple); - MUST_BE_READ_LOCKED(&ip_conntrack_lock); - h = LIST_FIND(&ip_conntrack_hash[hash], - conntrack_tuple_cmp, - struct ip_conntrack_tuple_hash *, - tuple, ignored_conntrack); - return h; + ASSERT_READ_LOCK(&ip_conntrack_lock); + list_for_each_entry(h, &ip_conntrack_hash[hash], list) { + if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { + CONNTRACK_STAT_INC(found); + return h; + } + CONNTRACK_STAT_INC(searched); + } + + return NULL; } /* Find a connection corresponding to a tuple. 
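 *
 * hash_conntrack() above keys the table on (saddr, daddr^proto,
 * ports), mixed with a boot-time random seed so remote peers cannot
 * aim a flood of connections at a single bucket.  A rough userspace
 * model of the bucket selection, with a stand-in mixer instead of the
 * kernel's jhash_3words() (toy_* names are illustrative):
 */

#include <stdint.h>

static uint32_t toy_mix(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
	uint32_t h = (seed ^ a) * 0x9e3779b1u;
	h = (h ^ b) * 0x85ebca6bu;
	h = (h ^ c) * 0xc2b2ae35u;
	return h ^ (h >> 16);
}

static uint32_t toy_bucket(uint32_t src_ip, uint32_t dst_ip,
			   uint8_t protonum, uint16_t sport, uint16_t dport,
			   uint32_t rnd, unsigned int htable_size)
{
	/* Same field combination as __hash_conntrack() above. */
	return toy_mix(src_ip, dst_ip ^ protonum,
		       sport | ((uint32_t)dport << 16), rnd) % htable_size;
}

/*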
*/ @@ -390,45 +404,47 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, { struct ip_conntrack_tuple_hash *h; - READ_LOCK(&ip_conntrack_lock); + read_lock_bh(&ip_conntrack_lock); h = __ip_conntrack_find(tuple, ignored_conntrack); if (h) - atomic_inc(&h->ctrack->ct_general.use); - READ_UNLOCK(&ip_conntrack_lock); + atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); + read_unlock_bh(&ip_conntrack_lock); return h; } -static inline struct ip_conntrack * -__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo) +static void __ip_conntrack_hash_insert(struct ip_conntrack *ct, + unsigned int hash, + unsigned int repl_hash) { - struct ip_conntrack *ct - = (struct ip_conntrack *)nfct->master; - - /* ctinfo is the index of the nfct inside the conntrack */ - *ctinfo = nfct - ct->infos; - IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER); - return ct; + ct->id = ++ip_conntrack_next_id; + list_prepend(&ip_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + list_prepend(&ip_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY].list); } -/* Return conntrack and conntrack_info given skb->nfct->master */ -struct ip_conntrack * -ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo) +void ip_conntrack_hash_insert(struct ip_conntrack *ct) { - if (skb->nfct) - return __ip_conntrack_get(skb->nfct, ctinfo); - return NULL; + unsigned int hash, repl_hash; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + write_lock_bh(&ip_conntrack_lock); + __ip_conntrack_hash_insert(ct, hash, repl_hash); + write_unlock_bh(&ip_conntrack_lock); } -/* Confirm a connection given skb->nfct; places it in hash table */ +/* Confirm a connection given skb; places it in hash table */ int -__ip_conntrack_confirm(struct nf_ct_info *nfct) +__ip_conntrack_confirm(struct sk_buff **pskb) { unsigned int hash, repl_hash; struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; - ct = __ip_conntrack_get(nfct, &ctinfo); + ct = ip_conntrack_get(*pskb, &ctinfo); /* ipt_REJECT uses ip_conntrack_attach to attach related ICMP/TCP RST packets in other direction. Actual packet @@ -450,7 +466,8 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct) IP_NF_ASSERT(!is_confirmed(ct)); DEBUGP("Confirming conntrack %p\n", ct); - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); + /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ @@ -462,10 +479,10 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct) conntrack_tuple_cmp, struct ip_conntrack_tuple_hash *, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { - list_prepend(&ip_conntrack_hash[hash], - &ct->tuplehash[IP_CT_DIR_ORIGINAL]); - list_prepend(&ip_conntrack_hash[repl_hash], - &ct->tuplehash[IP_CT_DIR_REPLY]); + /* Remove from unconfirmed list */ + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + + __ip_conntrack_hash_insert(ct, hash, repl_hash); /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. 
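 *
 * __ip_conntrack_hash_insert() above threads one connection into the
 * table twice -- once under its ORIGINAL tuple, once under its REPLY
 * tuple -- so traffic from either end lands on the same conntrack
 * with a single bucket walk.  A minimal sketch with singly linked
 * chains (toy_* names are made up):
 */

enum { TOY_DIR_ORIGINAL, TOY_DIR_REPLY };

struct toy_link { struct toy_link *next; };
struct toy_conn { struct toy_link link[2]; };

static void toy_hash_insert(struct toy_link **table,
			    unsigned int hash, unsigned int repl_hash,
			    struct toy_conn *conn)
{
	/* Push-front, like list_prepend() on each bucket. */
	conn->link[TOY_DIR_ORIGINAL].next = table[hash];
	table[hash] = &conn->link[TOY_DIR_ORIGINAL];

	conn->link[TOY_DIR_REPLY].next = table[repl_hash];
	table[repl_hash] = &conn->link[TOY_DIR_REPLY];
}

/*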
*/ @@ -473,11 +490,24 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct) add_timer(&ct->timeout); atomic_inc(&ct->ct_general.use); set_bit(IPS_CONFIRMED_BIT, &ct->status); - WRITE_UNLOCK(&ip_conntrack_lock); + CONNTRACK_STAT_INC(insert); + write_unlock_bh(&ip_conntrack_lock); + if (ct->helper) + ip_conntrack_event_cache(IPCT_HELPER, *pskb); +#ifdef CONFIG_IP_NF_NAT_NEEDED + if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || + test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) + ip_conntrack_event_cache(IPCT_NATINFO, *pskb); +#endif + ip_conntrack_event_cache(master_ct(ct) ? + IPCT_RELATED : IPCT_NEW, *pskb); + return NF_ACCEPT; } - WRITE_UNLOCK(&ip_conntrack_lock); + CONNTRACK_STAT_INC(insert_failed); + write_unlock_bh(&ip_conntrack_lock); + return NF_DROP; } @@ -489,117 +519,44 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple, { struct ip_conntrack_tuple_hash *h; - READ_LOCK(&ip_conntrack_lock); + read_lock_bh(&ip_conntrack_lock); h = __ip_conntrack_find(tuple, ignored_conntrack); - READ_UNLOCK(&ip_conntrack_lock); + read_unlock_bh(&ip_conntrack_lock); return h != NULL; } -/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ -struct ip_conntrack * -icmp_error_track(struct sk_buff *skb, - enum ip_conntrack_info *ctinfo, - unsigned int hooknum) -{ - struct ip_conntrack_tuple innertuple, origtuple; - struct { - struct icmphdr icmp; - struct iphdr ip; - } inside; - struct ip_conntrack_protocol *innerproto; - struct ip_conntrack_tuple_hash *h; - int dataoff; - - IP_NF_ASSERT(skb->nfct == NULL); - - /* Not enough header? */ - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0) - return NULL; - - if (inside.icmp.type != ICMP_DEST_UNREACH - && inside.icmp.type != ICMP_SOURCE_QUENCH - && inside.icmp.type != ICMP_TIME_EXCEEDED - && inside.icmp.type != ICMP_PARAMETERPROB - && inside.icmp.type != ICMP_REDIRECT) - return NULL; - - /* Ignore ICMP's containing fragments (shouldn't happen) */ - if (inside.ip.frag_off & htons(IP_OFFSET)) { - DEBUGP("icmp_error_track: fragment of proto %u\n", - inside.ip.protocol); - return NULL; - } - - innerproto = ip_ct_find_proto(inside.ip.protocol); - dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4; - /* Are they talking about one of our connections? */ - if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) { - DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol); - return NULL; - } - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. */ - if (!invert_tuple(&innertuple, &origtuple, innerproto)) { - DEBUGP("icmp_error_track: Can't invert tuple\n"); - return NULL; - } - - *ctinfo = IP_CT_RELATED; - - h = ip_conntrack_find_get(&innertuple, NULL); - if (!h) { - /* Locally generated ICMPs will match inverted if they - haven't been SNAT'ed yet */ - /* FIXME: NAT code has to handle half-done double NAT --RR */ - if (hooknum == NF_IP_LOCAL_OUT) - h = ip_conntrack_find_get(&origtuple, NULL); - - if (!h) { - DEBUGP("icmp_error_track: no match\n"); - return NULL; - } - /* Reverse direction from that found */ - if (DIRECTION(h) != IP_CT_DIR_REPLY) - *ctinfo += IP_CT_IS_REPLY; - } else { - if (DIRECTION(h) == IP_CT_DIR_REPLY) - *ctinfo += IP_CT_IS_REPLY; - } - - /* Update skb to refer to this connection */ - skb->nfct = &h->ctrack->infos[*ctinfo]; - return h->ctrack; -} - /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. 
*/ static inline int unreplied(const struct ip_conntrack_tuple_hash *i) { - return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status)); + return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status)); } static int early_drop(struct list_head *chain) { /* Traverse backwards: gives us oldest, which is roughly LRU */ struct ip_conntrack_tuple_hash *h; + struct ip_conntrack *ct = NULL; int dropped = 0; - READ_LOCK(&ip_conntrack_lock); + read_lock_bh(&ip_conntrack_lock); h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *); - if (h) - atomic_inc(&h->ctrack->ct_general.use); - READ_UNLOCK(&ip_conntrack_lock); + if (h) { + ct = tuplehash_to_ctrack(h); + atomic_inc(&ct->ct_general.use); + } + read_unlock_bh(&ip_conntrack_lock); - if (!h) + if (!ct) return dropped; - if (del_timer(&h->ctrack->timeout)) { - death_by_timeout((unsigned long)h->ctrack); + if (del_timer(&ct->timeout)) { + death_by_timeout((unsigned long)ct); dropped = 1; + CONNTRACK_STAT_INC(early_drop); } - ip_conntrack_put(h->ctrack); + ip_conntrack_put(ct); return dropped; } @@ -609,43 +566,86 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i, return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); } -struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) +static struct ip_conntrack_helper * +__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) { return LIST_FIND(&helpers, helper_cmp, struct ip_conntrack_helper *, tuple); } -/* Allocate a new conntrack: we return -ENOMEM if classification - failed due to stress. Otherwise it really is unclassifiable. */ -static struct ip_conntrack_tuple_hash * -init_conntrack(const struct ip_conntrack_tuple *tuple, - struct ip_conntrack_protocol *protocol, - struct sk_buff *skb) +struct ip_conntrack_helper * +ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_helper *helper; + + /* need ip_conntrack_lock to assure that helper exists until + * try_module_get() is called */ + read_lock_bh(&ip_conntrack_lock); + + helper = __ip_conntrack_helper_find(tuple); + if (helper) { + /* need to increase module usage count to assure helper will + * not go away while the caller is e.g. 
busy putting a + * conntrack in the hash that uses the helper */ + if (!try_module_get(helper->me)) + helper = NULL; + } + + read_unlock_bh(&ip_conntrack_lock); + + return helper; +} + +void ip_conntrack_helper_put(struct ip_conntrack_helper *helper) +{ + module_put(helper->me); +} + +struct ip_conntrack_protocol * +__ip_conntrack_proto_find(u_int8_t protocol) +{ + return ip_ct_protos[protocol]; +} + +/* this is guaranteed to always return a valid protocol helper, since + * it falls back to generic_protocol */ +struct ip_conntrack_protocol * +ip_conntrack_proto_find_get(u_int8_t protocol) +{ + struct ip_conntrack_protocol *p; + + preempt_disable(); + p = __ip_conntrack_proto_find(protocol); + if (p) { + if (!try_module_get(p->me)) + p = &ip_conntrack_generic_protocol; + } + preempt_enable(); + + return p; +} + +void ip_conntrack_proto_put(struct ip_conntrack_protocol *p) +{ + module_put(p->me); +} + +struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, + struct ip_conntrack_tuple *repl) { struct ip_conntrack *conntrack; - struct ip_conntrack_tuple repl_tuple; - size_t hash; - struct ip_conntrack_expect *expected; - int i; - static unsigned int drop_next; if (!ip_conntrack_hash_rnd_initted) { get_random_bytes(&ip_conntrack_hash_rnd, 4); ip_conntrack_hash_rnd_initted = 1; } - hash = hash_conntrack(tuple); - - if (ip_conntrack_max && - atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { - /* Try dropping from random chain, or else from the - chain about to put into (in case they're trying to - bomb one hash chain). */ - unsigned int next = (drop_next++)%ip_conntrack_htable_size; - - if (!early_drop(&ip_conntrack_hash[next]) - && !early_drop(&ip_conntrack_hash[hash])) { + if (ip_conntrack_max + && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { + unsigned int hash = hash_conntrack(orig); + /* Try dropping from this hash chain. 
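 *
 * ip_conntrack_helper_find_get() above shows the find-and-get shape:
 * the read lock only guarantees the helper stays valid long enough to
 * attempt try_module_get(); if that fails, the caller proceeds as if
 * nothing was found.  A self-contained model of the try-get step using
 * C11 atomics (toy_* names are illustrative, not kernel API):
 */

#include <stdatomic.h>

struct toy_obj { atomic_int use; };	/* 0 == being torn down */

static int toy_try_get(struct toy_obj *o)
{
	int old = atomic_load(&o->use);

	while (old != 0)
		if (atomic_compare_exchange_weak(&o->use, &old, old + 1))
			return 1;	/* got a reference */
	return 0;			/* dying: treat as not found */
}

/*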
*/ + if (!early_drop(&ip_conntrack_hash[hash])) { if (net_ratelimit()) printk(KERN_WARNING "ip_conntrack: table full, dropping" @@ -654,11 +654,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, } } - if (!invert_tuple(&repl_tuple, tuple, protocol)) { - DEBUGP("Can't invert tuple.\n"); - return NULL; - } - conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); @@ -668,65 +663,86 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, memset(conntrack, 0, sizeof(*conntrack)); atomic_set(&conntrack->ct_general.use, 1); conntrack->ct_general.destroy = destroy_conntrack; - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack; - conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; - conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack; - for (i=0; i < IP_CT_NUMBER; i++) - conntrack->infos[i].master = &conntrack->ct_general; - - if (!protocol->new(conntrack, skb)) { - kmem_cache_free(ip_conntrack_cachep, conntrack); - return NULL; - } + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; /* Don't set timer yet: wait for confirmation */ init_timer(&conntrack->timeout); conntrack->timeout.data = (unsigned long)conntrack; conntrack->timeout.function = death_by_timeout; - INIT_LIST_HEAD(&conntrack->sibling_list); - - WRITE_LOCK(&ip_conntrack_lock); - /* Need finding and deleting of expected ONLY if we win race */ - READ_LOCK(&ip_conntrack_expect_tuple_lock); - expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp, - struct ip_conntrack_expect *, tuple); - READ_UNLOCK(&ip_conntrack_expect_tuple_lock); - - /* If master is not in hash table yet (ie. packet hasn't left - this machine yet), how can other end know about expected? - Hence these are not the droids you are looking for (if - master ct never got confirmed, we'd hold a reference to it - and weird things would happen to future packets). */ - if (expected && !is_confirmed(expected->expectant)) - expected = NULL; - - /* Look up the conntrack helper for master connections only */ - if (!expected) - conntrack->helper = ip_ct_find_helper(&repl_tuple); - - /* If the expectation is dying, then this is a loser. */ - if (expected - && expected->expectant->helper->timeout - && ! del_timer(&expected->timeout)) - expected = NULL; - - if (expected) { + atomic_inc(&ip_conntrack_count); + + return conntrack; +} + +void +ip_conntrack_free(struct ip_conntrack *conntrack) +{ + atomic_dec(&ip_conntrack_count); + kmem_cache_free(ip_conntrack_cachep, conntrack); +} + +/* Allocate a new conntrack: we return -ENOMEM if classification + * failed due to stress. 
Otherwise it really is unclassifiable */ +static struct ip_conntrack_tuple_hash * +init_conntrack(struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *protocol, + struct sk_buff *skb) +{ + struct ip_conntrack *conntrack; + struct ip_conntrack_tuple repl_tuple; + struct ip_conntrack_expect *exp; + + if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + + conntrack = ip_conntrack_alloc(tuple, &repl_tuple); + if (conntrack == NULL || IS_ERR(conntrack)) + return (struct ip_conntrack_tuple_hash *)conntrack; + + if (!protocol->new(conntrack, skb)) { + ip_conntrack_free(conntrack); + return NULL; + } + + write_lock_bh(&ip_conntrack_lock); + exp = find_expectation(tuple); + + if (exp) { DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", - conntrack, expected); + conntrack, exp); /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &conntrack->status); - conntrack->master = expected; - expected->sibling = conntrack; - LIST_DELETE(&ip_conntrack_expect_list, expected); - expected->expectant->expecting--; - nf_conntrack_get(&master_ct(conntrack)->infos[0]); + conntrack->master = exp->master; +#ifdef CONFIG_IP_NF_CONNTRACK_MARK + conntrack->mark = exp->master->mark; +#endif +#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ + defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) + /* this is ugly, but there is no other place where to put it */ + conntrack->nat.masq_index = exp->master->nat.masq_index; +#endif + nf_conntrack_get(&conntrack->master->ct_general); + CONNTRACK_STAT_INC(expect_new); + } else { + conntrack->helper = __ip_conntrack_helper_find(&repl_tuple); + + CONNTRACK_STAT_INC(new); + } + + /* Overload tuple linked list to put us in unconfirmed list. */ + list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); + + write_unlock_bh(&ip_conntrack_lock); + + if (exp) { + if (exp->expectfn) + exp->expectfn(conntrack, exp); + ip_conntrack_expect_put(exp); } - atomic_inc(&ip_conntrack_count); - WRITE_UNLOCK(&ip_conntrack_lock); - if (expected && expected->expectfn) - expected->expectfn(conntrack); return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; } @@ -740,10 +756,12 @@ resolve_normal_ct(struct sk_buff *skb, { struct ip_conntrack_tuple tuple; struct ip_conntrack_tuple_hash *h; + struct ip_conntrack *ct; IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0); - if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto)) + if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, + &tuple,proto)) return NULL; /* look for tuple match */ @@ -755,6 +773,7 @@ resolve_normal_ct(struct sk_buff *skb, if (IS_ERR(h)) return (void *)h; } + ct = tuplehash_to_ctrack(h); /* It exists; we have (non-exclusive) reference. */ if (DIRECTION(h) == IP_CT_DIR_REPLY) { @@ -763,23 +782,24 @@ resolve_normal_ct(struct sk_buff *skb, *set_reply = 1; } else { /* Once we've had two way comms, always ESTABLISHED. 
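 *
 * The "Mr. Bond" branch in init_conntrack() above is the adoption
 * step: a new connection matching a pending expectation is tied to
 * its master and inherits per-master state before it ever reaches the
 * hash table.  A compact sketch of just that step (toy_* types are
 * made up; the real code also copies nat.masq_index under MASQUERADE):
 */

struct toy_ct {
	unsigned long status;
	unsigned int mark;
	struct toy_ct *master;
	int use;
};

struct toy_exp { struct toy_ct *master; };

#define TOY_EXPECTED_BIT 0

static void toy_adopt(struct toy_ct *ct, struct toy_exp *exp)
{
	ct->status |= 1UL << TOY_EXPECTED_BIT;
	ct->master = exp->master;
	ct->mark = exp->master->mark;	/* inherit the master's mark */
	exp->master->use++;		/* master must outlive the child */
}

/*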
*/ - if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) { + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { DEBUGP("ip_conntrack_in: normal packet for %p\n", - h->ctrack); + ct); *ctinfo = IP_CT_ESTABLISHED; - } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) { + } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { DEBUGP("ip_conntrack_in: related packet for %p\n", - h->ctrack); + ct); *ctinfo = IP_CT_RELATED; } else { DEBUGP("ip_conntrack_in: new packet for %p\n", - h->ctrack); + ct); *ctinfo = IP_CT_NEW; } *set_reply = 0; } - skb->nfct = &h->ctrack->infos[*ctinfo]; - return h->ctrack; + skb->nfct = &ct->ct_general; + skb->nfctinfo = *ctinfo; + return ct; } /* Netfilter hook itself. */ @@ -792,9 +812,15 @@ unsigned int ip_conntrack_in(unsigned int hooknum, struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; struct ip_conntrack_protocol *proto; - int set_reply; + int set_reply = 0; int ret; + /* Previously seen (loopback or untracked)? Ignore. */ + if ((*pskb)->nfct) { + CONNTRACK_STAT_INC(ignore); + return NF_ACCEPT; + } + /* Never happen */ if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { if (net_ratelimit()) { @@ -804,9 +830,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum, return NF_DROP; } - /* FIXME: Do this right please. --RR */ - (*pskb)->nfcache |= NFC_UNKNOWN; - /* Doesn't cover locally-generated broadcast, so not worth it. */ #if 0 /* Ignore broadcast: no `connection'. */ @@ -822,46 +845,44 @@ unsigned int ip_conntrack_in(unsigned int hooknum, } #endif - /* Previously seen (loopback or untracked)? Ignore. */ - if ((*pskb)->nfct) - return NF_ACCEPT; + proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol); - proto = ip_ct_find_proto((*pskb)->nh.iph->protocol); - - /* It may be an icmp error... */ - if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP - && icmp_error_track(*pskb, &ctinfo, hooknum)) - return NF_ACCEPT; + /* It may be an special packet, error, unclean... + * inverse of the return code tells to the netfilter + * core what to do with the packet. */ + if (proto->error != NULL + && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) { + CONNTRACK_STAT_INC(error); + CONNTRACK_STAT_INC(invalid); + return -ret; + } - if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) + if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) { /* Not valid part of a connection */ + CONNTRACK_STAT_INC(invalid); return NF_ACCEPT; + } - if (IS_ERR(ct)) + if (IS_ERR(ct)) { /* Too stressed to deal. 
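 *
 * The "-ret" dance in ip_conntrack_in() above is a calling
 * convention: proto->error() and proto->packet() return a negated
 * netfilter verdict to say "stop here, verdict decided", and a
 * positive value to continue.  A minimal model (NF_* values as in
 * <linux/netfilter.h>; toy_* names are illustrative):
 */

#define NF_DROP		0
#define NF_ACCEPT	1

static int toy_proto_error(int malformed, int handled_icmp)
{
	if (malformed)
		return -NF_DROP;	/* hook will return NF_DROP */
	if (handled_icmp)
		return -NF_ACCEPT;	/* accept, skip normal tracking */
	return NF_ACCEPT;		/* positive: keep going */
}

static unsigned int toy_hook(int malformed, int handled_icmp)
{
	int ret = toy_proto_error(malformed, handled_icmp);

	if (ret <= 0)
		return (unsigned int)-ret;
	return NF_ACCEPT;		/* normal tracking path */
}

/*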
*/ + CONNTRACK_STAT_INC(drop); return NF_DROP; + } IP_NF_ASSERT((*pskb)->nfct); ret = proto->packet(ct, *pskb, ctinfo); - if (ret == -1) { - /* Invalid */ + if (ret < 0) { + /* Invalid: inverse of the return code tells + * the netfilter core what to do*/ nf_conntrack_put((*pskb)->nfct); (*pskb)->nfct = NULL; - return NF_ACCEPT; + CONNTRACK_STAT_INC(invalid); + return -ret; } - if (ret != NF_DROP && ct->helper) { - ret = ct->helper->help(*pskb, ct, ctinfo); - if (ret == -1) { - /* Invalid */ - nf_conntrack_put((*pskb)->nfct); - (*pskb)->nfct = NULL; - return NF_ACCEPT; - } - } - if (set_reply) - set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + ip_conntrack_event_cache(IPCT_STATUS, *pskb); return ret; } @@ -869,248 +890,160 @@ unsigned int ip_conntrack_in(unsigned int hooknum, int invert_tuplepr(struct ip_conntrack_tuple *inverse, const struct ip_conntrack_tuple *orig) { - return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum)); -} - -static inline int resent_expect(const struct ip_conntrack_expect *i, - const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *mask) -{ - DEBUGP("resent_expect\n"); - DEBUGP(" tuple: "); DUMP_TUPLE(&i->tuple); - DEBUGP("ct_tuple: "); DUMP_TUPLE(&i->ct_tuple); - DEBUGP("test tuple: "); DUMP_TUPLE(tuple); - return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple)) - || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple))) - && ip_ct_tuple_equal(&i->mask, mask)); + return ip_ct_invert_tuple(inverse, orig, + __ip_conntrack_proto_find(orig->dst.protonum)); } /* Would two expected things clash? */ -static inline int expect_clash(const struct ip_conntrack_expect *i, - const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *mask) +static inline int expect_clash(const struct ip_conntrack_expect *a, + const struct ip_conntrack_expect *b) { /* Part covered by intersection of masks must be unequal, otherwise they clash */ struct ip_conntrack_tuple intersect_mask - = { { i->mask.src.ip & mask->src.ip, - { i->mask.src.u.all & mask->src.u.all } }, - { i->mask.dst.ip & mask->dst.ip, - { i->mask.dst.u.all & mask->dst.u.all }, - i->mask.dst.protonum & mask->dst.protonum } }; + = { { a->mask.src.ip & b->mask.src.ip, + { a->mask.src.u.all & b->mask.src.u.all } }, + { a->mask.dst.ip & b->mask.dst.ip, + { a->mask.dst.u.all & b->mask.dst.u.all }, + a->mask.dst.protonum & b->mask.dst.protonum } }; - return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask); + return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); } -inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect) +static inline int expect_matches(const struct ip_conntrack_expect *a, + const struct ip_conntrack_expect *b) { - WRITE_LOCK(&ip_conntrack_lock); - unexpect_related(expect); - WRITE_UNLOCK(&ip_conntrack_lock); + return a->master == b->master + && ip_ct_tuple_equal(&a->tuple, &b->tuple) + && ip_ct_tuple_equal(&a->mask, &b->mask); } - -static void expectation_timed_out(unsigned long ul_expect) -{ - struct ip_conntrack_expect *expect = (void *) ul_expect; - DEBUGP("expectation %p timed out\n", expect); - WRITE_LOCK(&ip_conntrack_lock); - __unexpect_related(expect); - WRITE_UNLOCK(&ip_conntrack_lock); +/* Generally a bad idea to call this: could have matched already. 
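 *
 * expect_clash() above asks whether one packet could satisfy two
 * masked expectations at once: restrict both tuples to the
 * intersection of the two masks, then compare.  The same test on a
 * single 32-bit field, self-contained (toy_* names are made up):
 */

#include <stdint.h>

static int toy_clash(uint32_t tuple_a, uint32_t mask_a,
		     uint32_t tuple_b, uint32_t mask_b)
{
	uint32_t both = mask_a & mask_b;

	/* Clash if they agree on every bit both masks care about. */
	return !((tuple_a ^ tuple_b) & both);
}

/*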
*/ +void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp) +{ + struct ip_conntrack_expect *i; + + write_lock_bh(&ip_conntrack_lock); + /* choose the the oldest expectation to evict */ + list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { + if (expect_matches(i, exp) && del_timer(&i->timeout)) { + ip_ct_unlink_expect(i); + write_unlock_bh(&ip_conntrack_lock); + ip_conntrack_expect_put(i); + return; + } + } + write_unlock_bh(&ip_conntrack_lock); } -struct ip_conntrack_expect * -ip_conntrack_expect_alloc(void) +/* We don't increase the master conntrack refcount for non-fulfilled + * conntracks. During the conntrack destruction, the expectations are + * always killed before the conntrack itself */ +struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me) { struct ip_conntrack_expect *new; - - new = (struct ip_conntrack_expect *) - kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC); + + new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC); if (!new) { DEBUGP("expect_related: OOM allocating expect\n"); return NULL; } - - /* tuple_cmp compares whole union, we have to initialized cleanly */ - memset(new, 0, sizeof(struct ip_conntrack_expect)); - + new->master = me; + atomic_set(&new->use, 1); return new; } -static void -ip_conntrack_expect_insert(struct ip_conntrack_expect *new, - struct ip_conntrack *related_to) +void ip_conntrack_expect_put(struct ip_conntrack_expect *exp) { - DEBUGP("new expectation %p of conntrack %p\n", new, related_to); - new->expectant = related_to; - new->sibling = NULL; - atomic_set(&new->use, 1); - - /* add to expected list for this connection */ - list_add_tail(&new->expected_list, &related_to->sibling_list); - /* add to global list of expectations */ - list_prepend(&ip_conntrack_expect_list, &new->list); - /* add and start timer if required */ - if (related_to->helper->timeout) { - init_timer(&new->timeout); - new->timeout.data = (unsigned long)new; - new->timeout.function = expectation_timed_out; - new->timeout.expires = jiffies + - related_to->helper->timeout * HZ; - add_timer(&new->timeout); - } - related_to->expecting++; + if (atomic_dec_and_test(&exp->use)) + kmem_cache_free(ip_conntrack_expect_cachep, exp); } -/* Add a related connection. */ -int ip_conntrack_expect_related(struct ip_conntrack_expect *expect, - struct ip_conntrack *related_to) +static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) { - struct ip_conntrack_expect *old; - int ret = 0; - - WRITE_LOCK(&ip_conntrack_lock); - /* Because of the write lock, no reader can walk the lists, - * so there is no need to use the tuple lock too */ + atomic_inc(&exp->use); + exp->master->expecting++; + list_add(&exp->list, &ip_conntrack_expect_list); + + init_timer(&exp->timeout); + exp->timeout.data = (unsigned long)exp; + exp->timeout.function = expectation_timed_out; + exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; + add_timer(&exp->timeout); + + exp->id = ++ip_conntrack_expect_next_id; + atomic_inc(&exp->use); + CONNTRACK_STAT_INC(expect_create); +} - DEBUGP("ip_conntrack_expect_related %p\n", related_to); - DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); - DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); +/* Race with expectations being used means we could have none to find; OK. 
*/ +static void evict_oldest_expect(struct ip_conntrack *master) +{ + struct ip_conntrack_expect *i; - old = LIST_FIND(&ip_conntrack_expect_list, resent_expect, - struct ip_conntrack_expect *, &expect->tuple, - &expect->mask); - if (old) { - /* Helper private data may contain offsets but no pointers - pointing into the payload - otherwise we should have to copy - the data filled out by the helper over the old one */ - DEBUGP("expect_related: resent packet\n"); - if (related_to->helper->timeout) { - if (!del_timer(&old->timeout)) { - /* expectation is dying. Fall through */ - goto out; - } else { - old->timeout.expires = jiffies + - related_to->helper->timeout * HZ; - add_timer(&old->timeout); + list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { + if (i->master == master) { + if (del_timer(&i->timeout)) { + ip_ct_unlink_expect(i); + ip_conntrack_expect_put(i); } + break; } - - WRITE_UNLOCK(&ip_conntrack_lock); - kfree(expect); - return -EEXIST; - - } else if (related_to->helper->max_expected && - related_to->expecting >= related_to->helper->max_expected) { - /* old == NULL */ - if (!(related_to->helper->flags & - IP_CT_HELPER_F_REUSE_EXPECT)) { - WRITE_UNLOCK(&ip_conntrack_lock); - if (net_ratelimit()) - printk(KERN_WARNING - "ip_conntrack: max number of expected " - "connections %i of %s reached for " - "%u.%u.%u.%u->%u.%u.%u.%u\n", - related_to->helper->max_expected, - related_to->helper->name, - NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip), - NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip)); - kfree(expect); - return -EPERM; - } - DEBUGP("ip_conntrack: max number of expected " - "connections %i of %s reached for " - "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n", - related_to->helper->max_expected, - related_to->helper->name, - NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip), - NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip)); - - /* choose the the oldest expectation to evict */ - list_for_each_entry(old, &related_to->sibling_list, - expected_list) - if (old->sibling == NULL) - break; - - /* We cannot fail since related_to->expecting is the number - * of unconfirmed expectations */ - IP_NF_ASSERT(old && old->sibling == NULL); - - /* newnat14 does not reuse the real allocated memory - * structures but rather unexpects the old and - * allocates a new. unexpect_related will decrement - * related_to->expecting. 
- */ - unexpect_related(old); - ret = -EPERM; - } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash, - struct ip_conntrack_expect *, &expect->tuple, - &expect->mask)) { - WRITE_UNLOCK(&ip_conntrack_lock); - DEBUGP("expect_related: busy!\n"); - - kfree(expect); - return -EBUSY; } +} -out: ip_conntrack_expect_insert(expect, related_to); - - WRITE_UNLOCK(&ip_conntrack_lock); +static inline int refresh_timer(struct ip_conntrack_expect *i) +{ + if (!del_timer(&i->timeout)) + return 0; - return ret; + i->timeout.expires = jiffies + i->master->helper->timeout*HZ; + add_timer(&i->timeout); + return 1; } -/* Change tuple in an existing expectation */ -int ip_conntrack_change_expect(struct ip_conntrack_expect *expect, - struct ip_conntrack_tuple *newtuple) +int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) { + struct ip_conntrack_expect *i; int ret; - MUST_BE_READ_LOCKED(&ip_conntrack_lock); - WRITE_LOCK(&ip_conntrack_expect_tuple_lock); - - DEBUGP("change_expect:\n"); - DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple); - DEBUGP("exp mask: "); DUMP_TUPLE(&expect->mask); - DEBUGP("newtuple: "); DUMP_TUPLE(newtuple); - if (expect->ct_tuple.dst.protonum == 0) { - /* Never seen before */ - DEBUGP("change expect: never seen before\n"); - if (!ip_ct_tuple_equal(&expect->tuple, newtuple) - && LIST_FIND(&ip_conntrack_expect_list, expect_clash, - struct ip_conntrack_expect *, newtuple, &expect->mask)) { - /* Force NAT to find an unused tuple */ - ret = -1; - } else { - memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple)); - memcpy(&expect->tuple, newtuple, sizeof(expect->tuple)); - ret = 0; - } - } else { - /* Resent packet */ - DEBUGP("change expect: resent packet\n"); - if (ip_ct_tuple_equal(&expect->tuple, newtuple)) { - ret = 0; - } else { - /* Force NAT to choose again the same port */ - ret = -1; + DEBUGP("ip_conntrack_expect_related %p\n", related_to); + DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); + DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); + + write_lock_bh(&ip_conntrack_lock); + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + if (expect_matches(i, expect)) { + /* Refresh timer: if it's dying, ignore.. */ + if (refresh_timer(i)) { + ret = 0; + goto out; + } + } else if (expect_clash(i, expect)) { + ret = -EBUSY; + goto out; } } - WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock); - - return ret; + + /* Will be over limit? */ + if (expect->master->helper->max_expected && + expect->master->expecting >= expect->master->helper->max_expected) + evict_oldest_expect(expect->master); + + ip_conntrack_expect_insert(expect); + ip_conntrack_expect_event(IPEXP_NEW, expect); + ret = 0; +out: + write_unlock_bh(&ip_conntrack_lock); + return ret; } -/* Alter reply tuple (maybe alter helper). If it's already taken, - return 0 and don't do alteration. */ -int ip_conntrack_alter_reply(struct ip_conntrack *conntrack, - const struct ip_conntrack_tuple *newreply) +/* Alter reply tuple (maybe alter helper). 
This is for NAT, and is + implicitly racy: see __ip_conntrack_confirm */ +void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, + const struct ip_conntrack_tuple *newreply) { - WRITE_LOCK(&ip_conntrack_lock); - if (__ip_conntrack_find(newreply, conntrack)) { - WRITE_UNLOCK(&ip_conntrack_lock); - return 0; - } + write_lock_bh(&ip_conntrack_lock); /* Should be unconfirmed, so not in hash table yet */ IP_NF_ASSERT(!is_confirmed(conntrack)); @@ -1118,30 +1051,40 @@ int ip_conntrack_alter_reply(struct ip_conntrack *conntrack, DUMP_TUPLE(newreply); conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; - if (!conntrack->master && list_empty(&conntrack->sibling_list)) - conntrack->helper = ip_ct_find_helper(newreply); - WRITE_UNLOCK(&ip_conntrack_lock); - - return 1; + if (!conntrack->master && conntrack->expecting == 0) + conntrack->helper = __ip_conntrack_helper_find(newreply); + write_unlock_bh(&ip_conntrack_lock); } int ip_conntrack_helper_register(struct ip_conntrack_helper *me) { - WRITE_LOCK(&ip_conntrack_lock); + BUG_ON(me->timeout == 0); + write_lock_bh(&ip_conntrack_lock); list_prepend(&helpers, me); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); return 0; } +struct ip_conntrack_helper * +__ip_conntrack_helper_find_byname(const char *name) +{ + struct ip_conntrack_helper *h; + + list_for_each_entry(h, &helpers, list) { + if (!strcmp(h->name, name)) + return h; + } + + return NULL; +} + static inline int unhelp(struct ip_conntrack_tuple_hash *i, const struct ip_conntrack_helper *me) { - if (i->ctrack->helper == me) { - /* Get rid of any expected. */ - remove_expectations(i->ctrack, 0); - /* And *then* set helper to NULL */ - i->ctrack->helper = NULL; + if (tuplehash_to_ctrack(i)->helper == me) { + ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i)); + tuplehash_to_ctrack(i)->helper = NULL; } return 0; } @@ -1149,138 +1092,189 @@ static inline int unhelp(struct ip_conntrack_tuple_hash *i, void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) { unsigned int i; + struct ip_conntrack_expect *exp, *tmp; /* Need write lock here, to delete helper. */ - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); LIST_DELETE(&helpers, me); + /* Get rid of expectations */ + list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { + if (exp->master->helper == me && del_timer(&exp->timeout)) { + ip_ct_unlink_expect(exp); + ip_conntrack_expect_put(exp); + } + } /* Get rid of expecteds, set helpers to NULL. */ + LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me); for (i = 0; i < ip_conntrack_htable_size; i++) LIST_FIND_W(&ip_conntrack_hash[i], unhelp, struct ip_conntrack_tuple_hash *, me); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); /* Someone could be still looking at the helper in a bh. */ synchronize_net(); } -/* Refresh conntrack for this many jiffies. 
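 *
 * ip_conntrack_helper_unregister() above must leave no dangling
 * pointers behind: it kills the helper's expectations, then walks the
 * unconfirmed list and every hash bucket detaching the helper from
 * any conntrack still using it, and finally waits out readers with
 * synchronize_net().  The detach pass in miniature (toy_* names are
 * illustrative):
 */

#include <stddef.h>

struct toy_helper { const char *name; };
struct toy_conn { struct toy_helper *helper; };

static void toy_unhelp_all(struct toy_conn *conns, size_t n,
			   const struct toy_helper *me)
{
	for (size_t i = 0; i < n; i++)
		if (conns[i].helper == me)
			conns[i].helper = NULL;	/* like unhelp() above */
}

/*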
*/ -void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies) +/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ +void __ip_ct_refresh_acct(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies, + int do_acct) { + int event = 0; + IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct); + IP_NF_ASSERT(skb); + + write_lock_bh(&ip_conntrack_lock); /* If not in hash table, timer will not be active yet */ - if (!is_confirmed(ct)) + if (!is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; - else { - WRITE_LOCK(&ip_conntrack_lock); + event = IPCT_REFRESH; + } else { /* Need del_timer for race avoidance (may already be dying). */ if (del_timer(&ct->timeout)) { ct->timeout.expires = jiffies + extra_jiffies; add_timer(&ct->timeout); + event = IPCT_REFRESH; } - WRITE_UNLOCK(&ip_conntrack_lock); } + +#ifdef CONFIG_IP_NF_CT_ACCT + if (do_acct) { + ct->counters[CTINFO2DIR(ctinfo)].packets++; + ct->counters[CTINFO2DIR(ctinfo)].bytes += + ntohs(skb->nh.iph->tot_len); + if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000) + || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000)) + event |= IPCT_COUNTER_FILLING; + } +#endif + + write_unlock_bh(&ip_conntrack_lock); + + /* must be unlocked when calling event cache */ + if (event) + ip_conntrack_event_cache(event, skb); } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be + * in ip_conntrack_core, since we don't want the protocols to autoload + * or depend on ctnetlink */ +int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t), + &tuple->src.u.tcp.port); + NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t), + &tuple->dst.u.tcp.port); + return 0; + +nfattr_failure: + return -1; +} + +int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[], + struct ip_conntrack_tuple *t) +{ + if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1]) + return -EINVAL; + + t->src.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]); + t->dst.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]); + + return 0; +} +#endif + /* Returns new sk_buff, or NULL */ struct sk_buff * -ip_ct_gather_frags(struct sk_buff *skb) +ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) { - struct sock *sk = skb->sk; -#ifdef CONFIG_NETFILTER_DEBUG - unsigned int olddebug = skb->nf_debug; -#endif - if (sk) { - sock_hold(sk); - skb_orphan(skb); - } + skb_orphan(skb); local_bh_disable(); - skb = ip_defrag(skb); + skb = ip_defrag(skb, user); local_bh_enable(); - if (!skb) { - if (sk) - sock_put(sk); - return skb; - } - - if (sk) { - skb_set_owner_w(skb, sk); - sock_put(sk); - } - - ip_send_check(skb->nh.iph); - skb->nfcache |= NFC_ALTERED; -#ifdef CONFIG_NETFILTER_DEBUG - /* Packet path as if nothing had happened. */ - skb->nf_debug = olddebug; -#endif + if (skb) + ip_send_check(skb->nh.iph); return skb; } /* Used by ipt_REJECT. 
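 *
 * The 0x80000000 tests in __ip_ct_refresh_acct() above treat the top
 * bit of each 32-bit counter as a "getting full" flag, so an
 * IPCT_COUNTER_FILLING event can reach userspace before the counter
 * wraps.  The accounting step in isolation (toy_* names are made up):
 */

#include <stdint.h>

struct toy_acct { uint32_t packets, bytes; };

/* Returns nonzero when either counter has crossed the halfway mark. */
static int toy_account(struct toy_acct *a, uint32_t pkt_len)
{
	a->packets++;
	a->bytes += pkt_len;
	return (a->packets & 0x80000000u) || (a->bytes & 0x80000000u);
}

/*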
*/ -static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct) +static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) { struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; - ct = __ip_conntrack_get(nfct, &ctinfo); - - /* This ICMP is in reverse direction to the packet which - caused it */ + /* This ICMP is in reverse direction to the packet which caused it */ + ct = ip_conntrack_get(skb, &ctinfo); + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; else ctinfo = IP_CT_RELATED; - /* Attach new skbuff, and increment count */ - nskb->nfct = &ct->infos[ctinfo]; - atomic_inc(&ct->ct_general.use); + /* Attach to new skbuff, and increment count */ + nskb->nfct = &ct->ct_general; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nskb->nfct); } static inline int -do_kill(const struct ip_conntrack_tuple_hash *i, - int (*kill)(const struct ip_conntrack *i, void *data), +do_iter(const struct ip_conntrack_tuple_hash *i, + int (*iter)(struct ip_conntrack *i, void *data), void *data) { - return kill(i->ctrack, data); + return iter(tuplehash_to_ctrack(i), data); } /* Bring out ya dead! */ static struct ip_conntrack_tuple_hash * -get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data), +get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data), void *data, unsigned int *bucket) { struct ip_conntrack_tuple_hash *h = NULL; - READ_LOCK(&ip_conntrack_lock); - for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) { - h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill, - struct ip_conntrack_tuple_hash *, kill, data); + write_lock_bh(&ip_conntrack_lock); + for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { + h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, + struct ip_conntrack_tuple_hash *, iter, data); + if (h) + break; } + if (!h) + h = LIST_FIND_W(&unconfirmed, do_iter, + struct ip_conntrack_tuple_hash *, iter, data); if (h) - atomic_inc(&h->ctrack->ct_general.use); - READ_UNLOCK(&ip_conntrack_lock); + atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); + write_unlock_bh(&ip_conntrack_lock); return h; } void -ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data), - void *data) +ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data) { struct ip_conntrack_tuple_hash *h; unsigned int bucket = 0; - while ((h = get_next_corpse(kill, data, &bucket)) != NULL) { + while ((h = get_next_corpse(iter, data, &bucket)) != NULL) { + struct ip_conntrack *ct = tuplehash_to_ctrack(h); /* Time to push up daises... */ - if (del_timer(&h->ctrack->timeout)) - death_by_timeout((unsigned long)h->ctrack); + if (del_timer(&ct->timeout)) + death_by_timeout((unsigned long)ct); /* ... else the timer will get him soon. 
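 *
 * ip_conntrack_attach() above relies on the layout of enum
 * ip_conntrack_info: reply-direction states are the base state plus
 * IP_CT_IS_REPLY.  An RST or ICMP error generated by ipt_REJECT
 * travels opposite to the packet that provoked it, hence the flip
 * sketched below (toy_* values mirror that layout):
 */

enum { TOY_DIR_ORIGINAL, TOY_DIR_REPLY };
enum { TOY_CT_ESTABLISHED, TOY_CT_RELATED, TOY_CT_NEW, TOY_CT_IS_REPLY };

static int toy_attach_ctinfo(int trigger_dir)
{
	return trigger_dir == TOY_DIR_ORIGINAL
		? TOY_CT_RELATED + TOY_CT_IS_REPLY	/* reverse of trigger */
		: TOY_CT_RELATED;
}

/*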
@@ -1291,7 +1285,7 @@ ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
 static int
 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 {
-	struct inet_opt *inet = inet_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
 	struct ip_conntrack_tuple_hash *h;
 	struct ip_conntrack_tuple tuple;
 
@@ -1317,16 +1311,18 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 	h = ip_conntrack_find_get(&tuple, NULL);
 	if (h) {
 		struct sockaddr_in sin;
+		struct ip_conntrack *ct = tuplehash_to_ctrack(h);
 
 		sin.sin_family = AF_INET;
-		sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+		sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
 			.tuple.dst.u.tcp.port;
-		sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
 			.tuple.dst.ip;
+		memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
 
 		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
 		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
-		ip_conntrack_put(h->ctrack);
+		ip_conntrack_put(ct);
 		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
 			return -EFAULT;
 		else
@@ -1345,35 +1341,126 @@ static struct nf_sockopt_ops so_getorigdst = {
 	.get		= &getorigdst,
 };
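The consumer of this sockopt lives in userspace: a transparent proxy that accepted a REDIRECTed connection asks the original conntrack tuple for the pre-NAT destination. A hedged sketch (client_fd is assumed to be the accepted socket; SO_ORIGINAL_DST comes from <linux/netfilter_ipv4.h>):

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/netfilter_ipv4.h>	/* SO_ORIGINAL_DST */

static void print_original_dst(int client_fd)
{
	struct sockaddr_in dst;
	socklen_t dstlen = sizeof(dst);

	/* Fills dst with the destination the client originally dialled,
	 * before REDIRECT/DNAT rewrote it. */
	if (getsockopt(client_fd, SOL_IP, SO_ORIGINAL_DST, &dst, &dstlen) == 0)
		printf("original destination %s:%u\n",
		       inet_ntoa(dst.sin_addr), ntohs(dst.sin_port));
}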
-static int kill_all(const struct ip_conntrack *i, void *data)
+static int kill_all(struct ip_conntrack *i, void *data)
 {
 	return 1;
 }
 
+void ip_conntrack_flush(void)
+{
+	ip_ct_iterate_cleanup(kill_all, NULL);
+}
+
+static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
+{
+	if (vmalloced)
+		vfree(hash);
+	else
+		free_pages((unsigned long)hash,
+			   get_order(sizeof(struct list_head) * size));
+}
+
 /* Mishearing the voices in his head, our hero wonders how he's
    supposed to kill the mall. */
 void ip_conntrack_cleanup(void)
 {
 	ip_ct_attach = NULL;
+
 	/* This makes sure all current packets have passed through
 	   netfilter framework.  Roll on, two-stage module delete... */
 	synchronize_net();
-
+
+	ip_ct_event_cache_flush();
 i_see_dead_people:
-	ip_ct_selective_cleanup(kill_all, NULL);
+	ip_conntrack_flush();
 	if (atomic_read(&ip_conntrack_count) != 0) {
 		schedule();
 		goto i_see_dead_people;
 	}
+	/* wait until all references to ip_conntrack_untracked are dropped */
+	while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
+		schedule();
 
 	kmem_cache_destroy(ip_conntrack_cachep);
-	vfree(ip_conntrack_hash);
+	kmem_cache_destroy(ip_conntrack_expect_cachep);
+	free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+			    ip_conntrack_htable_size);
 	nf_unregister_sockopt(&so_getorigdst);
 }
 
-static int hashsize;
-MODULE_PARM(hashsize, "i");
+static struct list_head *alloc_hashtable(int size, int *vmalloced)
+{
+	struct list_head *hash;
+	unsigned int i;
+
+	*vmalloced = 0;
+	hash = (void *)__get_free_pages(GFP_KERNEL,
+					get_order(sizeof(struct list_head)
+						  * size));
+	if (!hash) {
+		*vmalloced = 1;
+		printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
+		hash = vmalloc(sizeof(struct list_head) * size);
+	}
+
+	if (hash)
+		for (i = 0; i < size; i++)
+			INIT_LIST_HEAD(&hash[i]);
+
+	return hash;
+}
+
+static int set_hashsize(const char *val, struct kernel_param *kp)
+{
+	int i, bucket, hashsize, vmalloced;
+	int old_vmalloced, old_size;
+	int rnd;
+	struct list_head *hash, *old_hash;
+	struct ip_conntrack_tuple_hash *h;
+
+	/* On boot, we can set this without any fancy locking. */
+	if (!ip_conntrack_htable_size)
+		return param_set_int(val, kp);
+
+	hashsize = simple_strtol(val, NULL, 0);
+	if (!hashsize)
+		return -EINVAL;
+
+	hash = alloc_hashtable(hashsize, &vmalloced);
+	if (!hash)
+		return -ENOMEM;
+
+	/* We have to rehash for the new table anyway, so we can also
+	 * use a new random seed */
+	get_random_bytes(&rnd, 4);
+
+	write_lock_bh(&ip_conntrack_lock);
+	for (i = 0; i < ip_conntrack_htable_size; i++) {
+		while (!list_empty(&ip_conntrack_hash[i])) {
+			h = list_entry(ip_conntrack_hash[i].next,
+				       struct ip_conntrack_tuple_hash, list);
+			list_del(&h->list);
+			bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
+			list_add_tail(&h->list, &hash[bucket]);
+		}
+	}
+	old_size = ip_conntrack_htable_size;
+	old_vmalloced = ip_conntrack_vmalloc;
+	old_hash = ip_conntrack_hash;
+
+	ip_conntrack_htable_size = hashsize;
+	ip_conntrack_vmalloc = vmalloced;
+	ip_conntrack_hash = hash;
+	ip_conntrack_hash_rnd = rnd;
+	write_unlock_bh(&ip_conntrack_lock);
+
+	free_conntrack_hash(old_hash, old_vmalloced, old_size);
+	return 0;
+}
+
+module_param_call(hashsize, set_hashsize, param_get_uint,
+		  &ip_conntrack_htable_size, 0600);
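Together, alloc_hashtable() and set_hashsize() make the bucket count a runtime tunable: module_param_call() registers it as a writable (mode 0600) parameter, so writing e.g. 16384 to /sys/module/ip_conntrack/parameters/hashsize (assuming the usual sysfs module-parameter layout) invokes set_hashsize(), which rehashes every entry into the new table under ip_conntrack_lock with a fresh random seed before freeing the old one. Loading with "modprobe ip_conntrack hashsize=16384" still takes the simple param_set_int() path, since at that point the table does not exist yet.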
 
 int __init ip_conntrack_init(void)
 {
@@ -1382,9 +1469,7 @@ int __init ip_conntrack_init(void)
 
 	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
 	 * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
-	if (hashsize) {
-		ip_conntrack_htable_size = hashsize;
-	} else {
+	if (!ip_conntrack_htable_size) {
 		ip_conntrack_htable_size
 			= (((num_physpages << PAGE_SHIFT) / 16384)
 			   / sizeof(struct list_head));
@@ -1406,8 +1491,8 @@ int __init ip_conntrack_init(void)
 		return ret;
 	}
 
-	ip_conntrack_hash = vmalloc(sizeof(struct list_head)
-				    * ip_conntrack_htable_size);
+	ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
+					    &ip_conntrack_vmalloc);
 	if (!ip_conntrack_hash) {
 		printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
 		goto err_unreg_sockopt;
@@ -1415,21 +1500,29 @@
 	ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
 						sizeof(struct ip_conntrack), 0,
-						SLAB_HWCACHE_ALIGN, NULL, NULL);
+						0, NULL, NULL);
 	if (!ip_conntrack_cachep) {
 		printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
 		goto err_free_hash;
 	}
+
+	ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
+					sizeof(struct ip_conntrack_expect),
+					0, 0, NULL, NULL);
+	if (!ip_conntrack_expect_cachep) {
+		printk(KERN_ERR "Unable to create ip_expect slab cache\n");
+		goto err_free_conntrack_slab;
+	}
+
 	/* Don't NEED lock here, but good form anyway. */
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
+	for (i = 0; i < MAX_IP_CT_PROTO; i++)
+		ip_ct_protos[i] = &ip_conntrack_generic_protocol;
 	/* Sew in builtin protocols. */
-	list_append(&protocol_list, &ip_conntrack_protocol_tcp);
-	list_append(&protocol_list, &ip_conntrack_protocol_udp);
-	list_append(&protocol_list, &ip_conntrack_protocol_icmp);
-	WRITE_UNLOCK(&ip_conntrack_lock);
-
-	for (i = 0; i < ip_conntrack_htable_size; i++)
-		INIT_LIST_HEAD(&ip_conntrack_hash[i]);
+	ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
+	ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
+	ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
+	write_unlock_bh(&ip_conntrack_lock);
 
 	/* For use by ipt_REJECT */
 	ip_ct_attach = ip_conntrack_attach;
@@ -1439,16 +1532,14 @@ int __init ip_conntrack_init(void)
 	atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
 	/*  - and make it look like a confirmed connection */
 	set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
-	/*  - and prepare the ctinfo field for REJECT & NAT. */
-	ip_conntrack_untracked.infos[IP_CT_NEW].master =
-	ip_conntrack_untracked.infos[IP_CT_RELATED].master =
-	ip_conntrack_untracked.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master =
-			&ip_conntrack_untracked.ct_general;
 
 	return ret;
 
+err_free_conntrack_slab:
+	kmem_cache_destroy(ip_conntrack_cachep);
 err_free_hash:
-	vfree(ip_conntrack_hash);
+	free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+			    ip_conntrack_htable_size);
 err_unreg_sockopt:
 	nf_unregister_sockopt(&so_getorigdst);