X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fipv4%2Fnetfilter%2Fip_conntrack_core.c;h=f8b3009ba66ecc1a9be6e99953eb1400c651ee17;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=28d9425d5c390dac7601953e65043e973833e7aa;hpb=6a77f38946aaee1cd85eeec6cf4229b204c15071;p=linux-2.6.git diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 28d9425d5..f8b3009ba 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -17,7 +17,6 @@ * - export ip_conntrack[_expect]_{find_get,put} functions * */ -#include #include #include #include @@ -37,19 +36,16 @@ #include #include #include +#include -/* This rwlock protects the main hash table, protocol/helper/expected +/* ip_conntrack_lock protects the main hash table, protocol/helper/expected registrations, conntrack timers*/ -#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) -#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) - #include #include #include #include -#include -#define IP_CONNTRACK_VERSION "2.1" +#define IP_CONNTRACK_VERSION "2.4" #if 0 #define DEBUGP printk @@ -57,47 +53,110 @@ #define DEBUGP(format, args...) #endif -DECLARE_RWLOCK(ip_conntrack_lock); +DEFINE_RWLOCK(ip_conntrack_lock); /* ip_conntrack_standalone needs this */ atomic_t ip_conntrack_count = ATOMIC_INIT(0); void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; LIST_HEAD(ip_conntrack_expect_list); -struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; +struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly; static LIST_HEAD(helpers); -unsigned int ip_conntrack_htable_size = 0; -int ip_conntrack_max; -struct list_head *ip_conntrack_hash; -static kmem_cache_t *ip_conntrack_cachep; -static kmem_cache_t *ip_conntrack_expect_cachep; +unsigned int ip_conntrack_htable_size __read_mostly = 0; +int ip_conntrack_max __read_mostly; +struct list_head *ip_conntrack_hash __read_mostly; +static struct kmem_cache *ip_conntrack_cachep __read_mostly; +static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly; struct ip_conntrack ip_conntrack_untracked; -unsigned int ip_ct_log_invalid; +unsigned int ip_ct_log_invalid __read_mostly; static LIST_HEAD(unconfirmed); -static int ip_conntrack_vmalloc; +static int ip_conntrack_vmalloc __read_mostly; -DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); +static unsigned int ip_conntrack_next_id; +static unsigned int ip_conntrack_expect_next_id; +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain); +ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain); + +DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); -void -ip_conntrack_put(struct ip_conntrack *ct) +/* deliver cached events and clear cache entry - must be called with locally + * disabled softirqs */ +static inline void +__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) { - IP_NF_ASSERT(ct); - nf_conntrack_put(&ct->ct_general); + DEBUGP("ecache: delivering events for %p\n", ecache->ct); + if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events) + atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events, + ecache->ct); + ecache->events = 0; + ip_conntrack_put(ecache->ct); + ecache->ct = NULL; } +/* Deliver all cached events for a particular conntrack. This is called + * by code prior to async packet handling or freeing the skb */ +void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) +{ + struct ip_conntrack_ecache *ecache; + + local_bh_disable(); + ecache = &__get_cpu_var(ip_conntrack_ecache); + if (ecache->ct == ct) + __ip_ct_deliver_cached_events(ecache); + local_bh_enable(); +} + +void __ip_ct_event_cache_init(struct ip_conntrack *ct) +{ + struct ip_conntrack_ecache *ecache; + + /* take care of delivering potentially old events */ + ecache = &__get_cpu_var(ip_conntrack_ecache); + BUG_ON(ecache->ct == ct); + if (ecache->ct) + __ip_ct_deliver_cached_events(ecache); + /* initialize for this conntrack/packet */ + ecache->ct = ct; + nf_conntrack_get(&ct->ct_general); +} + +/* flush the event cache - touches other CPU's data and must not be called while + * packets are still passing through the code */ +static void ip_ct_event_cache_flush(void) +{ + struct ip_conntrack_ecache *ecache; + int cpu; + + for_each_possible_cpu(cpu) { + ecache = &per_cpu(ip_conntrack_ecache, cpu); + if (ecache->ct) + ip_conntrack_put(ecache->ct); + } +} +#else +static inline void ip_ct_event_cache_flush(void) {} +#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + +DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); + static int ip_conntrack_hash_rnd_initted; static unsigned int ip_conntrack_hash_rnd; +static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple, + unsigned int size, unsigned int rnd) +{ + return (jhash_3words((__force u32)tuple->src.ip, + ((__force u32)tuple->dst.ip ^ tuple->dst.protonum), + (tuple->src.u.all | (tuple->dst.u.all << 16)), + rnd) % size); +} + static u_int32_t hash_conntrack(const struct ip_conntrack_tuple *tuple) { -#if 0 - dump_tuple(tuple); -#endif - return (jhash_3words(tuple->src.ip, - (tuple->dst.ip ^ tuple->dst.protonum), - (tuple->src.u.all | (tuple->dst.u.all << 16)), - ip_conntrack_hash_rnd) % ip_conntrack_htable_size); + return __hash_conntrack(tuple, ip_conntrack_htable_size, + ip_conntrack_hash_rnd); } int @@ -137,30 +196,50 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse, /* ip_conntrack_expect helper functions */ -static void destroy_expect(struct ip_conntrack_expect *exp) +void ip_ct_unlink_expect(struct ip_conntrack_expect *exp) { - ip_conntrack_put(exp->master); IP_NF_ASSERT(!timer_pending(&exp->timeout)); - kmem_cache_free(ip_conntrack_expect_cachep, exp); - CONNTRACK_STAT_INC(expect_delete); -} - -static void unlink_expect(struct ip_conntrack_expect *exp) -{ - MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); list_del(&exp->list); - /* Logically in destroy_expect, but we hold the lock here. */ + CONNTRACK_STAT_INC(expect_delete); exp->master->expecting--; + ip_conntrack_expect_put(exp); } static void expectation_timed_out(unsigned long ul_expect) { struct ip_conntrack_expect *exp = (void *)ul_expect; - WRITE_LOCK(&ip_conntrack_lock); - unlink_expect(exp); - WRITE_UNLOCK(&ip_conntrack_lock); - destroy_expect(exp); + write_lock_bh(&ip_conntrack_lock); + ip_ct_unlink_expect(exp); + write_unlock_bh(&ip_conntrack_lock); + ip_conntrack_expect_put(exp); +} + +struct ip_conntrack_expect * +__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_expect *i; + + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) + return i; + } + return NULL; +} + +/* Just find a expectation corresponding to a tuple. */ +struct ip_conntrack_expect * +ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_expect *i; + + read_lock_bh(&ip_conntrack_lock); + i = __ip_conntrack_expect_find(tuple); + if (i) + atomic_inc(&i->use); + read_unlock_bh(&ip_conntrack_lock); + + return i; } /* If an expectation for this connection is found, it gets delete from @@ -177,17 +256,21 @@ find_expectation(const struct ip_conntrack_tuple *tuple) master ct never got confirmed, we'd hold a reference to it and weird things would happen to future packets). */ if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) - && is_confirmed(i->master) - && del_timer(&i->timeout)) { - unlink_expect(i); - return i; + && is_confirmed(i->master)) { + if (i->flags & IP_CT_EXPECT_PERMANENT) { + atomic_inc(&i->use); + return i; + } else if (del_timer(&i->timeout)) { + ip_ct_unlink_expect(i); + return i; + } } } return NULL; } /* delete all expectations for this conntrack */ -static void remove_expectations(struct ip_conntrack *ct) +void ip_ct_remove_expectations(struct ip_conntrack *ct) { struct ip_conntrack_expect *i, *tmp; @@ -197,8 +280,8 @@ static void remove_expectations(struct ip_conntrack *ct) list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { if (i->master == ct && del_timer(&i->timeout)) { - unlink_expect(i); - destroy_expect(i); + ip_ct_unlink_expect(i); + ip_conntrack_expect_put(i); } } } @@ -206,18 +289,12 @@ static void remove_expectations(struct ip_conntrack *ct) static void clean_from_lists(struct ip_conntrack *ct) { - unsigned int ho, hr; - DEBUGP("clean_from_lists(%p)\n", ct); - MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); - - ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); - LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); - LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list); /* Destroy all pending expectations */ - remove_expectations(ct); + ip_ct_remove_expectations(ct); } static void @@ -225,27 +302,35 @@ destroy_conntrack(struct nf_conntrack *nfct) { struct ip_conntrack *ct = (struct ip_conntrack *)nfct; struct ip_conntrack_protocol *proto; + struct ip_conntrack_helper *helper; DEBUGP("destroy_conntrack(%p)\n", ct); IP_NF_ASSERT(atomic_read(&nfct->use) == 0); IP_NF_ASSERT(!timer_pending(&ct->timeout)); + ip_conntrack_event(IPCT_DESTROY, ct); + set_bit(IPS_DYING_BIT, &ct->status); + + helper = ct->helper; + if (helper && helper->destroy) + helper->destroy(ct); + /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to ip_conntrack_lock!!! -HW */ - proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); if (proto && proto->destroy) proto->destroy(ct); if (ip_conntrack_destroyed) ip_conntrack_destroyed(ct); - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); /* Expectations will have been removed in clean_from_lists, * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, * too. */ - remove_expectations(ct); + ip_ct_remove_expectations(ct); /* We overload first tuple to link into unconfirmed list. */ if (!is_confirmed(ct)) { @@ -254,49 +339,38 @@ destroy_conntrack(struct nf_conntrack *nfct) } CONNTRACK_STAT_INC(delete); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); if (ct->master) ip_conntrack_put(ct->master); DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); - kmem_cache_free(ip_conntrack_cachep, ct); - atomic_dec(&ip_conntrack_count); + ip_conntrack_free(ct); } static void death_by_timeout(unsigned long ul_conntrack) { struct ip_conntrack *ct = (void *)ul_conntrack; - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); /* Inside lock so preempt is disabled on module removal path. * Otherwise we can get spurious warnings. */ CONNTRACK_STAT_INC(delete_list); clean_from_lists(ct); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); ip_conntrack_put(ct); } -static inline int -conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, - const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack *ignored_conntrack) -{ - MUST_BE_READ_LOCKED(&ip_conntrack_lock); - return tuplehash_to_ctrack(i) != ignored_conntrack - && ip_ct_tuple_equal(tuple, &i->tuple); -} - -static struct ip_conntrack_tuple_hash * +struct ip_conntrack_tuple_hash * __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, const struct ip_conntrack *ignored_conntrack) { struct ip_conntrack_tuple_hash *h; unsigned int hash = hash_conntrack(tuple); - MUST_BE_READ_LOCKED(&ip_conntrack_lock); list_for_each_entry(h, &ip_conntrack_hash[hash], list) { - if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { + if (tuplehash_to_ctrack(h) != ignored_conntrack && + ip_ct_tuple_equal(tuple, &h->tuple)) { CONNTRACK_STAT_INC(found); return h; } @@ -313,20 +387,44 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, { struct ip_conntrack_tuple_hash *h; - READ_LOCK(&ip_conntrack_lock); + read_lock_bh(&ip_conntrack_lock); h = __ip_conntrack_find(tuple, ignored_conntrack); if (h) atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); - READ_UNLOCK(&ip_conntrack_lock); + read_unlock_bh(&ip_conntrack_lock); return h; } +static void __ip_conntrack_hash_insert(struct ip_conntrack *ct, + unsigned int hash, + unsigned int repl_hash) +{ + ct->id = ++ip_conntrack_next_id; + list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list, + &ip_conntrack_hash[hash]); + list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list, + &ip_conntrack_hash[repl_hash]); +} + +void ip_conntrack_hash_insert(struct ip_conntrack *ct) +{ + unsigned int hash, repl_hash; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + write_lock_bh(&ip_conntrack_lock); + __ip_conntrack_hash_insert(ct, hash, repl_hash); + write_unlock_bh(&ip_conntrack_lock); +} + /* Confirm a connection given skb; places it in hash table */ int __ip_conntrack_confirm(struct sk_buff **pskb) { unsigned int hash, repl_hash; + struct ip_conntrack_tuple_hash *h; struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; @@ -352,41 +450,48 @@ __ip_conntrack_confirm(struct sk_buff **pskb) IP_NF_ASSERT(!is_confirmed(ct)); DEBUGP("Confirming conntrack %p\n", ct); - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - if (!LIST_FIND(&ip_conntrack_hash[hash], - conntrack_tuple_cmp, - struct ip_conntrack_tuple_hash *, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL) - && !LIST_FIND(&ip_conntrack_hash[repl_hash], - conntrack_tuple_cmp, - struct ip_conntrack_tuple_hash *, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { - /* Remove from unconfirmed list */ - list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + list_for_each_entry(h, &ip_conntrack_hash[hash], list) + if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &h->tuple)) + goto out; + list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list) + if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &h->tuple)) + goto out; - list_prepend(&ip_conntrack_hash[hash], - &ct->tuplehash[IP_CT_DIR_ORIGINAL]); - list_prepend(&ip_conntrack_hash[repl_hash], - &ct->tuplehash[IP_CT_DIR_REPLY]); - /* Timer relative to confirmation time, not original - setting time, otherwise we'd get timer wrap in - weird delay cases. */ - ct->timeout.expires += jiffies; - add_timer(&ct->timeout); - atomic_inc(&ct->ct_general.use); - set_bit(IPS_CONFIRMED_BIT, &ct->status); - CONNTRACK_STAT_INC(insert); - WRITE_UNLOCK(&ip_conntrack_lock); - return NF_ACCEPT; - } + /* Remove from unconfirmed list */ + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + + __ip_conntrack_hash_insert(ct, hash, repl_hash); + /* Timer relative to confirmation time, not original + setting time, otherwise we'd get timer wrap in + weird delay cases. */ + ct->timeout.expires += jiffies; + add_timer(&ct->timeout); + atomic_inc(&ct->ct_general.use); + set_bit(IPS_CONFIRMED_BIT, &ct->status); + CONNTRACK_STAT_INC(insert); + write_unlock_bh(&ip_conntrack_lock); + if (ct->helper) + ip_conntrack_event_cache(IPCT_HELPER, *pskb); +#ifdef CONFIG_IP_NF_NAT_NEEDED + if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || + test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) + ip_conntrack_event_cache(IPCT_NATINFO, *pskb); +#endif + ip_conntrack_event_cache(master_ct(ct) ? + IPCT_RELATED : IPCT_NEW, *pskb); - CONNTRACK_STAT_INC(insert_failed); - WRITE_UNLOCK(&ip_conntrack_lock); + return NF_ACCEPT; +out: + CONNTRACK_STAT_INC(insert_failed); + write_unlock_bh(&ip_conntrack_lock); return NF_DROP; } @@ -398,34 +503,32 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple, { struct ip_conntrack_tuple_hash *h; - READ_LOCK(&ip_conntrack_lock); + read_lock_bh(&ip_conntrack_lock); h = __ip_conntrack_find(tuple, ignored_conntrack); - READ_UNLOCK(&ip_conntrack_lock); + read_unlock_bh(&ip_conntrack_lock); return h != NULL; } /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. */ -static inline int unreplied(const struct ip_conntrack_tuple_hash *i) -{ - return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status)); -} - static int early_drop(struct list_head *chain) { /* Traverse backwards: gives us oldest, which is roughly LRU */ struct ip_conntrack_tuple_hash *h; - struct ip_conntrack *ct = NULL; + struct ip_conntrack *ct = NULL, *tmp; int dropped = 0; - READ_LOCK(&ip_conntrack_lock); - h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *); - if (h) { - ct = tuplehash_to_ctrack(h); - atomic_inc(&ct->ct_general.use); + read_lock_bh(&ip_conntrack_lock); + list_for_each_entry_reverse(h, chain, list) { + tmp = tuplehash_to_ctrack(h); + if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) { + ct = tmp; + atomic_inc(&ct->ct_general.use); + break; + } } - READ_UNLOCK(&ip_conntrack_lock); + read_unlock_bh(&ip_conntrack_lock); if (!ct) return dropped; @@ -439,42 +542,94 @@ static int early_drop(struct list_head *chain) return dropped; } -static inline int helper_cmp(const struct ip_conntrack_helper *i, - const struct ip_conntrack_tuple *rtuple) +static struct ip_conntrack_helper * +__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) { - return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); + struct ip_conntrack_helper *h; + + list_for_each_entry(h, &helpers, list) { + if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask)) + return h; + } + return NULL; } -static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) +struct ip_conntrack_helper * +ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple) { - return LIST_FIND(&helpers, helper_cmp, - struct ip_conntrack_helper *, - tuple); + struct ip_conntrack_helper *helper; + + /* need ip_conntrack_lock to assure that helper exists until + * try_module_get() is called */ + read_lock_bh(&ip_conntrack_lock); + + helper = __ip_conntrack_helper_find(tuple); + if (helper) { + /* need to increase module usage count to assure helper will + * not go away while the caller is e.g. busy putting a + * conntrack in the hash that uses the helper */ + if (!try_module_get(helper->me)) + helper = NULL; + } + + read_unlock_bh(&ip_conntrack_lock); + + return helper; } -/* Allocate a new conntrack: we return -ENOMEM if classification - failed due to stress. Otherwise it really is unclassifiable. */ -static struct ip_conntrack_tuple_hash * -init_conntrack(const struct ip_conntrack_tuple *tuple, - struct ip_conntrack_protocol *protocol, - struct sk_buff *skb) +void ip_conntrack_helper_put(struct ip_conntrack_helper *helper) +{ + module_put(helper->me); +} + +struct ip_conntrack_protocol * +__ip_conntrack_proto_find(u_int8_t protocol) +{ + return ip_ct_protos[protocol]; +} + +/* this is guaranteed to always return a valid protocol helper, since + * it falls back to generic_protocol */ +struct ip_conntrack_protocol * +ip_conntrack_proto_find_get(u_int8_t protocol) +{ + struct ip_conntrack_protocol *p; + + preempt_disable(); + p = __ip_conntrack_proto_find(protocol); + if (p) { + if (!try_module_get(p->me)) + p = &ip_conntrack_generic_protocol; + } + preempt_enable(); + + return p; +} + +void ip_conntrack_proto_put(struct ip_conntrack_protocol *p) +{ + module_put(p->me); +} + +struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, + struct ip_conntrack_tuple *repl) { struct ip_conntrack *conntrack; - struct ip_conntrack_tuple repl_tuple; - size_t hash; - struct ip_conntrack_expect *exp; if (!ip_conntrack_hash_rnd_initted) { get_random_bytes(&ip_conntrack_hash_rnd, 4); ip_conntrack_hash_rnd_initted = 1; } - hash = hash_conntrack(tuple); + /* We don't want any race condition at early drop stage */ + atomic_inc(&ip_conntrack_count); if (ip_conntrack_max - && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { + && atomic_read(&ip_conntrack_count) > ip_conntrack_max) { + unsigned int hash = hash_conntrack(orig); /* Try dropping from this hash chain. */ if (!early_drop(&ip_conntrack_hash[hash])) { + atomic_dec(&ip_conntrack_count); if (net_ratelimit()) printk(KERN_WARNING "ip_conntrack: table full, dropping" @@ -483,32 +638,59 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, } } - if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { - DEBUGP("Can't invert tuple.\n"); - return NULL; - } - conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); + atomic_dec(&ip_conntrack_count); return ERR_PTR(-ENOMEM); } memset(conntrack, 0, sizeof(*conntrack)); atomic_set(&conntrack->ct_general.use, 1); conntrack->ct_general.destroy = destroy_conntrack; - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; - conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; - if (!protocol->new(conntrack, skb)) { - kmem_cache_free(ip_conntrack_cachep, conntrack); - return NULL; - } + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; /* Don't set timer yet: wait for confirmation */ init_timer(&conntrack->timeout); conntrack->timeout.data = (unsigned long)conntrack; conntrack->timeout.function = death_by_timeout; - WRITE_LOCK(&ip_conntrack_lock); + return conntrack; +} + +void +ip_conntrack_free(struct ip_conntrack *conntrack) +{ + atomic_dec(&ip_conntrack_count); + kmem_cache_free(ip_conntrack_cachep, conntrack); +} + +/* Allocate a new conntrack: we return -ENOMEM if classification + * failed due to stress. Otherwise it really is unclassifiable */ +static struct ip_conntrack_tuple_hash * +init_conntrack(struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *protocol, + struct sk_buff *skb) +{ + struct ip_conntrack *conntrack; + struct ip_conntrack_tuple repl_tuple; + struct ip_conntrack_expect *exp; + + if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + + conntrack = ip_conntrack_alloc(tuple, &repl_tuple); + if (conntrack == NULL || IS_ERR(conntrack)) + return (struct ip_conntrack_tuple_hash *)conntrack; + + if (!protocol->new(conntrack, skb)) { + ip_conntrack_free(conntrack); + return NULL; + } + + write_lock_bh(&ip_conntrack_lock); exp = find_expectation(tuple); if (exp) { @@ -517,13 +699,21 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &conntrack->status); conntrack->master = exp->master; -#if CONFIG_IP_NF_CONNTRACK_MARK +#ifdef CONFIG_IP_NF_CONNTRACK_MARK conntrack->mark = exp->master->mark; +#endif +#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ + defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) + /* this is ugly, but there is no other place where to put it */ + conntrack->nat.masq_index = exp->master->nat.masq_index; +#endif +#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK + conntrack->secmark = exp->master->secmark; #endif nf_conntrack_get(&conntrack->master->ct_general); CONNTRACK_STAT_INC(expect_new); } else { - conntrack->helper = ip_ct_find_helper(&repl_tuple); + conntrack->helper = __ip_conntrack_helper_find(&repl_tuple); CONNTRACK_STAT_INC(new); } @@ -531,13 +721,12 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, /* Overload tuple linked list to put us in unconfirmed list. */ list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); - atomic_inc(&ip_conntrack_count); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); if (exp) { if (exp->expectfn) exp->expectfn(conntrack, exp); - destroy_expect(exp); + ip_conntrack_expect_put(exp); } return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; @@ -609,7 +798,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum, struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; struct ip_conntrack_protocol *proto; - int set_reply; + int set_reply = 0; int ret; /* Previously seen (loopback or untracked)? Ignore. */ @@ -627,9 +816,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum, return NF_DROP; } - /* FIXME: Do this right please. --RR */ - (*pskb)->nfcache |= NFC_UNKNOWN; - /* Doesn't cover locally-generated broadcast, so not worth it. */ #if 0 /* Ignore broadcast: no `connection'. */ @@ -645,7 +831,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum, } #endif - proto = ip_ct_find_proto((*pskb)->nh.iph->protocol); + proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol); /* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter @@ -681,8 +867,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum, return -ret; } - if (set_reply) - set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + ip_conntrack_event_cache(IPCT_STATUS, *pskb); return ret; } @@ -691,7 +877,7 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse, const struct ip_conntrack_tuple *orig) { return ip_ct_invert_tuple(inverse, orig, - ip_ct_find_proto(orig->dst.protonum)); + __ip_conntrack_proto_find(orig->dst.protonum)); } /* Would two expected things clash? */ @@ -723,20 +909,23 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp) { struct ip_conntrack_expect *i; - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); /* choose the the oldest expectation to evict */ list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { if (expect_matches(i, exp) && del_timer(&i->timeout)) { - unlink_expect(i); - WRITE_UNLOCK(&ip_conntrack_lock); - destroy_expect(i); + ip_ct_unlink_expect(i); + write_unlock_bh(&ip_conntrack_lock); + ip_conntrack_expect_put(i); return; } } - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); } -struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) +/* We don't increase the master conntrack refcount for non-fulfilled + * conntracks. During the conntrack destruction, the expectations are + * always killed before the conntrack itself */ +struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me) { struct ip_conntrack_expect *new; @@ -745,31 +934,31 @@ struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) DEBUGP("expect_related: OOM allocating expect\n"); return NULL; } - new->master = NULL; + new->master = me; + atomic_set(&new->use, 1); return new; } -void ip_conntrack_expect_free(struct ip_conntrack_expect *expect) +void ip_conntrack_expect_put(struct ip_conntrack_expect *exp) { - kmem_cache_free(ip_conntrack_expect_cachep, expect); + if (atomic_dec_and_test(&exp->use)) + kmem_cache_free(ip_conntrack_expect_cachep, exp); } static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) { - atomic_inc(&exp->master->ct_general.use); + atomic_inc(&exp->use); exp->master->expecting++; list_add(&exp->list, &ip_conntrack_expect_list); - if (exp->master->helper->timeout) { - init_timer(&exp->timeout); - exp->timeout.data = (unsigned long)exp; - exp->timeout.function = expectation_timed_out; - exp->timeout.expires - = jiffies + exp->master->helper->timeout * HZ; - add_timer(&exp->timeout); - } else - exp->timeout.function = NULL; + init_timer(&exp->timeout); + exp->timeout.data = (unsigned long)exp; + exp->timeout.function = expectation_timed_out; + exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; + add_timer(&exp->timeout); + exp->id = ++ip_conntrack_expect_next_id; + atomic_inc(&exp->use); CONNTRACK_STAT_INC(expect_create); } @@ -781,8 +970,8 @@ static void evict_oldest_expect(struct ip_conntrack *master) list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { if (i->master == master) { if (del_timer(&i->timeout)) { - unlink_expect(i); - destroy_expect(i); + ip_ct_unlink_expect(i); + ip_conntrack_expect_put(i); } break; } @@ -808,14 +997,12 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); list_for_each_entry(i, &ip_conntrack_expect_list, list) { if (expect_matches(i, expect)) { /* Refresh timer: if it's dying, ignore.. */ if (refresh_timer(i)) { ret = 0; - /* We don't need the one they've given us. */ - ip_conntrack_expect_free(expect); goto out; } } else if (expect_clash(i, expect)) { @@ -830,9 +1017,10 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) evict_oldest_expect(expect->master); ip_conntrack_expect_insert(expect); + ip_conntrack_expect_event(IPEXP_NEW, expect); ret = 0; out: - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); return ret; } @@ -841,7 +1029,7 @@ out: void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, const struct ip_conntrack_tuple *newreply) { - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); /* Should be unconfirmed, so not in hash table yet */ IP_NF_ASSERT(!is_confirmed(conntrack)); @@ -850,127 +1038,168 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; if (!conntrack->master && conntrack->expecting == 0) - conntrack->helper = ip_ct_find_helper(newreply); - WRITE_UNLOCK(&ip_conntrack_lock); + conntrack->helper = __ip_conntrack_helper_find(newreply); + write_unlock_bh(&ip_conntrack_lock); } int ip_conntrack_helper_register(struct ip_conntrack_helper *me) { BUG_ON(me->timeout == 0); - WRITE_LOCK(&ip_conntrack_lock); - list_prepend(&helpers, me); - WRITE_UNLOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); + list_add(&me->list, &helpers); + write_unlock_bh(&ip_conntrack_lock); return 0; } -static inline int unhelp(struct ip_conntrack_tuple_hash *i, - const struct ip_conntrack_helper *me) +struct ip_conntrack_helper * +__ip_conntrack_helper_find_byname(const char *name) +{ + struct ip_conntrack_helper *h; + + list_for_each_entry(h, &helpers, list) { + if (!strcmp(h->name, name)) + return h; + } + + return NULL; +} + +static inline void unhelp(struct ip_conntrack_tuple_hash *i, + const struct ip_conntrack_helper *me) { - if (tuplehash_to_ctrack(i)->helper == me) + if (tuplehash_to_ctrack(i)->helper == me) { + ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i)); tuplehash_to_ctrack(i)->helper = NULL; - return 0; + } } void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) { unsigned int i; + struct ip_conntrack_tuple_hash *h; struct ip_conntrack_expect *exp, *tmp; /* Need write lock here, to delete helper. */ - WRITE_LOCK(&ip_conntrack_lock); - LIST_DELETE(&helpers, me); + write_lock_bh(&ip_conntrack_lock); + list_del(&me->list); /* Get rid of expectations */ list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { if (exp->master->helper == me && del_timer(&exp->timeout)) { - unlink_expect(exp); - destroy_expect(exp); + ip_ct_unlink_expect(exp); + ip_conntrack_expect_put(exp); } } /* Get rid of expecteds, set helpers to NULL. */ - LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me); - for (i = 0; i < ip_conntrack_htable_size; i++) - LIST_FIND_W(&ip_conntrack_hash[i], unhelp, - struct ip_conntrack_tuple_hash *, me); - WRITE_UNLOCK(&ip_conntrack_lock); + list_for_each_entry(h, &unconfirmed, list) + unhelp(h, me); + for (i = 0; i < ip_conntrack_htable_size; i++) { + list_for_each_entry(h, &ip_conntrack_hash[i], list) + unhelp(h, me); + } + write_unlock_bh(&ip_conntrack_lock); /* Someone could be still looking at the helper in a bh. */ synchronize_net(); } -static inline void ct_add_counters(struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - const struct sk_buff *skb) -{ -#ifdef CONFIG_IP_NF_CT_ACCT - if (skb) { - ct->counters[CTINFO2DIR(ctinfo)].packets++; - ct->counters[CTINFO2DIR(ctinfo)].bytes += - ntohs(skb->nh.iph->tot_len); - } -#endif -} - -/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */ -void ip_ct_refresh_acct(struct ip_conntrack *ct, +/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ +void __ip_ct_refresh_acct(struct ip_conntrack *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb, - unsigned long extra_jiffies) + unsigned long extra_jiffies, + int do_acct) { + int event = 0; + IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct); + IP_NF_ASSERT(skb); + + write_lock_bh(&ip_conntrack_lock); + + /* Only update if this is not a fixed timeout */ + if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + write_unlock_bh(&ip_conntrack_lock); + return; + } /* If not in hash table, timer will not be active yet */ if (!is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; - ct_add_counters(ct, ctinfo, skb); + event = IPCT_REFRESH; } else { - WRITE_LOCK(&ip_conntrack_lock); /* Need del_timer for race avoidance (may already be dying). */ if (del_timer(&ct->timeout)) { ct->timeout.expires = jiffies + extra_jiffies; add_timer(&ct->timeout); + event = IPCT_REFRESH; } - ct_add_counters(ct, ctinfo, skb); - WRITE_UNLOCK(&ip_conntrack_lock); } + +#ifdef CONFIG_IP_NF_CT_ACCT + if (do_acct) { + ct->counters[CTINFO2DIR(ctinfo)].packets++; + ct->counters[CTINFO2DIR(ctinfo)].bytes += + ntohs(skb->nh.iph->tot_len); + if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000) + || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000)) + event |= IPCT_COUNTER_FILLING; + } +#endif + + write_unlock_bh(&ip_conntrack_lock); + + /* must be unlocked when calling event cache */ + if (event) + ip_conntrack_event_cache(event, skb); +} + +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be + * in ip_conntrack_core, since we don't want the protocols to autoload + * or depend on ctnetlink */ +int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16), + &tuple->src.u.tcp.port); + NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16), + &tuple->dst.u.tcp.port); + return 0; + +nfattr_failure: + return -1; +} + +int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[], + struct ip_conntrack_tuple *t) +{ + if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1]) + return -EINVAL; + + t->src.u.tcp.port = + *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]); + t->dst.u.tcp.port = + *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]); + + return 0; } +#endif /* Returns new sk_buff, or NULL */ struct sk_buff * ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) { - struct sock *sk = skb->sk; -#ifdef CONFIG_NETFILTER_DEBUG - unsigned int olddebug = skb->nf_debug; -#endif - - if (sk) { - sock_hold(sk); - skb_orphan(skb); - } + skb_orphan(skb); local_bh_disable(); skb = ip_defrag(skb, user); local_bh_enable(); - if (!skb) { - if (sk) - sock_put(sk); - return skb; - } - - if (sk) { - skb_set_owner_w(skb, sk); - sock_put(sk); - } - - ip_send_check(skb->nh.iph); - skb->nfcache |= NFC_ALTERED; -#ifdef CONFIG_NETFILTER_DEBUG - /* Packet path as if nothing had happened. */ - skb->nf_debug = olddebug; -#endif + if (skb) + ip_send_check(skb->nh.iph); return skb; } @@ -994,46 +1223,43 @@ static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) nf_conntrack_get(nskb->nfct); } -static inline int -do_iter(const struct ip_conntrack_tuple_hash *i, - int (*iter)(struct ip_conntrack *i, void *data), - void *data) -{ - return iter(tuplehash_to_ctrack(i), data); -} - /* Bring out ya dead! */ -static struct ip_conntrack_tuple_hash * +static struct ip_conntrack * get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data), void *data, unsigned int *bucket) { - struct ip_conntrack_tuple_hash *h = NULL; + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack *ct; - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { - h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, - struct ip_conntrack_tuple_hash *, iter, data); - if (h) - break; + list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) { + ct = tuplehash_to_ctrack(h); + if (iter(ct, data)) + goto found; + } } - if (!h) - h = LIST_FIND_W(&unconfirmed, do_iter, - struct ip_conntrack_tuple_hash *, iter, data); - if (h) - atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); - WRITE_UNLOCK(&ip_conntrack_lock); + list_for_each_entry(h, &unconfirmed, list) { + ct = tuplehash_to_ctrack(h); + if (iter(ct, data)) + set_bit(IPS_DYING_BIT, &ct->status); + } + write_unlock_bh(&ip_conntrack_lock); + return NULL; - return h; +found: + atomic_inc(&ct->ct_general.use); + write_unlock_bh(&ip_conntrack_lock); + return ct; } void ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data) { - struct ip_conntrack_tuple_hash *h; + struct ip_conntrack *ct; unsigned int bucket = 0; - while ((h = get_next_corpse(iter, data, &bucket)) != NULL) { - struct ip_conntrack *ct = tuplehash_to_ctrack(h); + while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) death_by_timeout((unsigned long)ct); @@ -1083,6 +1309,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) .tuple.dst.u.tcp.port; sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] .tuple.dst.ip; + memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); @@ -1110,14 +1337,18 @@ static int kill_all(struct ip_conntrack *i, void *data) return 1; } -static void free_conntrack_hash(void) +void ip_conntrack_flush(void) +{ + ip_ct_iterate_cleanup(kill_all, NULL); +} + +static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) { - if (ip_conntrack_vmalloc) - vfree(ip_conntrack_hash); + if (vmalloced) + vfree(hash); else - free_pages((unsigned long)ip_conntrack_hash, - get_order(sizeof(struct list_head) - * ip_conntrack_htable_size)); + free_pages((unsigned long)hash, + get_order(sizeof(struct list_head) * size)); } /* Mishearing the voices in his head, our hero wonders how he's @@ -1125,26 +1356,102 @@ static void free_conntrack_hash(void) void ip_conntrack_cleanup(void) { ip_ct_attach = NULL; + /* This makes sure all current packets have passed through netfilter framework. Roll on, two-stage module delete... */ synchronize_net(); - + + ip_ct_event_cache_flush(); i_see_dead_people: - ip_ct_iterate_cleanup(kill_all, NULL); + ip_conntrack_flush(); if (atomic_read(&ip_conntrack_count) != 0) { schedule(); goto i_see_dead_people; } + /* wait until all references to ip_conntrack_untracked are dropped */ + while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) + schedule(); kmem_cache_destroy(ip_conntrack_cachep); kmem_cache_destroy(ip_conntrack_expect_cachep); - free_conntrack_hash(); + free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, + ip_conntrack_htable_size); nf_unregister_sockopt(&so_getorigdst); } -static int hashsize; -module_param(hashsize, int, 0400); +static struct list_head *alloc_hashtable(int size, int *vmalloced) +{ + struct list_head *hash; + unsigned int i; + + *vmalloced = 0; + hash = (void*)__get_free_pages(GFP_KERNEL, + get_order(sizeof(struct list_head) + * size)); + if (!hash) { + *vmalloced = 1; + printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n"); + hash = vmalloc(sizeof(struct list_head) * size); + } + + if (hash) + for (i = 0; i < size; i++) + INIT_LIST_HEAD(&hash[i]); + + return hash; +} + +static int set_hashsize(const char *val, struct kernel_param *kp) +{ + int i, bucket, hashsize, vmalloced; + int old_vmalloced, old_size; + int rnd; + struct list_head *hash, *old_hash; + struct ip_conntrack_tuple_hash *h; + + /* On boot, we can set this without any fancy locking. */ + if (!ip_conntrack_htable_size) + return param_set_int(val, kp); + + hashsize = simple_strtol(val, NULL, 0); + if (!hashsize) + return -EINVAL; + + hash = alloc_hashtable(hashsize, &vmalloced); + if (!hash) + return -ENOMEM; + + /* We have to rehash for the new table anyway, so we also can + * use a new random seed */ + get_random_bytes(&rnd, 4); + + write_lock_bh(&ip_conntrack_lock); + for (i = 0; i < ip_conntrack_htable_size; i++) { + while (!list_empty(&ip_conntrack_hash[i])) { + h = list_entry(ip_conntrack_hash[i].next, + struct ip_conntrack_tuple_hash, list); + list_del(&h->list); + bucket = __hash_conntrack(&h->tuple, hashsize, rnd); + list_add_tail(&h->list, &hash[bucket]); + } + } + old_size = ip_conntrack_htable_size; + old_vmalloced = ip_conntrack_vmalloc; + old_hash = ip_conntrack_hash; + + ip_conntrack_htable_size = hashsize; + ip_conntrack_vmalloc = vmalloced; + ip_conntrack_hash = hash; + ip_conntrack_hash_rnd = rnd; + write_unlock_bh(&ip_conntrack_lock); + + free_conntrack_hash(old_hash, old_vmalloced, old_size); + return 0; +} + +module_param_call(hashsize, set_hashsize, param_get_uint, + &ip_conntrack_htable_size, 0600); int __init ip_conntrack_init(void) { @@ -1153,9 +1460,7 @@ int __init ip_conntrack_init(void) /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ - if (hashsize) { - ip_conntrack_htable_size = hashsize; - } else { + if (!ip_conntrack_htable_size) { ip_conntrack_htable_size = (((num_physpages << PAGE_SHIFT) / 16384) / sizeof(struct list_head)); @@ -1177,20 +1482,8 @@ int __init ip_conntrack_init(void) return ret; } - /* AK: the hash table is twice as big than needed because it - uses list_head. it would be much nicer to caches to use a - single pointer list head here. */ - ip_conntrack_vmalloc = 0; - ip_conntrack_hash - =(void*)__get_free_pages(GFP_KERNEL, - get_order(sizeof(struct list_head) - *ip_conntrack_htable_size)); - if (!ip_conntrack_hash) { - ip_conntrack_vmalloc = 1; - printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n"); - ip_conntrack_hash = vmalloc(sizeof(struct list_head) - * ip_conntrack_htable_size); - } + ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size, + &ip_conntrack_vmalloc); if (!ip_conntrack_hash) { printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); goto err_unreg_sockopt; @@ -1213,17 +1506,14 @@ int __init ip_conntrack_init(void) } /* Don't NEED lock here, but good form anyway. */ - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); for (i = 0; i < MAX_IP_CT_PROTO; i++) ip_ct_protos[i] = &ip_conntrack_generic_protocol; /* Sew in builtin protocols. */ ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; - WRITE_UNLOCK(&ip_conntrack_lock); - - for (i = 0; i < ip_conntrack_htable_size; i++) - INIT_LIST_HEAD(&ip_conntrack_hash[i]); + write_unlock_bh(&ip_conntrack_lock); /* For use by ipt_REJECT */ ip_ct_attach = ip_conntrack_attach; @@ -1239,7 +1529,8 @@ int __init ip_conntrack_init(void) err_free_conntrack_slab: kmem_cache_destroy(ip_conntrack_cachep); err_free_hash: - free_conntrack_hash(); + free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, + ip_conntrack_htable_size); err_unreg_sockopt: nf_unregister_sockopt(&so_getorigdst);