diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 00a89f4f8..28d9425d5 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -34,8 +34,9 @@
 #include 
 #include 
 #include 
-/* For ERR_PTR(). Yeah, I know... --RR */
-#include 
+#include 
+#include 
+#include 
 
 /* This rwlock protects the main hash table, protocol/helper/expected
    registrations, conntrack timers*/
@@ -57,58 +58,31 @@
 #endif
 
 DECLARE_RWLOCK(ip_conntrack_lock);
-DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
+
+/* ip_conntrack_standalone needs this */
+atomic_t ip_conntrack_count = ATOMIC_INIT(0);
 
 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
 LIST_HEAD(ip_conntrack_expect_list);
-LIST_HEAD(protocol_list);
+struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
 static LIST_HEAD(helpers);
 unsigned int ip_conntrack_htable_size = 0;
 int ip_conntrack_max;
-static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
 struct list_head *ip_conntrack_hash;
 static kmem_cache_t *ip_conntrack_cachep;
+static kmem_cache_t *ip_conntrack_expect_cachep;
 struct ip_conntrack ip_conntrack_untracked;
+unsigned int ip_ct_log_invalid;
+static LIST_HEAD(unconfirmed);
+static int ip_conntrack_vmalloc;
 
-extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
-
-static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
-			      u_int8_t protocol)
-{
-	return protocol == curr->proto;
-}
-
-struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
-{
-	struct ip_conntrack_protocol *p;
-
-	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-	p = LIST_FIND(&protocol_list, proto_cmpfn,
-		      struct ip_conntrack_protocol *, protocol);
-	if (!p)
-		p = &ip_conntrack_generic_protocol;
+DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
 
-	return p;
-}
-
-struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
-{
-	struct ip_conntrack_protocol *p;
-
-	READ_LOCK(&ip_conntrack_lock);
-	p = __ip_ct_find_proto(protocol);
-	READ_UNLOCK(&ip_conntrack_lock);
-	return p;
-}
-
-inline void
+void
 ip_conntrack_put(struct ip_conntrack *ct)
 {
 	IP_NF_ASSERT(ct);
-	IP_NF_ASSERT(ct->infos[0].master);
-	/* nf_conntrack_put wants to go via an info struct, so feed it
-	   one at random.
*/ - nf_conntrack_put(&ct->infos[0]); + nf_conntrack_put(&ct->ct_general); } static int ip_conntrack_hash_rnd_initted; @@ -127,11 +101,11 @@ hash_conntrack(const struct ip_conntrack_tuple *tuple) } int -get_tuple(const struct iphdr *iph, - const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_protocol *protocol) +ip_ct_get_tuple(const struct iphdr *iph, + const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_protocol *protocol) { /* Never happen */ if (iph->frag_off & htons(IP_OFFSET)) { @@ -143,146 +117,89 @@ get_tuple(const struct iphdr *iph, tuple->src.ip = iph->saddr; tuple->dst.ip = iph->daddr; tuple->dst.protonum = iph->protocol; + tuple->dst.dir = IP_CT_DIR_ORIGINAL; return protocol->pkt_to_tuple(skb, dataoff, tuple); } -static int -invert_tuple(struct ip_conntrack_tuple *inverse, - const struct ip_conntrack_tuple *orig, - const struct ip_conntrack_protocol *protocol) +int +ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse, + const struct ip_conntrack_tuple *orig, + const struct ip_conntrack_protocol *protocol) { inverse->src.ip = orig->dst.ip; inverse->dst.ip = orig->src.ip; inverse->dst.protonum = orig->dst.protonum; + inverse->dst.dir = !orig->dst.dir; return protocol->invert_tuple(inverse, orig); } /* ip_conntrack_expect helper functions */ - -/* Compare tuple parts depending on mask. */ -static inline int expect_cmp(const struct ip_conntrack_expect *i, - const struct ip_conntrack_tuple *tuple) -{ - MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock); - return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask); -} - -static void -destroy_expect(struct ip_conntrack_expect *exp) +static void destroy_expect(struct ip_conntrack_expect *exp) { - DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use)); - IP_NF_ASSERT(atomic_read(&exp->use) == 0); + ip_conntrack_put(exp->master); IP_NF_ASSERT(!timer_pending(&exp->timeout)); - - kfree(exp); + kmem_cache_free(ip_conntrack_expect_cachep, exp); + CONNTRACK_STAT_INC(expect_delete); } -inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp) +static void unlink_expect(struct ip_conntrack_expect *exp) { - IP_NF_ASSERT(exp); - - if (atomic_dec_and_test(&exp->use)) { - /* usage count dropped to zero */ - destroy_expect(exp); - } -} - -static inline struct ip_conntrack_expect * -__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple) -{ - MUST_BE_READ_LOCKED(&ip_conntrack_lock); - MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock); - return LIST_FIND(&ip_conntrack_expect_list, expect_cmp, - struct ip_conntrack_expect *, tuple); -} - -/* Find a expectation corresponding to a tuple. */ -struct ip_conntrack_expect * -ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple) -{ - struct ip_conntrack_expect *exp; - - READ_LOCK(&ip_conntrack_lock); - READ_LOCK(&ip_conntrack_expect_tuple_lock); - exp = __ip_ct_expect_find(tuple); - if (exp) - atomic_inc(&exp->use); - READ_UNLOCK(&ip_conntrack_expect_tuple_lock); - READ_UNLOCK(&ip_conntrack_lock); - - return exp; + MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); + list_del(&exp->list); + /* Logically in destroy_expect, but we hold the lock here. */ + exp->master->expecting--; } -/* remove one specific expectation from all lists and drop refcount, - * does _NOT_ delete the timer. 
*/
-static void __unexpect_related(struct ip_conntrack_expect *expect)
+static void expectation_timed_out(unsigned long ul_expect)
 {
-	DEBUGP("unexpect_related(%p)\n", expect);
-	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
-
-	/* we're not allowed to unexpect a confirmed expectation! */
-	IP_NF_ASSERT(!expect->sibling);
+	struct ip_conntrack_expect *exp = (void *)ul_expect;
 
-	/* delete from global and local lists */
-	list_del(&expect->list);
-	list_del(&expect->expected_list);
-
-	/* decrement expect-count of master conntrack */
-	if (expect->expectant)
-		expect->expectant->expecting--;
-
-	ip_conntrack_expect_put(expect);
+	WRITE_LOCK(&ip_conntrack_lock);
+	unlink_expect(exp);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+	destroy_expect(exp);
 }
 
-/* remove one specific expecatation from all lists, drop refcount
- * and expire timer.
- * This function can _NOT_ be called for confirmed expects! */
-static void unexpect_related(struct ip_conntrack_expect *expect)
+/* If an expectation for this connection is found, it is deleted from
+ * the global list and returned. */
+static struct ip_conntrack_expect *
+find_expectation(const struct ip_conntrack_tuple *tuple)
 {
-	IP_NF_ASSERT(expect->expectant);
-	IP_NF_ASSERT(expect->expectant->helper);
-	/* if we are supposed to have a timer, but we can't delete
-	 * it: race condition. __unexpect_related will
-	 * be calledd by timeout function */
-	if (expect->expectant->helper->timeout
-	    && !del_timer(&expect->timeout))
-		return;
-
-	__unexpect_related(expect);
+	struct ip_conntrack_expect *i;
+
+	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+		/* If master is not in hash table yet (ie. packet hasn't left
+		   this machine yet), how can other end know about expected?
+		   Hence these are not the droids you are looking for (if
+		   master ct never got confirmed, we'd hold a reference to it
+		   and weird things would happen to future packets). */
+		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
+		    && is_confirmed(i->master)
+		    && del_timer(&i->timeout)) {
+			unlink_expect(i);
+			return i;
+		}
+	}
+	return NULL;
 }
 
-/* delete all unconfirmed expectations for this conntrack */
-static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
+/* delete all expectations for this conntrack */
+static void remove_expectations(struct ip_conntrack *ct)
 {
-	struct list_head *exp_entry, *next;
-	struct ip_conntrack_expect *exp;
-
-	DEBUGP("remove_expectations(%p)\n", ct);
+	struct ip_conntrack_expect *i, *tmp;
 
-	list_for_each_safe(exp_entry, next, &ct->sibling_list) {
-		exp = list_entry(exp_entry, struct ip_conntrack_expect,
-				 expected_list);
+	/* Optimization: most connections never expect any others.
*/ + if (ct->expecting == 0) + return; - /* we skip established expectations, as we want to delete - * the un-established ones only */ - if (exp->sibling) { - DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct); - if (drop_refcount) { - /* Indicate that this expectations parent is dead */ - ip_conntrack_put(exp->expectant); - exp->expectant = NULL; - } - continue; + list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { + if (i->master == ct && del_timer(&i->timeout)) { + unlink_expect(i); + destroy_expect(i); } - - IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp)); - IP_NF_ASSERT(exp->expectant == ct); - - /* delete expectation from global and private lists */ - unexpect_related(exp); } } @@ -299,14 +216,14 @@ clean_from_lists(struct ip_conntrack *ct) LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); - /* Destroy all un-established, pending expectations */ - remove_expectations(ct, 1); + /* Destroy all pending expectations */ + remove_expectations(ct); } static void destroy_conntrack(struct nf_conntrack *nfct) { - struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL; + struct ip_conntrack *ct = (struct ip_conntrack *)nfct; struct ip_conntrack_protocol *proto; DEBUGP("destroy_conntrack(%p)\n", ct); @@ -324,24 +241,23 @@ destroy_conntrack(struct nf_conntrack *nfct) ip_conntrack_destroyed(ct); WRITE_LOCK(&ip_conntrack_lock); - /* Make sure don't leave any orphaned expectations lying around */ - if (ct->expecting) - remove_expectations(ct, 1); - - /* Delete our master expectation */ - if (ct->master) { - if (ct->master->expectant) { - /* can't call __unexpect_related here, - * since it would screw up expect_list */ - list_del(&ct->master->expected_list); - master = ct->master->expectant; - } - kfree(ct->master); + /* Expectations will have been removed in clean_from_lists, + * except TFTP can create an expectation on the first packet, + * before connection is in the list, so we need to clean here, + * too. */ + remove_expectations(ct); + + /* We overload first tuple to link into unconfirmed list. */ + if (!is_confirmed(ct)) { + BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list)); + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); } + + CONNTRACK_STAT_INC(delete); WRITE_UNLOCK(&ip_conntrack_lock); - if (master) - ip_conntrack_put(master); + if (ct->master) + ip_conntrack_put(ct->master); DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); kmem_cache_free(ip_conntrack_cachep, ct); @@ -353,6 +269,9 @@ static void death_by_timeout(unsigned long ul_conntrack) struct ip_conntrack *ct = (void *)ul_conntrack; WRITE_LOCK(&ip_conntrack_lock); + /* Inside lock so preempt is disabled on module removal path. + * Otherwise we can get spurious warnings. 
*/ + CONNTRACK_STAT_INC(delete_list); clean_from_lists(ct); WRITE_UNLOCK(&ip_conntrack_lock); ip_conntrack_put(ct); @@ -364,7 +283,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, const struct ip_conntrack *ignored_conntrack) { MUST_BE_READ_LOCKED(&ip_conntrack_lock); - return i->ctrack != ignored_conntrack + return tuplehash_to_ctrack(i) != ignored_conntrack && ip_ct_tuple_equal(tuple, &i->tuple); } @@ -376,11 +295,15 @@ __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, unsigned int hash = hash_conntrack(tuple); MUST_BE_READ_LOCKED(&ip_conntrack_lock); - h = LIST_FIND(&ip_conntrack_hash[hash], - conntrack_tuple_cmp, - struct ip_conntrack_tuple_hash *, - tuple, ignored_conntrack); - return h; + list_for_each_entry(h, &ip_conntrack_hash[hash], list) { + if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { + CONNTRACK_STAT_INC(found); + return h; + } + CONNTRACK_STAT_INC(searched); + } + + return NULL; } /* Find a connection corresponding to a tuple. */ @@ -393,42 +316,21 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, READ_LOCK(&ip_conntrack_lock); h = __ip_conntrack_find(tuple, ignored_conntrack); if (h) - atomic_inc(&h->ctrack->ct_general.use); + atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); READ_UNLOCK(&ip_conntrack_lock); return h; } -static inline struct ip_conntrack * -__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo) -{ - struct ip_conntrack *ct - = (struct ip_conntrack *)nfct->master; - - /* ctinfo is the index of the nfct inside the conntrack */ - *ctinfo = nfct - ct->infos; - IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER); - return ct; -} - -/* Return conntrack and conntrack_info given skb->nfct->master */ -struct ip_conntrack * -ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo) -{ - if (skb->nfct) - return __ip_conntrack_get(skb->nfct, ctinfo); - return NULL; -} - -/* Confirm a connection given skb->nfct; places it in hash table */ +/* Confirm a connection given skb; places it in hash table */ int -__ip_conntrack_confirm(struct nf_ct_info *nfct) +__ip_conntrack_confirm(struct sk_buff **pskb) { unsigned int hash, repl_hash; struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; - ct = __ip_conntrack_get(nfct, &ctinfo); + ct = ip_conntrack_get(*pskb, &ctinfo); /* ipt_REJECT uses ip_conntrack_attach to attach related ICMP/TCP RST packets in other direction. Actual packet @@ -451,6 +353,7 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct) DEBUGP("Confirming conntrack %p\n", ct); WRITE_LOCK(&ip_conntrack_lock); + /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. 
*/ @@ -462,6 +365,9 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct) conntrack_tuple_cmp, struct ip_conntrack_tuple_hash *, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { + /* Remove from unconfirmed list */ + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + list_prepend(&ip_conntrack_hash[hash], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); list_prepend(&ip_conntrack_hash[repl_hash], @@ -473,11 +379,14 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct) add_timer(&ct->timeout); atomic_inc(&ct->ct_general.use); set_bit(IPS_CONFIRMED_BIT, &ct->status); + CONNTRACK_STAT_INC(insert); WRITE_UNLOCK(&ip_conntrack_lock); return NF_ACCEPT; } + CONNTRACK_STAT_INC(insert_failed); WRITE_UNLOCK(&ip_conntrack_lock); + return NF_DROP; } @@ -496,110 +405,37 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple, return h != NULL; } -/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ -struct ip_conntrack * -icmp_error_track(struct sk_buff *skb, - enum ip_conntrack_info *ctinfo, - unsigned int hooknum) -{ - struct ip_conntrack_tuple innertuple, origtuple; - struct { - struct icmphdr icmp; - struct iphdr ip; - } inside; - struct ip_conntrack_protocol *innerproto; - struct ip_conntrack_tuple_hash *h; - int dataoff; - - IP_NF_ASSERT(skb->nfct == NULL); - - /* Not enough header? */ - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0) - return NULL; - - if (inside.icmp.type != ICMP_DEST_UNREACH - && inside.icmp.type != ICMP_SOURCE_QUENCH - && inside.icmp.type != ICMP_TIME_EXCEEDED - && inside.icmp.type != ICMP_PARAMETERPROB - && inside.icmp.type != ICMP_REDIRECT) - return NULL; - - /* Ignore ICMP's containing fragments (shouldn't happen) */ - if (inside.ip.frag_off & htons(IP_OFFSET)) { - DEBUGP("icmp_error_track: fragment of proto %u\n", - inside.ip.protocol); - return NULL; - } - - innerproto = ip_ct_find_proto(inside.ip.protocol); - dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4; - /* Are they talking about one of our connections? */ - if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) { - DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol); - return NULL; - } - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. */ - if (!invert_tuple(&innertuple, &origtuple, innerproto)) { - DEBUGP("icmp_error_track: Can't invert tuple\n"); - return NULL; - } - - *ctinfo = IP_CT_RELATED; - - h = ip_conntrack_find_get(&innertuple, NULL); - if (!h) { - /* Locally generated ICMPs will match inverted if they - haven't been SNAT'ed yet */ - /* FIXME: NAT code has to handle half-done double NAT --RR */ - if (hooknum == NF_IP_LOCAL_OUT) - h = ip_conntrack_find_get(&origtuple, NULL); - - if (!h) { - DEBUGP("icmp_error_track: no match\n"); - return NULL; - } - /* Reverse direction from that found */ - if (DIRECTION(h) != IP_CT_DIR_REPLY) - *ctinfo += IP_CT_IS_REPLY; - } else { - if (DIRECTION(h) == IP_CT_DIR_REPLY) - *ctinfo += IP_CT_IS_REPLY; - } - - /* Update skb to refer to this connection */ - skb->nfct = &h->ctrack->infos[*ctinfo]; - return h->ctrack; -} - /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. 
*/ static inline int unreplied(const struct ip_conntrack_tuple_hash *i) { - return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status)); + return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status)); } static int early_drop(struct list_head *chain) { /* Traverse backwards: gives us oldest, which is roughly LRU */ struct ip_conntrack_tuple_hash *h; + struct ip_conntrack *ct = NULL; int dropped = 0; READ_LOCK(&ip_conntrack_lock); h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *); - if (h) - atomic_inc(&h->ctrack->ct_general.use); + if (h) { + ct = tuplehash_to_ctrack(h); + atomic_inc(&ct->ct_general.use); + } READ_UNLOCK(&ip_conntrack_lock); - if (!h) + if (!ct) return dropped; - if (del_timer(&h->ctrack->timeout)) { - death_by_timeout((unsigned long)h->ctrack); + if (del_timer(&ct->timeout)) { + death_by_timeout((unsigned long)ct); dropped = 1; + CONNTRACK_STAT_INC(early_drop); } - ip_conntrack_put(h->ctrack); + ip_conntrack_put(ct); return dropped; } @@ -609,7 +445,7 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i, return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); } -struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) +static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) { return LIST_FIND(&helpers, helper_cmp, struct ip_conntrack_helper *, @@ -626,9 +462,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, struct ip_conntrack *conntrack; struct ip_conntrack_tuple repl_tuple; size_t hash; - struct ip_conntrack_expect *expected; - int i; - static unsigned int drop_next; + struct ip_conntrack_expect *exp; if (!ip_conntrack_hash_rnd_initted) { get_random_bytes(&ip_conntrack_hash_rnd, 4); @@ -637,15 +471,10 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, hash = hash_conntrack(tuple); - if (ip_conntrack_max && - atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { - /* Try dropping from random chain, or else from the - chain about to put into (in case they're trying to - bomb one hash chain). */ - unsigned int next = (drop_next++)%ip_conntrack_htable_size; - - if (!early_drop(&ip_conntrack_hash[next]) - && !early_drop(&ip_conntrack_hash[hash])) { + if (ip_conntrack_max + && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { + /* Try dropping from this hash chain. 
*/
+		if (!early_drop(&ip_conntrack_hash[hash])) {
 			if (net_ratelimit())
 				printk(KERN_WARNING
 				       "ip_conntrack: table full, dropping"
@@ -654,7 +483,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 		}
 	}
 
-	if (!invert_tuple(&repl_tuple, tuple, protocol)) {
+	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
 		DEBUGP("Can't invert tuple.\n");
 		return NULL;
 	}
@@ -669,12 +498,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	atomic_set(&conntrack->ct_general.use, 1);
 	conntrack->ct_general.destroy = destroy_conntrack;
 	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
-	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
 	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
-	conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
-	for (i=0; i < IP_CT_NUMBER; i++)
-		conntrack->infos[i].master = &conntrack->ct_general;
-
 	if (!protocol->new(conntrack, skb)) {
 		kmem_cache_free(ip_conntrack_cachep, conntrack);
 		return NULL;
@@ -684,49 +508,38 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	conntrack->timeout.data = (unsigned long)conntrack;
 	conntrack->timeout.function = death_by_timeout;
 
-	INIT_LIST_HEAD(&conntrack->sibling_list);
-
 	WRITE_LOCK(&ip_conntrack_lock);
-	/* Need finding and deleting of expected ONLY if we win race */
-	READ_LOCK(&ip_conntrack_expect_tuple_lock);
-	expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
-			     struct ip_conntrack_expect *, tuple);
-	READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
-
-	/* If master is not in hash table yet (ie. packet hasn't left
-	   this machine yet), how can other end know about expected?
-	   Hence these are not the droids you are looking for (if
-	   master ct never got confirmed, we'd hold a reference to it
-	   and weird things would happen to future packets). */
-	if (expected && !is_confirmed(expected->expectant))
-		expected = NULL;
-
-	/* Look up the conntrack helper for master connections only */
-	if (!expected)
-		conntrack->helper = ip_ct_find_helper(&repl_tuple);
+	exp = find_expectation(tuple);
 
-	/* If the expectation is dying, then this is a loser. */
-	if (expected
-	    && expected->expectant->helper->timeout
-	    && ! del_timer(&expected->timeout))
-		expected = NULL;
-
-	if (expected) {
+	if (exp) {
 		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
-			conntrack, expected);
+			conntrack, exp);
 		/* Welcome, Mr. Bond. We've been expecting you... */
 		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
-		conntrack->master = expected;
-		expected->sibling = conntrack;
-		LIST_DELETE(&ip_conntrack_expect_list, expected);
-		expected->expectant->expecting--;
-		nf_conntrack_get(&master_ct(conntrack)->infos[0]);
+		conntrack->master = exp->master;
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
+		conntrack->mark = exp->master->mark;
+#endif
+		nf_conntrack_get(&conntrack->master->ct_general);
+		CONNTRACK_STAT_INC(expect_new);
+	} else {
+		conntrack->helper = ip_ct_find_helper(&repl_tuple);
+
+		CONNTRACK_STAT_INC(new);
 	}
+
+	/* Overload tuple linked list to put us in unconfirmed list.
*/
+	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
+
 	atomic_inc(&ip_conntrack_count);
 	WRITE_UNLOCK(&ip_conntrack_lock);
 
-	if (expected && expected->expectfn)
-		expected->expectfn(conntrack);
+	if (exp) {
+		if (exp->expectfn)
+			exp->expectfn(conntrack, exp);
+		destroy_expect(exp);
+	}
+
 	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
 }
 
@@ -740,10 +553,12 @@ resolve_normal_ct(struct sk_buff *skb,
 {
 	struct ip_conntrack_tuple tuple;
 	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack *ct;
 
 	IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
 
-	if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
+	if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
+				&tuple,proto))
 		return NULL;
 
 	/* look for tuple match */
@@ -755,6 +570,7 @@ resolve_normal_ct(struct sk_buff *skb,
 		if (IS_ERR(h))
 			return (void *)h;
 	}
+	ct = tuplehash_to_ctrack(h);
 
 	/* It exists; we have (non-exclusive) reference. */
 	if (DIRECTION(h) == IP_CT_DIR_REPLY) {
@@ -763,23 +579,24 @@ resolve_normal_ct(struct sk_buff *skb,
 		*set_reply = 1;
 	} else {
 		/* Once we've had two way comms, always ESTABLISHED. */
-		if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
+		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
 			DEBUGP("ip_conntrack_in: normal packet for %p\n",
-			       h->ctrack);
+			       ct);
 			*ctinfo = IP_CT_ESTABLISHED;
-		} else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
+		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
 			DEBUGP("ip_conntrack_in: related packet for %p\n",
-			       h->ctrack);
+			       ct);
 			*ctinfo = IP_CT_RELATED;
 		} else {
 			DEBUGP("ip_conntrack_in: new packet for %p\n",
-			       h->ctrack);
+			       ct);
 			*ctinfo = IP_CT_NEW;
 		}
 		*set_reply = 0;
 	}
-	skb->nfct = &h->ctrack->infos[*ctinfo];
-	return h->ctrack;
+	skb->nfct = &ct->ct_general;
+	skb->nfctinfo = *ctinfo;
+	return ct;
 }
 
 /* Netfilter hook itself. */
@@ -795,6 +612,12 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 	int set_reply;
 	int ret;
 
+	/* Previously seen (loopback or untracked)? Ignore. */
+	if ((*pskb)->nfct) {
+		CONNTRACK_STAT_INC(ignore);
+		return NF_ACCEPT;
+	}
+
 	/* Never happen */
 	if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
 		if (net_ratelimit()) {
@@ -822,44 +645,42 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 	}
 #endif
 
-	/* Previously seen (loopback or untracked)? Ignore. */
-	if ((*pskb)->nfct)
-		return NF_ACCEPT;
-
 	proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
 
-	/* It may be an icmp error... */
-	if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
-	    && icmp_error_track(*pskb, &ctinfo, hooknum))
-		return NF_ACCEPT;
+	/* It may be a special packet, error, unclean...
+	 * the inverse of the return code tells the netfilter
+	 * core what to do with the packet. */
+	if (proto->error != NULL
+	    && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
+		CONNTRACK_STAT_INC(error);
+		CONNTRACK_STAT_INC(invalid);
+		return -ret;
+	}
 
-	if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
+	if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
 		/* Not valid part of a connection */
+		CONNTRACK_STAT_INC(invalid);
 		return NF_ACCEPT;
+	}
 
-	if (IS_ERR(ct))
+	if (IS_ERR(ct)) {
 		/* Too stressed to deal.
*/
+		CONNTRACK_STAT_INC(drop);
 		return NF_DROP;
+	}
 
 	IP_NF_ASSERT((*pskb)->nfct);
 
 	ret = proto->packet(ct, *pskb, ctinfo);
-	if (ret == -1) {
-		/* Invalid */
+	if (ret < 0) {
+		/* Invalid: the inverse of the return code tells
+		 * the netfilter core what to do */
 		nf_conntrack_put((*pskb)->nfct);
 		(*pskb)->nfct = NULL;
-		return NF_ACCEPT;
+		CONNTRACK_STAT_INC(invalid);
+		return -ret;
 	}
 
-	if (ret != NF_DROP && ct->helper) {
-		ret = ct->helper->help(*pskb, ct, ctinfo);
-		if (ret == -1) {
-			/* Invalid */
-			nf_conntrack_put((*pskb)->nfct);
-			(*pskb)->nfct = NULL;
-			return NF_ACCEPT;
-		}
-	}
 	if (set_reply)
 		set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
 
@@ -869,248 +690,158 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
 		   const struct ip_conntrack_tuple *orig)
 {
-	return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
-}
-
-static inline int resent_expect(const struct ip_conntrack_expect *i,
-				const struct ip_conntrack_tuple *tuple,
-				const struct ip_conntrack_tuple *mask)
-{
-	DEBUGP("resent_expect\n");
-	DEBUGP(" tuple: "); DUMP_TUPLE(&i->tuple);
-	DEBUGP("ct_tuple: "); DUMP_TUPLE(&i->ct_tuple);
-	DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
-	return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
-		 || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
-		&& ip_ct_tuple_equal(&i->mask, mask));
+	return ip_ct_invert_tuple(inverse, orig,
+				  ip_ct_find_proto(orig->dst.protonum));
 }
 
 /* Would two expected things clash? */
-static inline int expect_clash(const struct ip_conntrack_expect *i,
-			       const struct ip_conntrack_tuple *tuple,
-			       const struct ip_conntrack_tuple *mask)
+static inline int expect_clash(const struct ip_conntrack_expect *a,
			       const struct ip_conntrack_expect *b)
 {
 	/* Part covered by intersection of masks must be unequal,
 	   otherwise they clash */
 	struct ip_conntrack_tuple intersect_mask
-		= { { i->mask.src.ip & mask->src.ip,
-		      { i->mask.src.u.all & mask->src.u.all } },
-		    { i->mask.dst.ip & mask->dst.ip,
-		      { i->mask.dst.u.all & mask->dst.u.all },
-		      i->mask.dst.protonum & mask->dst.protonum } };
+		= { { a->mask.src.ip & b->mask.src.ip,
+		      { a->mask.src.u.all & b->mask.src.u.all } },
+		    { a->mask.dst.ip & b->mask.dst.ip,
+		      { a->mask.dst.u.all & b->mask.dst.u.all },
+		      a->mask.dst.protonum & b->mask.dst.protonum } };
 
-	return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
+	return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
 }
 
-inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
+static inline int expect_matches(const struct ip_conntrack_expect *a,
+				 const struct ip_conntrack_expect *b)
 {
-	WRITE_LOCK(&ip_conntrack_lock);
-	unexpect_related(expect);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	return a->master == b->master
+		&& ip_ct_tuple_equal(&a->tuple, &b->tuple)
+		&& ip_ct_tuple_equal(&a->mask, &b->mask);
 }
-
-static void expectation_timed_out(unsigned long ul_expect)
+
+/* Generally a bad idea to call this: could have matched already.
*/
+void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
 {
-	struct ip_conntrack_expect *expect = (void *) ul_expect;
+	struct ip_conntrack_expect *i;
 
-	DEBUGP("expectation %p timed out\n", expect);
 	WRITE_LOCK(&ip_conntrack_lock);
-	__unexpect_related(expect);
+	/* choose the oldest expectation to evict */
+	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
+			unlink_expect(i);
+			WRITE_UNLOCK(&ip_conntrack_lock);
+			destroy_expect(i);
+			return;
+		}
+	}
 	WRITE_UNLOCK(&ip_conntrack_lock);
 }
 
-struct ip_conntrack_expect *
-ip_conntrack_expect_alloc(void)
+struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
 {
 	struct ip_conntrack_expect *new;
-
-	new = (struct ip_conntrack_expect *)
-		kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
+
+	new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
 	if (!new) {
 		DEBUGP("expect_related: OOM allocating expect\n");
 		return NULL;
 	}
+	new->master = NULL;
+	return new;
+}
 
-	/* tuple_cmp compares whole union, we have to initialized cleanly */
-	memset(new, 0, sizeof(struct ip_conntrack_expect));
+void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
+{
+	kmem_cache_free(ip_conntrack_expect_cachep, expect);
+}
 
-	return new;
+static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
+{
+	atomic_inc(&exp->master->ct_general.use);
+	exp->master->expecting++;
+	list_add(&exp->list, &ip_conntrack_expect_list);
+
+	if (exp->master->helper->timeout) {
+		init_timer(&exp->timeout);
+		exp->timeout.data = (unsigned long)exp;
+		exp->timeout.function = expectation_timed_out;
+		exp->timeout.expires
+			= jiffies + exp->master->helper->timeout * HZ;
+		add_timer(&exp->timeout);
+	} else
+		exp->timeout.function = NULL;
+
+	CONNTRACK_STAT_INC(expect_create);
 }
 
-static void
-ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
-			   struct ip_conntrack *related_to)
+/* Race with expectations being used means we could have none to find; OK. */
+static void evict_oldest_expect(struct ip_conntrack *master)
 {
-	DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
-	new->expectant = related_to;
-	new->sibling = NULL;
-	atomic_set(&new->use, 1);
-
-	/* add to expected list for this connection */
-	list_add_tail(&new->expected_list, &related_to->sibling_list);
-	/* add to global list of expectations */
-	list_prepend(&ip_conntrack_expect_list, &new->list);
-	/* add and start timer if required */
-	if (related_to->helper->timeout) {
-		init_timer(&new->timeout);
-		new->timeout.data = (unsigned long)new;
-		new->timeout.function = expectation_timed_out;
-		new->timeout.expires = jiffies +
-					related_to->helper->timeout * HZ;
-		add_timer(&new->timeout);
+	struct ip_conntrack_expect *i;
+
+	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+		if (i->master == master) {
+			if (del_timer(&i->timeout)) {
+				unlink_expect(i);
+				destroy_expect(i);
+			}
+			break;
+		}
 	}
-	related_to->expecting++;
 }
 
-/* Add a related connection.
*/ -int ip_conntrack_expect_related(struct ip_conntrack_expect *expect, - struct ip_conntrack *related_to) +static inline int refresh_timer(struct ip_conntrack_expect *i) { - struct ip_conntrack_expect *old; - int ret = 0; + if (!del_timer(&i->timeout)) + return 0; - WRITE_LOCK(&ip_conntrack_lock); - /* Because of the write lock, no reader can walk the lists, - * so there is no need to use the tuple lock too */ + i->timeout.expires = jiffies + i->master->helper->timeout*HZ; + add_timer(&i->timeout); + return 1; +} + +int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) +{ + struct ip_conntrack_expect *i; + int ret; DEBUGP("ip_conntrack_expect_related %p\n", related_to); DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); - old = LIST_FIND(&ip_conntrack_expect_list, resent_expect, - struct ip_conntrack_expect *, &expect->tuple, - &expect->mask); - if (old) { - /* Helper private data may contain offsets but no pointers - pointing into the payload - otherwise we should have to copy - the data filled out by the helper over the old one */ - DEBUGP("expect_related: resent packet\n"); - if (related_to->helper->timeout) { - if (!del_timer(&old->timeout)) { - /* expectation is dying. Fall through */ + WRITE_LOCK(&ip_conntrack_lock); + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + if (expect_matches(i, expect)) { + /* Refresh timer: if it's dying, ignore.. */ + if (refresh_timer(i)) { + ret = 0; + /* We don't need the one they've given us. */ + ip_conntrack_expect_free(expect); goto out; - } else { - old->timeout.expires = jiffies + - related_to->helper->timeout * HZ; - add_timer(&old->timeout); } + } else if (expect_clash(i, expect)) { + ret = -EBUSY; + goto out; } - - WRITE_UNLOCK(&ip_conntrack_lock); - kfree(expect); - return -EEXIST; - - } else if (related_to->helper->max_expected && - related_to->expecting >= related_to->helper->max_expected) { - /* old == NULL */ - if (!(related_to->helper->flags & - IP_CT_HELPER_F_REUSE_EXPECT)) { - WRITE_UNLOCK(&ip_conntrack_lock); - if (net_ratelimit()) - printk(KERN_WARNING - "ip_conntrack: max number of expected " - "connections %i of %s reached for " - "%u.%u.%u.%u->%u.%u.%u.%u\n", - related_to->helper->max_expected, - related_to->helper->name, - NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip), - NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip)); - kfree(expect); - return -EPERM; - } - DEBUGP("ip_conntrack: max number of expected " - "connections %i of %s reached for " - "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n", - related_to->helper->max_expected, - related_to->helper->name, - NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip), - NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip)); - - /* choose the the oldest expectation to evict */ - list_for_each_entry(old, &related_to->sibling_list, - expected_list) - if (old->sibling == NULL) - break; - - /* We cannot fail since related_to->expecting is the number - * of unconfirmed expectations */ - IP_NF_ASSERT(old && old->sibling == NULL); - - /* newnat14 does not reuse the real allocated memory - * structures but rather unexpects the old and - * allocates a new. unexpect_related will decrement - * related_to->expecting. 
- */ - unexpect_related(old); - ret = -EPERM; - } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash, - struct ip_conntrack_expect *, &expect->tuple, - &expect->mask)) { - WRITE_UNLOCK(&ip_conntrack_lock); - DEBUGP("expect_related: busy!\n"); - - kfree(expect); - return -EBUSY; } -out: ip_conntrack_expect_insert(expect, related_to); + /* Will be over limit? */ + if (expect->master->helper->max_expected && + expect->master->expecting >= expect->master->helper->max_expected) + evict_oldest_expect(expect->master); + ip_conntrack_expect_insert(expect); + ret = 0; +out: WRITE_UNLOCK(&ip_conntrack_lock); - - return ret; -} - -/* Change tuple in an existing expectation */ -int ip_conntrack_change_expect(struct ip_conntrack_expect *expect, - struct ip_conntrack_tuple *newtuple) -{ - int ret; - - MUST_BE_READ_LOCKED(&ip_conntrack_lock); - WRITE_LOCK(&ip_conntrack_expect_tuple_lock); - - DEBUGP("change_expect:\n"); - DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple); - DEBUGP("exp mask: "); DUMP_TUPLE(&expect->mask); - DEBUGP("newtuple: "); DUMP_TUPLE(newtuple); - if (expect->ct_tuple.dst.protonum == 0) { - /* Never seen before */ - DEBUGP("change expect: never seen before\n"); - if (!ip_ct_tuple_equal(&expect->tuple, newtuple) - && LIST_FIND(&ip_conntrack_expect_list, expect_clash, - struct ip_conntrack_expect *, newtuple, &expect->mask)) { - /* Force NAT to find an unused tuple */ - ret = -1; - } else { - memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple)); - memcpy(&expect->tuple, newtuple, sizeof(expect->tuple)); - ret = 0; - } - } else { - /* Resent packet */ - DEBUGP("change expect: resent packet\n"); - if (ip_ct_tuple_equal(&expect->tuple, newtuple)) { - ret = 0; - } else { - /* Force NAT to choose again the same port */ - ret = -1; - } - } - WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock); - - return ret; + return ret; } -/* Alter reply tuple (maybe alter helper). If it's already taken, - return 0 and don't do alteration. */ -int ip_conntrack_alter_reply(struct ip_conntrack *conntrack, - const struct ip_conntrack_tuple *newreply) +/* Alter reply tuple (maybe alter helper). This is for NAT, and is + implicitly racy: see __ip_conntrack_confirm */ +void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, + const struct ip_conntrack_tuple *newreply) { WRITE_LOCK(&ip_conntrack_lock); - if (__ip_conntrack_find(newreply, conntrack)) { - WRITE_UNLOCK(&ip_conntrack_lock); - return 0; - } /* Should be unconfirmed, so not in hash table yet */ IP_NF_ASSERT(!is_confirmed(conntrack)); @@ -1118,15 +849,14 @@ int ip_conntrack_alter_reply(struct ip_conntrack *conntrack, DUMP_TUPLE(newreply); conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; - if (!conntrack->master && list_empty(&conntrack->sibling_list)) + if (!conntrack->master && conntrack->expecting == 0) conntrack->helper = ip_ct_find_helper(newreply); WRITE_UNLOCK(&ip_conntrack_lock); - - return 1; } int ip_conntrack_helper_register(struct ip_conntrack_helper *me) { + BUG_ON(me->timeout == 0); WRITE_LOCK(&ip_conntrack_lock); list_prepend(&helpers, me); WRITE_UNLOCK(&ip_conntrack_lock); @@ -1137,24 +867,29 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me) static inline int unhelp(struct ip_conntrack_tuple_hash *i, const struct ip_conntrack_helper *me) { - if (i->ctrack->helper == me) { - /* Get rid of any expected. 
*/ - remove_expectations(i->ctrack, 0); - /* And *then* set helper to NULL */ - i->ctrack->helper = NULL; - } + if (tuplehash_to_ctrack(i)->helper == me) + tuplehash_to_ctrack(i)->helper = NULL; return 0; } void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) { unsigned int i; + struct ip_conntrack_expect *exp, *tmp; /* Need write lock here, to delete helper. */ WRITE_LOCK(&ip_conntrack_lock); LIST_DELETE(&helpers, me); + /* Get rid of expectations */ + list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { + if (exp->master->helper == me && del_timer(&exp->timeout)) { + unlink_expect(exp); + destroy_expect(exp); + } + } /* Get rid of expecteds, set helpers to NULL. */ + LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me); for (i = 0; i < ip_conntrack_htable_size; i++) LIST_FIND_W(&ip_conntrack_hash[i], unhelp, struct ip_conntrack_tuple_hash *, me); @@ -1164,40 +899,59 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) synchronize_net(); } -/* Refresh conntrack for this many jiffies. */ -void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies) +static inline void ct_add_counters(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb) +{ +#ifdef CONFIG_IP_NF_CT_ACCT + if (skb) { + ct->counters[CTINFO2DIR(ctinfo)].packets++; + ct->counters[CTINFO2DIR(ctinfo)].bytes += + ntohs(skb->nh.iph->tot_len); + } +#endif +} + +/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */ +void ip_ct_refresh_acct(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies) { IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct); /* If not in hash table, timer will not be active yet */ - if (!is_confirmed(ct)) + if (!is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; - else { + ct_add_counters(ct, ctinfo, skb); + } else { WRITE_LOCK(&ip_conntrack_lock); /* Need del_timer for race avoidance (may already be dying). */ if (del_timer(&ct->timeout)) { ct->timeout.expires = jiffies + extra_jiffies; add_timer(&ct->timeout); } + ct_add_counters(ct, ctinfo, skb); WRITE_UNLOCK(&ip_conntrack_lock); } } /* Returns new sk_buff, or NULL */ struct sk_buff * -ip_ct_gather_frags(struct sk_buff *skb) +ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) { struct sock *sk = skb->sk; #ifdef CONFIG_NETFILTER_DEBUG unsigned int olddebug = skb->nf_debug; #endif + if (sk) { sock_hold(sk); skb_orphan(skb); } local_bh_disable(); - skb = ip_defrag(skb); + skb = ip_defrag(skb, user); local_bh_enable(); if (!skb) { @@ -1221,66 +975,71 @@ ip_ct_gather_frags(struct sk_buff *skb) } /* Used by ipt_REJECT. 
*/ -static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct) +static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) { struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; - ct = __ip_conntrack_get(nfct, &ctinfo); - - /* This ICMP is in reverse direction to the packet which - caused it */ + /* This ICMP is in reverse direction to the packet which caused it */ + ct = ip_conntrack_get(skb, &ctinfo); + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; else ctinfo = IP_CT_RELATED; - /* Attach new skbuff, and increment count */ - nskb->nfct = &ct->infos[ctinfo]; - atomic_inc(&ct->ct_general.use); + /* Attach to new skbuff, and increment count */ + nskb->nfct = &ct->ct_general; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nskb->nfct); } static inline int -do_kill(const struct ip_conntrack_tuple_hash *i, - int (*kill)(const struct ip_conntrack *i, void *data), +do_iter(const struct ip_conntrack_tuple_hash *i, + int (*iter)(struct ip_conntrack *i, void *data), void *data) { - return kill(i->ctrack, data); + return iter(tuplehash_to_ctrack(i), data); } /* Bring out ya dead! */ static struct ip_conntrack_tuple_hash * -get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data), +get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data), void *data, unsigned int *bucket) { struct ip_conntrack_tuple_hash *h = NULL; - READ_LOCK(&ip_conntrack_lock); - for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) { - h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill, - struct ip_conntrack_tuple_hash *, kill, data); + WRITE_LOCK(&ip_conntrack_lock); + for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { + h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, + struct ip_conntrack_tuple_hash *, iter, data); + if (h) + break; } + if (!h) + h = LIST_FIND_W(&unconfirmed, do_iter, + struct ip_conntrack_tuple_hash *, iter, data); if (h) - atomic_inc(&h->ctrack->ct_general.use); - READ_UNLOCK(&ip_conntrack_lock); + atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); + WRITE_UNLOCK(&ip_conntrack_lock); return h; } void -ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data), - void *data) +ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data) { struct ip_conntrack_tuple_hash *h; unsigned int bucket = 0; - while ((h = get_next_corpse(kill, data, &bucket)) != NULL) { + while ((h = get_next_corpse(iter, data, &bucket)) != NULL) { + struct ip_conntrack *ct = tuplehash_to_ctrack(h); /* Time to push up daises... */ - if (del_timer(&h->ctrack->timeout)) - death_by_timeout((unsigned long)h->ctrack); + if (del_timer(&ct->timeout)) + death_by_timeout((unsigned long)ct); /* ... else the timer will get him soon. 
*/
-		ip_conntrack_put(h->ctrack);
+		ip_conntrack_put(ct);
 	}
 }
 
@@ -1291,7 +1050,7 @@ ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
 static int
 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 {
-	struct inet_opt *inet = inet_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
 	struct ip_conntrack_tuple_hash *h;
 	struct ip_conntrack_tuple tuple;
@@ -1317,16 +1076,17 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 	h = ip_conntrack_find_get(&tuple, NULL);
 	if (h) {
 		struct sockaddr_in sin;
+		struct ip_conntrack *ct = tuplehash_to_ctrack(h);
 
 		sin.sin_family = AF_INET;
-		sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+		sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
 			.tuple.dst.u.tcp.port;
-		sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
 			.tuple.dst.ip;
 
 		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
 		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
-		ip_conntrack_put(h->ctrack);
+		ip_conntrack_put(ct);
 		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
 			return -EFAULT;
 		else
@@ -1345,11 +1105,21 @@ static struct nf_sockopt_ops so_getorigdst = {
 	.get		= &getorigdst,
 };
 
-static int kill_all(const struct ip_conntrack *i, void *data)
+static int kill_all(struct ip_conntrack *i, void *data)
 {
 	return 1;
 }
 
+static void free_conntrack_hash(void)
+{
+	if (ip_conntrack_vmalloc)
+		vfree(ip_conntrack_hash);
+	else
+		free_pages((unsigned long)ip_conntrack_hash,
+			   get_order(sizeof(struct list_head)
+				     * ip_conntrack_htable_size));
+}
+
 /* Mishearing the voices in his head, our hero wonders how he's
    supposed to kill the mall. */
 void ip_conntrack_cleanup(void)
@@ -1361,19 +1131,20 @@ void ip_conntrack_cleanup(void)
 	synchronize_net();
 
  i_see_dead_people:
-	ip_ct_selective_cleanup(kill_all, NULL);
+	ip_ct_iterate_cleanup(kill_all, NULL);
 	if (atomic_read(&ip_conntrack_count) != 0) {
 		schedule();
 		goto i_see_dead_people;
 	}
 
 	kmem_cache_destroy(ip_conntrack_cachep);
-	vfree(ip_conntrack_hash);
+	kmem_cache_destroy(ip_conntrack_expect_cachep);
+	free_conntrack_hash();
 	nf_unregister_sockopt(&so_getorigdst);
 }
 
 static int hashsize;
-MODULE_PARM(hashsize, "i");
+module_param(hashsize, int, 0400);
 
 int __init ip_conntrack_init(void)
 {
@@ -1406,8 +1177,20 @@ int __init ip_conntrack_init(void)
 		return ret;
 	}
 
-	ip_conntrack_hash = vmalloc(sizeof(struct list_head)
-				    * ip_conntrack_htable_size);
+	/* AK: the hash table is twice as big as needed because it
+	   uses list_head. It would be much nicer for caches to use a
+	   single-pointer list head here.
*/ + ip_conntrack_vmalloc = 0; + ip_conntrack_hash + =(void*)__get_free_pages(GFP_KERNEL, + get_order(sizeof(struct list_head) + *ip_conntrack_htable_size)); + if (!ip_conntrack_hash) { + ip_conntrack_vmalloc = 1; + printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n"); + ip_conntrack_hash = vmalloc(sizeof(struct list_head) + * ip_conntrack_htable_size); + } if (!ip_conntrack_hash) { printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); goto err_unreg_sockopt; @@ -1415,17 +1198,28 @@ int __init ip_conntrack_init(void) ip_conntrack_cachep = kmem_cache_create("ip_conntrack", sizeof(struct ip_conntrack), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + 0, NULL, NULL); if (!ip_conntrack_cachep) { printk(KERN_ERR "Unable to create ip_conntrack slab cache\n"); goto err_free_hash; } + + ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect", + sizeof(struct ip_conntrack_expect), + 0, 0, NULL, NULL); + if (!ip_conntrack_expect_cachep) { + printk(KERN_ERR "Unable to create ip_expect slab cache\n"); + goto err_free_conntrack_slab; + } + /* Don't NEED lock here, but good form anyway. */ WRITE_LOCK(&ip_conntrack_lock); + for (i = 0; i < MAX_IP_CT_PROTO; i++) + ip_ct_protos[i] = &ip_conntrack_generic_protocol; /* Sew in builtin protocols. */ - list_append(&protocol_list, &ip_conntrack_protocol_tcp); - list_append(&protocol_list, &ip_conntrack_protocol_udp); - list_append(&protocol_list, &ip_conntrack_protocol_icmp); + ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; + ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; + ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; WRITE_UNLOCK(&ip_conntrack_lock); for (i = 0; i < ip_conntrack_htable_size; i++) @@ -1439,16 +1233,13 @@ int __init ip_conntrack_init(void) atomic_set(&ip_conntrack_untracked.ct_general.use, 1); /* - and look it like as a confirmed connection */ set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); - /* - and prepare the ctinfo field for REJECT & NAT. */ - ip_conntrack_untracked.infos[IP_CT_NEW].master = - ip_conntrack_untracked.infos[IP_CT_RELATED].master = - ip_conntrack_untracked.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master = - &ip_conntrack_untracked.ct_general; return ret; +err_free_conntrack_slab: + kmem_cache_destroy(ip_conntrack_cachep); err_free_hash: - vfree(ip_conntrack_hash); + free_conntrack_hash(); err_unreg_sockopt: nf_unregister_sockopt(&so_getorigdst);
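
A note on the h->ctrack to tuplehash_to_ctrack(h) conversions that run through this patch: the ->ctrack back-pointer is removed from struct ip_conntrack_tuple_hash, and the owning conntrack is instead recovered from the tuplehash's position inside it, which is why ip_ct_get_tuple() and ip_ct_invert_tuple() now fill in dst.dir. The helper itself lives in the companion ip_conntrack.h header rather than in this file; a sketch of its assumed shape, following container_of() semantics:

/* Assumed definition (companion header, not part of this diff): each of
 * the two tuplehashes is embedded in struct ip_conntrack at the index
 * given by its direction, so container_of() walks back to the conntrack
 * without any per-tuplehash pointer. */
static inline struct ip_conntrack *
tuplehash_to_ctrack(const struct ip_conntrack_tuple_hash *hash)
{
	return container_of(hash, struct ip_conntrack,
			    tuplehash[hash->tuple.dst.dir]);
}

The same tuplehash list node is what init_conntrack() and __ip_conntrack_confirm() overload as the linkage for the new unconfirmed list.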
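For orientation, here is roughly how a protocol helper drives the reworked expectation API (ip_conntrack_expect_alloc(), ip_conntrack_expect_related(), ip_conntrack_expect_free()). This is a minimal sketch, not code from the patch: expect_data_channel() and its port parameter are invented for the example, and the error convention is the one visible in ip_conntrack_expect_related() above — 0 on success or on a refreshed duplicate, -EBUSY on a clash, with the caller freeing the unused expectation:

/* Hypothetical helper fragment: expect one related TCP connection
 * to 'port', between the same two hosts as the master conntrack. */
static int expect_data_channel(struct ip_conntrack *ct,
			       enum ip_conntrack_info ctinfo,
			       u_int16_t port)
{
	int dir = CTINFO2DIR(ctinfo);
	struct ip_conntrack_expect *exp = ip_conntrack_expect_alloc();

	if (exp == NULL)
		return NF_DROP;

	memset(&exp->tuple, 0, sizeof(exp->tuple));
	memset(&exp->mask, 0, sizeof(exp->mask));

	/* Any source port from the peer, fixed destination port. */
	exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
	exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
	exp->tuple.dst.u.tcp.port = htons(port);
	exp->tuple.dst.protonum = IPPROTO_TCP;

	exp->mask.src.ip = 0xFFFFFFFF;
	exp->mask.dst.ip = 0xFFFFFFFF;
	exp->mask.dst.u.tcp.port = 0xFFFF;
	exp->mask.dst.protonum = 0xFF;

	exp->expectfn = NULL;	/* no callback when the expectation fires */
	exp->master = ct;	/* expect_insert() takes the reference */

	if (ip_conntrack_expect_related(exp) != 0) {
		ip_conntrack_expect_free(exp);	/* clash: caller frees */
		return NF_DROP;
	}
	return NF_ACCEPT;
}

Note that ip_conntrack_expect_insert() bumps the refcount on exp->master itself, so the helper only sets the pointer; the master's expecting count and the max_expected eviction policy are likewise handled inside ip_conntrack_expect_related().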