X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fipv4%2Fnetfilter%2Fip_conntrack_core.c;h=28d9425d5c390dac7601953e65043e973833e7aa;hb=6a77f38946aaee1cd85eeec6cf4229b204c15071;hp=67caf5f43daf6e37e3c7db03e2949c957b96f8c8;hpb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;p=linux-2.6.git

diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 67caf5f43..28d9425d5 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -58,11 +58,9 @@
 #endif

 DECLARE_RWLOCK(ip_conntrack_lock);
-DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);

 /* ip_conntrack_standalone needs this */
 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
-EXPORT_SYMBOL(ip_conntrack_count);

 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
 LIST_HEAD(ip_conntrack_expect_list);
@@ -75,10 +73,12 @@ static kmem_cache_t *ip_conntrack_cachep;
 static kmem_cache_t *ip_conntrack_expect_cachep;
 struct ip_conntrack ip_conntrack_untracked;
 unsigned int ip_ct_log_invalid;
+static LIST_HEAD(unconfirmed);
+static int ip_conntrack_vmalloc;

 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);

-inline void
+void
 ip_conntrack_put(struct ip_conntrack *ct)
 {
 	IP_NF_ASSERT(ct);
@@ -117,6 +117,7 @@ ip_ct_get_tuple(const struct iphdr *iph,
 	tuple->src.ip = iph->saddr;
 	tuple->dst.ip = iph->daddr;
 	tuple->dst.protonum = iph->protocol;
+	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

 	return protocol->pkt_to_tuple(skb, dataoff, tuple);
 }
@@ -129,135 +130,76 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
 	inverse->src.ip = orig->dst.ip;
 	inverse->dst.ip = orig->src.ip;
 	inverse->dst.protonum = orig->dst.protonum;
+	inverse->dst.dir = !orig->dst.dir;

 	return protocol->invert_tuple(inverse, orig);
 }

 /* ip_conntrack_expect helper functions */
-
-/* Compare tuple parts depending on mask. */
-static inline int expect_cmp(const struct ip_conntrack_expect *i,
-			     const struct ip_conntrack_tuple *tuple)
-{
-	MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
-	return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
-}
-
-static void
-destroy_expect(struct ip_conntrack_expect *exp)
+static void destroy_expect(struct ip_conntrack_expect *exp)
 {
-	DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
-	IP_NF_ASSERT(atomic_read(&exp->use) == 0);
+	ip_conntrack_put(exp->master);
 	IP_NF_ASSERT(!timer_pending(&exp->timeout));
-
 	kmem_cache_free(ip_conntrack_expect_cachep, exp);
 	CONNTRACK_STAT_INC(expect_delete);
 }

-inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
+static void unlink_expect(struct ip_conntrack_expect *exp)
 {
-	IP_NF_ASSERT(exp);
-
-	if (atomic_dec_and_test(&exp->use)) {
-		/* usage count dropped to zero */
-		destroy_expect(exp);
-	}
-}
-
-static inline struct ip_conntrack_expect *
-__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
-{
-	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-	MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
-	return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
-			 struct ip_conntrack_expect *, tuple);
-}
-
-/* Find a expectation corresponding to a tuple. */
-struct ip_conntrack_expect *
-ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
-{
-	struct ip_conntrack_expect *exp;
-
-	READ_LOCK(&ip_conntrack_lock);
-	READ_LOCK(&ip_conntrack_expect_tuple_lock);
-	exp = __ip_ct_expect_find(tuple);
-	if (exp)
-		atomic_inc(&exp->use);
-	READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
-	READ_UNLOCK(&ip_conntrack_lock);
-
-	return exp;
+	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+	list_del(&exp->list);
+	/* Logically in destroy_expect, but we hold the lock here. */
+	exp->master->expecting--;
 }

-/* remove one specific expectation from all lists and drop refcount,
- * does _NOT_ delete the timer. */
-static void __unexpect_related(struct ip_conntrack_expect *expect)
+static void expectation_timed_out(unsigned long ul_expect)
 {
-	DEBUGP("unexpect_related(%p)\n", expect);
-	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
-
-	/* we're not allowed to unexpect a confirmed expectation! */
-	IP_NF_ASSERT(!expect->sibling);
-
-	/* delete from global and local lists */
-	list_del(&expect->list);
-	list_del(&expect->expected_list);
-
-	/* decrement expect-count of master conntrack */
-	if (expect->expectant)
-		expect->expectant->expecting--;
+	struct ip_conntrack_expect *exp = (void *)ul_expect;

-	ip_conntrack_expect_put(expect);
+	WRITE_LOCK(&ip_conntrack_lock);
+	unlink_expect(exp);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+	destroy_expect(exp);
 }

-/* remove one specific expecatation from all lists, drop refcount
- * and expire timer.
- * This function can _NOT_ be called for confirmed expects! */
-static void unexpect_related(struct ip_conntrack_expect *expect)
+/* If an expectation for this connection is found, it gets deleted from
+ * the global list, then returned. */
+static struct ip_conntrack_expect *
+find_expectation(const struct ip_conntrack_tuple *tuple)
 {
-	IP_NF_ASSERT(expect->expectant);
-	IP_NF_ASSERT(expect->expectant->helper);
-	/* if we are supposed to have a timer, but we can't delete
-	 * it: race condition. __unexpect_related will
-	 * be calledd by timeout function */
-	if (expect->expectant->helper->timeout
-	    && !del_timer(&expect->timeout))
-		return;
+	struct ip_conntrack_expect *i;

-	__unexpect_related(expect);
+	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+		/* If master is not in hash table yet (ie. packet hasn't left
		   this machine yet), how can other end know about expected?
		   Hence these are not the droids you are looking for (if
		   master ct never got confirmed, we'd hold a reference to it
		   and weird things would happen to future packets). */
+		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
+		    && is_confirmed(i->master)
+		    && del_timer(&i->timeout)) {
+			unlink_expect(i);
+			return i;
+		}
+	}
+	return NULL;
 }

-/* delete all unconfirmed expectations for this conntrack */
-static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
+/* delete all expectations for this conntrack */
+static void remove_expectations(struct ip_conntrack *ct)
 {
-	struct list_head *exp_entry, *next;
-	struct ip_conntrack_expect *exp;
-
-	DEBUGP("remove_expectations(%p)\n", ct);
+	struct ip_conntrack_expect *i, *tmp;

-	list_for_each_safe(exp_entry, next, &ct->sibling_list) {
-		exp = list_entry(exp_entry, struct ip_conntrack_expect,
-				 expected_list);
+	/* Optimization: most connections never expect any others. */
+	if (ct->expecting == 0)
+		return;

-		/* we skip established expectations, as we want to delete
-		 * the un-established ones only */
-		if (exp->sibling) {
-			DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
-			if (drop_refcount) {
-				/* Indicate that this expectations parent is dead */
-				ip_conntrack_put(exp->expectant);
-				exp->expectant = NULL;
-			}
-			continue;
+	list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
+		if (i->master == ct && del_timer(&i->timeout)) {
+			unlink_expect(i);
+			destroy_expect(i);
 		}
-
-		IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
-		IP_NF_ASSERT(exp->expectant == ct);
-
-		/* delete expectation from global and private lists */
-		unexpect_related(exp);
 	}
 }
@@ -274,14 +216,14 @@ clean_from_lists(struct ip_conntrack *ct)
 	LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
 	LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

-	/* Destroy all un-established, pending expectations */
-	remove_expectations(ct, 1);
+	/* Destroy all pending expectations */
+	remove_expectations(ct);
 }

 static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
-	struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
+	struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
 	struct ip_conntrack_protocol *proto;

 	DEBUGP("destroy_conntrack(%p)\n", ct);
@@ -299,38 +241,37 @@ destroy_conntrack(struct nf_conntrack *nfct)
 		ip_conntrack_destroyed(ct);

 	WRITE_LOCK(&ip_conntrack_lock);
-	/* Make sure don't leave any orphaned expectations lying around */
-	if (ct->expecting)
-		remove_expectations(ct, 1);
-
-	/* Delete our master expectation */
-	if (ct->master) {
-		if (ct->master->expectant) {
-			/* can't call __unexpect_related here,
-			 * since it would screw up expect_list */
-			list_del(&ct->master->expected_list);
-			master = ct->master->expectant;
-		}
-		kmem_cache_free(ip_conntrack_expect_cachep, ct->master);
+	/* Expectations will have been removed in clean_from_lists,
+	 * except TFTP can create an expectation on the first packet,
+	 * before connection is in the list, so we need to clean here,
+	 * too. */
+	remove_expectations(ct);
+
+	/* We overload first tuple to link into unconfirmed list. */
+	if (!is_confirmed(ct)) {
+		BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
+		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
 	}
+
+	CONNTRACK_STAT_INC(delete);
 	WRITE_UNLOCK(&ip_conntrack_lock);

-	if (master)
-		ip_conntrack_put(master);
+	if (ct->master)
+		ip_conntrack_put(ct->master);

 	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
 	kmem_cache_free(ip_conntrack_cachep, ct);
 	atomic_dec(&ip_conntrack_count);
-	CONNTRACK_STAT_INC(delete);
 }

 static void death_by_timeout(unsigned long ul_conntrack)
 {
 	struct ip_conntrack *ct = (void *)ul_conntrack;

-	CONNTRACK_STAT_INC(delete_list);
-
 	WRITE_LOCK(&ip_conntrack_lock);
+	/* Inside lock so preempt is disabled on module removal path.
+	 * Otherwise we can get spurious warnings. */
+	CONNTRACK_STAT_INC(delete_list);
 	clean_from_lists(ct);
 	WRITE_UNLOCK(&ip_conntrack_lock);
 	ip_conntrack_put(ct);
@@ -342,7 +283,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
 		    const struct ip_conntrack *ignored_conntrack)
 {
 	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-	return i->ctrack != ignored_conntrack
+	return tuplehash_to_ctrack(i) != ignored_conntrack
 		&& ip_ct_tuple_equal(tuple, &i->tuple);
 }

@@ -352,16 +293,14 @@ __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
 {
 	struct ip_conntrack_tuple_hash *h;
 	unsigned int hash = hash_conntrack(tuple);
-	/* use per_cpu() to avoid multiple calls to smp_processor_id() */
-	unsigned int cpu = smp_processor_id();

 	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
 	list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
 		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
-			per_cpu(ip_conntrack_stat, cpu).found++;
+			CONNTRACK_STAT_INC(found);
 			return h;
 		}
-		per_cpu(ip_conntrack_stat, cpu).searched++;
+		CONNTRACK_STAT_INC(searched);
 	}

 	return NULL;
@@ -377,7 +316,7 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
 	READ_LOCK(&ip_conntrack_lock);
 	h = __ip_conntrack_find(tuple, ignored_conntrack);
 	if (h)
-		atomic_inc(&h->ctrack->ct_general.use);
+		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
 	READ_UNLOCK(&ip_conntrack_lock);

 	return h;
@@ -385,13 +324,13 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,

 /* Confirm a connection given skb; places it in hash table */
 int
-__ip_conntrack_confirm(struct sk_buff *skb)
+__ip_conntrack_confirm(struct sk_buff **pskb)
 {
 	unsigned int hash, repl_hash;
 	struct ip_conntrack *ct;
 	enum ip_conntrack_info ctinfo;

-	ct = ip_conntrack_get(skb, &ctinfo);
+	ct = ip_conntrack_get(*pskb, &ctinfo);

 	/* ipt_REJECT uses ip_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
@@ -414,6 +353,7 @@ __ip_conntrack_confirm(struct sk_buff *skb)
 	DEBUGP("Confirming conntrack %p\n", ct);

 	WRITE_LOCK(&ip_conntrack_lock);
+
 	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
@@ -425,6 +365,9 @@ __ip_conntrack_confirm(struct sk_buff *skb)
 		      conntrack_tuple_cmp,
 		      struct ip_conntrack_tuple_hash *,
 		      &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
+		/* Remove from unconfirmed list */
+		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+
 		list_prepend(&ip_conntrack_hash[hash],
 			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
 		list_prepend(&ip_conntrack_hash[repl_hash],
@@ -436,13 +379,14 @@ __ip_conntrack_confirm(struct sk_buff *skb)
 		add_timer(&ct->timeout);
 		atomic_inc(&ct->ct_general.use);
 		set_bit(IPS_CONFIRMED_BIT, &ct->status);
-		WRITE_UNLOCK(&ip_conntrack_lock);
 		CONNTRACK_STAT_INC(insert);
+		WRITE_UNLOCK(&ip_conntrack_lock);
 		return NF_ACCEPT;
 	}

-	WRITE_UNLOCK(&ip_conntrack_lock);
 	CONNTRACK_STAT_INC(insert_failed);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
 	return NF_DROP;
 }

@@ -465,30 +409,33 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
    connection.  Too bad: we're in trouble anyway. */
 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
 {
-	return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
+	return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
 }

 static int early_drop(struct list_head *chain)
 {
 	/* Traverse backwards: gives us oldest, which is roughly LRU */
 	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack *ct = NULL;
 	int dropped = 0;

 	READ_LOCK(&ip_conntrack_lock);
 	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
-	if (h)
-		atomic_inc(&h->ctrack->ct_general.use);
+	if (h) {
+		ct = tuplehash_to_ctrack(h);
+		atomic_inc(&ct->ct_general.use);
+	}
 	READ_UNLOCK(&ip_conntrack_lock);

-	if (!h)
+	if (!ct)
 		return dropped;

-	if (del_timer(&h->ctrack->timeout)) {
-		death_by_timeout((unsigned long)h->ctrack);
+	if (del_timer(&ct->timeout)) {
+		death_by_timeout((unsigned long)ct);
 		dropped = 1;
 		CONNTRACK_STAT_INC(early_drop);
 	}

-	ip_conntrack_put(h->ctrack);
+	ip_conntrack_put(ct);
 	return dropped;
 }

@@ -498,7 +445,7 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i,
 	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
 }

-struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
 {
 	return LIST_FIND(&helpers, helper_cmp,
 			 struct ip_conntrack_helper *,
 			 tuple);
@@ -515,7 +462,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	struct ip_conntrack *conntrack;
 	struct ip_conntrack_tuple repl_tuple;
 	size_t hash;
-	struct ip_conntrack_expect *expected;
+	struct ip_conntrack_expect *exp;

 	if (!ip_conntrack_hash_rnd_initted) {
 		get_random_bytes(&ip_conntrack_hash_rnd, 4);
@@ -551,9 +498,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	atomic_set(&conntrack->ct_general.use, 1);
 	conntrack->ct_general.destroy = destroy_conntrack;
 	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
-	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
 	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
-	conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
 	if (!protocol->new(conntrack, skb)) {
 		kmem_cache_free(ip_conntrack_cachep, conntrack);
 		return NULL;
@@ -563,62 +508,39 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	conntrack->timeout.data = (unsigned long)conntrack;
 	conntrack->timeout.function = death_by_timeout;

-	INIT_LIST_HEAD(&conntrack->sibling_list);
-
 	WRITE_LOCK(&ip_conntrack_lock);
-	/* Need finding and deleting of expected ONLY if we win race */
-	READ_LOCK(&ip_conntrack_expect_tuple_lock);
-	expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
-			     struct ip_conntrack_expect *, tuple);
-	READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
-
-	if (expected) {
-		/* If master is not in hash table yet (ie. packet hasn't left
		   this machine yet), how can other end know about expected?
		   Hence these are not the droids you are looking for (if
		   master ct never got confirmed, we'd hold a reference to it
		   and weird things would happen to future packets). */
-		if (!is_confirmed(expected->expectant)) {
-			conntrack->helper = ip_ct_find_helper(&repl_tuple);
-			goto end;
-		}
-
-		/* Expectation is dying... */
-		if (expected->expectant->helper->timeout
-		    && !del_timer(&expected->timeout))
-			goto end;
+	exp = find_expectation(tuple);

+	if (exp) {
 		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
-			conntrack, expected);
+			conntrack, exp);
 		/* Welcome, Mr. Bond.  We've been expecting you... */
-		IP_NF_ASSERT(expected->expectant);
 		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
-		conntrack->master = expected;
-		expected->sibling = conntrack;
-		LIST_DELETE(&ip_conntrack_expect_list, expected);
-		expected->expectant->expecting--;
-		nf_conntrack_get(&master_ct(conntrack)->ct_general);
-
-		/* this is a braindead... --pablo */
-		atomic_inc(&ip_conntrack_count);
-		WRITE_UNLOCK(&ip_conntrack_lock);
-
-		if (expected->expectfn)
-			expected->expectfn(conntrack);
-
+		conntrack->master = exp->master;
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
+		conntrack->mark = exp->master->mark;
+#endif
+		nf_conntrack_get(&conntrack->master->ct_general);
 		CONNTRACK_STAT_INC(expect_new);
-
-		goto ret;
-	} else {
+	} else {
 		conntrack->helper = ip_ct_find_helper(&repl_tuple);
 		CONNTRACK_STAT_INC(new);
 	}

-end:	atomic_inc(&ip_conntrack_count);
+	/* Overload tuple linked list to put us in unconfirmed list. */
+	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
+
+	atomic_inc(&ip_conntrack_count);
 	WRITE_UNLOCK(&ip_conntrack_lock);

-ret:	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
+	if (exp) {
+		if (exp->expectfn)
+			exp->expectfn(conntrack, exp);
+		destroy_expect(exp);
+	}
+
+	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
 }

 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
@@ -631,6 +553,7 @@ resolve_normal_ct(struct sk_buff *skb,
 {
 	struct ip_conntrack_tuple tuple;
 	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack *ct;

 	IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

@@ -647,6 +570,7 @@ resolve_normal_ct(struct sk_buff *skb,
 		if (IS_ERR(h))
 			return (void *)h;
 	}
+	ct = tuplehash_to_ctrack(h);

 	/* It exists; we have (non-exclusive) reference. */
 	if (DIRECTION(h) == IP_CT_DIR_REPLY) {
@@ -655,24 +579,24 @@ resolve_normal_ct(struct sk_buff *skb,
 		*set_reply = 1;
 	} else {
 		/* Once we've had two way comms, always ESTABLISHED. */
-		if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
+		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
 			DEBUGP("ip_conntrack_in: normal packet for %p\n",
-			       h->ctrack);
+			       ct);
 			*ctinfo = IP_CT_ESTABLISHED;
-		} else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
+		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
 			DEBUGP("ip_conntrack_in: related packet for %p\n",
-			       h->ctrack);
+			       ct);
 			*ctinfo = IP_CT_RELATED;
 		} else {
 			DEBUGP("ip_conntrack_in: new packet for %p\n",
-			       h->ctrack);
+			       ct);
 			*ctinfo = IP_CT_NEW;
 		}
 		*set_reply = 0;
 	}
-	skb->nfct = &h->ctrack->ct_general;
+	skb->nfct = &ct->ct_general;
 	skb->nfctinfo = *ctinfo;
-	return h->ctrack;
+	return ct;
 }

 /* Netfilter hook itself. */
@@ -757,16 +681,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 		return -ret;
 	}

-	if (ret != NF_DROP && ct->helper) {
-		ret = ct->helper->help(*pskb, ct, ctinfo);
-		if (ret == -1) {
-			/* Invalid */
-			CONNTRACK_STAT_INC(invalid);
-			nf_conntrack_put((*pskb)->nfct);
-			(*pskb)->nfct = NULL;
-			return NF_ACCEPT;
-		}
-	}
 	if (set_reply)
 		set_bit(IPS_SEEN_REPLY_BIT, &ct->status);

@@ -780,55 +694,49 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse,
 				   ip_ct_find_proto(orig->dst.protonum));
 }

-static inline int resent_expect(const struct ip_conntrack_expect *i,
-				const struct ip_conntrack_tuple *tuple,
-				const struct ip_conntrack_tuple *mask)
-{
-	DEBUGP("resent_expect\n");
-	DEBUGP(" tuple:   "); DUMP_TUPLE(&i->tuple);
-	DEBUGP("ct_tuple: "); DUMP_TUPLE(&i->ct_tuple);
-	DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
-	return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
-		 || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
-		&& ip_ct_tuple_equal(&i->mask, mask));
-}
-
 /* Would two expected things clash? */
-static inline int expect_clash(const struct ip_conntrack_expect *i,
-			       const struct ip_conntrack_tuple *tuple,
-			       const struct ip_conntrack_tuple *mask)
+static inline int expect_clash(const struct ip_conntrack_expect *a,
+			       const struct ip_conntrack_expect *b)
 {
 	/* Part covered by intersection of masks must be unequal,
	   otherwise they clash */
 	struct ip_conntrack_tuple intersect_mask
-		= { { i->mask.src.ip & mask->src.ip,
-		      { i->mask.src.u.all & mask->src.u.all } },
-		    { i->mask.dst.ip & mask->dst.ip,
-		      { i->mask.dst.u.all & mask->dst.u.all },
-		      i->mask.dst.protonum & mask->dst.protonum } };
+		= { { a->mask.src.ip & b->mask.src.ip,
+		      { a->mask.src.u.all & b->mask.src.u.all } },
+		    { a->mask.dst.ip & b->mask.dst.ip,
+		      { a->mask.dst.u.all & b->mask.dst.u.all },
+		      a->mask.dst.protonum & b->mask.dst.protonum } };

-	return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
+	return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
 }

-inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
+static inline int expect_matches(const struct ip_conntrack_expect *a,
+				 const struct ip_conntrack_expect *b)
 {
-	WRITE_LOCK(&ip_conntrack_lock);
-	unexpect_related(expect);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	return a->master == b->master
+	       && ip_ct_tuple_equal(&a->tuple, &b->tuple)
+	       && ip_ct_tuple_equal(&a->mask, &b->mask);
 }
-
-static void expectation_timed_out(unsigned long ul_expect)
+
+/* Generally a bad idea to call this: could have matched already. */
+void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
 {
-	struct ip_conntrack_expect *expect = (void *) ul_expect;
+	struct ip_conntrack_expect *i;

-	DEBUGP("expectation %p timed out\n", expect);
 	WRITE_LOCK(&ip_conntrack_lock);
-	__unexpect_related(expect);
+	/* choose the oldest expectation to evict */
+	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
+			unlink_expect(i);
+			WRITE_UNLOCK(&ip_conntrack_lock);
+			destroy_expect(i);
+			return;
+		}
+	}
 	WRITE_UNLOCK(&ip_conntrack_lock);
 }

-struct ip_conntrack_expect *
-ip_conntrack_expect_alloc(void)
+struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
 {
 	struct ip_conntrack_expect *new;

@@ -837,190 +745,103 @@ ip_conntrack_expect_alloc(void)
 		DEBUGP("expect_related: OOM allocating expect\n");
 		return NULL;
 	}
-
-	/* tuple_cmp compares whole union, we have to initialized cleanly */
-	memset(new, 0, sizeof(struct ip_conntrack_expect));
-	atomic_set(&new->use, 1);
-
+	new->master = NULL;
 	return new;
 }

-static void
-ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
-			   struct ip_conntrack *related_to)
+void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
 {
-	DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
-	new->expectant = related_to;
-	new->sibling = NULL;
-
-	/* add to expected list for this connection */
-	list_add_tail(&new->expected_list, &related_to->sibling_list);
-	/* add to global list of expectations */
-	list_prepend(&ip_conntrack_expect_list, &new->list);
-	/* add and start timer if required */
-	if (related_to->helper->timeout) {
-		init_timer(&new->timeout);
-		new->timeout.data = (unsigned long)new;
-		new->timeout.function = expectation_timed_out;
-		new->timeout.expires = jiffies +
-			related_to->helper->timeout * HZ;
-		add_timer(&new->timeout);
-	}
-	related_to->expecting++;
+	kmem_cache_free(ip_conntrack_expect_cachep, expect);
 }

-/* Add a related connection. */
-int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
-				struct ip_conntrack *related_to)
+static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
 {
-	struct ip_conntrack_expect *old;
-	int ret = 0;
+	atomic_inc(&exp->master->ct_general.use);
+	exp->master->expecting++;
+	list_add(&exp->list, &ip_conntrack_expect_list);
+
+	if (exp->master->helper->timeout) {
+		init_timer(&exp->timeout);
+		exp->timeout.data = (unsigned long)exp;
+		exp->timeout.function = expectation_timed_out;
+		exp->timeout.expires
+			= jiffies + exp->master->helper->timeout * HZ;
+		add_timer(&exp->timeout);
+	} else
+		exp->timeout.function = NULL;

-	WRITE_LOCK(&ip_conntrack_lock);
-	/* Because of the write lock, no reader can walk the lists,
-	 * so there is no need to use the tuple lock too */
+	CONNTRACK_STAT_INC(expect_create);
+}

-	DEBUGP("ip_conntrack_expect_related %p\n", related_to);
-	DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
-	DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
+/* Race with expectations being used means we could have none to find; OK. */
+static void evict_oldest_expect(struct ip_conntrack *master)
 {
 	struct ip_conntrack_expect *i;

-	old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
-			struct ip_conntrack_expect *, &expect->tuple,
-			&expect->mask);
-	if (old) {
-		/* Helper private data may contain offsets but no pointers
		   pointing into the payload - otherwise we should have to copy
		   the data filled out by the helper over the old one */
-		DEBUGP("expect_related: resent packet\n");
-		if (related_to->helper->timeout) {
-			if (!del_timer(&old->timeout)) {
-				/* expectation is dying. Fall through */
-				goto out;
-			} else {
-				old->timeout.expires = jiffies +
-					related_to->helper->timeout * HZ;
-				add_timer(&old->timeout);
+	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+		if (i->master == master) {
+			if (del_timer(&i->timeout)) {
+				unlink_expect(i);
+				destroy_expect(i);
 			}
+			break;
 		}
-
-		WRITE_UNLOCK(&ip_conntrack_lock);
-		/* This expectation is not inserted so no need to lock */
-		kmem_cache_free(ip_conntrack_expect_cachep, expect);
-		return -EEXIST;
-
-	} else if (related_to->helper->max_expected &&
-		   related_to->expecting >= related_to->helper->max_expected) {
-		/* old == NULL */
-		if (!(related_to->helper->flags &
-		      IP_CT_HELPER_F_REUSE_EXPECT)) {
-			WRITE_UNLOCK(&ip_conntrack_lock);
-			if (net_ratelimit())
-				printk(KERN_WARNING
-				       "ip_conntrack: max number of expected "
-				       "connections %i of %s reached for "
-				       "%u.%u.%u.%u->%u.%u.%u.%u\n",
-				       related_to->helper->max_expected,
-				       related_to->helper->name,
-				       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
-				       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
-			kmem_cache_free(ip_conntrack_expect_cachep, expect);
-			return -EPERM;
-		}
-		DEBUGP("ip_conntrack: max number of expected "
-		       "connections %i of %s reached for "
-		       "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
-		       related_to->helper->max_expected,
-		       related_to->helper->name,
-		       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
-		       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
-
-		/* choose the the oldest expectation to evict */
-		list_for_each_entry(old, &related_to->sibling_list,
-				    expected_list)
-			if (old->sibling == NULL)
-				break;
-
-		/* We cannot fail since related_to->expecting is the number
-		 * of unconfirmed expectations */
-		IP_NF_ASSERT(old && old->sibling == NULL);
-
-		/* newnat14 does not reuse the real allocated memory
-		 * structures but rather unexpects the old and
-		 * allocates a new.  unexpect_related will decrement
-		 * related_to->expecting.
-		 */
-		unexpect_related(old);
-		ret = -EPERM;
-	} else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
-			     struct ip_conntrack_expect *, &expect->tuple,
-			     &expect->mask)) {
-		WRITE_UNLOCK(&ip_conntrack_lock);
-		DEBUGP("expect_related: busy!\n");
-
-		kmem_cache_free(ip_conntrack_expect_cachep, expect);
-		return -EBUSY;
-	}
+static inline int refresh_timer(struct ip_conntrack_expect *i)
+{
+	if (!del_timer(&i->timeout))
+		return 0;

-out:	ip_conntrack_expect_insert(expect, related_to);
-
-	WRITE_UNLOCK(&ip_conntrack_lock);
-
-	CONNTRACK_STAT_INC(expect_create);
+	i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
+	add_timer(&i->timeout);
+	return 1;
+}

-	return ret;
+int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
 {
+	struct ip_conntrack_expect *i;
 	int ret;

-	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-	WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
-
-	DEBUGP("change_expect:\n");
-	DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
-	DEBUGP("exp mask: "); DUMP_TUPLE(&expect->mask);
-	DEBUGP("newtuple: "); DUMP_TUPLE(newtuple);
-	if (expect->ct_tuple.dst.protonum == 0) {
-		/* Never seen before */
-		DEBUGP("change expect: never seen before\n");
-		if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
-		    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
-				 struct ip_conntrack_expect *, newtuple, &expect->mask)) {
-			/* Force NAT to find an unused tuple */
-			ret = -1;
-		} else {
-			memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
-			memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
-			ret = 0;
-		}
-	} else {
-		/* Resent packet */
-		DEBUGP("change expect: resent packet\n");
-		if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
-			ret = 0;
-		} else {
-			/* Force NAT to choose again the same port */
-			ret = -1;
+	DEBUGP("ip_conntrack_expect_related %p\n", expect);
+	DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
+	DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+		if (expect_matches(i, expect)) {
+			/* Refresh timer: if it's dying, ignore.. */
+			if (refresh_timer(i)) {
+				ret = 0;
+				/* We don't need the one they've given us. */
+				ip_conntrack_expect_free(expect);
+				goto out;
+			}
+		} else if (expect_clash(i, expect)) {
+			ret = -EBUSY;
+			goto out;
 		}
 	}
-	WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
-
-	return ret;
+
+	/* Will be over limit? */
+	if (expect->master->helper->max_expected &&
+	    expect->master->expecting >= expect->master->helper->max_expected)
+		evict_oldest_expect(expect->master);
+
+	ip_conntrack_expect_insert(expect);
+	ret = 0;
+out:
+	WRITE_UNLOCK(&ip_conntrack_lock);
+	return ret;
 }

-/* Alter reply tuple (maybe alter helper). If it's already taken,
-   return 0 and don't do alteration. */
-int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
-			     const struct ip_conntrack_tuple *newreply)
+/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
+   implicitly racy: see __ip_conntrack_confirm */
+void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
+			      const struct ip_conntrack_tuple *newreply)
 {
 	WRITE_LOCK(&ip_conntrack_lock);
-	if (__ip_conntrack_find(newreply, conntrack)) {
-		WRITE_UNLOCK(&ip_conntrack_lock);
-		return 0;
-	}
 	/* Should be unconfirmed, so not in hash table yet */
 	IP_NF_ASSERT(!is_confirmed(conntrack));

@@ -1028,15 +849,14 @@ int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
 	DUMP_TUPLE(newreply);

 	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
-	if (!conntrack->master && list_empty(&conntrack->sibling_list))
+	if (!conntrack->master && conntrack->expecting == 0)
 		conntrack->helper = ip_ct_find_helper(newreply);
 	WRITE_UNLOCK(&ip_conntrack_lock);
-
-	return 1;
 }

 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
 {
+	BUG_ON(me->timeout == 0);
 	WRITE_LOCK(&ip_conntrack_lock);
 	list_prepend(&helpers, me);
 	WRITE_UNLOCK(&ip_conntrack_lock);
@@ -1047,24 +867,29 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
 			 const struct ip_conntrack_helper *me)
 {
-	if (i->ctrack->helper == me) {
-		/* Get rid of any expected. */
-		remove_expectations(i->ctrack, 0);
-		/* And *then* set helper to NULL */
-		i->ctrack->helper = NULL;
-	}
+	if (tuplehash_to_ctrack(i)->helper == me)
+		tuplehash_to_ctrack(i)->helper = NULL;
 	return 0;
 }

 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
 {
 	unsigned int i;
+	struct ip_conntrack_expect *exp, *tmp;

 	/* Need write lock here, to delete helper. */
 	WRITE_LOCK(&ip_conntrack_lock);
 	LIST_DELETE(&helpers, me);
+	/* Get rid of expectations */
+	list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
+		if (exp->master->helper == me && del_timer(&exp->timeout)) {
+			unlink_expect(exp);
+			destroy_expect(exp);
+		}
+	}
 	/* Get rid of expecteds, set helpers to NULL. */
+	LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
 	for (i = 0; i < ip_conntrack_htable_size; i++)
 		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
 			    struct ip_conntrack_tuple_hash *, me);
@@ -1111,29 +936,22 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
 	}
 }

-int ip_ct_no_defrag;
-
 /* Returns new sk_buff, or NULL */
 struct sk_buff *
-ip_ct_gather_frags(struct sk_buff *skb)
+ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
 {
 	struct sock *sk = skb->sk;
 #ifdef CONFIG_NETFILTER_DEBUG
 	unsigned int olddebug = skb->nf_debug;
 #endif

-	if (unlikely(ip_ct_no_defrag)) {
-		kfree_skb(skb);
-		return NULL;
-	}
-
 	if (sk) {
 		sock_hold(sk);
 		skb_orphan(skb);
 	}

 	local_bh_disable();
-	skb = ip_defrag(skb);
+	skb = ip_defrag(skb, user);
 	local_bh_enable();

 	if (!skb) {
@@ -1177,46 +995,51 @@ static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
 }

 static inline int
-do_kill(const struct ip_conntrack_tuple_hash *i,
-	int (*kill)(const struct ip_conntrack *i, void *data),
+do_iter(const struct ip_conntrack_tuple_hash *i,
+	int (*iter)(struct ip_conntrack *i, void *data),
 	void *data)
 {
-	return kill(i->ctrack, data);
+	return iter(tuplehash_to_ctrack(i), data);
 }

 /* Bring out ya dead! */
 static struct ip_conntrack_tuple_hash *
-get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
+get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
 		void *data, unsigned int *bucket)
 {
 	struct ip_conntrack_tuple_hash *h = NULL;

-	READ_LOCK(&ip_conntrack_lock);
-	for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
-		h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
-			      struct ip_conntrack_tuple_hash *, kill, data);
+	WRITE_LOCK(&ip_conntrack_lock);
+	for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
+		h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
+				struct ip_conntrack_tuple_hash *, iter, data);
+		if (h)
+			break;
 	}
+	if (!h)
+		h = LIST_FIND_W(&unconfirmed, do_iter,
+				struct ip_conntrack_tuple_hash *, iter, data);
 	if (h)
-		atomic_inc(&h->ctrack->ct_general.use);
-	READ_UNLOCK(&ip_conntrack_lock);
+		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
+	WRITE_UNLOCK(&ip_conntrack_lock);

 	return h;
 }

 void
-ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
-			void *data)
+ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
 {
 	struct ip_conntrack_tuple_hash *h;
 	unsigned int bucket = 0;

-	while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
+	while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
+		struct ip_conntrack *ct = tuplehash_to_ctrack(h);
 		/* Time to push up daises... */
-		if (del_timer(&h->ctrack->timeout))
-			death_by_timeout((unsigned long)h->ctrack);
+		if (del_timer(&ct->timeout))
+			death_by_timeout((unsigned long)ct);
 		/* ... else the timer will get him soon. */

-		ip_conntrack_put(h->ctrack);
+		ip_conntrack_put(ct);
 	}
 }

@@ -1227,7 +1050,7 @@ ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
 static int
 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 {
-	struct inet_opt *inet = inet_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
 	struct ip_conntrack_tuple_hash *h;
 	struct ip_conntrack_tuple tuple;

@@ -1253,16 +1076,17 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 	h = ip_conntrack_find_get(&tuple, NULL);
 	if (h) {
 		struct sockaddr_in sin;
+		struct ip_conntrack *ct = tuplehash_to_ctrack(h);

 		sin.sin_family = AF_INET;
-		sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+		sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
 			.tuple.dst.u.tcp.port;
-		sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
 			.tuple.dst.ip;

 		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
 		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
-		ip_conntrack_put(h->ctrack);
+		ip_conntrack_put(ct);
 		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
 			return -EFAULT;
 		else
@@ -1281,11 +1105,21 @@ static struct nf_sockopt_ops so_getorigdst = {
 	.get		= &getorigdst,
 };

-static int kill_all(const struct ip_conntrack *i, void *data)
+static int kill_all(struct ip_conntrack *i, void *data)
 {
 	return 1;
 }

+static void free_conntrack_hash(void)
+{
+	if (ip_conntrack_vmalloc)
+		vfree(ip_conntrack_hash);
+	else
+		free_pages((unsigned long)ip_conntrack_hash,
+			   get_order(sizeof(struct list_head)
+				     * ip_conntrack_htable_size));
+}
+
 /* Mishearing the voices in his head, our hero wonders how he's
    supposed to kill the mall. */
 void ip_conntrack_cleanup(void)
@@ -1297,7 +1131,7 @@ void ip_conntrack_cleanup(void)
 	synchronize_net();

 i_see_dead_people:
-	ip_ct_selective_cleanup(kill_all, NULL);
+	ip_ct_iterate_cleanup(kill_all, NULL);
 	if (atomic_read(&ip_conntrack_count) != 0) {
 		schedule();
 		goto i_see_dead_people;
@@ -1305,7 +1139,7 @@ void ip_conntrack_cleanup(void)

 	kmem_cache_destroy(ip_conntrack_cachep);
 	kmem_cache_destroy(ip_conntrack_expect_cachep);
-	vfree(ip_conntrack_hash);
+	free_conntrack_hash();
 	nf_unregister_sockopt(&so_getorigdst);
 }

@@ -1343,8 +1177,20 @@ int __init ip_conntrack_init(void)
 		return ret;
 	}

-	ip_conntrack_hash = vmalloc(sizeof(struct list_head)
-				    * ip_conntrack_htable_size);
+	/* AK: the hash table is twice as big as needed because it
	   uses list_head.  It would be much nicer for caches to use a
	   single-pointer list head here. */
+	ip_conntrack_vmalloc = 0;
+	ip_conntrack_hash
+		= (void *)__get_free_pages(GFP_KERNEL,
+					   get_order(sizeof(struct list_head)
+						     * ip_conntrack_htable_size));
+	if (!ip_conntrack_hash) {
+		ip_conntrack_vmalloc = 1;
+		printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
+		ip_conntrack_hash = vmalloc(sizeof(struct list_head)
+					    * ip_conntrack_htable_size);
+	}
 	if (!ip_conntrack_hash) {
 		printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
 		goto err_unreg_sockopt;
@@ -1352,7 +1198,7 @@ int __init ip_conntrack_init(void)

 	ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
 						sizeof(struct ip_conntrack), 0,
-						SLAB_HWCACHE_ALIGN, NULL, NULL);
+						0, NULL, NULL);
 	if (!ip_conntrack_cachep) {
 		printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
 		goto err_free_hash;
@@ -1360,7 +1206,7 @@ int __init ip_conntrack_init(void)
 	ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
 					sizeof(struct ip_conntrack_expect),
-					0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+					0, 0, NULL, NULL);
 	if (!ip_conntrack_expect_cachep) {
 		printk(KERN_ERR "Unable to create ip_expect slab cache\n");
 		goto err_free_conntrack_slab;
@@ -1393,7 +1239,7 @@ int __init ip_conntrack_init(void)
 err_free_conntrack_slab:
 	kmem_cache_destroy(ip_conntrack_cachep);
 err_free_hash:
-	free_conntrack_hash();
err_unreg_sockopt:
	nf_unregister_sockopt(&so_getorigdst);
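
For orientation, the reworked expectation API this patch introduces (ip_conntrack_expect_alloc / ip_conntrack_expect_related / ip_conntrack_expect_free, with exp->master replacing the old expectant/sibling links) is driven by conntrack helpers roughly as sketched below. This is an illustrative sketch only, not part of the patch: example_help, the expected TCP port, and the trimmed error handling are hypothetical stand-ins modeled on the in-tree FTP helper of the same kernel generation.

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>

/* Hypothetical helper callback showing the new expect lifecycle. */
static int example_help(struct sk_buff **pskb,
			struct ip_conntrack *ct,
			enum ip_conntrack_info ctinfo)
{
	struct ip_conntrack_expect *exp;

	/* Note: the slab object is no longer memset to zero by
	 * expect_alloc, so every field used must be set explicitly. */
	exp = ip_conntrack_expect_alloc();
	if (exp == NULL)
		return NF_DROP;

	/* Expect a related connection matching the reply direction,
	 * pinned to an example destination port. */
	exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
	exp->tuple.dst.u.tcp.port = htons(2000);	/* example only */
	exp->mask = ((struct ip_conntrack_tuple)
		{ { 0xFFFFFFFF, { 0 } },
		  { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF } });
	exp->expectfn = NULL;
	exp->master = ct;	/* expect_related takes the master ref */

	/* On success the expectation (and its timer, if the helper has
	 * a timeout) now lives on ip_conntrack_expect_list; on failure
	 * (e.g. clash with another helper's expectation) we still own
	 * the object and must free it ourselves. */
	if (ip_conntrack_expect_related(exp) != 0) {
		ip_conntrack_expect_free(exp);
		return NF_DROP;
	}
	return NF_ACCEPT;
}

A duplicate expectation from a retransmitted packet is handled inside ip_conntrack_expect_related itself (the refresh_timer path above), which is why the caller can unconditionally hand over a freshly allocated object.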