#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
-/* For ERR_PTR(). Yeah, I know... --RR */
-#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/moduleparam.h>
/* This rwlock protects the main hash table, protocol/helper/expected
registrations, conntrack timers*/
#endif
DECLARE_RWLOCK(ip_conntrack_lock);
-DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
+
+/* ip_conntrack_standalone needs this */
+atomic_t ip_conntrack_count = ATOMIC_INIT(0);
void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
-LIST_HEAD(protocol_list);
+struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
-static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
+static kmem_cache_t *ip_conntrack_expect_cachep;
struct ip_conntrack ip_conntrack_untracked;
+unsigned int ip_ct_log_invalid;
+static LIST_HEAD(unconfirmed);
+static int ip_conntrack_vmalloc;
-extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
-
-static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
- u_int8_t protocol)
-{
- return protocol == curr->proto;
-}
-
-struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
-{
- struct ip_conntrack_protocol *p;
-
- MUST_BE_READ_LOCKED(&ip_conntrack_lock);
- p = LIST_FIND(&protocol_list, proto_cmpfn,
- struct ip_conntrack_protocol *, protocol);
- if (!p)
- p = &ip_conntrack_generic_protocol;
+DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
- return p;
-}
-
-struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
-{
- struct ip_conntrack_protocol *p;
-
- READ_LOCK(&ip_conntrack_lock);
- p = __ip_ct_find_proto(protocol);
- READ_UNLOCK(&ip_conntrack_lock);
- return p;
-}
-
-inline void
+void
ip_conntrack_put(struct ip_conntrack *ct)
{
IP_NF_ASSERT(ct);
- IP_NF_ASSERT(ct->infos[0].master);
- /* nf_conntrack_put wants to go via an info struct, so feed it
- one at random. */
- nf_conntrack_put(&ct->infos[0]);
+ nf_conntrack_put(&ct->ct_general);
}
static int ip_conntrack_hash_rnd_initted;
}
int
-get_tuple(const struct iphdr *iph,
- const struct sk_buff *skb,
- unsigned int dataoff,
- struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_protocol *protocol)
+ip_ct_get_tuple(const struct iphdr *iph,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_protocol *protocol)
{
/* Never happen */
if (iph->frag_off & htons(IP_OFFSET)) {
tuple->src.ip = iph->saddr;
tuple->dst.ip = iph->daddr;
tuple->dst.protonum = iph->protocol;
+ tuple->dst.dir = IP_CT_DIR_ORIGINAL;
return protocol->pkt_to_tuple(skb, dataoff, tuple);
}
-static int
-invert_tuple(struct ip_conntrack_tuple *inverse,
- const struct ip_conntrack_tuple *orig,
- const struct ip_conntrack_protocol *protocol)
+int
+ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
+ const struct ip_conntrack_tuple *orig,
+ const struct ip_conntrack_protocol *protocol)
{
inverse->src.ip = orig->dst.ip;
inverse->dst.ip = orig->src.ip;
inverse->dst.protonum = orig->dst.protonum;
+ inverse->dst.dir = !orig->dst.dir;
return protocol->invert_tuple(inverse, orig);
}
/* ip_conntrack_expect helper functions */
-
-/* Compare tuple parts depending on mask. */
-static inline int expect_cmp(const struct ip_conntrack_expect *i,
- const struct ip_conntrack_tuple *tuple)
-{
- MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
- return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
-}
-
-static void
-destroy_expect(struct ip_conntrack_expect *exp)
+static void destroy_expect(struct ip_conntrack_expect *exp)
{
- DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
- IP_NF_ASSERT(atomic_read(&exp->use) == 0);
+ ip_conntrack_put(exp->master);
IP_NF_ASSERT(!timer_pending(&exp->timeout));
-
- kfree(exp);
+ kmem_cache_free(ip_conntrack_expect_cachep, exp);
+ CONNTRACK_STAT_INC(expect_delete);
}
-inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
+static void unlink_expect(struct ip_conntrack_expect *exp)
{
- IP_NF_ASSERT(exp);
-
- if (atomic_dec_and_test(&exp->use)) {
- /* usage count dropped to zero */
- destroy_expect(exp);
- }
-}
-
-static inline struct ip_conntrack_expect *
-__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
-{
- MUST_BE_READ_LOCKED(&ip_conntrack_lock);
- MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
- return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
- struct ip_conntrack_expect *, tuple);
-}
-
-/* Find a expectation corresponding to a tuple. */
-struct ip_conntrack_expect *
-ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
-{
- struct ip_conntrack_expect *exp;
-
- READ_LOCK(&ip_conntrack_lock);
- READ_LOCK(&ip_conntrack_expect_tuple_lock);
- exp = __ip_ct_expect_find(tuple);
- if (exp)
- atomic_inc(&exp->use);
- READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
- READ_UNLOCK(&ip_conntrack_lock);
-
- return exp;
+ MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+ list_del(&exp->list);
+ /* Logically in destroy_expect, but we hold the lock here. */
+ exp->master->expecting--;
}
-/* remove one specific expectation from all lists and drop refcount,
- * does _NOT_ delete the timer. */
-static void __unexpect_related(struct ip_conntrack_expect *expect)
+static void expectation_timed_out(unsigned long ul_expect)
{
- DEBUGP("unexpect_related(%p)\n", expect);
- MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
-
- /* we're not allowed to unexpect a confirmed expectation! */
- IP_NF_ASSERT(!expect->sibling);
+ struct ip_conntrack_expect *exp = (void *)ul_expect;
- /* delete from global and local lists */
- list_del(&expect->list);
- list_del(&expect->expected_list);
-
- /* decrement expect-count of master conntrack */
- if (expect->expectant)
- expect->expectant->expecting--;
-
- ip_conntrack_expect_put(expect);
+ WRITE_LOCK(&ip_conntrack_lock);
+ unlink_expect(exp);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ destroy_expect(exp);
}
-/* remove one specific expecatation from all lists, drop refcount
- * and expire timer.
- * This function can _NOT_ be called for confirmed expects! */
-static void unexpect_related(struct ip_conntrack_expect *expect)
+/* If an expectation for this connection is found, it gets deleted from
+ * the global list and is then returned. */
+static struct ip_conntrack_expect *
+find_expectation(const struct ip_conntrack_tuple *tuple)
{
- IP_NF_ASSERT(expect->expectant);
- IP_NF_ASSERT(expect->expectant->helper);
- /* if we are supposed to have a timer, but we can't delete
- * it: race condition. __unexpect_related will
- * be calledd by timeout function */
- if (expect->expectant->helper->timeout
- && !del_timer(&expect->timeout))
- return;
-
- __unexpect_related(expect);
+ struct ip_conntrack_expect *i;
+
+ list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+ /* If master is not in hash table yet (ie. packet hasn't left
+ this machine yet), how can other end know about expected?
+ Hence these are not the droids you are looking for (if
+ master ct never got confirmed, we'd hold a reference to it
+ and weird things would happen to future packets). */
+ if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
+ && is_confirmed(i->master)
+ && del_timer(&i->timeout)) {
+ unlink_expect(i);
+ return i;
+ }
+ }
+ return NULL;
}
-/* delete all unconfirmed expectations for this conntrack */
-static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
+/* delete all expectations for this conntrack */
+static void remove_expectations(struct ip_conntrack *ct)
{
- struct list_head *exp_entry, *next;
- struct ip_conntrack_expect *exp;
-
- DEBUGP("remove_expectations(%p)\n", ct);
+ struct ip_conntrack_expect *i, *tmp;
- list_for_each_safe(exp_entry, next, &ct->sibling_list) {
- exp = list_entry(exp_entry, struct ip_conntrack_expect,
- expected_list);
+ /* Optimization: most connections never expect any others. */
+ if (ct->expecting == 0)
+ return;
- /* we skip established expectations, as we want to delete
- * the un-established ones only */
- if (exp->sibling) {
- DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
- if (drop_refcount) {
- /* Indicate that this expectations parent is dead */
- ip_conntrack_put(exp->expectant);
- exp->expectant = NULL;
- }
- continue;
+ list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
+ if (i->master == ct && del_timer(&i->timeout)) {
+ unlink_expect(i);
+ destroy_expect(i);
}
-
- IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
- IP_NF_ASSERT(exp->expectant == ct);
-
- /* delete expectation from global and private lists */
- unexpect_related(exp);
}
}
LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
- /* Destroy all un-established, pending expectations */
- remove_expectations(ct, 1);
+ /* Destroy all pending expectations */
+ remove_expectations(ct);
}
static void
destroy_conntrack(struct nf_conntrack *nfct)
{
- struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
+ struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
struct ip_conntrack_protocol *proto;
DEBUGP("destroy_conntrack(%p)\n", ct);
ip_conntrack_destroyed(ct);
WRITE_LOCK(&ip_conntrack_lock);
- /* Make sure don't leave any orphaned expectations lying around */
- if (ct->expecting)
- remove_expectations(ct, 1);
-
- /* Delete our master expectation */
- if (ct->master) {
- if (ct->master->expectant) {
- /* can't call __unexpect_related here,
- * since it would screw up expect_list */
- list_del(&ct->master->expected_list);
- master = ct->master->expectant;
- }
- kfree(ct->master);
+ /* Expectations will have been removed in clean_from_lists,
+ * except TFTP can create an expectation on the first packet,
+ * before connection is in the list, so we need to clean here,
+ * too. */
+ remove_expectations(ct);
+
+ /* We overload first tuple to link into unconfirmed list. */
+ if (!is_confirmed(ct)) {
+ BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
+ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
}
+
+ CONNTRACK_STAT_INC(delete);
WRITE_UNLOCK(&ip_conntrack_lock);
- if (master)
- ip_conntrack_put(master);
+ if (ct->master)
+ ip_conntrack_put(ct->master);
DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
kmem_cache_free(ip_conntrack_cachep, ct);
struct ip_conntrack *ct = (void *)ul_conntrack;
WRITE_LOCK(&ip_conntrack_lock);
+ /* Inside lock so preempt is disabled on module removal path.
+ * Otherwise we can get spurious warnings. */
+ CONNTRACK_STAT_INC(delete_list);
clean_from_lists(ct);
WRITE_UNLOCK(&ip_conntrack_lock);
ip_conntrack_put(ct);
const struct ip_conntrack *ignored_conntrack)
{
MUST_BE_READ_LOCKED(&ip_conntrack_lock);
- return i->ctrack != ignored_conntrack
+ return tuplehash_to_ctrack(i) != ignored_conntrack
&& ip_ct_tuple_equal(tuple, &i->tuple);
}
unsigned int hash = hash_conntrack(tuple);
MUST_BE_READ_LOCKED(&ip_conntrack_lock);
- h = LIST_FIND(&ip_conntrack_hash[hash],
- conntrack_tuple_cmp,
- struct ip_conntrack_tuple_hash *,
- tuple, ignored_conntrack);
- return h;
+ list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
+ if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
+ CONNTRACK_STAT_INC(found);
+ return h;
+ }
+ CONNTRACK_STAT_INC(searched);
+ }
+
+ return NULL;
}
/* Find a connection corresponding to a tuple. */
READ_LOCK(&ip_conntrack_lock);
h = __ip_conntrack_find(tuple, ignored_conntrack);
if (h)
- atomic_inc(&h->ctrack->ct_general.use);
+ atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
READ_UNLOCK(&ip_conntrack_lock);
return h;
}
-static inline struct ip_conntrack *
-__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
-{
- struct ip_conntrack *ct
- = (struct ip_conntrack *)nfct->master;
-
- /* ctinfo is the index of the nfct inside the conntrack */
- *ctinfo = nfct - ct->infos;
- IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
- return ct;
-}
-
-/* Return conntrack and conntrack_info given skb->nfct->master */
-struct ip_conntrack *
-ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
-{
- if (skb->nfct)
- return __ip_conntrack_get(skb->nfct, ctinfo);
- return NULL;
-}
-
-/* Confirm a connection given skb->nfct; places it in hash table */
+/* Confirm a connection given skb; places it in hash table */
int
-__ip_conntrack_confirm(struct nf_ct_info *nfct)
+__ip_conntrack_confirm(struct sk_buff **pskb)
{
unsigned int hash, repl_hash;
struct ip_conntrack *ct;
enum ip_conntrack_info ctinfo;
- ct = __ip_conntrack_get(nfct, &ctinfo);
+ ct = ip_conntrack_get(*pskb, &ctinfo);
/* ipt_REJECT uses ip_conntrack_attach to attach related
ICMP/TCP RST packets in other direction. Actual packet
DEBUGP("Confirming conntrack %p\n", ct);
WRITE_LOCK(&ip_conntrack_lock);
+
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
conntrack_tuple_cmp,
struct ip_conntrack_tuple_hash *,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
+ /* Remove from unconfirmed list */
+ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+
list_prepend(&ip_conntrack_hash[hash],
&ct->tuplehash[IP_CT_DIR_ORIGINAL]);
list_prepend(&ip_conntrack_hash[repl_hash],
add_timer(&ct->timeout);
atomic_inc(&ct->ct_general.use);
set_bit(IPS_CONFIRMED_BIT, &ct->status);
+ CONNTRACK_STAT_INC(insert);
WRITE_UNLOCK(&ip_conntrack_lock);
return NF_ACCEPT;
}
+ CONNTRACK_STAT_INC(insert_failed);
WRITE_UNLOCK(&ip_conntrack_lock);
+
return NF_DROP;
}
return h != NULL;
}
-/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
-struct ip_conntrack *
-icmp_error_track(struct sk_buff *skb,
- enum ip_conntrack_info *ctinfo,
- unsigned int hooknum)
-{
- struct ip_conntrack_tuple innertuple, origtuple;
- struct {
- struct icmphdr icmp;
- struct iphdr ip;
- } inside;
- struct ip_conntrack_protocol *innerproto;
- struct ip_conntrack_tuple_hash *h;
- int dataoff;
-
- IP_NF_ASSERT(skb->nfct == NULL);
-
- /* Not enough header? */
- if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
- return NULL;
-
- if (inside.icmp.type != ICMP_DEST_UNREACH
- && inside.icmp.type != ICMP_SOURCE_QUENCH
- && inside.icmp.type != ICMP_TIME_EXCEEDED
- && inside.icmp.type != ICMP_PARAMETERPROB
- && inside.icmp.type != ICMP_REDIRECT)
- return NULL;
-
- /* Ignore ICMP's containing fragments (shouldn't happen) */
- if (inside.ip.frag_off & htons(IP_OFFSET)) {
- DEBUGP("icmp_error_track: fragment of proto %u\n",
- inside.ip.protocol);
- return NULL;
- }
-
- innerproto = ip_ct_find_proto(inside.ip.protocol);
- dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
- /* Are they talking about one of our connections? */
- if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
- DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol);
- return NULL;
- }
-
- /* Ordinarily, we'd expect the inverted tupleproto, but it's
- been preserved inside the ICMP. */
- if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
- DEBUGP("icmp_error_track: Can't invert tuple\n");
- return NULL;
- }
-
- *ctinfo = IP_CT_RELATED;
-
- h = ip_conntrack_find_get(&innertuple, NULL);
- if (!h) {
- /* Locally generated ICMPs will match inverted if they
- haven't been SNAT'ed yet */
- /* FIXME: NAT code has to handle half-done double NAT --RR */
- if (hooknum == NF_IP_LOCAL_OUT)
- h = ip_conntrack_find_get(&origtuple, NULL);
-
- if (!h) {
- DEBUGP("icmp_error_track: no match\n");
- return NULL;
- }
- /* Reverse direction from that found */
- if (DIRECTION(h) != IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
- } else {
- if (DIRECTION(h) == IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
- }
-
- /* Update skb to refer to this connection */
- skb->nfct = &h->ctrack->infos[*ctinfo];
- return h->ctrack;
-}
-
/* There's a small race here where we may free a just-assured
connection. Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
- return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
+ return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
}
static int early_drop(struct list_head *chain)
{
/* Traverse backwards: gives us oldest, which is roughly LRU */
struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack *ct = NULL;
int dropped = 0;
READ_LOCK(&ip_conntrack_lock);
h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
- if (h)
- atomic_inc(&h->ctrack->ct_general.use);
+ if (h) {
+ ct = tuplehash_to_ctrack(h);
+ atomic_inc(&ct->ct_general.use);
+ }
READ_UNLOCK(&ip_conntrack_lock);
- if (!h)
+ if (!ct)
return dropped;
- if (del_timer(&h->ctrack->timeout)) {
- death_by_timeout((unsigned long)h->ctrack);
+ if (del_timer(&ct->timeout)) {
+ death_by_timeout((unsigned long)ct);
dropped = 1;
+ CONNTRACK_STAT_INC(early_drop);
}
- ip_conntrack_put(h->ctrack);
+ ip_conntrack_put(ct);
return dropped;
}
return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}
-struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
{
return LIST_FIND(&helpers, helper_cmp,
struct ip_conntrack_helper *,
struct ip_conntrack *conntrack;
struct ip_conntrack_tuple repl_tuple;
size_t hash;
- struct ip_conntrack_expect *expected;
- int i;
- static unsigned int drop_next;
+ struct ip_conntrack_expect *exp;
if (!ip_conntrack_hash_rnd_initted) {
get_random_bytes(&ip_conntrack_hash_rnd, 4);
hash = hash_conntrack(tuple);
- if (ip_conntrack_max &&
- atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
- /* Try dropping from random chain, or else from the
- chain about to put into (in case they're trying to
- bomb one hash chain). */
- unsigned int next = (drop_next++)%ip_conntrack_htable_size;
-
- if (!early_drop(&ip_conntrack_hash[next])
- && !early_drop(&ip_conntrack_hash[hash])) {
+ if (ip_conntrack_max
+ && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
+ /* Try dropping from this hash chain. */
+ if (!early_drop(&ip_conntrack_hash[hash])) {
if (net_ratelimit())
printk(KERN_WARNING
"ip_conntrack: table full, dropping"
}
}
- if (!invert_tuple(&repl_tuple, tuple, protocol)) {
+ if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
DEBUGP("Can't invert tuple.\n");
return NULL;
}
atomic_set(&conntrack->ct_general.use, 1);
conntrack->ct_general.destroy = destroy_conntrack;
conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
- conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
- for (i=0; i < IP_CT_NUMBER; i++)
- conntrack->infos[i].master = &conntrack->ct_general;
-
if (!protocol->new(conntrack, skb)) {
kmem_cache_free(ip_conntrack_cachep, conntrack);
return NULL;
conntrack->timeout.data = (unsigned long)conntrack;
conntrack->timeout.function = death_by_timeout;
- INIT_LIST_HEAD(&conntrack->sibling_list);
-
WRITE_LOCK(&ip_conntrack_lock);
- /* Need finding and deleting of expected ONLY if we win race */
- READ_LOCK(&ip_conntrack_expect_tuple_lock);
- expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
- struct ip_conntrack_expect *, tuple);
- READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
-
- /* If master is not in hash table yet (ie. packet hasn't left
- this machine yet), how can other end know about expected?
- Hence these are not the droids you are looking for (if
- master ct never got confirmed, we'd hold a reference to it
- and weird things would happen to future packets). */
- if (expected && !is_confirmed(expected->expectant))
- expected = NULL;
-
- /* Look up the conntrack helper for master connections only */
- if (!expected)
- conntrack->helper = ip_ct_find_helper(&repl_tuple);
+ exp = find_expectation(tuple);
- /* If the expectation is dying, then this is a loser. */
- if (expected
- && expected->expectant->helper->timeout
- && ! del_timer(&expected->timeout))
- expected = NULL;
-
- if (expected) {
+ if (exp) {
DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
- conntrack, expected);
+ conntrack, exp);
/* Welcome, Mr. Bond. We've been expecting you... */
__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
- conntrack->master = expected;
- expected->sibling = conntrack;
- LIST_DELETE(&ip_conntrack_expect_list, expected);
- expected->expectant->expecting--;
- nf_conntrack_get(&master_ct(conntrack)->infos[0]);
+ conntrack->master = exp->master;
+#if CONFIG_IP_NF_CONNTRACK_MARK
+ conntrack->mark = exp->master->mark;
+#endif
+ nf_conntrack_get(&conntrack->master->ct_general);
+ CONNTRACK_STAT_INC(expect_new);
+ } else {
+ conntrack->helper = ip_ct_find_helper(&repl_tuple);
+
+ CONNTRACK_STAT_INC(new);
}
+
+ /* Overload tuple linked list to put us in unconfirmed list. */
+ list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
+
atomic_inc(&ip_conntrack_count);
WRITE_UNLOCK(&ip_conntrack_lock);
- if (expected && expected->expectfn)
- expected->expectfn(conntrack);
+ if (exp) {
+ if (exp->expectfn)
+ exp->expectfn(conntrack, exp);
+ destroy_expect(exp);
+ }
+
return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}
{
struct ip_conntrack_tuple tuple;
struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack *ct;
IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
- if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
+ if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
+ &tuple,proto))
return NULL;
/* look for tuple match */
if (IS_ERR(h))
return (void *)h;
}
+ ct = tuplehash_to_ctrack(h);
/* It exists; we have (non-exclusive) reference. */
if (DIRECTION(h) == IP_CT_DIR_REPLY) {
*set_reply = 1;
} else {
/* Once we've had two way comms, always ESTABLISHED. */
- if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
+ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
DEBUGP("ip_conntrack_in: normal packet for %p\n",
- h->ctrack);
+ ct);
*ctinfo = IP_CT_ESTABLISHED;
- } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
+ } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
DEBUGP("ip_conntrack_in: related packet for %p\n",
- h->ctrack);
+ ct);
*ctinfo = IP_CT_RELATED;
} else {
DEBUGP("ip_conntrack_in: new packet for %p\n",
- h->ctrack);
+ ct);
*ctinfo = IP_CT_NEW;
}
*set_reply = 0;
}
- skb->nfct = &h->ctrack->infos[*ctinfo];
- return h->ctrack;
+ skb->nfct = &ct->ct_general;
+ skb->nfctinfo = *ctinfo;
+ return ct;
}
/* Netfilter hook itself. */
int set_reply;
int ret;
+ /* Previously seen (loopback or untracked)? Ignore. */
+ if ((*pskb)->nfct) {
+ CONNTRACK_STAT_INC(ignore);
+ return NF_ACCEPT;
+ }
+
/* Never happen */
if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
if (net_ratelimit()) {
}
#endif
- /* Previously seen (loopback or untracked)? Ignore. */
- if ((*pskb)->nfct)
- return NF_ACCEPT;
-
proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
- /* It may be an icmp error... */
- if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
- && icmp_error_track(*pskb, &ctinfo, hooknum))
- return NF_ACCEPT;
+ /* It may be a special packet, error, unclean...
+ * The inverse of the return code tells the netfilter
+ * core what to do with the packet. */
+ if (proto->error != NULL
+ && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
+ CONNTRACK_STAT_INC(error);
+ CONNTRACK_STAT_INC(invalid);
+ return -ret;
+ }
- if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
+ if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
/* Not valid part of a connection */
+ CONNTRACK_STAT_INC(invalid);
return NF_ACCEPT;
+ }
- if (IS_ERR(ct))
+ if (IS_ERR(ct)) {
/* Too stressed to deal. */
+ CONNTRACK_STAT_INC(drop);
return NF_DROP;
+ }
IP_NF_ASSERT((*pskb)->nfct);
ret = proto->packet(ct, *pskb, ctinfo);
- if (ret == -1) {
- /* Invalid */
+ if (ret < 0) {
+ /* Invalid: inverse of the return code tells
+ * the netfilter core what to do. */
nf_conntrack_put((*pskb)->nfct);
(*pskb)->nfct = NULL;
- return NF_ACCEPT;
+ CONNTRACK_STAT_INC(invalid);
+ return -ret;
}
- if (ret != NF_DROP && ct->helper) {
- ret = ct->helper->help(*pskb, ct, ctinfo);
- if (ret == -1) {
- /* Invalid */
- nf_conntrack_put((*pskb)->nfct);
- (*pskb)->nfct = NULL;
- return NF_ACCEPT;
- }
- }
if (set_reply)
set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
int invert_tuplepr(struct ip_conntrack_tuple *inverse,
const struct ip_conntrack_tuple *orig)
{
- return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
-}
-
-static inline int resent_expect(const struct ip_conntrack_expect *i,
- const struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_tuple *mask)
-{
- DEBUGP("resent_expect\n");
- DEBUGP(" tuple: "); DUMP_TUPLE(&i->tuple);
- DEBUGP("ct_tuple: "); DUMP_TUPLE(&i->ct_tuple);
- DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
- return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
- || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
- && ip_ct_tuple_equal(&i->mask, mask));
+ return ip_ct_invert_tuple(inverse, orig,
+ ip_ct_find_proto(orig->dst.protonum));
}
/* Would two expected things clash? */
-static inline int expect_clash(const struct ip_conntrack_expect *i,
- const struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_tuple *mask)
+static inline int expect_clash(const struct ip_conntrack_expect *a,
+ const struct ip_conntrack_expect *b)
{
/* Part covered by intersection of masks must be unequal,
otherwise they clash */
struct ip_conntrack_tuple intersect_mask
- = { { i->mask.src.ip & mask->src.ip,
- { i->mask.src.u.all & mask->src.u.all } },
- { i->mask.dst.ip & mask->dst.ip,
- { i->mask.dst.u.all & mask->dst.u.all },
- i->mask.dst.protonum & mask->dst.protonum } };
+ = { { a->mask.src.ip & b->mask.src.ip,
+ { a->mask.src.u.all & b->mask.src.u.all } },
+ { a->mask.dst.ip & b->mask.dst.ip,
+ { a->mask.dst.u.all & b->mask.dst.u.all },
+ a->mask.dst.protonum & b->mask.dst.protonum } };
- return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
+ return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}
-inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
+static inline int expect_matches(const struct ip_conntrack_expect *a,
+ const struct ip_conntrack_expect *b)
{
- WRITE_LOCK(&ip_conntrack_lock);
- unexpect_related(expect);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ return a->master == b->master
+ && ip_ct_tuple_equal(&a->tuple, &b->tuple)
+ && ip_ct_tuple_equal(&a->mask, &b->mask);
}
-
-static void expectation_timed_out(unsigned long ul_expect)
+
+/* Generally a bad idea to call this: could have matched already. */
+void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
{
- struct ip_conntrack_expect *expect = (void *) ul_expect;
+ struct ip_conntrack_expect *i;
- DEBUGP("expectation %p timed out\n", expect);
WRITE_LOCK(&ip_conntrack_lock);
- __unexpect_related(expect);
+ /* choose the oldest matching expectation to evict */
+ list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+ if (expect_matches(i, exp) && del_timer(&i->timeout)) {
+ unlink_expect(i);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ destroy_expect(i);
+ return;
+ }
+ }
WRITE_UNLOCK(&ip_conntrack_lock);
}
-struct ip_conntrack_expect *
-ip_conntrack_expect_alloc(void)
+struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
{
struct ip_conntrack_expect *new;
-
- new = (struct ip_conntrack_expect *)
- kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
+
+ new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
if (!new) {
DEBUGP("expect_related: OOM allocating expect\n");
return NULL;
}
+ new->master = NULL;
+ return new;
+}
- /* tuple_cmp compares whole union, we have to initialized cleanly */
- memset(new, 0, sizeof(struct ip_conntrack_expect));
+void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
+{
+ kmem_cache_free(ip_conntrack_expect_cachep, expect);
+}
- return new;
+static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
+{
+ atomic_inc(&exp->master->ct_general.use);
+ exp->master->expecting++;
+ list_add(&exp->list, &ip_conntrack_expect_list);
+
+ if (exp->master->helper->timeout) {
+ init_timer(&exp->timeout);
+ exp->timeout.data = (unsigned long)exp;
+ exp->timeout.function = expectation_timed_out;
+ exp->timeout.expires
+ = jiffies + exp->master->helper->timeout * HZ;
+ add_timer(&exp->timeout);
+ } else
+ exp->timeout.function = NULL;
+
+ CONNTRACK_STAT_INC(expect_create);
}
-static void
-ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
- struct ip_conntrack *related_to)
+/* Race with expectations being used means we could have none to find; OK. */
+static void evict_oldest_expect(struct ip_conntrack *master)
{
- DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
- new->expectant = related_to;
- new->sibling = NULL;
- atomic_set(&new->use, 1);
-
- /* add to expected list for this connection */
- list_add_tail(&new->expected_list, &related_to->sibling_list);
- /* add to global list of expectations */
- list_prepend(&ip_conntrack_expect_list, &new->list);
- /* add and start timer if required */
- if (related_to->helper->timeout) {
- init_timer(&new->timeout);
- new->timeout.data = (unsigned long)new;
- new->timeout.function = expectation_timed_out;
- new->timeout.expires = jiffies +
- related_to->helper->timeout * HZ;
- add_timer(&new->timeout);
+ struct ip_conntrack_expect *i;
+
+ list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+ if (i->master == master) {
+ if (del_timer(&i->timeout)) {
+ unlink_expect(i);
+ destroy_expect(i);
+ }
+ break;
+ }
}
- related_to->expecting++;
}
-/* Add a related connection. */
-int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
- struct ip_conntrack *related_to)
+static inline int refresh_timer(struct ip_conntrack_expect *i)
{
- struct ip_conntrack_expect *old;
- int ret = 0;
+ if (!del_timer(&i->timeout))
+ return 0;
- WRITE_LOCK(&ip_conntrack_lock);
- /* Because of the write lock, no reader can walk the lists,
- * so there is no need to use the tuple lock too */
+ i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
+ add_timer(&i->timeout);
+ return 1;
+}
+
+int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
+{
+ struct ip_conntrack_expect *i;
+ int ret;
DEBUGP("ip_conntrack_expect_related %p\n", related_to);
DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
- old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
- struct ip_conntrack_expect *, &expect->tuple,
- &expect->mask);
- if (old) {
- /* Helper private data may contain offsets but no pointers
- pointing into the payload - otherwise we should have to copy
- the data filled out by the helper over the old one */
- DEBUGP("expect_related: resent packet\n");
- if (related_to->helper->timeout) {
- if (!del_timer(&old->timeout)) {
- /* expectation is dying. Fall through */
+ WRITE_LOCK(&ip_conntrack_lock);
+ list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+ if (expect_matches(i, expect)) {
+ /* Refresh timer: if it's dying, ignore.. */
+ if (refresh_timer(i)) {
+ ret = 0;
+ /* We don't need the one they've given us. */
+ ip_conntrack_expect_free(expect);
goto out;
- } else {
- old->timeout.expires = jiffies +
- related_to->helper->timeout * HZ;
- add_timer(&old->timeout);
}
+ } else if (expect_clash(i, expect)) {
+ ret = -EBUSY;
+ goto out;
}
-
- WRITE_UNLOCK(&ip_conntrack_lock);
- kfree(expect);
- return -EEXIST;
-
- } else if (related_to->helper->max_expected &&
- related_to->expecting >= related_to->helper->max_expected) {
- /* old == NULL */
- if (!(related_to->helper->flags &
- IP_CT_HELPER_F_REUSE_EXPECT)) {
- WRITE_UNLOCK(&ip_conntrack_lock);
- if (net_ratelimit())
- printk(KERN_WARNING
- "ip_conntrack: max number of expected "
- "connections %i of %s reached for "
- "%u.%u.%u.%u->%u.%u.%u.%u\n",
- related_to->helper->max_expected,
- related_to->helper->name,
- NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
- NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
- kfree(expect);
- return -EPERM;
- }
- DEBUGP("ip_conntrack: max number of expected "
- "connections %i of %s reached for "
- "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
- related_to->helper->max_expected,
- related_to->helper->name,
- NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
- NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
-
- /* choose the the oldest expectation to evict */
- list_for_each_entry(old, &related_to->sibling_list,
- expected_list)
- if (old->sibling == NULL)
- break;
-
- /* We cannot fail since related_to->expecting is the number
- * of unconfirmed expectations */
- IP_NF_ASSERT(old && old->sibling == NULL);
-
- /* newnat14 does not reuse the real allocated memory
- * structures but rather unexpects the old and
- * allocates a new. unexpect_related will decrement
- * related_to->expecting.
- */
- unexpect_related(old);
- ret = -EPERM;
- } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
- struct ip_conntrack_expect *, &expect->tuple,
- &expect->mask)) {
- WRITE_UNLOCK(&ip_conntrack_lock);
- DEBUGP("expect_related: busy!\n");
-
- kfree(expect);
- return -EBUSY;
}
-out: ip_conntrack_expect_insert(expect, related_to);
+ /* Will be over limit? */
+ if (expect->master->helper->max_expected &&
+ expect->master->expecting >= expect->master->helper->max_expected)
+ evict_oldest_expect(expect->master);
+ ip_conntrack_expect_insert(expect);
+ ret = 0;
+out:
WRITE_UNLOCK(&ip_conntrack_lock);
-
- return ret;
-}
-
-/* Change tuple in an existing expectation */
-int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
- struct ip_conntrack_tuple *newtuple)
-{
- int ret;
-
- MUST_BE_READ_LOCKED(&ip_conntrack_lock);
- WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
-
- DEBUGP("change_expect:\n");
- DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
- DEBUGP("exp mask: "); DUMP_TUPLE(&expect->mask);
- DEBUGP("newtuple: "); DUMP_TUPLE(newtuple);
- if (expect->ct_tuple.dst.protonum == 0) {
- /* Never seen before */
- DEBUGP("change expect: never seen before\n");
- if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
- && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
- struct ip_conntrack_expect *, newtuple, &expect->mask)) {
- /* Force NAT to find an unused tuple */
- ret = -1;
- } else {
- memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
- memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
- ret = 0;
- }
- } else {
- /* Resent packet */
- DEBUGP("change expect: resent packet\n");
- if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
- ret = 0;
- } else {
- /* Force NAT to choose again the same port */
- ret = -1;
- }
- }
- WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
-
- return ret;
+ return ret;
}
-/* Alter reply tuple (maybe alter helper). If it's already taken,
- return 0 and don't do alteration. */
-int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
- const struct ip_conntrack_tuple *newreply)
+/* Alter reply tuple (maybe alter helper). This is for NAT, and is
+ implicitly racy: see __ip_conntrack_confirm */
+void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
+ const struct ip_conntrack_tuple *newreply)
{
WRITE_LOCK(&ip_conntrack_lock);
- if (__ip_conntrack_find(newreply, conntrack)) {
- WRITE_UNLOCK(&ip_conntrack_lock);
- return 0;
- }
/* Should be unconfirmed, so not in hash table yet */
IP_NF_ASSERT(!is_confirmed(conntrack));
DUMP_TUPLE(newreply);
conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
- if (!conntrack->master && list_empty(&conntrack->sibling_list))
+ if (!conntrack->master && conntrack->expecting == 0)
conntrack->helper = ip_ct_find_helper(newreply);
WRITE_UNLOCK(&ip_conntrack_lock);
-
- return 1;
}
int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
+ BUG_ON(me->timeout == 0);
WRITE_LOCK(&ip_conntrack_lock);
list_prepend(&helpers, me);
WRITE_UNLOCK(&ip_conntrack_lock);
static inline int unhelp(struct ip_conntrack_tuple_hash *i,
const struct ip_conntrack_helper *me)
{
- if (i->ctrack->helper == me) {
- /* Get rid of any expected. */
- remove_expectations(i->ctrack, 0);
- /* And *then* set helper to NULL */
- i->ctrack->helper = NULL;
- }
+ if (tuplehash_to_ctrack(i)->helper == me)
+ tuplehash_to_ctrack(i)->helper = NULL;
return 0;
}
void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
unsigned int i;
+ struct ip_conntrack_expect *exp, *tmp;
/* Need write lock here, to delete helper. */
WRITE_LOCK(&ip_conntrack_lock);
LIST_DELETE(&helpers, me);
+ /* Get rid of expectations */
+ list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
+ if (exp->master->helper == me && del_timer(&exp->timeout)) {
+ unlink_expect(exp);
+ destroy_expect(exp);
+ }
+ }
/* Get rid of expecteds, set helpers to NULL. */
+ LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
for (i = 0; i < ip_conntrack_htable_size; i++)
LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
struct ip_conntrack_tuple_hash *, me);
synchronize_net();
}
-/* Refresh conntrack for this many jiffies. */
-void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
+static inline void ct_add_counters(struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_NF_CT_ACCT
+ if (skb) {
+ ct->counters[CTINFO2DIR(ctinfo)].packets++;
+ ct->counters[CTINFO2DIR(ctinfo)].bytes +=
+ ntohs(skb->nh.iph->tot_len);
+ }
+#endif
+}
+
+/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
+void ip_ct_refresh_acct(struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct sk_buff *skb,
+ unsigned long extra_jiffies)
{
IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
/* If not in hash table, timer will not be active yet */
- if (!is_confirmed(ct))
+ if (!is_confirmed(ct)) {
ct->timeout.expires = extra_jiffies;
- else {
+ ct_add_counters(ct, ctinfo, skb);
+ } else {
WRITE_LOCK(&ip_conntrack_lock);
/* Need del_timer for race avoidance (may already be dying). */
if (del_timer(&ct->timeout)) {
ct->timeout.expires = jiffies + extra_jiffies;
add_timer(&ct->timeout);
}
+ ct_add_counters(ct, ctinfo, skb);
WRITE_UNLOCK(&ip_conntrack_lock);
}
}
/* Returns new sk_buff, or NULL */
struct sk_buff *
-ip_ct_gather_frags(struct sk_buff *skb)
+ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
struct sock *sk = skb->sk;
#ifdef CONFIG_NETFILTER_DEBUG
unsigned int olddebug = skb->nf_debug;
#endif
+
if (sk) {
sock_hold(sk);
skb_orphan(skb);
}
local_bh_disable();
- skb = ip_defrag(skb);
+ skb = ip_defrag(skb, user);
local_bh_enable();
if (!skb) {
}
/* Used by ipt_REJECT. */
-static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
+static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
struct ip_conntrack *ct;
enum ip_conntrack_info ctinfo;
- ct = __ip_conntrack_get(nfct, &ctinfo);
-
- /* This ICMP is in reverse direction to the packet which
- caused it */
+ /* This ICMP is in reverse direction to the packet which caused it */
+ ct = ip_conntrack_get(skb, &ctinfo);
+
if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
else
ctinfo = IP_CT_RELATED;
- /* Attach new skbuff, and increment count */
- nskb->nfct = &ct->infos[ctinfo];
- atomic_inc(&ct->ct_general.use);
+ /* Attach to new skbuff, and increment count */
+ nskb->nfct = &ct->ct_general;
+ nskb->nfctinfo = ctinfo;
+ nf_conntrack_get(nskb->nfct);
}
static inline int
-do_kill(const struct ip_conntrack_tuple_hash *i,
- int (*kill)(const struct ip_conntrack *i, void *data),
+do_iter(const struct ip_conntrack_tuple_hash *i,
+ int (*iter)(struct ip_conntrack *i, void *data),
void *data)
{
- return kill(i->ctrack, data);
+ return iter(tuplehash_to_ctrack(i), data);
}
/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
-get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
+get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
void *data, unsigned int *bucket)
{
struct ip_conntrack_tuple_hash *h = NULL;
- READ_LOCK(&ip_conntrack_lock);
- for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
- h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
- struct ip_conntrack_tuple_hash *, kill, data);
+ WRITE_LOCK(&ip_conntrack_lock);
+ for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
+ h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
+ struct ip_conntrack_tuple_hash *, iter, data);
+ if (h)
+ break;
}
+ if (!h)
+ h = LIST_FIND_W(&unconfirmed, do_iter,
+ struct ip_conntrack_tuple_hash *, iter, data);
if (h)
- atomic_inc(&h->ctrack->ct_general.use);
- READ_UNLOCK(&ip_conntrack_lock);
+ atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
+ WRITE_UNLOCK(&ip_conntrack_lock);
return h;
}
void
-ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
- void *data)
+ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
{
struct ip_conntrack_tuple_hash *h;
unsigned int bucket = 0;
- while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
+ while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
+ struct ip_conntrack *ct = tuplehash_to_ctrack(h);
/* Time to push up daises... */
- if (del_timer(&h->ctrack->timeout))
- death_by_timeout((unsigned long)h->ctrack);
+ if (del_timer(&ct->timeout))
+ death_by_timeout((unsigned long)ct);
/* ... else the timer will get him soon. */
- ip_conntrack_put(h->ctrack);
+ ip_conntrack_put(ct);
}
}
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
- struct inet_opt *inet = inet_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
struct ip_conntrack_tuple_hash *h;
struct ip_conntrack_tuple tuple;
h = ip_conntrack_find_get(&tuple, NULL);
if (h) {
struct sockaddr_in sin;
+ struct ip_conntrack *ct = tuplehash_to_ctrack(h);
sin.sin_family = AF_INET;
- sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
.tuple.dst.u.tcp.port;
- sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
.tuple.dst.ip;
DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
- ip_conntrack_put(h->ctrack);
+ ip_conntrack_put(ct);
if (copy_to_user(user, &sin, sizeof(sin)) != 0)
return -EFAULT;
else
.get = &getorigdst,
};
-static int kill_all(const struct ip_conntrack *i, void *data)
+static int kill_all(struct ip_conntrack *i, void *data)
{
return 1;
}
+static void free_conntrack_hash(void)
+{
+ if (ip_conntrack_vmalloc)
+ vfree(ip_conntrack_hash);
+ else
+ free_pages((unsigned long)ip_conntrack_hash,
+ get_order(sizeof(struct list_head)
+ * ip_conntrack_htable_size));
+}
+
/* Mishearing the voices in his head, our hero wonders how he's
supposed to kill the mall. */
void ip_conntrack_cleanup(void)
synchronize_net();
i_see_dead_people:
- ip_ct_selective_cleanup(kill_all, NULL);
+ ip_ct_iterate_cleanup(kill_all, NULL);
if (atomic_read(&ip_conntrack_count) != 0) {
schedule();
goto i_see_dead_people;
}
kmem_cache_destroy(ip_conntrack_cachep);
- vfree(ip_conntrack_hash);
+ kmem_cache_destroy(ip_conntrack_expect_cachep);
+ free_conntrack_hash();
nf_unregister_sockopt(&so_getorigdst);
}
static int hashsize;
-MODULE_PARM(hashsize, "i");
+module_param(hashsize, int, 0400);
int __init ip_conntrack_init(void)
{
return ret;
}
- ip_conntrack_hash = vmalloc(sizeof(struct list_head)
- * ip_conntrack_htable_size);
+ /* AK: the hash table is twice as big as needed because it
+ uses list_head. It would be much nicer to the caches to use a
+ single-pointer list head here. */
+ ip_conntrack_vmalloc = 0;
+ ip_conntrack_hash
+ =(void*)__get_free_pages(GFP_KERNEL,
+ get_order(sizeof(struct list_head)
+ *ip_conntrack_htable_size));
+ if (!ip_conntrack_hash) {
+ ip_conntrack_vmalloc = 1;
+ printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
+ ip_conntrack_hash = vmalloc(sizeof(struct list_head)
+ * ip_conntrack_htable_size);
+ }
if (!ip_conntrack_hash) {
printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
goto err_unreg_sockopt;
ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
sizeof(struct ip_conntrack), 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
+ 0, NULL, NULL);
if (!ip_conntrack_cachep) {
printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
goto err_free_hash;
}
+
+ ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
+ sizeof(struct ip_conntrack_expect),
+ 0, 0, NULL, NULL);
+ if (!ip_conntrack_expect_cachep) {
+ printk(KERN_ERR "Unable to create ip_expect slab cache\n");
+ goto err_free_conntrack_slab;
+ }
+
/* Don't NEED lock here, but good form anyway. */
WRITE_LOCK(&ip_conntrack_lock);
+ for (i = 0; i < MAX_IP_CT_PROTO; i++)
+ ip_ct_protos[i] = &ip_conntrack_generic_protocol;
/* Sew in builtin protocols. */
- list_append(&protocol_list, &ip_conntrack_protocol_tcp);
- list_append(&protocol_list, &ip_conntrack_protocol_udp);
- list_append(&protocol_list, &ip_conntrack_protocol_icmp);
+ ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
+ ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
+ ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
WRITE_UNLOCK(&ip_conntrack_lock);
for (i = 0; i < ip_conntrack_htable_size; i++)
atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
/* - and look it like as a confirmed connection */
set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
- /* - and prepare the ctinfo field for REJECT & NAT. */
- ip_conntrack_untracked.infos[IP_CT_NEW].master =
- ip_conntrack_untracked.infos[IP_CT_RELATED].master =
- ip_conntrack_untracked.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master =
- &ip_conntrack_untracked.ct_general;
return ret;
+err_free_conntrack_slab:
+ kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
- vfree(ip_conntrack_hash);
+ free_conntrack_hash();
err_unreg_sockopt:
nf_unregister_sockopt(&so_getorigdst);