#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
+#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
-/* For ERR_PTR(). Yeah, I know... --RR */
-#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
-/* This rwlock protects the main hash table, protocol/helper/expected
+/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
   registrations, conntrack timers */
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>
-#define IP_CONNTRACK_VERSION "2.1"
+#define IP_CONNTRACK_VERSION "2.4"
#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif
-DECLARE_RWLOCK(ip_conntrack_lock);
-DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
+DEFINE_RWLOCK(ip_conntrack_lock);
+
+/* ip_conntrack_standalone needs this */
+atomic_t ip_conntrack_count = ATOMIC_INIT(0);
void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
-LIST_HEAD(protocol_list);
+struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
-static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
struct list_head *ip_conntrack_hash;
-static kmem_cache_t *ip_conntrack_cachep;
+static kmem_cache_t *ip_conntrack_cachep __read_mostly;
+static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
struct ip_conntrack ip_conntrack_untracked;
-
-extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
-
-static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
- u_int8_t protocol)
+unsigned int ip_ct_log_invalid;
+static LIST_HEAD(unconfirmed);
+static int ip_conntrack_vmalloc;
+
+static unsigned int ip_conntrack_next_id = 1;
+static unsigned int ip_conntrack_expect_next_id = 1;
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+struct notifier_block *ip_conntrack_chain;
+struct notifier_block *ip_conntrack_expect_chain;
+
+DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
+
+/* deliver cached events and clear cache entry - must be called with locally
+ * disabled softirqs */
+static inline void
+__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
{
- return protocol == curr->proto;
+ DEBUGP("ecache: delivering events for %p\n", ecache->ct);
+ if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
+ notifier_call_chain(&ip_conntrack_chain, ecache->events,
+ ecache->ct);
+ ecache->events = 0;
+ ip_conntrack_put(ecache->ct);
+ ecache->ct = NULL;
}
-struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
+/* Deliver all cached events for a particular conntrack. This is called
+ * by code prior to async packet handling or freeing the skb */
+void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
{
- struct ip_conntrack_protocol *p;
-
- MUST_BE_READ_LOCKED(&ip_conntrack_lock);
- p = LIST_FIND(&protocol_list, proto_cmpfn,
- struct ip_conntrack_protocol *, protocol);
- if (!p)
- p = &ip_conntrack_generic_protocol;
-
- return p;
+ struct ip_conntrack_ecache *ecache;
+
+ local_bh_disable();
+ ecache = &__get_cpu_var(ip_conntrack_ecache);
+ if (ecache->ct == ct)
+ __ip_ct_deliver_cached_events(ecache);
+ local_bh_enable();
}
-struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
+void __ip_ct_event_cache_init(struct ip_conntrack *ct)
{
- struct ip_conntrack_protocol *p;
-
- READ_LOCK(&ip_conntrack_lock);
- p = __ip_ct_find_proto(protocol);
- READ_UNLOCK(&ip_conntrack_lock);
- return p;
+ struct ip_conntrack_ecache *ecache;
+
+ /* take care of delivering potentially old events */
+ ecache = &__get_cpu_var(ip_conntrack_ecache);
+ BUG_ON(ecache->ct == ct);
+ if (ecache->ct)
+ __ip_ct_deliver_cached_events(ecache);
+ /* initialize for this conntrack/packet */
+ ecache->ct = ct;
+ nf_conntrack_get(&ct->ct_general);
}
-inline void
-ip_conntrack_put(struct ip_conntrack *ct)
+/* flush the event cache - touches other CPUs' data and must not be
+ * called while packets are still passing through the code */
+static void ip_ct_event_cache_flush(void)
{
- IP_NF_ASSERT(ct);
- IP_NF_ASSERT(ct->infos[0].master);
- /* nf_conntrack_put wants to go via an info struct, so feed it
- one at random. */
- nf_conntrack_put(&ct->infos[0]);
+ struct ip_conntrack_ecache *ecache;
+ int cpu;
+
+ for_each_cpu(cpu) {
+ ecache = &per_cpu(ip_conntrack_ecache, cpu);
+ if (ecache->ct)
+ ip_conntrack_put(ecache->ct);
+ }
}
+#else
+static inline void ip_ct_event_cache_flush(void) {}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
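/* A minimal sketch of an event consumer, under the assumption that the
 * usual ip_conntrack_register_notifier() wrapper around
 * notifier_chain_register() is used (as ctnetlink does); illustrative
 * only, not part of this patch. */
#if 0
static int my_ct_event(struct notifier_block *this,
		       unsigned long events, void *ptr)
{
	struct ip_conntrack *ct = ptr;

	if (events & IPCT_DESTROY)
		printk(KERN_DEBUG "conntrack %p destroyed\n", ct);
	return NOTIFY_DONE;
}

static struct notifier_block my_ct_notifier = {
	.notifier_call = my_ct_event,
};
#endif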
+
+DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;
-static u_int32_t
-hash_conntrack(const struct ip_conntrack_tuple *tuple)
+static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
+ unsigned int size, unsigned int rnd)
{
-#if 0
- dump_tuple(tuple);
-#endif
return (jhash_3words(tuple->src.ip,
(tuple->dst.ip ^ tuple->dst.protonum),
(tuple->src.u.all | (tuple->dst.u.all << 16)),
- ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
+ rnd) % size);
+}
+
+static u_int32_t
+hash_conntrack(const struct ip_conntrack_tuple *tuple)
+{
+ return __hash_conntrack(tuple, ip_conntrack_htable_size,
+ ip_conntrack_hash_rnd);
}
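/* Sketch (hypothetical addresses): which bucket the original-direction
 * tuple of a TCP connection 192.168.0.2:1025 -> 10.0.0.1:80 lands in.
 * The reply tuple hashes independently, so a confirmed conntrack sits
 * in two chains. */
#if 0
struct ip_conntrack_tuple t = {
	.src = { .ip = htonl(0xc0a80002),
		 .u = { .tcp = { .port = htons(1025) } } },
	.dst = { .ip = htonl(0x0a000001),
		 .u = { .tcp = { .port = htons(80) } },
		 .protonum = IPPROTO_TCP },
};
unsigned int bucket = hash_conntrack(&t);
#endif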
int
-get_tuple(const struct iphdr *iph,
- const struct sk_buff *skb,
- unsigned int dataoff,
- struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_protocol *protocol)
+ip_ct_get_tuple(const struct iphdr *iph,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_protocol *protocol)
{
/* Never happen */
if (iph->frag_off & htons(IP_OFFSET)) {
tuple->src.ip = iph->saddr;
tuple->dst.ip = iph->daddr;
tuple->dst.protonum = iph->protocol;
+ tuple->dst.dir = IP_CT_DIR_ORIGINAL;
return protocol->pkt_to_tuple(skb, dataoff, tuple);
}
-static int
-invert_tuple(struct ip_conntrack_tuple *inverse,
- const struct ip_conntrack_tuple *orig,
- const struct ip_conntrack_protocol *protocol)
+int
+ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
+ const struct ip_conntrack_tuple *orig,
+ const struct ip_conntrack_protocol *protocol)
{
inverse->src.ip = orig->dst.ip;
inverse->dst.ip = orig->src.ip;
inverse->dst.protonum = orig->dst.protonum;
+ inverse->dst.dir = !orig->dst.dir;
return protocol->invert_tuple(inverse, orig);
}
/* ip_conntrack_expect helper functions */
-
-/* Compare tuple parts depending on mask. */
-static inline int expect_cmp(const struct ip_conntrack_expect *i,
- const struct ip_conntrack_tuple *tuple)
+void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
{
- MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
- return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
-}
-
-static void
-destroy_expect(struct ip_conntrack_expect *exp)
-{
- DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
- IP_NF_ASSERT(atomic_read(&exp->use));
+ ASSERT_WRITE_LOCK(&ip_conntrack_lock);
IP_NF_ASSERT(!timer_pending(&exp->timeout));
-
- kfree(exp);
+ list_del(&exp->list);
+ CONNTRACK_STAT_INC(expect_delete);
+ exp->master->expecting--;
+ ip_conntrack_expect_put(exp);
}
-
-inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
+static void expectation_timed_out(unsigned long ul_expect)
{
- IP_NF_ASSERT(exp);
-
- if (atomic_dec_and_test(&exp->use)) {
- /* usage count dropped to zero */
- destroy_expect(exp);
- }
-}
+ struct ip_conntrack_expect *exp = (void *)ul_expect;
-static inline struct ip_conntrack_expect *
-__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
-{
- MUST_BE_READ_LOCKED(&ip_conntrack_lock);
- MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
- return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
- struct ip_conntrack_expect *, tuple);
+ write_lock_bh(&ip_conntrack_lock);
+ ip_ct_unlink_expect(exp);
+ write_unlock_bh(&ip_conntrack_lock);
+ ip_conntrack_expect_put(exp);
}
-/* Find a expectation corresponding to a tuple. */
struct ip_conntrack_expect *
-ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
+__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
- struct ip_conntrack_expect *exp;
-
- READ_LOCK(&ip_conntrack_lock);
- READ_LOCK(&ip_conntrack_expect_tuple_lock);
- exp = __ip_ct_expect_find(tuple);
- if (exp)
- atomic_inc(&exp->use);
- READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
- READ_UNLOCK(&ip_conntrack_lock);
-
- return exp;
+ struct ip_conntrack_expect *i;
+
+ list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+ if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
+ atomic_inc(&i->use);
+ return i;
+ }
+ }
+ return NULL;
}
-/* remove one specific expectation from all lists and drop refcount,
- * does _NOT_ delete the timer. */
-static void __unexpect_related(struct ip_conntrack_expect *expect)
+/* Just find a expectation corresponding to a tuple. */
+struct ip_conntrack_expect *
+ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
- DEBUGP("unexpect_related(%p)\n", expect);
- MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
-
- /* we're not allowed to unexpect a confirmed expectation! */
- IP_NF_ASSERT(!expect->sibling);
-
- /* delete from global and local lists */
- list_del(&expect->list);
- list_del(&expect->expected_list);
-
- /* decrement expect-count of master conntrack */
- if (expect->expectant)
- expect->expectant->expecting--;
+ struct ip_conntrack_expect *i;
+
+ read_lock_bh(&ip_conntrack_lock);
+ i = __ip_conntrack_expect_find(tuple);
+ read_unlock_bh(&ip_conntrack_lock);
- ip_conntrack_expect_put(expect);
+ return i;
}
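/* Sketch: a successful lookup returns with exp->use incremented, so
 * every caller must pair it with ip_conntrack_expect_put(). */
#if 0
struct ip_conntrack_expect *exp = ip_conntrack_expect_find(&tuple);
if (exp != NULL) {
	/* ... inspect exp->tuple, exp->master ... */
	ip_conntrack_expect_put(exp);
}
#endif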
-/* remove one specific expecatation from all lists, drop refcount
- * and expire timer.
- * This function can _NOT_ be called for confirmed expects! */
-static void unexpect_related(struct ip_conntrack_expect *expect)
+/* If an expectation for this connection is found, it is deleted from
+ * the global list and then returned. */
+static struct ip_conntrack_expect *
+find_expectation(const struct ip_conntrack_tuple *tuple)
{
- IP_NF_ASSERT(expect->expectant);
- IP_NF_ASSERT(expect->expectant->helper);
- /* if we are supposed to have a timer, but we can't delete
- * it: race condition. __unexpect_related will
- * be calledd by timeout function */
- if (expect->expectant->helper->timeout
- && !del_timer(&expect->timeout))
- return;
-
- __unexpect_related(expect);
+ struct ip_conntrack_expect *i;
+
+ list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+ /* If master is not in hash table yet (ie. packet hasn't left
+ this machine yet), how can other end know about expected?
+ Hence these are not the droids you are looking for (if
+ master ct never got confirmed, we'd hold a reference to it
+ and weird things would happen to future packets). */
+ if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
+ && is_confirmed(i->master)) {
+ if (i->flags & IP_CT_EXPECT_PERMANENT) {
+ atomic_inc(&i->use);
+ return i;
+ } else if (del_timer(&i->timeout)) {
+ ip_ct_unlink_expect(i);
+ return i;
+ }
+ }
+ }
+ return NULL;
}
-/* delete all unconfirmed expectations for this conntrack */
-static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
+/* delete all expectations for this conntrack */
+void ip_ct_remove_expectations(struct ip_conntrack *ct)
{
- struct list_head *exp_entry, *next;
- struct ip_conntrack_expect *exp;
-
- DEBUGP("remove_expectations(%p)\n", ct);
+ struct ip_conntrack_expect *i, *tmp;
- list_for_each_safe(exp_entry, next, &ct->sibling_list) {
- exp = list_entry(exp_entry, struct ip_conntrack_expect,
- expected_list);
+ /* Optimization: most connections never expect any others. */
+ if (ct->expecting == 0)
+ return;
- /* we skip established expectations, as we want to delete
- * the un-established ones only */
- if (exp->sibling) {
- DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
- if (drop_refcount) {
- /* Indicate that this expectations parent is dead */
- ip_conntrack_put(exp->expectant);
- exp->expectant = NULL;
- }
- continue;
+ list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
+ if (i->master == ct && del_timer(&i->timeout)) {
+ ip_ct_unlink_expect(i);
+ ip_conntrack_expect_put(i);
}
-
- IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
- IP_NF_ASSERT(exp->expectant == ct);
-
- /* delete expectation from global and private lists */
- unexpect_related(exp);
}
}
unsigned int ho, hr;
DEBUGP("clean_from_lists(%p)\n", ct);
- MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+ ASSERT_WRITE_LOCK(&ip_conntrack_lock);
ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
- /* Destroy all un-established, pending expectations */
- remove_expectations(ct, 1);
+ /* Destroy all pending expectations */
+ ip_ct_remove_expectations(ct);
}
static void
destroy_conntrack(struct nf_conntrack *nfct)
{
- struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
+ struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
struct ip_conntrack_protocol *proto;
DEBUGP("destroy_conntrack(%p)\n", ct);
IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
IP_NF_ASSERT(!timer_pending(&ct->timeout));
+ ip_conntrack_event(IPCT_DESTROY, ct);
+ set_bit(IPS_DYING_BIT, &ct->status);
+
/* To make sure we don't get any weird locking issues here:
* destroy_conntrack() MUST NOT be called with a write lock
* to ip_conntrack_lock!!! -HW */
- proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+ proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
if (proto && proto->destroy)
proto->destroy(ct);
if (ip_conntrack_destroyed)
ip_conntrack_destroyed(ct);
- WRITE_LOCK(&ip_conntrack_lock);
- /* Delete us from our own list to prevent corruption later */
- list_del(&ct->sibling_list);
-
- /* Delete our master expectation */
- if (ct->master) {
- if (ct->master->expectant) {
- /* can't call __unexpect_related here,
- * since it would screw up expect_list */
- list_del(&ct->master->expected_list);
- master = ct->master->expectant;
- }
- kfree(ct->master);
+ write_lock_bh(&ip_conntrack_lock);
+ /* Expectations will have been removed in clean_from_lists,
+ * except that TFTP can create an expectation on the first packet,
+ * before the connection is in the list, so we need to clean
+ * here, too. */
+ ip_ct_remove_expectations(ct);
+
+ /* We overload first tuple to link into unconfirmed list. */
+ if (!is_confirmed(ct)) {
+ BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
+ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
}
- WRITE_UNLOCK(&ip_conntrack_lock);
- if (master)
- ip_conntrack_put(master);
+ CONNTRACK_STAT_INC(delete);
+ write_unlock_bh(&ip_conntrack_lock);
+
+ if (ct->master)
+ ip_conntrack_put(ct->master);
DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
- kmem_cache_free(ip_conntrack_cachep, ct);
- atomic_dec(&ip_conntrack_count);
+ ip_conntrack_free(ct);
}
static void death_by_timeout(unsigned long ul_conntrack)
{
struct ip_conntrack *ct = (void *)ul_conntrack;
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
+ /* Inside lock so preempt is disabled on module removal path.
+ * Otherwise we can get spurious warnings. */
+ CONNTRACK_STAT_INC(delete_list);
clean_from_lists(ct);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
ip_conntrack_put(ct);
}
const struct ip_conntrack_tuple *tuple,
const struct ip_conntrack *ignored_conntrack)
{
- MUST_BE_READ_LOCKED(&ip_conntrack_lock);
- return i->ctrack != ignored_conntrack
+ ASSERT_READ_LOCK(&ip_conntrack_lock);
+ return tuplehash_to_ctrack(i) != ignored_conntrack
&& ip_ct_tuple_equal(tuple, &i->tuple);
}
-static struct ip_conntrack_tuple_hash *
+struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
const struct ip_conntrack *ignored_conntrack)
{
struct ip_conntrack_tuple_hash *h;
unsigned int hash = hash_conntrack(tuple);
- MUST_BE_READ_LOCKED(&ip_conntrack_lock);
- h = LIST_FIND(&ip_conntrack_hash[hash],
- conntrack_tuple_cmp,
- struct ip_conntrack_tuple_hash *,
- tuple, ignored_conntrack);
- return h;
+ ASSERT_READ_LOCK(&ip_conntrack_lock);
+ list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
+ if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
+ CONNTRACK_STAT_INC(found);
+ return h;
+ }
+ CONNTRACK_STAT_INC(searched);
+ }
+
+ return NULL;
}
/* Find a connection corresponding to a tuple. */
{
struct ip_conntrack_tuple_hash *h;
- READ_LOCK(&ip_conntrack_lock);
+ read_lock_bh(&ip_conntrack_lock);
h = __ip_conntrack_find(tuple, ignored_conntrack);
if (h)
- atomic_inc(&h->ctrack->ct_general.use);
- READ_UNLOCK(&ip_conntrack_lock);
+ atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
+ read_unlock_bh(&ip_conntrack_lock);
return h;
}
-static inline struct ip_conntrack *
-__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
+static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
+ unsigned int hash,
+ unsigned int repl_hash)
{
- struct ip_conntrack *ct
- = (struct ip_conntrack *)nfct->master;
-
- /* ctinfo is the index of the nfct inside the conntrack */
- *ctinfo = nfct - ct->infos;
- IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
- return ct;
+ ct->id = ++ip_conntrack_next_id;
+ list_prepend(&ip_conntrack_hash[hash],
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+ list_prepend(&ip_conntrack_hash[repl_hash],
+ &ct->tuplehash[IP_CT_DIR_REPLY].list);
}
-/* Return conntrack and conntrack_info given skb->nfct->master */
-struct ip_conntrack *
-ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
+void ip_conntrack_hash_insert(struct ip_conntrack *ct)
{
- if (skb->nfct)
- return __ip_conntrack_get(skb->nfct, ctinfo);
- return NULL;
+ unsigned int hash, repl_hash;
+
+ hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ write_lock_bh(&ip_conntrack_lock);
+ __ip_conntrack_hash_insert(ct, hash, repl_hash);
+ write_unlock_bh(&ip_conntrack_lock);
}
-/* Confirm a connection given skb->nfct; places it in hash table */
+/* Confirm a connection given skb; places it in hash table */
int
-__ip_conntrack_confirm(struct nf_ct_info *nfct)
+__ip_conntrack_confirm(struct sk_buff **pskb)
{
unsigned int hash, repl_hash;
struct ip_conntrack *ct;
enum ip_conntrack_info ctinfo;
- ct = __ip_conntrack_get(nfct, &ctinfo);
+ ct = ip_conntrack_get(*pskb, &ctinfo);
/* ipt_REJECT uses ip_conntrack_attach to attach related
ICMP/TCP RST packets in other direction. Actual packet
IP_NF_ASSERT(!is_confirmed(ct));
DEBUGP("Confirming conntrack %p\n", ct);
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
+
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
conntrack_tuple_cmp,
struct ip_conntrack_tuple_hash *,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
- list_prepend(&ip_conntrack_hash[hash],
- &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
- list_prepend(&ip_conntrack_hash[repl_hash],
- &ct->tuplehash[IP_CT_DIR_REPLY]);
+ /* Remove from unconfirmed list */
+ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+
+ __ip_conntrack_hash_insert(ct, hash, repl_hash);
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
weird delay cases. */
ct->timeout.expires += jiffies;
add_timer(&ct->timeout);
atomic_inc(&ct->ct_general.use);
set_bit(IPS_CONFIRMED_BIT, &ct->status);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ CONNTRACK_STAT_INC(insert);
+ write_unlock_bh(&ip_conntrack_lock);
+ if (ct->helper)
+ ip_conntrack_event_cache(IPCT_HELPER, *pskb);
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+ if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
+ test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
+ ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
+#endif
+ ip_conntrack_event_cache(master_ct(ct) ?
+ IPCT_RELATED : IPCT_NEW, *pskb);
+
return NF_ACCEPT;
}
- WRITE_UNLOCK(&ip_conntrack_lock);
+ CONNTRACK_STAT_INC(insert_failed);
+ write_unlock_bh(&ip_conntrack_lock);
+
return NF_DROP;
}
{
struct ip_conntrack_tuple_hash *h;
- READ_LOCK(&ip_conntrack_lock);
+ read_lock_bh(&ip_conntrack_lock);
h = __ip_conntrack_find(tuple, ignored_conntrack);
- READ_UNLOCK(&ip_conntrack_lock);
+ read_unlock_bh(&ip_conntrack_lock);
return h != NULL;
}
-/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
-struct ip_conntrack *
-icmp_error_track(struct sk_buff *skb,
- enum ip_conntrack_info *ctinfo,
- unsigned int hooknum)
-{
- struct ip_conntrack_tuple innertuple, origtuple;
- struct {
- struct icmphdr icmp;
- struct iphdr ip;
- } inside;
- struct ip_conntrack_protocol *innerproto;
- struct ip_conntrack_tuple_hash *h;
- int dataoff;
-
- IP_NF_ASSERT(skb->nfct == NULL);
-
- /* Not enough header? */
- if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
- return NULL;
-
- if (inside.icmp.type != ICMP_DEST_UNREACH
- && inside.icmp.type != ICMP_SOURCE_QUENCH
- && inside.icmp.type != ICMP_TIME_EXCEEDED
- && inside.icmp.type != ICMP_PARAMETERPROB
- && inside.icmp.type != ICMP_REDIRECT)
- return NULL;
-
- /* Ignore ICMP's containing fragments (shouldn't happen) */
- if (inside.ip.frag_off & htons(IP_OFFSET)) {
- DEBUGP("icmp_error_track: fragment of proto %u\n",
- inside.ip.protocol);
- return NULL;
- }
-
- innerproto = ip_ct_find_proto(inside.ip.protocol);
- dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
- /* Are they talking about one of our connections? */
- if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
- DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol);
- return NULL;
- }
-
- /* Ordinarily, we'd expect the inverted tupleproto, but it's
- been preserved inside the ICMP. */
- if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
- DEBUGP("icmp_error_track: Can't invert tuple\n");
- return NULL;
- }
-
- *ctinfo = IP_CT_RELATED;
-
- h = ip_conntrack_find_get(&innertuple, NULL);
- if (!h) {
- /* Locally generated ICMPs will match inverted if they
- haven't been SNAT'ed yet */
- /* FIXME: NAT code has to handle half-done double NAT --RR */
- if (hooknum == NF_IP_LOCAL_OUT)
- h = ip_conntrack_find_get(&origtuple, NULL);
-
- if (!h) {
- DEBUGP("icmp_error_track: no match\n");
- return NULL;
- }
- /* Reverse direction from that found */
- if (DIRECTION(h) != IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
- } else {
- if (DIRECTION(h) == IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
- }
-
- /* Update skb to refer to this connection */
- skb->nfct = &h->ctrack->infos[*ctinfo];
- return h->ctrack;
-}
-
/* There's a small race here where we may free a just-assured
connection. Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
- return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
+ return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
}
static int early_drop(struct list_head *chain)
{
/* Traverse backwards: gives us oldest, which is roughly LRU */
struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack *ct = NULL;
int dropped = 0;
- READ_LOCK(&ip_conntrack_lock);
+ read_lock_bh(&ip_conntrack_lock);
h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
- if (h)
- atomic_inc(&h->ctrack->ct_general.use);
- READ_UNLOCK(&ip_conntrack_lock);
+ if (h) {
+ ct = tuplehash_to_ctrack(h);
+ atomic_inc(&ct->ct_general.use);
+ }
+ read_unlock_bh(&ip_conntrack_lock);
- if (!h)
+ if (!ct)
return dropped;
- if (del_timer(&h->ctrack->timeout)) {
- death_by_timeout((unsigned long)h->ctrack);
+ if (del_timer(&ct->timeout)) {
+ death_by_timeout((unsigned long)ct);
dropped = 1;
+ CONNTRACK_STAT_INC(early_drop);
}
- ip_conntrack_put(h->ctrack);
+ ip_conntrack_put(ct);
return dropped;
}
return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}
-struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+static struct ip_conntrack_helper *
+__ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
{
return LIST_FIND(&helpers, helper_cmp,
struct ip_conntrack_helper *,
tuple);
}
-/* Allocate a new conntrack: we return -ENOMEM if classification
- failed due to stress. Otherwise it really is unclassifiable. */
-static struct ip_conntrack_tuple_hash *
-init_conntrack(const struct ip_conntrack_tuple *tuple,
- struct ip_conntrack_protocol *protocol,
- struct sk_buff *skb)
+struct ip_conntrack_helper *
+ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
+{
+ struct ip_conntrack_helper *helper;
+
+ /* need ip_conntrack_lock to assure that helper exists until
+ * try_module_get() is called */
+ read_lock_bh(&ip_conntrack_lock);
+
+ helper = __ip_conntrack_helper_find(tuple);
+ if (helper) {
+ /* need to increase module usage count to assure helper will
+ * not go away while the caller is e.g. busy putting a
+ * conntrack in the hash that uses the helper */
+ if (!try_module_get(helper->me))
+ helper = NULL;
+ }
+
+ read_unlock_bh(&ip_conntrack_lock);
+
+ return helper;
+}
+
+void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
+{
+ module_put(helper->me);
+}
+
+struct ip_conntrack_protocol *
+__ip_conntrack_proto_find(u_int8_t protocol)
+{
+ return ip_ct_protos[protocol];
+}
+
+/* this is guaranteed to always return a valid protocol helper, since
+ * it falls back to generic_protocol */
+struct ip_conntrack_protocol *
+ip_conntrack_proto_find_get(u_int8_t protocol)
+{
+ struct ip_conntrack_protocol *p;
+
+ preempt_disable();
+ p = __ip_conntrack_proto_find(protocol);
+ if (p) {
+ if (!try_module_get(p->me))
+ p = &ip_conntrack_generic_protocol;
+ }
+ preempt_enable();
+
+ return p;
+}
+
+void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
+{
+ module_put(p->me);
+}
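/* Sketch: outside the packet path, a module-safe lookup is paired with
 * a put so the protocol module cannot unload underneath the caller. */
#if 0
struct ip_conntrack_protocol *proto;

proto = ip_conntrack_proto_find_get(IPPROTO_TCP);
/* ... e.g. use proto->print_tuple() for /proc output ... */
ip_conntrack_proto_put(proto);
#endif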
+
+struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
+ struct ip_conntrack_tuple *repl)
{
struct ip_conntrack *conntrack;
- struct ip_conntrack_tuple repl_tuple;
- size_t hash;
- struct ip_conntrack_expect *expected;
- int i;
- static unsigned int drop_next;
if (!ip_conntrack_hash_rnd_initted) {
get_random_bytes(&ip_conntrack_hash_rnd, 4);
ip_conntrack_hash_rnd_initted = 1;
}
- hash = hash_conntrack(tuple);
-
- if (ip_conntrack_max &&
- atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
- /* Try dropping from random chain, or else from the
- chain about to put into (in case they're trying to
- bomb one hash chain). */
- unsigned int next = (drop_next++)%ip_conntrack_htable_size;
-
- if (!early_drop(&ip_conntrack_hash[next])
- && !early_drop(&ip_conntrack_hash[hash])) {
+ if (ip_conntrack_max
+ && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
+ unsigned int hash = hash_conntrack(orig);
+ /* Try dropping from this hash chain. */
+ if (!early_drop(&ip_conntrack_hash[hash])) {
if (net_ratelimit())
printk(KERN_WARNING
"ip_conntrack: table full, dropping"
}
}
- if (!invert_tuple(&repl_tuple, tuple, protocol)) {
- DEBUGP("Can't invert tuple.\n");
- return NULL;
- }
-
conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
if (!conntrack) {
DEBUGP("Can't allocate conntrack.\n");
memset(conntrack, 0, sizeof(*conntrack));
atomic_set(&conntrack->ct_general.use, 1);
conntrack->ct_general.destroy = destroy_conntrack;
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
- conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
- conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
- for (i=0; i < IP_CT_NUMBER; i++)
- conntrack->infos[i].master = &conntrack->ct_general;
-
- if (!protocol->new(conntrack, skb)) {
- kmem_cache_free(ip_conntrack_cachep, conntrack);
- return NULL;
- }
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+ conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
+#if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
+ conntrack->xid[IP_CT_DIR_ORIGINAL] = -1;
+ conntrack->xid[IP_CT_DIR_REPLY] = -1;
+ conntrack->priority = (u_int32_t)-1;
+#endif
/* Don't set timer yet: wait for confirmation */
init_timer(&conntrack->timeout);
conntrack->timeout.data = (unsigned long)conntrack;
conntrack->timeout.function = death_by_timeout;
- INIT_LIST_HEAD(&conntrack->sibling_list);
-
- WRITE_LOCK(&ip_conntrack_lock);
- /* Need finding and deleting of expected ONLY if we win race */
- READ_LOCK(&ip_conntrack_expect_tuple_lock);
- expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
- struct ip_conntrack_expect *, tuple);
- READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
-
- /* If master is not in hash table yet (ie. packet hasn't left
- this machine yet), how can other end know about expected?
- Hence these are not the droids you are looking for (if
- master ct never got confirmed, we'd hold a reference to it
- and weird things would happen to future packets). */
- if (expected && !is_confirmed(expected->expectant))
- expected = NULL;
-
- /* Look up the conntrack helper for master connections only */
- if (!expected)
- conntrack->helper = ip_ct_find_helper(&repl_tuple);
-
- /* If the expectation is dying, then this is a loser. */
- if (expected
- && expected->expectant->helper->timeout
- && ! del_timer(&expected->timeout))
- expected = NULL;
-
- if (expected) {
+ atomic_inc(&ip_conntrack_count);
+
+ return conntrack;
+}
+
+void
+ip_conntrack_free(struct ip_conntrack *conntrack)
+{
+ atomic_dec(&ip_conntrack_count);
+ kmem_cache_free(ip_conntrack_cachep, conntrack);
+}
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+ * failed due to stress. Otherwise it really is unclassifiable */
+static struct ip_conntrack_tuple_hash *
+init_conntrack(struct ip_conntrack_tuple *tuple,
+ struct ip_conntrack_protocol *protocol,
+ struct sk_buff *skb)
+{
+ struct ip_conntrack *conntrack;
+ struct ip_conntrack_tuple repl_tuple;
+ struct ip_conntrack_expect *exp;
+
+ if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
+ DEBUGP("Can't invert tuple.\n");
+ return NULL;
+ }
+
+ conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
+ if (conntrack == NULL || IS_ERR(conntrack))
+ return (struct ip_conntrack_tuple_hash *)conntrack;
+
+ if (!protocol->new(conntrack, skb)) {
+ ip_conntrack_free(conntrack);
+ return NULL;
+ }
+
+ write_lock_bh(&ip_conntrack_lock);
+ exp = find_expectation(tuple);
+
+ if (exp) {
DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
- conntrack, expected);
+ conntrack, exp);
/* Welcome, Mr. Bond. We've been expecting you... */
- IP_NF_ASSERT(master_ct(conntrack));
__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
- conntrack->master = expected;
- expected->sibling = conntrack;
- LIST_DELETE(&ip_conntrack_expect_list, expected);
- expected->expectant->expecting--;
- nf_conntrack_get(&master_ct(conntrack)->infos[0]);
+ conntrack->master = exp->master;
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
+ conntrack->mark = exp->master->mark;
+#endif
+#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
+ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
+ /* this is ugly, but there is nowhere else to put it */
+ conntrack->nat.masq_index = exp->master->nat.masq_index;
+#endif
+ nf_conntrack_get(&conntrack->master->ct_general);
+ CONNTRACK_STAT_INC(expect_new);
+ } else {
+ conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
+
+ CONNTRACK_STAT_INC(new);
+ }
+
+ /* Overload tuple linked list to put us in unconfirmed list. */
+ list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
+
+ write_unlock_bh(&ip_conntrack_lock);
+
+ if (exp) {
+ if (exp->expectfn)
+ exp->expectfn(conntrack, exp);
+ ip_conntrack_expect_put(exp);
}
- atomic_inc(&ip_conntrack_count);
- WRITE_UNLOCK(&ip_conntrack_lock);
- if (expected && expected->expectfn)
- expected->expectfn(conntrack);
return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}
{
struct ip_conntrack_tuple tuple;
struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack *ct;
IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
- if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
+ if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
+ &tuple,proto))
return NULL;
/* look for tuple match */
if (IS_ERR(h))
return (void *)h;
}
+ ct = tuplehash_to_ctrack(h);
/* It exists; we have (non-exclusive) reference. */
if (DIRECTION(h) == IP_CT_DIR_REPLY) {
*set_reply = 1;
} else {
/* Once we've had two way comms, always ESTABLISHED. */
- if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
+ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
DEBUGP("ip_conntrack_in: normal packet for %p\n",
- h->ctrack);
+ ct);
*ctinfo = IP_CT_ESTABLISHED;
- } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
+ } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
DEBUGP("ip_conntrack_in: related packet for %p\n",
- h->ctrack);
+ ct);
*ctinfo = IP_CT_RELATED;
} else {
DEBUGP("ip_conntrack_in: new packet for %p\n",
- h->ctrack);
+ ct);
*ctinfo = IP_CT_NEW;
}
*set_reply = 0;
}
- skb->nfct = &h->ctrack->infos[*ctinfo];
- return h->ctrack;
+ skb->nfct = &ct->ct_general;
+ skb->nfctinfo = *ctinfo;
+ return ct;
}
/* Netfilter hook itself. */
struct ip_conntrack *ct;
enum ip_conntrack_info ctinfo;
struct ip_conntrack_protocol *proto;
- int set_reply;
+ int set_reply = 0;
int ret;
+ /* Previously seen (loopback or untracked)? Ignore. */
+ if ((*pskb)->nfct) {
+ CONNTRACK_STAT_INC(ignore);
+ return NF_ACCEPT;
+ }
+
/* Never happen */
if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
if (net_ratelimit()) {
return NF_DROP;
}
- /* FIXME: Do this right please. --RR */
- (*pskb)->nfcache |= NFC_UNKNOWN;
-
/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
/* Ignore broadcast: no `connection'. */
}
#endif
- /* Previously seen (loopback or untracked)? Ignore. */
- if ((*pskb)->nfct)
- return NF_ACCEPT;
+ proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
- proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
-
- /* It may be an icmp error... */
- if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
- && icmp_error_track(*pskb, &ctinfo, hooknum))
- return NF_ACCEPT;
+ /* It may be a special packet: error, unclean...
+ * The inverse of the return code tells the netfilter
+ * core what to do with the packet. */
+ if (proto->error != NULL
+ && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
+ CONNTRACK_STAT_INC(error);
+ CONNTRACK_STAT_INC(invalid);
+ return -ret;
+ }
- if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
+ if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
/* Not valid part of a connection */
+ CONNTRACK_STAT_INC(invalid);
return NF_ACCEPT;
+ }
- if (IS_ERR(ct))
+ if (IS_ERR(ct)) {
/* Too stressed to deal. */
+ CONNTRACK_STAT_INC(drop);
return NF_DROP;
+ }
IP_NF_ASSERT((*pskb)->nfct);
ret = proto->packet(ct, *pskb, ctinfo);
- if (ret == -1) {
- /* Invalid */
+ if (ret < 0) {
+ /* Invalid: the inverse of the return code tells
+ * the netfilter core what to do */
nf_conntrack_put((*pskb)->nfct);
(*pskb)->nfct = NULL;
- return NF_ACCEPT;
+ CONNTRACK_STAT_INC(invalid);
+ return -ret;
}
- if (ret != NF_DROP && ct->helper) {
- ret = ct->helper->help(*pskb, ct, ctinfo);
- if (ret == -1) {
- /* Invalid */
- nf_conntrack_put((*pskb)->nfct);
- (*pskb)->nfct = NULL;
- return NF_ACCEPT;
- }
- }
- if (set_reply)
- set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
+ if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+ ip_conntrack_event_cache(IPCT_STATUS, *pskb);
return ret;
}
int invert_tuplepr(struct ip_conntrack_tuple *inverse,
const struct ip_conntrack_tuple *orig)
{
- return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
-}
-
-static inline int resent_expect(const struct ip_conntrack_expect *i,
- const struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_tuple *mask)
-{
- DEBUGP("resent_expect\n");
- DEBUGP(" tuple: "); DUMP_TUPLE(&i->tuple);
- DEBUGP("ct_tuple: "); DUMP_TUPLE(&i->ct_tuple);
- DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
- return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
- || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
- && ip_ct_tuple_equal(&i->mask, mask));
+ return ip_ct_invert_tuple(inverse, orig,
+ __ip_conntrack_proto_find(orig->dst.protonum));
}
/* Would two expected things clash? */
-static inline int expect_clash(const struct ip_conntrack_expect *i,
- const struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_tuple *mask)
+static inline int expect_clash(const struct ip_conntrack_expect *a,
+ const struct ip_conntrack_expect *b)
{
/* Part covered by intersection of masks must be unequal,
otherwise they clash */
struct ip_conntrack_tuple intersect_mask
- = { { i->mask.src.ip & mask->src.ip,
- { i->mask.src.u.all & mask->src.u.all } },
- { i->mask.dst.ip & mask->dst.ip,
- { i->mask.dst.u.all & mask->dst.u.all },
- i->mask.dst.protonum & mask->dst.protonum } };
+ = { { a->mask.src.ip & b->mask.src.ip,
+ { a->mask.src.u.all & b->mask.src.u.all } },
+ { a->mask.dst.ip & b->mask.dst.ip,
+ { a->mask.dst.u.all & b->mask.dst.u.all },
+ a->mask.dst.protonum & b->mask.dst.protonum } };
- return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
+ return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}
-inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
+static inline int expect_matches(const struct ip_conntrack_expect *a,
+ const struct ip_conntrack_expect *b)
{
- WRITE_LOCK(&ip_conntrack_lock);
- unexpect_related(expect);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ return a->master == b->master
+ && ip_ct_tuple_equal(&a->tuple, &b->tuple)
+ && ip_ct_tuple_equal(&a->mask, &b->mask);
}
-
-static void expectation_timed_out(unsigned long ul_expect)
-{
- struct ip_conntrack_expect *expect = (void *) ul_expect;
- DEBUGP("expectation %p timed out\n", expect);
- WRITE_LOCK(&ip_conntrack_lock);
- __unexpect_related(expect);
- WRITE_UNLOCK(&ip_conntrack_lock);
+/* Generally a bad idea to call this: could have matched already. */
+void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
+{
+ struct ip_conntrack_expect *i;
+
+ write_lock_bh(&ip_conntrack_lock);
+ /* choose the oldest expectation to evict */
+ list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+ if (expect_matches(i, exp) && del_timer(&i->timeout)) {
+ ip_ct_unlink_expect(i);
+ write_unlock_bh(&ip_conntrack_lock);
+ ip_conntrack_expect_put(i);
+ return;
+ }
+ }
+ write_unlock_bh(&ip_conntrack_lock);
}
-struct ip_conntrack_expect *
-ip_conntrack_expect_alloc(void)
+/* We don't increase the master conntrack refcount for non-fulfilled
+ * expectations. During conntrack destruction, the expectations are
+ * always killed before the conntrack itself. */
+struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
{
struct ip_conntrack_expect *new;
-
- new = (struct ip_conntrack_expect *)
- kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
+
+ new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
if (!new) {
DEBUGP("expect_related: OOM allocating expect\n");
return NULL;
}
-
- /* tuple_cmp compares whole union, we have to initialized cleanly */
- memset(new, 0, sizeof(struct ip_conntrack_expect));
-
+ new->master = me;
+ atomic_set(&new->use, 1);
return new;
}
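/* Sketch of the helper-side idiom (cf. the FTP helper): allocate
 * against the master conntrack, fill in the expectation, register it,
 * then drop the allocation reference; the global list keeps its own. */
#if 0
exp = ip_conntrack_expect_alloc(ct);
if (exp == NULL)
	return NF_DROP;
/* ... fill exp->tuple, exp->mask, exp->expectfn ... */
if (ip_conntrack_expect_related(exp) != 0)
	ret = NF_DROP;
ip_conntrack_expect_put(exp);
#endif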
-static void
-ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
- struct ip_conntrack *related_to)
+void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
- DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
- new->expectant = related_to;
- new->sibling = NULL;
- atomic_set(&new->use, 1);
-
- /* add to expected list for this connection */
- list_add(&new->expected_list, &related_to->sibling_list);
- /* add to global list of expectations */
-
- list_prepend(&ip_conntrack_expect_list, &new->list);
- /* add and start timer if required */
- if (related_to->helper->timeout) {
- init_timer(&new->timeout);
- new->timeout.data = (unsigned long)new;
- new->timeout.function = expectation_timed_out;
- new->timeout.expires = jiffies +
- related_to->helper->timeout * HZ;
- add_timer(&new->timeout);
- }
- related_to->expecting++;
+ if (atomic_dec_and_test(&exp->use))
+ kmem_cache_free(ip_conntrack_expect_cachep, exp);
}
-/* Add a related connection. */
-int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
- struct ip_conntrack *related_to)
+static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
{
- struct ip_conntrack_expect *old;
- int ret = 0;
-
- WRITE_LOCK(&ip_conntrack_lock);
- /* Because of the write lock, no reader can walk the lists,
- * so there is no need to use the tuple lock too */
-
- DEBUGP("ip_conntrack_expect_related %p\n", related_to);
- DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
- DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
+ atomic_inc(&exp->use);
+ exp->master->expecting++;
+ list_add(&exp->list, &ip_conntrack_expect_list);
+
+ init_timer(&exp->timeout);
+ exp->timeout.data = (unsigned long)exp;
+ exp->timeout.function = expectation_timed_out;
+ exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
+ add_timer(&exp->timeout);
+
+ exp->id = ++ip_conntrack_expect_next_id;
+ atomic_inc(&exp->use);
+ CONNTRACK_STAT_INC(expect_create);
+}
- old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
- struct ip_conntrack_expect *, &expect->tuple,
- &expect->mask);
- if (old) {
- /* Helper private data may contain offsets but no pointers
- pointing into the payload - otherwise we should have to copy
- the data filled out by the helper over the old one */
- DEBUGP("expect_related: resent packet\n");
- if (related_to->helper->timeout) {
- if (!del_timer(&old->timeout)) {
- /* expectation is dying. Fall through */
- goto out;
- } else {
- old->timeout.expires = jiffies +
- related_to->helper->timeout * HZ;
- add_timer(&old->timeout);
- }
- }
+/* Race with expectations being used means we could have none to find; OK. */
+static void evict_oldest_expect(struct ip_conntrack *master)
+{
+ struct ip_conntrack_expect *i;
- WRITE_UNLOCK(&ip_conntrack_lock);
- kfree(expect);
- return -EEXIST;
-
- } else if (related_to->helper->max_expected &&
- related_to->expecting >= related_to->helper->max_expected) {
- struct list_head *cur_item;
- /* old == NULL */
- if (!(related_to->helper->flags &
- IP_CT_HELPER_F_REUSE_EXPECT)) {
- WRITE_UNLOCK(&ip_conntrack_lock);
- if (net_ratelimit())
- printk(KERN_WARNING
- "ip_conntrack: max number of expected "
- "connections %i of %s reached for "
- "%u.%u.%u.%u->%u.%u.%u.%u\n",
- related_to->helper->max_expected,
- related_to->helper->name,
- NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
- NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
- kfree(expect);
- return -EPERM;
- }
- DEBUGP("ip_conntrack: max number of expected "
- "connections %i of %s reached for "
- "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
- related_to->helper->max_expected,
- related_to->helper->name,
- NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
- NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
-
- /* choose the the oldest expectation to evict */
- list_for_each(cur_item, &related_to->sibling_list) {
- struct ip_conntrack_expect *cur;
-
- cur = list_entry(cur_item,
- struct ip_conntrack_expect,
- expected_list);
- if (cur->sibling == NULL) {
- old = cur;
- break;
+ list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+ if (i->master == master) {
+ if (del_timer(&i->timeout)) {
+ ip_ct_unlink_expect(i);
+ ip_conntrack_expect_put(i);
}
+ break;
}
-
- /* (!old) cannot happen, since related_to->expecting is the
- * number of unconfirmed expects */
- IP_NF_ASSERT(old);
-
- /* newnat14 does not reuse the real allocated memory
- * structures but rather unexpects the old and
- * allocates a new. unexpect_related will decrement
- * related_to->expecting.
- */
- unexpect_related(old);
- ret = -EPERM;
- } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
- struct ip_conntrack_expect *, &expect->tuple,
- &expect->mask)) {
- WRITE_UNLOCK(&ip_conntrack_lock);
- DEBUGP("expect_related: busy!\n");
-
- kfree(expect);
- return -EBUSY;
}
+}
-out: ip_conntrack_expect_insert(expect, related_to);
-
- WRITE_UNLOCK(&ip_conntrack_lock);
+static inline int refresh_timer(struct ip_conntrack_expect *i)
+{
+ if (!del_timer(&i->timeout))
+ return 0;
- return ret;
+ i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
+ add_timer(&i->timeout);
+ return 1;
}
-/* Change tuple in an existing expectation */
-int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
- struct ip_conntrack_tuple *newtuple)
+int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
{
+ struct ip_conntrack_expect *i;
int ret;
- MUST_BE_READ_LOCKED(&ip_conntrack_lock);
- WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
-
- DEBUGP("change_expect:\n");
- DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
- DEBUGP("exp mask: "); DUMP_TUPLE(&expect->mask);
- DEBUGP("newtuple: "); DUMP_TUPLE(newtuple);
- if (expect->ct_tuple.dst.protonum == 0) {
- /* Never seen before */
- DEBUGP("change expect: never seen before\n");
- if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
- && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
- struct ip_conntrack_expect *, newtuple, &expect->mask)) {
- /* Force NAT to find an unused tuple */
- ret = -1;
- } else {
- memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
- memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
- ret = 0;
- }
- } else {
- /* Resent packet */
- DEBUGP("change expect: resent packet\n");
- if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
- ret = 0;
- } else {
- /* Force NAT to choose again the same port */
- ret = -1;
+ DEBUGP("ip_conntrack_expect_related %p\n", related_to);
+ DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
+ DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
+
+ write_lock_bh(&ip_conntrack_lock);
+ list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+ if (expect_matches(i, expect)) {
+ /* Refresh timer: if it's dying, ignore.. */
+ if (refresh_timer(i)) {
+ ret = 0;
+ goto out;
+ }
+ } else if (expect_clash(i, expect)) {
+ ret = -EBUSY;
+ goto out;
}
}
- WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
-
- return ret;
+
+ /* Will be over limit? */
+ if (expect->master->helper->max_expected &&
+ expect->master->expecting >= expect->master->helper->max_expected)
+ evict_oldest_expect(expect->master);
+
+ ip_conntrack_expect_insert(expect);
+ ip_conntrack_expect_event(IPEXP_NEW, expect);
+ ret = 0;
+out:
+ write_unlock_bh(&ip_conntrack_lock);
+ return ret;
}
-/* Alter reply tuple (maybe alter helper). If it's already taken,
- return 0 and don't do alteration. */
-int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
- const struct ip_conntrack_tuple *newreply)
+/* Alter reply tuple (maybe alter helper). This is for NAT, and is
+ implicitly racy: see __ip_conntrack_confirm */
+void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
+ const struct ip_conntrack_tuple *newreply)
{
- WRITE_LOCK(&ip_conntrack_lock);
- if (__ip_conntrack_find(newreply, conntrack)) {
- WRITE_UNLOCK(&ip_conntrack_lock);
- return 0;
- }
+ write_lock_bh(&ip_conntrack_lock);
/* Should be unconfirmed, so not in hash table yet */
IP_NF_ASSERT(!is_confirmed(conntrack));
DUMP_TUPLE(newreply);
conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
- if (!conntrack->master)
- conntrack->helper = LIST_FIND(&helpers, helper_cmp,
- struct ip_conntrack_helper *,
- newreply);
- WRITE_UNLOCK(&ip_conntrack_lock);
-
- return 1;
+ if (!conntrack->master && conntrack->expecting == 0)
+ conntrack->helper = __ip_conntrack_helper_find(newreply);
+ write_unlock_bh(&ip_conntrack_lock);
}
int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
- WRITE_LOCK(&ip_conntrack_lock);
+ BUG_ON(me->timeout == 0);
+ write_lock_bh(&ip_conntrack_lock);
list_prepend(&helpers, me);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
return 0;
}
+struct ip_conntrack_helper *
+__ip_conntrack_helper_find_byname(const char *name)
+{
+ struct ip_conntrack_helper *h;
+
+ list_for_each_entry(h, &helpers, list) {
+ if (!strcmp(h->name, name))
+ return h;
+ }
+
+ return NULL;
+}
+
static inline int unhelp(struct ip_conntrack_tuple_hash *i,
const struct ip_conntrack_helper *me)
{
- if (i->ctrack->helper == me) {
- /* Get rid of any expected. */
- remove_expectations(i->ctrack, 0);
- /* And *then* set helper to NULL */
- i->ctrack->helper = NULL;
+ if (tuplehash_to_ctrack(i)->helper == me) {
+ ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
+ tuplehash_to_ctrack(i)->helper = NULL;
}
return 0;
}
void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
unsigned int i;
+ struct ip_conntrack_expect *exp, *tmp;
/* Need write lock here, to delete helper. */
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
LIST_DELETE(&helpers, me);
+ /* Get rid of expectations */
+ list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
+ if (exp->master->helper == me && del_timer(&exp->timeout)) {
+ ip_ct_unlink_expect(exp);
+ ip_conntrack_expect_put(exp);
+ }
+ }
/* Get rid of expecteds, set helpers to NULL. */
+ LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
for (i = 0; i < ip_conntrack_htable_size; i++)
LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
struct ip_conntrack_tuple_hash *, me);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
/* Someone could be still looking at the helper in a bh. */
synchronize_net();
}
-/* Refresh conntrack for this many jiffies. */
-void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
+/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
+void __ip_ct_refresh_acct(struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct sk_buff *skb,
+ unsigned long extra_jiffies,
+ int do_acct)
{
+ int event = 0;
+
IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
+ IP_NF_ASSERT(skb);
+
+ write_lock_bh(&ip_conntrack_lock);
/* If not in hash table, timer will not be active yet */
- if (!is_confirmed(ct))
+ if (!is_confirmed(ct)) {
ct->timeout.expires = extra_jiffies;
- else {
- WRITE_LOCK(&ip_conntrack_lock);
+ event = IPCT_REFRESH;
+ } else {
/* Need del_timer for race avoidance (may already be dying). */
if (del_timer(&ct->timeout)) {
ct->timeout.expires = jiffies + extra_jiffies;
add_timer(&ct->timeout);
+ event = IPCT_REFRESH;
}
- WRITE_UNLOCK(&ip_conntrack_lock);
}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+ if (do_acct) {
+ ct->counters[CTINFO2DIR(ctinfo)].packets++;
+ ct->counters[CTINFO2DIR(ctinfo)].bytes +=
+ ntohs(skb->nh.iph->tot_len);
+ if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
+ || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
+ event |= IPCT_COUNTER_FILLING;
+ }
+#endif
+
+ write_unlock_bh(&ip_conntrack_lock);
+
+ /* must be unlocked when calling event cache */
+ if (event)
+ ip_conntrack_event_cache(event, skb);
}
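/* Sketch: protocol trackers reach this through the ip_ct_refresh_acct()
 * wrapper in the header, typically from their ->packet() handler, e.g.
 * (assuming the UDP tracker's timeout variable): */
#if 0
ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_udp_timeout);
#endif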
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+/* Generic function for tcp/udp/sctp/dccp and the like. This needs to
+ * be in ip_conntrack_core, since we don't want the protocols to
+ * autoload or depend on ctnetlink. */
+int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *tuple)
+{
+ NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
+ &tuple->src.u.tcp.port);
+ NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
+ &tuple->dst.u.tcp.port);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
+ struct ip_conntrack_tuple *t)
+{
+ if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
+ return -EINVAL;
+
+ t->src.u.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
+ t->dst.u.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
+
+ return 0;
+}
+#endif
+
/* Returns new sk_buff, or NULL */
struct sk_buff *
-ip_ct_gather_frags(struct sk_buff *skb)
+ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
- struct sock *sk = skb->sk;
-#ifdef CONFIG_NETFILTER_DEBUG
- unsigned int olddebug = skb->nf_debug;
-#endif
- if (sk) {
- sock_hold(sk);
- skb_orphan(skb);
- }
+ skb_orphan(skb);
local_bh_disable();
- skb = ip_defrag(skb);
+ skb = ip_defrag(skb, user);
local_bh_enable();
- if (!skb) {
- if (sk)
- sock_put(sk);
- return skb;
- }
-
- if (sk) {
- skb_set_owner_w(skb, sk);
- sock_put(sk);
- }
-
- ip_send_check(skb->nh.iph);
- skb->nfcache |= NFC_ALTERED;
-#ifdef CONFIG_NETFILTER_DEBUG
- /* Packet path as if nothing had happened. */
- skb->nf_debug = olddebug;
-#endif
+ if (skb)
+ ip_send_check(skb->nh.iph);
return skb;
}
/* Used by ipt_REJECT. */
-static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
+static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
struct ip_conntrack *ct;
enum ip_conntrack_info ctinfo;
- ct = __ip_conntrack_get(nfct, &ctinfo);
-
- /* This ICMP is in reverse direction to the packet which
- caused it */
+ /* This ICMP is in reverse direction to the packet which caused it */
+ ct = ip_conntrack_get(skb, &ctinfo);
+
if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
else
ctinfo = IP_CT_RELATED;
- /* Attach new skbuff, and increment count */
- nskb->nfct = &ct->infos[ctinfo];
- atomic_inc(&ct->ct_general.use);
+ /* Attach to new skbuff, and increment count */
+ nskb->nfct = &ct->ct_general;
+ nskb->nfctinfo = ctinfo;
+ nf_conntrack_get(nskb->nfct);
}
static inline int
-do_kill(const struct ip_conntrack_tuple_hash *i,
- int (*kill)(const struct ip_conntrack *i, void *data),
+do_iter(const struct ip_conntrack_tuple_hash *i,
+ int (*iter)(struct ip_conntrack *i, void *data),
void *data)
{
- return kill(i->ctrack, data);
+ return iter(tuplehash_to_ctrack(i), data);
}
/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
-get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
+get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
void *data, unsigned int *bucket)
{
struct ip_conntrack_tuple_hash *h = NULL;
- READ_LOCK(&ip_conntrack_lock);
- for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
- h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
- struct ip_conntrack_tuple_hash *, kill, data);
+ write_lock_bh(&ip_conntrack_lock);
+ for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
+ h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
+ struct ip_conntrack_tuple_hash *, iter, data);
+ if (h)
+ break;
}
+ if (!h)
+ h = LIST_FIND_W(&unconfirmed, do_iter,
+ struct ip_conntrack_tuple_hash *, iter, data);
if (h)
- atomic_inc(&h->ctrack->ct_general.use);
- READ_UNLOCK(&ip_conntrack_lock);
+ atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
+ write_unlock_bh(&ip_conntrack_lock);
return h;
}
void
-ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
- void *data)
+ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
{
struct ip_conntrack_tuple_hash *h;
unsigned int bucket = 0;
- while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
+ while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
+ struct ip_conntrack *ct = tuplehash_to_ctrack(h);
/* Time to push up daises... */
- if (del_timer(&h->ctrack->timeout))
- death_by_timeout((unsigned long)h->ctrack);
+ if (del_timer(&ct->timeout))
+ death_by_timeout((unsigned long)ct);
/* ... else the timer will get him soon. */
- ip_conntrack_put(h->ctrack);
+ ip_conntrack_put(ct);
}
}
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
- struct inet_opt *inet = inet_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
struct ip_conntrack_tuple_hash *h;
struct ip_conntrack_tuple tuple;
h = ip_conntrack_find_get(&tuple, NULL);
if (h) {
struct sockaddr_in sin;
+ struct ip_conntrack *ct = tuplehash_to_ctrack(h);
sin.sin_family = AF_INET;
- sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
.tuple.dst.u.tcp.port;
- sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
.tuple.dst.ip;
+ memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
- ip_conntrack_put(h->ctrack);
+ ip_conntrack_put(ct);
if (copy_to_user(user, &sin, sizeof(sin)) != 0)
return -EFAULT;
else
.get = &getorigdst,
};
-static int kill_all(const struct ip_conntrack *i, void *data)
+static int kill_all(struct ip_conntrack *i, void *data)
{
return 1;
}
+void ip_conntrack_flush(void)
+{
+ ip_ct_iterate_cleanup(kill_all, NULL);
+}
+
+static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
+{
+ if (vmalloced)
+ vfree(hash);
+ else
+ free_pages((unsigned long)hash,
+ get_order(sizeof(struct list_head) * size));
+}
+
/* Mishearing the voices in his head, our hero wonders how he's
supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
ip_ct_attach = NULL;
+
/* This makes sure all current packets have passed through
netfilter framework. Roll on, two-stage module
delete... */
synchronize_net();
-
+
+ ip_ct_event_cache_flush();
i_see_dead_people:
- ip_ct_selective_cleanup(kill_all, NULL);
+ ip_conntrack_flush();
if (atomic_read(&ip_conntrack_count) != 0) {
schedule();
goto i_see_dead_people;
}
+ /* wait until all references to ip_conntrack_untracked are dropped */
+ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
+ schedule();
kmem_cache_destroy(ip_conntrack_cachep);
- vfree(ip_conntrack_hash);
+ kmem_cache_destroy(ip_conntrack_expect_cachep);
+ free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+ ip_conntrack_htable_size);
nf_unregister_sockopt(&so_getorigdst);
}
-static int hashsize;
-MODULE_PARM(hashsize, "i");
+static struct list_head *alloc_hashtable(int size, int *vmalloced)
+{
+ struct list_head *hash;
+ unsigned int i;
+
+ *vmalloced = 0;
+ hash = (void*)__get_free_pages(GFP_KERNEL,
+ get_order(sizeof(struct list_head)
+ * size));
+ if (!hash) {
+ *vmalloced = 1;
+ printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
+ hash = vmalloc(sizeof(struct list_head) * size);
+ }
+
+ if (hash)
+ for (i = 0; i < size; i++)
+ INIT_LIST_HEAD(&hash[i]);
+
+ return hash;
+}
+
+static int set_hashsize(const char *val, struct kernel_param *kp)
+{
+ int i, bucket, hashsize, vmalloced;
+ int old_vmalloced, old_size;
+ int rnd;
+ struct list_head *hash, *old_hash;
+ struct ip_conntrack_tuple_hash *h;
+
+ /* On boot, we can set this without any fancy locking. */
+ if (!ip_conntrack_htable_size)
+ return param_set_int(val, kp);
+
+ hashsize = simple_strtol(val, NULL, 0);
+ if (!hashsize)
+ return -EINVAL;
+
+ hash = alloc_hashtable(hashsize, &vmalloced);
+ if (!hash)
+ return -ENOMEM;
+
+ /* We have to rehash for the new table anyway, so we can also
+ * use a new random seed */
+ get_random_bytes(&rnd, 4);
+
+ write_lock_bh(&ip_conntrack_lock);
+ for (i = 0; i < ip_conntrack_htable_size; i++) {
+ while (!list_empty(&ip_conntrack_hash[i])) {
+ h = list_entry(ip_conntrack_hash[i].next,
+ struct ip_conntrack_tuple_hash, list);
+ list_del(&h->list);
+ bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
+ list_add_tail(&h->list, &hash[bucket]);
+ }
+ }
+ old_size = ip_conntrack_htable_size;
+ old_vmalloced = ip_conntrack_vmalloc;
+ old_hash = ip_conntrack_hash;
+
+ ip_conntrack_htable_size = hashsize;
+ ip_conntrack_vmalloc = vmalloced;
+ ip_conntrack_hash = hash;
+ ip_conntrack_hash_rnd = rnd;
+ write_unlock_bh(&ip_conntrack_lock);
+
+ free_conntrack_hash(old_hash, old_vmalloced, old_size);
+ return 0;
+}
+
+module_param_call(hashsize, set_hashsize, param_get_uint,
+ &ip_conntrack_htable_size, 0600);
int __init ip_conntrack_init(void)
{
/* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
* machine has 256 buckets. >= 1GB machines have 8192 buckets. */
- if (hashsize) {
- ip_conntrack_htable_size = hashsize;
- } else {
+ if (!ip_conntrack_htable_size) {
ip_conntrack_htable_size
= (((num_physpages << PAGE_SHIFT) / 16384)
/ sizeof(struct list_head));
return ret;
}
- ip_conntrack_hash = vmalloc(sizeof(struct list_head)
- * ip_conntrack_htable_size);
+ ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
+ &ip_conntrack_vmalloc);
if (!ip_conntrack_hash) {
printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
goto err_unreg_sockopt;
ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
sizeof(struct ip_conntrack), 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
+ 0, NULL, NULL);
if (!ip_conntrack_cachep) {
printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
goto err_free_hash;
}
+
+ ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
+ sizeof(struct ip_conntrack_expect),
+ 0, 0, NULL, NULL);
+ if (!ip_conntrack_expect_cachep) {
+ printk(KERN_ERR "Unable to create ip_expect slab cache\n");
+ goto err_free_conntrack_slab;
+ }
+
/* Don't NEED lock here, but good form anyway. */
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
+ for (i = 0; i < MAX_IP_CT_PROTO; i++)
+ ip_ct_protos[i] = &ip_conntrack_generic_protocol;
/* Sew in builtin protocols. */
- list_append(&protocol_list, &ip_conntrack_protocol_tcp);
- list_append(&protocol_list, &ip_conntrack_protocol_udp);
- list_append(&protocol_list, &ip_conntrack_protocol_icmp);
- WRITE_UNLOCK(&ip_conntrack_lock);
-
- for (i = 0; i < ip_conntrack_htable_size; i++)
- INIT_LIST_HEAD(&ip_conntrack_hash[i]);
+ ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
+ ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
+ ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
+ write_unlock_bh(&ip_conntrack_lock);
/* For use by ipt_REJECT */
ip_ct_attach = ip_conntrack_attach;
atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
/* - and look it like as a confirmed connection */
set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
- /* - and prepare the ctinfo field for REJECT & NAT. */
- ip_conntrack_untracked.infos[IP_CT_NEW].master =
- ip_conntrack_untracked.infos[IP_CT_RELATED].master =
- ip_conntrack_untracked.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master =
- &ip_conntrack_untracked.ct_general;
return ret;
+err_free_conntrack_slab:
+ kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
- vfree(ip_conntrack_hash);
+ free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+ ip_conntrack_htable_size);
err_unreg_sockopt:
nf_unregister_sockopt(&so_getorigdst);