1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
connection tracking module as well. */
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
*/
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
41 /* This rwlock protects the main hash table, protocol/helper/expected
42 registrations, conntrack timers*/
/* Lock-assertion helpers: the argument is ignored; both macros assert
 * the state of the single global ip_conntrack_lock. */
43 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
44 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
50 #include <linux/netfilter_ipv4/listhelp.h>
52 #define IP_CONNTRACK_VERSION "2.1"
/* Debug logging is compiled out in this build: DEBUGP() expands to nothing. */
57 #define DEBUGP(format, args...)
/* The one rwlock guarding the hash table, expectation list, helper list
 * and conntrack timers (see comment above ASSERT_READ_LOCK). */
60 DECLARE_RWLOCK(ip_conntrack_lock);
62 /* ip_conntrack_standalone needs this */
63 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
/* Optional hook invoked from destroy_conntrack() before the entry is freed. */
65 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
/* Global list of pending expectations; entries are timed out or claimed
 * by find_expectation(). */
66 LIST_HEAD(ip_conntrack_expect_list);
/* Per-protocol handler table, indexed by IP protocol number. */
67 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
/* Registered conntrack helpers (FTP, TFTP, ...). */
68 static LIST_HEAD(helpers);
/* Hash-table geometry; sized at init time (0 until then). */
69 unsigned int ip_conntrack_htable_size = 0;
71 struct list_head *ip_conntrack_hash;
/* Slab caches for conntrack entries and expectations. */
72 static kmem_cache_t *ip_conntrack_cachep;
73 static kmem_cache_t *ip_conntrack_expect_cachep;
/* Dummy entry used for NOTRACK: never hashed, never freed. */
74 struct ip_conntrack ip_conntrack_untracked;
75 unsigned int ip_ct_log_invalid;
/* Entries allocated but not yet confirmed, linked via the ORIGINAL
 * tuplehash list member (see __ip_conntrack_confirm). */
76 static LIST_HEAD(unconfirmed);
/* Non-zero if the hash table was vmalloc'ed rather than page-allocated. */
77 static int ip_conntrack_vmalloc;
79 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
/* Drop one reference to @ct, delegating to nf_conntrack_put() on the
 * embedded ct_general.  NOTE(review): return type and braces are not
 * visible in this extract. */
82 ip_conntrack_put(struct ip_conntrack *ct)
85 nf_conntrack_put(&ct->ct_general);
/* Random seed for the hash, initialised lazily in init_conntrack() to
 * make hash-bucket collisions unpredictable to remote attackers. */
88 static int ip_conntrack_hash_rnd_initted;
89 static unsigned int ip_conntrack_hash_rnd;
/* Map a tuple to a bucket index in [0, ip_conntrack_htable_size).
 * src/dst addresses, ports and protocol number all contribute. */
92 hash_conntrack(const struct ip_conntrack_tuple *tuple)
97 	return (jhash_3words(tuple->src.ip,
98 	                     (tuple->dst.ip ^ tuple->dst.protonum),
99 	                     (tuple->src.u.all | (tuple->dst.u.all << 16)),
100 	                     ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
/* Extract the ORIGINAL-direction tuple from a packet.  Refuses
 * non-head fragments (no ports available); the per-protocol
 * pkt_to_tuple() callback fills in the l4 part. */
104 ip_ct_get_tuple(const struct iphdr *iph,
105 		const struct sk_buff *skb,
106 		unsigned int dataoff,
107 		struct ip_conntrack_tuple *tuple,
108 		const struct ip_conntrack_protocol *protocol)
/* Never happen: fragments are reassembled before tracking. */
111 	if (iph->frag_off & htons(IP_OFFSET)) {
112 		printk("ip_conntrack_core: Frag of proto %u.\n",
117 	tuple->src.ip = iph->saddr;
118 	tuple->dst.ip = iph->daddr;
119 	tuple->dst.protonum = iph->protocol;
120 	tuple->src.u.all = tuple->dst.u.all = 0;
121 	tuple->dst.dir = IP_CT_DIR_ORIGINAL;
123 	return protocol->pkt_to_tuple(skb, dataoff, tuple);
/* Build the REPLY-direction tuple for @orig into @inverse: addresses
 * swapped, direction flipped, l4 part inverted by the protocol. */
127 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
128 		   const struct ip_conntrack_tuple *orig,
129 		   const struct ip_conntrack_protocol *protocol)
131 	inverse->src.ip = orig->dst.ip;
132 	inverse->dst.ip = orig->src.ip;
133 	inverse->dst.protonum = orig->dst.protonum;
134 	inverse->dst.dir = !orig->dst.dir;
136 	inverse->src.u.all = inverse->dst.u.all = 0;
138 	return protocol->invert_tuple(inverse, orig);
142 /* ip_conntrack_expect helper functions */
/* Final teardown of an already-unlinked expectation: drop the master
 * reference and return it to the slab.  Timer must already be stopped. */
143 static void destroy_expect(struct ip_conntrack_expect *exp)
145 	ip_conntrack_put(exp->master);
146 	IP_NF_ASSERT(!timer_pending(&exp->timeout));
147 	kmem_cache_free(ip_conntrack_expect_cachep, exp);
148 	CONNTRACK_STAT_INC(expect_delete);
/* Remove @exp from the global expectation list and decrement the
 * master's pending-expectation count.  Caller holds the write lock. */
151 static void unlink_expect(struct ip_conntrack_expect *exp)
153 	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
154 	list_del(&exp->list);
155 	/* Logically in destroy_expect, but we hold the lock here. */
156 	exp->master->expecting--;
/* Timer callback: an expectation was never matched within its timeout;
 * unlink it under the write lock (destroy happens lock-free afterwards). */
159 static void expectation_timed_out(unsigned long ul_expect)
161 	struct ip_conntrack_expect *exp = (void *)ul_expect;
163 	WRITE_LOCK(&ip_conntrack_lock);
165 	WRITE_UNLOCK(&ip_conntrack_lock);
169 /* If an expectation for this connection is found, it gets deleted from
170 * the global list, then returned. */
171 static struct ip_conntrack_expect *
172 find_expectation(const struct ip_conntrack_tuple *tuple)
174 	struct ip_conntrack_expect *i;
176 	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
177 		/* If master is not in hash table yet (ie. packet hasn't left
178 		   this machine yet), how can other end know about expected?
179 		   Hence these are not the droids you are looking for (if
180 		   master ct never got confirmed, we'd hold a reference to it
181 		   and weird things would happen to future packets). */
/* del_timer() success means we win the race against expiry and own it. */
182 		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
183 		    && is_confirmed(i->master)
184 		    && del_timer(&i->timeout)) {
192 /* delete all expectations for this conntrack */
193 static void remove_expectations(struct ip_conntrack *ct)
195 	struct ip_conntrack_expect *i, *tmp;
197 	/* Optimization: most connections never expect any others. */
198 	if (ct->expecting == 0)
/* _safe variant: entries are unlinked while walking. */
201 	list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
/* Only reap entries whose timer we managed to stop (race with expiry). */
202 		if (i->master == ct && del_timer(&i->timeout)) {
/* Unhash @ct (both directions) and drop its pending expectations.
 * Caller holds the write lock. */
210 clean_from_lists(struct ip_conntrack *ct)
214 	DEBUGP("clean_from_lists(%p)\n", ct);
215 	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
217 	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
218 	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
219 	LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
220 	LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
222 	/* Destroy all pending expectations */
223 	remove_expectations(ct);
/* Refcount-zero destructor (installed as ct_general.destroy in
 * init_conntrack): gives the l4 protocol and the optional
 * ip_conntrack_destroyed hook a chance to clean up, removes any
 * leftover expectations and the unconfirmed-list linkage, then frees
 * the entry back to the slab. */
227 destroy_conntrack(struct nf_conntrack *nfct)
229 	struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
230 	struct ip_conntrack_protocol *proto;
232 	DEBUGP("destroy_conntrack(%p)\n", ct);
233 	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
234 	IP_NF_ASSERT(!timer_pending(&ct->timeout));
236 	/* To make sure we don't get any weird locking issues here:
237 	 * destroy_conntrack() MUST NOT be called with a write lock
238 	 * to ip_conntrack_lock!!! -HW */
239 	proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
240 	if (proto && proto->destroy)
243 	if (ip_conntrack_destroyed)
244 		ip_conntrack_destroyed(ct);
246 	WRITE_LOCK(&ip_conntrack_lock);
247 	/* Expectations will have been removed in clean_from_lists,
248 	 * except TFTP can create an expectation on the first packet,
249 	 * before connection is in the list, so we need to clean here,
251 	remove_expectations(ct);
253 	/* We overload first tuple to link into unconfirmed list. */
254 	if (!is_confirmed(ct)) {
255 		BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
256 		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
259 	CONNTRACK_STAT_INC(delete);
260 	WRITE_UNLOCK(&ip_conntrack_lock);
/* Release the master ct reference taken for expected connections. */
263 	ip_conntrack_put(ct->master);
265 	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
266 	kmem_cache_free(ip_conntrack_cachep, ct);
267 	atomic_dec(&ip_conntrack_count);
/* Conntrack timeout-timer callback: unhash the entry under the write
 * lock and drop the hash table's reference. */
270 static void death_by_timeout(unsigned long ul_conntrack)
272 	struct ip_conntrack *ct = (void *)ul_conntrack;
274 	WRITE_LOCK(&ip_conntrack_lock);
275 	/* Inside lock so preempt is disabled on module removal path.
276 	 * Otherwise we can get spurious warnings. */
277 	CONNTRACK_STAT_INC(delete_list);
278 	clean_from_lists(ct);
279 	WRITE_UNLOCK(&ip_conntrack_lock);
280 	ip_conntrack_put(ct);
/* Hash-chain match predicate: tuple equality, with an optional
 * conntrack to skip (used when checking for clashes against self). */
284 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
285 		    const struct ip_conntrack_tuple *tuple,
286 		    const struct ip_conntrack *ignored_conntrack)
288 	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
289 	return tuplehash_to_ctrack(i) != ignored_conntrack
290 		&& ip_ct_tuple_equal(tuple, &i->tuple);
/* Look @tuple up in its hash bucket; caller must hold at least the
 * read lock.  No reference is taken — see ip_conntrack_find_get(). */
293 static struct ip_conntrack_tuple_hash *
294 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
295 		    const struct ip_conntrack *ignored_conntrack)
297 	struct ip_conntrack_tuple_hash *h;
298 	unsigned int hash = hash_conntrack(tuple);
300 	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
301 	list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
302 		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
303 			CONNTRACK_STAT_INC(found);
306 	CONNTRACK_STAT_INC(searched);
312 /* Find a connection corresponding to a tuple. */
/* Locked wrapper around __ip_conntrack_find() that takes a reference
 * on the found conntrack before releasing the read lock.  Caller must
 * balance with ip_conntrack_put(). */
313 struct ip_conntrack_tuple_hash *
314 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
315 		      const struct ip_conntrack *ignored_conntrack)
317 	struct ip_conntrack_tuple_hash *h;
319 	READ_LOCK(&ip_conntrack_lock);
320 	h = __ip_conntrack_find(tuple, ignored_conntrack);
322 		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
323 	READ_UNLOCK(&ip_conntrack_lock);
328 /* Confirm a connection given skb; places it in hash table */
/* Called when the first packet of a new connection is about to leave
 * the box: moves the ct from the unconfirmed list into both hash
 * buckets, arms the timeout timer relative to now, takes the hash
 * table's reference, and sets IPS_CONFIRMED.  If another CPU raced us
 * into the hash (either direction), the insert fails. */
330 __ip_conntrack_confirm(struct sk_buff **pskb)
332 	unsigned int hash, repl_hash;
333 	struct ip_conntrack *ct;
334 	enum ip_conntrack_info ctinfo;
336 	ct = ip_conntrack_get(*pskb, &ctinfo);
338 	/* ipt_REJECT uses ip_conntrack_attach to attach related
339 	   ICMP/TCP RST packets in other direction. Actual packet
340 	   which created connection will be IP_CT_NEW or for an
341 	   expected connection, IP_CT_RELATED. */
342 	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
345 	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
346 	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
348 	/* We're not in hash table, and we refuse to set up related
349 	   connections for unconfirmed conns. But packet copies and
350 	   REJECT will give spurious warnings here. */
351 	/* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
353 	/* No external references means noone else could have
355 	IP_NF_ASSERT(!is_confirmed(ct));
356 	DEBUGP("Confirming conntrack %p\n", ct);
358 	WRITE_LOCK(&ip_conntrack_lock);
360 	/* See if there's one in the list already, including reverse:
361 	   NAT could have grabbed it without realizing, since we're
362 	   not in the hash.  If there is, we lost race. */
363 	if (!LIST_FIND(&ip_conntrack_hash[hash],
365 		       struct ip_conntrack_tuple_hash *,
366 		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
367 	    && !LIST_FIND(&ip_conntrack_hash[repl_hash],
369 			  struct ip_conntrack_tuple_hash *,
370 			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
371 		/* Remove from unconfirmed list */
372 		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
374 		list_prepend(&ip_conntrack_hash[hash],
375 			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
376 		list_prepend(&ip_conntrack_hash[repl_hash],
377 			     &ct->tuplehash[IP_CT_DIR_REPLY]);
378 		/* Timer relative to confirmation time, not original
379 		   setting time, otherwise we'd get timer wrap in
380 		   weird delay cases. */
381 		ct->timeout.expires += jiffies;
382 		add_timer(&ct->timeout);
383 		atomic_inc(&ct->ct_general.use);
384 		set_bit(IPS_CONFIRMED_BIT, &ct->status);
385 		CONNTRACK_STAT_INC(insert);
386 		WRITE_UNLOCK(&ip_conntrack_lock);
390 	CONNTRACK_STAT_INC(insert_failed);
391 	WRITE_UNLOCK(&ip_conntrack_lock);
396 /* Returns true if a connection correspondings to the tuple (required
/* Used e.g. by NAT to check whether a candidate tuple is already in
 * use.  No reference is kept across the return. */
399 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
400 			 const struct ip_conntrack *ignored_conntrack)
402 	struct ip_conntrack_tuple_hash *h;
404 	READ_LOCK(&ip_conntrack_lock);
405 	h = __ip_conntrack_find(tuple, ignored_conntrack);
406 	READ_UNLOCK(&ip_conntrack_lock);
411 /* There's a small race here where we may free a just-assured
412    connection.  Too bad: we're in trouble anyway. */
/* Eviction predicate for early_drop(): true for entries that never
 * became ASSURED, i.e. the cheapest ones to sacrifice. */
413 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
415 	return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
/* Table-full pressure valve: pick the oldest unassured entry in
 * @chain, kill its timer and tear it down immediately. */
418 static int early_drop(struct list_head *chain)
420 	/* Traverse backwards: gives us oldest, which is roughly LRU */
421 	struct ip_conntrack_tuple_hash *h;
422 	struct ip_conntrack *ct = NULL;
425 	READ_LOCK(&ip_conntrack_lock);
426 	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
428 		ct = tuplehash_to_ctrack(h);
429 		atomic_inc(&ct->ct_general.use);
431 	READ_UNLOCK(&ip_conntrack_lock);
/* del_timer() success == we beat the normal expiry path to it. */
436 	if (del_timer(&ct->timeout)) {
437 		death_by_timeout((unsigned long)ct);
439 		CONNTRACK_STAT_INC(early_drop);
441 	ip_conntrack_put(ct);
/* Helper-list predicate: does @rtuple fall within helper @i's
 * tuple/mask registration? */
445 static inline int helper_cmp(const struct ip_conntrack_helper *i,
446 			     const struct ip_conntrack_tuple *rtuple)
448 	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
/* Find the first registered helper matching @tuple, or NULL.
 * Caller must hold ip_conntrack_lock. */
451 static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
453 	return LIST_FIND(&helpers, helper_cmp,
454 			 struct ip_conntrack_helper *,
458 /* Allocate a new conntrack: we return -ENOMEM if classification
459    failed due to stress.  Otherwise it really is unclassifiable. */
460 static struct ip_conntrack_tuple_hash *
461 init_conntrack(const struct ip_conntrack_tuple *tuple,
462 	       struct ip_conntrack_protocol *protocol,
465 	struct ip_conntrack *conntrack;
466 	struct ip_conntrack_tuple repl_tuple;
468 	struct ip_conntrack_expect *exp;
/* Lazy seeding of the hash so early boot doesn't drain entropy. */
470 	if (!ip_conntrack_hash_rnd_initted) {
471 		get_random_bytes(&ip_conntrack_hash_rnd, 4);
472 		ip_conntrack_hash_rnd_initted = 1;
475 	hash = hash_conntrack(tuple);
/* Over the global limit: try to evict an unassured entry from this
 * bucket before giving up with -ENOMEM. */
478 	    && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
479 		/* Try dropping from this hash chain. */
480 		if (!early_drop(&ip_conntrack_hash[hash])) {
483 				"ip_conntrack: table full, dropping"
485 			return ERR_PTR(-ENOMEM);
489 	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
490 		DEBUGP("Can't invert tuple.\n");
494 	conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
496 		DEBUGP("Can't allocate conntrack.\n");
497 		return ERR_PTR(-ENOMEM);
500 	memset(conntrack, 0, sizeof(*conntrack));
501 	atomic_set(&conntrack->ct_general.use, 1);
502 	conntrack->ct_general.destroy = destroy_conntrack;
503 	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
504 	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
505 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
506 	conntrack->xid[IP_CT_DIR_ORIGINAL] = -1;
507 	conntrack->xid[IP_CT_DIR_REPLY] = -1;
/* Let the l4 tracker initialise its private state; veto => unclassifiable. */
509 	if (!protocol->new(conntrack, skb)) {
510 		kmem_cache_free(ip_conntrack_cachep, conntrack);
513 	/* Don't set timer yet: wait for confirmation */
514 	init_timer(&conntrack->timeout);
515 	conntrack->timeout.data = (unsigned long)conntrack;
516 	conntrack->timeout.function = death_by_timeout;
518 	WRITE_LOCK(&ip_conntrack_lock);
519 	exp = find_expectation(tuple);
522 		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
524 		/* Welcome, Mr. Bond.  We've been expecting you... */
525 		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
526 		conntrack->master = exp->master;
527 #if CONFIG_IP_NF_CONNTRACK_MARK
528 		conntrack->mark = exp->master->mark;
/* Hold the master alive for as long as this child exists
 * (released in destroy_conntrack). */
530 		nf_conntrack_get(&conntrack->master->ct_general);
531 		CONNTRACK_STAT_INC(expect_new);
533 		conntrack->helper = ip_ct_find_helper(&repl_tuple);
535 		CONNTRACK_STAT_INC(new);
538 	/* Overload tuple linked list to put us in unconfirmed list. */
539 	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
541 	atomic_inc(&ip_conntrack_count);
542 	WRITE_UNLOCK(&ip_conntrack_lock);
/* Run the expectation's callback outside the lock. */
546 		exp->expectfn(conntrack, exp);
550 	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
553 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
/* Map a packet to its conntrack: build the tuple, look it up (creating
 * via init_conntrack() on miss), classify the packet as
 * ESTABLISHED / RELATED / NEW (+IS_REPLY for reply direction), and
 * attach the result to the skb. */
554 static inline struct ip_conntrack *
555 resolve_normal_ct(struct sk_buff *skb,
556 		  struct ip_conntrack_protocol *proto,
558 		  unsigned int hooknum,
559 		  enum ip_conntrack_info *ctinfo)
561 	struct ip_conntrack_tuple tuple;
562 	struct ip_conntrack_tuple_hash *h;
563 	struct ip_conntrack *ct;
565 	IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
567 	if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
571 	/* look for tuple match */
572 	h = ip_conntrack_find_get(&tuple, NULL);
574 		h = init_conntrack(&tuple, proto, skb);
580 	ct = tuplehash_to_ctrack(h);
582 	/* It exists; we have (non-exclusive) reference. */
583 	if (DIRECTION(h) == IP_CT_DIR_REPLY) {
584 		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
585 		/* Please set reply bit if this packet OK */
588 		/* Once we've had two way comms, always ESTABLISHED. */
589 		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
590 			DEBUGP("ip_conntrack_in: normal packet for %p\n",
592 			*ctinfo = IP_CT_ESTABLISHED;
593 		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
594 			DEBUGP("ip_conntrack_in: related packet for %p\n",
596 			*ctinfo = IP_CT_RELATED;
598 			DEBUGP("ip_conntrack_in: new packet for %p\n",
/* Reference from find_get/init is handed to the skb here. */
604 	skb->nfct = &ct->ct_general;
605 	skb->nfctinfo = *ctinfo;
609 /* Netfilter hook itself. */
/* Entry point for every IP packet traversing the tracked hooks:
 * skips already-tracked/untracked skbs and broadcasts, dispatches to
 * the l4 protocol's error() and packet() handlers, and records the
 * SEEN_REPLY transition.  Returns a netfilter verdict. */
610 unsigned int ip_conntrack_in(unsigned int hooknum,
611 			     struct sk_buff **pskb,
612 			     const struct net_device *in,
613 			     const struct net_device *out,
614 			     int (*okfn)(struct sk_buff *))
616 	struct ip_conntrack *ct;
617 	enum ip_conntrack_info ctinfo;
618 	struct ip_conntrack_protocol *proto;
622 	/* Previously seen (loopback or untracked)?  Ignore. */
624 		CONNTRACK_STAT_INC(ignore);
/* Never happen: defragmentation runs before this hook. */
629 	if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
630 		if (net_ratelimit()) {
631 		printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
632 		       (*pskb)->nh.iph->protocol, hooknum);
637 	/* FIXME: Do this right please. --RR */
638 	(*pskb)->nfcache |= NFC_UNKNOWN;
640 /* Doesn't cover locally-generated broadcast, so not worth it. */
642 	/* Ignore broadcast: no `connection'. */
643 	if ((*pskb)->pkt_type == PACKET_BROADCAST) {
644 		printk("Broadcast packet!\n");
646 	} else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
647 		   == htonl(0x000000FF)) {
648 		printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
649 		       NIPQUAD((*pskb)->nh.iph->saddr),
650 		       NIPQUAD((*pskb)->nh.iph->daddr),
651 		       (*pskb)->sk, (*pskb)->pkt_type);
655 	proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
657 	/* It may be an special packet, error, unclean...
658 	 * inverse of the return code tells to the netfilter
659 	 * core what to do with the packet. */
660 	if (proto->error != NULL
661 	    && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
662 		CONNTRACK_STAT_INC(error);
663 		CONNTRACK_STAT_INC(invalid);
667 	if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
668 		/* Not valid part of a connection */
669 		CONNTRACK_STAT_INC(invalid);
674 		/* Too stressed to deal. */
675 		CONNTRACK_STAT_INC(drop);
679 	IP_NF_ASSERT((*pskb)->nfct);
681 	ret = proto->packet(ct, *pskb, ctinfo);
683 		/* Invalid: inverse of the return code tells
684 		 * the netfilter core what to do*/
685 		nf_conntrack_put((*pskb)->nfct);
686 		(*pskb)->nfct = NULL;
687 		CONNTRACK_STAT_INC(invalid);
/* First packet seen in the reply direction: mark two-way traffic. */
692 		set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
/* Convenience wrapper: invert @orig using whatever protocol handler is
 * registered for its protocol number. */
697 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
698 		   const struct ip_conntrack_tuple *orig)
700 	return ip_ct_invert_tuple(inverse, orig,
701 				  ip_ct_find_proto(orig->dst.protonum));
704 /* Would two expected things clash? */
705 static inline int expect_clash(const struct ip_conntrack_expect *a,
706 			       const struct ip_conntrack_expect *b)
708 	/* Part covered by intersection of masks must be unequal,
709            otherwise they clash */
/* Build the AND of both masks field by field, then compare the two
 * tuples under that combined mask. */
710 	struct ip_conntrack_tuple intersect_mask
711 		= { { a->mask.src.ip & b->mask.src.ip,
712 		      { a->mask.src.u.all & b->mask.src.u.all } },
713 		    { a->mask.dst.ip & b->mask.dst.ip,
714 		      { a->mask.dst.u.all & b->mask.dst.u.all },
715 		      a->mask.dst.protonum & b->mask.dst.protonum } };
717 	return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
/* Exact-duplicate test: same master, same tuple, same mask. */
720 static inline int expect_matches(const struct ip_conntrack_expect *a,
721 				 const struct ip_conntrack_expect *b)
723 	return a->master == b->master
724 		&& ip_ct_tuple_equal(&a->tuple, &b->tuple)
725 		&& ip_ct_tuple_equal(&a->mask, &b->mask);
728 /* Generally a bad idea to call this: could have matched already. */
/* Cancel a previously registered expectation matching @exp exactly;
 * only reaps it if its timer can still be stopped. */
729 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
731 	struct ip_conntrack_expect *i;
733 	WRITE_LOCK(&ip_conntrack_lock);
734 	/* choose the oldest expectation to evict */
735 	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
736 		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
738 			WRITE_UNLOCK(&ip_conntrack_lock);
743 	WRITE_UNLOCK(&ip_conntrack_lock);
/* Allocate an expectation from the slab (GFP_ATOMIC: callable from
 * packet context).  NOTE(review): NULL-check/return lines are not
 * visible in this extract. */
746 struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
748 	struct ip_conntrack_expect *new;
750 	new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
752 		DEBUGP("expect_related: OOM allocating expect\n");
/* Return an unused (never-inserted) expectation to the slab. */
759 void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
761 	kmem_cache_free(ip_conntrack_expect_cachep, expect);
/* Link @exp into the global list, pin its master conntrack, and arm
 * the expiry timer if the helper declares a timeout (0 = no timer).
 * Caller holds the write lock. */
764 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
766 	atomic_inc(&exp->master->ct_general.use);
767 	exp->master->expecting++;
768 	list_add(&exp->list, &ip_conntrack_expect_list);
770 	if (exp->master->helper->timeout) {
771 		init_timer(&exp->timeout);
772 		exp->timeout.data = (unsigned long)exp;
773 		exp->timeout.function = expectation_timed_out;
775 			= jiffies + exp->master->helper->timeout * HZ;
776 		add_timer(&exp->timeout);
778 		exp->timeout.function = NULL;
780 	CONNTRACK_STAT_INC(expect_create);
783 /* Race with expectations being used means we could have none to find; OK. */
/* Walk the list newest-to-oldest is reversed here, so the first match
 * from the tail is the oldest expectation of @master; evict it if its
 * timer can be stopped. */
784 static void evict_oldest_expect(struct ip_conntrack *master)
786 	struct ip_conntrack_expect *i;
788 	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
789 		if (i->master == master) {
790 			if (del_timer(&i->timeout)) {
/* Re-arm an expectation's timer from its master's helper timeout.
 * Fails (returns early) if the timer already fired — the expectation
 * is dying and must not be resurrected. */
799 static inline int refresh_timer(struct ip_conntrack_expect *i)
801 	if (!del_timer(&i->timeout))
804 	i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
805 	add_timer(&i->timeout);
/* Register @expect on behalf of its master connection.  Exact
 * duplicates just refresh the existing timer (and the caller's copy is
 * freed); clashes with a different registration are rejected.  If the
 * helper's max_expected limit would be exceeded, the master's oldest
 * expectation is evicted first. */
809 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
811 	struct ip_conntrack_expect *i;
814 	DEBUGP("ip_conntrack_expect_related %p\n", related_to);
815 	DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
816 	DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
818 	WRITE_LOCK(&ip_conntrack_lock);
819 	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
820 		if (expect_matches(i, expect)) {
821 			/* Refresh timer: if it's dying, ignore.. */
822 			if (refresh_timer(i)) {
824 				/* We don't need the one they've given us. */
825 				ip_conntrack_expect_free(expect);
828 		} else if (expect_clash(i, expect)) {
834 	/* Will be over limit? */
835 	if (expect->master->helper->max_expected &&
836 	    expect->master->expecting >= expect->master->helper->max_expected)
837 		evict_oldest_expect(expect->master);
839 	ip_conntrack_expect_insert(expect);
842 	WRITE_UNLOCK(&ip_conntrack_lock);
846 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
847    implicitly racy: see __ip_conntrack_confirm */
848 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
849 			      const struct ip_conntrack_tuple *newreply)
851 	WRITE_LOCK(&ip_conntrack_lock);
852 	/* Should be unconfirmed, so not in hash table yet */
853 	IP_NF_ASSERT(!is_confirmed(conntrack));
855 	DEBUGP("Altering reply tuple of %p to ", conntrack);
856 	DUMP_TUPLE(newreply);
858 	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
/* Re-pick the helper only for standalone connections: expected
 * children and masters with pending expectations keep theirs. */
859 	if (!conntrack->master && conntrack->expecting == 0)
860 		conntrack->helper = ip_ct_find_helper(newreply);
861 	WRITE_UNLOCK(&ip_conntrack_lock);
/* Add @me to the helper list.  A zero timeout is a bug: expectation
 * insertion relies on it (see ip_conntrack_expect_insert). */
864 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
866 	BUG_ON(me->timeout == 0);
867 	WRITE_LOCK(&ip_conntrack_lock);
868 	list_prepend(&helpers, me);
869 	WRITE_UNLOCK(&ip_conntrack_lock);
/* Per-entry callback for helper unregistration: detach @me from any
 * conntrack still pointing at it. */
874 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
875 			 const struct ip_conntrack_helper *me)
877 	if (tuplehash_to_ctrack(i)->helper == me)
878 		tuplehash_to_ctrack(i)->helper = NULL;
/* Remove helper @me: drop it from the helper list, kill every
 * expectation it created, and NULL out its pointer in all conntracks
 * (both the hash table and the unconfirmed list). */
882 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
885 	struct ip_conntrack_expect *exp, *tmp;
887 	/* Need write lock here, to delete helper. */
888 	WRITE_LOCK(&ip_conntrack_lock);
889 	LIST_DELETE(&helpers, me);
891 	/* Get rid of expectations */
892 	list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
893 		if (exp->master->helper == me && del_timer(&exp->timeout)) {
898 	/* Get rid of expecteds, set helpers to NULL. */
899 	LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
900 	for (i = 0; i < ip_conntrack_htable_size; i++)
901 		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
902 			    struct ip_conntrack_tuple_hash *, me);
903 	WRITE_UNLOCK(&ip_conntrack_lock);
905 	/* Someone could be still looking at the helper in a bh. */
/* Accounting helper: bump per-direction packet/byte counters for @ct.
 * Compiles to nothing unless CONFIG_IP_NF_CT_ACCT is set. */
909 static inline void ct_add_counters(struct ip_conntrack *ct,
910 				   enum ip_conntrack_info ctinfo,
911 				   const struct sk_buff *skb)
913 #ifdef CONFIG_IP_NF_CT_ACCT
915 		ct->counters[CTINFO2DIR(ctinfo)].packets++;
916 		ct->counters[CTINFO2DIR(ctinfo)].bytes += 
917 					ntohs(skb->nh.iph->tot_len);
922 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
/* For unconfirmed entries the timer is not armed yet, so just store
 * the relative expiry (__ip_conntrack_confirm adds jiffies later).
 * For confirmed entries, re-arm under the write lock; del_timer()
 * failure means the entry is already dying — leave it alone. */
923 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
924 		        enum ip_conntrack_info ctinfo,
925 			const struct sk_buff *skb,
926 			unsigned long extra_jiffies)
928 	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
930 	/* If not in hash table, timer will not be active yet */
931 	if (!is_confirmed(ct)) {
932 		ct->timeout.expires = extra_jiffies;
933 		ct_add_counters(ct, ctinfo, skb);
935 		WRITE_LOCK(&ip_conntrack_lock);
936 		/* Need del_timer for race avoidance (may already be dying). */
937 		if (del_timer(&ct->timeout)) {
938 			ct->timeout.expires = jiffies + extra_jiffies;
939 			add_timer(&ct->timeout);
941 		ct_add_counters(ct, ctinfo, skb);
942 		WRITE_UNLOCK(&ip_conntrack_lock);
946 /* Returns new sk_buff, or NULL */
/* Reassemble a fragmented skb via ip_defrag(), recompute the IP
 * checksum on the result, and preserve the nf_debug state across the
 * defragmentation. */
948 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
950 #ifdef CONFIG_NETFILTER_DEBUG
951 	unsigned int olddebug = skb->nf_debug;
957 	skb = ip_defrag(skb, user);
961 		ip_send_check(skb->nh.iph);
962 		skb->nfcache |= NFC_ALTERED;
963 #ifdef CONFIG_NETFILTER_DEBUG
964 		/* Packet path as if nothing had happened. */
965 		skb->nf_debug = olddebug;
972 /* Used by ipt_REJECT. */
973 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
975 struct ip_conntrack *ct;
976 enum ip_conntrack_info ctinfo;
978 /* This ICMP is in reverse direction to the packet which caused it */
979 ct = ip_conntrack_get(skb, &ctinfo);
981 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
982 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
984 ctinfo = IP_CT_RELATED;
986 /* Attach to new skbuff, and increment count */
987 nskb->nfct = &ct->ct_general;
988 nskb->nfctinfo = ctinfo;
989 nf_conntrack_get(nskb->nfct);
/* Adapter: convert a tuplehash-based LIST_FIND callback into the
 * conntrack-based @iter used by ip_ct_iterate_cleanup(). */
993 do_iter(const struct ip_conntrack_tuple_hash *i,
994 	int (*iter)(struct ip_conntrack *i, void *data),
997 	return iter(tuplehash_to_ctrack(i), data);
1000 /* Bring out ya dead! */
/* Scan buckets starting at *bucket (then the unconfirmed list) for the
 * first conntrack that @iter selects; take a reference and return it.
 * *bucket is advanced so repeated calls resume where they left off. */
1001 static struct ip_conntrack_tuple_hash *
1002 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1003 		void *data, unsigned int *bucket)
1005 	struct ip_conntrack_tuple_hash *h = NULL;
1007 	WRITE_LOCK(&ip_conntrack_lock);
1008 	for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1009 		h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1010 				struct ip_conntrack_tuple_hash *, iter, data);
1015 		h = LIST_FIND_W(&unconfirmed, do_iter,
1016 				struct ip_conntrack_tuple_hash *, iter, data);
1018 		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1019 	WRITE_UNLOCK(&ip_conntrack_lock);
/* Kill every conntrack selected by @iter: stop its timer (if we win
 * the race) and run the normal death path, then drop our reference. */
1025 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1027 	struct ip_conntrack_tuple_hash *h;
1028 	unsigned int bucket = 0;
1030 	while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1031 		struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1032 		/* Time to push up daises... */
1033 		if (del_timer(&ct->timeout))
1034 			death_by_timeout((unsigned long)ct);
1035 		/* ... else the timer will get him soon. */
1037 		ip_conntrack_put(ct);
1041 /* Fast function for those who don't want to parse /proc (and I don't
1043 /* Reversing the socket's dst/src point of view gives us the reply
/* SO_ORIGINAL_DST getsockopt implementation: build the reply-direction
 * tuple from the TCP socket's endpoints, look it up, and copy the
 * original (pre-NAT) destination out to userspace as a sockaddr_in. */
1046 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1048 	struct inet_sock *inet = inet_sk(sk);
1049 	struct ip_conntrack_tuple_hash *h;
1050 	struct ip_conntrack_tuple tuple;
1052 	IP_CT_TUPLE_U_BLANK(&tuple);
1053 	tuple.src.ip = inet->rcv_saddr;
1054 	tuple.src.u.tcp.port = inet->sport;
1055 	tuple.dst.ip = inet->daddr;
1056 	tuple.dst.u.tcp.port = inet->dport;
1057 	tuple.dst.protonum = IPPROTO_TCP;
1059 	/* We only do TCP at the moment: is there a better way? */
1060 	if (strcmp(sk->sk_prot->name, "TCP")) {
1061 		DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1062 		return -ENOPROTOOPT;
1065 	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1066 		DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1067 		       *len, sizeof(struct sockaddr_in));
1071 	h = ip_conntrack_find_get(&tuple, NULL);
1073 		struct sockaddr_in sin;
1074 		struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1076 		sin.sin_family = AF_INET;
1077 		sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1078 			.tuple.dst.u.tcp.port;
1079 		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1082 		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1083 		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
/* find_get took a reference; release it before copying out. */
1084 		ip_conntrack_put(ct);
1085 		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1090 	DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1091 	       NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1092 	       NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
/* Registration record for the SO_ORIGINAL_DST getsockopt handler. */
1096 static struct nf_sockopt_ops so_getorigdst = {
1098 	.get_optmin	= SO_ORIGINAL_DST,
1099 	.get_optmax	= SO_ORIGINAL_DST+1,
/* Iterator predicate that selects every conntrack — used by
 * ip_conntrack_cleanup() to flush the whole table.
 * NOTE(review): the body (expected to be "return 1;") is not visible
 * in this extract. */
1103 static int kill_all(struct ip_conntrack *i, void *data)
/* Release the hash table, using the allocator that created it
 * (vmalloc vs. page allocator — see ip_conntrack_vmalloc). */
1108 static void free_conntrack_hash(void)
1110 	if (ip_conntrack_vmalloc)
1111 		vfree(ip_conntrack_hash);
1113 		free_pages((unsigned long)ip_conntrack_hash, 
1114 			   get_order(sizeof(struct list_head)
1115 				     * ip_conntrack_htable_size));
1118 /* Mishearing the voices in his head, our hero wonders how he's
1119    supposed to kill the mall. */
/* Module teardown: detach the REJECT hook, repeatedly flush the table
 * until every entry (including those pinned by in-flight packets) is
 * gone, then destroy the caches, the hash table, and the sockopt. */
1120 void ip_conntrack_cleanup(void)
1122 	ip_ct_attach = NULL;
1123 	/* This makes sure all current packets have passed through
1124            netfilter framework.  Roll on, two-stage module
1129 	ip_ct_iterate_cleanup(kill_all, NULL);
/* Entries still referenced elsewhere survive one pass; retry. */
1130 	if (atomic_read(&ip_conntrack_count) != 0) {
1132 		goto i_see_dead_people;
1135 	kmem_cache_destroy(ip_conntrack_cachep);
1136 	kmem_cache_destroy(ip_conntrack_expect_cachep);
1137 	free_conntrack_hash();
1138 	nf_unregister_sockopt(&so_getorigdst);
/* Optional module parameter overriding the auto-sized hash table
 * (0400: readable by root only, not writable after load). */
1141 static int hashsize;
1142 module_param(hashsize, int, 0400);
1144 int __init ip_conntrack_init(void)
1149 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1150 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1152 ip_conntrack_htable_size = hashsize;
1154 ip_conntrack_htable_size
1155 = (((num_physpages << PAGE_SHIFT) / 16384)
1156 / sizeof(struct list_head));
1157 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1158 ip_conntrack_htable_size = 8192;
1159 if (ip_conntrack_htable_size < 16)
1160 ip_conntrack_htable_size = 16;
1162 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1164 printk("ip_conntrack version %s (%u buckets, %d max)"
1165 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1166 ip_conntrack_htable_size, ip_conntrack_max,
1167 sizeof(struct ip_conntrack));
1169 ret = nf_register_sockopt(&so_getorigdst);
1171 printk(KERN_ERR "Unable to register netfilter socket option\n");
1175 /* AK: the hash table is twice as big than needed because it
1176 uses list_head. it would be much nicer to caches to use a
1177 single pointer list head here. */
1178 ip_conntrack_vmalloc = 0;
1180 =(void*)__get_free_pages(GFP_KERNEL,
1181 get_order(sizeof(struct list_head)
1182 *ip_conntrack_htable_size));
1183 if (!ip_conntrack_hash) {
1184 ip_conntrack_vmalloc = 1;
1185 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1186 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1187 * ip_conntrack_htable_size);
1189 if (!ip_conntrack_hash) {
1190 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1191 goto err_unreg_sockopt;
1194 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1195 sizeof(struct ip_conntrack), 0,
1197 if (!ip_conntrack_cachep) {
1198 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1202 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1203 sizeof(struct ip_conntrack_expect),
1205 if (!ip_conntrack_expect_cachep) {
1206 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1207 goto err_free_conntrack_slab;
1210 /* Don't NEED lock here, but good form anyway. */
1211 WRITE_LOCK(&ip_conntrack_lock);
1212 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1213 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1214 /* Sew in builtin protocols. */
1215 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1216 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1217 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1218 WRITE_UNLOCK(&ip_conntrack_lock);
1220 for (i = 0; i < ip_conntrack_htable_size; i++)
1221 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1223 /* For use by ipt_REJECT */
1224 ip_ct_attach = ip_conntrack_attach;
1226 /* Set up fake conntrack:
1227 - to never be deleted, not in any hashes */
1228 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1229 /* - and look it like as a confirmed connection */
1230 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1234 err_free_conntrack_slab:
1235 kmem_cache_destroy(ip_conntrack_cachep);
1237 free_conntrack_hash();
1239 nf_unregister_sockopt(&so_getorigdst);