1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
41 /* This rwlock protects the main hash table, protocol/helper/expected
42 registrations, conntrack timers*/
43 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
44 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
50 #include <linux/netfilter_ipv4/listhelp.h>
52 #define IP_CONNTRACK_VERSION "2.1"
57 #define DEBUGP(format, args...)
/* Global conntrack state: the main rwlock, entry counter, expectation
   list, per-protocol handler table, hash table and slab caches. */
60 DECLARE_RWLOCK(ip_conntrack_lock);
62 /* ip_conntrack_standalone needs this */
63 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
/* Optional hook invoked when a conntrack entry is destroyed (set by NAT). */
65 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
66 LIST_HEAD(ip_conntrack_expect_list);
67 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
68 static LIST_HEAD(helpers);
/* Bucket count; 0 until computed (or taken from "hashsize" param) in init. */
69 unsigned int ip_conntrack_htable_size = 0;
71 struct list_head *ip_conntrack_hash;
72 static kmem_cache_t *ip_conntrack_cachep;
73 static kmem_cache_t *ip_conntrack_expect_cachep;
/* Fake conntrack entry: never deleted, not in any hash (see init). */
74 struct ip_conntrack ip_conntrack_untracked;
75 unsigned int ip_ct_log_invalid;
/* Conntracks allocated but not yet confirmed into the hash table. */
76 static LIST_HEAD(unconfirmed);
/* Nonzero if the hash table came from vmalloc rather than free pages. */
77 static int ip_conntrack_vmalloc;
79 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
/* Drop one reference on a conntrack entry; thin wrapper around
   nf_conntrack_put() on the embedded nf_conntrack. */
82 ip_conntrack_put(struct ip_conntrack *ct)
85 nf_conntrack_put(&ct->ct_general);
/* Random key for the tuple hash, seeded lazily on first conntrack alloc
   (see init_conntrack). */
88 static int ip_conntrack_hash_rnd_initted;
89 static unsigned int ip_conntrack_hash_rnd;
/* Map a tuple to a hash bucket index: jhash over src ip, dst ip/proto
   and ports, keyed by ip_conntrack_hash_rnd. */
92 hash_conntrack(const struct ip_conntrack_tuple *tuple)
97 return (jhash_3words(tuple->src.ip,
98 (tuple->dst.ip ^ tuple->dst.protonum),
99 (tuple->src.u.all | (tuple->dst.u.all << 16)),
100 ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
/* Build an ORIGINAL-direction tuple from an IP header; the protocol
   handler fills in the layer-4 part. Non-head fragments are rejected
   (they carry no L4 header to parse). */
104 ip_ct_get_tuple(const struct iphdr *iph,
105 const struct sk_buff *skb,
106 unsigned int dataoff,
107 struct ip_conntrack_tuple *tuple,
108 const struct ip_conntrack_protocol *protocol)
/* Never happen zero offset here: caller defragments first. */
111 if (iph->frag_off & htons(IP_OFFSET)) {
112 printk("ip_conntrack_core: Frag of proto %u.\n",
117 tuple->src.ip = iph->saddr;
118 tuple->dst.ip = iph->daddr;
119 tuple->dst.protonum = iph->protocol;
120 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
122 return protocol->pkt_to_tuple(skb, dataoff, tuple);
/* Produce the REPLY-direction tuple for a given tuple: swap addresses,
   keep the protocol, flip the direction; protocol handler inverts the
   layer-4 part. */
126 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
127 const struct ip_conntrack_tuple *orig,
128 const struct ip_conntrack_protocol *protocol)
130 inverse->src.ip = orig->dst.ip;
131 inverse->dst.ip = orig->src.ip;
132 inverse->dst.protonum = orig->dst.protonum;
133 inverse->dst.dir = !orig->dst.dir;
135 return protocol->invert_tuple(inverse, orig);
139 /* ip_conntrack_expect helper functions */
/* Free an expectation and drop its reference on the master conntrack.
   Caller must already have detached it (timer stopped, unlinked). */
140 static void destroy_expect(struct ip_conntrack_expect *exp)
142 ip_conntrack_put(exp->master);
143 IP_NF_ASSERT(!timer_pending(&exp->timeout));
144 kmem_cache_free(ip_conntrack_expect_cachep, exp);
145 CONNTRACK_STAT_INC(expect_delete);
/* Unlink an expectation from the global list and decrement the
   master's expectation count. Requires ip_conntrack_lock held. */
148 static void unlink_expect(struct ip_conntrack_expect *exp)
150 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
151 list_del(&exp->list);
152 /* Logically in destroy_expect, but we hold the lock here. */
153 exp->master->expecting--;
/* Timer callback: expectation expired without being matched; unlink
   it under the write lock (and free it — tail elided in this view). */
156 static void expectation_timed_out(unsigned long ul_expect)
158 struct ip_conntrack_expect *exp = (void *)ul_expect;
160 WRITE_LOCK(&ip_conntrack_lock);
162 WRITE_UNLOCK(&ip_conntrack_lock);
166 /* If an expectation for this connection is found, it gets deleted from
167 * global list then returned. Caller must hold ip_conntrack_lock. */
168 struct ip_conntrack_expect *
169 __ip_conntrack_exp_find(const struct ip_conntrack_tuple *tuple)
171 struct ip_conntrack_expect *i;
173 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
174 /* If master is not in hash table yet (ie. packet hasn't left
175 this machine yet), how can other end know about expected?
176 Hence these are not the droids you are looking for (if
177 master ct never got confirmed, we'd hold a reference to it
178 and weird things would happen to future packets). */
179 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
180 && is_confirmed(i->master)
/* del_timer() success means we won the race against expiry. */
181 && del_timer(&i->timeout)) {
189 /* delete all expectations for this conntrack (lock held by caller) */
190 static void remove_expectations(struct ip_conntrack *ct)
192 struct ip_conntrack_expect *i, *tmp;
194 /* Optimization: most connections never expect any others. */
195 if (ct->expecting == 0)
198 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
/* Only reap entries whose timer we successfully cancel; a lost
   del_timer race means the timeout handler owns the entry. */
199 if (i->master == ct && del_timer(&i->timeout)) {
/* Remove a confirmed conntrack from both hash chains (original and
   reply direction) and kill its pending expectations.
   Requires ip_conntrack_lock write-held. */
207 clean_from_lists(struct ip_conntrack *ct)
211 DEBUGP("clean_from_lists(%p)\n", ct);
212 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
214 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
215 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
216 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
217 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
219 /* Destroy all pending expectations */
220 remove_expectations(ct);
/* Final destructor, invoked when the last reference is dropped
   (installed as ct_general.destroy in init_conntrack): lets the
   protocol and NAT layers clean up, unlinks from the unconfirmed
   list if needed, then returns the entry to the slab. */
224 destroy_conntrack(struct nf_conntrack *nfct)
226 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
227 struct ip_conntrack_protocol *proto;
229 DEBUGP("destroy_conntrack(%p)\n", ct);
230 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
231 IP_NF_ASSERT(!timer_pending(&ct->timeout));
233 /* To make sure we don't get any weird locking issues here:
234 * destroy_conntrack() MUST NOT be called with a write lock
235 * to ip_conntrack_lock!!! -HW */
236 proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
237 if (proto && proto->destroy)
/* Give the NAT layer (or other registrant) its cleanup callback. */
240 if (ip_conntrack_destroyed)
241 ip_conntrack_destroyed(ct);
243 WRITE_LOCK(&ip_conntrack_lock);
244 /* Expectations will have been removed in clean_from_lists,
245 * except TFTP can create an expectation on the first packet,
246 * before connection is in the list, so we need to clean here,
248 remove_expectations(ct);
250 /* We overload first tuple to link into unconfirmed list. */
251 if (!is_confirmed(ct)) {
252 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
253 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
256 CONNTRACK_STAT_INC(delete);
257 WRITE_UNLOCK(&ip_conntrack_lock);
/* Related connections hold a reference on their master; release it. */
260 ip_conntrack_put(ct->master);
262 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
263 kmem_cache_free(ip_conntrack_cachep, ct);
264 atomic_dec(&ip_conntrack_count);
/* Timer callback: conntrack timed out. Unhash it and drop the hash
   table's reference (which may trigger destroy_conntrack). */
267 static void death_by_timeout(unsigned long ul_conntrack)
269 struct ip_conntrack *ct = (void *)ul_conntrack;
271 WRITE_LOCK(&ip_conntrack_lock);
272 /* Inside lock so preempt is disabled on module removal path.
273 * Otherwise we can get spurious warnings. */
274 CONNTRACK_STAT_INC(delete_list);
275 clean_from_lists(ct);
276 WRITE_UNLOCK(&ip_conntrack_lock);
277 ip_conntrack_put(ct);
/* Hash-chain match predicate: tuple equality, optionally skipping one
   conntrack (used by NAT to ignore the entry being set up). */
281 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
282 const struct ip_conntrack_tuple *tuple,
283 const struct ip_conntrack *ignored_conntrack)
285 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
286 return tuplehash_to_ctrack(i) != ignored_conntrack
287 && ip_ct_tuple_equal(tuple, &i->tuple);
/* Look up a tuple in the hash table. Caller must hold at least the
   read lock; no reference is taken (see ip_conntrack_find_get). */
290 struct ip_conntrack_tuple_hash *
291 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
292 const struct ip_conntrack *ignored_conntrack)
294 struct ip_conntrack_tuple_hash *h;
295 unsigned int hash = hash_conntrack(tuple);
297 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
298 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
299 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
300 CONNTRACK_STAT_INC(found);
/* Count every miss while walking the chain. */
303 CONNTRACK_STAT_INC(searched);
309 /* Find a connection corresponding to a tuple. On success the returned
   conntrack carries an extra reference the caller must put. */
310 struct ip_conntrack_tuple_hash *
311 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
312 const struct ip_conntrack *ignored_conntrack)
314 struct ip_conntrack_tuple_hash *h;
316 READ_LOCK(&ip_conntrack_lock);
317 h = __ip_conntrack_find(tuple, ignored_conntrack);
/* Take the reference while still under the lock so the entry
   cannot be freed between lookup and use. */
319 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
320 READ_UNLOCK(&ip_conntrack_lock);
325 /* Confirm a connection given skb; places it in hash table */
326 /* Called on the first packet leaving the box for this connection;
   moves the conntrack from the unconfirmed list into both hash
   chains, starts its timeout timer and takes the hash's reference. */
327 __ip_conntrack_confirm(struct sk_buff **pskb)
329 unsigned int hash, repl_hash;
330 struct ip_conntrack *ct;
331 enum ip_conntrack_info ctinfo;
333 ct = ip_conntrack_get(*pskb, &ctinfo);
335 /* ipt_REJECT uses ip_conntrack_attach to attach related
336 ICMP/TCP RST packets in other direction. Actual packet
337 which created connection will be IP_CT_NEW or for an
338 expected connection, IP_CT_RELATED. */
339 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
342 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
343 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
345 /* We're not in hash table, and we refuse to set up related
346 connections for unconfirmed conns. But packet copies and
347 REJECT will give spurious warnings here. */
348 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
350 /* No external references means noone else could have
352 IP_NF_ASSERT(!is_confirmed(ct));
353 DEBUGP("Confirming conntrack %p\n", ct);
355 WRITE_LOCK(&ip_conntrack_lock);
357 /* See if there's one in the list already, including reverse:
358 NAT could have grabbed it without realizing, since we're
359 not in the hash. If there is, we lost race. */
360 if (!LIST_FIND(&ip_conntrack_hash[hash],
362 struct ip_conntrack_tuple_hash *,
363 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
364 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
366 struct ip_conntrack_tuple_hash *,
367 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
368 /* Remove from unconfirmed list */
369 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
371 list_prepend(&ip_conntrack_hash[hash],
372 &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
373 list_prepend(&ip_conntrack_hash[repl_hash],
374 &ct->tuplehash[IP_CT_DIR_REPLY]);
375 /* Timer relative to confirmation time, not original
376 setting time, otherwise we'd get timer wrap in
377 weird delay cases. */
378 ct->timeout.expires += jiffies;
379 add_timer(&ct->timeout);
/* Extra reference now held by the hash table itself. */
380 atomic_inc(&ct->ct_general.use);
381 set_bit(IPS_CONFIRMED_BIT, &ct->status);
382 CONNTRACK_STAT_INC(insert);
383 WRITE_UNLOCK(&ip_conntrack_lock);
/* Lost the race: someone else inserted an equal tuple first. */
387 CONNTRACK_STAT_INC(insert_failed);
388 WRITE_UNLOCK(&ip_conntrack_lock);
393 /* Returns true if a connection corresponding to the tuple (required
   by NAT when picking a unique tuple); pure lookup, no reference kept. */
396 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
397 const struct ip_conntrack *ignored_conntrack)
399 struct ip_conntrack_tuple_hash *h;
401 READ_LOCK(&ip_conntrack_lock);
402 h = __ip_conntrack_find(tuple, ignored_conntrack);
403 READ_UNLOCK(&ip_conntrack_lock);
408 /* There's a small race here where we may free a just-assured
409 connection. Too bad: we're in trouble anyway. */
/* Predicate for early_drop: entry whose connection was never assured. */
410 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
412 return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
/* Table is full: pick the oldest non-assured entry in this chain and
   kill it to make room. Returns whether an entry was dropped
   (tail elided in this view — presumably via the dropped count). */
415 static int early_drop(struct list_head *chain)
417 /* Traverse backwards: gives us oldest, which is roughly LRU */
418 struct ip_conntrack_tuple_hash *h;
419 struct ip_conntrack *ct = NULL;
422 READ_LOCK(&ip_conntrack_lock);
423 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
425 ct = tuplehash_to_ctrack(h);
/* Hold a reference across the unlock so the victim can't vanish. */
426 atomic_inc(&ct->ct_general.use);
428 READ_UNLOCK(&ip_conntrack_lock);
/* Winning del_timer means we get to run the death path ourselves. */
433 if (del_timer(&ct->timeout)) {
434 death_by_timeout((unsigned long)ct);
436 CONNTRACK_STAT_INC(early_drop);
438 ip_conntrack_put(ct);
/* Match predicate: does this helper's tuple+mask cover the reply tuple? */
442 static inline int helper_cmp(const struct ip_conntrack_helper *i,
443 const struct ip_conntrack_tuple *rtuple)
445 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
/* Find the first registered helper matching a reply tuple, or NULL. */
448 static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
450 return LIST_FIND(&helpers, helper_cmp,
451 struct ip_conntrack_helper *,
455 /* Allocate a new conntrack: we return -ENOMEM if classification
456 failed due to stress. Otherwise it really is unclassifiable. */
/* Allocates and initializes a conntrack for a tuple with no existing
   entry: seeds the hash key on first use, applies the table-size limit
   (with early_drop as a safety valve), lets the protocol initialize,
   binds a matched expectation (master ref, mark, EXPECTED bit) or a
   helper, and links the entry onto the unconfirmed list. */
457 static struct ip_conntrack_tuple_hash *
458 init_conntrack(const struct ip_conntrack_tuple *tuple,
459 struct ip_conntrack_protocol *protocol,
462 struct ip_conntrack *conntrack;
463 struct ip_conntrack_tuple repl_tuple;
465 struct ip_conntrack_expect *exp;
467 if (!ip_conntrack_hash_rnd_initted) {
468 get_random_bytes(&ip_conntrack_hash_rnd, 4);
469 ip_conntrack_hash_rnd_initted = 1;
472 hash = hash_conntrack(tuple);
475 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
476 /* Try dropping from this hash chain. */
477 if (!early_drop(&ip_conntrack_hash[hash])) {
480 "ip_conntrack: table full, dropping"
482 return ERR_PTR(-ENOMEM);
486 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
487 DEBUGP("Can't invert tuple.\n");
491 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
493 DEBUGP("Can't allocate conntrack.\n");
494 return ERR_PTR(-ENOMEM);
497 memset(conntrack, 0, sizeof(*conntrack));
498 atomic_set(&conntrack->ct_general.use, 1);
499 conntrack->ct_general.destroy = destroy_conntrack;
500 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
501 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
502 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
503 conntrack->xid[IP_CT_DIR_ORIGINAL] = -1;
504 conntrack->xid[IP_CT_DIR_REPLY] = -1;
/* Protocol veto: e.g. TCP rejects packets that can't open a conn. */
506 if (!protocol->new(conntrack, skb)) {
507 kmem_cache_free(ip_conntrack_cachep, conntrack);
510 /* Don't set timer yet: wait for confirmation */
511 init_timer(&conntrack->timeout);
512 conntrack->timeout.data = (unsigned long)conntrack;
513 conntrack->timeout.function = death_by_timeout;
515 WRITE_LOCK(&ip_conntrack_lock);
516 exp = __ip_conntrack_exp_find(tuple);
519 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
521 /* Welcome, Mr. Bond. We've been expecting you... */
522 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
523 conntrack->master = exp->master;
/* Config macros are tested with #ifdef (they are defined-or-absent),
   matching the CONFIG_IP_NF_CT_ACCT test elsewhere in this file;
   plain #if warns under -Wundef when the option is off. */
524 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
525 conntrack->mark = exp->master->mark;
527 nf_conntrack_get(&conntrack->master->ct_general);
528 CONNTRACK_STAT_INC(expect_new);
530 conntrack->helper = ip_ct_find_helper(&repl_tuple);
532 CONNTRACK_STAT_INC(new);
535 /* Overload tuple linked list to put us in unconfirmed list. */
536 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
538 atomic_inc(&ip_conntrack_count);
539 WRITE_UNLOCK(&ip_conntrack_lock);
/* Run the expectation callback outside the lock. */
543 exp->expectfn(conntrack, exp);
547 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
550 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
/* Extract the tuple from the packet, find (or create via
   init_conntrack) the matching conntrack, classify the packet
   (NEW / ESTABLISHED / RELATED, plus REPLY direction) and attach
   the conntrack reference to the skb. */
551 static inline struct ip_conntrack *
552 resolve_normal_ct(struct sk_buff *skb,
553 struct ip_conntrack_protocol *proto,
555 unsigned int hooknum,
556 enum ip_conntrack_info *ctinfo)
558 struct ip_conntrack_tuple tuple;
559 struct ip_conntrack_tuple_hash *h;
560 struct ip_conntrack *ct;
562 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
564 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
568 /* look for tuple match */
569 h = ip_conntrack_find_get(&tuple, NULL);
571 h = init_conntrack(&tuple, proto, skb);
577 ct = tuplehash_to_ctrack(h);
579 /* It exists; we have (non-exclusive) reference. */
580 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
581 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
582 /* Please set reply bit if this packet OK */
585 /* Once we've had two way comms, always ESTABLISHED. */
586 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
587 DEBUGP("ip_conntrack_in: normal packet for %p\n",
589 *ctinfo = IP_CT_ESTABLISHED;
590 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
591 DEBUGP("ip_conntrack_in: related packet for %p\n",
593 *ctinfo = IP_CT_RELATED;
595 DEBUGP("ip_conntrack_in: new packet for %p\n",
/* Hand the reference we took in find_get/init to the skb. */
601 skb->nfct = &ct->ct_general;
602 skb->nfctinfo = *ctinfo;
606 /* Netfilter hook itself. */
/* Main conntrack entry point, run early on every incoming packet:
   skips already-tracked/untracked skbs and fragments/broadcasts,
   runs the protocol's error check, resolves the conntrack, then lets
   the protocol update connection state. Returns a netfilter verdict. */
607 unsigned int ip_conntrack_in(unsigned int hooknum,
608 struct sk_buff **pskb,
609 const struct net_device *in,
610 const struct net_device *out,
611 int (*okfn)(struct sk_buff *))
613 struct ip_conntrack *ct;
614 enum ip_conntrack_info ctinfo;
615 struct ip_conntrack_protocol *proto;
619 /* Previously seen (loopback or untracked)? Ignore. */
621 CONNTRACK_STAT_INC(ignore);
/* Non-head fragments should never reach us: defrag runs first. */
626 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
627 if (net_ratelimit()) {
628 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
629 (*pskb)->nh.iph->protocol, hooknum);
634 /* FIXME: Do this right please. --RR */
635 (*pskb)->nfcache |= NFC_UNKNOWN;
637 /* Doesn't cover locally-generated broadcast, so not worth it. */
639 /* Ignore broadcast: no `connection'. */
640 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
641 printk("Broadcast packet!\n");
643 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
644 == htonl(0x000000FF)) {
645 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
646 NIPQUAD((*pskb)->nh.iph->saddr),
647 NIPQUAD((*pskb)->nh.iph->daddr),
648 (*pskb)->sk, (*pskb)->pkt_type);
652 proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
654 /* It may be an special packet, error, unclean...
655 * inverse of the return code tells to the netfilter
656 * core what to do with the packet. */
657 if (proto->error != NULL
658 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
659 CONNTRACK_STAT_INC(error);
660 CONNTRACK_STAT_INC(invalid);
664 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
665 /* Not valid part of a connection */
666 CONNTRACK_STAT_INC(invalid);
671 /* Too stressed to deal. */
672 CONNTRACK_STAT_INC(drop);
676 IP_NF_ASSERT((*pskb)->nfct);
678 ret = proto->packet(ct, *pskb, ctinfo);
680 /* Invalid: inverse of the return code tells
681 * the netfilter core what to do*/
682 nf_conntrack_put((*pskb)->nfct);
683 (*pskb)->nfct = NULL;
684 CONNTRACK_STAT_INC(invalid);
/* First packet seen in the reply direction: mark two-way traffic. */
689 set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
/* Invert a tuple, looking up the protocol handler from the tuple
   itself ("pr" = protocol resolved internally). */
694 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
695 const struct ip_conntrack_tuple *orig)
697 return ip_ct_invert_tuple(inverse, orig,
698 ip_ct_find_proto(orig->dst.protonum));
701 /* Would two expected things clash? */
702 static inline int expect_clash(const struct ip_conntrack_expect *a,
703 const struct ip_conntrack_expect *b)
705 /* Part covered by intersection of masks must be unequal,
706 otherwise they clash */
707 struct ip_conntrack_tuple intersect_mask
708 = { { a->mask.src.ip & b->mask.src.ip,
709 { a->mask.src.u.all & b->mask.src.u.all } },
710 { a->mask.dst.ip & b->mask.dst.ip,
711 { a->mask.dst.u.all & b->mask.dst.u.all },
712 a->mask.dst.protonum & b->mask.dst.protonum } };
714 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
/* Exact duplicate: same master and identical tuple and mask. */
717 static inline int expect_matches(const struct ip_conntrack_expect *a,
718 const struct ip_conntrack_expect *b)
720 return a->master == b->master
721 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
722 && ip_ct_tuple_equal(&a->mask, &b->mask);
725 /* Generally a bad idea to call this: could have matched already. */
/* Cancel a previously registered expectation matching exp. */
726 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
728 struct ip_conntrack_expect *i;
730 WRITE_LOCK(&ip_conntrack_lock);
731 /* choose the oldest expectation to evict */
732 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
/* Only the del_timer winner may unlink and free the entry. */
733 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
735 WRITE_UNLOCK(&ip_conntrack_lock);
740 WRITE_UNLOCK(&ip_conntrack_lock);
/* Allocate an expectation from the slab (GFP_ATOMIC: callable from
   packet context). Returns NULL on OOM. */
743 struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
745 struct ip_conntrack_expect *new;
747 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
749 DEBUGP("expect_related: OOM allocating expect\n");
/* Return an unused expectation to the slab. */
756 void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
758 kmem_cache_free(ip_conntrack_expect_cachep, expect);
/* Link an expectation into the global list, pinning the master
   conntrack, and arm its expiry timer if the helper defines one.
   Caller holds ip_conntrack_lock. */
761 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
763 atomic_inc(&exp->master->ct_general.use);
764 exp->master->expecting++;
765 list_add(&exp->list, &ip_conntrack_expect_list);
767 if (exp->master->helper->timeout) {
768 init_timer(&exp->timeout);
769 exp->timeout.data = (unsigned long)exp;
770 exp->timeout.function = expectation_timed_out;
772 = jiffies + exp->master->helper->timeout * HZ;
773 add_timer(&exp->timeout);
/* No helper timeout: mark the timer unused. */
775 exp->timeout.function = NULL;
777 CONNTRACK_STAT_INC(expect_create);
780 /* Race with expectations being used means we could have none to find; OK. */
/* Drop the oldest expectation belonging to this master (list tail is
   oldest). Caller holds ip_conntrack_lock. */
781 static void evict_oldest_expect(struct ip_conntrack *master)
783 struct ip_conntrack_expect *i;
785 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
786 if (i->master == master) {
787 if (del_timer(&i->timeout)) {
/* Re-arm an expectation's timer; returns 0 if it already fired
   (entry is dying and must not be reused). */
796 static inline int refresh_timer(struct ip_conntrack_expect *i)
798 if (!del_timer(&i->timeout))
801 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
802 add_timer(&i->timeout);
/* Register an expectation for a related connection. If an identical
   one exists its timer is refreshed and the new one freed; a clashing
   one is an error; otherwise insert, evicting the oldest if the
   helper's max_expected limit is reached. */
806 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
808 struct ip_conntrack_expect *i;
811 DEBUGP("ip_conntrack_expect_related %p\n", related_to);
812 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
813 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
815 WRITE_LOCK(&ip_conntrack_lock);
816 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
817 if (expect_matches(i, expect)) {
818 /* Refresh timer: if it's dying, ignore.. */
819 if (refresh_timer(i)) {
821 /* We don't need the one they've given us. */
822 ip_conntrack_expect_free(expect);
825 } else if (expect_clash(i, expect)) {
831 /* Will be over limit? */
832 if (expect->master->helper->max_expected &&
833 expect->master->expecting >= expect->master->helper->max_expected)
834 evict_oldest_expect(expect->master);
836 ip_conntrack_expect_insert(expect);
839 WRITE_UNLOCK(&ip_conntrack_lock);
843 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
844 implicitly racy: see __ip_conntrack_confirm */
845 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
846 const struct ip_conntrack_tuple *newreply)
848 WRITE_LOCK(&ip_conntrack_lock);
849 /* Should be unconfirmed, so not in hash table yet */
850 IP_NF_ASSERT(!is_confirmed(conntrack));
852 DEBUGP("Altering reply tuple of %p to ", conntrack);
853 DUMP_TUPLE(newreply);
855 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
/* Only re-pick the helper for plain connections: related ones keep
   the master's, expecting ones keep their current helper. */
856 if (!conntrack->master && conntrack->expecting == 0)
857 conntrack->helper = ip_ct_find_helper(newreply);
858 WRITE_UNLOCK(&ip_conntrack_lock);
/* Register a protocol helper; helpers with a zero timeout are a bug. */
861 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
863 BUG_ON(me->timeout == 0);
864 WRITE_LOCK(&ip_conntrack_lock);
865 list_prepend(&helpers, me);
866 WRITE_UNLOCK(&ip_conntrack_lock);
/* LIST_FIND_W callback: strip this helper from a conntrack if bound. */
871 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
872 const struct ip_conntrack_helper *me)
874 if (tuplehash_to_ctrack(i)->helper == me)
875 tuplehash_to_ctrack(i)->helper = NULL;
/* Unregister a helper: remove it from the list, kill its outstanding
   expectations, and clear it from every live conntrack. */
879 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
882 struct ip_conntrack_expect *exp, *tmp;
884 /* Need write lock here, to delete helper. */
885 WRITE_LOCK(&ip_conntrack_lock);
886 LIST_DELETE(&helpers, me);
888 /* Get rid of expectations */
889 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
890 if (exp->master->helper == me && del_timer(&exp->timeout)) {
895 /* Get rid of expecteds, set helpers to NULL. */
896 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
897 for (i = 0; i < ip_conntrack_htable_size; i++)
898 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
899 struct ip_conntrack_tuple_hash *, me);
900 WRITE_UNLOCK(&ip_conntrack_lock);
902 /* Someone could be still looking at the helper in a bh. */
/* Per-direction accounting (packet and byte counters); compiled out
   unless CONFIG_IP_NF_CT_ACCT is enabled. */
906 static inline void ct_add_counters(struct ip_conntrack *ct,
907 enum ip_conntrack_info ctinfo,
908 const struct sk_buff *skb)
910 #ifdef CONFIG_IP_NF_CT_ACCT
912 ct->counters[CTINFO2DIR(ctinfo)].packets++;
913 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
914 ntohs(skb->nh.iph->tot_len);
919 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
920 void ip_ct_refresh_acct(struct ip_conntrack *ct,
921 enum ip_conntrack_info ctinfo,
922 const struct sk_buff *skb,
923 unsigned long extra_jiffies)
925 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
927 /* If not in hash table, timer will not be active yet */
928 if (!is_confirmed(ct)) {
/* Stored as a relative value; __ip_conntrack_confirm adds jiffies. */
929 ct->timeout.expires = extra_jiffies;
930 ct_add_counters(ct, ctinfo, skb);
932 WRITE_LOCK(&ip_conntrack_lock);
933 /* Need del_timer for race avoidance (may already be dying). */
934 if (del_timer(&ct->timeout)) {
935 ct->timeout.expires = jiffies + extra_jiffies;
936 add_timer(&ct->timeout);
938 ct_add_counters(ct, ctinfo, skb);
939 WRITE_UNLOCK(&ip_conntrack_lock);
943 /* Returns new sk_buff, or NULL */
/* Defragment the skb for user 'user' and fix up the IP checksum.
   The original skb may be consumed by ip_defrag. */
945 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
947 #ifdef CONFIG_NETFILTER_DEBUG
948 unsigned int olddebug = skb->nf_debug;
954 skb = ip_defrag(skb, user);
958 ip_send_check(skb->nh.iph);
959 skb->nfcache |= NFC_ALTERED;
960 #ifdef CONFIG_NETFILTER_DEBUG
961 /* Packet path as if nothing had happened. */
962 skb->nf_debug = olddebug;
969 /* Used by ipt_REJECT. */
/* Attach skb's conntrack to nskb (the generated reject packet),
   flipping the direction info since the reject travels the other way. */
970 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
972 struct ip_conntrack *ct;
973 enum ip_conntrack_info ctinfo;
975 /* This ICMP is in reverse direction to the packet which caused it */
976 ct = ip_conntrack_get(skb, &ctinfo);
978 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
979 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
981 ctinfo = IP_CT_RELATED;
983 /* Attach to new skbuff, and increment count */
984 nskb->nfct = &ct->ct_general;
985 nskb->nfctinfo = ctinfo;
986 nf_conntrack_get(nskb->nfct);
/* Adapter: call the user iterator on the conntrack behind a tuplehash. */
990 do_iter(const struct ip_conntrack_tuple_hash *i,
991 int (*iter)(struct ip_conntrack *i, void *data),
994 return iter(tuplehash_to_ctrack(i), data);
997 /* Bring out ya dead! */
/* Scan hash buckets from *bucket onward (then the unconfirmed list)
   for the next conntrack the iterator selects; returns it with an
   extra reference held, advancing *bucket for resumption. */
998 static struct ip_conntrack_tuple_hash *
999 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1000 void *data, unsigned int *bucket)
1002 struct ip_conntrack_tuple_hash *h = NULL;
1004 WRITE_LOCK(&ip_conntrack_lock);
1005 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1006 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1007 struct ip_conntrack_tuple_hash *, iter, data);
1012 h = LIST_FIND_W(&unconfirmed, do_iter,
1013 struct ip_conntrack_tuple_hash *, iter, data);
/* Pin the match so it survives after we drop the lock. */
1015 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1016 WRITE_UNLOCK(&ip_conntrack_lock);
/* Kill every conntrack the iterator selects: repeatedly fetch the
   next match and run its death path if we win the timer race. */
1022 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1024 struct ip_conntrack_tuple_hash *h;
1025 unsigned int bucket = 0;
1027 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1028 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1029 /* Time to push up daises... */
1030 if (del_timer(&ct->timeout))
1031 death_by_timeout((unsigned long)ct);
1032 /* ... else the timer will get him soon. */
/* Drop the reference get_next_corpse took. */
1034 ip_conntrack_put(ct);
1038 /* Fast function for those who don't want to parse /proc (and I don't
1040 /* Reversing the socket's dst/src point of view gives us the reply
/* SO_ORIGINAL_DST getsockopt handler: look up the conntrack for this
   TCP socket's reply tuple and return the original (pre-NAT)
   destination as a sockaddr_in. TCP only; validates output length. */
1043 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1045 struct inet_sock *inet = inet_sk(sk);
1046 struct ip_conntrack_tuple_hash *h;
1047 struct ip_conntrack_tuple tuple;
1049 IP_CT_TUPLE_U_BLANK(&tuple);
1050 tuple.src.ip = inet->rcv_saddr;
1051 tuple.src.u.tcp.port = inet->sport;
1052 tuple.dst.ip = inet->daddr;
1053 tuple.dst.u.tcp.port = inet->dport;
1054 tuple.dst.protonum = IPPROTO_TCP;
1056 /* We only do TCP at the moment: is there a better way? */
1057 if (strcmp(sk->sk_prot->name, "TCP")) {
1058 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1059 return -ENOPROTOOPT;
1062 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1063 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1064 *len, sizeof(struct sockaddr_in));
1068 h = ip_conntrack_find_get(&tuple, NULL);
1070 struct sockaddr_in sin;
1071 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1073 sin.sin_family = AF_INET;
1074 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1075 .tuple.dst.u.tcp.port;
1076 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1079 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1080 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
/* Release the reference taken by ip_conntrack_find_get. */
1081 ip_conntrack_put(ct);
1082 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1087 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1088 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1089 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
/* Sockopt registration so userspace can call SO_ORIGINAL_DST. */
1093 static struct nf_sockopt_ops so_getorigdst = {
1095 .get_optmin = SO_ORIGINAL_DST,
1096 .get_optmax = SO_ORIGINAL_DST+1,
/* Iterator that selects every conntrack (used at cleanup time). */
1100 static int kill_all(struct ip_conntrack *i, void *data)
/* Free the hash table with the allocator that created it
   (vfree vs free_pages, per ip_conntrack_vmalloc). */
1105 static void free_conntrack_hash(void)
1107 if (ip_conntrack_vmalloc)
1108 vfree(ip_conntrack_hash)
1110 free_pages((unsigned long)ip_conntrack_hash,
1111 get_order(sizeof(struct list_head)
1112 * ip_conntrack_htable_size));
1115 /* Mishearing the voices in his head, our hero wonders how he's
1116 supposed to kill the mall. */
/* Module teardown: detach the REJECT hook, kill every conntrack
   (retrying until the count drains to zero), then destroy caches,
   the hash table and the sockopt registration. */
1117 void ip_conntrack_cleanup(void)
1119 ip_ct_attach = NULL;
1120 /* This makes sure all current packets have passed through
1121 netfilter framework. Roll on, two-stage module
1126 ip_ct_iterate_cleanup(kill_all, NULL);
/* Entries still referenced elsewhere: wait and sweep again. */
1127 if (atomic_read(&ip_conntrack_count) != 0) {
1129 goto i_see_dead_people;
1132 kmem_cache_destroy(ip_conntrack_cachep);
1133 kmem_cache_destroy(ip_conntrack_expect_cachep);
1134 free_conntrack_hash();
1135 nf_unregister_sockopt(&so_getorigdst);
/* Optional module parameter overriding the computed hash size. */
1138 static int hashsize;
1139 module_param(hashsize, int, 0400);
/* Module init: size and allocate the hash table, create slab caches,
   register the sockopt and built-in protocol handlers, and set up
   the untracked placeholder conntrack. */
1141 int __init ip_conntrack_init(void)
1146 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1147 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1149 ip_conntrack_htable_size = hashsize;
1151 ip_conntrack_htable_size
1152 = (((num_physpages << PAGE_SHIFT) / 16384)
1153 / sizeof(struct list_head));
1154 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1155 ip_conntrack_htable_size = 8192;
1156 if (ip_conntrack_htable_size < 16)
1157 ip_conntrack_htable_size = 16;
1159 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1161 printk("ip_conntrack version %s (%u buckets, %d max)"
1162 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1163 ip_conntrack_htable_size, ip_conntrack_max,
1164 sizeof(struct ip_conntrack));
1166 ret = nf_register_sockopt(&so_getorigdst);
1168 printk(KERN_ERR "Unable to register netfilter socket option\n");
1172 /* AK: the hash table is twice as big than needed because it
1173 uses list_head. it would be much nicer to caches to use a
1174 single pointer list head here. */
1175 ip_conntrack_vmalloc = 0;
1177 =(void*)__get_free_pages(GFP_KERNEL,
1178 get_order(sizeof(struct list_head)
1179 *ip_conntrack_htable_size));
/* Page allocation failed (e.g. fragmented memory): fall back. */
1180 if (!ip_conntrack_hash) {
1181 ip_conntrack_vmalloc = 1;
1182 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1183 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1184 * ip_conntrack_htable_size);
1186 if (!ip_conntrack_hash) {
1187 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1188 goto err_unreg_sockopt;
1191 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1192 sizeof(struct ip_conntrack), 0,
1194 if (!ip_conntrack_cachep) {
1195 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1199 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1200 sizeof(struct ip_conntrack_expect),
1202 if (!ip_conntrack_expect_cachep) {
1203 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1204 goto err_free_conntrack_slab;
1207 /* Don't NEED lock here, but good form anyway. */
1208 WRITE_LOCK(&ip_conntrack_lock);
1209 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1210 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1211 /* Sew in builtin protocols. */
1212 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1213 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1214 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1215 WRITE_UNLOCK(&ip_conntrack_lock);
1217 for (i = 0; i < ip_conntrack_htable_size; i++)
1218 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1220 /* For use by ipt_REJECT */
1221 ip_ct_attach = ip_conntrack_attach;
1223 /* Set up fake conntrack:
1224 - to never be deleted, not in any hashes */
1225 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1226 /* - and look it like as a confirmed connection */
1227 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1231 err_free_conntrack_slab:
1232 kmem_cache_destroy(ip_conntrack_cachep);
1234 free_conntrack_hash();
1236 nf_unregister_sockopt(&so_getorigdst);