[linux-2.6.git] net/netfilter/nf_conntrack_core.c
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
7  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14  *      - new API and handling of conntrack/nat helpers
15  *      - now capable of multiple expectations for one master
16  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17  *      - add usage/reference counts to ip_conntrack_expect
18  *      - export ip_conntrack[_expect]_{find_get,put} functions
19  * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20  *      - generalize L3 protocol dependent part.
21  * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22  *      - add support for various sizes of conntrack structures.
23  * 26 Jan 2006: Harald Welte <laforge@netfilter.org>
24  *      - restructure nf_conn (introduce nf_conn_help)
25  *      - redesign 'features' the way they were originally intended
26  * 26 Feb 2006: Pablo Neira Ayuso <pablo@eurodev.net>
27  *      - add support for L3 protocol module load on demand.
28  *
29  * Derived from net/ipv4/netfilter/ip_conntrack_core.c
30  */
31
32 #include <linux/types.h>
33 #include <linux/netfilter.h>
34 #include <linux/module.h>
35 #include <linux/skbuff.h>
36 #include <linux/proc_fs.h>
37 #include <linux/vmalloc.h>
38 #include <linux/stddef.h>
39 #include <linux/slab.h>
40 #include <linux/random.h>
41 #include <linux/jhash.h>
42 #include <linux/err.h>
43 #include <linux/percpu.h>
44 #include <linux/moduleparam.h>
45 #include <linux/notifier.h>
46 #include <linux/kernel.h>
47 #include <linux/netdevice.h>
48 #include <linux/socket.h>
49
50 /* This rwlock protects the main hash table, protocol/helper/expected
51    registrations, and conntrack timers. */
52 #define ASSERT_READ_LOCK(x)
53 #define ASSERT_WRITE_LOCK(x)
54
55 #include <net/netfilter/nf_conntrack.h>
56 #include <net/netfilter/nf_conntrack_l3proto.h>
57 #include <net/netfilter/nf_conntrack_protocol.h>
58 #include <net/netfilter/nf_conntrack_helper.h>
59 #include <net/netfilter/nf_conntrack_core.h>
60 #include <linux/netfilter_ipv4/listhelp.h>
61
62 #define NF_CONNTRACK_VERSION    "0.5.0"
63
64 #if 0
65 #define DEBUGP printk
66 #else
67 #define DEBUGP(format, args...)
68 #endif
69
70 DEFINE_RWLOCK(nf_conntrack_lock);
71
72 /* nf_conntrack_standalone needs this */
73 atomic_t nf_conntrack_count = ATOMIC_INIT(0);
74
75 void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
76 LIST_HEAD(nf_conntrack_expect_list);
77 struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
78 struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
79 static LIST_HEAD(helpers);
80 unsigned int nf_conntrack_htable_size = 0;
81 int nf_conntrack_max;
82 struct list_head *nf_conntrack_hash;
83 static kmem_cache_t *nf_conntrack_expect_cachep;
84 struct nf_conn nf_conntrack_untracked;
85 unsigned int nf_ct_log_invalid;
86 static LIST_HEAD(unconfirmed);
87 static int nf_conntrack_vmalloc;
88
89 static unsigned int nf_conntrack_next_id;
90 static unsigned int nf_conntrack_expect_next_id;
91 #ifdef CONFIG_NF_CONNTRACK_EVENTS
92 ATOMIC_NOTIFIER_HEAD(nf_conntrack_chain);
93 ATOMIC_NOTIFIER_HEAD(nf_conntrack_expect_chain);
94
95 DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
96
97 /* deliver cached events and clear cache entry - must be called with locally
98  * disabled softirqs */
99 static inline void
100 __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
101 {
102         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
103         if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
104             && ecache->events)
105                 atomic_notifier_call_chain(&nf_conntrack_chain, ecache->events,
106                                     ecache->ct);
107
108         ecache->events = 0;
109         nf_ct_put(ecache->ct);
110         ecache->ct = NULL;
111 }
112
113 /* Deliver all cached events for a particular conntrack. This is called
114  * by code prior to async packet handling for freeing the skb */
115 void nf_ct_deliver_cached_events(const struct nf_conn *ct)
116 {
117         struct nf_conntrack_ecache *ecache;
118
119         local_bh_disable();
120         ecache = &__get_cpu_var(nf_conntrack_ecache);
121         if (ecache->ct == ct)
122                 __nf_ct_deliver_cached_events(ecache);
123         local_bh_enable();
124 }
125
126 /* Deliver any cached events still pending for an old conntrack, then initialize the cache for the current one */
127 void __nf_ct_event_cache_init(struct nf_conn *ct)
128 {
129         struct nf_conntrack_ecache *ecache;
130         
131         /* take care of delivering potentially old events */
132         ecache = &__get_cpu_var(nf_conntrack_ecache);
133         BUG_ON(ecache->ct == ct);
134         if (ecache->ct)
135                 __nf_ct_deliver_cached_events(ecache);
136         /* initialize for this conntrack/packet */
137         ecache->ct = ct;
138         nf_conntrack_get(&ct->ct_general);
139 }
140
141 /* flush the event cache - touches other CPUs' data and must not be called
142  * while packets are still passing through the code */
143 static void nf_ct_event_cache_flush(void)
144 {
145         struct nf_conntrack_ecache *ecache;
146         int cpu;
147
148         for_each_possible_cpu(cpu) {
149                 ecache = &per_cpu(nf_conntrack_ecache, cpu);
150                 if (ecache->ct)
151                         nf_ct_put(ecache->ct);
152         }
153 }
154 #else
155 static inline void nf_ct_event_cache_flush(void) {}
156 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
157
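When CONFIG_NF_CONNTRACK_EVENTS is set, other kernel code can subscribe to the two notifier chains declared above and receive the cached events that __nf_ct_deliver_cached_events() fires. A minimal sketch of a listener follows; the callback name and the printk are illustrative assumptions, not part of this file (ctnetlink is the real in-tree consumer).

static int example_ct_event(struct notifier_block *nb,
                            unsigned long events, void *ptr)
{
        struct nf_conn *ct = ptr;             /* conntrack the events refer to */

        if (events & IPCT_NEW)
                printk(KERN_DEBUG "example: new conntrack %p\n", ct);
        return NOTIFY_DONE;
}

static struct notifier_block example_ct_nb = {
        .notifier_call = example_ct_event,
};

/* register/unregister from module init/exit */
atomic_notifier_chain_register(&nf_conntrack_chain, &example_ct_nb);
atomic_notifier_chain_unregister(&nf_conntrack_chain, &example_ct_nb);
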
158 DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
159 EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
160
161 /*
162  * This scheme offers various sizes of "struct nf_conn" depending on
163  * the features (helper, nat, ...)
164  */
165
166 #define NF_CT_FEATURES_NAMELEN  256
167 static struct {
168         /* name of the slab cache, printed in /proc/slabinfo */
169         char *name;
170
171         /* size of slab cache */
172         size_t size;
173
174         /* slab cache pointer */
175         kmem_cache_t *cachep;
176
177         /* allocated slab cache + modules which use this slab cache */
178         int use;
179
180 } nf_ct_cache[NF_CT_F_NUM];
181
182 /* protect members of nf_ct_cache except "use" */
183 DEFINE_RWLOCK(nf_ct_cache_lock);
184
185 /* This avoids calling kmem_cache_create() with the same name simultaneously */
186 static DEFINE_MUTEX(nf_ct_cache_mutex);
187
188 extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
189 struct nf_conntrack_protocol *
190 __nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
191 {
192         if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
193                 return &nf_conntrack_generic_protocol;
194
195         return nf_ct_protos[l3proto][protocol];
196 }
197
198 /* this is guaranteed to always return a valid protocol helper, since
199  * it falls back to generic_protocol */
200 struct nf_conntrack_protocol *
201 nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
202 {
203         struct nf_conntrack_protocol *p;
204
205         preempt_disable();
206         p = __nf_ct_proto_find(l3proto, protocol);
207         if (!try_module_get(p->me))
208                 p = &nf_conntrack_generic_protocol;
209         preempt_enable();
210         
211         return p;
212 }
213
214 void nf_ct_proto_put(struct nf_conntrack_protocol *p)
215 {
216         module_put(p->me);
217 }
218
219 struct nf_conntrack_l3proto *
220 nf_ct_l3proto_find_get(u_int16_t l3proto)
221 {
222         struct nf_conntrack_l3proto *p;
223
224         preempt_disable();
225         p = __nf_ct_l3proto_find(l3proto);
226         if (!try_module_get(p->me))
227                 p = &nf_conntrack_generic_l3proto;
228         preempt_enable();
229
230         return p;
231 }
232
233 void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
234 {
235         module_put(p->me);
236 }
237
238 int
239 nf_ct_l3proto_try_module_get(unsigned short l3proto)
240 {
241         int ret;
242         struct nf_conntrack_l3proto *p;
243
244 retry:  p = nf_ct_l3proto_find_get(l3proto);
245         if (p == &nf_conntrack_generic_l3proto) {
246                 ret = request_module("nf_conntrack-%d", l3proto);
247                 if (!ret)
248                         goto retry;
249
250                 return -EPROTOTYPE;
251         }
252
253         return 0;
254 }
255
256 void nf_ct_l3proto_module_put(unsigned short l3proto)
257 {
258         struct nf_conntrack_l3proto *p;
259
260         preempt_disable();
261         p = __nf_ct_l3proto_find(l3proto);
262         preempt_enable();
263
264         module_put(p->me);
265 }
266
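A caller that needs an L3 protocol tracker pinned in memory (ctnetlink does this when parsing user-supplied tuples) pairs the try_module_get and module_put helpers above; a minimal sketch, with AF_INET chosen purely for illustration:

int err;

err = nf_ct_l3proto_try_module_get(AF_INET);  /* may request_module("nf_conntrack-2") */
if (err < 0)
        return err;

/* ... safe to use __nf_ct_l3proto_find(AF_INET) here ... */

nf_ct_l3proto_module_put(AF_INET);            /* drop the module reference */
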
267 static int nf_conntrack_hash_rnd_initted;
268 static unsigned int nf_conntrack_hash_rnd;
269
270 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
271                                   unsigned int size, unsigned int rnd)
272 {
273         unsigned int a, b;
274         a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
275                   ((tuple->src.l3num) << 16) | tuple->dst.protonum);
276         b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
277                         (tuple->src.u.all << 16) | tuple->dst.u.all);
278
279         return jhash_2words(a, b, rnd) % size;
280 }
281
282 static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
283 {
284         return __hash_conntrack(tuple, nf_conntrack_htable_size,
285                                 nf_conntrack_hash_rnd);
286 }
287
288 int nf_conntrack_register_cache(u_int32_t features, const char *name,
289                                 size_t size)
290 {
291         int ret = 0;
292         char *cache_name;
293         kmem_cache_t *cachep;
294
295         DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
296                features, name, size);
297
298         if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
299                 DEBUGP("nf_conntrack_register_cache: invalid features: 0x%x\n",
300                         features);
301                 return -EINVAL;
302         }
303
304         mutex_lock(&nf_ct_cache_mutex);
305
306         write_lock_bh(&nf_ct_cache_lock);
307         /* e.g.: multiple helpers are loaded */
308         if (nf_ct_cache[features].use > 0) {
309                 DEBUGP("nf_conntrack_register_cache: already registered.\n");
310                 if ((!strncmp(nf_ct_cache[features].name, name,
311                               NF_CT_FEATURES_NAMELEN))
312                     && nf_ct_cache[features].size == size) {
313                         DEBUGP("nf_conntrack_register_cache: reusing.\n");
314                         nf_ct_cache[features].use++;
315                         ret = 0;
316                 } else
317                         ret = -EBUSY;
318
319                 write_unlock_bh(&nf_ct_cache_lock);
320                 mutex_unlock(&nf_ct_cache_mutex);
321                 return ret;
322         }
323         write_unlock_bh(&nf_ct_cache_lock);
324
325         /*
326          * The memory holding the slab cache name must stay valid until
327          * the cache is destroyed.
328          */
329         cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
330         if (cache_name == NULL) {
331                 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
332                 ret = -ENOMEM;
333                 goto out_up_mutex;
334         }
335
336         if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
337                                                 >= NF_CT_FEATURES_NAMELEN) {
338                 printk("nf_conntrack_register_cache: name too long\n");
339                 ret = -EINVAL;
340                 goto out_free_name;
341         }
342
343         cachep = kmem_cache_create(cache_name, size, 0, 0,
344                                    NULL, NULL);
345         if (!cachep) {
346                 printk("nf_conntrack_register_cache: Can't create slab cache "
347                        "for the features = 0x%x\n", features);
348                 ret = -ENOMEM;
349                 goto out_free_name;
350         }
351
352         write_lock_bh(&nf_ct_cache_lock);
353         nf_ct_cache[features].use = 1;
354         nf_ct_cache[features].size = size;
355         nf_ct_cache[features].cachep = cachep;
356         nf_ct_cache[features].name = cache_name;
357         write_unlock_bh(&nf_ct_cache_lock);
358
359         goto out_up_mutex;
360
361 out_free_name:
362         kfree(cache_name);
363 out_up_mutex:
364         mutex_unlock(&nf_ct_cache_mutex);
365         return ret;
366 }
367
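As a concrete illustration of the register/unregister pairing, the helper infrastructure later in this file registers its enlarged conntrack cache like this (condensed from nf_conntrack_helper_register(); error handling omitted):

ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
                                  sizeof(struct nf_conn)
                                  + sizeof(struct nf_conn_help)
                                  + __alignof__(struct nf_conn_help));

/* ... and the teardown path releases it again: */
nf_conntrack_unregister_cache(NF_CT_F_HELP);
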
368 /* FIXME: Currently, only nf_conntrack_cleanup() can call this function. */
369 void nf_conntrack_unregister_cache(u_int32_t features)
370 {
371         kmem_cache_t *cachep;
372         char *name;
373
374         /*
375          * This ensures that kmem_cache_create() isn't called while the
376          * slab cache is being destroyed.
377          */
378         DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
379         mutex_lock(&nf_ct_cache_mutex);
380
381         write_lock_bh(&nf_ct_cache_lock);
382         if (--nf_ct_cache[features].use > 0) {
383                 write_unlock_bh(&nf_ct_cache_lock);
384                 mutex_unlock(&nf_ct_cache_mutex);
385                 return;
386         }
387         cachep = nf_ct_cache[features].cachep;
388         name = nf_ct_cache[features].name;
389         nf_ct_cache[features].cachep = NULL;
390         nf_ct_cache[features].name = NULL;
391         nf_ct_cache[features].size = 0;
392         write_unlock_bh(&nf_ct_cache_lock);
393
394         synchronize_net();
395
396         kmem_cache_destroy(cachep);
397         kfree(name);
398
399         mutex_unlock(&nf_ct_cache_mutex);
400 }
401
402 int
403 nf_ct_get_tuple(const struct sk_buff *skb,
404                 unsigned int nhoff,
405                 unsigned int dataoff,
406                 u_int16_t l3num,
407                 u_int8_t protonum,
408                 struct nf_conntrack_tuple *tuple,
409                 const struct nf_conntrack_l3proto *l3proto,
410                 const struct nf_conntrack_protocol *protocol)
411 {
412         NF_CT_TUPLE_U_BLANK(tuple);
413
414         tuple->src.l3num = l3num;
415         if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
416                 return 0;
417
418         tuple->dst.protonum = protonum;
419         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
420
421         return protocol->pkt_to_tuple(skb, dataoff, tuple);
422 }
423
424 int
425 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
426                    const struct nf_conntrack_tuple *orig,
427                    const struct nf_conntrack_l3proto *l3proto,
428                    const struct nf_conntrack_protocol *protocol)
429 {
430         NF_CT_TUPLE_U_BLANK(inverse);
431
432         inverse->src.l3num = orig->src.l3num;
433         if (l3proto->invert_tuple(inverse, orig) == 0)
434                 return 0;
435
436         inverse->dst.dir = !orig->dst.dir;
437
438         inverse->dst.protonum = orig->dst.protonum;
439         return protocol->invert_tuple(inverse, orig);
440 }
441
442 /* nf_conntrack_expect helper functions */
443 void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
444 {
445         struct nf_conn_help *master_help = nfct_help(exp->master);
446
447         NF_CT_ASSERT(master_help);
448         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
449         NF_CT_ASSERT(!timer_pending(&exp->timeout));
450
451         list_del(&exp->list);
452         NF_CT_STAT_INC(expect_delete);
453         master_help->expecting--;
454         nf_conntrack_expect_put(exp);
455 }
456
457 static void expectation_timed_out(unsigned long ul_expect)
458 {
459         struct nf_conntrack_expect *exp = (void *)ul_expect;
460
461         write_lock_bh(&nf_conntrack_lock);
462         nf_ct_unlink_expect(exp);
463         write_unlock_bh(&nf_conntrack_lock);
464         nf_conntrack_expect_put(exp);
465 }
466
467 struct nf_conntrack_expect *
468 __nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
469 {
470         struct nf_conntrack_expect *i;
471         
472         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
473                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
474                         atomic_inc(&i->use);
475                         return i;
476                 }
477         }
478         return NULL;
479 }
480
481 /* Just find an expectation corresponding to a tuple. */
482 struct nf_conntrack_expect *
483 nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
484 {
485         struct nf_conntrack_expect *i;
486         
487         read_lock_bh(&nf_conntrack_lock);
488         i = __nf_conntrack_expect_find(tuple);
489         read_unlock_bh(&nf_conntrack_lock);
490
491         return i;
492 }
493
494 /* If an expectation for this connection is found, it is deleted from
495  * the global list and returned. */
496 static struct nf_conntrack_expect *
497 find_expectation(const struct nf_conntrack_tuple *tuple)
498 {
499         struct nf_conntrack_expect *i;
500
501         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
502         /* If master is not in hash table yet (ie. packet hasn't left
503            this machine yet), how can other end know about expected?
504            Hence these are not the droids you are looking for (if
505            master ct never got confirmed, we'd hold a reference to it
506            and weird things would happen to future packets). */
507                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
508                     && nf_ct_is_confirmed(i->master)) {
509                         if (i->flags & NF_CT_EXPECT_PERMANENT) {
510                                 atomic_inc(&i->use);
511                                 return i;
512                         } else if (del_timer(&i->timeout)) {
513                                 nf_ct_unlink_expect(i);
514                                 return i;
515                         }
516                 }
517         }
518         return NULL;
519 }
520
521 /* delete all expectations for this conntrack */
522 void nf_ct_remove_expectations(struct nf_conn *ct)
523 {
524         struct nf_conntrack_expect *i, *tmp;
525         struct nf_conn_help *help = nfct_help(ct);
526
527         /* Optimization: most connections never expect any others. */
528         if (!help || help->expecting == 0)
529                 return;
530
531         list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
532                 if (i->master == ct && del_timer(&i->timeout)) {
533                         nf_ct_unlink_expect(i);
534                         nf_conntrack_expect_put(i);
535                 }
536         }
537 }
538
539 static void
540 clean_from_lists(struct nf_conn *ct)
541 {
542         unsigned int ho, hr;
543         
544         DEBUGP("clean_from_lists(%p)\n", ct);
545         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
546
547         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
548         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
549         LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
550         LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
551
552         /* Destroy all pending expectations */
553         nf_ct_remove_expectations(ct);
554 }
555
556 static void
557 destroy_conntrack(struct nf_conntrack *nfct)
558 {
559         struct nf_conn *ct = (struct nf_conn *)nfct;
560         struct nf_conntrack_l3proto *l3proto;
561         struct nf_conntrack_protocol *proto;
562
563         DEBUGP("destroy_conntrack(%p)\n", ct);
564         NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
565         NF_CT_ASSERT(!timer_pending(&ct->timeout));
566
567         nf_conntrack_event(IPCT_DESTROY, ct);
568         set_bit(IPS_DYING_BIT, &ct->status);
569
570         /* To make sure we don't get any weird locking issues here:
571          * destroy_conntrack() MUST NOT be called with a write lock
572          * to nf_conntrack_lock!!! -HW */
573         l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
574         if (l3proto && l3proto->destroy)
575                 l3proto->destroy(ct);
576
577         proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
578         if (proto && proto->destroy)
579                 proto->destroy(ct);
580
581         if (nf_conntrack_destroyed)
582                 nf_conntrack_destroyed(ct);
583
584         write_lock_bh(&nf_conntrack_lock);
585         /* Expectations will have been removed in clean_from_lists,
586          * except TFTP can create an expectation on the first packet,
587          * before connection is in the list, so we need to clean here,
588          * too. */
589         nf_ct_remove_expectations(ct);
590
591         /* We overload first tuple to link into unconfirmed list. */
592         if (!nf_ct_is_confirmed(ct)) {
593                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
594                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
595         }
596
597         NF_CT_STAT_INC(delete);
598         write_unlock_bh(&nf_conntrack_lock);
599
600         if (ct->master)
601                 nf_ct_put(ct->master);
602
603         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
604         nf_conntrack_free(ct);
605 }
606
607 static void death_by_timeout(unsigned long ul_conntrack)
608 {
609         struct nf_conn *ct = (void *)ul_conntrack;
610
611         write_lock_bh(&nf_conntrack_lock);
612         /* Inside lock so preempt is disabled on module removal path.
613          * Otherwise we can get spurious warnings. */
614         NF_CT_STAT_INC(delete_list);
615         clean_from_lists(ct);
616         write_unlock_bh(&nf_conntrack_lock);
617         nf_ct_put(ct);
618 }
619
620 static inline int
621 conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
622                     const struct nf_conntrack_tuple *tuple,
623                     const struct nf_conn *ignored_conntrack)
624 {
625         ASSERT_READ_LOCK(&nf_conntrack_lock);
626         return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
627                 && nf_ct_tuple_equal(tuple, &i->tuple);
628 }
629
630 struct nf_conntrack_tuple_hash *
631 __nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
632                     const struct nf_conn *ignored_conntrack)
633 {
634         struct nf_conntrack_tuple_hash *h;
635         unsigned int hash = hash_conntrack(tuple);
636
637         ASSERT_READ_LOCK(&nf_conntrack_lock);
638         list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
639                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
640                         NF_CT_STAT_INC(found);
641                         return h;
642                 }
643                 NF_CT_STAT_INC(searched);
644         }
645
646         return NULL;
647 }
648
649 /* Find a connection corresponding to a tuple. */
650 struct nf_conntrack_tuple_hash *
651 nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
652                       const struct nf_conn *ignored_conntrack)
653 {
654         struct nf_conntrack_tuple_hash *h;
655
656         read_lock_bh(&nf_conntrack_lock);
657         h = __nf_conntrack_find(tuple, ignored_conntrack);
658         if (h)
659                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
660         read_unlock_bh(&nf_conntrack_lock);
661
662         return h;
663 }
664
665 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
666                                        unsigned int hash,
667                                        unsigned int repl_hash) 
668 {
669         ct->id = ++nf_conntrack_next_id;
670         list_prepend(&nf_conntrack_hash[hash],
671                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
672         list_prepend(&nf_conntrack_hash[repl_hash],
673                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
674 }
675
676 void nf_conntrack_hash_insert(struct nf_conn *ct)
677 {
678         unsigned int hash, repl_hash;
679
680         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
681         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
682
683         write_lock_bh(&nf_conntrack_lock);
684         __nf_conntrack_hash_insert(ct, hash, repl_hash);
685         write_unlock_bh(&nf_conntrack_lock);
686 }
687
688 /* Confirm a connection given skb; places it in hash table */
689 int
690 __nf_conntrack_confirm(struct sk_buff **pskb)
691 {
692         unsigned int hash, repl_hash;
693         struct nf_conn *ct;
694         enum ip_conntrack_info ctinfo;
695
696         ct = nf_ct_get(*pskb, &ctinfo);
697
698         /* ipt_REJECT uses nf_conntrack_attach to attach related
699            ICMP/TCP RST packets in other direction.  Actual packet
700            which created connection will be IP_CT_NEW or for an
701            expected connection, IP_CT_RELATED. */
702         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
703                 return NF_ACCEPT;
704
705         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
706         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
707
708         /* We're not in hash table, and we refuse to set up related
709            connections for unconfirmed conns.  But packet copies and
710            REJECT will give spurious warnings here. */
711         /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
712
713         /* No external references means no one else could have
714            confirmed us. */
715         NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
716         DEBUGP("Confirming conntrack %p\n", ct);
717
718         write_lock_bh(&nf_conntrack_lock);
719
720         /* See if there's one in the list already, including reverse:
721            NAT could have grabbed it without realizing, since we're
722            not in the hash.  If there is, we lost race. */
723         if (!LIST_FIND(&nf_conntrack_hash[hash],
724                        conntrack_tuple_cmp,
725                        struct nf_conntrack_tuple_hash *,
726                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
727             && !LIST_FIND(&nf_conntrack_hash[repl_hash],
728                           conntrack_tuple_cmp,
729                           struct nf_conntrack_tuple_hash *,
730                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
731                 struct nf_conn_help *help;
732                 /* Remove from unconfirmed list */
733                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
734
735                 __nf_conntrack_hash_insert(ct, hash, repl_hash);
736                 /* Timer relative to confirmation time, not original
737                    setting time, otherwise we'd get timer wrap in
738                    weird delay cases. */
739                 ct->timeout.expires += jiffies;
740                 add_timer(&ct->timeout);
741                 atomic_inc(&ct->ct_general.use);
742                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
743                 NF_CT_STAT_INC(insert);
744                 write_unlock_bh(&nf_conntrack_lock);
745                 help = nfct_help(ct);
746                 if (help && help->helper)
747                         nf_conntrack_event_cache(IPCT_HELPER, *pskb);
748 #ifdef CONFIG_NF_NAT_NEEDED
749                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
750                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
751                         nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
752 #endif
753                 nf_conntrack_event_cache(master_ct(ct) ?
754                                          IPCT_RELATED : IPCT_NEW, *pskb);
755                 return NF_ACCEPT;
756         }
757
758         NF_CT_STAT_INC(insert_failed);
759         write_unlock_bh(&nf_conntrack_lock);
760         return NF_DROP;
761 }
762
763 /* Returns true if a connection corresponds to the tuple (required
764    for NAT). */
765 int
766 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
767                          const struct nf_conn *ignored_conntrack)
768 {
769         struct nf_conntrack_tuple_hash *h;
770
771         read_lock_bh(&nf_conntrack_lock);
772         h = __nf_conntrack_find(tuple, ignored_conntrack);
773         read_unlock_bh(&nf_conntrack_lock);
774
775         return h != NULL;
776 }
777
778 /* There's a small race here where we may free a just-assured
779    connection.  Too bad: we're in trouble anyway. */
780 static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
781 {
782         return !(test_bit(IPS_ASSURED_BIT,
783                           &nf_ct_tuplehash_to_ctrack(i)->status));
784 }
785
786 static int early_drop(struct list_head *chain)
787 {
788         /* Traverse backwards: gives us oldest, which is roughly LRU */
789         struct nf_conntrack_tuple_hash *h;
790         struct nf_conn *ct = NULL;
791         int dropped = 0;
792
793         read_lock_bh(&nf_conntrack_lock);
794         h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
795         if (h) {
796                 ct = nf_ct_tuplehash_to_ctrack(h);
797                 atomic_inc(&ct->ct_general.use);
798         }
799         read_unlock_bh(&nf_conntrack_lock);
800
801         if (!ct)
802                 return dropped;
803
804         if (del_timer(&ct->timeout)) {
805                 death_by_timeout((unsigned long)ct);
806                 dropped = 1;
807                 NF_CT_STAT_INC(early_drop);
808         }
809         nf_ct_put(ct);
810         return dropped;
811 }
812
813 static inline int helper_cmp(const struct nf_conntrack_helper *i,
814                              const struct nf_conntrack_tuple *rtuple)
815 {
816         return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
817 }
818
819 static struct nf_conntrack_helper *
820 __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
821 {
822         return LIST_FIND(&helpers, helper_cmp,
823                          struct nf_conntrack_helper *,
824                          tuple);
825 }
826
827 struct nf_conntrack_helper *
828 nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple)
829 {
830         struct nf_conntrack_helper *helper;
831
832         /* need nf_conntrack_lock to assure that helper exists until
833          * try_module_get() is called */
834         read_lock_bh(&nf_conntrack_lock);
835
836         helper = __nf_ct_helper_find(tuple);
837         if (helper) {
838                 /* need to increase module usage count to assure helper will
839                  * not go away while the caller is e.g. busy putting a
840                  * conntrack in the hash that uses the helper */
841                 if (!try_module_get(helper->me))
842                         helper = NULL;
843         }
844
845         read_unlock_bh(&nf_conntrack_lock);
846
847         return helper;
848 }
849
850 void nf_ct_helper_put(struct nf_conntrack_helper *helper)
851 {
852         module_put(helper->me);
853 }
854
855 static struct nf_conn *
856 __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
857                      const struct nf_conntrack_tuple *repl,
858                      const struct nf_conntrack_l3proto *l3proto)
859 {
860         struct nf_conn *conntrack = NULL;
861         u_int32_t features = 0;
862         struct nf_conntrack_helper *helper;
863
864         if (unlikely(!nf_conntrack_hash_rnd_initted)) {
865                 get_random_bytes(&nf_conntrack_hash_rnd, 4);
866                 nf_conntrack_hash_rnd_initted = 1;
867         }
868
869         if (nf_conntrack_max
870             && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
871                 unsigned int hash = hash_conntrack(orig);
872                 /* Try dropping from this hash chain. */
873                 if (!early_drop(&nf_conntrack_hash[hash])) {
874                         if (net_ratelimit())
875                                 printk(KERN_WARNING
876                                        "nf_conntrack: table full, dropping"
877                                        " packet.\n");
878                         return ERR_PTR(-ENOMEM);
879                 }
880         }
881
882         /*  find features needed by this conntrack. */
883         features = l3proto->get_features(orig);
884
885         /* FIXME: protect helper list per RCU */
886         read_lock_bh(&nf_conntrack_lock);
887         helper = __nf_ct_helper_find(repl);
888         if (helper)
889                 features |= NF_CT_F_HELP;
890         read_unlock_bh(&nf_conntrack_lock);
891
892         DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
893
894         read_lock_bh(&nf_ct_cache_lock);
895
896         if (unlikely(!nf_ct_cache[features].use)) {
897                 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
898                         features);
899                 goto out;
900         }
901
902         conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
903         if (conntrack == NULL) {
904                 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
905                 goto out;
906         }
907
908         memset(conntrack, 0, nf_ct_cache[features].size);
909         conntrack->features = features;
910         if (helper) {
911                 struct nf_conn_help *help = nfct_help(conntrack);
912                 NF_CT_ASSERT(help);
913                 help->helper = helper;
914         }
915
916         atomic_set(&conntrack->ct_general.use, 1);
917         conntrack->ct_general.destroy = destroy_conntrack;
918         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
919         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
920         /* Don't set timer yet: wait for confirmation */
921         init_timer(&conntrack->timeout);
922         conntrack->timeout.data = (unsigned long)conntrack;
923         conntrack->timeout.function = death_by_timeout;
924
925         atomic_inc(&nf_conntrack_count);
926 out:
927         read_unlock_bh(&nf_ct_cache_lock);
928         return conntrack;
929 }
930
931 struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
932                                    const struct nf_conntrack_tuple *repl)
933 {
934         struct nf_conntrack_l3proto *l3proto;
935
936         l3proto = __nf_ct_l3proto_find(orig->src.l3num);
937         return __nf_conntrack_alloc(orig, repl, l3proto);
938 }
939
940 void nf_conntrack_free(struct nf_conn *conntrack)
941 {
942         u_int32_t features = conntrack->features;
943         NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
944         DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
945                conntrack);
946         kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
947         atomic_dec(&nf_conntrack_count);
948 }
949
950 /* Allocate a new conntrack: we return -ENOMEM if classification
951    failed due to stress.  Otherwise it really is unclassifiable. */
952 static struct nf_conntrack_tuple_hash *
953 init_conntrack(const struct nf_conntrack_tuple *tuple,
954                struct nf_conntrack_l3proto *l3proto,
955                struct nf_conntrack_protocol *protocol,
956                struct sk_buff *skb,
957                unsigned int dataoff)
958 {
959         struct nf_conn *conntrack;
960         struct nf_conntrack_tuple repl_tuple;
961         struct nf_conntrack_expect *exp;
962
963         if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
964                 DEBUGP("Can't invert tuple.\n");
965                 return NULL;
966         }
967
968         conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
969         if (conntrack == NULL || IS_ERR(conntrack)) {
970                 DEBUGP("Can't allocate conntrack.\n");
971                 return (struct nf_conntrack_tuple_hash *)conntrack;
972         }
973
974         if (!protocol->new(conntrack, skb, dataoff)) {
975                 nf_conntrack_free(conntrack);
976                 DEBUGP("init conntrack: can't track with proto module\n");
977                 return NULL;
978         }
979
980         write_lock_bh(&nf_conntrack_lock);
981         exp = find_expectation(tuple);
982
983         if (exp) {
984                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
985                         conntrack, exp);
986                 /* Welcome, Mr. Bond.  We've been expecting you... */
987                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
988                 conntrack->master = exp->master;
989 #ifdef CONFIG_NF_CONNTRACK_MARK
990                 conntrack->mark = exp->master->mark;
991 #endif
992 #ifdef CONFIG_NF_CONNTRACK_SECMARK
993                 conntrack->secmark = exp->master->secmark;
994 #endif
995                 nf_conntrack_get(&conntrack->master->ct_general);
996                 NF_CT_STAT_INC(expect_new);
997         } else
998                 NF_CT_STAT_INC(new);
999
1000         /* Overload tuple linked list to put us in unconfirmed list. */
1001         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
1002
1003         write_unlock_bh(&nf_conntrack_lock);
1004
1005         if (exp) {
1006                 if (exp->expectfn)
1007                         exp->expectfn(conntrack, exp);
1008                 nf_conntrack_expect_put(exp);
1009         }
1010
1011         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
1012 }
1013
1014 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1015 static inline struct nf_conn *
1016 resolve_normal_ct(struct sk_buff *skb,
1017                   unsigned int dataoff,
1018                   u_int16_t l3num,
1019                   u_int8_t protonum,
1020                   struct nf_conntrack_l3proto *l3proto,
1021                   struct nf_conntrack_protocol *proto,
1022                   int *set_reply,
1023                   enum ip_conntrack_info *ctinfo)
1024 {
1025         struct nf_conntrack_tuple tuple;
1026         struct nf_conntrack_tuple_hash *h;
1027         struct nf_conn *ct;
1028
1029         if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
1030                              dataoff, l3num, protonum, &tuple, l3proto,
1031                              proto)) {
1032                 DEBUGP("resolve_normal_ct: Can't get tuple\n");
1033                 return NULL;
1034         }
1035
1036         /* look for tuple match */
1037         h = nf_conntrack_find_get(&tuple, NULL);
1038         if (!h) {
1039                 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
1040                 if (!h)
1041                         return NULL;
1042                 if (IS_ERR(h))
1043                         return (void *)h;
1044         }
1045         ct = nf_ct_tuplehash_to_ctrack(h);
1046
1047         /* It exists; we have (non-exclusive) reference. */
1048         if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1049                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1050                 /* Please set the reply bit if this packet is OK */
1051                 *set_reply = 1;
1052         } else {
1053                 /* Once we've had two way comms, always ESTABLISHED. */
1054                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1055                         DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
1056                         *ctinfo = IP_CT_ESTABLISHED;
1057                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1058                         DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
1059                         *ctinfo = IP_CT_RELATED;
1060                 } else {
1061                         DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
1062                         *ctinfo = IP_CT_NEW;
1063                 }
1064                 *set_reply = 0;
1065         }
1066         skb->nfct = &ct->ct_general;
1067         skb->nfctinfo = *ctinfo;
1068         return ct;
1069 }
1070
1071 unsigned int
1072 nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
1073 {
1074         struct nf_conn *ct;
1075         enum ip_conntrack_info ctinfo;
1076         struct nf_conntrack_l3proto *l3proto;
1077         struct nf_conntrack_protocol *proto;
1078         unsigned int dataoff;
1079         u_int8_t protonum;
1080         int set_reply = 0;
1081         int ret;
1082
1083         /* Previously seen (loopback or untracked)?  Ignore. */
1084         if ((*pskb)->nfct) {
1085                 NF_CT_STAT_INC(ignore);
1086                 return NF_ACCEPT;
1087         }
1088
1089         l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
1090         if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
1091                 DEBUGP("not prepared to track yet or error occurred\n");
1092                 return -ret;
1093         }
1094
1095         proto = __nf_ct_proto_find((u_int16_t)pf, protonum);
1096
1097         /* It may be a special packet: error, unclean, ...
1098          * The inverse of the return code tells the netfilter
1099          * core what to do with the packet. */
1100         if (proto->error != NULL &&
1101             (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
1102                 NF_CT_STAT_INC(error);
1103                 NF_CT_STAT_INC(invalid);
1104                 return -ret;
1105         }
1106
1107         ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
1108                                &set_reply, &ctinfo);
1109         if (!ct) {
1110                 /* Not valid part of a connection */
1111                 NF_CT_STAT_INC(invalid);
1112                 return NF_ACCEPT;
1113         }
1114
1115         if (IS_ERR(ct)) {
1116                 /* Too stressed to deal. */
1117                 NF_CT_STAT_INC(drop);
1118                 return NF_DROP;
1119         }
1120
1121         NF_CT_ASSERT((*pskb)->nfct);
1122
1123         ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
1124         if (ret < 0) {
1125                 /* Invalid: inverse of the return code tells
1126                  * the netfilter core what to do */
1127                 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
1128                 nf_conntrack_put((*pskb)->nfct);
1129                 (*pskb)->nfct = NULL;
1130                 NF_CT_STAT_INC(invalid);
1131                 return -ret;
1132         }
1133
1134         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1135                 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
1136
1137         return ret;
1138 }
1139
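nf_conntrack_in() is not a netfilter hook function itself; each L3 protocol module wraps it and registers the wrapper. A minimal sketch modelled on the IPv4 glue module (the names example_conntrack_in/example_conntrack_ops are illustrative assumptions):

static unsigned int example_conntrack_in(unsigned int hooknum,
                                         struct sk_buff **pskb,
                                         const struct net_device *in,
                                         const struct net_device *out,
                                         int (*okfn)(struct sk_buff *))
{
        return nf_conntrack_in(PF_INET, hooknum, pskb);
}

static struct nf_hook_ops example_conntrack_ops = {
        .hook     = example_conntrack_in,
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_IP_PRE_ROUTING,
        .priority = NF_IP_PRI_CONNTRACK,
};
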
1140 int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1141                          const struct nf_conntrack_tuple *orig)
1142 {
1143         return nf_ct_invert_tuple(inverse, orig,
1144                                   __nf_ct_l3proto_find(orig->src.l3num),
1145                                   __nf_ct_proto_find(orig->src.l3num,
1146                                                      orig->dst.protonum));
1147 }
1148
1149 /* Would two expected things clash? */
1150 static inline int expect_clash(const struct nf_conntrack_expect *a,
1151                                const struct nf_conntrack_expect *b)
1152 {
1153         /* Part covered by intersection of masks must be unequal,
1154            otherwise they clash */
1155         struct nf_conntrack_tuple intersect_mask;
1156         int count;
1157
1158         intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1159         intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1160         intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1161         intersect_mask.dst.protonum = a->mask.dst.protonum
1162                                         & b->mask.dst.protonum;
1163
1164         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1165                 intersect_mask.src.u3.all[count] =
1166                         a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1167         }
1168
1169         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1170                 intersect_mask.dst.u3.all[count] =
1171                         a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1172         }
1173
1174         return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1175 }
1176
1177 static inline int expect_matches(const struct nf_conntrack_expect *a,
1178                                  const struct nf_conntrack_expect *b)
1179 {
1180         return a->master == b->master
1181                 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1182                 && nf_ct_tuple_equal(&a->mask, &b->mask);
1183 }
1184
1185 /* Generally a bad idea to call this: could have matched already. */
1186 void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1187 {
1188         struct nf_conntrack_expect *i;
1189
1190         write_lock_bh(&nf_conntrack_lock);
1191         /* choose the oldest expectation to evict */
1192         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1193                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1194                         nf_ct_unlink_expect(i);
1195                         write_unlock_bh(&nf_conntrack_lock);
1196                         nf_conntrack_expect_put(i);
1197                         return;
1198                 }
1199         }
1200         write_unlock_bh(&nf_conntrack_lock);
1201 }
1202
1203 /* We don't increase the master conntrack refcount for unfulfilled
1204  * expectations. During conntrack destruction, the expectations are
1205  * always killed before the conntrack itself */
1206 struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1207 {
1208         struct nf_conntrack_expect *new;
1209
1210         new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1211         if (!new) {
1212                 DEBUGP("expect_related: OOM allocating expect\n");
1213                 return NULL;
1214         }
1215         new->master = me;
1216         atomic_set(&new->use, 1);
1217         return new;
1218 }
1219
1220 void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1221 {
1222         if (atomic_dec_and_test(&exp->use))
1223                 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1224 }
1225
1226 static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1227 {
1228         struct nf_conn_help *master_help = nfct_help(exp->master);
1229
1230         atomic_inc(&exp->use);
1231         master_help->expecting++;
1232         list_add(&exp->list, &nf_conntrack_expect_list);
1233
1234         init_timer(&exp->timeout);
1235         exp->timeout.data = (unsigned long)exp;
1236         exp->timeout.function = expectation_timed_out;
1237         exp->timeout.expires = jiffies + master_help->helper->timeout * HZ;
1238         add_timer(&exp->timeout);
1239
1240         exp->id = ++nf_conntrack_expect_next_id;
1241         atomic_inc(&exp->use);
1242         NF_CT_STAT_INC(expect_create);
1243 }
1244
1245 /* Race with expectations being used means we could have none to find; OK. */
1246 static void evict_oldest_expect(struct nf_conn *master)
1247 {
1248         struct nf_conntrack_expect *i;
1249
1250         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1251                 if (i->master == master) {
1252                         if (del_timer(&i->timeout)) {
1253                                 nf_ct_unlink_expect(i);
1254                                 nf_conntrack_expect_put(i);
1255                         }
1256                         break;
1257                 }
1258         }
1259 }
1260
1261 static inline int refresh_timer(struct nf_conntrack_expect *i)
1262 {
1263         struct nf_conn_help *master_help = nfct_help(i->master);
1264
1265         if (!del_timer(&i->timeout))
1266                 return 0;
1267
1268         i->timeout.expires = jiffies + master_help->helper->timeout*HZ;
1269         add_timer(&i->timeout);
1270         return 1;
1271 }
1272
1273 int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1274 {
1275         struct nf_conntrack_expect *i;
1276         struct nf_conn *master = expect->master;
1277         struct nf_conn_help *master_help = nfct_help(master);
1278         int ret;
1279
1280         NF_CT_ASSERT(master_help);
1281
1282         DEBUGP("nf_conntrack_expect_related %p\n", expect);
1283         DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1284         DEBUGP("mask:  "); NF_CT_DUMP_TUPLE(&expect->mask);
1285
1286         write_lock_bh(&nf_conntrack_lock);
1287         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1288                 if (expect_matches(i, expect)) {
1289                         /* Refresh timer: if it's dying, ignore.. */
1290                         if (refresh_timer(i)) {
1291                                 ret = 0;
1292                                 goto out;
1293                         }
1294                 } else if (expect_clash(i, expect)) {
1295                         ret = -EBUSY;
1296                         goto out;
1297                 }
1298         }
1299         /* Will be over limit? */
1300         if (master_help->helper->max_expected &&
1301             master_help->expecting >= master_help->helper->max_expected)
1302                 evict_oldest_expect(master);
1303
1304         nf_conntrack_expect_insert(expect);
1305         nf_conntrack_expect_event(IPEXP_NEW, expect);
1306         ret = 0;
1307 out:
1308         write_unlock_bh(&nf_conntrack_lock);
1309         return ret;
1310 }
1311
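A helper's help() callback typically builds an expectation once it has parsed the payload and then hands it to nf_conntrack_expect_related(). The sketch below is illustrative only: it assumes ct is the master conntrack and peer_port a port extracted from the payload, and it oversimplifies the mask handling that real helpers (FTP etc.) fill in field by field.

static int example_expect_data_conn(struct nf_conn *ct, __be16 peer_port)
{
        struct nf_conntrack_expect *exp;
        int ret = NF_ACCEPT;

        exp = nf_conntrack_expect_alloc(ct);          /* use count starts at 1 */
        if (exp == NULL)
                return NF_DROP;

        /* expect a TCP connection towards the advertised port */
        exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
        exp->tuple.src.u.tcp.port = 0;
        exp->tuple.dst.u.tcp.port = peer_port;
        exp->tuple.dst.protonum = IPPROTO_TCP;

        memset(&exp->mask, 0xff, sizeof(exp->mask));  /* match everything ... */
        exp->mask.src.u.tcp.port = 0;                 /* ... except the source port */
        exp->expectfn = NULL;
        exp->flags = 0;

        if (nf_conntrack_expect_related(exp) != 0)
                ret = NF_DROP;
        nf_conntrack_expect_put(exp);                 /* drop our reference */
        return ret;
}
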
1312 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1313 {
1314         int ret;
1315         BUG_ON(me->timeout == 0);
1316
1317         ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1318                                           sizeof(struct nf_conn)
1319                                           + sizeof(struct nf_conn_help)
1320                                           + __alignof__(struct nf_conn_help));
1321         if (ret < 0) {
1322                 printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1323                 return ret;
1324         }
1325         write_lock_bh(&nf_conntrack_lock);
1326         list_prepend(&helpers, me);
1327         write_unlock_bh(&nf_conntrack_lock);
1328
1329         return 0;
1330 }
1331
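For completeness, a helper module normally wires up a struct nf_conntrack_helper roughly as sketched below before calling the function above; only fields actually referenced in this file are shown, and all values are illustrative (the real work happens in the help callback, omitted here).

static struct nf_conntrack_helper example_helper = {
        .name         = "example",
        .me           = THIS_MODULE,
        .max_expected = 1,               /* see evict_oldest_expect() above */
        .timeout      = 5 * 60,          /* expectation lifetime in seconds */
        /* .tuple/.mask select which connections get this helper attached;
         * .help parses the payload and calls nf_conntrack_expect_related() */
};

/* module init / exit */
ret = nf_conntrack_helper_register(&example_helper);
nf_conntrack_helper_unregister(&example_helper);
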
1332 struct nf_conntrack_helper *
1333 __nf_conntrack_helper_find_byname(const char *name)
1334 {
1335         struct nf_conntrack_helper *h;
1336
1337         list_for_each_entry(h, &helpers, list) {
1338                 if (!strcmp(h->name, name))
1339                         return h;
1340         }
1341
1342         return NULL;
1343 }
1344
1345 static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1346                          const struct nf_conntrack_helper *me)
1347 {
1348         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
1349         struct nf_conn_help *help = nfct_help(ct);
1350
1351         if (help && help->helper == me) {
1352                 nf_conntrack_event(IPCT_HELPER, ct);
1353                 help->helper = NULL;
1354         }
1355         return 0;
1356 }
1357
1358 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1359 {
1360         unsigned int i;
1361         struct nf_conntrack_expect *exp, *tmp;
1362
1363         /* Need write lock here, to delete helper. */
1364         write_lock_bh(&nf_conntrack_lock);
1365         LIST_DELETE(&helpers, me);
1366
1367         /* Get rid of expectations */
1368         list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1369                 struct nf_conn_help *help = nfct_help(exp->master);
1370                 if (help->helper == me && del_timer(&exp->timeout)) {
1371                         nf_ct_unlink_expect(exp);
1372                         nf_conntrack_expect_put(exp);
1373                 }
1374         }
1375
1376         /* Get rid of expecteds, set helpers to NULL. */
1377         LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1378         for (i = 0; i < nf_conntrack_htable_size; i++)
1379                 LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1380                             struct nf_conntrack_tuple_hash *, me);
1381         write_unlock_bh(&nf_conntrack_lock);
1382
1383         /* Someone could still be looking at the helper in a bh. */
1384         synchronize_net();
1385 }
1386
1387 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1388 void __nf_ct_refresh_acct(struct nf_conn *ct,
1389                           enum ip_conntrack_info ctinfo,
1390                           const struct sk_buff *skb,
1391                           unsigned long extra_jiffies,
1392                           int do_acct)
1393 {
1394         int event = 0;
1395
1396         NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1397         NF_CT_ASSERT(skb);
1398
1399         write_lock_bh(&nf_conntrack_lock);
1400
1401         /* Only update if this is not a fixed timeout */
1402         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1403                 write_unlock_bh(&nf_conntrack_lock);
1404                 return;
1405         }
1406
1407         /* If not in hash table, timer will not be active yet */
1408         if (!nf_ct_is_confirmed(ct)) {
1409                 ct->timeout.expires = extra_jiffies;
1410                 event = IPCT_REFRESH;
1411         } else {
1412                 /* Need del_timer for race avoidance (may already be dying). */
1413                 if (del_timer(&ct->timeout)) {
1414                         ct->timeout.expires = jiffies + extra_jiffies;
1415                         add_timer(&ct->timeout);
1416                         event = IPCT_REFRESH;
1417                 }
1418         }
1419
1420 #ifdef CONFIG_NF_CT_ACCT
1421         if (do_acct) {
1422                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1423                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1424                         skb->len - (unsigned int)(skb->nh.raw - skb->data);
1425                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1426                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1427                         event |= IPCT_COUNTER_FILLING;
1428         }
1429 #endif
1430
1431         write_unlock_bh(&nf_conntrack_lock);
1432
1433         /* must be unlocked when calling event cache */
1434         if (event)
1435                 nf_conntrack_event_cache(event, skb);
1436 }
1437
1438 #if defined(CONFIG_NF_CT_NETLINK) || \
1439     defined(CONFIG_NF_CT_NETLINK_MODULE)
1440
1441 #include <linux/netfilter/nfnetlink.h>
1442 #include <linux/netfilter/nfnetlink_conntrack.h>
1443 #include <linux/mutex.h>
1444
1445
1446 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1447  * in nf_conntrack_core, since we don't want the protocols to autoload
1448  * or depend on ctnetlink */
1449 int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1450                                const struct nf_conntrack_tuple *tuple)
1451 {
1452         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1453                 &tuple->src.u.tcp.port);
1454         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1455                 &tuple->dst.u.tcp.port);
1456         return 0;
1457
1458 nfattr_failure:
1459         return -1;
1460 }
1461
1462 static const size_t cta_min_proto[CTA_PROTO_MAX] = {
1463         [CTA_PROTO_SRC_PORT-1]  = sizeof(u_int16_t),
1464         [CTA_PROTO_DST_PORT-1]  = sizeof(u_int16_t)
1465 };
1466
1467 int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1468                                struct nf_conntrack_tuple *t)
1469 {
1470         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1471                 return -EINVAL;
1472
1473         if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
1474                 return -EINVAL;
1475
1476         t->src.u.tcp.port =
1477                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1478         t->dst.u.tcp.port =
1479                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1480
1481         return 0;
1482 }
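
     /* Both port helpers above are meant to be plugged into a protocol
      * tracker's nf_conntrack_protocol structure.  A rough, illustrative
      * sketch (my_proto is hypothetical; the in-tree TCP/UDP trackers wire
      * up the same two fields):
      *
      *     struct nf_conntrack_protocol my_proto = {
      *             ...
      *             .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr,
      *             .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple,
      *     };
      */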
1483 #endif
1484
1485 /* Used by ipt_REJECT and ip6t_REJECT. */
1486 void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1487 {
1488         struct nf_conn *ct;
1489         enum ip_conntrack_info ctinfo;
1490
1491         /* This ICMP is in reverse direction to the packet which caused it */
1492         ct = nf_ct_get(skb, &ctinfo);
1493         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1494                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1495         else
1496                 ctinfo = IP_CT_RELATED;
1497
1498         /* Attach to new skbuff, and increment count */
1499         nskb->nfct = &ct->ct_general;
1500         nskb->nfctinfo = ctinfo;
1501         nf_conntrack_get(nskb->nfct);
1502 }
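
     /* Note: ipt_REJECT/ip6t_REJECT do not call this function directly; they
      * normally go through nf_ct_attach(), which dereferences the global
      * ip_ct_attach pointer that nf_conntrack_init() below aims at this
      * function.  The indirection lets the REJECT targets work even when
      * connection tracking is not loaded (the pointer is simply NULL then). */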
1503
1504 static inline int
1505 do_iter(const struct nf_conntrack_tuple_hash *i,
1506         int (*iter)(struct nf_conn *i, void *data),
1507         void *data)
1508 {
1509         return iter(nf_ct_tuplehash_to_ctrack(i), data);
1510 }
1511
1512 /* Bring out ya dead! */
1513 static struct nf_conntrack_tuple_hash *
1514 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1515                 void *data, unsigned int *bucket)
1516 {
1517         struct nf_conntrack_tuple_hash *h = NULL;
1518
1519         write_lock_bh(&nf_conntrack_lock);
1520         for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1521                 h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
1522                                 struct nf_conntrack_tuple_hash *, iter, data);
1523                 if (h)
1524                         break;
1525         }
1526         if (!h)
1527                 h = LIST_FIND_W(&unconfirmed, do_iter,
1528                                 struct nf_conntrack_tuple_hash *, iter, data);
1529         if (h)
1530                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1531         write_unlock_bh(&nf_conntrack_lock);
1532
1533         return h;
1534 }
1535
1536 void
1537 nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1538 {
1539         struct nf_conntrack_tuple_hash *h;
1540         unsigned int bucket = 0;
1541
1542         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1543                 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1544                 /* Time to push up daisies... */
1545                 if (del_timer(&ct->timeout))
1546                         death_by_timeout((unsigned long)ct);
1547                 /* ... else the timer will get him soon. */
1548
1549                 nf_ct_put(ct);
1550         }
1551 }
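
     /* Callers supply a predicate that returns non-zero for every connection
      * that should die; e.g. nf_conntrack_flush() below simply passes the
      * kill_all() helper:
      *
      *     nf_ct_iterate_cleanup(kill_all, NULL);
      */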
1552
1553 static int kill_all(struct nf_conn *i, void *data)
1554 {
1555         return 1;
1556 }
1557
1558 static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1559 {
1560         if (vmalloced)
1561                 vfree(hash);
1562         else
1563                 free_pages((unsigned long)hash, 
1564                            get_order(sizeof(struct list_head) * size));
1565 }
1566
1567 void nf_conntrack_flush(void)
1568 {
1569         nf_ct_iterate_cleanup(kill_all, NULL);
1570 }
1571
1572 /* Mishearing the voices in his head, our hero wonders how he's
1573    supposed to kill the mall. */
1574 void nf_conntrack_cleanup(void)
1575 {
1576         int i;
1577
1578         ip_ct_attach = NULL;
1579
1580         /* This makes sure all current packets have passed through
1581            netfilter framework.  Roll on, two-stage module
1582            delete... */
1583         synchronize_net();
1584
1585         nf_ct_event_cache_flush();
1586  i_see_dead_people:
1587         nf_conntrack_flush();
1588         if (atomic_read(&nf_conntrack_count) != 0) {
1589                 schedule();
1590                 goto i_see_dead_people;
1591         }
1592         /* wait until all references to nf_conntrack_untracked are dropped */
1593         while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
1594                 schedule();
1595
1596         for (i = 0; i < NF_CT_F_NUM; i++) {
1597                 if (nf_ct_cache[i].use == 0)
1598                         continue;
1599
1600                 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1601                 nf_ct_cache[i].use = 1;
1602                 nf_conntrack_unregister_cache(i);
1603         }
1604         kmem_cache_destroy(nf_conntrack_expect_cachep);
1605         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1606                             nf_conntrack_htable_size);
1607
1608         /* free l3proto protocol tables */
1609         for (i = 0; i < PF_MAX; i++)
1610                 if (nf_ct_protos[i]) {
1611                         kfree(nf_ct_protos[i]);
1612                         nf_ct_protos[i] = NULL;
1613                 }
1614 }
1615
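     /* Try physically contiguous pages first; for a large table on a
      * fragmented machine that can fail, in which case we fall back to
      * vmalloc() and record which allocator was used so that
      * free_conntrack_hash() can release the memory with the matching call. */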
1616 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1617 {
1618         struct list_head *hash;
1619         unsigned int i;
1620
1621         *vmalloced = 0; 
1622         hash = (void*)__get_free_pages(GFP_KERNEL, 
1623                                        get_order(sizeof(struct list_head)
1624                                                  * size));
1625         if (!hash) { 
1626                 *vmalloced = 1;
1627                 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1628                 hash = vmalloc(sizeof(struct list_head) * size);
1629         }
1630
1631         if (hash)
1632                 for (i = 0; i < size; i++) 
1633                         INIT_LIST_HEAD(&hash[i]);
1634
1635         return hash;
1636 }
1637
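     /* Runtime resize of the hash table: allocate the new table first, then,
      * under the write lock, unhook every entry from the old buckets, rehash
      * it with a fresh random seed into the new table, swap the global
      * pointers, and only then free the old table outside the lock. */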
1638 static int set_hashsize(const char *val, struct kernel_param *kp)
1639 {
1640         int i, bucket, hashsize, vmalloced;
1641         int old_vmalloced, old_size;
1642         int rnd;
1643         struct list_head *hash, *old_hash;
1644         struct nf_conntrack_tuple_hash *h;
1645
1646         /* On boot, we can set this without any fancy locking. */
1647         if (!nf_conntrack_htable_size)
1648                 return param_set_uint(val, kp);
1649
1650         hashsize = simple_strtol(val, NULL, 0);
1651         if (!hashsize)
1652                 return -EINVAL;
1653
1654         hash = alloc_hashtable(hashsize, &vmalloced);
1655         if (!hash)
1656                 return -ENOMEM;
1657
1658         /* We have to rehash for the new table anyway, so we can also
1659          * use a new random seed */
1660         get_random_bytes(&rnd, 4);
1661
1662         write_lock_bh(&nf_conntrack_lock);
1663         for (i = 0; i < nf_conntrack_htable_size; i++) {
1664                 while (!list_empty(&nf_conntrack_hash[i])) {
1665                         h = list_entry(nf_conntrack_hash[i].next,
1666                                        struct nf_conntrack_tuple_hash, list);
1667                         list_del(&h->list);
1668                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1669                         list_add_tail(&h->list, &hash[bucket]);
1670                 }
1671         }
1672         old_size = nf_conntrack_htable_size;
1673         old_vmalloced = nf_conntrack_vmalloc;
1674         old_hash = nf_conntrack_hash;
1675
1676         nf_conntrack_htable_size = hashsize;
1677         nf_conntrack_vmalloc = vmalloced;
1678         nf_conntrack_hash = hash;
1679         nf_conntrack_hash_rnd = rnd;
1680         write_unlock_bh(&nf_conntrack_lock);
1681
1682         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1683         return 0;
1684 }
1685
1686 module_param_call(hashsize, set_hashsize, param_get_uint,
1687                   &nf_conntrack_htable_size, 0600);
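
     /* The 0600 permission above makes the parameter writable by root, so
      * (assuming this code is built as the nf_conntrack module) the table
      * can be resized at runtime roughly like this:
      *
      *     echo 16384 > /sys/module/nf_conntrack/parameters/hashsize
      */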
1688
1689 int __init nf_conntrack_init(void)
1690 {
1691         unsigned int i;
1692         int ret;
1693
1694         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1695          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1696         if (!nf_conntrack_htable_size) {
1697                 nf_conntrack_htable_size
1698                         = (((num_physpages << PAGE_SHIFT) / 16384)
1699                            / sizeof(struct list_head));
1700                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1701                         nf_conntrack_htable_size = 8192;
1702                 if (nf_conntrack_htable_size < 16)
1703                         nf_conntrack_htable_size = 16;
1704         }
1705         nf_conntrack_max = 8 * nf_conntrack_htable_size;
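
             /* Worked example of the sizing above, assuming 4 kB pages and an
              * 8-byte struct list_head (i386): a 32 MB machine gets
              * 32 MB / 16384 = 2048 bytes of table, i.e. 2048 / 8 = 256
              * buckets, and nf_conntrack_max = 8 * 256 = 2048 connections. */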
1706
1707         printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
1708                NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1709                nf_conntrack_max);
1710
1711         nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1712                                             &nf_conntrack_vmalloc);
1713         if (!nf_conntrack_hash) {
1714                 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1715                 goto err_out;
1716         }
1717
1718         ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1719                                           sizeof(struct nf_conn));
1720         if (ret < 0) {
1721                 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1722                 goto err_free_hash;
1723         }
1724
1725         nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1726                                         sizeof(struct nf_conntrack_expect),
1727                                         0, 0, NULL, NULL);
1728         if (!nf_conntrack_expect_cachep) {
1729                 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1730                 goto err_free_conntrack_slab;
1731         }
1732
1733         /* Don't NEED lock here, but good form anyway. */
1734         write_lock_bh(&nf_conntrack_lock);
1735         for (i = 0; i < PF_MAX; i++)
1736                 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1737         write_unlock_bh(&nf_conntrack_lock);
1738
1739         /* For use by REJECT target */
1740         ip_ct_attach = __nf_conntrack_attach;
1741
1742         /* Set up fake conntrack:
1743             - to never be deleted, not in any hashes */
1744         atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1745         /*  - and make it look like a confirmed connection */
1746         set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1747
1748         return ret;
1749
1750 err_free_conntrack_slab:
1751         nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1752 err_free_hash:
1753         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1754                             nf_conntrack_htable_size);
1755 err_out:
1756         return -ENOMEM;
1757 }