1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40
41 /* This rwlock protects the main hash table, protocol/helper/expected
42    registrations, and conntrack timers. */
43 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
44 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
45
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
50 #include <linux/netfilter_ipv4/listhelp.h>
51
52 #define IP_CONNTRACK_VERSION    "2.1"
53
54 #if 0
55 #define DEBUGP printk
56 #else
57 #define DEBUGP(format, args...)
58 #endif
59
60 DECLARE_RWLOCK(ip_conntrack_lock);
61 DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67 LIST_HEAD(ip_conntrack_expect_list);
68 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69 static LIST_HEAD(helpers);
70 unsigned int ip_conntrack_htable_size = 0;
71 int ip_conntrack_max;
72 struct list_head *ip_conntrack_hash;
73 static kmem_cache_t *ip_conntrack_cachep;
74 static kmem_cache_t *ip_conntrack_expect_cachep;
75 struct ip_conntrack ip_conntrack_untracked;
76 unsigned int ip_ct_log_invalid;
77
78 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
79
80 inline void 
81 ip_conntrack_put(struct ip_conntrack *ct)
82 {
83         IP_NF_ASSERT(ct);
84         nf_conntrack_put(&ct->ct_general);
85 }
86
87 static int ip_conntrack_hash_rnd_initted;
88 static unsigned int ip_conntrack_hash_rnd;
89
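/* Hash a tuple into one of ip_conntrack_htable_size buckets.  The three
 * 32-bit inputs to jhash_3words() are the source address, the destination
 * address xor'd with the protocol number, and both layer-4 ids packed into
 * one word; ip_conntrack_hash_rnd seeds the hash so the chain layout is not
 * predictable from outside, and the result is reduced modulo the table size. */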
90 static u_int32_t
91 hash_conntrack(const struct ip_conntrack_tuple *tuple)
92 {
93 #if 0
94         dump_tuple(tuple);
95 #endif
96         return (jhash_3words(tuple->src.ip,
97                              (tuple->dst.ip ^ tuple->dst.protonum),
98                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
99                              ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
100 }
101
102 int
103 ip_ct_get_tuple(const struct iphdr *iph,
104                 const struct sk_buff *skb,
105                 unsigned int dataoff,
106                 struct ip_conntrack_tuple *tuple,
107                 const struct ip_conntrack_protocol *protocol)
108 {
109         /* Should never happen */
110         if (iph->frag_off & htons(IP_OFFSET)) {
111                 printk("ip_conntrack_core: Frag of proto %u.\n",
112                        iph->protocol);
113                 return 0;
114         }
115
116         tuple->src.ip = iph->saddr;
117         tuple->dst.ip = iph->daddr;
118         tuple->dst.protonum = iph->protocol;
119         tuple->src.u.all = tuple->dst.u.all = 0;
120
121         return protocol->pkt_to_tuple(skb, dataoff, tuple);
122 }
123
124 int
125 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
126                    const struct ip_conntrack_tuple *orig,
127                    const struct ip_conntrack_protocol *protocol)
128 {
129         inverse->src.ip = orig->dst.ip;
130         inverse->dst.ip = orig->src.ip;
131         inverse->dst.protonum = orig->dst.protonum;
132
133         inverse->src.u.all = inverse->dst.u.all = 0;
134
135         return protocol->invert_tuple(inverse, orig);
136 }
137
138
139 /* ip_conntrack_expect helper functions */
140
141 /* Compare tuple parts depending on mask. */
142 static inline int expect_cmp(const struct ip_conntrack_expect *i,
143                              const struct ip_conntrack_tuple *tuple)
144 {
145         MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
146         return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
147 }
148
149 static void
150 destroy_expect(struct ip_conntrack_expect *exp)
151 {
152         DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
153         IP_NF_ASSERT(atomic_read(&exp->use) == 0);
154         IP_NF_ASSERT(!timer_pending(&exp->timeout));
155
156         kmem_cache_free(ip_conntrack_expect_cachep, exp);
157         CONNTRACK_STAT_INC(expect_delete);
158 }
159
160 inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
161 {
162         IP_NF_ASSERT(exp);
163
164         if (atomic_dec_and_test(&exp->use)) {
165                 /* usage count dropped to zero */
166                 destroy_expect(exp);
167         }
168 }
169
170 static inline struct ip_conntrack_expect *
171 __ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
172 {
173         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
174         MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
175         return LIST_FIND(&ip_conntrack_expect_list, expect_cmp, 
176                          struct ip_conntrack_expect *, tuple);
177 }
178
179 /* Find a expectation corresponding to a tuple. */
180 struct ip_conntrack_expect *
181 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
182 {
183         struct ip_conntrack_expect *exp;
184
185         READ_LOCK(&ip_conntrack_lock);
186         READ_LOCK(&ip_conntrack_expect_tuple_lock);
187         exp = __ip_ct_expect_find(tuple);
188         if (exp)
189                 atomic_inc(&exp->use);
190         READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
191         READ_UNLOCK(&ip_conntrack_lock);
192
193         return exp;
194 }
195
196 /* remove one specific expectation from all lists and drop refcount,
197  * does _NOT_ delete the timer. */
198 static void __unexpect_related(struct ip_conntrack_expect *expect)
199 {
200         DEBUGP("unexpect_related(%p)\n", expect);
201         MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
202
203         /* we're not allowed to unexpect a confirmed expectation! */
204         IP_NF_ASSERT(!expect->sibling);
205
206         /* delete from global and local lists */
207         list_del(&expect->list);
208         list_del(&expect->expected_list);
209
210         /* decrement expect-count of master conntrack */
211         if (expect->expectant)
212                 expect->expectant->expecting--;
213
214         ip_conntrack_expect_put(expect);
215 }
216
217 /* remove one specific expectation from all lists, drop refcount
218  * and expire timer.
219  * This function can _NOT_ be called for confirmed expects! */
220 static void unexpect_related(struct ip_conntrack_expect *expect)
221 {
222         IP_NF_ASSERT(expect->expectant);
223         IP_NF_ASSERT(expect->expectant->helper);
224         /* if we are supposed to have a timer, but we can't delete
225          * it: race condition.  __unexpect_related will
226          * be called by the timeout function */
227         if (expect->expectant->helper->timeout
228             && !del_timer(&expect->timeout))
229                 return;
230
231         __unexpect_related(expect);
232 }
233
234 /* delete all unconfirmed expectations for this conntrack */
235 static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
236 {
237         struct list_head *exp_entry, *next;
238         struct ip_conntrack_expect *exp;
239
240         DEBUGP("remove_expectations(%p)\n", ct);
241
242         list_for_each_safe(exp_entry, next, &ct->sibling_list) {
243                 exp = list_entry(exp_entry, struct ip_conntrack_expect,
244                                  expected_list);
245
246                 /* we skip established expectations, as we want to delete
247                  * the un-established ones only */
248                 if (exp->sibling) {
249                         DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
250                         if (drop_refcount) {
251                                 /* Indicate that this expectation's parent is dead */
252                                 ip_conntrack_put(exp->expectant);
253                                 exp->expectant = NULL;
254                         }
255                         continue;
256                 }
257
258                 IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
259                 IP_NF_ASSERT(exp->expectant == ct);
260
261                 /* delete expectation from global and private lists */
262                 unexpect_related(exp);
263         }
264 }
265
266 static void
267 clean_from_lists(struct ip_conntrack *ct)
268 {
269         unsigned int ho, hr;
270         
271         DEBUGP("clean_from_lists(%p)\n", ct);
272         MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
273
274         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
275         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
276         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
277         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
278
279         /* Destroy all un-established, pending expectations */
280         remove_expectations(ct, 1);
281 }
282
283 static void
284 destroy_conntrack(struct nf_conntrack *nfct)
285 {
286         struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
287         struct ip_conntrack_protocol *proto;
288
289         DEBUGP("destroy_conntrack(%p)\n", ct);
290         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
291         IP_NF_ASSERT(!timer_pending(&ct->timeout));
292
293         /* To make sure we don't get any weird locking issues here:
294          * destroy_conntrack() MUST NOT be called with a write lock
295          * to ip_conntrack_lock!!! -HW */
296         proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
297         if (proto && proto->destroy)
298                 proto->destroy(ct);
299
300         if (ip_conntrack_destroyed)
301                 ip_conntrack_destroyed(ct);
302
303         WRITE_LOCK(&ip_conntrack_lock);
304         /* Make sure we don't leave any orphaned expectations lying around */
305         if (ct->expecting)
306                 remove_expectations(ct, 1);
307
308         /* Delete our master expectation */
309         if (ct->master) {
310                 if (ct->master->expectant) {
311                         /* can't call __unexpect_related here,
312                          * since it would screw up expect_list */
313                         list_del(&ct->master->expected_list);
314                         master = ct->master->expectant;
315                 }
316                 kmem_cache_free(ip_conntrack_expect_cachep, ct->master);
317         }
318         CONNTRACK_STAT_INC(delete);
319         WRITE_UNLOCK(&ip_conntrack_lock);
320
321         if (master)
322                 ip_conntrack_put(master);
323
324         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
325         kmem_cache_free(ip_conntrack_cachep, ct);
326         atomic_dec(&ip_conntrack_count);
327 }
328
329 static void death_by_timeout(unsigned long ul_conntrack)
330 {
331         struct ip_conntrack *ct = (void *)ul_conntrack;
332
333         WRITE_LOCK(&ip_conntrack_lock);
334         /* Inside lock so preempt is disabled on module removal path.
335          * Otherwise we can get spurious warnings. */
336         CONNTRACK_STAT_INC(delete_list);
337         clean_from_lists(ct);
338         WRITE_UNLOCK(&ip_conntrack_lock);
339         ip_conntrack_put(ct);
340 }
341
342 static inline int
343 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
344                     const struct ip_conntrack_tuple *tuple,
345                     const struct ip_conntrack *ignored_conntrack)
346 {
347         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
348         return i->ctrack != ignored_conntrack
349                 && ip_ct_tuple_equal(tuple, &i->tuple);
350 }
351
352 static struct ip_conntrack_tuple_hash *
353 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
354                     const struct ip_conntrack *ignored_conntrack)
355 {
356         struct ip_conntrack_tuple_hash *h;
357         unsigned int hash = hash_conntrack(tuple);
358
359         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
360         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
361                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
362                         CONNTRACK_STAT_INC(found);
363                         return h;
364                 }
365                 CONNTRACK_STAT_INC(searched);
366         }
367
368         return NULL;
369 }
370
371 /* Find a connection corresponding to a tuple. */
372 struct ip_conntrack_tuple_hash *
373 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
374                       const struct ip_conntrack *ignored_conntrack)
375 {
376         struct ip_conntrack_tuple_hash *h;
377
378         READ_LOCK(&ip_conntrack_lock);
379         h = __ip_conntrack_find(tuple, ignored_conntrack);
380         if (h)
381                 atomic_inc(&h->ctrack->ct_general.use);
382         READ_UNLOCK(&ip_conntrack_lock);
383
384         return h;
385 }
386
387 /* Confirm a connection given skb; places it in hash table */
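/* A new conntrack is not put into the hash table by init_conntrack(); it is
 * only inserted here, once the first packet has made it through all the
 * netfilter hooks.  Insertion links both the ORIGINAL and REPLY tuples,
 * starts the timeout timer relative to the confirmation time, takes an extra
 * reference owned by the hash table and sets IPS_CONFIRMED. */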
388 int
389 __ip_conntrack_confirm(struct sk_buff *skb)
390 {
391         unsigned int hash, repl_hash;
392         struct ip_conntrack *ct;
393         enum ip_conntrack_info ctinfo;
394
395         ct = ip_conntrack_get(skb, &ctinfo);
396
397         /* ipt_REJECT uses ip_conntrack_attach to attach related
398            ICMP/TCP RST packets in other direction.  Actual packet
399            which created connection will be IP_CT_NEW or for an
400            expected connection, IP_CT_RELATED. */
401         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
402                 return NF_ACCEPT;
403
404         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
405         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
406
407         /* We're not in hash table, and we refuse to set up related
408            connections for unconfirmed conns.  But packet copies and
409            REJECT will give spurious warnings here. */
410         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
411
412         /* No external references means no one else could have
413            confirmed us. */
414         IP_NF_ASSERT(!is_confirmed(ct));
415         DEBUGP("Confirming conntrack %p\n", ct);
416
417         WRITE_LOCK(&ip_conntrack_lock);
418         /* See if there's one in the list already, including reverse:
419            NAT could have grabbed it without realizing, since we're
420            not in the hash.  If there is, we lost race. */
421         if (!LIST_FIND(&ip_conntrack_hash[hash],
422                        conntrack_tuple_cmp,
423                        struct ip_conntrack_tuple_hash *,
424                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
425             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
426                           conntrack_tuple_cmp,
427                           struct ip_conntrack_tuple_hash *,
428                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
429                 list_prepend(&ip_conntrack_hash[hash],
430                              &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
431                 list_prepend(&ip_conntrack_hash[repl_hash],
432                              &ct->tuplehash[IP_CT_DIR_REPLY]);
433                 /* Timer relative to confirmation time, not original
434                    setting time, otherwise we'd get timer wrap in
435                    weird delay cases. */
436                 ct->timeout.expires += jiffies;
437                 add_timer(&ct->timeout);
438                 atomic_inc(&ct->ct_general.use);
439                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
440                 CONNTRACK_STAT_INC(insert);
441                 WRITE_UNLOCK(&ip_conntrack_lock);
442                 return NF_ACCEPT;
443         }
444
445         CONNTRACK_STAT_INC(insert_failed);
446         WRITE_UNLOCK(&ip_conntrack_lock);
447
448         return NF_DROP;
449 }
450
451 /* Returns true if a connection corresponds to the tuple (required
452    for NAT). */
453 int
454 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
455                          const struct ip_conntrack *ignored_conntrack)
456 {
457         struct ip_conntrack_tuple_hash *h;
458
459         READ_LOCK(&ip_conntrack_lock);
460         h = __ip_conntrack_find(tuple, ignored_conntrack);
461         READ_UNLOCK(&ip_conntrack_lock);
462
463         return h != NULL;
464 }
465
466 /* There's a small race here where we may free a just-assured
467    connection.  Too bad: we're in trouble anyway. */
468 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
469 {
470         return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
471 }
472
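/* Called when the table is full: try to evict one entry from the given hash
 * chain.  Only connections that have never been marked ASSURED are
 * candidates, and the chain is scanned from the tail, so the victim is
 * roughly the oldest unreplied connection in that bucket. */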
473 static int early_drop(struct list_head *chain)
474 {
475         /* Traverse backwards: gives us oldest, which is roughly LRU */
476         struct ip_conntrack_tuple_hash *h;
477         int dropped = 0;
478
479         READ_LOCK(&ip_conntrack_lock);
480         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
481         if (h)
482                 atomic_inc(&h->ctrack->ct_general.use);
483         READ_UNLOCK(&ip_conntrack_lock);
484
485         if (!h)
486                 return dropped;
487
488         if (del_timer(&h->ctrack->timeout)) {
489                 death_by_timeout((unsigned long)h->ctrack);
490                 dropped = 1;
491                 CONNTRACK_STAT_INC(early_drop);
492         }
493         ip_conntrack_put(h->ctrack);
494         return dropped;
495 }
496
497 static inline int helper_cmp(const struct ip_conntrack_helper *i,
498                              const struct ip_conntrack_tuple *rtuple)
499 {
500         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
501 }
502
503 struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
504 {
505         return LIST_FIND(&helpers, helper_cmp,
506                          struct ip_conntrack_helper *,
507                          tuple);
508 }
509
510 /* Allocate a new conntrack: we return -ENOMEM if classification
511    failed due to stress.  Otherwise it really is unclassifiable. */
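/* The new entry starts out unconfirmed: it is not put into the hash table
 * and its timer is not started until __ip_conntrack_confirm() runs.  If the
 * tuple matches a pending expectation whose master is already confirmed, the
 * conntrack is linked to that master and flagged IPS_EXPECTED; otherwise a
 * helper is looked up from the reply tuple. */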
512 static struct ip_conntrack_tuple_hash *
513 init_conntrack(const struct ip_conntrack_tuple *tuple,
514                struct ip_conntrack_protocol *protocol,
515                struct sk_buff *skb)
516 {
517         struct ip_conntrack *conntrack;
518         struct ip_conntrack_tuple repl_tuple;
519         size_t hash;
520         struct ip_conntrack_expect *expected;
521
522         if (!ip_conntrack_hash_rnd_initted) {
523                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
524                 ip_conntrack_hash_rnd_initted = 1;
525         }
526
527         hash = hash_conntrack(tuple);
528
529         if (ip_conntrack_max
530             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
531                 /* Try dropping from this hash chain. */
532                 if (!early_drop(&ip_conntrack_hash[hash])) {
533                         if (net_ratelimit())
534                                 printk(KERN_WARNING
535                                        "ip_conntrack: table full, dropping"
536                                        " packet.\n");
537                         return ERR_PTR(-ENOMEM);
538                 }
539         }
540
541         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
542                 DEBUGP("Can't invert tuple.\n");
543                 return NULL;
544         }
545
546         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
547         if (!conntrack) {
548                 DEBUGP("Can't allocate conntrack.\n");
549                 return ERR_PTR(-ENOMEM);
550         }
551
552         memset(conntrack, 0, sizeof(*conntrack));
553         atomic_set(&conntrack->ct_general.use, 1);
554         conntrack->ct_general.destroy = destroy_conntrack;
555         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
556         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
557         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
558         conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
559 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
560         conntrack->xid[IP_CT_DIR_ORIGINAL] = -1;
561         conntrack->xid[IP_CT_DIR_REPLY] = -1;
562 #endif
563
564         if (!protocol->new(conntrack, skb)) {
565                 kmem_cache_free(ip_conntrack_cachep, conntrack);
566                 return NULL;
567         }
568         /* Don't set timer yet: wait for confirmation */
569         init_timer(&conntrack->timeout);
570         conntrack->timeout.data = (unsigned long)conntrack;
571         conntrack->timeout.function = death_by_timeout;
572
573         INIT_LIST_HEAD(&conntrack->sibling_list);
574
575         WRITE_LOCK(&ip_conntrack_lock);
576         /* Need finding and deleting of expected ONLY if we win race */
577         READ_LOCK(&ip_conntrack_expect_tuple_lock);
578         expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
579                              struct ip_conntrack_expect *, tuple);
580         READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
581
582         if (expected) {
583                 /* If master is not in hash table yet (ie. packet hasn't left
584                    this machine yet), how can other end know about expected?
585                    Hence these are not the droids you are looking for (if
586                    master ct never got confirmed, we'd hold a reference to it
587                    and weird things would happen to future packets). */
588                 if (!is_confirmed(expected->expectant)) {
589                         conntrack->helper = ip_ct_find_helper(&repl_tuple);
590                         goto end;
591                 }
592
593                 /* Expectation is dying... */
594                 if (expected->expectant->helper->timeout
595                     && !del_timer(&expected->timeout))
596                         goto end;       
597
598                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
599                         conntrack, expected);
600                 /* Welcome, Mr. Bond.  We've been expecting you... */
601                 IP_NF_ASSERT(expected->expectant);
602                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
603                 conntrack->master = expected;
604                 expected->sibling = conntrack;
605 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
606                 conntrack->mark = expected->expectant->mark;
607 #endif
608                 LIST_DELETE(&ip_conntrack_expect_list, expected);
609                 expected->expectant->expecting--;
610                 nf_conntrack_get(&master_ct(conntrack)->ct_general);
611
612                 /* this is a braindead... --pablo */
613                 atomic_inc(&ip_conntrack_count);
614                 WRITE_UNLOCK(&ip_conntrack_lock);
615
616                 if (expected->expectfn)
617                         expected->expectfn(conntrack);
618         
619                 CONNTRACK_STAT_INC(expect_new);
620
621                 goto ret;
622         } else  {
623                 conntrack->helper = ip_ct_find_helper(&repl_tuple);
624
625                 CONNTRACK_STAT_INC(new);
626         }
627
628 end:    atomic_inc(&ip_conntrack_count);
629         WRITE_UNLOCK(&ip_conntrack_lock);
630
631 ret:    return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
632 }
633
634 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
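/* The ctinfo value encodes both state and direction: a packet in the REPLY
 * direction is reported as IP_CT_ESTABLISHED + IP_CT_IS_REPLY, while
 * ORIGINAL-direction packets are classified as ESTABLISHED once a reply has
 * been seen, RELATED if the connection was expected, and NEW otherwise. */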
635 static inline struct ip_conntrack *
636 resolve_normal_ct(struct sk_buff *skb,
637                   struct ip_conntrack_protocol *proto,
638                   int *set_reply,
639                   unsigned int hooknum,
640                   enum ip_conntrack_info *ctinfo)
641 {
642         struct ip_conntrack_tuple tuple;
643         struct ip_conntrack_tuple_hash *h;
644
645         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
646
647         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
648                                 &tuple,proto))
649                 return NULL;
650
651         /* look for tuple match */
652         h = ip_conntrack_find_get(&tuple, NULL);
653         if (!h) {
654                 h = init_conntrack(&tuple, proto, skb);
655                 if (!h)
656                         return NULL;
657                 if (IS_ERR(h))
658                         return (void *)h;
659         }
660
661         /* It exists; we have (non-exclusive) reference. */
662         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
663                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
664                 /* Please set reply bit if this packet OK */
665                 *set_reply = 1;
666         } else {
667                 /* Once we've had two way comms, always ESTABLISHED. */
668                 if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
669                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
670                                h->ctrack);
671                         *ctinfo = IP_CT_ESTABLISHED;
672                 } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
673                         DEBUGP("ip_conntrack_in: related packet for %p\n",
674                                h->ctrack);
675                         *ctinfo = IP_CT_RELATED;
676                 } else {
677                         DEBUGP("ip_conntrack_in: new packet for %p\n",
678                                h->ctrack);
679                         *ctinfo = IP_CT_NEW;
680                 }
681                 *set_reply = 0;
682         }
683         skb->nfct = &h->ctrack->ct_general;
684         skb->nfctinfo = *ctinfo;
685         return h->ctrack;
686 }
687
688 /* Netfilter hook itself. */
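/* Per-packet flow: packets that already carry a conntrack reference (e.g.
 * loopback or untracked) are ignored, and fragments should never reach us
 * here.  The protocol's error() hook can reject malformed packets early,
 * resolve_normal_ct() finds or creates the conntrack, packet() updates the
 * protocol state machine, and a registered helper gets a look at the payload
 * before the reply-seen bit is finally set. */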
689 unsigned int ip_conntrack_in(unsigned int hooknum,
690                              struct sk_buff **pskb,
691                              const struct net_device *in,
692                              const struct net_device *out,
693                              int (*okfn)(struct sk_buff *))
694 {
695         struct ip_conntrack *ct;
696         enum ip_conntrack_info ctinfo;
697         struct ip_conntrack_protocol *proto;
698         int set_reply;
699         int ret;
700
701         /* Previously seen (loopback or untracked)?  Ignore. */
702         if ((*pskb)->nfct) {
703                 CONNTRACK_STAT_INC(ignore);
704                 return NF_ACCEPT;
705         }
706
707         /* Should never happen */
708         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
709                 if (net_ratelimit()) {
710                 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
711                        (*pskb)->nh.iph->protocol, hooknum);
712                 }
713                 return NF_DROP;
714         }
715
716         /* FIXME: Do this right please. --RR */
717         (*pskb)->nfcache |= NFC_UNKNOWN;
718
719 /* Doesn't cover locally-generated broadcast, so not worth it. */
720 #if 0
721         /* Ignore broadcast: no `connection'. */
722         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
723                 printk("Broadcast packet!\n");
724                 return NF_ACCEPT;
725         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
726                    == htonl(0x000000FF)) {
727                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
728                        NIPQUAD((*pskb)->nh.iph->saddr),
729                        NIPQUAD((*pskb)->nh.iph->daddr),
730                        (*pskb)->sk, (*pskb)->pkt_type);
731         }
732 #endif
733
734         proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
735
736         /* It may be a special packet: error, unclean...
737          * the inverse of the return code tells the netfilter
738          * core what to do with the packet. */
739         if (proto->error != NULL 
740             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
741                 CONNTRACK_STAT_INC(error);
742                 CONNTRACK_STAT_INC(invalid);
743                 return -ret;
744         }
745
746         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
747                 /* Not valid part of a connection */
748                 CONNTRACK_STAT_INC(invalid);
749                 return NF_ACCEPT;
750         }
751
752         if (IS_ERR(ct)) {
753                 /* Too stressed to deal. */
754                 CONNTRACK_STAT_INC(drop);
755                 return NF_DROP;
756         }
757
758         IP_NF_ASSERT((*pskb)->nfct);
759
760         ret = proto->packet(ct, *pskb, ctinfo);
761         if (ret < 0) {
762                 /* Invalid: inverse of the return code tells
763                  * the netfilter core what to do*/
764                 nf_conntrack_put((*pskb)->nfct);
765                 (*pskb)->nfct = NULL;
766                 CONNTRACK_STAT_INC(invalid);
767                 return -ret;
768         }
769
770         if (ret != NF_DROP && ct->helper) {
771                 ret = ct->helper->help(*pskb, ct, ctinfo);
772                 if (ret == -1) {
773                         /* Invalid */
774                         CONNTRACK_STAT_INC(invalid);
775                         nf_conntrack_put((*pskb)->nfct);
776                         (*pskb)->nfct = NULL;
777                         return NF_ACCEPT;
778                 }
779         }
780         if (set_reply)
781                 set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
782
783         return ret;
784 }
785
786 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
787                    const struct ip_conntrack_tuple *orig)
788 {
789         return ip_ct_invert_tuple(inverse, orig, 
790                                   ip_ct_find_proto(orig->dst.protonum));
791 }
792
793 static inline int resent_expect(const struct ip_conntrack_expect *i,
794                                 const struct ip_conntrack_tuple *tuple,
795                                 const struct ip_conntrack_tuple *mask)
796 {
797         DEBUGP("resent_expect\n");
798         DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
799         DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
800         DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
801         return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
802                  || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
803                 && ip_ct_tuple_equal(&i->mask, mask));
804 }
805
806 /* Would two expected things clash? */
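/* Two expectations clash when one tuple matches the other under the
 * intersection of their masks, i.e. every field that both expectations care
 * about carries the same value.  The intersection is built by AND-ing the
 * two masks field by field. */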
807 static inline int expect_clash(const struct ip_conntrack_expect *i,
808                                const struct ip_conntrack_tuple *tuple,
809                                const struct ip_conntrack_tuple *mask)
810 {
811         /* Part covered by intersection of masks must be unequal,
812            otherwise they clash */
813         struct ip_conntrack_tuple intersect_mask
814                 = { { i->mask.src.ip & mask->src.ip,
815                       { i->mask.src.u.all & mask->src.u.all } },
816                     { i->mask.dst.ip & mask->dst.ip,
817                       { i->mask.dst.u.all & mask->dst.u.all },
818                       i->mask.dst.protonum & mask->dst.protonum } };
819
820         return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
821 }
822
823 inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
824 {
825         WRITE_LOCK(&ip_conntrack_lock);
826         unexpect_related(expect);
827         WRITE_UNLOCK(&ip_conntrack_lock);
828 }
829         
830 static void expectation_timed_out(unsigned long ul_expect)
831 {
832         struct ip_conntrack_expect *expect = (void *) ul_expect;
833
834         DEBUGP("expectation %p timed out\n", expect);   
835         WRITE_LOCK(&ip_conntrack_lock);
836         __unexpect_related(expect);
837         WRITE_UNLOCK(&ip_conntrack_lock);
838 }
839
840 struct ip_conntrack_expect *
841 ip_conntrack_expect_alloc(void)
842 {
843         struct ip_conntrack_expect *new;
844
845         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
846         if (!new) {
847                 DEBUGP("expect_related: OOM allocating expect\n");
848                 return NULL;
849         }
850
851         /* tuple_cmp compares the whole union, so we have to initialize it cleanly */
852         memset(new, 0, sizeof(struct ip_conntrack_expect));
853         atomic_set(&new->use, 1);
854
855         return new;
856 }
857
858 static void
859 ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
860                            struct ip_conntrack *related_to)
861 {
862         DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
863         new->expectant = related_to;
864         new->sibling = NULL;
865
866         /* add to expected list for this connection */
867         list_add_tail(&new->expected_list, &related_to->sibling_list);
868         /* add to global list of expectations */
869         list_prepend(&ip_conntrack_expect_list, &new->list);
870         /* add and start timer if required */
871         if (related_to->helper->timeout) {
872                 init_timer(&new->timeout);
873                 new->timeout.data = (unsigned long)new;
874                 new->timeout.function = expectation_timed_out;
875                 new->timeout.expires = jiffies +
876                                         related_to->helper->timeout * HZ;
877                 add_timer(&new->timeout);
878         }
879         related_to->expecting++;
880 }
881
882 /* Add a related connection. */
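/* Several outcomes are possible: a resent expectation normally only has its
 * timeout refreshed (-EEXIST); when the helper's max_expected limit is
 * reached the request is either refused outright (-EPERM) or, with
 * IP_CT_HELPER_F_REUSE_EXPECT, the oldest unconfirmed expectation is dropped
 * to make room; a clash with a different pending expectation is rejected
 * with -EBUSY.  In the remaining cases the new expectation is inserted and
 * its timer started. */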
883 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
884                                 struct ip_conntrack *related_to)
885 {
886         struct ip_conntrack_expect *old;
887         int ret = 0;
888
889         WRITE_LOCK(&ip_conntrack_lock);
890         /* Because of the write lock, no reader can walk the lists,
891          * so there is no need to use the tuple lock too */
892
893         DEBUGP("ip_conntrack_expect_related %p\n", related_to);
894         DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
895         DEBUGP("mask:  "); DUMP_TUPLE_RAW(&expect->mask);
896
897         old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
898                         struct ip_conntrack_expect *, &expect->tuple, 
899                         &expect->mask);
900         if (old) {
901                 /* Helper private data may contain offsets but no pointers
902                    pointing into the payload - otherwise we would have to copy
903                    the data filled out by the helper over the old one */
904                 DEBUGP("expect_related: resent packet\n");
905                 if (related_to->helper->timeout) {
906                         if (!del_timer(&old->timeout)) {
907                                 /* expectation is dying. Fall through */
908                                 goto out;
909                         } else {
910                                 old->timeout.expires = jiffies + 
911                                         related_to->helper->timeout * HZ;
912                                 add_timer(&old->timeout);
913                         }
914                 }
915
916                 WRITE_UNLOCK(&ip_conntrack_lock);
917                 /* This expectation is not inserted so no need to lock */
918                 kmem_cache_free(ip_conntrack_expect_cachep, expect);
919                 return -EEXIST;
920
921         } else if (related_to->helper->max_expected && 
922                    related_to->expecting >= related_to->helper->max_expected) {
923                 /* old == NULL */
924                 if (!(related_to->helper->flags & 
925                       IP_CT_HELPER_F_REUSE_EXPECT)) {
926                         WRITE_UNLOCK(&ip_conntrack_lock);
927                         if (net_ratelimit())
928                                 printk(KERN_WARNING
929                                        "ip_conntrack: max number of expected "
930                                        "connections %i of %s reached for "
931                                        "%u.%u.%u.%u->%u.%u.%u.%u\n",
932                                        related_to->helper->max_expected,
933                                        related_to->helper->name,
934                                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
935                                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
936                         kmem_cache_free(ip_conntrack_expect_cachep, expect);
937                         return -EPERM;
938                 }
939                 DEBUGP("ip_conntrack: max number of expected "
940                        "connections %i of %s reached for "
941                        "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
942                        related_to->helper->max_expected,
943                        related_to->helper->name,
944                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
945                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
946  
947                 /* choose the oldest expectation to evict */
948                 list_for_each_entry(old, &related_to->sibling_list, 
949                                                       expected_list)
950                         if (old->sibling == NULL)
951                                 break;
952
953                 /* We cannot fail since related_to->expecting is the number
954                  * of unconfirmed expectations */
955                 IP_NF_ASSERT(old && old->sibling == NULL);
956
957                 /* newnat14 does not reuse the real allocated memory
958                  * structures but rather unexpects the old and
959                  * allocates a new.  unexpect_related will decrement
960                  * related_to->expecting. 
961                  */
962                 unexpect_related(old);
963                 ret = -EPERM;
964         } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
965                              struct ip_conntrack_expect *, &expect->tuple, 
966                              &expect->mask)) {
967                 WRITE_UNLOCK(&ip_conntrack_lock);
968                 DEBUGP("expect_related: busy!\n");
969
970                 kmem_cache_free(ip_conntrack_expect_cachep, expect);
971                 return -EBUSY;
972         }
973
974 out:    ip_conntrack_expect_insert(expect, related_to);
975
976         WRITE_UNLOCK(&ip_conntrack_lock);
977
978         CONNTRACK_STAT_INC(expect_create);
979
980         return ret;
981 }
982
983 /* Change tuple in an existing expectation */
984 int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
985                                struct ip_conntrack_tuple *newtuple)
986 {
987         int ret;
988
989         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
990         WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
991         DEBUGP("change_expect:\n");
992         DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
993         DEBUGP("exp mask:  "); DUMP_TUPLE_RAW(&expect->mask);
994         DEBUGP("newtuple:  "); DUMP_TUPLE_RAW(newtuple);
995         if (expect->ct_tuple.dst.protonum == 0) {
996                 /* Never seen before */
997                 DEBUGP("change expect: never seen before\n");
998                 if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask)
999                     && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
1000                                  struct ip_conntrack_expect *, newtuple, &expect->mask)) {
1001                         /* Force NAT to find an unused tuple */
1002                         ret = -1;
1003                 } else {
1004                         memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
1005                         memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
1006                         ret = 0;
1007                 }
1008         } else {
1009                 /* Resent packet */
1010                 DEBUGP("change expect: resent packet\n");
1011                 if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
1012                         ret = 0;
1013                 } else {
1014                         /* Force NAT to choose again the same port */
1015                         ret = -1;
1016                 }
1017         }
1018         WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
1019         
1020         return ret;
1021 }
1022
1023 /* Alter reply tuple (maybe alter helper).  If it's already taken,
1024    return 0 and don't do alteration. */
1025 int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1026                              const struct ip_conntrack_tuple *newreply)
1027 {
1028         WRITE_LOCK(&ip_conntrack_lock);
1029         if (__ip_conntrack_find(newreply, conntrack)) {
1030                 WRITE_UNLOCK(&ip_conntrack_lock);
1031                 return 0;
1032         }
1033         /* Should be unconfirmed, so not in hash table yet */
1034         IP_NF_ASSERT(!is_confirmed(conntrack));
1035
1036         DEBUGP("Altering reply tuple of %p to ", conntrack);
1037         DUMP_TUPLE(newreply);
1038
1039         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1040         if (!conntrack->master && list_empty(&conntrack->sibling_list))
1041                 conntrack->helper = ip_ct_find_helper(newreply);
1042         WRITE_UNLOCK(&ip_conntrack_lock);
1043
1044         return 1;
1045 }
1046
1047 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1048 {
1049         WRITE_LOCK(&ip_conntrack_lock);
1050         list_prepend(&helpers, me);
1051         WRITE_UNLOCK(&ip_conntrack_lock);
1052
1053         return 0;
1054 }
1055
1056 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1057                          const struct ip_conntrack_helper *me)
1058 {
1059         if (i->ctrack->helper == me) {
1060                 /* Get rid of any expected. */
1061                 remove_expectations(i->ctrack, 0);
1062                 /* And *then* set helper to NULL */
1063                 i->ctrack->helper = NULL;
1064         }
1065         return 0;
1066 }
1067
1068 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1069 {
1070         unsigned int i;
1071
1072         /* Need write lock here, to delete helper. */
1073         WRITE_LOCK(&ip_conntrack_lock);
1074         LIST_DELETE(&helpers, me);
1075
1076         /* Get rid of expecteds, set helpers to NULL. */
1077         for (i = 0; i < ip_conntrack_htable_size; i++)
1078                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1079                             struct ip_conntrack_tuple_hash *, me);
1080         WRITE_UNLOCK(&ip_conntrack_lock);
1081
1082         /* Someone could be still looking at the helper in a bh. */
1083         synchronize_net();
1084 }
1085
1086 static inline void ct_add_counters(struct ip_conntrack *ct,
1087                                    enum ip_conntrack_info ctinfo,
1088                                    const struct sk_buff *skb)
1089 {
1090 #ifdef CONFIG_IP_NF_CT_ACCT
1091         if (skb) {
1092                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1093                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
1094                                         ntohs(skb->nh.iph->tot_len);
1095         }
1096 #endif
1097 }
1098
1099 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
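/* For an unconfirmed conntrack the timer has not been started yet, so the
 * relative timeout is simply stored in timeout.expires and picked up by
 * __ip_conntrack_confirm().  For a confirmed one the timer is re-armed under
 * ip_conntrack_lock; a failing del_timer() means the entry is already dying
 * and must not be resurrected. */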
1100 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
1101                         enum ip_conntrack_info ctinfo,
1102                         const struct sk_buff *skb,
1103                         unsigned long extra_jiffies)
1104 {
1105         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1106
1107         /* If not in hash table, timer will not be active yet */
1108         if (!is_confirmed(ct)) {
1109                 ct->timeout.expires = extra_jiffies;
1110                 ct_add_counters(ct, ctinfo, skb);
1111         } else {
1112                 WRITE_LOCK(&ip_conntrack_lock);
1113                 /* Need del_timer for race avoidance (may already be dying). */
1114                 if (del_timer(&ct->timeout)) {
1115                         ct->timeout.expires = jiffies + extra_jiffies;
1116                         add_timer(&ct->timeout);
1117                 }
1118                 ct_add_counters(ct, ctinfo, skb);
1119                 WRITE_UNLOCK(&ip_conntrack_lock);
1120         }
1121 }
1122
1123 /* Returns new sk_buff, or NULL */
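/* Defragment a packet on behalf of conntrack.  The skb is temporarily
 * orphaned around ip_defrag() and handed back to its socket afterwards, and
 * ip_send_check() recomputes the header checksum for the reassembled
 * datagram. */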
1124 struct sk_buff *
1125 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1126 {
1127         struct sock *sk = skb->sk;
1128 #ifdef CONFIG_NETFILTER_DEBUG
1129         unsigned int olddebug = skb->nf_debug;
1130 #endif
1131
1132         if (sk) {
1133                 sock_hold(sk);
1134                 skb_orphan(skb);
1135         }
1136
1137         local_bh_disable(); 
1138         skb = ip_defrag(skb, user);
1139         local_bh_enable();
1140
1141         if (!skb) {
1142                 if (sk)
1143                         sock_put(sk);
1144                 return skb;
1145         }
1146
1147         if (sk) {
1148                 skb_set_owner_w(skb, sk);
1149                 sock_put(sk);
1150         }
1151
1152         ip_send_check(skb->nh.iph);
1153         skb->nfcache |= NFC_ALTERED;
1154 #ifdef CONFIG_NETFILTER_DEBUG
1155         /* Packet path as if nothing had happened. */
1156         skb->nf_debug = olddebug;
1157 #endif
1158         return skb;
1159 }
1160
1161 /* Used by ipt_REJECT. */
1162 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1163 {
1164         struct ip_conntrack *ct;
1165         enum ip_conntrack_info ctinfo;
1166
1167         /* This ICMP is in reverse direction to the packet which caused it */
1168         ct = ip_conntrack_get(skb, &ctinfo);
1169         
1170         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1171                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1172         else
1173                 ctinfo = IP_CT_RELATED;
1174
1175         /* Attach to new skbuff, and increment count */
1176         nskb->nfct = &ct->ct_general;
1177         nskb->nfctinfo = ctinfo;
1178         nf_conntrack_get(nskb->nfct);
1179 }
1180
1181 static inline int
1182 do_kill(const struct ip_conntrack_tuple_hash *i,
1183         int (*kill)(const struct ip_conntrack *i, void *data),
1184         void *data)
1185 {
1186         return kill(i->ctrack, data);
1187 }
1188
1189 /* Bring out ya dead! */
1190 static struct ip_conntrack_tuple_hash *
1191 get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
1192                 void *data, unsigned int *bucket)
1193 {
1194         struct ip_conntrack_tuple_hash *h = NULL;
1195
1196         READ_LOCK(&ip_conntrack_lock);
1197         for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
1198                 h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
1199                               struct ip_conntrack_tuple_hash *, kill, data);
1200         }
1201         if (h)
1202                 atomic_inc(&h->ctrack->ct_general.use);
1203         READ_UNLOCK(&ip_conntrack_lock);
1204
1205         return h;
1206 }
1207
1208 void
1209 ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
1210                         void *data)
1211 {
1212         struct ip_conntrack_tuple_hash *h;
1213         unsigned int bucket = 0;
1214
1215         while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
1216                 /* Time to push up daisies... */
1217                 if (del_timer(&h->ctrack->timeout))
1218                         death_by_timeout((unsigned long)h->ctrack);
1219                 /* ... else the timer will get him soon. */
1220
1221                 ip_conntrack_put(h->ctrack);
1222         }
1223 }
1224
1225 /* Fast function for those who don't want to parse /proc (and I don't
1226    blame them). */
1227 /* Reversing the socket's dst/src point of view gives us the reply
1228    mapping. */
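/* A userspace proxy typically retrieves the pre-NAT destination of an
 * accepted connection with something like:
 *
 *     struct sockaddr_in dst;
 *     socklen_t len = sizeof(dst);
 *     getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len);
 *
 * (illustrative only; fd is assumed to be a connected TCP socket). */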
1229 static int
1230 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1231 {
1232         struct inet_opt *inet = inet_sk(sk);
1233         struct ip_conntrack_tuple_hash *h;
1234         struct ip_conntrack_tuple tuple;
1235         
1236         IP_CT_TUPLE_U_BLANK(&tuple);
1237         tuple.src.ip = inet->rcv_saddr;
1238         tuple.src.u.tcp.port = inet->sport;
1239         tuple.dst.ip = inet->daddr;
1240         tuple.dst.u.tcp.port = inet->dport;
1241         tuple.dst.protonum = IPPROTO_TCP;
1242
1243         /* We only do TCP at the moment: is there a better way? */
1244         if (strcmp(sk->sk_prot->name, "TCP")) {
1245                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1246                 return -ENOPROTOOPT;
1247         }
1248
1249         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1250                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1251                        *len, sizeof(struct sockaddr_in));
1252                 return -EINVAL;
1253         }
1254
1255         h = ip_conntrack_find_get(&tuple, NULL);
1256         if (h) {
1257                 struct sockaddr_in sin;
1258
1259                 sin.sin_family = AF_INET;
1260                 sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1261                         .tuple.dst.u.tcp.port;
1262                 sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1263                         .tuple.dst.ip;
1264
1265                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1266                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1267                 ip_conntrack_put(h->ctrack);
1268                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1269                         return -EFAULT;
1270                 else
1271                         return 0;
1272         }
1273         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1274                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1275                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1276         return -ENOENT;
1277 }
1278
1279 static struct nf_sockopt_ops so_getorigdst = {
1280         .pf             = PF_INET,
1281         .get_optmin     = SO_ORIGINAL_DST,
1282         .get_optmax     = SO_ORIGINAL_DST+1,
1283         .get            = &getorigdst,
1284 };
1285
1286 static int kill_all(const struct ip_conntrack *i, void *data)
1287 {
1288         return 1;
1289 }
1290
1291 /* Mishearing the voices in his head, our hero wonders how he's
1292    supposed to kill the mall. */
1293 void ip_conntrack_cleanup(void)
1294 {
1295         ip_ct_attach = NULL;
1296         /* This makes sure all current packets have passed through
1297            netfilter framework.  Roll on, two-stage module
1298            delete... */
1299         synchronize_net();
1300  
1301  i_see_dead_people:
1302         ip_ct_selective_cleanup(kill_all, NULL);
1303         if (atomic_read(&ip_conntrack_count) != 0) {
1304                 schedule();
1305                 goto i_see_dead_people;
1306         }
1307
1308         kmem_cache_destroy(ip_conntrack_cachep);
1309         kmem_cache_destroy(ip_conntrack_expect_cachep);
1310         vfree(ip_conntrack_hash);
1311         nf_unregister_sockopt(&so_getorigdst);
1312 }
1313
1314 static int hashsize;
1315 module_param(hashsize, int, 0400);
1316
1317 int __init ip_conntrack_init(void)
1318 {
1319         unsigned int i;
1320         int ret;
1321
1322         /* Idea from tcp.c: use 1/16384 of memory.  On i386 a 32MB
1323          * machine has 256 buckets; >= 1GB machines have 8192 buckets. */
1324         if (hashsize) {
1325                 ip_conntrack_htable_size = hashsize;
1326         } else {
1327                 ip_conntrack_htable_size
1328                         = (((num_physpages << PAGE_SHIFT) / 16384)
1329                            / sizeof(struct list_head));
1330                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1331                         ip_conntrack_htable_size = 8192;
1332                 if (ip_conntrack_htable_size < 16)
1333                         ip_conntrack_htable_size = 16;
1334         }
1335         ip_conntrack_max = 8 * ip_conntrack_htable_size;
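/* Worked example of the sizing above, assuming a two-pointer (8-byte)
 * struct list_head as on i386: a 32MB machine gives 32MB / 16384 = 2048
 * bytes of hash table, i.e. 256 buckets, and ip_conntrack_max becomes
 * 8 * 256 = 2048 tracked connections. */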
1336
1337         printk("ip_conntrack version %s (%u buckets, %d max)"
1338                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1339                ip_conntrack_htable_size, ip_conntrack_max,
1340                sizeof(struct ip_conntrack));
1341
1342         ret = nf_register_sockopt(&so_getorigdst);
1343         if (ret != 0) {
1344                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1345                 return ret;
1346         }
1347
1348         ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1349                                     * ip_conntrack_htable_size);
1350         if (!ip_conntrack_hash) {
1351                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1352                 goto err_unreg_sockopt;
1353         }
1354
1355         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1356                                                 sizeof(struct ip_conntrack), 0,
1357                                                 SLAB_HWCACHE_ALIGN, NULL, NULL);
1358         if (!ip_conntrack_cachep) {
1359                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1360                 goto err_free_hash;
1361         }
1362
1363         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1364                                         sizeof(struct ip_conntrack_expect),
1365                                         0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1366         if (!ip_conntrack_expect_cachep) {
1367                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1368                 goto err_free_conntrack_slab;
1369         }
1370
1371         /* Don't NEED lock here, but good form anyway. */
1372         WRITE_LOCK(&ip_conntrack_lock);
1373         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1374                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1375         /* Sew in builtin protocols. */
1376         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1377         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1378         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1379         WRITE_UNLOCK(&ip_conntrack_lock);
1380
1381         for (i = 0; i < ip_conntrack_htable_size; i++)
1382                 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1383
1384         /* For use by ipt_REJECT */
1385         ip_ct_attach = ip_conntrack_attach;
1386
1387         /* Set up fake conntrack:
1388             - to never be deleted, not in any hashes */
1389         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1390         /*  - and make it look like a confirmed connection */
1391         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1392
1393         return ret;
1394
1395 err_free_conntrack_slab:
1396         kmem_cache_destroy(ip_conntrack_cachep);
1397 err_free_hash:
1398         vfree(ip_conntrack_hash);
1399 err_unreg_sockopt:
1400         nf_unregister_sockopt(&so_getorigdst);
1401
1402         return -ENOMEM;
1403 }