/* net/ipv4/netfilter/ip_conntrack_core.c */
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *	- new API and handling of conntrack/nat helpers
 *	- now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *	- add usage/reference counts to ip_conntrack_expect
 *	- export ip_conntrack[_expect]_{find_get,put} functions
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>

/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, conntrack timers */
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION	"2.1"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DECLARE_RWLOCK(ip_conntrack_lock);
DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);

/* ip_conntrack_standalone needs this */
atomic_t ip_conntrack_count = ATOMIC_INIT(0);
EXPORT_SYMBOL(ip_conntrack_count);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
static kmem_cache_t *ip_conntrack_expect_cachep;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid;

DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);

inline void
ip_conntrack_put(struct ip_conntrack *ct)
{
	IP_NF_ASSERT(ct);
	nf_conntrack_put(&ct->ct_general);
}

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

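/* Hash a tuple into one of ip_conntrack_htable_size buckets.  jhash_3words()
 * mixes the source address, the destination address folded with the protocol
 * number, and both port/id fields packed into a single word.
 * ip_conntrack_hash_rnd is a random seed picked at first use, which keeps the
 * bucket layout unpredictable to an attacker trying to degenerate the chains
 * into one long list. */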
static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
#if 0
	dump_tuple(tuple);
#endif
	return (jhash_3words(tuple->src.ip,
			     (tuple->dst.ip ^ tuple->dst.protonum),
			     (tuple->src.u.all | (tuple->dst.u.all << 16)),
			     ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}

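/* Example: for a TCP segment from 192.168.0.2:1025 to 10.0.0.1:80, the
 * layer-3 code below yields tuple->src.ip = 192.168.0.2, tuple->dst.ip =
 * 10.0.0.1 and tuple->dst.protonum = IPPROTO_TCP; the protocol's
 * pkt_to_tuple() callback then fills in the layer-4 part, here
 * tuple->src.u.tcp.port = 1025 and tuple->dst.u.tcp.port = 80. */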
int
ip_ct_get_tuple(const struct iphdr *iph,
		const struct sk_buff *skb,
		unsigned int dataoff,
		struct ip_conntrack_tuple *tuple,
		const struct ip_conntrack_protocol *protocol)
{
	/* Should never happen */
	if (iph->frag_off & htons(IP_OFFSET)) {
		printk("ip_conntrack_core: Frag of proto %u.\n",
		       iph->protocol);
		return 0;
	}

	tuple->src.ip = iph->saddr;
	tuple->dst.ip = iph->daddr;
	tuple->dst.protonum = iph->protocol;
	tuple->src.u.all = tuple->dst.u.all = 0;

	return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

int
ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
		   const struct ip_conntrack_tuple *orig,
		   const struct ip_conntrack_protocol *protocol)
{
	inverse->src.ip = orig->dst.ip;
	inverse->dst.ip = orig->src.ip;
	inverse->dst.protonum = orig->dst.protonum;

	inverse->src.u.all = inverse->dst.u.all = 0;

	return protocol->invert_tuple(inverse, orig);
}

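/* An ip_conntrack_expect describes a connection that a helper predicts
 * will show up shortly, e.g. the FTP data connection announced by a PORT
 * command on the control connection.  It carries a tuple plus a mask, so
 * fields such as the source port may be left as wildcards, and it is
 * reference counted through exp->use (see destroy_expect() below). */
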
/* ip_conntrack_expect helper functions */

/* Compare tuple parts depending on mask. */
static inline int expect_cmp(const struct ip_conntrack_expect *i,
			     const struct ip_conntrack_tuple *tuple)
{
	MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
	return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
}

static void
destroy_expect(struct ip_conntrack_expect *exp)
{
	DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
	IP_NF_ASSERT(atomic_read(&exp->use) == 0);
	IP_NF_ASSERT(!timer_pending(&exp->timeout));

	kmem_cache_free(ip_conntrack_expect_cachep, exp);
	CONNTRACK_STAT_INC(expect_delete);
}

inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
	IP_NF_ASSERT(exp);

	if (atomic_dec_and_test(&exp->use)) {
		/* usage count dropped to zero */
		destroy_expect(exp);
	}
}

static inline struct ip_conntrack_expect *
__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
{
	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
	MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
	return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
			 struct ip_conntrack_expect *, tuple);
}

/* Find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *exp;

	READ_LOCK(&ip_conntrack_lock);
	READ_LOCK(&ip_conntrack_expect_tuple_lock);
	exp = __ip_ct_expect_find(tuple);
	if (exp)
		atomic_inc(&exp->use);
	READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
	READ_UNLOCK(&ip_conntrack_lock);

	return exp;
}

/* remove one specific expectation from all lists and drop refcount,
 * does _NOT_ delete the timer. */
static void __unexpect_related(struct ip_conntrack_expect *expect)
{
	DEBUGP("unexpect_related(%p)\n", expect);
	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

	/* we're not allowed to unexpect a confirmed expectation! */
	IP_NF_ASSERT(!expect->sibling);

	/* delete from global and local lists */
	list_del(&expect->list);
	list_del(&expect->expected_list);

	/* decrement expect-count of master conntrack */
	if (expect->expectant)
		expect->expectant->expecting--;

	ip_conntrack_expect_put(expect);
}

/* remove one specific expectation from all lists, drop refcount
 * and expire timer.
 * This function can _NOT_ be called for confirmed expects! */
static void unexpect_related(struct ip_conntrack_expect *expect)
{
	IP_NF_ASSERT(expect->expectant);
	IP_NF_ASSERT(expect->expectant->helper);
	/* if we are supposed to have a timer, but we can't delete
	 * it: race condition.  __unexpect_related will
	 * be called by the timeout function */
	if (expect->expectant->helper->timeout
	    && !del_timer(&expect->timeout))
		return;

	__unexpect_related(expect);
}

/* delete all unconfirmed expectations for this conntrack */
static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
{
	struct list_head *exp_entry, *next;
	struct ip_conntrack_expect *exp;

	DEBUGP("remove_expectations(%p)\n", ct);

	list_for_each_safe(exp_entry, next, &ct->sibling_list) {
		exp = list_entry(exp_entry, struct ip_conntrack_expect,
				 expected_list);

		/* we skip established expectations, as we want to delete
		 * the un-established ones only */
		if (exp->sibling) {
			DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
			if (drop_refcount) {
				/* Indicate that this expectation's parent is dead */
				ip_conntrack_put(exp->expectant);
				exp->expectant = NULL;
			}
			continue;
		}

		IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
		IP_NF_ASSERT(exp->expectant == ct);

		/* delete expectation from global and private lists */
		unexpect_related(exp);
	}
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
	unsigned int ho, hr;

	DEBUGP("clean_from_lists(%p)\n", ct);
	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
	LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

	/* Destroy all un-established, pending expectations */
	remove_expectations(ct, 1);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
	struct ip_conntrack_protocol *proto;

	DEBUGP("destroy_conntrack(%p)\n", ct);
	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
	IP_NF_ASSERT(!timer_pending(&ct->timeout));

	/* To make sure we don't get any weird locking issues here:
	 * destroy_conntrack() MUST NOT be called with a write lock
	 * to ip_conntrack_lock!!! -HW */
	proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
	if (proto && proto->destroy)
		proto->destroy(ct);

	if (ip_conntrack_destroyed)
		ip_conntrack_destroyed(ct);

	WRITE_LOCK(&ip_conntrack_lock);
	/* Make sure we don't leave any orphaned expectations lying around */
	if (ct->expecting)
		remove_expectations(ct, 1);

	/* Delete our master expectation */
	if (ct->master) {
		if (ct->master->expectant) {
			/* can't call __unexpect_related here,
			 * since it would screw up expect_list */
			list_del(&ct->master->expected_list);
			master = ct->master->expectant;
		}
		kmem_cache_free(ip_conntrack_expect_cachep, ct->master);
	}
	WRITE_UNLOCK(&ip_conntrack_lock);

	if (master)
		ip_conntrack_put(master);

	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
	kmem_cache_free(ip_conntrack_cachep, ct);
	atomic_dec(&ip_conntrack_count);
	CONNTRACK_STAT_INC(delete);
}

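/* Timer callback: the conntrack's timeout expired.  It also runs
 * synchronously when someone (early_drop(), the cleanup path) wins the
 * del_timer() race and calls it by hand.  It unlinks the entry from the
 * hash and drops the table's reference; destroy_conntrack() then runs
 * from the final ip_conntrack_put() once all skb references are gone. */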
static void death_by_timeout(unsigned long ul_conntrack)
{
	struct ip_conntrack *ct = (void *)ul_conntrack;

	CONNTRACK_STAT_INC(delete_list);

	WRITE_LOCK(&ip_conntrack_lock);
	clean_from_lists(ct);
	WRITE_UNLOCK(&ip_conntrack_lock);
	ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
		    const struct ip_conntrack_tuple *tuple,
		    const struct ip_conntrack *ignored_conntrack)
{
	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
	return i->ctrack != ignored_conntrack
		&& ip_ct_tuple_equal(tuple, &i->tuple);
}

static struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
		    const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;
	unsigned int hash = hash_conntrack(tuple);
	/* use per_cpu() to avoid multiple calls to smp_processor_id() */
	unsigned int cpu = smp_processor_id();

	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
	list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
			per_cpu(ip_conntrack_stat, cpu).found++;
			return h;
		}
		per_cpu(ip_conntrack_stat, cpu).searched++;
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
		      const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;

	READ_LOCK(&ip_conntrack_lock);
	h = __ip_conntrack_find(tuple, ignored_conntrack);
	if (h)
		atomic_inc(&h->ctrack->ct_general.use);
	READ_UNLOCK(&ip_conntrack_lock);

	return h;
}

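/* A new conntrack is only attached to the skb in ip_conntrack_in(); it
 * enters the hash table here, from the last hook the packet traverses
 * (LOCAL_IN or POST_ROUTING).  By then NAT has rewritten the reply tuple,
 * and connections whose first packet was dropped by the filter never
 * show up in the table at all. */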
/* Confirm a connection given skb; places it in hash table */
int
__ip_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int hash, repl_hash;
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;

	ct = ip_conntrack_get(skb, &ctinfo);

	/* ipt_REJECT uses ip_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* We're not in hash table, and we refuse to set up related
	   connections for unconfirmed conns.  But packet copies and
	   REJECT will give spurious warnings here. */
	/* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
	   confirmed us. */
	IP_NF_ASSERT(!is_confirmed(ct));
	DEBUGP("Confirming conntrack %p\n", ct);

	WRITE_LOCK(&ip_conntrack_lock);
	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	if (!LIST_FIND(&ip_conntrack_hash[hash],
		       conntrack_tuple_cmp,
		       struct ip_conntrack_tuple_hash *,
		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
	    && !LIST_FIND(&ip_conntrack_hash[repl_hash],
			  conntrack_tuple_cmp,
			  struct ip_conntrack_tuple_hash *,
			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
		list_prepend(&ip_conntrack_hash[hash],
			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
		list_prepend(&ip_conntrack_hash[repl_hash],
			     &ct->tuplehash[IP_CT_DIR_REPLY]);
		/* Timer relative to confirmation time, not original
		   setting time, otherwise we'd get timer wrap in
		   weird delay cases. */
		ct->timeout.expires += jiffies;
		add_timer(&ct->timeout);
		atomic_inc(&ct->ct_general.use);
		set_bit(IPS_CONFIRMED_BIT, &ct->status);
		WRITE_UNLOCK(&ip_conntrack_lock);
		CONNTRACK_STAT_INC(insert);
		return NF_ACCEPT;
	}

	WRITE_UNLOCK(&ip_conntrack_lock);
	CONNTRACK_STAT_INC(insert_failed);
	return NF_DROP;
}

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
			 const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;

	READ_LOCK(&ip_conntrack_lock);
	h = __ip_conntrack_find(tuple, ignored_conntrack);
	READ_UNLOCK(&ip_conntrack_lock);

	return h != NULL;
}

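/* Table-full handling: when init_conntrack() finds ip_conntrack_max
 * reached, it calls early_drop() on the chain the new tuple hashes to,
 * evicting the last (roughly the oldest) entry that never reached the
 * ASSURED state by firing its timeout early.  ASSURED connections are
 * never evicted this way. */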
/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
	return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
}

static int early_drop(struct list_head *chain)
{
	/* Traverse backwards: gives us oldest, which is roughly LRU */
	struct ip_conntrack_tuple_hash *h;
	int dropped = 0;

	READ_LOCK(&ip_conntrack_lock);
	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
	if (h)
		atomic_inc(&h->ctrack->ct_general.use);
	READ_UNLOCK(&ip_conntrack_lock);

	if (!h)
		return dropped;

	if (del_timer(&h->ctrack->timeout)) {
		death_by_timeout((unsigned long)h->ctrack);
		dropped = 1;
		CONNTRACK_STAT_INC(early_drop);
	}
	ip_conntrack_put(h->ctrack);
	return dropped;
}

static inline int helper_cmp(const struct ip_conntrack_helper *i,
			     const struct ip_conntrack_tuple *rtuple)
{
	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
{
	return LIST_FIND(&helpers, helper_cmp,
			 struct ip_conntrack_helper *,
			 tuple);
}

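/* Allocation path: a fresh conntrack starts with a use count of 1 (the
 * skb's reference) and no running timer; the timer is armed only at
 * confirmation time.  If the tuple matches a pending expectation whose
 * master is already confirmed, the connection is marked IPS_EXPECTED and
 * takes a reference on its master conntrack. */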
/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
	       struct ip_conntrack_protocol *protocol,
	       struct sk_buff *skb)
{
	struct ip_conntrack *conntrack;
	struct ip_conntrack_tuple repl_tuple;
	size_t hash;
	struct ip_conntrack_expect *expected;

	if (!ip_conntrack_hash_rnd_initted) {
		get_random_bytes(&ip_conntrack_hash_rnd, 4);
		ip_conntrack_hash_rnd_initted = 1;
	}

	hash = hash_conntrack(tuple);

	if (ip_conntrack_max
	    && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
		/* Try dropping from this hash chain. */
		if (!early_drop(&ip_conntrack_hash[hash])) {
			if (net_ratelimit())
				printk(KERN_WARNING
				       "ip_conntrack: table full, dropping"
				       " packet.\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
		DEBUGP("Can't invert tuple.\n");
		return NULL;
	}

	conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
	if (!conntrack) {
		DEBUGP("Can't allocate conntrack.\n");
		return ERR_PTR(-ENOMEM);
	}

	memset(conntrack, 0, sizeof(*conntrack));
	atomic_set(&conntrack->ct_general.use, 1);
	conntrack->ct_general.destroy = destroy_conntrack;
	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
	conntrack->xid[IP_CT_DIR_ORIGINAL] = -1;
	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
	conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
	conntrack->xid[IP_CT_DIR_REPLY] = -1;

#warning MEF removed initialization of conntrack->infos structure, as this structure no longer exists in 2.6.9-1.11_FC.
#if 0
	for (i = 0; i < IP_CT_NUMBER; i++)
		conntrack->infos[i].master = &conntrack->ct_general;
#endif

	if (!protocol->new(conntrack, skb)) {
		kmem_cache_free(ip_conntrack_cachep, conntrack);
		return NULL;
	}
	/* Don't set timer yet: wait for confirmation */
	init_timer(&conntrack->timeout);
	conntrack->timeout.data = (unsigned long)conntrack;
	conntrack->timeout.function = death_by_timeout;

	INIT_LIST_HEAD(&conntrack->sibling_list);

	WRITE_LOCK(&ip_conntrack_lock);
	/* Need finding and deleting of expected ONLY if we win race */
	READ_LOCK(&ip_conntrack_expect_tuple_lock);
	expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
			     struct ip_conntrack_expect *, tuple);
	READ_UNLOCK(&ip_conntrack_expect_tuple_lock);

	if (expected) {
		/* If master is not in hash table yet (ie. packet hasn't left
		   this machine yet), how can other end know about expected?
		   Hence these are not the droids you are looking for (if
		   master ct never got confirmed, we'd hold a reference to it
		   and weird things would happen to future packets). */
		if (!is_confirmed(expected->expectant)) {
			conntrack->helper = ip_ct_find_helper(&repl_tuple);
			goto end;
		}

		/* Expectation is dying... */
		if (expected->expectant->helper->timeout
		    && !del_timer(&expected->timeout))
			goto end;

		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
			conntrack, expected);
		/* Welcome, Mr. Bond.  We've been expecting you... */
		IP_NF_ASSERT(expected->expectant);
		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
		conntrack->master = expected;
		expected->sibling = conntrack;
		LIST_DELETE(&ip_conntrack_expect_list, expected);
		expected->expectant->expecting--;
		nf_conntrack_get(&master_ct(conntrack)->ct_general);

		/* this is a braindead... --pablo */
		atomic_inc(&ip_conntrack_count);
		WRITE_UNLOCK(&ip_conntrack_lock);

		if (expected->expectfn)
			expected->expectfn(conntrack);

		CONNTRACK_STAT_INC(expect_new);

		goto ret;
	} else {
		conntrack->helper = ip_ct_find_helper(&repl_tuple);

		CONNTRACK_STAT_INC(new);
	}

end:	atomic_inc(&ip_conntrack_count);
	WRITE_UNLOCK(&ip_conntrack_lock);

ret:	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

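/* ctinfo encodes state and direction in one value: IP_CT_NEW, IP_CT_RELATED
 * or IP_CT_ESTABLISHED for packets in the original direction, with
 * IP_CT_IS_REPLY added on top for packets flowing the other way.
 * CTINFO2DIR() recovers the direction from it. */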
/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
		  struct ip_conntrack_protocol *proto,
		  int *set_reply,
		  unsigned int hooknum,
		  enum ip_conntrack_info *ctinfo)
{
	struct ip_conntrack_tuple tuple;
	struct ip_conntrack_tuple_hash *h;

	IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

	if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
			     &tuple, proto))
		return NULL;

	/* look for tuple match */
	h = ip_conntrack_find_get(&tuple, NULL);
	if (!h) {
		h = init_conntrack(&tuple, proto, skb);
		if (!h)
			return NULL;
		if (IS_ERR(h))
			return (void *)h;
	}

	/* It exists; we have (non-exclusive) reference. */
	if (DIRECTION(h) == IP_CT_DIR_REPLY) {
		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
		/* Please set reply bit if this packet OK */
		*set_reply = 1;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
			DEBUGP("ip_conntrack_in: normal packet for %p\n",
			       h->ctrack);
			*ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
			DEBUGP("ip_conntrack_in: related packet for %p\n",
			       h->ctrack);
			*ctinfo = IP_CT_RELATED;
		} else {
			DEBUGP("ip_conntrack_in: new packet for %p\n",
			       h->ctrack);
			*ctinfo = IP_CT_NEW;
		}
		*set_reply = 0;
	}
	skb->nfct = &h->ctrack->ct_general;
	skb->nfctinfo = *ctinfo;
	return h->ctrack;
}

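/* This function is registered as a netfilter hook by
 * ip_conntrack_standalone.c, early (NF_IP_PRI_CONNTRACK) on PRE_ROUTING
 * and LOCAL_OUT.  Roughly (a sketch only; the hook name below is
 * illustrative and the real nf_hook_ops tables live in the standalone
 * module):
 */
#if 0
static struct nf_hook_ops ip_conntrack_in_ops = {
	.hook		= ip_conntrack_in,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum	= NF_IP_PRE_ROUTING,
	.priority	= NF_IP_PRI_CONNTRACK,
};
/* ... nf_register_hook(&ip_conntrack_in_ops) at module init ... */
#endif
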
/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
			     struct sk_buff **pskb,
			     const struct net_device *in,
			     const struct net_device *out,
			     int (*okfn)(struct sk_buff *))
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;
	struct ip_conntrack_protocol *proto;
	int set_reply;
	int ret;

	/* Previously seen (loopback or untracked)?  Ignore. */
	if ((*pskb)->nfct) {
		CONNTRACK_STAT_INC(ignore);
		return NF_ACCEPT;
	}

	/* Should never happen */
	if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
		if (net_ratelimit()) {
			printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
			       (*pskb)->nh.iph->protocol, hooknum);
		}
		return NF_DROP;
	}

	/* FIXME: Do this right please. --RR */
	(*pskb)->nfcache |= NFC_UNKNOWN;

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
	/* Ignore broadcast: no `connection'. */
	if ((*pskb)->pkt_type == PACKET_BROADCAST) {
		printk("Broadcast packet!\n");
		return NF_ACCEPT;
	} else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
		   == htonl(0x000000FF)) {
		printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
		       NIPQUAD((*pskb)->nh.iph->saddr),
		       NIPQUAD((*pskb)->nh.iph->daddr),
		       (*pskb)->sk, (*pskb)->pkt_type);
	}
#endif

	proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);

	/* It may be a special packet: error, unclean...
	 * the inverse of the return code tells the netfilter
	 * core what to do with the packet. */
	if (proto->error != NULL
	    && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
		CONNTRACK_STAT_INC(error);
		CONNTRACK_STAT_INC(invalid);
		return -ret;
	}

	if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
		/* Not valid part of a connection */
		CONNTRACK_STAT_INC(invalid);
		return NF_ACCEPT;
	}

	if (IS_ERR(ct)) {
		/* Too stressed to deal. */
		CONNTRACK_STAT_INC(drop);
		return NF_DROP;
	}

	IP_NF_ASSERT((*pskb)->nfct);

	ret = proto->packet(ct, *pskb, ctinfo);
	if (ret < 0) {
		/* Invalid: the inverse of the return code tells
		 * the netfilter core what to do */
		nf_conntrack_put((*pskb)->nfct);
		(*pskb)->nfct = NULL;
		CONNTRACK_STAT_INC(invalid);
		return -ret;
	}

	if (ret != NF_DROP && ct->helper) {
		ret = ct->helper->help(*pskb, ct, ctinfo);
		if (ret == -1) {
			/* Invalid */
			CONNTRACK_STAT_INC(invalid);
			nf_conntrack_put((*pskb)->nfct);
			(*pskb)->nfct = NULL;
			return NF_ACCEPT;
		}
	}
	if (set_reply)
		set_bit(IPS_SEEN_REPLY_BIT, &ct->status);

	return ret;
}

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
		   const struct ip_conntrack_tuple *orig)
{
	return ip_ct_invert_tuple(inverse, orig,
				  ip_ct_find_proto(orig->dst.protonum));
}

static inline int resent_expect(const struct ip_conntrack_expect *i,
				const struct ip_conntrack_tuple *tuple,
				const struct ip_conntrack_tuple *mask)
{
	DEBUGP("resent_expect\n");
	DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
	DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
	DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
	return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
		 || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
		&& ip_ct_tuple_equal(&i->mask, mask));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *i,
			       const struct ip_conntrack_tuple *tuple,
			       const struct ip_conntrack_tuple *mask)
{
	/* Part covered by intersection of masks must be unequal,
	   otherwise they clash */
	struct ip_conntrack_tuple intersect_mask
		= { { i->mask.src.ip & mask->src.ip,
		      { i->mask.src.u.all & mask->src.u.all } },
		    { i->mask.dst.ip & mask->dst.ip,
		      { i->mask.dst.u.all & mask->dst.u.all },
		      i->mask.dst.protonum & mask->dst.protonum } };

	return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
}

inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
{
	WRITE_LOCK(&ip_conntrack_lock);
	unexpect_related(expect);
	WRITE_UNLOCK(&ip_conntrack_lock);
}

static void expectation_timed_out(unsigned long ul_expect)
{
	struct ip_conntrack_expect *expect = (void *) ul_expect;

	DEBUGP("expectation %p timed out\n", expect);
	WRITE_LOCK(&ip_conntrack_lock);
	__unexpect_related(expect);
	WRITE_UNLOCK(&ip_conntrack_lock);
}

struct ip_conntrack_expect *
ip_conntrack_expect_alloc(void)
{
	struct ip_conntrack_expect *new;

	new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
	if (!new) {
		DEBUGP("expect_related: OOM allocating expect\n");
		return NULL;
	}

	/* tuple_cmp compares the whole union, so we have to initialize it cleanly */
	memset(new, 0, sizeof(struct ip_conntrack_expect));
	atomic_set(&new->use, 1);

	return new;
}

static void
ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
			   struct ip_conntrack *related_to)
{
	DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
	new->expectant = related_to;
	new->sibling = NULL;

	/* add to expected list for this connection */
	list_add_tail(&new->expected_list, &related_to->sibling_list);
	/* add to global list of expectations */
	list_prepend(&ip_conntrack_expect_list, &new->list);
	/* add and start timer if required */
	if (related_to->helper->timeout) {
		init_timer(&new->timeout);
		new->timeout.data = (unsigned long)new;
		new->timeout.function = expectation_timed_out;
		new->timeout.expires = jiffies +
					related_to->helper->timeout * HZ;
		add_timer(&new->timeout);
	}
	related_to->expecting++;
}

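/* Conntrack helpers drive the two functions above roughly like this
 * (a hypothetical sketch; the real callers are helper modules such as
 * ip_conntrack_ftp, and the tuple/mask values come from parsing the
 * payload):
 */
#if 0
	struct ip_conntrack_expect *exp = ip_conntrack_expect_alloc();
	if (!exp)
		return NF_DROP;
	exp->tuple = tuple_from_payload;	/* e.g. the FTP PORT argument */
	exp->mask = mask_with_wildcards;	/* e.g. source port left zero */
	exp->expectfn = NULL;
	ip_conntrack_expect_related(exp, ct);
#endif
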
/* Add a related connection. */
int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
				struct ip_conntrack *related_to)
{
	struct ip_conntrack_expect *old;
	int ret = 0;

	WRITE_LOCK(&ip_conntrack_lock);
	/* Because of the write lock, no reader can walk the lists,
	 * so there is no need to use the tuple lock too */

	DEBUGP("ip_conntrack_expect_related %p\n", related_to);
	DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
	DEBUGP("mask:  "); DUMP_TUPLE_RAW(&expect->mask);

	old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
			struct ip_conntrack_expect *, &expect->tuple,
			&expect->mask);
	if (old) {
		/* Helper private data may contain offsets but no pointers
		   pointing into the payload - otherwise we would have to copy
		   the data filled out by the helper over the old one */
		DEBUGP("expect_related: resent packet\n");
		if (related_to->helper->timeout) {
			if (!del_timer(&old->timeout)) {
				/* expectation is dying. Fall through */
				goto out;
			} else {
				old->timeout.expires = jiffies +
					related_to->helper->timeout * HZ;
				add_timer(&old->timeout);
			}
		}

		WRITE_UNLOCK(&ip_conntrack_lock);
		/* This expectation is not inserted so no need to lock */
		kmem_cache_free(ip_conntrack_expect_cachep, expect);
		return -EEXIST;

	} else if (related_to->helper->max_expected &&
		   related_to->expecting >= related_to->helper->max_expected) {
		/* old == NULL */
		if (!(related_to->helper->flags &
		      IP_CT_HELPER_F_REUSE_EXPECT)) {
			WRITE_UNLOCK(&ip_conntrack_lock);
			if (net_ratelimit())
				printk(KERN_WARNING
				       "ip_conntrack: max number of expected "
				       "connections %i of %s reached for "
				       "%u.%u.%u.%u->%u.%u.%u.%u\n",
				       related_to->helper->max_expected,
				       related_to->helper->name,
				       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
				       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
			kmem_cache_free(ip_conntrack_expect_cachep, expect);
			return -EPERM;
		}
		DEBUGP("ip_conntrack: max number of expected "
		       "connections %i of %s reached for "
		       "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
		       related_to->helper->max_expected,
		       related_to->helper->name,
		       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
		       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));

		/* choose the oldest expectation to evict */
		list_for_each_entry(old, &related_to->sibling_list,
				    expected_list)
			if (old->sibling == NULL)
				break;

		/* We cannot fail since related_to->expecting is the number
		 * of unconfirmed expectations */
		IP_NF_ASSERT(old && old->sibling == NULL);

		/* newnat14 does not reuse the real allocated memory
		 * structures but rather unexpects the old and
		 * allocates a new.  unexpect_related will decrement
		 * related_to->expecting.
		 */
		unexpect_related(old);
		ret = -EPERM;
	} else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
			     struct ip_conntrack_expect *, &expect->tuple,
			     &expect->mask)) {
		WRITE_UNLOCK(&ip_conntrack_lock);
		DEBUGP("expect_related: busy!\n");

		kmem_cache_free(ip_conntrack_expect_cachep, expect);
		return -EBUSY;
	}

out:	ip_conntrack_expect_insert(expect, related_to);

	WRITE_UNLOCK(&ip_conntrack_lock);

	CONNTRACK_STAT_INC(expect_create);

	return ret;
}

/* Change tuple in an existing expectation */
int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
			       struct ip_conntrack_tuple *newtuple)
{
	int ret;

	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
	WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
	DEBUGP("change_expect:\n");
	DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
	DEBUGP("exp mask:  "); DUMP_TUPLE_RAW(&expect->mask);
	DEBUGP("newtuple:  "); DUMP_TUPLE_RAW(newtuple);
	if (expect->ct_tuple.dst.protonum == 0) {
		/* Never seen before */
		DEBUGP("change expect: never seen before\n");
		if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask)
		    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
				 struct ip_conntrack_expect *, newtuple, &expect->mask)) {
			/* Force NAT to find an unused tuple */
			ret = -1;
		} else {
			memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
			memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
			ret = 0;
		}
	} else {
		/* Resent packet */
		DEBUGP("change expect: resent packet\n");
		if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
			ret = 0;
		} else {
			/* Force NAT to choose again the same port */
			ret = -1;
		}
	}
	WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);

	return ret;
}

/* Alter reply tuple (maybe alter helper).  If it's already taken,
   return 0 and don't do alteration. */
int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
			     const struct ip_conntrack_tuple *newreply)
{
	WRITE_LOCK(&ip_conntrack_lock);
	if (__ip_conntrack_find(newreply, conntrack)) {
		WRITE_UNLOCK(&ip_conntrack_lock);
		return 0;
	}
	/* Should be unconfirmed, so not in hash table yet */
	IP_NF_ASSERT(!is_confirmed(conntrack));

	DEBUGP("Altering reply tuple of %p to ", conntrack);
	DUMP_TUPLE(newreply);

	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (!conntrack->master && list_empty(&conntrack->sibling_list))
		conntrack->helper = ip_ct_find_helper(newreply);
	WRITE_UNLOCK(&ip_conntrack_lock);

	return 1;
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
	WRITE_LOCK(&ip_conntrack_lock);
	list_prepend(&helpers, me);
	WRITE_UNLOCK(&ip_conntrack_lock);

	return 0;
}

static inline int unhelp(struct ip_conntrack_tuple_hash *i,
			 const struct ip_conntrack_helper *me)
{
	if (i->ctrack->helper == me) {
		/* Get rid of any expected. */
		remove_expectations(i->ctrack, 0);
		/* And *then* set helper to NULL */
		i->ctrack->helper = NULL;
	}
	return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
	unsigned int i;

	/* Need write lock here, to delete helper. */
	WRITE_LOCK(&ip_conntrack_lock);
	LIST_DELETE(&helpers, me);

	/* Get rid of expecteds, set helpers to NULL. */
	for (i = 0; i < ip_conntrack_htable_size; i++)
		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
			    struct ip_conntrack_tuple_hash *, me);
	WRITE_UNLOCK(&ip_conntrack_lock);

	/* Someone could still be looking at the helper in a bh. */
	synchronize_net();
}

static inline void ct_add_counters(struct ip_conntrack *ct,
				   enum ip_conntrack_info ctinfo,
				   const struct sk_buff *skb)
{
#ifdef CONFIG_IP_NF_CT_ACCT
	if (skb) {
		ct->counters[CTINFO2DIR(ctinfo)].packets++;
		ct->counters[CTINFO2DIR(ctinfo)].bytes +=
					ntohs(skb->nh.iph->tot_len);
	}
#endif
}

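/* Timeout policy: an unconfirmed conntrack has no timer yet, so the
 * desired interval is just parked in timeout.expires and added to jiffies
 * by __ip_conntrack_confirm().  For confirmed entries the timer is
 * restarted under the lock; if del_timer() fails the entry is already
 * dying and must not be resurrected. */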
/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
void ip_ct_refresh_acct(struct ip_conntrack *ct,
			enum ip_conntrack_info ctinfo,
			const struct sk_buff *skb,
			unsigned long extra_jiffies)
{
	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

	/* If not in hash table, timer will not be active yet */
	if (!is_confirmed(ct)) {
		ct->timeout.expires = extra_jiffies;
		ct_add_counters(ct, ctinfo, skb);
	} else {
		WRITE_LOCK(&ip_conntrack_lock);
		/* Need del_timer for race avoidance (may already be dying). */
		if (del_timer(&ct->timeout)) {
			ct->timeout.expires = jiffies + extra_jiffies;
			add_timer(&ct->timeout);
		}
		ct_add_counters(ct, ctinfo, skb);
		WRITE_UNLOCK(&ip_conntrack_lock);
	}
}

int ip_ct_no_defrag;

/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
#ifdef CONFIG_NETFILTER_DEBUG
	unsigned int olddebug = skb->nf_debug;
#endif

	if (unlikely(ip_ct_no_defrag)) {
		kfree_skb(skb);
		return NULL;
	}

	if (sk) {
		sock_hold(sk);
		skb_orphan(skb);
	}

	local_bh_disable();
	skb = ip_defrag(skb);
	local_bh_enable();

	if (!skb) {
		if (sk)
			sock_put(sk);
		return skb;
	}

	if (sk) {
		skb_set_owner_w(skb, sk);
		sock_put(sk);
	}

	ip_send_check(skb->nh.iph);
	skb->nfcache |= NFC_ALTERED;
#ifdef CONFIG_NETFILTER_DEBUG
	/* Packet path as if nothing had happened. */
	skb->nf_debug = olddebug;
#endif
	return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = ip_conntrack_get(skb, &ctinfo);

	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nskb->nfct = &ct->ct_general;
	nskb->nfctinfo = ctinfo;
	nf_conntrack_get(nskb->nfct);
}

static inline int
do_kill(const struct ip_conntrack_tuple_hash *i,
	int (*kill)(const struct ip_conntrack *i, void *data),
	void *data)
{
	return kill(i->ctrack, data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
		void *data, unsigned int *bucket)
{
	struct ip_conntrack_tuple_hash *h = NULL;

	READ_LOCK(&ip_conntrack_lock);
	for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
		h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
			      struct ip_conntrack_tuple_hash *, kill, data);
	}
	if (h)
		atomic_inc(&h->ctrack->ct_general.use);
	READ_UNLOCK(&ip_conntrack_lock);

	return h;
}

void
ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
			void *data)
{
	struct ip_conntrack_tuple_hash *h;
	unsigned int bucket = 0;

	while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
		/* Time to push up daisies... */
		if (del_timer(&h->ctrack->timeout))
			death_by_timeout((unsigned long)h->ctrack);
		/* ... else the timer will get him soon. */

		ip_conntrack_put(h->ctrack);
	}
}

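/* SO_ORIGINAL_DST is how transparent proxies recover the pre-NAT
 * destination of a REDIRECTed TCP connection.  A userspace sketch
 * (error handling omitted; connect_upstream() is a hypothetical helper):
 */
#if 0
	struct sockaddr_in dst;
	socklen_t len = sizeof(dst);

	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
		connect_upstream(dst.sin_addr.s_addr, dst.sin_port);
#endif
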
/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
	struct inet_opt *inet = inet_sk(sk);
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack_tuple tuple;

	IP_CT_TUPLE_U_BLANK(&tuple);
	tuple.src.ip = inet->rcv_saddr;
	tuple.src.u.tcp.port = inet->sport;
	tuple.dst.ip = inet->daddr;
	tuple.dst.u.tcp.port = inet->dport;
	tuple.dst.protonum = IPPROTO_TCP;

	/* We only do TCP at the moment: is there a better way? */
	if (strcmp(sk->sk_prot->name, "TCP")) {
		DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
		return -ENOPROTOOPT;
	}

	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
		DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
		       *len, sizeof(struct sockaddr_in));
		return -EINVAL;
	}

	h = ip_conntrack_find_get(&tuple, NULL);
	if (h) {
		struct sockaddr_in sin;

		sin.sin_family = AF_INET;
		sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
			.tuple.dst.u.tcp.port;
		sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
			.tuple.dst.ip;

		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
		ip_conntrack_put(h->ctrack);
		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
			return -EFAULT;
		else
			return 0;
	}
	DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
	       NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
	       NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
	return -ENOENT;
}

static struct nf_sockopt_ops so_getorigdst = {
	.pf		= PF_INET,
	.get_optmin	= SO_ORIGINAL_DST,
	.get_optmax	= SO_ORIGINAL_DST+1,
	.get		= &getorigdst,
};

static int kill_all(const struct ip_conntrack *i, void *data)
{
	return 1;
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
	ip_ct_attach = NULL;
	/* This makes sure all current packets have passed through
	   netfilter framework.  Roll on, two-stage module
	   delete... */
	synchronize_net();

 i_see_dead_people:
	ip_ct_selective_cleanup(kill_all, NULL);
	if (atomic_read(&ip_conntrack_count) != 0) {
		schedule();
		goto i_see_dead_people;
	}

	kmem_cache_destroy(ip_conntrack_cachep);
	kmem_cache_destroy(ip_conntrack_expect_cachep);
	vfree(ip_conntrack_hash);
	nf_unregister_sockopt(&so_getorigdst);
}

static int hashsize;
module_param(hashsize, int, 0400);

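/* Worked example of the sizing logic below, on i386 (4096-byte pages)
 * with 128MB of RAM: num_physpages << PAGE_SHIFT is 128MB, dividing by
 * 16384 gives 8192 bytes of hash table, and dividing by
 * sizeof(struct list_head) (8 bytes) yields 1024 buckets, hence
 * ip_conntrack_max = 8 * 1024 = 8192 tracked connections. */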
int __init ip_conntrack_init(void)
{
	unsigned int i;
	int ret;

	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
	 * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
	if (hashsize) {
		ip_conntrack_htable_size = hashsize;
	} else {
		ip_conntrack_htable_size
			= (((num_physpages << PAGE_SHIFT) / 16384)
			   / sizeof(struct list_head));
		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
			ip_conntrack_htable_size = 8192;
		if (ip_conntrack_htable_size < 16)
			ip_conntrack_htable_size = 16;
	}
	ip_conntrack_max = 8 * ip_conntrack_htable_size;

	printk("ip_conntrack version %s (%u buckets, %d max)"
	       " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
	       ip_conntrack_htable_size, ip_conntrack_max,
	       sizeof(struct ip_conntrack));

	ret = nf_register_sockopt(&so_getorigdst);
	if (ret != 0) {
		printk(KERN_ERR "Unable to register netfilter socket option\n");
		return ret;
	}

	ip_conntrack_hash = vmalloc(sizeof(struct list_head)
				    * ip_conntrack_htable_size);
	if (!ip_conntrack_hash) {
		printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
		goto err_unreg_sockopt;
	}

	ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
						sizeof(struct ip_conntrack), 0,
						SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!ip_conntrack_cachep) {
		printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
		goto err_free_hash;
	}

	ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
					sizeof(struct ip_conntrack_expect),
					0, SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!ip_conntrack_expect_cachep) {
		printk(KERN_ERR "Unable to create ip_expect slab cache\n");
		goto err_free_conntrack_slab;
	}

	/* Don't NEED lock here, but good form anyway. */
	WRITE_LOCK(&ip_conntrack_lock);
	for (i = 0; i < MAX_IP_CT_PROTO; i++)
		ip_ct_protos[i] = &ip_conntrack_generic_protocol;
	/* Sew in builtin protocols. */
	ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
	ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
	ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
	WRITE_UNLOCK(&ip_conntrack_lock);

	for (i = 0; i < ip_conntrack_htable_size; i++)
		INIT_LIST_HEAD(&ip_conntrack_hash[i]);

	/* For use by ipt_REJECT */
	ip_ct_attach = ip_conntrack_attach;

	/* Set up fake conntrack:
	    - to never be deleted, not in any hashes */
	atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
	/*  - and make it look like a confirmed connection */
	set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);

	return ret;

err_free_conntrack_slab:
	kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
	vfree(ip_conntrack_hash);
err_unreg_sockopt:
	nf_unregister_sockopt(&so_getorigdst);

	return -ENOMEM;
}