1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40
41 /* This rwlock protects the main hash table, protocol/helper/expected
42    registrations, conntrack timers */
43 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
44 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
45
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
50 #include <linux/netfilter_ipv4/listhelp.h>
51
52 #define IP_CONNTRACK_VERSION    "2.1"
53
54 #if 0
55 #define DEBUGP printk
56 #else
57 #define DEBUGP(format, args...)
58 #endif
59
60 DECLARE_RWLOCK(ip_conntrack_lock);
61 DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65 EXPORT_SYMBOL(ip_conntrack_count);
66
67 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
68 LIST_HEAD(ip_conntrack_expect_list);
69 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
70 static LIST_HEAD(helpers);
71 unsigned int ip_conntrack_htable_size = 0;
72 int ip_conntrack_max;
73 struct list_head *ip_conntrack_hash;
74 static kmem_cache_t *ip_conntrack_cachep;
75 static kmem_cache_t *ip_conntrack_expect_cachep;
76 struct ip_conntrack ip_conntrack_untracked;
77 unsigned int ip_ct_log_invalid;
78
79 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
80
81 inline void 
82 ip_conntrack_put(struct ip_conntrack *ct)
83 {
84         IP_NF_ASSERT(ct);
85         nf_conntrack_put(&ct->ct_general);
86 }
87
88 static int ip_conntrack_hash_rnd_initted;
89 static unsigned int ip_conntrack_hash_rnd;
90
91 static u_int32_t
92 hash_conntrack(const struct ip_conntrack_tuple *tuple)
93 {
94 #if 0
95         dump_tuple(tuple);
96 #endif
97         return (jhash_3words(tuple->src.ip,
98                              (tuple->dst.ip ^ tuple->dst.protonum),
99                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
100                              ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
101 }
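
/*
 * Usage sketch (this is what __ip_conntrack_find() below does): compute the
 * bucket with hash_conntrack() and walk that chain under ip_conntrack_lock.
 * The random seed ip_conntrack_hash_rnd keeps bucket placement unpredictable
 * to remote hosts:
 *
 *	unsigned int bucket = hash_conntrack(&tuple);
 *	... list_for_each_entry(h, &ip_conntrack_hash[bucket], list) ...
 */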
102
103 int
104 ip_ct_get_tuple(const struct iphdr *iph,
105                 const struct sk_buff *skb,
106                 unsigned int dataoff,
107                 struct ip_conntrack_tuple *tuple,
108                 const struct ip_conntrack_protocol *protocol)
109 {
110         /* Should never happen: fragments are reassembled before conntrack sees them */
111         if (iph->frag_off & htons(IP_OFFSET)) {
112                 printk("ip_conntrack_core: Frag of proto %u.\n",
113                        iph->protocol);
114                 return 0;
115         }
116
117         tuple->src.ip = iph->saddr;
118         tuple->dst.ip = iph->daddr;
119         tuple->dst.protonum = iph->protocol;
120
121         return protocol->pkt_to_tuple(skb, dataoff, tuple);
122 }
123
124 int
125 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
126                    const struct ip_conntrack_tuple *orig,
127                    const struct ip_conntrack_protocol *protocol)
128 {
129         inverse->src.ip = orig->dst.ip;
130         inverse->dst.ip = orig->src.ip;
131         inverse->dst.protonum = orig->dst.protonum;
132
133         return protocol->invert_tuple(inverse, orig);
134 }
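
/*
 * Illustrative example (addresses and ports are made up): for an original
 * TCP tuple 192.168.1.2:1025 -> 10.0.0.1:80, the inverted (reply) tuple is
 * 10.0.0.1:80 -> 192.168.1.2:1025.  This function swaps the IP addresses;
 * the protocol's invert_tuple() callback swaps the port numbers.
 */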
135
136
137 /* ip_conntrack_expect helper functions */
138
139 /* Compare tuple parts depending on mask. */
140 static inline int expect_cmp(const struct ip_conntrack_expect *i,
141                              const struct ip_conntrack_tuple *tuple)
142 {
143         MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
144         return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
145 }
146
147 static void
148 destroy_expect(struct ip_conntrack_expect *exp)
149 {
150         DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
151         IP_NF_ASSERT(atomic_read(&exp->use) == 0);
152         IP_NF_ASSERT(!timer_pending(&exp->timeout));
153
154         kmem_cache_free(ip_conntrack_expect_cachep, exp);
155         CONNTRACK_STAT_INC(expect_delete);
156 }
157
158 inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
159 {
160         IP_NF_ASSERT(exp);
161
162         if (atomic_dec_and_test(&exp->use)) {
163                 /* usage count dropped to zero */
164                 destroy_expect(exp);
165         }
166 }
167
168 static inline struct ip_conntrack_expect *
169 __ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
170 {
171         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
172         MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
173         return LIST_FIND(&ip_conntrack_expect_list, expect_cmp, 
174                          struct ip_conntrack_expect *, tuple);
175 }
176
177 /* Find an expectation corresponding to a tuple. */
178 struct ip_conntrack_expect *
179 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
180 {
181         struct ip_conntrack_expect *exp;
182
183         READ_LOCK(&ip_conntrack_lock);
184         READ_LOCK(&ip_conntrack_expect_tuple_lock);
185         exp = __ip_ct_expect_find(tuple);
186         if (exp)
187                 atomic_inc(&exp->use);
188         READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
189         READ_UNLOCK(&ip_conntrack_lock);
190
191         return exp;
192 }
193
194 /* remove one specific expectation from all lists and drop refcount,
195  * does _NOT_ delete the timer. */
196 static void __unexpect_related(struct ip_conntrack_expect *expect)
197 {
198         DEBUGP("unexpect_related(%p)\n", expect);
199         MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
200
201         /* we're not allowed to unexpect a confirmed expectation! */
202         IP_NF_ASSERT(!expect->sibling);
203
204         /* delete from global and local lists */
205         list_del(&expect->list);
206         list_del(&expect->expected_list);
207
208         /* decrement expect-count of master conntrack */
209         if (expect->expectant)
210                 expect->expectant->expecting--;
211
212         ip_conntrack_expect_put(expect);
213 }
214
215 /* remove one specific expectation from all lists, drop refcount
216  * and expire timer.
217  * This function can _NOT_ be called for confirmed expects! */
218 static void unexpect_related(struct ip_conntrack_expect *expect)
219 {
220         IP_NF_ASSERT(expect->expectant);
221         IP_NF_ASSERT(expect->expectant->helper);
222         /* if we are supposed to have a timer, but we can't delete
223          * it: race condition.  __unexpect_related will
224          * be called by the timeout function */
225         if (expect->expectant->helper->timeout
226             && !del_timer(&expect->timeout))
227                 return;
228
229         __unexpect_related(expect);
230 }
231
232 /* delete all unconfirmed expectations for this conntrack */
233 static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
234 {
235         struct list_head *exp_entry, *next;
236         struct ip_conntrack_expect *exp;
237
238         DEBUGP("remove_expectations(%p)\n", ct);
239
240         list_for_each_safe(exp_entry, next, &ct->sibling_list) {
241                 exp = list_entry(exp_entry, struct ip_conntrack_expect,
242                                  expected_list);
243
244                 /* we skip established expectations, as we want to delete
245                  * the un-established ones only */
246                 if (exp->sibling) {
247                         DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
248                         if (drop_refcount) {
249                                 /* Indicate that this expectation's parent is dead */
250                                 ip_conntrack_put(exp->expectant);
251                                 exp->expectant = NULL;
252                         }
253                         continue;
254                 }
255
256                 IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
257                 IP_NF_ASSERT(exp->expectant == ct);
258
259                 /* delete expectation from global and private lists */
260                 unexpect_related(exp);
261         }
262 }
263
264 static void
265 clean_from_lists(struct ip_conntrack *ct)
266 {
267         unsigned int ho, hr;
268         
269         DEBUGP("clean_from_lists(%p)\n", ct);
270         MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
271
272         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
273         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
274         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
275         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
276
277         /* Destroy all un-established, pending expectations */
278         remove_expectations(ct, 1);
279 }
280
281 static void
282 destroy_conntrack(struct nf_conntrack *nfct)
283 {
284         struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
285         struct ip_conntrack_protocol *proto;
286
287         DEBUGP("destroy_conntrack(%p)\n", ct);
288         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
289         IP_NF_ASSERT(!timer_pending(&ct->timeout));
290
291         /* To make sure we don't get any weird locking issues here:
292          * destroy_conntrack() MUST NOT be called with a write lock
293          * to ip_conntrack_lock!!! -HW */
294         proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
295         if (proto && proto->destroy)
296                 proto->destroy(ct);
297
298         if (ip_conntrack_destroyed)
299                 ip_conntrack_destroyed(ct);
300
301         WRITE_LOCK(&ip_conntrack_lock);
302         /* Make sure we don't leave any orphaned expectations lying around */
303         if (ct->expecting)
304                 remove_expectations(ct, 1);
305
306         /* Delete our master expectation */
307         if (ct->master) {
308                 if (ct->master->expectant) {
309                         /* can't call __unexpect_related here,
310                          * since it would screw up expect_list */
311                         list_del(&ct->master->expected_list);
312                         master = ct->master->expectant;
313                 }
314                 kmem_cache_free(ip_conntrack_expect_cachep, ct->master);
315         }
316         WRITE_UNLOCK(&ip_conntrack_lock);
317
318         if (master)
319                 ip_conntrack_put(master);
320
321         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
322         kmem_cache_free(ip_conntrack_cachep, ct);
323         atomic_dec(&ip_conntrack_count);
324         CONNTRACK_STAT_INC(delete);
325 }
326
327 static void death_by_timeout(unsigned long ul_conntrack)
328 {
329         struct ip_conntrack *ct = (void *)ul_conntrack;
330
331         CONNTRACK_STAT_INC(delete_list);
332
333         WRITE_LOCK(&ip_conntrack_lock);
334         clean_from_lists(ct);
335         WRITE_UNLOCK(&ip_conntrack_lock);
336         ip_conntrack_put(ct);
337 }
338
339 static inline int
340 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
341                     const struct ip_conntrack_tuple *tuple,
342                     const struct ip_conntrack *ignored_conntrack)
343 {
344         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
345         return i->ctrack != ignored_conntrack
346                 && ip_ct_tuple_equal(tuple, &i->tuple);
347 }
348
349 static struct ip_conntrack_tuple_hash *
350 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
351                     const struct ip_conntrack *ignored_conntrack)
352 {
353         struct ip_conntrack_tuple_hash *h;
354         unsigned int hash = hash_conntrack(tuple);
355         /* use per_cpu() to avoid multiple calls to smp_processor_id() */
356         unsigned int cpu = smp_processor_id();
357
358         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
359         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
360                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
361                         per_cpu(ip_conntrack_stat, cpu).found++;
362                         return h;
363                 }
364                 per_cpu(ip_conntrack_stat, cpu).searched++;
365         }
366
367         return NULL;
368 }
369
370 /* Find a connection corresponding to a tuple. */
371 struct ip_conntrack_tuple_hash *
372 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
373                       const struct ip_conntrack *ignored_conntrack)
374 {
375         struct ip_conntrack_tuple_hash *h;
376
377         READ_LOCK(&ip_conntrack_lock);
378         h = __ip_conntrack_find(tuple, ignored_conntrack);
379         if (h)
380                 atomic_inc(&h->ctrack->ct_general.use);
381         READ_UNLOCK(&ip_conntrack_lock);
382
383         return h;
384 }
385
386 /* Confirm a connection given skb; places it in hash table */
387 int
388 __ip_conntrack_confirm(struct sk_buff *skb)
389 {
390         unsigned int hash, repl_hash;
391         struct ip_conntrack *ct;
392         enum ip_conntrack_info ctinfo;
393
394         ct = ip_conntrack_get(skb, &ctinfo);
395
396         /* ipt_REJECT uses ip_conntrack_attach to attach related
397            ICMP/TCP RST packets in the other direction.  The actual
398            packet which created the connection will be IP_CT_NEW or,
399            for an expected connection, IP_CT_RELATED. */
400         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
401                 return NF_ACCEPT;
402
403         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
404         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
405
406         /* We're not in hash table, and we refuse to set up related
407            connections for unconfirmed conns.  But packet copies and
408            REJECT will give spurious warnings here. */
409         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
410
411         /* No external references means no one else could have
412            confirmed us. */
413         IP_NF_ASSERT(!is_confirmed(ct));
414         DEBUGP("Confirming conntrack %p\n", ct);
415
416         WRITE_LOCK(&ip_conntrack_lock);
417         /* See if there's one in the list already, including reverse:
418            NAT could have grabbed it without realizing, since we're
419            not in the hash.  If there is, we lost the race. */
420         if (!LIST_FIND(&ip_conntrack_hash[hash],
421                        conntrack_tuple_cmp,
422                        struct ip_conntrack_tuple_hash *,
423                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
424             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
425                           conntrack_tuple_cmp,
426                           struct ip_conntrack_tuple_hash *,
427                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
428                 list_prepend(&ip_conntrack_hash[hash],
429                              &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
430                 list_prepend(&ip_conntrack_hash[repl_hash],
431                              &ct->tuplehash[IP_CT_DIR_REPLY]);
432                 /* Timer relative to confirmation time, not original
433                    setting time, otherwise we'd get timer wrap in
434                    weird delay cases. */
435                 ct->timeout.expires += jiffies;
436                 add_timer(&ct->timeout);
437                 atomic_inc(&ct->ct_general.use);
438                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
439                 WRITE_UNLOCK(&ip_conntrack_lock);
440                 CONNTRACK_STAT_INC(insert);
441                 return NF_ACCEPT;
442         }
443
444         WRITE_UNLOCK(&ip_conntrack_lock);
445         CONNTRACK_STAT_INC(insert_failed);
446         return NF_DROP;
447 }
448
449 /* Returns true if a connection corresponds to the tuple (required
450    for NAT). */
451 int
452 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
453                          const struct ip_conntrack *ignored_conntrack)
454 {
455         struct ip_conntrack_tuple_hash *h;
456
457         READ_LOCK(&ip_conntrack_lock);
458         h = __ip_conntrack_find(tuple, ignored_conntrack);
459         READ_UNLOCK(&ip_conntrack_lock);
460
461         return h != NULL;
462 }
463
464 /* There's a small race here where we may free a just-assured
465    connection.  Too bad: we're in trouble anyway. */
466 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
467 {
468         return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
469 }
470
471 static int early_drop(struct list_head *chain)
472 {
473         /* Traverse backwards: gives us oldest, which is roughly LRU */
474         struct ip_conntrack_tuple_hash *h;
475         int dropped = 0;
476
477         READ_LOCK(&ip_conntrack_lock);
478         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
479         if (h)
480                 atomic_inc(&h->ctrack->ct_general.use);
481         READ_UNLOCK(&ip_conntrack_lock);
482
483         if (!h)
484                 return dropped;
485
486         if (del_timer(&h->ctrack->timeout)) {
487                 death_by_timeout((unsigned long)h->ctrack);
488                 dropped = 1;
489                 CONNTRACK_STAT_INC(early_drop);
490         }
491         ip_conntrack_put(h->ctrack);
492         return dropped;
493 }
494
495 static inline int helper_cmp(const struct ip_conntrack_helper *i,
496                              const struct ip_conntrack_tuple *rtuple)
497 {
498         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
499 }
500
501 struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
502 {
503         return LIST_FIND(&helpers, helper_cmp,
504                          struct ip_conntrack_helper *,
505                          tuple);
506 }
507
508 /* Allocate a new conntrack: we return -ENOMEM if classification
509    failed due to stress.  Otherwise it really is unclassifiable. */
510 static struct ip_conntrack_tuple_hash *
511 init_conntrack(const struct ip_conntrack_tuple *tuple,
512                struct ip_conntrack_protocol *protocol,
513                struct sk_buff *skb)
514 {
515         struct ip_conntrack *conntrack;
516         struct ip_conntrack_tuple repl_tuple;
517         size_t hash;
518         struct ip_conntrack_expect *expected;
519
520         if (!ip_conntrack_hash_rnd_initted) {
521                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
522                 ip_conntrack_hash_rnd_initted = 1;
523         }
524
525         hash = hash_conntrack(tuple);
526
527         if (ip_conntrack_max
528             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
529                 /* Try dropping from this hash chain. */
530                 if (!early_drop(&ip_conntrack_hash[hash])) {
531                         if (net_ratelimit())
532                                 printk(KERN_WARNING
533                                        "ip_conntrack: table full, dropping"
534                                        " packet.\n");
535                         return ERR_PTR(-ENOMEM);
536                 }
537         }
538
539         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
540                 DEBUGP("Can't invert tuple.\n");
541                 return NULL;
542         }
543
544         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
545         if (!conntrack) {
546                 DEBUGP("Can't allocate conntrack.\n");
547                 return ERR_PTR(-ENOMEM);
548         }
549
550         memset(conntrack, 0, sizeof(*conntrack));
551         atomic_set(&conntrack->ct_general.use, 1);
552         conntrack->ct_general.destroy = destroy_conntrack;
553         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
554         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
555         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
556         conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
557         if (!protocol->new(conntrack, skb)) {
558                 kmem_cache_free(ip_conntrack_cachep, conntrack);
559                 return NULL;
560         }
561         /* Don't set timer yet: wait for confirmation */
562         init_timer(&conntrack->timeout);
563         conntrack->timeout.data = (unsigned long)conntrack;
564         conntrack->timeout.function = death_by_timeout;
565
566         INIT_LIST_HEAD(&conntrack->sibling_list);
567
568         WRITE_LOCK(&ip_conntrack_lock);
569         /* We only need to find and delete the expectation if we win the race */
570         READ_LOCK(&ip_conntrack_expect_tuple_lock);
571         expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
572                              struct ip_conntrack_expect *, tuple);
573         READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
574
575         if (expected) {
576                 /* If master is not in hash table yet (ie. packet hasn't left
577                    this machine yet), how can other end know about expected?
578                    Hence these are not the droids you are looking for (if
579                    master ct never got confirmed, we'd hold a reference to it
580                    and weird things would happen to future packets). */
581                 if (!is_confirmed(expected->expectant)) {
582                         conntrack->helper = ip_ct_find_helper(&repl_tuple);
583                         goto end;
584                 }
585
586                 /* Expectation is dying... */
587                 if (expected->expectant->helper->timeout
588                     && !del_timer(&expected->timeout))
589                         goto end;       
590
591                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
592                         conntrack, expected);
593                 /* Welcome, Mr. Bond.  We've been expecting you... */
594                 IP_NF_ASSERT(expected->expectant);
595                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
596                 conntrack->master = expected;
597                 expected->sibling = conntrack;
598                 LIST_DELETE(&ip_conntrack_expect_list, expected);
599                 expected->expectant->expecting--;
600                 nf_conntrack_get(&master_ct(conntrack)->ct_general);
601
602                 /* this is a braindead... --pablo */
603                 atomic_inc(&ip_conntrack_count);
604                 WRITE_UNLOCK(&ip_conntrack_lock);
605
606                 if (expected->expectfn)
607                         expected->expectfn(conntrack);
608         
609                 CONNTRACK_STAT_INC(expect_new);
610
611                 goto ret;
612         } else  {
613                 conntrack->helper = ip_ct_find_helper(&repl_tuple);
614
615                 CONNTRACK_STAT_INC(new);
616         }
617
618 end:    atomic_inc(&ip_conntrack_count);
619         WRITE_UNLOCK(&ip_conntrack_lock);
620
621 ret:    return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
622 }
623
624 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
625 static inline struct ip_conntrack *
626 resolve_normal_ct(struct sk_buff *skb,
627                   struct ip_conntrack_protocol *proto,
628                   int *set_reply,
629                   unsigned int hooknum,
630                   enum ip_conntrack_info *ctinfo)
631 {
632         struct ip_conntrack_tuple tuple;
633         struct ip_conntrack_tuple_hash *h;
634
635         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
636
637         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
638                                 &tuple,proto))
639                 return NULL;
640
641         /* look for tuple match */
642         h = ip_conntrack_find_get(&tuple, NULL);
643         if (!h) {
644                 h = init_conntrack(&tuple, proto, skb);
645                 if (!h)
646                         return NULL;
647                 if (IS_ERR(h))
648                         return (void *)h;
649         }
650
651         /* It exists; we have (non-exclusive) reference. */
652         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
653                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
654                 /* Tell ip_conntrack_in to set the reply bit if this packet is OK */
655                 *set_reply = 1;
656         } else {
657                 /* Once we've had two way comms, always ESTABLISHED. */
658                 if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
659                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
660                                h->ctrack);
661                         *ctinfo = IP_CT_ESTABLISHED;
662                 } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
663                         DEBUGP("ip_conntrack_in: related packet for %p\n",
664                                h->ctrack);
665                         *ctinfo = IP_CT_RELATED;
666                 } else {
667                         DEBUGP("ip_conntrack_in: new packet for %p\n",
668                                h->ctrack);
669                         *ctinfo = IP_CT_NEW;
670                 }
671                 *set_reply = 0;
672         }
673         skb->nfct = &h->ctrack->ct_general;
674         skb->nfctinfo = *ctinfo;
675         return h->ctrack;
676 }
677
678 /* Netfilter hook itself. */
679 unsigned int ip_conntrack_in(unsigned int hooknum,
680                              struct sk_buff **pskb,
681                              const struct net_device *in,
682                              const struct net_device *out,
683                              int (*okfn)(struct sk_buff *))
684 {
685         struct ip_conntrack *ct;
686         enum ip_conntrack_info ctinfo;
687         struct ip_conntrack_protocol *proto;
688         int set_reply;
689         int ret;
690
691         /* Previously seen (loopback or untracked)?  Ignore. */
692         if ((*pskb)->nfct) {
693                 CONNTRACK_STAT_INC(ignore);
694                 return NF_ACCEPT;
695         }
696
697         /* Should never happen: fragments are reassembled before conntrack */
698         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
699                 if (net_ratelimit()) {
700                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
701                                (*pskb)->nh.iph->protocol, hooknum);
702                 }
703                 return NF_DROP;
704         }
705
706         /* FIXME: Do this right please. --RR */
707         (*pskb)->nfcache |= NFC_UNKNOWN;
708
709 /* Doesn't cover locally-generated broadcast, so not worth it. */
710 #if 0
711         /* Ignore broadcast: no `connection'. */
712         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
713                 printk("Broadcast packet!\n");
714                 return NF_ACCEPT;
715         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
716                    == htonl(0x000000FF)) {
717                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
718                        NIPQUAD((*pskb)->nh.iph->saddr),
719                        NIPQUAD((*pskb)->nh.iph->daddr),
720                        (*pskb)->sk, (*pskb)->pkt_type);
721         }
722 #endif
723
724         proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
725
726         /* It may be a special packet, error, unclean...
727          * the inverse of the return code tells the netfilter
728          * core what to do with the packet. */
729         if (proto->error != NULL 
730             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
731                 CONNTRACK_STAT_INC(error);
732                 CONNTRACK_STAT_INC(invalid);
733                 return -ret;
734         }
735
736         if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
737                 /* Not valid part of a connection */
738                 CONNTRACK_STAT_INC(invalid);
739                 return NF_ACCEPT;
740         }
741
742         if (IS_ERR(ct)) {
743                 /* Too stressed to deal. */
744                 CONNTRACK_STAT_INC(drop);
745                 return NF_DROP;
746         }
747
748         IP_NF_ASSERT((*pskb)->nfct);
749
750         ret = proto->packet(ct, *pskb, ctinfo);
751         if (ret < 0) {
752                 /* Invalid: inverse of the return code tells
753                  * the netfilter core what to do*/
754                 nf_conntrack_put((*pskb)->nfct);
755                 (*pskb)->nfct = NULL;
756                 CONNTRACK_STAT_INC(invalid);
757                 return -ret;
758         }
759
760         if (ret != NF_DROP && ct->helper) {
761                 ret = ct->helper->help(*pskb, ct, ctinfo);
762                 if (ret == -1) {
763                         /* Invalid */
764                         CONNTRACK_STAT_INC(invalid);
765                         nf_conntrack_put((*pskb)->nfct);
766                         (*pskb)->nfct = NULL;
767                         return NF_ACCEPT;
768                 }
769         }
770         if (set_reply)
771                 set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
772
773         return ret;
774 }
775
776 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
777                    const struct ip_conntrack_tuple *orig)
778 {
779         return ip_ct_invert_tuple(inverse, orig, 
780                                   ip_ct_find_proto(orig->dst.protonum));
781 }
782
783 static inline int resent_expect(const struct ip_conntrack_expect *i,
784                                 const struct ip_conntrack_tuple *tuple,
785                                 const struct ip_conntrack_tuple *mask)
786 {
787         DEBUGP("resent_expect\n");
788         DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
789         DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
790         DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
791         return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
792                  || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
793                 && ip_ct_tuple_equal(&i->mask, mask));
794 }
795
796 /* Would two expected things clash? */
797 static inline int expect_clash(const struct ip_conntrack_expect *i,
798                                const struct ip_conntrack_tuple *tuple,
799                                const struct ip_conntrack_tuple *mask)
800 {
801         /* Part covered by intersection of masks must be unequal,
802            otherwise they clash */
803         struct ip_conntrack_tuple intersect_mask
804                 = { { i->mask.src.ip & mask->src.ip,
805                       { i->mask.src.u.all & mask->src.u.all } },
806                     { i->mask.dst.ip & mask->dst.ip,
807                       { i->mask.dst.u.all & mask->dst.u.all },
808                       i->mask.dst.protonum & mask->dst.protonum } };
809
810         return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
811 }
812
813 inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
814 {
815         WRITE_LOCK(&ip_conntrack_lock);
816         unexpect_related(expect);
817         WRITE_UNLOCK(&ip_conntrack_lock);
818 }
819         
820 static void expectation_timed_out(unsigned long ul_expect)
821 {
822         struct ip_conntrack_expect *expect = (void *) ul_expect;
823
824         DEBUGP("expectation %p timed out\n", expect);   
825         WRITE_LOCK(&ip_conntrack_lock);
826         __unexpect_related(expect);
827         WRITE_UNLOCK(&ip_conntrack_lock);
828 }
829
830 struct ip_conntrack_expect *
831 ip_conntrack_expect_alloc(void)
832 {
833         struct ip_conntrack_expect *new;
834
835         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
836         if (!new) {
837                 DEBUGP("expect_related: OOM allocating expect\n");
838                 return NULL;
839         }
840
841         /* tuple_cmp compares the whole union, so we have to initialize it cleanly */
842         memset(new, 0, sizeof(struct ip_conntrack_expect));
843         atomic_set(&new->use, 1);
844
845         return new;
846 }
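
/*
 * Rough usage sketch, as a conntrack helper (e.g. the FTP helper) would do
 * it; the tuple/mask values are placeholders, not taken from a real helper:
 *
 *	exp = ip_conntrack_expect_alloc();
 *	if (exp == NULL)
 *		return 0;
 *	exp->tuple = <tuple of the expected data connection>;
 *	exp->mask  = <which fields of that tuple must match>;
 *	exp->expectfn = NULL;
 *	ip_conntrack_expect_related(exp, ct);
 *
 * ip_conntrack_expect_related() below takes ownership of (or frees) the
 * expectation, so the caller does not free it on failure.
 */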
847
848 static void
849 ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
850                            struct ip_conntrack *related_to)
851 {
852         DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
853         new->expectant = related_to;
854         new->sibling = NULL;
855
856         /* add to expected list for this connection */
857         list_add_tail(&new->expected_list, &related_to->sibling_list);
858         /* add to global list of expectations */
859         list_prepend(&ip_conntrack_expect_list, &new->list);
860         /* add and start timer if required */
861         if (related_to->helper->timeout) {
862                 init_timer(&new->timeout);
863                 new->timeout.data = (unsigned long)new;
864                 new->timeout.function = expectation_timed_out;
865                 new->timeout.expires = jiffies +
866                                         related_to->helper->timeout * HZ;
867                 add_timer(&new->timeout);
868         }
869         related_to->expecting++;
870 }
871
872 /* Add a related connection. */
873 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
874                                 struct ip_conntrack *related_to)
875 {
876         struct ip_conntrack_expect *old;
877         int ret = 0;
878
879         WRITE_LOCK(&ip_conntrack_lock);
880         /* Because of the write lock, no reader can walk the lists,
881          * so there is no need to use the tuple lock too */
882
883         DEBUGP("ip_conntrack_expect_related %p\n", related_to);
884         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
885         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
886
887         old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
888                         struct ip_conntrack_expect *, &expect->tuple, 
889                         &expect->mask);
890         if (old) {
891                 /* Helper private data may contain offsets but no pointers
892                    pointing into the payload - otherwise we would have to copy
893                    the data filled out by the helper over the old one */
894                 DEBUGP("expect_related: resent packet\n");
895                 if (related_to->helper->timeout) {
896                         if (!del_timer(&old->timeout)) {
897                                 /* expectation is dying. Fall through */
898                                 goto out;
899                         } else {
900                                 old->timeout.expires = jiffies + 
901                                         related_to->helper->timeout * HZ;
902                                 add_timer(&old->timeout);
903                         }
904                 }
905
906                 WRITE_UNLOCK(&ip_conntrack_lock);
907                 /* This expectation is not inserted so no need to lock */
908                 kmem_cache_free(ip_conntrack_expect_cachep, expect);
909                 return -EEXIST;
910
911         } else if (related_to->helper->max_expected && 
912                    related_to->expecting >= related_to->helper->max_expected) {
913                 /* old == NULL */
914                 if (!(related_to->helper->flags & 
915                       IP_CT_HELPER_F_REUSE_EXPECT)) {
916                         WRITE_UNLOCK(&ip_conntrack_lock);
917                         if (net_ratelimit())
918                                 printk(KERN_WARNING
919                                        "ip_conntrack: max number of expected "
920                                        "connections %i of %s reached for "
921                                        "%u.%u.%u.%u->%u.%u.%u.%u\n",
922                                        related_to->helper->max_expected,
923                                        related_to->helper->name,
924                                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
925                                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
926                         kmem_cache_free(ip_conntrack_expect_cachep, expect);
927                         return -EPERM;
928                 }
929                 DEBUGP("ip_conntrack: max number of expected "
930                        "connections %i of %s reached for "
931                        "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
932                        related_to->helper->max_expected,
933                        related_to->helper->name,
934                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
935                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
936  
937                 /* choose the oldest expectation to evict */
938                 list_for_each_entry(old, &related_to->sibling_list, 
939                                                       expected_list)
940                         if (old->sibling == NULL)
941                                 break;
942
943                 /* We cannot fail since related_to->expecting is the number
944                  * of unconfirmed expectations */
945                 IP_NF_ASSERT(old && old->sibling == NULL);
946
947                 /* newnat14 does not reuse the real allocated memory
948                  * structures but rather unexpects the old one and
949                  * allocates a new one.  unexpect_related will decrement
950                  * related_to->expecting.
951                  */
952                 unexpect_related(old);
953                 ret = -EPERM;
954         } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
955                              struct ip_conntrack_expect *, &expect->tuple, 
956                              &expect->mask)) {
957                 WRITE_UNLOCK(&ip_conntrack_lock);
958                 DEBUGP("expect_related: busy!\n");
959
960                 kmem_cache_free(ip_conntrack_expect_cachep, expect);
961                 return -EBUSY;
962         }
963
964 out:    ip_conntrack_expect_insert(expect, related_to);
965
966         WRITE_UNLOCK(&ip_conntrack_lock);
967
968         CONNTRACK_STAT_INC(expect_create);
969
970         return ret;
971 }
972
973 /* Change tuple in an existing expectation */
974 int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
975                                struct ip_conntrack_tuple *newtuple)
976 {
977         int ret;
978
979         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
980         WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
981
982         DEBUGP("change_expect:\n");
983         DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
984         DEBUGP("exp mask:  "); DUMP_TUPLE(&expect->mask);
985         DEBUGP("newtuple:  "); DUMP_TUPLE(newtuple);
986         if (expect->ct_tuple.dst.protonum == 0) {
987                 /* Never seen before */
988                 DEBUGP("change expect: never seen before\n");
989                 if (!ip_ct_tuple_equal(&expect->tuple, newtuple) 
990                     && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
991                                  struct ip_conntrack_expect *, newtuple, &expect->mask)) {
992                         /* Force NAT to find an unused tuple */
993                         ret = -1;
994                 } else {
995                         memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
996                         memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
997                         ret = 0;
998                 }
999         } else {
1000                 /* Resent packet */
1001                 DEBUGP("change expect: resent packet\n");
1002                 if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
1003                         ret = 0;
1004                 } else {
1005                         /* Force NAT to choose the same port again */
1006                         ret = -1;
1007                 }
1008         }
1009         WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
1010         
1011         return ret;
1012 }
1013
1014 /* Alter reply tuple (maybe alter helper).  If it's already taken,
1015    return 0 and don't do alteration. */
1016 int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1017                              const struct ip_conntrack_tuple *newreply)
1018 {
1019         WRITE_LOCK(&ip_conntrack_lock);
1020         if (__ip_conntrack_find(newreply, conntrack)) {
1021                 WRITE_UNLOCK(&ip_conntrack_lock);
1022                 return 0;
1023         }
1024         /* Should be unconfirmed, so not in hash table yet */
1025         IP_NF_ASSERT(!is_confirmed(conntrack));
1026
1027         DEBUGP("Altering reply tuple of %p to ", conntrack);
1028         DUMP_TUPLE(newreply);
1029
1030         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1031         if (!conntrack->master && list_empty(&conntrack->sibling_list))
1032                 conntrack->helper = ip_ct_find_helper(newreply);
1033         WRITE_UNLOCK(&ip_conntrack_lock);
1034
1035         return 1;
1036 }
1037
1038 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1039 {
1040         WRITE_LOCK(&ip_conntrack_lock);
1041         list_prepend(&helpers, me);
1042         WRITE_UNLOCK(&ip_conntrack_lock);
1043
1044         return 0;
1045 }
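
/*
 * Sketch of how a helper module registers itself; MY_PORT and my_help_fn are
 * placeholders.  The tuple/mask are matched against the *reply* tuple (see
 * ip_ct_find_helper() above), so a TCP service helper keys on the server's
 * source port:
 *
 *	static struct ip_conntrack_helper my_helper;
 *
 *	my_helper.name = "example";
 *	my_helper.tuple.src.u.tcp.port = htons(MY_PORT);
 *	my_helper.tuple.dst.protonum = IPPROTO_TCP;
 *	my_helper.mask.src.u.tcp.port = 0xFFFF;
 *	my_helper.mask.dst.protonum = 0xFF;
 *	my_helper.help = my_help_fn;
 *	ip_conntrack_helper_register(&my_helper);
 */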
1046
1047 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1048                          const struct ip_conntrack_helper *me)
1049 {
1050         if (i->ctrack->helper == me) {
1051                 /* Get rid of any expected. */
1052                 remove_expectations(i->ctrack, 0);
1053                 /* And *then* set helper to NULL */
1054                 i->ctrack->helper = NULL;
1055         }
1056         return 0;
1057 }
1058
1059 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1060 {
1061         unsigned int i;
1062
1063         /* Need write lock here, to delete helper. */
1064         WRITE_LOCK(&ip_conntrack_lock);
1065         LIST_DELETE(&helpers, me);
1066
1067         /* Get rid of expecteds, set helpers to NULL. */
1068         for (i = 0; i < ip_conntrack_htable_size; i++)
1069                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1070                             struct ip_conntrack_tuple_hash *, me);
1071         WRITE_UNLOCK(&ip_conntrack_lock);
1072
1073         /* Someone could still be looking at the helper in a bh. */
1074         synchronize_net();
1075 }
1076
1077 static inline void ct_add_counters(struct ip_conntrack *ct,
1078                                    enum ip_conntrack_info ctinfo,
1079                                    const struct sk_buff *skb)
1080 {
1081 #ifdef CONFIG_IP_NF_CT_ACCT
1082         if (skb) {
1083                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1084                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
1085                                         ntohs(skb->nh.iph->tot_len);
1086         }
1087 #endif
1088 }
1089
1090 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
1091 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
1092                         enum ip_conntrack_info ctinfo,
1093                         const struct sk_buff *skb,
1094                         unsigned long extra_jiffies)
1095 {
1096         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1097
1098         /* If not in hash table, timer will not be active yet */
1099         if (!is_confirmed(ct)) {
1100                 ct->timeout.expires = extra_jiffies;
1101                 ct_add_counters(ct, ctinfo, skb);
1102         } else {
1103                 WRITE_LOCK(&ip_conntrack_lock);
1104                 /* Need del_timer for race avoidance (may already be dying). */
1105                 if (del_timer(&ct->timeout)) {
1106                         ct->timeout.expires = jiffies + extra_jiffies;
1107                         add_timer(&ct->timeout);
1108                 }
1109                 ct_add_counters(ct, ctinfo, skb);
1110                 WRITE_UNLOCK(&ip_conntrack_lock);
1111         }
1112 }
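
/*
 * Typical call from a protocol handler (the timeout value is illustrative):
 *
 *	ip_ct_refresh_acct(ct, ctinfo, skb, 30 * HZ);
 *
 * For a confirmed connection this pushes the timeout 30 seconds into the
 * future; for an unconfirmed one the value is stored and becomes relative to
 * the confirmation time (see __ip_conntrack_confirm()).  With
 * CONFIG_IP_NF_CT_ACCT enabled the packet is also added to the
 * per-direction byte/packet counters.
 */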
1113
1114 int ip_ct_no_defrag;
1115
1116 /* Returns new sk_buff, or NULL */
1117 struct sk_buff *
1118 ip_ct_gather_frags(struct sk_buff *skb)
1119 {
1120         struct sock *sk = skb->sk;
1121 #ifdef CONFIG_NETFILTER_DEBUG
1122         unsigned int olddebug = skb->nf_debug;
1123 #endif
1124
1125         if (unlikely(ip_ct_no_defrag)) {
1126                 kfree_skb(skb);
1127                 return NULL;
1128         }
1129
1130         if (sk) {
1131                 sock_hold(sk);
1132                 skb_orphan(skb);
1133         }
1134
1135         local_bh_disable(); 
1136         skb = ip_defrag(skb);
1137         local_bh_enable();
1138
1139         if (!skb) {
1140                 if (sk)
1141                         sock_put(sk);
1142                 return skb;
1143         }
1144
1145         if (sk) {
1146                 skb_set_owner_w(skb, sk);
1147                 sock_put(sk);
1148         }
1149
1150         ip_send_check(skb->nh.iph);
1151         skb->nfcache |= NFC_ALTERED;
1152 #ifdef CONFIG_NETFILTER_DEBUG
1153         /* Packet path as if nothing had happened. */
1154         skb->nf_debug = olddebug;
1155 #endif
1156         return skb;
1157 }
1158
1159 /* Used by ipt_REJECT. */
1160 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1161 {
1162         struct ip_conntrack *ct;
1163         enum ip_conntrack_info ctinfo;
1164
1165         /* This ICMP is in the reverse direction to the packet which caused it */
1166         ct = ip_conntrack_get(skb, &ctinfo);
1167         
1168         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1169                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1170         else
1171                 ctinfo = IP_CT_RELATED;
1172
1173         /* Attach to new skbuff, and increment count */
1174         nskb->nfct = &ct->ct_general;
1175         nskb->nfctinfo = ctinfo;
1176         nf_conntrack_get(nskb->nfct);
1177 }
1178
1179 static inline int
1180 do_kill(const struct ip_conntrack_tuple_hash *i,
1181         int (*kill)(const struct ip_conntrack *i, void *data),
1182         void *data)
1183 {
1184         return kill(i->ctrack, data);
1185 }
1186
1187 /* Bring out ya dead! */
1188 static struct ip_conntrack_tuple_hash *
1189 get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
1190                 void *data, unsigned int *bucket)
1191 {
1192         struct ip_conntrack_tuple_hash *h = NULL;
1193
1194         READ_LOCK(&ip_conntrack_lock);
1195         for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
1196                 h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
1197                               struct ip_conntrack_tuple_hash *, kill, data);
1198         }
1199         if (h)
1200                 atomic_inc(&h->ctrack->ct_general.use);
1201         READ_UNLOCK(&ip_conntrack_lock);
1202
1203         return h;
1204 }
1205
1206 void
1207 ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
1208                         void *data)
1209 {
1210         struct ip_conntrack_tuple_hash *h;
1211         unsigned int bucket = 0;
1212
1213         while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
1214                 /* Time to push up daisies... */
1215                 if (del_timer(&h->ctrack->timeout))
1216                         death_by_timeout((unsigned long)h->ctrack);
1217                 /* ... else the timer will get him soon. */
1218
1219                 ip_conntrack_put(h->ctrack);
1220         }
1221 }
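
/*
 * Example of a (hypothetical) predicate: kill every conntrack whose original
 * source address matches a given IP.  The predicate is called once per
 * conntrack and returns non-zero for entries that should die:
 *
 *	static int kill_by_saddr(const struct ip_conntrack *ct, void *data)
 *	{
 *		return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
 *		       == *(u_int32_t *)data;
 *	}
 *
 *	ip_ct_selective_cleanup(kill_by_saddr, &addr);
 */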
1222
1223 /* Fast function for those who don't want to parse /proc (and I don't
1224    blame them). */
1225 /* Reversing the socket's dst/src point of view gives us the reply
1226    mapping. */
1227 static int
1228 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1229 {
1230         struct inet_opt *inet = inet_sk(sk);
1231         struct ip_conntrack_tuple_hash *h;
1232         struct ip_conntrack_tuple tuple;
1233         
1234         IP_CT_TUPLE_U_BLANK(&tuple);
1235         tuple.src.ip = inet->rcv_saddr;
1236         tuple.src.u.tcp.port = inet->sport;
1237         tuple.dst.ip = inet->daddr;
1238         tuple.dst.u.tcp.port = inet->dport;
1239         tuple.dst.protonum = IPPROTO_TCP;
1240
1241         /* We only do TCP at the moment: is there a better way? */
1242         if (strcmp(sk->sk_prot->name, "TCP")) {
1243                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1244                 return -ENOPROTOOPT;
1245         }
1246
1247         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1248                 DEBUGP("SO_ORIGINAL_DST: len %u not %Zu\n",
1249                        *len, sizeof(struct sockaddr_in));
1250                 return -EINVAL;
1251         }
1252
1253         h = ip_conntrack_find_get(&tuple, NULL);
1254         if (h) {
1255                 struct sockaddr_in sin;
1256
1257                 sin.sin_family = AF_INET;
1258                 sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1259                         .tuple.dst.u.tcp.port;
1260                 sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1261                         .tuple.dst.ip;
1262
1263                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1264                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1265                 ip_conntrack_put(h->ctrack);
1266                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1267                         return -EFAULT;
1268                 else
1269                         return 0;
1270         }
1271         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1272                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1273                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1274         return -ENOENT;
1275 }
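
/*
 * Userspace side of the above, as a transparent proxy would use it (sketch
 * only, error handling omitted; connect_to_real_server() is a placeholder).
 * SO_ORIGINAL_DST comes from <linux/netfilter_ipv4.h>:
 *
 *	struct sockaddr_in dst;
 *	socklen_t len = sizeof(dst);
 *
 *	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
 *		connect_to_real_server(dst.sin_addr, dst.sin_port);
 */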
1276
1277 static struct nf_sockopt_ops so_getorigdst = {
1278         .pf             = PF_INET,
1279         .get_optmin     = SO_ORIGINAL_DST,
1280         .get_optmax     = SO_ORIGINAL_DST+1,
1281         .get            = &getorigdst,
1282 };
1283
1284 static int kill_all(const struct ip_conntrack *i, void *data)
1285 {
1286         return 1;
1287 }
1288
1289 /* Mishearing the voices in his head, our hero wonders how he's
1290    supposed to kill the mall. */
1291 void ip_conntrack_cleanup(void)
1292 {
1293         ip_ct_attach = NULL;
1294         /* This makes sure all current packets have passed through
1295            netfilter framework.  Roll on, two-stage module
1296            delete... */
1297         synchronize_net();
1298  
1299  i_see_dead_people:
1300         ip_ct_selective_cleanup(kill_all, NULL);
1301         if (atomic_read(&ip_conntrack_count) != 0) {
1302                 schedule();
1303                 goto i_see_dead_people;
1304         }
1305
1306         kmem_cache_destroy(ip_conntrack_cachep);
1307         kmem_cache_destroy(ip_conntrack_expect_cachep);
1308         vfree(ip_conntrack_hash);
1309         nf_unregister_sockopt(&so_getorigdst);
1310 }
1311
1312 static int hashsize;
1313 module_param(hashsize, int, 0400);
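/* The table size can be overridden when the module is loaded, e.g.
 *	modprobe ip_conntrack hashsize=4096
 * (4096 is only an example value). */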
1314
1315 int __init ip_conntrack_init(void)
1316 {
1317         unsigned int i;
1318         int ret;
1319
1320         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1321          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1322         if (hashsize) {
1323                 ip_conntrack_htable_size = hashsize;
1324         } else {
1325                 ip_conntrack_htable_size
1326                         = (((num_physpages << PAGE_SHIFT) / 16384)
1327                            / sizeof(struct list_head));
1328                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1329                         ip_conntrack_htable_size = 8192;
1330                 if (ip_conntrack_htable_size < 16)
1331                         ip_conntrack_htable_size = 16;
1332         }
1333         ip_conntrack_max = 8 * ip_conntrack_htable_size;
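        /* Worked example of the sizing above, assuming 4kB pages and an
         * 8-byte struct list_head (i.e. a 32-bit machine): 128MB of RAM
         * gives (128MB / 16384) / 8 = 1024 buckets, and therefore an
         * ip_conntrack_max of 8 * 1024 = 8192 tracked connections. */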
1334
1335         printk("ip_conntrack version %s (%u buckets, %d max)"
1336                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1337                ip_conntrack_htable_size, ip_conntrack_max,
1338                sizeof(struct ip_conntrack));
1339
1340         ret = nf_register_sockopt(&so_getorigdst);
1341         if (ret != 0) {
1342                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1343                 return ret;
1344         }
1345
1346         ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1347                                     * ip_conntrack_htable_size);
1348         if (!ip_conntrack_hash) {
1349                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1350                 goto err_unreg_sockopt;
1351         }
1352
1353         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1354                                                 sizeof(struct ip_conntrack), 0,
1355                                                 SLAB_HWCACHE_ALIGN, NULL, NULL);
1356         if (!ip_conntrack_cachep) {
1357                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1358                 goto err_free_hash;
1359         }
1360
1361         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1362                                         sizeof(struct ip_conntrack_expect),
1363                                         0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1364         if (!ip_conntrack_expect_cachep) {
1365                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1366                 goto err_free_conntrack_slab;
1367         }
1368
1369         /* Don't NEED lock here, but good form anyway. */
1370         WRITE_LOCK(&ip_conntrack_lock);
1371         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1372                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1373         /* Sew in builtin protocols. */
1374         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1375         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1376         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1377         WRITE_UNLOCK(&ip_conntrack_lock);
1378
1379         for (i = 0; i < ip_conntrack_htable_size; i++)
1380                 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1381
1382         /* For use by ipt_REJECT */
1383         ip_ct_attach = ip_conntrack_attach;
1384
1385         /* Set up fake conntrack:
1386             - to never be deleted, not in any hashes */
1387         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1388         /*  - and make it look like a confirmed connection */
1389         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1390
1391         return ret;
1392
1393 err_free_conntrack_slab:
1394         kmem_cache_destroy(ip_conntrack_cachep);
1395 err_free_hash:
1396         vfree(ip_conntrack_hash);
1397 err_unreg_sockopt:
1398         nf_unregister_sockopt(&so_getorigdst);
1399
1400         return -ENOMEM;
1401 }