1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40
41 /* This rwlock protects the main hash table, protocol/helper/expected
42    registrations, conntrack timers */
43 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
44 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
45
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
50 #include <linux/netfilter_ipv4/listhelp.h>
51
52 #define IP_CONNTRACK_VERSION    "2.1"
53
54 #if 0
55 #define DEBUGP printk
56 #else
57 #define DEBUGP(format, args...)
58 #endif
59
60 DECLARE_RWLOCK(ip_conntrack_lock);
61 DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65 EXPORT_SYMBOL(ip_conntrack_count);
66
67 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
68 LIST_HEAD(ip_conntrack_expect_list);
69 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
70 static LIST_HEAD(helpers);
71 unsigned int ip_conntrack_htable_size = 0;
72 int ip_conntrack_max;
73 struct list_head *ip_conntrack_hash;
74 static kmem_cache_t *ip_conntrack_cachep;
75 static kmem_cache_t *ip_conntrack_expect_cachep;
76 struct ip_conntrack ip_conntrack_untracked;
77 unsigned int ip_ct_log_invalid;
78
79 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
80
81 inline void 
82 ip_conntrack_put(struct ip_conntrack *ct)
83 {
84         IP_NF_ASSERT(ct);
85         nf_conntrack_put(&ct->ct_general);
86 }
87
88 static int ip_conntrack_hash_rnd_initted;
89 static unsigned int ip_conntrack_hash_rnd;
90
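/* Reduce a tuple to a hash bucket index: jhash_3words() mixes the source
 * address, the destination address xor'd with the protocol number, and the
 * two port/id words, together with the random value ip_conntrack_hash_rnd
 * (initialised lazily on first use), then the result is taken modulo the
 * hash table size. */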
91 static u_int32_t
92 hash_conntrack(const struct ip_conntrack_tuple *tuple)
93 {
94 #if 0
95         dump_tuple(tuple);
96 #endif
97         return (jhash_3words(tuple->src.ip,
98                              (tuple->dst.ip ^ tuple->dst.protonum),
99                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
100                              ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
101 }
102
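/* Fill in the original-direction tuple (addresses and protocol number) from
 * the IP header, then let the L4 protocol handler fill in the port/id part.
 * Returns 0 on failure (fragment, or the protocol handler refuses). */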
103 int
104 ip_ct_get_tuple(const struct iphdr *iph,
105                 const struct sk_buff *skb,
106                 unsigned int dataoff,
107                 struct ip_conntrack_tuple *tuple,
108                 const struct ip_conntrack_protocol *protocol)
109 {
110         /* Should never happen */
111         if (iph->frag_off & htons(IP_OFFSET)) {
112                 printk("ip_conntrack_core: Frag of proto %u.\n",
113                        iph->protocol);
114                 return 0;
115         }
116
117         tuple->src.ip = iph->saddr;
118         tuple->dst.ip = iph->daddr;
119         tuple->dst.protonum = iph->protocol;
120         tuple->src.u.all = tuple->dst.u.all = 0;
121
122         return protocol->pkt_to_tuple(skb, dataoff, tuple);
123 }
124
125 int
126 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
127                    const struct ip_conntrack_tuple *orig,
128                    const struct ip_conntrack_protocol *protocol)
129 {
130         inverse->src.ip = orig->dst.ip;
131         inverse->dst.ip = orig->src.ip;
132         inverse->dst.protonum = orig->dst.protonum;
133
134         inverse->src.u.all = inverse->dst.u.all = 0;
135
136         return protocol->invert_tuple(inverse, orig);
137 }
138
139
140 /* ip_conntrack_expect helper functions */
141
142 /* Compare tuple parts depending on mask. */
143 static inline int expect_cmp(const struct ip_conntrack_expect *i,
144                              const struct ip_conntrack_tuple *tuple)
145 {
146         MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
147         return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
148 }
149
150 static void
151 destroy_expect(struct ip_conntrack_expect *exp)
152 {
153         DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
154         IP_NF_ASSERT(atomic_read(&exp->use) == 0);
155         IP_NF_ASSERT(!timer_pending(&exp->timeout));
156
157         kmem_cache_free(ip_conntrack_expect_cachep, exp);
158         CONNTRACK_STAT_INC(expect_delete);
159 }
160
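/* Drop a reference to an expectation; the last reference frees it back to
 * the slab cache via destroy_expect(). */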
161 inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
162 {
163         IP_NF_ASSERT(exp);
164
165         if (atomic_dec_and_test(&exp->use)) {
166                 /* usage count dropped to zero */
167                 destroy_expect(exp);
168         }
169 }
170
171 static inline struct ip_conntrack_expect *
172 __ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
173 {
174         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
175         MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
176         return LIST_FIND(&ip_conntrack_expect_list, expect_cmp, 
177                          struct ip_conntrack_expect *, tuple);
178 }
179
180 /* Find an expectation corresponding to a tuple. */
181 struct ip_conntrack_expect *
182 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
183 {
184         struct ip_conntrack_expect *exp;
185
186         READ_LOCK(&ip_conntrack_lock);
187         READ_LOCK(&ip_conntrack_expect_tuple_lock);
188         exp = __ip_ct_expect_find(tuple);
189         if (exp)
190                 atomic_inc(&exp->use);
191         READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
192         READ_UNLOCK(&ip_conntrack_lock);
193
194         return exp;
195 }
196
197 /* remove one specific expectation from all lists and drop refcount,
198  * does _NOT_ delete the timer. */
199 static void __unexpect_related(struct ip_conntrack_expect *expect)
200 {
201         DEBUGP("unexpect_related(%p)\n", expect);
202         MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
203
204         /* we're not allowed to unexpect a confirmed expectation! */
205         IP_NF_ASSERT(!expect->sibling);
206
207         /* delete from global and local lists */
208         list_del(&expect->list);
209         list_del(&expect->expected_list);
210
211         /* decrement expect-count of master conntrack */
212         if (expect->expectant)
213                 expect->expectant->expecting--;
214
215         ip_conntrack_expect_put(expect);
216 }
217
218 /* remove one specific expectation from all lists, drop refcount
219  * and expire timer. 
220  * This function can _NOT_ be called for confirmed expects! */
221 static void unexpect_related(struct ip_conntrack_expect *expect)
222 {
223         IP_NF_ASSERT(expect->expectant);
224         IP_NF_ASSERT(expect->expectant->helper);
225         /* if we are supposed to have a timer, but we can't delete
226          * it: race condition.  __unexpect_related will
227          * be called by the timeout function */
228         if (expect->expectant->helper->timeout
229             && !del_timer(&expect->timeout))
230                 return;
231
232         __unexpect_related(expect);
233 }
234
235 /* delete all unconfirmed expectations for this conntrack */
236 static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
237 {
238         struct list_head *exp_entry, *next;
239         struct ip_conntrack_expect *exp;
240
241         DEBUGP("remove_expectations(%p)\n", ct);
242
243         list_for_each_safe(exp_entry, next, &ct->sibling_list) {
244                 exp = list_entry(exp_entry, struct ip_conntrack_expect,
245                                  expected_list);
246
247                 /* we skip established expectations, as we want to delete
248                  * the un-established ones only */
249                 if (exp->sibling) {
250                         DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
251                         if (drop_refcount) {
252                                 /* Indicate that this expectation's parent is dead */
253                                 ip_conntrack_put(exp->expectant);
254                                 exp->expectant = NULL;
255                         }
256                         continue;
257                 }
258
259                 IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
260                 IP_NF_ASSERT(exp->expectant == ct);
261
262                 /* delete expectation from global and private lists */
263                 unexpect_related(exp);
264         }
265 }
266
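/* Unlink a conntrack from both hash chains (original and reply direction)
 * and drop any expectations it still has pending.  Caller must hold
 * ip_conntrack_lock for writing. */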
267 static void
268 clean_from_lists(struct ip_conntrack *ct)
269 {
270         unsigned int ho, hr;
271         
272         DEBUGP("clean_from_lists(%p)\n", ct);
273         MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
274
275         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
276         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
277         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
278         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
279
280         /* Destroy all un-established, pending expectations */
281         remove_expectations(ct, 1);
282 }
283
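/* Final destructor, called when the last reference to a conntrack is
 * dropped: gives the L4 protocol and the optional ip_conntrack_destroyed
 * callback a chance to clean up, detaches the master expectation, and
 * returns the entry to the slab cache. */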
284 static void
285 destroy_conntrack(struct nf_conntrack *nfct)
286 {
287         struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
288         struct ip_conntrack_protocol *proto;
289
290         DEBUGP("destroy_conntrack(%p)\n", ct);
291         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
292         IP_NF_ASSERT(!timer_pending(&ct->timeout));
293
294         /* To make sure we don't get any weird locking issues here:
295          * destroy_conntrack() MUST NOT be called with a write lock
296          * to ip_conntrack_lock!!! -HW */
297         proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
298         if (proto && proto->destroy)
299                 proto->destroy(ct);
300
301         if (ip_conntrack_destroyed)
302                 ip_conntrack_destroyed(ct);
303
304         WRITE_LOCK(&ip_conntrack_lock);
305         /* Make sure we don't leave any orphaned expectations lying around */
306         if (ct->expecting)
307                 remove_expectations(ct, 1);
308
309         /* Delete our master expectation */
310         if (ct->master) {
311                 if (ct->master->expectant) {
312                         /* can't call __unexpect_related here,
313                          * since it would screw up expect_list */
314                         list_del(&ct->master->expected_list);
315                         master = ct->master->expectant;
316                 }
317                 kmem_cache_free(ip_conntrack_expect_cachep, ct->master);
318         }
319         WRITE_UNLOCK(&ip_conntrack_lock);
320
321         if (master)
322                 ip_conntrack_put(master);
323
324         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
325         kmem_cache_free(ip_conntrack_cachep, ct);
326         atomic_dec(&ip_conntrack_count);
327         CONNTRACK_STAT_INC(delete);
328 }
329
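/* Timer callback: the connection timed out, so unlink it from the hash
 * table and drop the reference the hash table held. */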
330 static void death_by_timeout(unsigned long ul_conntrack)
331 {
332         struct ip_conntrack *ct = (void *)ul_conntrack;
333
334         CONNTRACK_STAT_INC(delete_list);
335
336         WRITE_LOCK(&ip_conntrack_lock);
337         clean_from_lists(ct);
338         WRITE_UNLOCK(&ip_conntrack_lock);
339         ip_conntrack_put(ct);
340 }
341
342 static inline int
343 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
344                     const struct ip_conntrack_tuple *tuple,
345                     const struct ip_conntrack *ignored_conntrack)
346 {
347         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
348         return i->ctrack != ignored_conntrack
349                 && ip_ct_tuple_equal(tuple, &i->tuple);
350 }
351
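/* Walk one hash chain looking for a matching tuple, optionally skipping
 * ignored_conntrack (used e.g. by ip_conntrack_tuple_taken()).  Caller must
 * hold ip_conntrack_lock at least for reading. */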
352 static struct ip_conntrack_tuple_hash *
353 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
354                     const struct ip_conntrack *ignored_conntrack)
355 {
356         struct ip_conntrack_tuple_hash *h;
357         unsigned int hash = hash_conntrack(tuple);
358         /* use per_cpu() to avoid multiple calls to smp_processor_id() */
359         unsigned int cpu = smp_processor_id();
360
361         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
362         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
363                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
364                         per_cpu(ip_conntrack_stat, cpu).found++;
365                         return h;
366                 }
367                 per_cpu(ip_conntrack_stat, cpu).searched++;
368         }
369
370         return NULL;
371 }
372
373 /* Find a connection corresponding to a tuple. */
374 struct ip_conntrack_tuple_hash *
375 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
376                       const struct ip_conntrack *ignored_conntrack)
377 {
378         struct ip_conntrack_tuple_hash *h;
379
380         READ_LOCK(&ip_conntrack_lock);
381         h = __ip_conntrack_find(tuple, ignored_conntrack);
382         if (h)
383                 atomic_inc(&h->ctrack->ct_general.use);
384         READ_UNLOCK(&ip_conntrack_lock);
385
386         return h;
387 }
388
389 /* Confirm a connection given skb; places it in hash table */
390 int
391 __ip_conntrack_confirm(struct sk_buff *skb)
392 {
393         unsigned int hash, repl_hash;
394         struct ip_conntrack *ct;
395         enum ip_conntrack_info ctinfo;
396
397         ct = ip_conntrack_get(skb, &ctinfo);
398
399         /* ipt_REJECT uses ip_conntrack_attach to attach related
400            ICMP/TCP RST packets in other direction.  Actual packet
401            which created connection will be IP_CT_NEW or for an
402            expected connection, IP_CT_RELATED. */
403         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
404                 return NF_ACCEPT;
405
406         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
407         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
408
409         /* We're not in hash table, and we refuse to set up related
410            connections for unconfirmed conns.  But packet copies and
411            REJECT will give spurious warnings here. */
412         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
413
414         /* No external references means no one else could have
415            confirmed us. */
416         IP_NF_ASSERT(!is_confirmed(ct));
417         DEBUGP("Confirming conntrack %p\n", ct);
418
419         WRITE_LOCK(&ip_conntrack_lock);
420         /* See if there's one in the list already, including reverse:
421            NAT could have grabbed it without realizing, since we're
422            not in the hash.  If there is, we lost the race. */
423         if (!LIST_FIND(&ip_conntrack_hash[hash],
424                        conntrack_tuple_cmp,
425                        struct ip_conntrack_tuple_hash *,
426                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
427             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
428                           conntrack_tuple_cmp,
429                           struct ip_conntrack_tuple_hash *,
430                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
431                 list_prepend(&ip_conntrack_hash[hash],
432                              &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
433                 list_prepend(&ip_conntrack_hash[repl_hash],
434                              &ct->tuplehash[IP_CT_DIR_REPLY]);
435                 /* Timer relative to confirmation time, not original
436                    setting time, otherwise we'd get timer wrap in
437                    weird delay cases. */
438                 ct->timeout.expires += jiffies;
439                 add_timer(&ct->timeout);
440                 atomic_inc(&ct->ct_general.use);
441                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
442                 WRITE_UNLOCK(&ip_conntrack_lock);
443                 CONNTRACK_STAT_INC(insert);
444                 return NF_ACCEPT;
445         }
446
447         WRITE_UNLOCK(&ip_conntrack_lock);
448         CONNTRACK_STAT_INC(insert_failed);
449         return NF_DROP;
450 }
451
452 /* Returns true if a connection corresponds to the tuple (required
453    for NAT). */
454 int
455 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
456                          const struct ip_conntrack *ignored_conntrack)
457 {
458         struct ip_conntrack_tuple_hash *h;
459
460         READ_LOCK(&ip_conntrack_lock);
461         h = __ip_conntrack_find(tuple, ignored_conntrack);
462         READ_UNLOCK(&ip_conntrack_lock);
463
464         return h != NULL;
465 }
466
467 /* There's a small race here where we may free a just-assured
468    connection.  Too bad: we're in trouble anyway. */
469 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
470 {
471         return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
472 }
473
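/* Table is full: try to evict the oldest not-yet-assured entry from this
 * hash chain so the new connection can be tracked.  Returns 1 if an entry
 * was dropped. */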
474 static int early_drop(struct list_head *chain)
475 {
476         /* Traverse backwards: gives us oldest, which is roughly LRU */
477         struct ip_conntrack_tuple_hash *h;
478         int dropped = 0;
479
480         READ_LOCK(&ip_conntrack_lock);
481         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
482         if (h)
483                 atomic_inc(&h->ctrack->ct_general.use);
484         READ_UNLOCK(&ip_conntrack_lock);
485
486         if (!h)
487                 return dropped;
488
489         if (del_timer(&h->ctrack->timeout)) {
490                 death_by_timeout((unsigned long)h->ctrack);
491                 dropped = 1;
492                 CONNTRACK_STAT_INC(early_drop);
493         }
494         ip_conntrack_put(h->ctrack);
495         return dropped;
496 }
497
498 static inline int helper_cmp(const struct ip_conntrack_helper *i,
499                              const struct ip_conntrack_tuple *rtuple)
500 {
501         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
502 }
503
504 struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
505 {
506         return LIST_FIND(&helpers, helper_cmp,
507                          struct ip_conntrack_helper *,
508                          tuple);
509 }
510
511 /* Allocate a new conntrack: we return -ENOMEM if classification
512    failed due to stress.  Otherwise it really is unclassifiable. */
513 static struct ip_conntrack_tuple_hash *
514 init_conntrack(const struct ip_conntrack_tuple *tuple,
515                struct ip_conntrack_protocol *protocol,
516                struct sk_buff *skb)
517 {
518         struct ip_conntrack *conntrack;
519         struct ip_conntrack_tuple repl_tuple;
520         size_t hash;
521         struct ip_conntrack_expect *expected;
522
523         if (!ip_conntrack_hash_rnd_initted) {
524                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
525                 ip_conntrack_hash_rnd_initted = 1;
526         }
527
528         hash = hash_conntrack(tuple);
529
530         if (ip_conntrack_max
531             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
532                 /* Try dropping from this hash chain. */
533                 if (!early_drop(&ip_conntrack_hash[hash])) {
534                         if (net_ratelimit())
535                                 printk(KERN_WARNING
536                                        "ip_conntrack: table full, dropping"
537                                        " packet.\n");
538                         return ERR_PTR(-ENOMEM);
539                 }
540         }
541
542         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
543                 DEBUGP("Can't invert tuple.\n");
544                 return NULL;
545         }
546
547         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
548         if (!conntrack) {
549                 DEBUGP("Can't allocate conntrack.\n");
550                 return ERR_PTR(-ENOMEM);
551         }
552
553         memset(conntrack, 0, sizeof(*conntrack));
554         atomic_set(&conntrack->ct_general.use, 1);
555         conntrack->ct_general.destroy = destroy_conntrack;
556         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
557         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
558         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
559         conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
560 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
561         conntrack->xid[IP_CT_DIR_ORIGINAL] = -1;
562         conntrack->xid[IP_CT_DIR_REPLY] = -1;
563 #endif
564
565         if (!protocol->new(conntrack, skb)) {
566                 kmem_cache_free(ip_conntrack_cachep, conntrack);
567                 return NULL;
568         }
569         /* Don't set timer yet: wait for confirmation */
570         init_timer(&conntrack->timeout);
571         conntrack->timeout.data = (unsigned long)conntrack;
572         conntrack->timeout.function = death_by_timeout;
573
574         INIT_LIST_HEAD(&conntrack->sibling_list);
575
576         WRITE_LOCK(&ip_conntrack_lock);
577         /* Need to find and delete the expectation ONLY if we win the race */
578         READ_LOCK(&ip_conntrack_expect_tuple_lock);
579         expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
580                              struct ip_conntrack_expect *, tuple);
581         READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
582
583         if (expected) {
584                 /* If master is not in hash table yet (ie. packet hasn't left
585                    this machine yet), how can other end know about expected?
586                    Hence these are not the droids you are looking for (if
587                    master ct never got confirmed, we'd hold a reference to it
588                    and weird things would happen to future packets). */
589                 if (!is_confirmed(expected->expectant)) {
590                         conntrack->helper = ip_ct_find_helper(&repl_tuple);
591                         goto end;
592                 }
593
594                 /* Expectation is dying... */
595                 if (expected->expectant->helper->timeout
596                     && !del_timer(&expected->timeout))
597                         goto end;       
598
599                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
600                         conntrack, expected);
601                 /* Welcome, Mr. Bond.  We've been expecting you... */
602                 IP_NF_ASSERT(expected->expectant);
603                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
604                 conntrack->master = expected;
605                 expected->sibling = conntrack;
606                 LIST_DELETE(&ip_conntrack_expect_list, expected);
607                 expected->expectant->expecting--;
608                 nf_conntrack_get(&master_ct(conntrack)->ct_general);
609
610                 /* this is braindead... --pablo */
611                 atomic_inc(&ip_conntrack_count);
612                 WRITE_UNLOCK(&ip_conntrack_lock);
613
614                 if (expected->expectfn)
615                         expected->expectfn(conntrack);
616         
617                 CONNTRACK_STAT_INC(expect_new);
618
619                 goto ret;
620         } else  {
621                 conntrack->helper = ip_ct_find_helper(&repl_tuple);
622
623                 CONNTRACK_STAT_INC(new);
624         }
625
626 end:    atomic_inc(&ip_conntrack_count);
627         WRITE_UNLOCK(&ip_conntrack_lock);
628
629 ret:    return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
630 }
631
632 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
633 static inline struct ip_conntrack *
634 resolve_normal_ct(struct sk_buff *skb,
635                   struct ip_conntrack_protocol *proto,
636                   int *set_reply,
637                   unsigned int hooknum,
638                   enum ip_conntrack_info *ctinfo)
639 {
640         struct ip_conntrack_tuple tuple;
641         struct ip_conntrack_tuple_hash *h;
642
643         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
644
645         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
646                                 &tuple, proto))
647                 return NULL;
648
649         /* look for tuple match */
650         h = ip_conntrack_find_get(&tuple, NULL);
651         if (!h) {
652                 h = init_conntrack(&tuple, proto, skb);
653                 if (!h)
654                         return NULL;
655                 if (IS_ERR(h))
656                         return (void *)h;
657         }
658
659         /* It exists; we have (non-exclusive) reference. */
660         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
661                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
662                 /* Please set reply bit if this packet OK */
663                 *set_reply = 1;
664         } else {
665                 /* Once we've had two way comms, always ESTABLISHED. */
666                 if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
667                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
668                                h->ctrack);
669                         *ctinfo = IP_CT_ESTABLISHED;
670                 } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
671                         DEBUGP("ip_conntrack_in: related packet for %p\n",
672                                h->ctrack);
673                         *ctinfo = IP_CT_RELATED;
674                 } else {
675                         DEBUGP("ip_conntrack_in: new packet for %p\n",
676                                h->ctrack);
677                         *ctinfo = IP_CT_NEW;
678                 }
679                 *set_reply = 0;
680         }
681         skb->nfct = &h->ctrack->ct_general;
682         skb->nfctinfo = *ctinfo;
683         return h->ctrack;
684 }
685
686 /* Netfilter hook itself. */
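/* Per-packet flow: skip packets that already carry conntrack state, refuse
 * fragments (defragmentation is expected to have happened already), let the
 * L4 protocol reject invalid packets, resolve (or create) the conntrack
 * entry, run the protocol state machine and, if a helper is attached, the
 * helper, then record that a reply has been seen when appropriate. */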
687 unsigned int ip_conntrack_in(unsigned int hooknum,
688                              struct sk_buff **pskb,
689                              const struct net_device *in,
690                              const struct net_device *out,
691                              int (*okfn)(struct sk_buff *))
692 {
693         struct ip_conntrack *ct;
694         enum ip_conntrack_info ctinfo;
695         struct ip_conntrack_protocol *proto;
696         int set_reply;
697         int ret;
698
699         /* Previously seen (loopback or untracked)?  Ignore. */
700         if ((*pskb)->nfct) {
701                 CONNTRACK_STAT_INC(ignore);
702                 return NF_ACCEPT;
703         }
704
705         /* Should never happen */
706         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
707                 if (net_ratelimit()) {
708                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
709                                (*pskb)->nh.iph->protocol, hooknum);
710                 }
711                 return NF_DROP;
712         }
713
714         /* FIXME: Do this right please. --RR */
715         (*pskb)->nfcache |= NFC_UNKNOWN;
716
717 /* Doesn't cover locally-generated broadcast, so not worth it. */
718 #if 0
719         /* Ignore broadcast: no `connection'. */
720         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
721                 printk("Broadcast packet!\n");
722                 return NF_ACCEPT;
723         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
724                    == htonl(0x000000FF)) {
725                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
726                        NIPQUAD((*pskb)->nh.iph->saddr),
727                        NIPQUAD((*pskb)->nh.iph->daddr),
728                        (*pskb)->sk, (*pskb)->pkt_type);
729         }
730 #endif
731
732         proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
733
734         /* It may be a special packet, error, unclean...
735          * the inverse of the return code tells the netfilter
736          * core what to do with the packet. */
737         if (proto->error != NULL 
738             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
739                 CONNTRACK_STAT_INC(error);
740                 CONNTRACK_STAT_INC(invalid);
741                 return -ret;
742         }
743
744         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
745                 /* Not valid part of a connection */
746                 CONNTRACK_STAT_INC(invalid);
747                 return NF_ACCEPT;
748         }
749
750         if (IS_ERR(ct)) {
751                 /* Too stressed to deal. */
752                 CONNTRACK_STAT_INC(drop);
753                 return NF_DROP;
754         }
755
756         IP_NF_ASSERT((*pskb)->nfct);
757
758         ret = proto->packet(ct, *pskb, ctinfo);
759         if (ret < 0) {
760                 /* Invalid: inverse of the return code tells
761                  * the netfilter core what to do*/
762                 nf_conntrack_put((*pskb)->nfct);
763                 (*pskb)->nfct = NULL;
764                 CONNTRACK_STAT_INC(invalid);
765                 return -ret;
766         }
767
768         if (ret != NF_DROP && ct->helper) {
769                 ret = ct->helper->help(*pskb, ct, ctinfo);
770                 if (ret == -1) {
771                         /* Invalid */
772                         CONNTRACK_STAT_INC(invalid);
773                         nf_conntrack_put((*pskb)->nfct);
774                         (*pskb)->nfct = NULL;
775                         return NF_ACCEPT;
776                 }
777         }
778         if (set_reply)
779                 set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
780
781         return ret;
782 }
783
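/* Convenience wrapper around ip_ct_invert_tuple() that looks up the
 * protocol handler from the original tuple's protocol number. */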
784 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
785                    const struct ip_conntrack_tuple *orig)
786 {
787         return ip_ct_invert_tuple(inverse, orig, 
788                                   ip_ct_find_proto(orig->dst.protonum));
789 }
790
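/* Does an existing expectation match a re-sent request?  Compares the
 * incoming tuple against the saved original (ct_tuple) when the expectation
 * has already been rewritten by NAT (protonum != 0), otherwise against the
 * current tuple; the masks must be identical. */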
791 static inline int resent_expect(const struct ip_conntrack_expect *i,
792                                 const struct ip_conntrack_tuple *tuple,
793                                 const struct ip_conntrack_tuple *mask)
794 {
795         DEBUGP("resent_expect\n");
796         DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
797         DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
798         DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
799         return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
800                  || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
801                 && ip_ct_tuple_equal(&i->mask, mask));
802 }
803
804 /* Would two expected things clash? */
805 static inline int expect_clash(const struct ip_conntrack_expect *i,
806                                const struct ip_conntrack_tuple *tuple,
807                                const struct ip_conntrack_tuple *mask)
808 {
809         /* Part covered by intersection of masks must be unequal,
810            otherwise they clash */
811         struct ip_conntrack_tuple intersect_mask
812                 = { { i->mask.src.ip & mask->src.ip,
813                       { i->mask.src.u.all & mask->src.u.all } },
814                     { i->mask.dst.ip & mask->dst.ip,
815                       { i->mask.dst.u.all & mask->dst.u.all },
816                       i->mask.dst.protonum & mask->dst.protonum } };
817
818         return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
819 }
820
821 inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
822 {
823         WRITE_LOCK(&ip_conntrack_lock);
824         unexpect_related(expect);
825         WRITE_UNLOCK(&ip_conntrack_lock);
826 }
827         
828 static void expectation_timed_out(unsigned long ul_expect)
829 {
830         struct ip_conntrack_expect *expect = (void *) ul_expect;
831
832         DEBUGP("expectation %p timed out\n", expect);   
833         WRITE_LOCK(&ip_conntrack_lock);
834         __unexpect_related(expect);
835         WRITE_UNLOCK(&ip_conntrack_lock);
836 }
837
838 struct ip_conntrack_expect *
839 ip_conntrack_expect_alloc(void)
840 {
841         struct ip_conntrack_expect *new;
842
843         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
844         if (!new) {
845                 DEBUGP("expect_related: OOM allocating expect\n");
846                 return NULL;
847         }
848
849         /* tuple_cmp compares the whole union, so we have to initialize it cleanly */
850         memset(new, 0, sizeof(struct ip_conntrack_expect));
851         atomic_set(&new->use, 1);
852
853         return new;
854 }
855
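/* Link a new expectation to its master conntrack and to the global
 * expectation list, and arm the expectation timer if the helper defines a
 * timeout.  Caller holds ip_conntrack_lock for writing. */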
856 static void
857 ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
858                            struct ip_conntrack *related_to)
859 {
860         DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
861         new->expectant = related_to;
862         new->sibling = NULL;
863
864         /* add to expected list for this connection */
865         list_add_tail(&new->expected_list, &related_to->sibling_list);
866         /* add to global list of expectations */
867         list_prepend(&ip_conntrack_expect_list, &new->list);
868         /* add and start timer if required */
869         if (related_to->helper->timeout) {
870                 init_timer(&new->timeout);
871                 new->timeout.data = (unsigned long)new;
872                 new->timeout.function = expectation_timed_out;
873                 new->timeout.expires = jiffies +
874                                         related_to->helper->timeout * HZ;
875                 add_timer(&new->timeout);
876         }
877         related_to->expecting++;
878 }
879
880 /* Add a related connection. */
881 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
882                                 struct ip_conntrack *related_to)
883 {
884         struct ip_conntrack_expect *old;
885         int ret = 0;
886
887         WRITE_LOCK(&ip_conntrack_lock);
888         /* Because of the write lock, no reader can walk the lists,
889          * so there is no need to use the tuple lock too */
890
891         DEBUGP("ip_conntrack_expect_related %p\n", related_to);
892         DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
893         DEBUGP("mask:  "); DUMP_TUPLE_RAW(&expect->mask);
894
895         old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
896                         struct ip_conntrack_expect *, &expect->tuple, 
897                         &expect->mask);
898         if (old) {
899                 /* Helper private data may contain offsets but no pointers
900                    pointing into the payload - otherwise we would have to copy
901                    the data filled out by the helper over the old one */
902                 DEBUGP("expect_related: resent packet\n");
903                 if (related_to->helper->timeout) {
904                         if (!del_timer(&old->timeout)) {
905                                 /* expectation is dying. Fall through */
906                                 goto out;
907                         } else {
908                                 old->timeout.expires = jiffies + 
909                                         related_to->helper->timeout * HZ;
910                                 add_timer(&old->timeout);
911                         }
912                 }
913
914                 WRITE_UNLOCK(&ip_conntrack_lock);
915                 /* This expectation is not inserted so no need to lock */
916                 kmem_cache_free(ip_conntrack_expect_cachep, expect);
917                 return -EEXIST;
918
919         } else if (related_to->helper->max_expected && 
920                    related_to->expecting >= related_to->helper->max_expected) {
921                 /* old == NULL */
922                 if (!(related_to->helper->flags & 
923                       IP_CT_HELPER_F_REUSE_EXPECT)) {
924                         WRITE_UNLOCK(&ip_conntrack_lock);
925                         if (net_ratelimit())
926                                 printk(KERN_WARNING
927                                        "ip_conntrack: max number of expected "
928                                        "connections %i of %s reached for "
929                                        "%u.%u.%u.%u->%u.%u.%u.%u\n",
930                                        related_to->helper->max_expected,
931                                        related_to->helper->name,
932                                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
933                                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
934                         kmem_cache_free(ip_conntrack_expect_cachep, expect);
935                         return -EPERM;
936                 }
937                 DEBUGP("ip_conntrack: max number of expected "
938                        "connections %i of %s reached for "
939                        "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
940                        related_to->helper->max_expected,
941                        related_to->helper->name,
942                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
943                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
944  
945                 /* choose the oldest expectation to evict */
946                 list_for_each_entry(old, &related_to->sibling_list, 
947                                                       expected_list)
948                         if (old->sibling == NULL)
949                                 break;
950
951                 /* We cannot fail since related_to->expecting is the number
952                  * of unconfirmed expectations */
953                 IP_NF_ASSERT(old && old->sibling == NULL);
954
955                 /* newnat14 does not reuse the real allocated memory
956                  * structures but rather unexpects the old and
957                  * allocates a new one.  unexpect_related will decrement
958                  * related_to->expecting. 
959                  */
960                 unexpect_related(old);
961                 ret = -EPERM;
962         } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
963                              struct ip_conntrack_expect *, &expect->tuple, 
964                              &expect->mask)) {
965                 WRITE_UNLOCK(&ip_conntrack_lock);
966                 DEBUGP("expect_related: busy!\n");
967
968                 kmem_cache_free(ip_conntrack_expect_cachep, expect);
969                 return -EBUSY;
970         }
971
972 out:    ip_conntrack_expect_insert(expect, related_to);
973
974         WRITE_UNLOCK(&ip_conntrack_lock);
975
976         CONNTRACK_STAT_INC(expect_create);
977
978         return ret;
979 }
980
981 /* Change tuple in an existing expectation */
982 int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
983                                struct ip_conntrack_tuple *newtuple)
984 {
985         int ret;
986
987         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
988         WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
989         DEBUGP("change_expect:\n");
990         DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
991         DEBUGP("exp mask:  "); DUMP_TUPLE_RAW(&expect->mask);
992         DEBUGP("newtuple:  "); DUMP_TUPLE_RAW(newtuple);
993         if (expect->ct_tuple.dst.protonum == 0) {
994                 /* Never seen before */
995                 DEBUGP("change expect: never seen before\n");
996                 if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask)
997                     && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
998                                  struct ip_conntrack_expect *, newtuple, &expect->mask)) {
999                         /* Force NAT to find an unused tuple */
1000                         ret = -1;
1001                 } else {
1002                         memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
1003                         memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
1004                         ret = 0;
1005                 }
1006         } else {
1007                 /* Resent packet */
1008                 DEBUGP("change expect: resent packet\n");
1009                 if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
1010                         ret = 0;
1011                 } else {
1012                         /* Force NAT to choose the same port again */
1013                         ret = -1;
1014                 }
1015         }
1016         WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
1017         
1018         return ret;
1019 }
1020
1021 /* Alter reply tuple (maybe alter helper).  If it's already taken,
1022    return 0 and don't do alteration. */
1023 int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1024                              const struct ip_conntrack_tuple *newreply)
1025 {
1026         WRITE_LOCK(&ip_conntrack_lock);
1027         if (__ip_conntrack_find(newreply, conntrack)) {
1028                 WRITE_UNLOCK(&ip_conntrack_lock);
1029                 return 0;
1030         }
1031         /* Should be unconfirmed, so not in hash table yet */
1032         IP_NF_ASSERT(!is_confirmed(conntrack));
1033
1034         DEBUGP("Altering reply tuple of %p to ", conntrack);
1035         DUMP_TUPLE(newreply);
1036
1037         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1038         if (!conntrack->master && list_empty(&conntrack->sibling_list))
1039                 conntrack->helper = ip_ct_find_helper(newreply);
1040         WRITE_UNLOCK(&ip_conntrack_lock);
1041
1042         return 1;
1043 }
1044
1045 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1046 {
1047         WRITE_LOCK(&ip_conntrack_lock);
1048         list_prepend(&helpers, me);
1049         WRITE_UNLOCK(&ip_conntrack_lock);
1050
1051         return 0;
1052 }
1053
1054 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1055                          const struct ip_conntrack_helper *me)
1056 {
1057         if (i->ctrack->helper == me) {
1058                 /* Get rid of any expectations. */
1059                 remove_expectations(i->ctrack, 0);
1060                 /* And *then* set helper to NULL */
1061                 i->ctrack->helper = NULL;
1062         }
1063         return 0;
1064 }
1065
1066 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1067 {
1068         unsigned int i;
1069
1070         /* Need write lock here, to delete helper. */
1071         WRITE_LOCK(&ip_conntrack_lock);
1072         LIST_DELETE(&helpers, me);
1073
1074         /* Get rid of expectations, set helpers to NULL. */
1075         for (i = 0; i < ip_conntrack_htable_size; i++)
1076                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1077                             struct ip_conntrack_tuple_hash *, me);
1078         WRITE_UNLOCK(&ip_conntrack_lock);
1079
1080         /* Someone could still be looking at the helper in a bh. */
1081         synchronize_net();
1082 }
1083
1084 static inline void ct_add_counters(struct ip_conntrack *ct,
1085                                    enum ip_conntrack_info ctinfo,
1086                                    const struct sk_buff *skb)
1087 {
1088 #ifdef CONFIG_IP_NF_CT_ACCT
1089         if (skb) {
1090                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1091                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
1092                                         ntohs(skb->nh.iph->tot_len);
1093         }
1094 #endif
1095 }
1096
1097 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
1098 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
1099                         enum ip_conntrack_info ctinfo,
1100                         const struct sk_buff *skb,
1101                         unsigned long extra_jiffies)
1102 {
1103         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1104
1105         /* If not in hash table, timer will not be active yet */
1106         if (!is_confirmed(ct)) {
1107                 ct->timeout.expires = extra_jiffies;
1108                 ct_add_counters(ct, ctinfo, skb);
1109         } else {
1110                 WRITE_LOCK(&ip_conntrack_lock);
1111                 /* Need del_timer for race avoidance (may already be dying). */
1112                 if (del_timer(&ct->timeout)) {
1113                         ct->timeout.expires = jiffies + extra_jiffies;
1114                         add_timer(&ct->timeout);
1115                 }
1116                 ct_add_counters(ct, ctinfo, skb);
1117                 WRITE_UNLOCK(&ip_conntrack_lock);
1118         }
1119 }
1120
1121 int ip_ct_no_defrag;
1122
1123 /* Returns new sk_buff, or NULL */
1124 struct sk_buff *
1125 ip_ct_gather_frags(struct sk_buff *skb)
1126 {
1127         struct sock *sk = skb->sk;
1128 #ifdef CONFIG_NETFILTER_DEBUG
1129         unsigned int olddebug = skb->nf_debug;
1130 #endif
1131
1132         if (unlikely(ip_ct_no_defrag)) {
1133                 kfree_skb(skb);
1134                 return NULL;
1135         }
1136
1137         if (sk) {
1138                 sock_hold(sk);
1139                 skb_orphan(skb);
1140         }
1141
1142         local_bh_disable(); 
1143         skb = ip_defrag(skb);
1144         local_bh_enable();
1145
1146         if (!skb) {
1147                 if (sk)
1148                         sock_put(sk);
1149                 return skb;
1150         }
1151
1152         if (sk) {
1153                 skb_set_owner_w(skb, sk);
1154                 sock_put(sk);
1155         }
1156
1157         ip_send_check(skb->nh.iph);
1158         skb->nfcache |= NFC_ALTERED;
1159 #ifdef CONFIG_NETFILTER_DEBUG
1160         /* Packet path as if nothing had happened. */
1161         skb->nf_debug = olddebug;
1162 #endif
1163         return skb;
1164 }
1165
1166 /* Used by ipt_REJECT. */
1167 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1168 {
1169         struct ip_conntrack *ct;
1170         enum ip_conntrack_info ctinfo;
1171
1172         /* This ICMP is in reverse direction to the packet which caused it */
1173         ct = ip_conntrack_get(skb, &ctinfo);
1174         
1175         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1176                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1177         else
1178                 ctinfo = IP_CT_RELATED;
1179
1180         /* Attach to new skbuff, and increment count */
1181         nskb->nfct = &ct->ct_general;
1182         nskb->nfctinfo = ctinfo;
1183         nf_conntrack_get(nskb->nfct);
1184 }
1185
1186 static inline int
1187 do_kill(const struct ip_conntrack_tuple_hash *i,
1188         int (*kill)(const struct ip_conntrack *i, void *data),
1189         void *data)
1190 {
1191         return kill(i->ctrack, data);
1192 }
1193
1194 /* Bring out ya dead! */
1195 static struct ip_conntrack_tuple_hash *
1196 get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
1197                 void *data, unsigned int *bucket)
1198 {
1199         struct ip_conntrack_tuple_hash *h = NULL;
1200
1201         READ_LOCK(&ip_conntrack_lock);
1202         for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
1203                 h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
1204                               struct ip_conntrack_tuple_hash *, kill, data);
1205         }
1206         if (h)
1207                 atomic_inc(&h->ctrack->ct_general.use);
1208         READ_UNLOCK(&ip_conntrack_lock);
1209
1210         return h;
1211 }
1212
1213 void
1214 ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
1215                         void *data)
1216 {
1217         struct ip_conntrack_tuple_hash *h;
1218         unsigned int bucket = 0;
1219
1220         while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
1221                 /* Time to push up daisies... */
1222                 if (del_timer(&h->ctrack->timeout))
1223                         death_by_timeout((unsigned long)h->ctrack);
1224                 /* ... else the timer will get him soon. */
1225
1226                 ip_conntrack_put(h->ctrack);
1227         }
1228 }
1229
1230 /* Fast function for those who don't want to parse /proc (and I don't
1231    blame them). */
1232 /* Reversing the socket's dst/src point of view gives us the reply
1233    mapping. */
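/*
 * Userspace retrieves the pre-NAT destination of a redirected TCP connection
 * with getsockopt() on the connected socket, e.g. (illustrative sketch):
 *
 *      struct sockaddr_in dst;
 *      socklen_t len = sizeof(dst);
 *      if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
 *              printf("%s:%u\n", inet_ntoa(dst.sin_addr), ntohs(dst.sin_port));
 */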
1234 static int
1235 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1236 {
1237         struct inet_opt *inet = inet_sk(sk);
1238         struct ip_conntrack_tuple_hash *h;
1239         struct ip_conntrack_tuple tuple;
1240         
1241         IP_CT_TUPLE_U_BLANK(&tuple);
1242         tuple.src.ip = inet->rcv_saddr;
1243         tuple.src.u.tcp.port = inet->sport;
1244         tuple.dst.ip = inet->daddr;
1245         tuple.dst.u.tcp.port = inet->dport;
1246         tuple.dst.protonum = IPPROTO_TCP;
1247
1248         /* We only do TCP at the moment: is there a better way? */
1249         if (strcmp(sk->sk_prot->name, "TCP")) {
1250                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1251                 return -ENOPROTOOPT;
1252         }
1253
1254         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1255                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1256                        *len, sizeof(struct sockaddr_in));
1257                 return -EINVAL;
1258         }
1259
1260         h = ip_conntrack_find_get(&tuple, NULL);
1261         if (h) {
1262                 struct sockaddr_in sin;
1263
1264                 sin.sin_family = AF_INET;
1265                 sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1266                         .tuple.dst.u.tcp.port;
1267                 sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1268                         .tuple.dst.ip;
1269
1270                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1271                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1272                 ip_conntrack_put(h->ctrack);
1273                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1274                         return -EFAULT;
1275                 else
1276                         return 0;
1277         }
1278         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1279                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1280                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1281         return -ENOENT;
1282 }
1283
1284 static struct nf_sockopt_ops so_getorigdst = {
1285         .pf             = PF_INET,
1286         .get_optmin     = SO_ORIGINAL_DST,
1287         .get_optmax     = SO_ORIGINAL_DST+1,
1288         .get            = &getorigdst,
1289 };
1290
1291 static int kill_all(const struct ip_conntrack *i, void *data)
1292 {
1293         return 1;
1294 }
1295
1296 /* Mishearing the voices in his head, our hero wonders how he's
1297    supposed to kill the mall. */
1298 void ip_conntrack_cleanup(void)
1299 {
1300         ip_ct_attach = NULL;
1301         /* This makes sure all current packets have passed through
1302            the netfilter framework.  Roll on, two-stage module
1303            delete... */
1304         synchronize_net();
1305  
1306  i_see_dead_people:
1307         ip_ct_selective_cleanup(kill_all, NULL);
1308         if (atomic_read(&ip_conntrack_count) != 0) {
1309                 schedule();
1310                 goto i_see_dead_people;
1311         }
1312
1313         kmem_cache_destroy(ip_conntrack_cachep);
1314         kmem_cache_destroy(ip_conntrack_expect_cachep);
1315         vfree(ip_conntrack_hash);
1316         nf_unregister_sockopt(&so_getorigdst);
1317 }
1318
1319 static int hashsize;
1320 module_param(hashsize, int, 0400);
1321
1322 int __init ip_conntrack_init(void)
1323 {
1324         unsigned int i;
1325         int ret;
1326
1327         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1328          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1329         if (hashsize) {
1330                 ip_conntrack_htable_size = hashsize;
1331         } else {
1332                 ip_conntrack_htable_size
1333                         = (((num_physpages << PAGE_SHIFT) / 16384)
1334                            / sizeof(struct list_head));
1335                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1336                         ip_conntrack_htable_size = 8192;
1337                 if (ip_conntrack_htable_size < 16)
1338                         ip_conntrack_htable_size = 16;
1339         }
1340         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1341
1342         printk("ip_conntrack version %s (%u buckets, %d max)"
1343                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1344                ip_conntrack_htable_size, ip_conntrack_max,
1345                sizeof(struct ip_conntrack));
1346
1347         ret = nf_register_sockopt(&so_getorigdst);
1348         if (ret != 0) {
1349                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1350                 return ret;
1351         }
1352
1353         ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1354                                     * ip_conntrack_htable_size);
1355         if (!ip_conntrack_hash) {
1356                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1357                 goto err_unreg_sockopt;
1358         }
1359
1360         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1361                                                 sizeof(struct ip_conntrack), 0,
1362                                                 SLAB_HWCACHE_ALIGN, NULL, NULL);
1363         if (!ip_conntrack_cachep) {
1364                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1365                 goto err_free_hash;
1366         }
1367
1368         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1369                                         sizeof(struct ip_conntrack_expect),
1370                                         0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1371         if (!ip_conntrack_expect_cachep) {
1372                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1373                 goto err_free_conntrack_slab;
1374         }
1375
1376         /* Don't NEED lock here, but good form anyway. */
1377         WRITE_LOCK(&ip_conntrack_lock);
1378         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1379                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1380         /* Sew in builtin protocols. */
1381         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1382         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1383         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1384         WRITE_UNLOCK(&ip_conntrack_lock);
1385
1386         for (i = 0; i < ip_conntrack_htable_size; i++)
1387                 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1388
1389         /* For use by ipt_REJECT */
1390         ip_ct_attach = ip_conntrack_attach;
1391
1392         /* Set up fake conntrack:
1393             - to never be deleted, not in any hashes */
1394         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1395         /*  - and make it look like a confirmed connection */
1396         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1397
1398         return ret;
1399
1400 err_free_conntrack_slab:
1401         kmem_cache_destroy(ip_conntrack_cachep);
1402 err_free_hash:
1403         vfree(ip_conntrack_hash);
1404 err_unreg_sockopt:
1405         nf_unregister_sockopt(&so_getorigdst);
1406
1407         return -ENOMEM;
1408 }