/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *      - new API and handling of conntrack/nat helpers
 *      - now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *      - add usage/reference counts to ip_conntrack_expect
 *      - export ip_conntrack[_expect]_{find_get,put} functions
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
/* For ERR_PTR().  Yeah, I know... --RR */
#include <linux/fs.h>

/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, and conntrack timers. */
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION    "2.1"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DECLARE_RWLOCK(ip_conntrack_lock);
DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
LIST_HEAD(protocol_list);
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
struct ip_conntrack ip_conntrack_untracked;

extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;

static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
                              u_int8_t protocol)
{
        return protocol == curr->proto;
}

struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        p = LIST_FIND(&protocol_list, proto_cmpfn,
                      struct ip_conntrack_protocol *, protocol);
        if (!p)
                p = &ip_conntrack_generic_protocol;

        return p;
}

struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        READ_LOCK(&ip_conntrack_lock);
        p = __ip_ct_find_proto(protocol);
        READ_UNLOCK(&ip_conntrack_lock);
        return p;
}

inline void
ip_conntrack_put(struct ip_conntrack *ct)
{
        IP_NF_ASSERT(ct);
        IP_NF_ASSERT(ct->infos[0].master);
        /* nf_conntrack_put wants to go via an info struct, so feed it
           one at random. */
        nf_conntrack_put(&ct->infos[0]);
}
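
/* Illustrative sketch (disabled; not part of the original file) of the
   reference-counting discipline ip_conntrack_put pairs with: every
   successful find_get takes a reference that the caller must drop.
   The function name is hypothetical. */
#if 0
static void example_lookup_and_put(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_tuple_hash *h;

        h = ip_conntrack_find_get(tuple, NULL); /* takes a reference */
        if (!h)
                return;
        /* ... inspect h->ctrack while the reference pins it ... */
        ip_conntrack_put(h->ctrack);            /* balances find_get */
}
#endif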

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
#if 0
        dump_tuple(tuple);
#endif
        return (jhash_3words(tuple->src.ip,
                             (tuple->dst.ip ^ tuple->dst.protonum),
                             (tuple->src.u.all | (tuple->dst.u.all << 16)),
                             ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}
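
/* Disabled sketch of how the hash above is used: a tuple always maps
   to the same one of ip_conntrack_htable_size buckets (modulo the
   boot-time random seed), and lookups walk only that chain.  The
   addresses and ports below are made up for illustration. */
#if 0
static void example_hash_usage(void)
{
        struct ip_conntrack_tuple t;

        memset(&t, 0, sizeof(t));
        t.src.ip = htonl(0xC0A80001);           /* 192.168.0.1 */
        t.dst.ip = htonl(0xC0A80002);           /* 192.168.0.2 */
        t.dst.protonum = IPPROTO_TCP;
        t.src.u.tcp.port = htons(1024);
        t.dst.u.tcp.port = htons(80);

        DEBUGP("tuple hashes to bucket %u of %u\n",
               hash_conntrack(&t), ip_conntrack_htable_size);
}
#endif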

int
get_tuple(const struct iphdr *iph,
          const struct sk_buff *skb,
          unsigned int dataoff,
          struct ip_conntrack_tuple *tuple,
          const struct ip_conntrack_protocol *protocol)
{
        /* Should never happen */
        if (iph->frag_off & htons(IP_OFFSET)) {
                printk("ip_conntrack_core: Frag of proto %u.\n",
                       iph->protocol);
                return 0;
        }

        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;

        return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

static int
invert_tuple(struct ip_conntrack_tuple *inverse,
             const struct ip_conntrack_tuple *orig,
             const struct ip_conntrack_protocol *protocol)
{
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;

        return protocol->invert_tuple(inverse, orig);
}


/* ip_conntrack_expect helper functions */

/* Compare tuple parts depending on mask. */
static inline int expect_cmp(const struct ip_conntrack_expect *i,
                             const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
}

static void
destroy_expect(struct ip_conntrack_expect *exp)
{
        DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
        IP_NF_ASSERT(atomic_read(&exp->use));
        IP_NF_ASSERT(!timer_pending(&exp->timeout));

        kfree(exp);
}


inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
        IP_NF_ASSERT(exp);

        if (atomic_dec_and_test(&exp->use)) {
                /* usage count dropped to zero */
                destroy_expect(exp);
        }
}

static inline struct ip_conntrack_expect *
__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                         struct ip_conntrack_expect *, tuple);
}

/* Find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *exp;

        READ_LOCK(&ip_conntrack_lock);
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        exp = __ip_ct_expect_find(tuple);
        if (exp)
                atomic_inc(&exp->use);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
        READ_UNLOCK(&ip_conntrack_lock);

        return exp;
}
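
/* Disabled sketch of the matching put for the get above: exp->use is
   bumped under the locks, so a caller must drop the reference when
   done (the last put frees the expectation via destroy_expect). */
#if 0
static void example_expect_lookup(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *exp;

        exp = ip_conntrack_expect_find_get(tuple);
        if (!exp)
                return;
        /* ... use exp ... */
        ip_conntrack_expect_put(exp);
}
#endif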

/* remove one specific expectation from all lists and drop refcount,
 * does _NOT_ delete the timer. */
static void __unexpect_related(struct ip_conntrack_expect *expect)
{
        DEBUGP("unexpect_related(%p)\n", expect);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        /* we're not allowed to unexpect a confirmed expectation! */
        IP_NF_ASSERT(!expect->sibling);

        /* delete from global and local lists */
        list_del(&expect->list);
        list_del(&expect->expected_list);

        /* decrement expect-count of master conntrack */
        if (expect->expectant)
                expect->expectant->expecting--;

        ip_conntrack_expect_put(expect);
}

/* remove one specific expectation from all lists, drop refcount
 * and expire timer.
 * This function can _NOT_ be called for confirmed expects! */
static void unexpect_related(struct ip_conntrack_expect *expect)
{
        IP_NF_ASSERT(expect->expectant);
        IP_NF_ASSERT(expect->expectant->helper);
        /* if we are supposed to have a timer, but we can't delete
         * it: race condition.  __unexpect_related will
         * be called by the timeout function */
        if (expect->expectant->helper->timeout
            && !del_timer(&expect->timeout))
                return;

        __unexpect_related(expect);
}

/* delete all unconfirmed expectations for this conntrack */
static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
{
        struct list_head *exp_entry, *next;
        struct ip_conntrack_expect *exp;

        DEBUGP("remove_expectations(%p)\n", ct);

        list_for_each_safe(exp_entry, next, &ct->sibling_list) {
                exp = list_entry(exp_entry, struct ip_conntrack_expect,
                                 expected_list);

                /* we skip established expectations, as we want to delete
                 * the un-established ones only */
                if (exp->sibling) {
                        DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
                        if (drop_refcount) {
                                /* Indicate that this expectation's parent is dead */
                                ip_conntrack_put(exp->expectant);
                                exp->expectant = NULL;
                        }
                        continue;
                }

                IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
                IP_NF_ASSERT(exp->expectant == ct);

                /* delete expectation from global and private lists */
                unexpect_related(exp);
        }
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
        unsigned int ho, hr;

        DEBUGP("clean_from_lists(%p)\n", ct);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
        LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

        /* Destroy all un-established, pending expectations */
        remove_expectations(ct, 1);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
        struct ip_conntrack_protocol *proto;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
        proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);

        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* Delete us from our own list to prevent corruption later */
        list_del(&ct->sibling_list);

        /* Delete our master expectation */
        if (ct->master) {
                if (ct->master->expectant) {
                        /* can't call __unexpect_related here,
                         * since it would screw up expect_list */
                        list_del(&ct->master->expected_list);
                        master = ct->master->expectant;
                }
                kfree(ct->master);
        }
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (master)
                ip_conntrack_put(master);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        kmem_cache_free(ip_conntrack_cachep, ct);
        atomic_dec(&ip_conntrack_count);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
        struct ip_conntrack *ct = (void *)ul_conntrack;

        WRITE_LOCK(&ip_conntrack_lock);
        clean_from_lists(ct);
        WRITE_UNLOCK(&ip_conntrack_lock);
        ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
                    const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        return i->ctrack != ignored_conntrack
                && ip_ct_tuple_equal(tuple, &i->tuple);
}

static struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int hash = hash_conntrack(tuple);

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        h = LIST_FIND(&ip_conntrack_hash[hash],
                      conntrack_tuple_cmp,
                      struct ip_conntrack_tuple_hash *,
                      tuple, ignored_conntrack);
        return h;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
                      const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

static inline struct ip_conntrack *
__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack *ct
                = (struct ip_conntrack *)nfct->master;

        /* ctinfo is the index of the nfct inside the conntrack */
        *ctinfo = nfct - ct->infos;
        IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
        return ct;
}

/* Return conntrack and conntrack_info given skb->nfct->master */
struct ip_conntrack *
ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
        if (skb->nfct)
                return __ip_conntrack_get(skb->nfct, ctinfo);
        return NULL;
}

/* Confirm a connection given skb->nfct; places it in hash table */
int
__ip_conntrack_confirm(struct nf_ct_info *nfct)
{
        unsigned int hash, repl_hash;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in other direction.  Actual packet
           which created connection will be IP_CT_NEW or for an
           expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost race. */
        if (!LIST_FIND(&ip_conntrack_hash[hash],
                       conntrack_tuple_cmp,
                       struct ip_conntrack_tuple_hash *,
                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
            && !LIST_FIND(&ip_conntrack_hash[repl_hash],
                          conntrack_tuple_cmp,
                          struct ip_conntrack_tuple_hash *,
                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
                list_prepend(&ip_conntrack_hash[hash],
                             &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
                list_prepend(&ip_conntrack_hash[repl_hash],
                             &ct->tuplehash[IP_CT_DIR_REPLY]);
                /* Timer relative to confirmation time, not original
                   setting time, otherwise we'd get timer wrap in
                   weird delay cases. */
                ct->timeout.expires += jiffies;
                add_timer(&ct->timeout);
                atomic_inc(&ct->ct_general.use);
                set_bit(IPS_CONFIRMED_BIT, &ct->status);
                WRITE_UNLOCK(&ip_conntrack_lock);
                return NF_ACCEPT;
        }

        WRITE_UNLOCK(&ip_conntrack_lock);
        return NF_DROP;
}

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        READ_UNLOCK(&ip_conntrack_lock);

        return h != NULL;
}

/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
struct ip_conntrack *
icmp_error_track(struct sk_buff *skb,
                 enum ip_conntrack_info *ctinfo,
                 unsigned int hooknum)
{
        struct ip_conntrack_tuple innertuple, origtuple;
        struct {
                struct icmphdr icmp;
                struct iphdr ip;
        } inside;
        struct ip_conntrack_protocol *innerproto;
        struct ip_conntrack_tuple_hash *h;
        int dataoff;

        IP_NF_ASSERT(skb->nfct == NULL);

        /* Not enough header? */
        if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
                return NULL;

        if (inside.icmp.type != ICMP_DEST_UNREACH
            && inside.icmp.type != ICMP_SOURCE_QUENCH
            && inside.icmp.type != ICMP_TIME_EXCEEDED
            && inside.icmp.type != ICMP_PARAMETERPROB
            && inside.icmp.type != ICMP_REDIRECT)
                return NULL;

        /* Ignore ICMPs containing fragments (shouldn't happen) */
        if (inside.ip.frag_off & htons(IP_OFFSET)) {
                DEBUGP("icmp_error_track: fragment of proto %u\n",
                       inside.ip.protocol);
                return NULL;
        }

        innerproto = ip_ct_find_proto(inside.ip.protocol);
        dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
        /* Are they talking about one of our connections? */
        if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
                DEBUGP("icmp_error_track: ! get_tuple p=%u\n",
                       inside.ip.protocol);
                return NULL;
        }

        /* Ordinarily, we'd expect the inverted tupleproto, but it's
           been preserved inside the ICMP. */
        if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
                DEBUGP("icmp_error_track: Can't invert tuple\n");
                return NULL;
        }

        *ctinfo = IP_CT_RELATED;

        h = ip_conntrack_find_get(&innertuple, NULL);
        if (!h) {
                /* Locally generated ICMPs will match inverted if they
                   haven't been SNAT'ed yet */
                /* FIXME: NAT code has to handle half-done double NAT --RR */
                if (hooknum == NF_IP_LOCAL_OUT)
                        h = ip_conntrack_find_get(&origtuple, NULL);

                if (!h) {
                        DEBUGP("icmp_error_track: no match\n");
                        return NULL;
                }
                /* Reverse direction from that found */
                if (DIRECTION(h) != IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        } else {
                if (DIRECTION(h) == IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        }

        /* Update skb to refer to this connection */
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
        return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
}

static int early_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        int dropped = 0;

        READ_LOCK(&ip_conntrack_lock);
        h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        if (!h)
                return dropped;

        if (del_timer(&h->ctrack->timeout)) {
                death_by_timeout((unsigned long)h->ctrack);
                dropped = 1;
        }
        ip_conntrack_put(h->ctrack);
        return dropped;
}

static inline int helper_cmp(const struct ip_conntrack_helper *i,
                             const struct ip_conntrack_tuple *rtuple)
{
        return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
{
        return LIST_FIND(&helpers, helper_cmp,
                         struct ip_conntrack_helper *,
                         tuple);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
               struct ip_conntrack_protocol *protocol,
               struct sk_buff *skb)
{
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        size_t hash;
        struct ip_conntrack_expect *expected;
        int i;
        static unsigned int drop_next;

        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
                ip_conntrack_hash_rnd_initted = 1;
        }

        hash = hash_conntrack(tuple);

        if (ip_conntrack_max &&
            atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
                /* Try dropping from random chain, or else from the
                   chain we're about to insert into (in case they're
                   trying to bomb one hash chain). */
                unsigned int next = (drop_next++)%ip_conntrack_htable_size;

                if (!early_drop(&ip_conntrack_hash[next])
                    && !early_drop(&ip_conntrack_hash[hash])) {
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        if (!invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }

        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
                return ERR_PTR(-ENOMEM);
        }

        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
        conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
        for (i=0; i < IP_CT_NUMBER; i++)
                conntrack->infos[i].master = &conntrack->ct_general;

        if (!protocol->new(conntrack, skb)) {
                kmem_cache_free(ip_conntrack_cachep, conntrack);
                return NULL;
        }
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;

        INIT_LIST_HEAD(&conntrack->sibling_list);

        WRITE_LOCK(&ip_conntrack_lock);
        /* Need finding and deleting of expected ONLY if we win race */
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                             struct ip_conntrack_expect *, tuple);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);

        /* If master is not in hash table yet (ie. packet hasn't left
           this machine yet), how can other end know about expected?
           Hence these are not the droids you are looking for (if
           master ct never got confirmed, we'd hold a reference to it
           and weird things would happen to future packets). */
        if (expected && !is_confirmed(expected->expectant))
                expected = NULL;

        /* Look up the conntrack helper for master connections only */
        if (!expected)
                conntrack->helper = ip_ct_find_helper(&repl_tuple);

        /* If the expectation is dying, then this is a loser. */
        if (expected
            && expected->expectant->helper->timeout
            && ! del_timer(&expected->timeout))
                expected = NULL;

        if (expected) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                        conntrack, expected);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                IP_NF_ASSERT(master_ct(conntrack));
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
                conntrack->master = expected;
                expected->sibling = conntrack;
                LIST_DELETE(&ip_conntrack_expect_list, expected);
                expected->expectant->expecting--;
                nf_conntrack_get(&master_ct(conntrack)->infos[0]);
        }
        atomic_inc(&ip_conntrack_count);
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (expected && expected->expectfn)
                expected->expectfn(conntrack);
        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
                  struct ip_conntrack_protocol *proto,
                  int *set_reply,
                  unsigned int hooknum,
                  enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;

        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

        if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
                return NULL;

        /* look for tuple match */
        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, proto, skb);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }

        /* It exists; we have (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Please set reply bit if this packet OK */
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
                if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
                             struct sk_buff **pskb,
                             const struct net_device *in,
                             const struct net_device *out,
                             int (*okfn)(struct sk_buff *))
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
        int set_reply;
        int ret;

        /* Should never happen */
        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit()) {
                        printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
                               (*pskb)->nh.iph->protocol, hooknum);
                }
                return NF_DROP;
        }

        /* FIXME: Do this right please. --RR */
        (*pskb)->nfcache |= NFC_UNKNOWN;

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
        /* Ignore broadcast: no `connection'. */
        if ((*pskb)->pkt_type == PACKET_BROADCAST) {
                printk("Broadcast packet!\n");
                return NF_ACCEPT;
        } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
                   == htonl(0x000000FF)) {
                printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
                       NIPQUAD((*pskb)->nh.iph->saddr),
                       NIPQUAD((*pskb)->nh.iph->daddr),
                       (*pskb)->sk, (*pskb)->pkt_type);
        }
#endif

        /* Previously seen (loopback or untracked)?  Ignore. */
        if ((*pskb)->nfct)
                return NF_ACCEPT;

        proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);

        /* It may be an icmp error... */
        if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
            && icmp_error_track(*pskb, &ctinfo, hooknum))
                return NF_ACCEPT;

        if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo)))
                /* Not valid part of a connection */
                return NF_ACCEPT;

        if (IS_ERR(ct))
                /* Too stressed to deal. */
                return NF_DROP;

        IP_NF_ASSERT((*pskb)->nfct);

        ret = proto->packet(ct, *pskb, ctinfo);
        if (ret == -1) {
                /* Invalid */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                return NF_ACCEPT;
        }

        if (ret != NF_DROP && ct->helper) {
                ret = ct->helper->help(*pskb, ct, ctinfo);
                if (ret == -1) {
                        /* Invalid */
                        nf_conntrack_put((*pskb)->nfct);
                        (*pskb)->nfct = NULL;
                        return NF_ACCEPT;
                }
        }
        if (set_reply)
                set_bit(IPS_SEEN_REPLY_BIT, &ct->status);

        return ret;
}
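
/* Disabled sketch of how ip_conntrack_in is wired into netfilter.  The
   real registration lives outside this file (ip_conntrack_standalone.c);
   the ops below only illustrate the usual pre-routing placement. */
#if 0
static struct nf_hook_ops example_conntrack_in_ops = {
        .hook           = ip_conntrack_in,
        .pf             = PF_INET,
        .hooknum        = NF_IP_PRE_ROUTING,
        .priority       = NF_IP_PRI_CONNTRACK,
};
/* nf_register_hook(&example_conntrack_in_ops); */
#endif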

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
{
        return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
}

static inline int resent_expect(const struct ip_conntrack_expect *i,
                                const struct ip_conntrack_tuple *tuple,
                                const struct ip_conntrack_tuple *mask)
{
        DEBUGP("resent_expect\n");
        DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
        DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
        DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
        return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
                 || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
                && ip_ct_tuple_equal(&i->mask, mask));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *i,
                               const struct ip_conntrack_tuple *tuple,
                               const struct ip_conntrack_tuple *mask)
{
        /* Part covered by intersection of masks must be unequal,
           otherwise they clash */
        struct ip_conntrack_tuple intersect_mask
                = { { i->mask.src.ip & mask->src.ip,
                      { i->mask.src.u.all & mask->src.u.all } },
                    { i->mask.dst.ip & mask->dst.ip,
                      { i->mask.dst.u.all & mask->dst.u.all },
                      i->mask.dst.protonum & mask->dst.protonum } };

        return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
}

inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
{
        WRITE_LOCK(&ip_conntrack_lock);
        unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

static void expectation_timed_out(unsigned long ul_expect)
{
        struct ip_conntrack_expect *expect = (void *) ul_expect;

        DEBUGP("expectation %p timed out\n", expect);
        WRITE_LOCK(&ip_conntrack_lock);
        __unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

struct ip_conntrack_expect *
ip_conntrack_expect_alloc(void)
{
        struct ip_conntrack_expect *new;

        new = (struct ip_conntrack_expect *)
                kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }

        /* tuple_cmp compares the whole union, so we have to initialize
           it cleanly */
        memset(new, 0, sizeof(struct ip_conntrack_expect));

        return new;
}

static void
ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
                           struct ip_conntrack *related_to)
{
        DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
        new->expectant = related_to;
        new->sibling = NULL;
        atomic_set(&new->use, 1);

        /* add to expected list for this connection */
        list_add(&new->expected_list, &related_to->sibling_list);
        /* add to global list of expectations */
        list_prepend(&ip_conntrack_expect_list, &new->list);
        /* add and start timer if required */
        if (related_to->helper->timeout) {
                init_timer(&new->timeout);
                new->timeout.data = (unsigned long)new;
                new->timeout.function = expectation_timed_out;
                new->timeout.expires = jiffies +
                                        related_to->helper->timeout * HZ;
                add_timer(&new->timeout);
        }
        related_to->expecting++;
}

/* Add a related connection. */
int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
                                struct ip_conntrack *related_to)
{
        struct ip_conntrack_expect *old;
        int ret = 0;

        WRITE_LOCK(&ip_conntrack_lock);
        /* Because of the write lock, no reader can walk the lists,
         * so there is no need to use the tuple lock too */

        DEBUGP("ip_conntrack_expect_related %p\n", related_to);
        DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

        old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
                        struct ip_conntrack_expect *, &expect->tuple,
                        &expect->mask);
        if (old) {
                /* Helper private data may contain offsets but no pointers
                   pointing into the payload - otherwise we would have to copy
                   the data filled out by the helper over the old one */
                DEBUGP("expect_related: resent packet\n");
                if (related_to->helper->timeout) {
                        if (!del_timer(&old->timeout)) {
                                /* expectation is dying. Fall through */
                                goto out;
                        } else {
                                old->timeout.expires = jiffies +
                                        related_to->helper->timeout * HZ;
                                add_timer(&old->timeout);
                        }
                }

                WRITE_UNLOCK(&ip_conntrack_lock);
                kfree(expect);
                return -EEXIST;

        } else if (related_to->helper->max_expected &&
                   related_to->expecting >= related_to->helper->max_expected) {
                struct list_head *cur_item;
                /* old == NULL */
                if (!(related_to->helper->flags &
                      IP_CT_HELPER_F_REUSE_EXPECT)) {
                        WRITE_UNLOCK(&ip_conntrack_lock);
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: max number of expected "
                                       "connections %i of %s reached for "
                                       "%u.%u.%u.%u->%u.%u.%u.%u\n",
                                       related_to->helper->max_expected,
                                       related_to->helper->name,
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
                        kfree(expect);
                        return -EPERM;
                }
                DEBUGP("ip_conntrack: max number of expected "
                       "connections %i of %s reached for "
                       "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
                       related_to->helper->max_expected,
                       related_to->helper->name,
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));

                /* choose the oldest expectation to evict */
                list_for_each(cur_item, &related_to->sibling_list) {
                        struct ip_conntrack_expect *cur;

                        cur = list_entry(cur_item,
                                         struct ip_conntrack_expect,
                                         expected_list);
                        if (cur->sibling == NULL) {
                                old = cur;
                                break;
                        }
                }

                /* (!old) cannot happen, since related_to->expecting is the
                 * number of unconfirmed expects */
                IP_NF_ASSERT(old);

                /* newnat14 does not reuse the real allocated memory
                 * structures but rather unexpects the old one and
                 * allocates a new one.  unexpect_related will decrement
                 * related_to->expecting.
                 */
                unexpect_related(old);
                ret = -EPERM;
        } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                             struct ip_conntrack_expect *, &expect->tuple,
                             &expect->mask)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                DEBUGP("expect_related: busy!\n");

                kfree(expect);
                return -EBUSY;
        }

out:    ip_conntrack_expect_insert(expect, related_to);

        WRITE_UNLOCK(&ip_conntrack_lock);

        return ret;
}
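
/* Disabled sketch of the typical caller of the function above: a
   helper allocates an expectation, fills in tuple and mask (the data
   channel parameters here are hypothetical), then hands ownership to
   ip_conntrack_expect_related, which kfree()s it on clash or resend. */
#if 0
static int example_expect_data_channel(struct ip_conntrack *master,
                                       u_int32_t peer_ip, u_int16_t port)
{
        struct ip_conntrack_expect *exp;

        exp = ip_conntrack_expect_alloc();
        if (!exp)
                return -ENOMEM;

        /* Expect TCP from our original source to peer_ip:port... */
        exp->tuple.src.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
        exp->tuple.dst.ip = peer_ip;
        exp->tuple.dst.u.tcp.port = htons(port);
        exp->tuple.dst.protonum = IPPROTO_TCP;

        /* ... from any source port: the mask pins everything else. */
        exp->mask.src.ip = 0xFFFFFFFF;
        exp->mask.dst.ip = 0xFFFFFFFF;
        exp->mask.dst.u.tcp.port = 0xFFFF;
        exp->mask.dst.protonum = 0xFF;

        return ip_conntrack_expect_related(exp, master);
}
#endif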

/* Change tuple in an existing expectation */
int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
                               struct ip_conntrack_tuple *newtuple)
{
        int ret;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        WRITE_LOCK(&ip_conntrack_expect_tuple_lock);

        DEBUGP("change_expect:\n");
        DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("exp mask:  "); DUMP_TUPLE(&expect->mask);
        DEBUGP("newtuple:  "); DUMP_TUPLE(newtuple);
        if (expect->ct_tuple.dst.protonum == 0) {
                /* Never seen before */
                DEBUGP("change expect: never seen before\n");
                if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
                    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                                 struct ip_conntrack_expect *, newtuple, &expect->mask)) {
                        /* Force NAT to find an unused tuple */
                        ret = -1;
                } else {
                        memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
                        memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
                        ret = 0;
                }
        } else {
                /* Resent packet */
                DEBUGP("change expect: resent packet\n");
                if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
                        ret = 0;
                } else {
                        /* Force NAT to choose the same port again */
                        ret = -1;
                }
        }
        WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);

        return ret;
}

/* Alter reply tuple (maybe alter helper).  If it's already taken,
   return 0 and don't do alteration. */
int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
                             const struct ip_conntrack_tuple *newreply)
{
        WRITE_LOCK(&ip_conntrack_lock);
        if (__ip_conntrack_find(newreply, conntrack)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                return 0;
        }
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));

        DEBUGP("Altering reply tuple of %p to ", conntrack);
        DUMP_TUPLE(newreply);

        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (!conntrack->master)
                conntrack->helper = LIST_FIND(&helpers, helper_cmp,
                                              struct ip_conntrack_helper *,
                                              newreply);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 1;
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
        WRITE_LOCK(&ip_conntrack_lock);
        list_prepend(&helpers, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 0;
}
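
/* Disabled sketch of a module registering a helper.  ip_ct_find_helper
   matches a new connection's reply tuple against each helper's
   tuple/mask pair; the name, port, and timeout below are hypothetical. */
#if 0
static int example_help(struct sk_buff *skb,
                        struct ip_conntrack *ct,
                        enum ip_conntrack_info ctinfo)
{
        /* ... parse payload, perhaps ip_conntrack_expect_related() ... */
        return NF_ACCEPT;
}

static struct ip_conntrack_helper example_helper = {
        .name           = "example",
        .max_expected   = 1,
        .timeout        = 180,          /* seconds */
        .tuple          = { .dst = { .protonum = IPPROTO_TCP,
                                     .u = { .tcp = { .port = __constant_htons(1234) } } } },
        .mask           = { .dst = { .protonum = 0xFF,
                                     .u = { .tcp = { .port = 0xFFFF } } } },
        .help           = example_help,
};
/* ip_conntrack_helper_register(&example_helper); */
#endif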

static inline int unhelp(struct ip_conntrack_tuple_hash *i,
                         const struct ip_conntrack_helper *me)
{
        if (i->ctrack->helper == me) {
                /* Get rid of any expected. */
                remove_expectations(i->ctrack, 0);
                /* And *then* set helper to NULL */
                i->ctrack->helper = NULL;
        }
        return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
        unsigned int i;

        /* Need write lock here, to delete helper. */
        WRITE_LOCK(&ip_conntrack_lock);
        LIST_DELETE(&helpers, me);

        /* Get rid of expecteds, set helpers to NULL. */
        for (i = 0; i < ip_conntrack_htable_size; i++)
                LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
                            struct ip_conntrack_tuple_hash *, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        /* Someone could be still looking at the helper in a bh. */
        synchronize_net();
}

/* Refresh conntrack for this many jiffies. */
void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
{
        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

        /* If not in hash table, timer will not be active yet */
        if (!is_confirmed(ct))
                ct->timeout.expires = extra_jiffies;
        else {
                WRITE_LOCK(&ip_conntrack_lock);
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                }
                WRITE_UNLOCK(&ip_conntrack_lock);
        }
}
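
/* Disabled sketch: a protocol's packet() handler is the usual caller,
   pushing the timeout back out on every packet of a live connection.
   The 30-second figure is an arbitrary stand-in for a per-protocol
   timeout constant. */
#if 0
static int example_packet(struct ip_conntrack *ct,
                          const struct sk_buff *skb,
                          enum ip_conntrack_info ctinfo)
{
        ip_ct_refresh(ct, 30 * HZ);
        return NF_ACCEPT;
}
#endif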

/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
#ifdef CONFIG_NETFILTER_DEBUG
        unsigned int olddebug = skb->nf_debug;
#endif
        if (sk) {
                sock_hold(sk);
                skb_orphan(skb);
        }

        local_bh_disable();
        skb = ip_defrag(skb);
        local_bh_enable();

        if (!skb) {
                if (sk)
                        sock_put(sk);
                return skb;
        }

        if (sk) {
                skb_set_owner_w(skb, sk);
                sock_put(sk);
        }

        ip_send_check(skb->nh.iph);
        skb->nfcache |= NFC_ALTERED;
#ifdef CONFIG_NETFILTER_DEBUG
        /* Packet path as if nothing had happened. */
        skb->nf_debug = olddebug;
#endif
        return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* This ICMP is in reverse direction to the packet which
           caused it */
        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach new skbuff, and increment count */
        nskb->nfct = &ct->infos[ctinfo];
        atomic_inc(&ct->ct_general.use);
}

static inline int
do_kill(const struct ip_conntrack_tuple_hash *i,
        int (*kill)(const struct ip_conntrack *i, void *data),
        void *data)
{
        return kill(i->ctrack, data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
                void *data, unsigned int *bucket)
{
        struct ip_conntrack_tuple_hash *h = NULL;

        READ_LOCK(&ip_conntrack_lock);
        for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
                h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
                              struct ip_conntrack_tuple_hash *, kill, data);
        }
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

void
ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
                        void *data)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int bucket = 0;

        while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
                /* Time to push up daisies... */
                if (del_timer(&h->ctrack->timeout))
                        death_by_timeout((unsigned long)h->ctrack);
                /* ... else the timer will get him soon. */

                ip_conntrack_put(h->ctrack);
        }
}
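
/* Disabled sketch of a selective-cleanup predicate: kill_all below
   returns 1 unconditionally, while a filter like this hypothetical one
   would flush only connections from a given source address. */
#if 0
static int example_kill_by_saddr(const struct ip_conntrack *i, void *data)
{
        u_int32_t saddr = *(u_int32_t *)data;

        return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == saddr;
}
/* ip_ct_selective_cleanup(example_kill_by_saddr, &saddr); */
#endif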

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void *user, int *len)
{
        struct inet_opt *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;

        IP_CT_TUPLE_U_BLANK(&tuple);
        tuple.src.ip = inet->rcv_saddr;
        tuple.src.u.tcp.port = inet->sport;
        tuple.dst.ip = inet->daddr;
        tuple.dst.u.tcp.port = inet->dport;
        tuple.dst.protonum = IPPROTO_TCP;

        /* We only do TCP at the moment: is there a better way? */
        if (strcmp(sk->sk_prot->name, "TCP")) {
                DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
                return -ENOPROTOOPT;
        }

        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                DEBUGP("SO_ORIGINAL_DST: len %u not %Zu\n",
                       *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }

        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;

                sin.sin_family = AF_INET;
                sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
                sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;

                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
                ip_conntrack_put(h->ctrack);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
                        return 0;
        }
        DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
               NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
               NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
        return -ENOENT;
}

static struct nf_sockopt_ops so_getorigdst = {
        .pf             = PF_INET,
        .get_optmin     = SO_ORIGINAL_DST,
        .get_optmax     = SO_ORIGINAL_DST+1,
        .get            = &getorigdst,
};
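
/* Disabled sketch (userspace, not kernel code): how a transparent
   proxy would retrieve the pre-NAT destination of an accepted
   connection through the sockopt registered above. */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv4.h>       /* SO_ORIGINAL_DST */

static int example_get_original_dst(int fd, struct sockaddr_in *dst)
{
        socklen_t len = sizeof(*dst);

        /* Fills *dst with the destination the client originally dialed. */
        return getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, dst, &len);
}
#endif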

static int kill_all(const struct ip_conntrack *i, void *data)
{
        return 1;
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
        ip_ct_attach = NULL;
        /* This makes sure all current packets have passed through
           netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

 i_see_dead_people:
        ip_ct_selective_cleanup(kill_all, NULL);
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }

        kmem_cache_destroy(ip_conntrack_cachep);
        vfree(ip_conntrack_hash);
        nf_unregister_sockopt(&so_getorigdst);
}

static int hashsize;
MODULE_PARM(hashsize, "i");

int __init ip_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: a 32MB
         * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
        if (hashsize) {
                ip_conntrack_htable_size = hashsize;
        } else {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        ip_conntrack_htable_size = 8192;
                if (ip_conntrack_htable_size < 16)
                        ip_conntrack_htable_size = 16;
        }
        ip_conntrack_max = 8 * ip_conntrack_htable_size;

        printk("ip_conntrack version %s (%u buckets, %d max)"
               " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
               ip_conntrack_htable_size, ip_conntrack_max,
               sizeof(struct ip_conntrack));

        ret = nf_register_sockopt(&so_getorigdst);
        if (ret != 0) {
                printk(KERN_ERR "Unable to register netfilter socket option\n");
                return ret;
        }

        ip_conntrack_hash = vmalloc(sizeof(struct list_head)
                                    * ip_conntrack_htable_size);
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
        }

        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }
        /* Don't NEED lock here, but good form anyway. */
        WRITE_LOCK(&ip_conntrack_lock);
        /* Sew in builtin protocols. */
        list_append(&protocol_list, &ip_conntrack_protocol_tcp);
        list_append(&protocol_list, &ip_conntrack_protocol_udp);
        list_append(&protocol_list, &ip_conntrack_protocol_icmp);
        WRITE_UNLOCK(&ip_conntrack_lock);

        for (i = 0; i < ip_conntrack_htable_size; i++)
                INIT_LIST_HEAD(&ip_conntrack_hash[i]);

        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;

        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and make it look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
        /*  - and prepare the ctinfo field for REJECT & NAT. */
        ip_conntrack_untracked.infos[IP_CT_NEW].master =
        ip_conntrack_untracked.infos[IP_CT_RELATED].master =
        ip_conntrack_untracked.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master =
                        &ip_conntrack_untracked.ct_general;

        return ret;

err_free_hash:
        vfree(ip_conntrack_hash);
err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);

        return -ENOMEM;
}