/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *      - new API and handling of conntrack/nat helpers
 *      - now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *      - add usage/reference counts to ip_conntrack_expect
 *      - export ip_conntrack[_expect]_{find_get,put} functions
 * */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
/* For ERR_PTR().  Yeah, I know... --RR */
#include <linux/fs.h>

/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, and conntrack timers. */
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION    "2.1"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif
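
/* Flipping the "#if 0" above to "#if 1" compiles in the verbose
 * DEBUGP() tracing via printk; when it is off, DEBUGP() expands to
 * nothing and its arguments are never evaluated. */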

DECLARE_RWLOCK(ip_conntrack_lock);
DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
LIST_HEAD(protocol_list);
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
struct ip_conntrack ip_conntrack_untracked;

extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;

static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
                              u_int8_t protocol)
{
        return protocol == curr->proto;
}

struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        p = LIST_FIND(&protocol_list, proto_cmpfn,
                      struct ip_conntrack_protocol *, protocol);
        if (!p)
                p = &ip_conntrack_generic_protocol;

        return p;
}

struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        READ_LOCK(&ip_conntrack_lock);
        p = __ip_ct_find_proto(protocol);
        READ_UNLOCK(&ip_conntrack_lock);
        return p;
}

inline void
ip_conntrack_put(struct ip_conntrack *ct)
{
        IP_NF_ASSERT(ct);
        IP_NF_ASSERT(ct->infos[0].master);
        /* nf_conntrack_put wants to go via an info struct, so feed it
           one at random. */
        nf_conntrack_put(&ct->infos[0]);
}
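
/* Reference-counting sketch: every skb->nfct points at one of the
 * ct->infos[] slots, and each slot's ->master points back at
 * ct->ct_general, which holds the single shared use count.  So
 * nf_conntrack_get()/nf_conntrack_put() on any info slot pin or
 * release the whole conntrack; ip_conntrack_put() above is merely a
 * convenience wrapper that picks slot 0. */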

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
#if 0
        dump_tuple(tuple);
#endif
        return (jhash_3words(tuple->src.ip,
                             (tuple->dst.ip ^ tuple->dst.protonum),
                             (tuple->src.u.all | (tuple->dst.u.all << 16)),
                             ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}
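
/* Note that the original and reply tuples of one connection generally
 * hash to two different buckets, so every confirmed conntrack is
 * linked into the table twice, once per tuplehash[] entry (see
 * clean_from_lists() and __ip_conntrack_confirm()). */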

int
get_tuple(const struct iphdr *iph,
          const struct sk_buff *skb,
          unsigned int dataoff,
          struct ip_conntrack_tuple *tuple,
          const struct ip_conntrack_protocol *protocol)
{
        /* Never happens */
        if (iph->frag_off & htons(IP_OFFSET)) {
                printk("ip_conntrack_core: Frag of proto %u.\n",
                       iph->protocol);
                return 0;
        }

        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;

        return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

static int
invert_tuple(struct ip_conntrack_tuple *inverse,
             const struct ip_conntrack_tuple *orig,
             const struct ip_conntrack_protocol *protocol)
{
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;

        return protocol->invert_tuple(inverse, orig);
}
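
/* For example, an original TCP tuple 192.168.0.2:1030 -> 10.0.0.1:21
 * inverts to 10.0.0.1:21 -> 192.168.0.2:1030: the address and protocol
 * halves are swapped here, the port halves by the protocol's
 * invert_tuple() callback. */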


/* ip_conntrack_expect helper functions */

/* Compare tuple parts depending on mask. */
static inline int expect_cmp(const struct ip_conntrack_expect *i,
                             const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
}

static void
destroy_expect(struct ip_conntrack_expect *exp)
{
        DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
        IP_NF_ASSERT(atomic_read(&exp->use));
        IP_NF_ASSERT(!timer_pending(&exp->timeout));

        kfree(exp);
}


inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
        IP_NF_ASSERT(exp);

        if (atomic_dec_and_test(&exp->use)) {
                /* usage count dropped to zero */
                destroy_expect(exp);
        }
}

static inline struct ip_conntrack_expect *
__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                         struct ip_conntrack_expect *, tuple);
}

/* Find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *exp;

        READ_LOCK(&ip_conntrack_lock);
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        exp = __ip_ct_expect_find(tuple);
        if (exp)
                atomic_inc(&exp->use);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
        READ_UNLOCK(&ip_conntrack_lock);

        return exp;
}

/* Remove one specific expectation from all lists and drop the
 * refcount; does _NOT_ delete the timer. */
static void __unexpect_related(struct ip_conntrack_expect *expect)
{
        DEBUGP("unexpect_related(%p)\n", expect);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        /* we're not allowed to unexpect a confirmed expectation! */
        IP_NF_ASSERT(!expect->sibling);

        /* delete from global and local lists */
        list_del(&expect->list);
        list_del(&expect->expected_list);

        /* decrement expect-count of master conntrack */
        if (expect->expectant)
                expect->expectant->expecting--;

        ip_conntrack_expect_put(expect);
}

/* Remove one specific expectation from all lists, drop the refcount
 * and expire the timer.
 * This function can _NOT_ be called for confirmed expects! */
static void unexpect_related(struct ip_conntrack_expect *expect)
{
        IP_NF_ASSERT(expect->expectant);
        IP_NF_ASSERT(expect->expectant->helper);
        /* if we are supposed to have a timer, but we can't delete
         * it: race condition.  __unexpect_related will
         * be called by the timeout function */
        if (expect->expectant->helper->timeout
            && !del_timer(&expect->timeout))
                return;

        __unexpect_related(expect);
}

/* delete all unconfirmed expectations for this conntrack */
static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
{
        struct list_head *exp_entry, *next;
        struct ip_conntrack_expect *exp;

        DEBUGP("remove_expectations(%p)\n", ct);

        list_for_each_safe(exp_entry, next, &ct->sibling_list) {
                exp = list_entry(exp_entry, struct ip_conntrack_expect,
                                 expected_list);

                /* we skip established expectations, as we want to delete
                 * the un-established ones only */
                if (exp->sibling) {
                        DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
                        if (drop_refcount) {
                                /* Indicate that this expectation's parent is dead */
                                ip_conntrack_put(exp->expectant);
                                exp->expectant = NULL;
                        }
                        continue;
                }

                IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
                IP_NF_ASSERT(exp->expectant == ct);

                /* delete expectation from global and private lists */
                unexpect_related(exp);
        }
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
        unsigned int ho, hr;

        DEBUGP("clean_from_lists(%p)\n", ct);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
        LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

        /* Destroy all un-established, pending expectations */
        remove_expectations(ct, 1);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
        struct ip_conntrack_protocol *proto;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
        proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);

        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* Make sure we don't leave any orphaned expectations lying around */
        if (ct->expecting)
                remove_expectations(ct, 1);

        /* Delete our master expectation */
        if (ct->master) {
                if (ct->master->expectant) {
                        /* can't call __unexpect_related here,
                         * since it would screw up expect_list */
                        list_del(&ct->master->expected_list);
                        master = ct->master->expectant;
                }
                kfree(ct->master);
        }
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (master)
                ip_conntrack_put(master);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        kmem_cache_free(ip_conntrack_cachep, ct);
        atomic_dec(&ip_conntrack_count);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
        struct ip_conntrack *ct = (void *)ul_conntrack;

        WRITE_LOCK(&ip_conntrack_lock);
        clean_from_lists(ct);
        WRITE_UNLOCK(&ip_conntrack_lock);
        ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
                    const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        return i->ctrack != ignored_conntrack
                && ip_ct_tuple_equal(tuple, &i->tuple);
}

static struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int hash = hash_conntrack(tuple);

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        h = LIST_FIND(&ip_conntrack_hash[hash],
                      conntrack_tuple_cmp,
                      struct ip_conntrack_tuple_hash *,
                      tuple, ignored_conntrack);
        return h;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
                      const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

static inline struct ip_conntrack *
__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack *ct
                = (struct ip_conntrack *)nfct->master;

        /* ctinfo is the index of the nfct inside the conntrack */
        *ctinfo = nfct - ct->infos;
        IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
        return ct;
}

/* Return conntrack and conntrack_info given skb->nfct->master */
struct ip_conntrack *
ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
        if (skb->nfct)
                return __ip_conntrack_get(skb->nfct, ctinfo);
        return NULL;
}

/* Confirm a connection given skb->nfct; places it in hash table */
int
__ip_conntrack_confirm(struct nf_ct_info *nfct)
{
        unsigned int hash, repl_hash;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in other direction.  Actual packet
           which created connection will be IP_CT_NEW or for an
           expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost the race. */
        if (!LIST_FIND(&ip_conntrack_hash[hash],
                       conntrack_tuple_cmp,
                       struct ip_conntrack_tuple_hash *,
                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
            && !LIST_FIND(&ip_conntrack_hash[repl_hash],
                          conntrack_tuple_cmp,
                          struct ip_conntrack_tuple_hash *,
                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
                list_prepend(&ip_conntrack_hash[hash],
                             &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
                list_prepend(&ip_conntrack_hash[repl_hash],
                             &ct->tuplehash[IP_CT_DIR_REPLY]);
                /* Timer relative to confirmation time, not original
                   setting time, otherwise we'd get timer wrap in
                   weird delay cases. */
                ct->timeout.expires += jiffies;
                add_timer(&ct->timeout);
                atomic_inc(&ct->ct_general.use);
                set_bit(IPS_CONFIRMED_BIT, &ct->status);
                WRITE_UNLOCK(&ip_conntrack_lock);
                return NF_ACCEPT;
        }

        WRITE_UNLOCK(&ip_conntrack_lock);
        return NF_DROP;
}
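
/* Confirmation is driven from the last netfilter hooks (LOCAL_IN and
 * POST_ROUTING, registered in ip_conntrack_standalone.c), i.e. only
 * once a packet has survived all filtering; this keeps connections
 * whose first packet was dropped from ever entering the hash table. */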

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        READ_UNLOCK(&ip_conntrack_lock);

        return h != NULL;
}

/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
struct ip_conntrack *
icmp_error_track(struct sk_buff *skb,
                 enum ip_conntrack_info *ctinfo,
                 unsigned int hooknum)
{
        struct ip_conntrack_tuple innertuple, origtuple;
        struct {
                struct icmphdr icmp;
                struct iphdr ip;
        } inside;
        struct ip_conntrack_protocol *innerproto;
        struct ip_conntrack_tuple_hash *h;
        int dataoff;

        IP_NF_ASSERT(skb->nfct == NULL);

        /* Not enough header? */
        if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
                return NULL;

        if (inside.icmp.type != ICMP_DEST_UNREACH
            && inside.icmp.type != ICMP_SOURCE_QUENCH
            && inside.icmp.type != ICMP_TIME_EXCEEDED
            && inside.icmp.type != ICMP_PARAMETERPROB
            && inside.icmp.type != ICMP_REDIRECT)
                return NULL;

        /* Ignore ICMPs containing fragments (shouldn't happen) */
        if (inside.ip.frag_off & htons(IP_OFFSET)) {
                DEBUGP("icmp_error_track: fragment of proto %u\n",
                       inside.ip.protocol);
                return NULL;
        }

        innerproto = ip_ct_find_proto(inside.ip.protocol);
        dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
        /* Are they talking about one of our connections? */
        if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
                DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol);
                return NULL;
        }

        /* Ordinarily, we'd expect the inverted tuple, but it's
           been preserved inside the ICMP. */
        if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
                DEBUGP("icmp_error_track: Can't invert tuple\n");
                return NULL;
        }

        *ctinfo = IP_CT_RELATED;

        h = ip_conntrack_find_get(&innertuple, NULL);
        if (!h) {
                /* Locally generated ICMPs will match inverted if they
                   haven't been SNAT'ed yet */
                /* FIXME: NAT code has to handle half-done double NAT --RR */
                if (hooknum == NF_IP_LOCAL_OUT)
                        h = ip_conntrack_find_get(&origtuple, NULL);

                if (!h) {
                        DEBUGP("icmp_error_track: no match\n");
                        return NULL;
                }
                /* Reverse direction from that found */
                if (DIRECTION(h) != IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        } else {
                if (DIRECTION(h) == IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        }

        /* Update skb to refer to this connection */
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
        return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
}

static int early_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        int dropped = 0;

        READ_LOCK(&ip_conntrack_lock);
        h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        if (!h)
                return dropped;

        if (del_timer(&h->ctrack->timeout)) {
                death_by_timeout((unsigned long)h->ctrack);
                dropped = 1;
        }
        ip_conntrack_put(h->ctrack);
        return dropped;
}
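
/* Since unreplied() tests IPS_ASSURED, early_drop() only ever evicts
 * connections that their protocol has not yet vouched for (e.g. TCP
 * sets IPS_ASSURED once a connection is fully established), so
 * assured connections survive table-full pressure. */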

static inline int helper_cmp(const struct ip_conntrack_helper *i,
                             const struct ip_conntrack_tuple *rtuple)
{
        return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
{
        return LIST_FIND(&helpers, helper_cmp,
                         struct ip_conntrack_helper *,
                         tuple);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
               struct ip_conntrack_protocol *protocol,
               struct sk_buff *skb)
{
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        size_t hash;
        struct ip_conntrack_expect *expected;
        int i;
        static unsigned int drop_next;

        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
                ip_conntrack_hash_rnd_initted = 1;
        }

        hash = hash_conntrack(tuple);

        if (ip_conntrack_max &&
            atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
                /* Try dropping from a random chain, or else from the
                   chain we are about to insert into (in case they're
                   trying to bomb one hash chain). */
                unsigned int next = (drop_next++)%ip_conntrack_htable_size;

                if (!early_drop(&ip_conntrack_hash[next])
                    && !early_drop(&ip_conntrack_hash[hash])) {
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        if (!invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }

        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
                return ERR_PTR(-ENOMEM);
        }

        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
        conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
        for (i=0; i < IP_CT_NUMBER; i++)
                conntrack->infos[i].master = &conntrack->ct_general;

        if (!protocol->new(conntrack, skb)) {
                kmem_cache_free(ip_conntrack_cachep, conntrack);
                return NULL;
        }
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;

        INIT_LIST_HEAD(&conntrack->sibling_list);

        WRITE_LOCK(&ip_conntrack_lock);
        /* We need to find and delete the expectation ONLY if we win the race */
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                             struct ip_conntrack_expect *, tuple);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);

        /* If the master is not in the hash table yet (i.e. the packet
           hasn't left this machine yet), how can the other end know
           about the expectation?  Hence these are not the droids you
           are looking for (if the master ct never got confirmed, we'd
           hold a reference to it and weird things would happen to
           future packets). */
        if (expected && !is_confirmed(expected->expectant))
                expected = NULL;

        /* Look up the conntrack helper for master connections only */
        if (!expected)
                conntrack->helper = ip_ct_find_helper(&repl_tuple);

        /* If the expectation is dying, then this is a loser. */
        if (expected
            && expected->expectant->helper->timeout
            && ! del_timer(&expected->timeout))
                expected = NULL;

        if (expected) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                        conntrack, expected);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                IP_NF_ASSERT(master_ct(conntrack));
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
                conntrack->master = expected;
                expected->sibling = conntrack;
                LIST_DELETE(&ip_conntrack_expect_list, expected);
                expected->expectant->expecting--;
                nf_conntrack_get(&master_ct(conntrack)->infos[0]);
        }
        atomic_inc(&ip_conntrack_count);
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (expected && expected->expectfn)
                expected->expectfn(conntrack);
        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}
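
/* Lifecycle note: a conntrack returned from here has use == 1, is not
 * yet in the hash table and has no running timer; it only becomes
 * visible to other lookups (and its timer started) when the packet
 * reaches __ip_conntrack_confirm() at the end of its traversal. */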

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
                  struct ip_conntrack_protocol *proto,
                  int *set_reply,
                  unsigned int hooknum,
                  enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;

        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

        if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
                return NULL;

        /* look for tuple match */
        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, proto, skb);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }

        /* It exists; we have (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Set the reply bit if this packet is OK */
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
                if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}
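
/* ctinfo does double duty: it classifies the packet (IP_CT_NEW,
 * IP_CT_RELATED or IP_CT_ESTABLISHED, offset by IP_CT_IS_REPLY for
 * reply-direction traffic) and selects which infos[] slot skb->nfct
 * points at, which is how CTINFO2DIR() can later recover the
 * direction from the skb alone. */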

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
                             struct sk_buff **pskb,
                             const struct net_device *in,
                             const struct net_device *out,
                             int (*okfn)(struct sk_buff *))
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
        int set_reply;
        int ret;

        /* Never happens */
        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit()) {
                        printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
                               (*pskb)->nh.iph->protocol, hooknum);
                }
                return NF_DROP;
        }

        /* FIXME: Do this right please. --RR */
        (*pskb)->nfcache |= NFC_UNKNOWN;

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
        /* Ignore broadcast: no `connection'. */
        if ((*pskb)->pkt_type == PACKET_BROADCAST) {
                printk("Broadcast packet!\n");
                return NF_ACCEPT;
        } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
                   == htonl(0x000000FF)) {
                printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
                       NIPQUAD((*pskb)->nh.iph->saddr),
                       NIPQUAD((*pskb)->nh.iph->daddr),
                       (*pskb)->sk, (*pskb)->pkt_type);
        }
#endif

        /* Previously seen (loopback or untracked)?  Ignore. */
        if ((*pskb)->nfct)
                return NF_ACCEPT;

        proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);

        /* It may be an ICMP error... */
        if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
            && icmp_error_track(*pskb, &ctinfo, hooknum))
                return NF_ACCEPT;

        if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo)))
                /* Not a valid part of a connection */
                return NF_ACCEPT;

        if (IS_ERR(ct))
                /* Too stressed to deal. */
                return NF_DROP;

        IP_NF_ASSERT((*pskb)->nfct);

        ret = proto->packet(ct, *pskb, ctinfo);
        if (ret == -1) {
                /* Invalid */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                return NF_ACCEPT;
        }

        if (ret != NF_DROP && ct->helper) {
                ret = ct->helper->help(*pskb, ct, ctinfo);
                if (ret == -1) {
                        /* Invalid */
                        nf_conntrack_put((*pskb)->nfct);
                        (*pskb)->nfct = NULL;
                        return NF_ACCEPT;
                }
        }
        if (set_reply)
                set_bit(IPS_SEEN_REPLY_BIT, &ct->status);

        return ret;
}

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
{
        return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
}

static inline int resent_expect(const struct ip_conntrack_expect *i,
                                const struct ip_conntrack_tuple *tuple,
                                const struct ip_conntrack_tuple *mask)
{
        DEBUGP("resent_expect\n");
        DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
        DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
        DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
        return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
                 || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
                && ip_ct_tuple_equal(&i->mask, mask));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *i,
                               const struct ip_conntrack_tuple *tuple,
                               const struct ip_conntrack_tuple *mask)
{
        /* The parts covered by the intersection of the masks must be
           unequal, otherwise the expectations clash */
        struct ip_conntrack_tuple intersect_mask
                = { { i->mask.src.ip & mask->src.ip,
                      { i->mask.src.u.all & mask->src.u.all } },
                    { i->mask.dst.ip & mask->dst.ip,
                      { i->mask.dst.u.all & mask->dst.u.all },
                      i->mask.dst.protonum & mask->dst.protonum } };

        return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
}

inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
{
        WRITE_LOCK(&ip_conntrack_lock);
        unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

static void expectation_timed_out(unsigned long ul_expect)
{
        struct ip_conntrack_expect *expect = (void *) ul_expect;

        DEBUGP("expectation %p timed out\n", expect);
        WRITE_LOCK(&ip_conntrack_lock);
        __unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

struct ip_conntrack_expect *
ip_conntrack_expect_alloc(void)
{
        struct ip_conntrack_expect *new;

        new = (struct ip_conntrack_expect *)
                kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }

        /* tuple_cmp compares the whole union, so we have to initialize
           it cleanly */
        memset(new, 0, sizeof(struct ip_conntrack_expect));

        return new;
}

static void
ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
                           struct ip_conntrack *related_to)
{
        DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
        new->expectant = related_to;
        new->sibling = NULL;
        atomic_set(&new->use, 1);

        /* add to expected list for this connection */
        list_add(&new->expected_list, &related_to->sibling_list);

        /* add to global list of expectations */
        list_prepend(&ip_conntrack_expect_list, &new->list);

        /* add and start timer if required */
        if (related_to->helper->timeout) {
                init_timer(&new->timeout);
                new->timeout.data = (unsigned long)new;
                new->timeout.function = expectation_timed_out;
                new->timeout.expires = jiffies +
                                        related_to->helper->timeout * HZ;
                add_timer(&new->timeout);
        }
        related_to->expecting++;
}
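
/* This is the mechanism the application helpers build on: the FTP
 * helper, for instance, parses a PORT command and registers an
 * expectation for the announced data connection, so that connection's
 * first packet is classified IP_CT_RELATED rather than IP_CT_NEW. */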

/* Add a related connection. */
int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
                                struct ip_conntrack *related_to)
{
        struct ip_conntrack_expect *old;
        int ret = 0;

        WRITE_LOCK(&ip_conntrack_lock);
        /* Because of the write lock, no reader can walk the lists,
         * so there is no need to use the tuple lock too */

        DEBUGP("ip_conntrack_expect_related %p\n", related_to);
        DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

        old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
                        struct ip_conntrack_expect *, &expect->tuple,
                        &expect->mask);
        if (old) {
                /* Helper private data may contain offsets but no pointers
                   pointing into the payload - otherwise we would have to
                   copy the data filled out by the helper over the old one */
                DEBUGP("expect_related: resent packet\n");
                if (related_to->helper->timeout) {
                        if (!del_timer(&old->timeout)) {
                                /* expectation is dying. Fall through */
                                goto out;
                        } else {
                                old->timeout.expires = jiffies +
                                        related_to->helper->timeout * HZ;
                                add_timer(&old->timeout);
                        }
                }

                WRITE_UNLOCK(&ip_conntrack_lock);
                kfree(expect);
                return -EEXIST;

        } else if (related_to->helper->max_expected &&
                   related_to->expecting >= related_to->helper->max_expected) {
                struct list_head *cur_item;
                /* old == NULL */
                if (!(related_to->helper->flags &
                      IP_CT_HELPER_F_REUSE_EXPECT)) {
                        WRITE_UNLOCK(&ip_conntrack_lock);
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: max number of expected "
                                       "connections %i of %s reached for "
                                       "%u.%u.%u.%u->%u.%u.%u.%u\n",
                                       related_to->helper->max_expected,
                                       related_to->helper->name,
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
                        kfree(expect);
                        return -EPERM;
                }
                DEBUGP("ip_conntrack: max number of expected "
                       "connections %i of %s reached for "
                       "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
                       related_to->helper->max_expected,
                       related_to->helper->name,
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));

                /* choose the oldest expectation to evict */
                list_for_each(cur_item, &related_to->sibling_list) {
                        struct ip_conntrack_expect *cur;

                        cur = list_entry(cur_item,
                                         struct ip_conntrack_expect,
                                         expected_list);
                        if (cur->sibling == NULL) {
                                old = cur;
                                break;
                        }
                }

                /* (!old) cannot happen, since related_to->expecting is the
                 * number of unconfirmed expects */
                IP_NF_ASSERT(old);

                /* newnat14 does not reuse the real allocated memory
                 * structures but rather unexpects the old one and
                 * allocates a new one.  unexpect_related will decrement
                 * related_to->expecting.
                 */
                unexpect_related(old);
                ret = -EPERM;
        } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                             struct ip_conntrack_expect *, &expect->tuple,
                             &expect->mask)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                DEBUGP("expect_related: busy!\n");

                kfree(expect);
                return -EBUSY;
        }

out:    ip_conntrack_expect_insert(expect, related_to);

        WRITE_UNLOCK(&ip_conntrack_lock);

        return ret;
}

/* Change tuple in an existing expectation */
int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
                               struct ip_conntrack_tuple *newtuple)
{
        int ret;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        WRITE_LOCK(&ip_conntrack_expect_tuple_lock);

        DEBUGP("change_expect:\n");
        DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("exp mask:  "); DUMP_TUPLE(&expect->mask);
        DEBUGP("newtuple:  "); DUMP_TUPLE(newtuple);
        if (expect->ct_tuple.dst.protonum == 0) {
                /* Never seen before */
                DEBUGP("change expect: never seen before\n");
                if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
                    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                                 struct ip_conntrack_expect *, newtuple, &expect->mask)) {
                        /* Force NAT to find an unused tuple */
                        ret = -1;
                } else {
                        memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
                        memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
                        ret = 0;
                }
        } else {
                /* Resent packet */
                DEBUGP("change expect: resent packet\n");
                if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
                        ret = 0;
                } else {
                        /* Force NAT to choose the same port again */
                        ret = -1;
                }
        }
        WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);

        return ret;
}

/* Alter reply tuple (maybe alter helper).  If it's already taken,
   return 0 and don't do alteration. */
int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
                             const struct ip_conntrack_tuple *newreply)
{
        WRITE_LOCK(&ip_conntrack_lock);
        if (__ip_conntrack_find(newreply, conntrack)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                return 0;
        }
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));

        DEBUGP("Altering reply tuple of %p to ", conntrack);
        DUMP_TUPLE(newreply);

        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (!conntrack->master && list_empty(&conntrack->sibling_list))
                conntrack->helper = ip_ct_find_helper(newreply);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 1;
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
        WRITE_LOCK(&ip_conntrack_lock);
        list_prepend(&helpers, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 0;
}

static inline int unhelp(struct ip_conntrack_tuple_hash *i,
                         const struct ip_conntrack_helper *me)
{
        if (i->ctrack->helper == me) {
                /* Get rid of any expected. */
                remove_expectations(i->ctrack, 0);
                /* And *then* set helper to NULL */
                i->ctrack->helper = NULL;
        }
        return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
        unsigned int i;

        /* Need write lock here, to delete helper. */
        WRITE_LOCK(&ip_conntrack_lock);
        LIST_DELETE(&helpers, me);

        /* Get rid of expecteds, set helpers to NULL. */
        for (i = 0; i < ip_conntrack_htable_size; i++)
                LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
                            struct ip_conntrack_tuple_hash *, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        /* Someone could still be looking at the helper in a bh. */
        synchronize_net();
}

/* Refresh conntrack for this many jiffies. */
void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
{
        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

        /* If not in hash table, timer will not be active yet */
        if (!is_confirmed(ct))
                ct->timeout.expires = extra_jiffies;
        else {
                WRITE_LOCK(&ip_conntrack_lock);
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                }
                WRITE_UNLOCK(&ip_conntrack_lock);
        }
}
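
/* Protocol trackers call this on every acceptable packet with the
 * timeout for the connection's current state (e.g. the TCP tracker
 * passes a long timeout once established, short ones during the
 * handshake), so idle connections expire while active ones are kept
 * alive. */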

/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
#ifdef CONFIG_NETFILTER_DEBUG
        unsigned int olddebug = skb->nf_debug;
#endif
        if (sk) {
                sock_hold(sk);
                skb_orphan(skb);
        }

        local_bh_disable();
        skb = ip_defrag(skb);
        local_bh_enable();

        if (!skb) {
                if (sk)
                        sock_put(sk);
                return skb;
        }

        if (sk) {
                skb_set_owner_w(skb, sk);
                sock_put(sk);
        }

        ip_send_check(skb->nh.iph);
        skb->nfcache |= NFC_ALTERED;
#ifdef CONFIG_NETFILTER_DEBUG
        /* Packet path as if nothing had happened. */
        skb->nf_debug = olddebug;
#endif
        return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* This ICMP is in the reverse direction to the packet
           that caused it */
        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach new skbuff, and increment count */
        nskb->nfct = &ct->infos[ctinfo];
        atomic_inc(&ct->ct_general.use);
}

static inline int
do_kill(const struct ip_conntrack_tuple_hash *i,
        int (*kill)(const struct ip_conntrack *i, void *data),
        void *data)
{
        return kill(i->ctrack, data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
                void *data, unsigned int *bucket)
{
        struct ip_conntrack_tuple_hash *h = NULL;

        READ_LOCK(&ip_conntrack_lock);
        for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
                h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
                              struct ip_conntrack_tuple_hash *, kill, data);
        }
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

void
ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
                        void *data)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int bucket = 0;

        while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
                /* Time to push up daisies... */
                if (del_timer(&h->ctrack->timeout))
                        death_by_timeout((unsigned long)h->ctrack);
                /* ... else the timer will get him soon. */

                ip_conntrack_put(h->ctrack);
        }
}

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
        struct inet_opt *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;

        IP_CT_TUPLE_U_BLANK(&tuple);
        tuple.src.ip = inet->rcv_saddr;
        tuple.src.u.tcp.port = inet->sport;
        tuple.dst.ip = inet->daddr;
        tuple.dst.u.tcp.port = inet->dport;
        tuple.dst.protonum = IPPROTO_TCP;

        /* We only do TCP at the moment: is there a better way? */
        if (strcmp(sk->sk_prot->name, "TCP")) {
                DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
                return -ENOPROTOOPT;
        }

        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
                       *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }

        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;

                sin.sin_family = AF_INET;
                sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
                sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;

                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
                ip_conntrack_put(h->ctrack);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
                        return 0;
        }
        DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
               NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
               NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
        return -ENOENT;
}
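
/* Illustrative userspace sketch (not kernel code): a transparent
 * proxy that accepted a REDIRECTed TCP connection can recover the
 * pre-NAT destination with:
 *
 *      struct sockaddr_in orig;
 *      socklen_t len = sizeof(orig);
 *
 *      if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) == 0)
 *              connect_upstream(orig.sin_addr, ntohs(orig.sin_port));
 *
 * where connect_upstream() stands in for whatever the proxy does with
 * the original address. */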

static struct nf_sockopt_ops so_getorigdst = {
        .pf             = PF_INET,
        .get_optmin     = SO_ORIGINAL_DST,
        .get_optmax     = SO_ORIGINAL_DST+1,
        .get            = &getorigdst,
};

static int kill_all(const struct ip_conntrack *i, void *data)
{
        return 1;
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
        ip_ct_attach = NULL;
        /* This makes sure all current packets have passed through
           netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

 i_see_dead_people:
        ip_ct_selective_cleanup(kill_all, NULL);
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }

        kmem_cache_destroy(ip_conntrack_cachep);
        vfree(ip_conntrack_hash);
        nf_unregister_sockopt(&so_getorigdst);
}

static int hashsize;
MODULE_PARM(hashsize, "i");
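
/* E.g. "modprobe ip_conntrack hashsize=4096" overrides the memory-based
 * table sizing computed in ip_conntrack_init() below. */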

int __init ip_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
         * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
        if (hashsize) {
                ip_conntrack_htable_size = hashsize;
        } else {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        ip_conntrack_htable_size = 8192;
                if (ip_conntrack_htable_size < 16)
                        ip_conntrack_htable_size = 16;
        }
        ip_conntrack_max = 8 * ip_conntrack_htable_size;
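        /* Worked example, assuming a 32MB i386 box with 8-byte
           struct list_head: (32MB / 16384) / 8 = 256 buckets, hence
           ip_conntrack_max = 8 * 256 = 2048 tracked connections. */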

        printk("ip_conntrack version %s (%u buckets, %d max)"
               " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
               ip_conntrack_htable_size, ip_conntrack_max,
               sizeof(struct ip_conntrack));

        ret = nf_register_sockopt(&so_getorigdst);
        if (ret != 0) {
                printk(KERN_ERR "Unable to register netfilter socket option\n");
                return ret;
        }

        ip_conntrack_hash = vmalloc(sizeof(struct list_head)
                                    * ip_conntrack_htable_size);
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
        }

        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }
        /* Don't NEED the lock here, but good form anyway. */
        WRITE_LOCK(&ip_conntrack_lock);
        /* Sew in builtin protocols. */
        list_append(&protocol_list, &ip_conntrack_protocol_tcp);
        list_append(&protocol_list, &ip_conntrack_protocol_udp);
        list_append(&protocol_list, &ip_conntrack_protocol_icmp);
        WRITE_UNLOCK(&ip_conntrack_lock);

        for (i = 0; i < ip_conntrack_htable_size; i++)
                INIT_LIST_HEAD(&ip_conntrack_hash[i]);

        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;

        /* Set up the fake conntrack:
            - it is never deleted and is not in any hashes */
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and make it look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
        /*  - and prepare the ctinfo field for REJECT & NAT. */
        ip_conntrack_untracked.infos[IP_CT_NEW].master =
        ip_conntrack_untracked.infos[IP_CT_RELATED].master =
        ip_conntrack_untracked.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master =
                        &ip_conntrack_untracked.ct_general;

        return ret;

err_free_hash:
        vfree(ip_conntrack_hash);
err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);

        return -ENOMEM;
}