[linux-2.6.git] net/ipv4/netfilter/ip_conntrack_core.c
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *      - new API and handling of conntrack/nat helpers
 *      - now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *      - add usage/reference counts to ip_conntrack_expect
 *      - export ip_conntrack[_expect]_{find_get,put} functions
 * */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
/* For ERR_PTR().  Yeah, I know... --RR */
#include <linux/fs.h>

/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, and conntrack timers. */
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION    "2.1"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DECLARE_RWLOCK(ip_conntrack_lock);
DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
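/* Lock ordering note (descriptive, inferred from the uses in this
 * file): wherever both locks are held, ip_conntrack_lock is always
 * taken first and ip_conntrack_expect_tuple_lock second, which keeps
 * the expectation-list walks deadlock free. */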

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
LIST_HEAD(protocol_list);
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
struct ip_conntrack ip_conntrack_untracked;

extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;

static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
                              u_int8_t protocol)
{
        return protocol == curr->proto;
}

struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        p = LIST_FIND(&protocol_list, proto_cmpfn,
                      struct ip_conntrack_protocol *, protocol);
        if (!p)
                p = &ip_conntrack_generic_protocol;

        return p;
}

struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        READ_LOCK(&ip_conntrack_lock);
        p = __ip_ct_find_proto(protocol);
        READ_UNLOCK(&ip_conntrack_lock);
        return p;
}

inline void
ip_conntrack_put(struct ip_conntrack *ct)
{
        IP_NF_ASSERT(ct);
        IP_NF_ASSERT(ct->infos[0].master);
        /* nf_conntrack_put wants to go via an info struct, so feed it
           one at random. */
        nf_conntrack_put(&ct->infos[0]);
}

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;
static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
#if 0
        dump_tuple(tuple);
#endif
        return (jhash_3words(tuple->src.ip,
                             (tuple->dst.ip ^ tuple->dst.protonum),
                             (tuple->src.u.all | (tuple->dst.u.all << 16)),
                             ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}

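/* Fill in the IP-invariant part of a tuple (addresses and protocol
 * number) from the header, then let the protocol module extract the
 * per-protocol part (ports, ICMP id/type, ...) from the payload. */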
int
get_tuple(const struct iphdr *iph,
          const struct sk_buff *skb,
          unsigned int dataoff,
          struct ip_conntrack_tuple *tuple,
          const struct ip_conntrack_protocol *protocol)
{
        /* Never happen */
        if (iph->frag_off & htons(IP_OFFSET)) {
                printk("ip_conntrack_core: Frag of proto %u.\n",
                       iph->protocol);
                return 0;
        }

        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;

        return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

static int
invert_tuple(struct ip_conntrack_tuple *inverse,
             const struct ip_conntrack_tuple *orig,
             const struct ip_conntrack_protocol *protocol)
{
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;

        return protocol->invert_tuple(inverse, orig);
}

/* ip_conntrack_expect helper functions */

/* Compare tuple parts depending on mask. */
static inline int expect_cmp(const struct ip_conntrack_expect *i,
                             const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
}

static void
destroy_expect(struct ip_conntrack_expect *exp)
{
        DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
        IP_NF_ASSERT(atomic_read(&exp->use) == 0);
        IP_NF_ASSERT(!timer_pending(&exp->timeout));

        kfree(exp);
}

inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
        IP_NF_ASSERT(exp);

        if (atomic_dec_and_test(&exp->use)) {
                /* usage count dropped to zero */
                destroy_expect(exp);
        }
}

static inline struct ip_conntrack_expect *
__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                         struct ip_conntrack_expect *, tuple);
}

/* Find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *exp;

        READ_LOCK(&ip_conntrack_lock);
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        exp = __ip_ct_expect_find(tuple);
        if (exp)
                atomic_inc(&exp->use);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
        READ_UNLOCK(&ip_conntrack_lock);

        return exp;
}

/* remove one specific expectation from all lists and drop refcount;
 * does _NOT_ delete the timer. */
static void __unexpect_related(struct ip_conntrack_expect *expect)
{
        DEBUGP("unexpect_related(%p)\n", expect);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        /* we're not allowed to unexpect a confirmed expectation! */
        IP_NF_ASSERT(!expect->sibling);

        /* delete from global and local lists */
        list_del(&expect->list);
        list_del(&expect->expected_list);

        /* decrement expect-count of master conntrack */
        if (expect->expectant)
                expect->expectant->expecting--;

        ip_conntrack_expect_put(expect);
}

/* remove one specific expectation from all lists, drop refcount
 * and expire timer.
 * This function can _NOT_ be called for confirmed expects! */
static void unexpect_related(struct ip_conntrack_expect *expect)
{
        IP_NF_ASSERT(expect->expectant);
        IP_NF_ASSERT(expect->expectant->helper);
        /* if we are supposed to have a timer, but we can't delete
         * it: race condition.  __unexpect_related will
         * be called by the timeout function */
        if (expect->expectant->helper->timeout
            && !del_timer(&expect->timeout))
                return;

        __unexpect_related(expect);
}

/* delete all unconfirmed expectations for this conntrack */
static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
{
        struct list_head *exp_entry, *next;
        struct ip_conntrack_expect *exp;

        DEBUGP("remove_expectations(%p)\n", ct);

        list_for_each_safe(exp_entry, next, &ct->sibling_list) {
                exp = list_entry(exp_entry, struct ip_conntrack_expect,
                                 expected_list);

                /* we skip established expectations, as we want to delete
                 * the un-established ones only */
                if (exp->sibling) {
                        DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
                        if (drop_refcount) {
                                /* Indicate that this expectation's parent is dead */
                                ip_conntrack_put(exp->expectant);
                                exp->expectant = NULL;
                        }
                        continue;
                }

                IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
                IP_NF_ASSERT(exp->expectant == ct);

                /* delete expectation from global and private lists */
                unexpect_related(exp);
        }
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
        unsigned int ho, hr;

        DEBUGP("clean_from_lists(%p)\n", ct);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
        LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

        /* Destroy all un-established, pending expectations */
        remove_expectations(ct, 1);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
        struct ip_conntrack_protocol *proto;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
        proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);

        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* Make sure we don't leave any orphaned expectations lying around */
        if (ct->expecting)
                remove_expectations(ct, 1);

        /* Delete our master expectation */
        if (ct->master) {
                if (ct->master->expectant) {
                        /* can't call __unexpect_related here,
                         * since it would screw up expect_list */
                        list_del(&ct->master->expected_list);
                        master = ct->master->expectant;
                }
                kfree(ct->master);
        }
        WRITE_UNLOCK(&ip_conntrack_lock);

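        /* Drop the master reference only now: if this put frees the
         * master, destroy_conntrack() re-enters and, per the comment
         * above, must not run with ip_conntrack_lock write-held. */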
        if (master)
                ip_conntrack_put(master);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        kmem_cache_free(ip_conntrack_cachep, ct);
        atomic_dec(&ip_conntrack_count);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
        struct ip_conntrack *ct = (void *)ul_conntrack;

        WRITE_LOCK(&ip_conntrack_lock);
        clean_from_lists(ct);
        WRITE_UNLOCK(&ip_conntrack_lock);
        ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
                    const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        return i->ctrack != ignored_conntrack
                && ip_ct_tuple_equal(tuple, &i->tuple);
}

static struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int hash = hash_conntrack(tuple);

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        h = LIST_FIND(&ip_conntrack_hash[hash],
                      conntrack_tuple_cmp,
                      struct ip_conntrack_tuple_hash *,
                      tuple, ignored_conntrack);
        return h;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
                      const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

static inline struct ip_conntrack *
__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack *ct
                = (struct ip_conntrack *)nfct->master;

        /* ctinfo is the index of the nfct inside the conntrack */
        *ctinfo = nfct - ct->infos;
        IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
        return ct;
}

/* Return conntrack and conntrack_info given skb->nfct->master */
struct ip_conntrack *
ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
        if (skb->nfct)
                return __ip_conntrack_get(skb->nfct, ctinfo);
        return NULL;
}

/* Confirm a connection given skb->nfct; places it in hash table */
int
__ip_conntrack_confirm(struct nf_ct_info *nfct)
{
        unsigned int hash, repl_hash;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in other direction.  Actual packet
           which created connection will be IP_CT_NEW or for an
           expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost race. */
        if (!LIST_FIND(&ip_conntrack_hash[hash],
                       conntrack_tuple_cmp,
                       struct ip_conntrack_tuple_hash *,
                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
            && !LIST_FIND(&ip_conntrack_hash[repl_hash],
                          conntrack_tuple_cmp,
                          struct ip_conntrack_tuple_hash *,
                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
                list_prepend(&ip_conntrack_hash[hash],
                             &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
                list_prepend(&ip_conntrack_hash[repl_hash],
                             &ct->tuplehash[IP_CT_DIR_REPLY]);
                /* Timer relative to confirmation time, not original
                   setting time, otherwise we'd get timer wrap in
                   weird delay cases. */
                ct->timeout.expires += jiffies;
                add_timer(&ct->timeout);
                atomic_inc(&ct->ct_general.use);
                set_bit(IPS_CONFIRMED_BIT, &ct->status);
                WRITE_UNLOCK(&ip_conntrack_lock);
                return NF_ACCEPT;
        }

        WRITE_UNLOCK(&ip_conntrack_lock);
        return NF_DROP;
}

/* Returns true if a connection corresponding to the tuple already
   exists (required for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        READ_UNLOCK(&ip_conntrack_lock);

        return h != NULL;
}

/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
struct ip_conntrack *
icmp_error_track(struct sk_buff *skb,
                 enum ip_conntrack_info *ctinfo,
                 unsigned int hooknum)
{
        struct ip_conntrack_tuple innertuple, origtuple;
        struct {
                struct icmphdr icmp;
                struct iphdr ip;
        } inside;
        struct ip_conntrack_protocol *innerproto;
        struct ip_conntrack_tuple_hash *h;
        int dataoff;

        IP_NF_ASSERT(skb->nfct == NULL);

        /* Not enough header? */
        if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
                return NULL;

        if (inside.icmp.type != ICMP_DEST_UNREACH
            && inside.icmp.type != ICMP_SOURCE_QUENCH
            && inside.icmp.type != ICMP_TIME_EXCEEDED
            && inside.icmp.type != ICMP_PARAMETERPROB
            && inside.icmp.type != ICMP_REDIRECT)
                return NULL;

        /* Ignore ICMPs containing fragments (shouldn't happen) */
        if (inside.ip.frag_off & htons(IP_OFFSET)) {
                DEBUGP("icmp_error_track: fragment of proto %u\n",
                       inside.ip.protocol);
                return NULL;
        }

        innerproto = ip_ct_find_proto(inside.ip.protocol);
        dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
        /* Are they talking about one of our connections? */
        if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
                DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol);
                return NULL;
        }

        /* Ordinarily, we'd expect the inverted tupleproto, but it's
           been preserved inside the ICMP. */
        if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
                DEBUGP("icmp_error_track: Can't invert tuple\n");
                return NULL;
        }

        *ctinfo = IP_CT_RELATED;

        h = ip_conntrack_find_get(&innertuple, NULL);
        if (!h) {
                /* Locally generated ICMPs will match inverted if they
                   haven't been SNAT'ed yet */
                /* FIXME: NAT code has to handle half-done double NAT --RR */
                if (hooknum == NF_IP_LOCAL_OUT)
                        h = ip_conntrack_find_get(&origtuple, NULL);

                if (!h) {
                        DEBUGP("icmp_error_track: no match\n");
                        return NULL;
                }
                /* Reverse direction from that found */
                if (DIRECTION(h) != IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        } else {
                if (DIRECTION(h) == IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        }

        /* Update skb to refer to this connection */
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
        return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
}

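/* Evict one old entry from a hash chain to make room when the table is
 * full.  Only connections that never reached ASSURED state are
 * candidates; the backwards list walk picks the oldest such entry,
 * which is roughly LRU since confirmed entries are prepended. */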
static int early_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        int dropped = 0;

        READ_LOCK(&ip_conntrack_lock);
        h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        if (!h)
                return dropped;

        if (del_timer(&h->ctrack->timeout)) {
                death_by_timeout((unsigned long)h->ctrack);
                dropped = 1;
        }
        ip_conntrack_put(h->ctrack);
        return dropped;
}

static inline int helper_cmp(const struct ip_conntrack_helper *i,
                             const struct ip_conntrack_tuple *rtuple)
{
        return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
{
        return LIST_FIND(&helpers, helper_cmp,
                         struct ip_conntrack_helper *,
                         tuple);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
               struct ip_conntrack_protocol *protocol,
               struct sk_buff *skb)
{
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        size_t hash;
        struct ip_conntrack_expect *expected;
        int i;
        static unsigned int drop_next;

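        /* Lazily seed the hash function on the first allocation; the
         * per-boot random value makes hash-chain placement hard for a
         * remote attacker to predict (see the chain-bombing comment
         * below). */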
        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
                ip_conntrack_hash_rnd_initted = 1;
        }

        hash = hash_conntrack(tuple);

        if (ip_conntrack_max &&
            atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
                /* Try dropping from a random chain, or else from the
                   chain we're about to insert into (in case they're
                   trying to bomb one hash chain). */
                unsigned int next = (drop_next++)%ip_conntrack_htable_size;

                if (!early_drop(&ip_conntrack_hash[next])
                    && !early_drop(&ip_conntrack_hash[hash])) {
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        if (!invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }

        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
                return ERR_PTR(-ENOMEM);
        }

        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
        conntrack->xid[IP_CT_DIR_ORIGINAL] = -1;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
        conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
        conntrack->xid[IP_CT_DIR_REPLY] = -1;
        for (i = 0; i < IP_CT_NUMBER; i++)
                conntrack->infos[i].master = &conntrack->ct_general;

        if (!protocol->new(conntrack, skb)) {
                kmem_cache_free(ip_conntrack_cachep, conntrack);
                return NULL;
        }
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;

        INIT_LIST_HEAD(&conntrack->sibling_list);

        WRITE_LOCK(&ip_conntrack_lock);
        /* We need to find and delete the expectation ONLY if we win the race */
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                             struct ip_conntrack_expect *, tuple);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);

        /* If master is not in hash table yet (i.e. packet hasn't left
           this machine yet), how can other end know about expected?
           Hence these are not the droids you are looking for (if
           master ct never got confirmed, we'd hold a reference to it
           and weird things would happen to future packets). */
        if (expected && !is_confirmed(expected->expectant))
                expected = NULL;

        /* Look up the conntrack helper for master connections only */
        if (!expected)
                conntrack->helper = ip_ct_find_helper(&repl_tuple);

        /* If the expectation is dying, then this is a loser. */
        if (expected
            && expected->expectant->helper->timeout
            && !del_timer(&expected->timeout))
                expected = NULL;

        if (expected) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                        conntrack, expected);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
                conntrack->master = expected;
                expected->sibling = conntrack;
                LIST_DELETE(&ip_conntrack_expect_list, expected);
                expected->expectant->expecting--;
                nf_conntrack_get(&master_ct(conntrack)->infos[0]);
        }
        atomic_inc(&ip_conntrack_count);
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (expected && expected->expectfn)
                expected->expectfn(conntrack);
        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

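/* Classification logic: a packet in the REPLY direction means the
 * connection has been seen both ways; otherwise ctinfo is derived from
 * the status bits (SEEN_REPLY -> ESTABLISHED, EXPECTED -> RELATED,
 * else NEW).  The infos[] slot chosen below encodes both the ctinfo
 * value and the direction. */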
/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
                  struct ip_conntrack_protocol *proto,
                  int *set_reply,
                  unsigned int hooknum,
                  enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;

        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

        if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
                return NULL;

        /* look for tuple match */
        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, proto, skb);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }

        /* It exists; we have (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Please set reply bit if this packet OK */
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
                if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}

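/* Overview of the per-packet path: by the time a packet arrives here
 * it must already be defragmented (hence the "Never happen" check
 * below).  ICMP errors are matched against the tuple embedded in their
 * payload; everything else goes through resolve_normal_ct(), then the
 * per-protocol state machine and, if one is attached, the helper. */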
/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
                             struct sk_buff **pskb,
                             const struct net_device *in,
                             const struct net_device *out,
                             int (*okfn)(struct sk_buff *))
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
        int set_reply;
        int ret;

        /* Never happen */
        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit()) {
                        printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
                               (*pskb)->nh.iph->protocol, hooknum);
                }
                return NF_DROP;
        }

        /* FIXME: Do this right please. --RR */
        (*pskb)->nfcache |= NFC_UNKNOWN;

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
        /* Ignore broadcast: no `connection'. */
        if ((*pskb)->pkt_type == PACKET_BROADCAST) {
                printk("Broadcast packet!\n");
                return NF_ACCEPT;
        } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
                   == htonl(0x000000FF)) {
                printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
                       NIPQUAD((*pskb)->nh.iph->saddr),
                       NIPQUAD((*pskb)->nh.iph->daddr),
                       (*pskb)->sk, (*pskb)->pkt_type);
        }
#endif

        /* Previously seen (loopback or untracked)?  Ignore. */
        if ((*pskb)->nfct)
                return NF_ACCEPT;

        proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);

        /* It may be an icmp error... */
        if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
            && icmp_error_track(*pskb, &ctinfo, hooknum))
                return NF_ACCEPT;

        if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo)))
                /* Not valid part of a connection */
                return NF_ACCEPT;

        if (IS_ERR(ct))
                /* Too stressed to deal. */
                return NF_DROP;

        IP_NF_ASSERT((*pskb)->nfct);

        ret = proto->packet(ct, *pskb, ctinfo);
        if (ret == -1) {
                /* Invalid */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                return NF_ACCEPT;
        }

        if (ret != NF_DROP && ct->helper) {
                ret = ct->helper->help(*pskb, ct, ctinfo);
                if (ret == -1) {
                        /* Invalid */
                        nf_conntrack_put((*pskb)->nfct);
                        (*pskb)->nfct = NULL;
                        return NF_ACCEPT;
                }
        }
        if (set_reply)
                set_bit(IPS_SEEN_REPLY_BIT, &ct->status);

        return ret;
}

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
{
        return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
}

static inline int resent_expect(const struct ip_conntrack_expect *i,
                                const struct ip_conntrack_tuple *tuple,
                                const struct ip_conntrack_tuple *mask)
{
        DEBUGP("resent_expect\n");
        DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
        DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
        DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
        return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
                 || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
                && ip_ct_tuple_equal(&i->mask, mask));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *i,
                               const struct ip_conntrack_tuple *tuple,
                               const struct ip_conntrack_tuple *mask)
{
        /* Part covered by intersection of masks must be unequal,
           otherwise they clash */
        struct ip_conntrack_tuple intersect_mask
                = { { i->mask.src.ip & mask->src.ip,
                      { i->mask.src.u.all & mask->src.u.all } },
                    { i->mask.dst.ip & mask->dst.ip,
                      { i->mask.dst.u.all & mask->dst.u.all },
                      i->mask.dst.protonum & mask->dst.protonum } };

        return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
}

inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
{
        WRITE_LOCK(&ip_conntrack_lock);
        unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

static void expectation_timed_out(unsigned long ul_expect)
{
        struct ip_conntrack_expect *expect = (void *) ul_expect;

        DEBUGP("expectation %p timed out\n", expect);
        WRITE_LOCK(&ip_conntrack_lock);
        __unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

struct ip_conntrack_expect *
ip_conntrack_expect_alloc(void)
{
        struct ip_conntrack_expect *new;

        new = (struct ip_conntrack_expect *)
                kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }

        /* tuple_cmp compares the whole union, so we have to initialize
           it cleanly */
        memset(new, 0, sizeof(struct ip_conntrack_expect));

        return new;
}

static void
ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
                           struct ip_conntrack *related_to)
{
        DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
        new->expectant = related_to;
        new->sibling = NULL;
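        /* This initial reference is owned by the expectation lists;
         * __unexpect_related() drops it when the expectation is
         * unlinked. */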
        atomic_set(&new->use, 1);

        /* add to expected list for this connection */
        list_add_tail(&new->expected_list, &related_to->sibling_list);
        /* add to global list of expectations */
        list_prepend(&ip_conntrack_expect_list, &new->list);
        /* add and start timer if required */
        if (related_to->helper->timeout) {
                init_timer(&new->timeout);
                new->timeout.data = (unsigned long)new;
                new->timeout.function = expectation_timed_out;
                new->timeout.expires = jiffies +
                                       related_to->helper->timeout * HZ;
                add_timer(&new->timeout);
        }
        related_to->expecting++;
}

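/* A minimal usage sketch for helper authors (illustrative only; the
 * tuple/mask values are placeholders, and real helpers such as the FTP
 * helper fill in more fields):
 *
 *	struct ip_conntrack_expect *exp = ip_conntrack_expect_alloc();
 *	if (!exp)
 *		return NF_DROP;
 *	exp->tuple = tuple_of_expected_connection;	// placeholder
 *	exp->mask = mask_with_wildcarded_parts;		// placeholder
 *	exp->expectfn = NULL;
 *	ip_conntrack_expect_related(exp, master_conntrack);
 *
 * Note that on the -EEXIST and -EBUSY paths below the expect has
 * already been freed by ip_conntrack_expect_related() itself. */
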
/* Add a related connection. */
int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
                                struct ip_conntrack *related_to)
{
        struct ip_conntrack_expect *old;
        int ret = 0;

        WRITE_LOCK(&ip_conntrack_lock);
        /* Because of the write lock, no reader can walk the lists,
         * so there is no need to use the tuple lock too */

        DEBUGP("ip_conntrack_expect_related %p\n", related_to);
        DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

        old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
                        struct ip_conntrack_expect *, &expect->tuple,
                        &expect->mask);
        if (old) {
                /* Helper private data may contain offsets but no pointers
                   pointing into the payload - otherwise we would have to
                   copy the data filled out by the helper over the old one */
                DEBUGP("expect_related: resent packet\n");
                if (related_to->helper->timeout) {
                        if (!del_timer(&old->timeout)) {
                                /* expectation is dying. Fall through */
                                goto out;
                        } else {
                                old->timeout.expires = jiffies +
                                        related_to->helper->timeout * HZ;
                                add_timer(&old->timeout);
                        }
                }

                WRITE_UNLOCK(&ip_conntrack_lock);
                kfree(expect);
                return -EEXIST;

        } else if (related_to->helper->max_expected &&
                   related_to->expecting >= related_to->helper->max_expected) {
                /* old == NULL */
                if (!(related_to->helper->flags &
                      IP_CT_HELPER_F_REUSE_EXPECT)) {
                        WRITE_UNLOCK(&ip_conntrack_lock);
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: max number of expected "
                                       "connections %i of %s reached for "
                                       "%u.%u.%u.%u->%u.%u.%u.%u\n",
                                       related_to->helper->max_expected,
                                       related_to->helper->name,
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
                        kfree(expect);
                        return -EPERM;
                }
                DEBUGP("ip_conntrack: max number of expected "
                       "connections %i of %s reached for "
                       "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
                       related_to->helper->max_expected,
                       related_to->helper->name,
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));

                /* choose the oldest expectation to evict */
                list_for_each_entry(old, &related_to->sibling_list,
                                    expected_list)
                        if (old->sibling == NULL)
                                break;

                /* We cannot fail since related_to->expecting is the number
                 * of unconfirmed expectations */
                IP_NF_ASSERT(old && old->sibling == NULL);

                /* newnat14 does not reuse the real allocated memory
                 * structures but rather unexpects the old and
                 * allocates a new.  unexpect_related will decrement
                 * related_to->expecting.
                 */
                unexpect_related(old);
                ret = -EPERM;
        } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                             struct ip_conntrack_expect *, &expect->tuple,
                             &expect->mask)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                DEBUGP("expect_related: busy!\n");

                kfree(expect);
                return -EBUSY;
        }

out:    ip_conntrack_expect_insert(expect, related_to);

        WRITE_UNLOCK(&ip_conntrack_lock);

        return ret;
}

/* Change tuple in an existing expectation */
int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
                               struct ip_conntrack_tuple *newtuple)
{
        int ret;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        WRITE_LOCK(&ip_conntrack_expect_tuple_lock);

        DEBUGP("change_expect:\n");
        DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("exp mask:  "); DUMP_TUPLE(&expect->mask);
        DEBUGP("newtuple:  "); DUMP_TUPLE(newtuple);
        if (expect->ct_tuple.dst.protonum == 0) {
                /* Never seen before */
                DEBUGP("change expect: never seen before\n");
                if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
                    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                                 struct ip_conntrack_expect *, newtuple,
                                 &expect->mask)) {
                        /* Force NAT to find an unused tuple */
                        ret = -1;
                } else {
                        memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
                        memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
                        ret = 0;
                }
        } else {
                /* Resent packet */
                DEBUGP("change expect: resent packet\n");
                if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
                        ret = 0;
                } else {
                        /* Force NAT to choose again the same port */
                        ret = -1;
                }
        }
        WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);

        return ret;
}

/* Alter reply tuple (maybe alter helper).  If it's already taken,
   return 0 and don't do alteration. */
int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
                             const struct ip_conntrack_tuple *newreply)
{
        WRITE_LOCK(&ip_conntrack_lock);
        if (__ip_conntrack_find(newreply, conntrack)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                return 0;
        }
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));

        DEBUGP("Altering reply tuple of %p to ", conntrack);
        DUMP_TUPLE(newreply);

        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (!conntrack->master && list_empty(&conntrack->sibling_list))
                conntrack->helper = ip_ct_find_helper(newreply);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 1;
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
        WRITE_LOCK(&ip_conntrack_lock);
        list_prepend(&helpers, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 0;
}

static inline int unhelp(struct ip_conntrack_tuple_hash *i,
                         const struct ip_conntrack_helper *me)
{
        if (i->ctrack->helper == me) {
                /* Get rid of any expected. */
                remove_expectations(i->ctrack, 0);
                /* And *then* set helper to NULL */
                i->ctrack->helper = NULL;
        }
        return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
        unsigned int i;

        /* Need write lock here, to delete helper. */
        WRITE_LOCK(&ip_conntrack_lock);
        LIST_DELETE(&helpers, me);

        /* Get rid of expecteds, set helpers to NULL. */
        for (i = 0; i < ip_conntrack_htable_size; i++)
                LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
                            struct ip_conntrack_tuple_hash *, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        /* Someone could still be looking at the helper in a bh. */
        synchronize_net();
}

/* Refresh conntrack for this many jiffies. */
void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
{
        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

        /* If not in hash table, timer will not be active yet */
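        /* (The value stored below is relative; __ip_conntrack_confirm()
           adds jiffies when the entry enters the hash table.) */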
        if (!is_confirmed(ct))
                ct->timeout.expires = extra_jiffies;
        else {
                WRITE_LOCK(&ip_conntrack_lock);
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                }
                WRITE_UNLOCK(&ip_conntrack_lock);
        }
}

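/* ip_defrag() may queue or free the skb and return a different one, so
 * orphan the skb from its owning socket first and re-attach ownership
 * to whatever comes back; the sock_hold() keeps the socket alive
 * across the call. */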
/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
#ifdef CONFIG_NETFILTER_DEBUG
        unsigned int olddebug = skb->nf_debug;
#endif
        if (sk) {
                sock_hold(sk);
                skb_orphan(skb);
        }

        local_bh_disable();
        skb = ip_defrag(skb);
        local_bh_enable();

        if (!skb) {
                if (sk)
                        sock_put(sk);
                return skb;
        }

        if (sk) {
                skb_set_owner_w(skb, sk);
                sock_put(sk);
        }

        ip_send_check(skb->nh.iph);
        skb->nfcache |= NFC_ALTERED;
#ifdef CONFIG_NETFILTER_DEBUG
        /* Packet path as if nothing had happened. */
        skb->nf_debug = olddebug;
#endif
        return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* This ICMP is in reverse direction to the packet which
           caused it */
        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach new skbuff, and increment count */
        nskb->nfct = &ct->infos[ctinfo];
        atomic_inc(&ct->ct_general.use);
}

static inline int
do_kill(const struct ip_conntrack_tuple_hash *i,
        int (*kill)(const struct ip_conntrack *i, void *data),
        void *data)
{
        return kill(i->ctrack, data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
                void *data, unsigned int *bucket)
{
        struct ip_conntrack_tuple_hash *h = NULL;

        READ_LOCK(&ip_conntrack_lock);
        for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
                h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
                              struct ip_conntrack_tuple_hash *, kill, data);
        }
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

void
ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
                        void *data)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int bucket = 0;

        while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
                /* Time to push up daisies... */
                if (del_timer(&h->ctrack->timeout))
                        death_by_timeout((unsigned long)h->ctrack);
                /* ... else the timer will get him soon. */

                ip_conntrack_put(h->ctrack);
        }
}

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
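/* SO_ORIGINAL_DST: look up the connection by its reply tuple and hand
 * back the ORIGINAL direction's destination, i.e. the address the
 * client actually connected to before NAT rewrote it; this is what
 * transparent proxies use to recover the intended destination. */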
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
        struct inet_opt *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;

        IP_CT_TUPLE_U_BLANK(&tuple);
        tuple.src.ip = inet->rcv_saddr;
        tuple.src.u.tcp.port = inet->sport;
        tuple.dst.ip = inet->daddr;
        tuple.dst.u.tcp.port = inet->dport;
        tuple.dst.protonum = IPPROTO_TCP;

        /* We only do TCP at the moment: is there a better way? */
        if (strcmp(sk->sk_prot->name, "TCP")) {
                DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
                return -ENOPROTOOPT;
        }

        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
                       *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }

        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;

                sin.sin_family = AF_INET;
                sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
                sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;

                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
                ip_conntrack_put(h->ctrack);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
                        return 0;
        }
        DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
               NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
               NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
        return -ENOENT;
}

static struct nf_sockopt_ops so_getorigdst = {
        .pf             = PF_INET,
        .get_optmin     = SO_ORIGINAL_DST,
        .get_optmax     = SO_ORIGINAL_DST+1,
        .get            = &getorigdst,
};

static int kill_all(const struct ip_conntrack *i, void *data)
{
        return 1;
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
        ip_ct_attach = NULL;
        /* This makes sure all current packets have passed through
           netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

 i_see_dead_people:
        ip_ct_selective_cleanup(kill_all, NULL);
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }

        kmem_cache_destroy(ip_conntrack_cachep);
        vfree(ip_conntrack_hash);
        nf_unregister_sockopt(&so_getorigdst);
}

static int hashsize;
MODULE_PARM(hashsize, "i");

int __init ip_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: a 32MB
         * machine has 256 buckets; >= 1GB machines have 8192 buckets. */
        if (hashsize) {
                ip_conntrack_htable_size = hashsize;
        } else {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        ip_conntrack_htable_size = 8192;
                if (ip_conntrack_htable_size < 16)
                        ip_conntrack_htable_size = 16;
        }
        ip_conntrack_max = 8 * ip_conntrack_htable_size;

        printk("ip_conntrack version %s (%u buckets, %d max)"
               " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
               ip_conntrack_htable_size, ip_conntrack_max,
               sizeof(struct ip_conntrack));

        ret = nf_register_sockopt(&so_getorigdst);
        if (ret != 0) {
                printk(KERN_ERR "Unable to register netfilter socket option\n");
                return ret;
        }

        ip_conntrack_hash = vmalloc(sizeof(struct list_head)
                                    * ip_conntrack_htable_size);
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
        }

        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }
        /* Don't NEED lock here, but good form anyway. */
        WRITE_LOCK(&ip_conntrack_lock);
        /* Sew in builtin protocols. */
        list_append(&protocol_list, &ip_conntrack_protocol_tcp);
        list_append(&protocol_list, &ip_conntrack_protocol_udp);
        list_append(&protocol_list, &ip_conntrack_protocol_icmp);
        WRITE_UNLOCK(&ip_conntrack_lock);

        for (i = 0; i < ip_conntrack_htable_size; i++)
                INIT_LIST_HEAD(&ip_conntrack_hash[i]);

        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;

        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and make it look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
        /*  - and prepare the ctinfo field for REJECT & NAT. */
        ip_conntrack_untracked.infos[IP_CT_NEW].master =
        ip_conntrack_untracked.infos[IP_CT_RELATED].master =
        ip_conntrack_untracked.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master =
                        &ip_conntrack_untracked.ct_general;

        return ret;

err_free_hash:
        vfree(ip_conntrack_hash);
err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);

        return -ENOMEM;
}