VServer 1.9.2 (patch-2.6.8.1-vs1.9.2.diff)
linux-2.6.git: net/ipv4/netfilter/ip_conntrack_core.c
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *      - new API and handling of conntrack/nat helpers
 *      - now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *      - add usage/reference counts to ip_conntrack_expect
 *      - export ip_conntrack[_expect]_{find_get,put} functions
 * */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
/* For ERR_PTR().  Yeah, I know... --RR */
#include <linux/fs.h>

/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, and conntrack timers. */
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION    "2.1"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DECLARE_RWLOCK(ip_conntrack_lock);
DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
LIST_HEAD(protocol_list);
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
struct ip_conntrack ip_conntrack_untracked;

extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;

static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
                              u_int8_t protocol)
{
        return protocol == curr->proto;
}

struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        p = LIST_FIND(&protocol_list, proto_cmpfn,
                      struct ip_conntrack_protocol *, protocol);
        if (!p)
                p = &ip_conntrack_generic_protocol;

        return p;
}

struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        READ_LOCK(&ip_conntrack_lock);
        p = __ip_ct_find_proto(protocol);
        READ_UNLOCK(&ip_conntrack_lock);
        return p;
}

inline void
ip_conntrack_put(struct ip_conntrack *ct)
{
        IP_NF_ASSERT(ct);
        IP_NF_ASSERT(ct->infos[0].master);
        /* nf_conntrack_put wants to go via an info struct, so feed it
           one at random. */
        nf_conntrack_put(&ct->infos[0]);
}

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
#if 0
        dump_tuple(tuple);
#endif
        return (jhash_3words(tuple->src.ip,
                             (tuple->dst.ip ^ tuple->dst.protonum),
                             (tuple->src.u.all | (tuple->dst.u.all << 16)),
                             ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}
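
/* Note on the hash above: jhash_3words() mixes both addresses, the
 * protocol number and both port-like fields, keyed with the boot-time
 * random value ip_conntrack_hash_rnd so that remote hosts cannot
 * predict (and deliberately collide) hash chains.  A tuple and its
 * reply hash independently of each other, which is why
 * __ip_conntrack_confirm() below inserts each direction into its own
 * bucket. */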

int
get_tuple(const struct iphdr *iph,
          const struct sk_buff *skb,
          unsigned int dataoff,
          struct ip_conntrack_tuple *tuple,
          const struct ip_conntrack_protocol *protocol)
{
        /* Should never happen */
        if (iph->frag_off & htons(IP_OFFSET)) {
                printk("ip_conntrack_core: Frag of proto %u.\n",
                       iph->protocol);
                return 0;
        }

        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;

        return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

static int
invert_tuple(struct ip_conntrack_tuple *inverse,
             const struct ip_conntrack_tuple *orig,
             const struct ip_conntrack_protocol *protocol)
{
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;

        return protocol->invert_tuple(inverse, orig);
}
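
/* Worked example (illustrative values): for an original TCP tuple
 * 192.168.0.1:1025 -> 10.0.0.1:80, get_tuple() fills in the addresses
 * and protocol number from the IP header, and invert_tuple() yields
 * 10.0.0.1:80 -> 192.168.0.1:1025; the protocol's invert_tuple
 * callback swaps the protocol-specific parts (here, the TCP ports). */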

/* ip_conntrack_expect helper functions */

/* Compare tuple parts depending on mask. */
static inline int expect_cmp(const struct ip_conntrack_expect *i,
                             const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
}

static void
destroy_expect(struct ip_conntrack_expect *exp)
{
        DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
        IP_NF_ASSERT(atomic_read(&exp->use) == 0);
        IP_NF_ASSERT(!timer_pending(&exp->timeout));

        kfree(exp);
}

inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
        IP_NF_ASSERT(exp);

        if (atomic_dec_and_test(&exp->use)) {
                /* usage count dropped to zero */
                destroy_expect(exp);
        }
}

static inline struct ip_conntrack_expect *
__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                         struct ip_conntrack_expect *, tuple);
}

/* Find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *exp;

        READ_LOCK(&ip_conntrack_lock);
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        exp = __ip_ct_expect_find(tuple);
        if (exp)
                atomic_inc(&exp->use);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
        READ_UNLOCK(&ip_conntrack_lock);

        return exp;
}

/* Remove one specific expectation from all lists and drop the refcount;
 * does _NOT_ delete the timer. */
static void __unexpect_related(struct ip_conntrack_expect *expect)
{
        DEBUGP("unexpect_related(%p)\n", expect);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        /* we're not allowed to unexpect a confirmed expectation! */
        IP_NF_ASSERT(!expect->sibling);

        /* delete from global and local lists */
        list_del(&expect->list);
        list_del(&expect->expected_list);

        /* decrement expect-count of master conntrack */
        if (expect->expectant)
                expect->expectant->expecting--;

        ip_conntrack_expect_put(expect);
}

/* Remove one specific expectation from all lists, drop the refcount
 * and expire the timer.
 * This function can _NOT_ be called for confirmed expects! */
static void unexpect_related(struct ip_conntrack_expect *expect)
{
        IP_NF_ASSERT(expect->expectant);
        IP_NF_ASSERT(expect->expectant->helper);
        /* if we are supposed to have a timer, but we can't delete
         * it: race condition.  __unexpect_related will
         * be called by the timeout function */
        if (expect->expectant->helper->timeout
            && !del_timer(&expect->timeout))
                return;

        __unexpect_related(expect);
}

/* delete all unconfirmed expectations for this conntrack */
static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
{
        struct list_head *exp_entry, *next;
        struct ip_conntrack_expect *exp;

        DEBUGP("remove_expectations(%p)\n", ct);

        list_for_each_safe(exp_entry, next, &ct->sibling_list) {
                exp = list_entry(exp_entry, struct ip_conntrack_expect,
                                 expected_list);

                /* we skip established expectations, as we want to delete
                 * the un-established ones only */
                if (exp->sibling) {
                        DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
                        if (drop_refcount) {
                                /* Indicate that this expectation's parent is dead */
                                ip_conntrack_put(exp->expectant);
                                exp->expectant = NULL;
                        }
                        continue;
                }

                IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
                IP_NF_ASSERT(exp->expectant == ct);

                /* delete expectation from global and private lists */
                unexpect_related(exp);
        }
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
        unsigned int ho, hr;

        DEBUGP("clean_from_lists(%p)\n", ct);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
        LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

        /* Destroy all un-established, pending expectations */
        remove_expectations(ct, 1);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
        struct ip_conntrack_protocol *proto;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
        proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);

        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* Make sure we don't leave any orphaned expectations lying around */
        if (ct->expecting)
                remove_expectations(ct, 1);

        /* Delete our master expectation */
        if (ct->master) {
                if (ct->master->expectant) {
                        /* can't call __unexpect_related here,
                         * since it would screw up expect_list */
                        list_del(&ct->master->expected_list);
                        master = ct->master->expectant;
                }
                kfree(ct->master);
        }
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (master)
                ip_conntrack_put(master);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        kmem_cache_free(ip_conntrack_cachep, ct);
        atomic_dec(&ip_conntrack_count);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
        struct ip_conntrack *ct = (void *)ul_conntrack;

        WRITE_LOCK(&ip_conntrack_lock);
        clean_from_lists(ct);
        WRITE_UNLOCK(&ip_conntrack_lock);
        ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
                    const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        return i->ctrack != ignored_conntrack
                && ip_ct_tuple_equal(tuple, &i->tuple);
}

static struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int hash = hash_conntrack(tuple);

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        h = LIST_FIND(&ip_conntrack_hash[hash],
                      conntrack_tuple_cmp,
                      struct ip_conntrack_tuple_hash *,
                      tuple, ignored_conntrack);
        return h;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
                      const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

static inline struct ip_conntrack *
__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack *ct
                = (struct ip_conntrack *)nfct->master;

        /* ctinfo is the index of the nfct inside the conntrack */
        *ctinfo = nfct - ct->infos;
        IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
        return ct;
}

/* Return conntrack and conntrack_info given skb->nfct->master */
struct ip_conntrack *
ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
        if (skb->nfct)
                return __ip_conntrack_get(skb->nfct, ctinfo);
        return NULL;
}

/* Confirm a connection given skb->nfct; places it in hash table */
int
__ip_conntrack_confirm(struct nf_ct_info *nfct)
{
        unsigned int hash, repl_hash;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in the other direction.  The actual
           packet which created the connection will be IP_CT_NEW or,
           for an expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost the race. */
        if (!LIST_FIND(&ip_conntrack_hash[hash],
                       conntrack_tuple_cmp,
                       struct ip_conntrack_tuple_hash *,
                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
            && !LIST_FIND(&ip_conntrack_hash[repl_hash],
                          conntrack_tuple_cmp,
                          struct ip_conntrack_tuple_hash *,
                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
                list_prepend(&ip_conntrack_hash[hash],
                             &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
                list_prepend(&ip_conntrack_hash[repl_hash],
                             &ct->tuplehash[IP_CT_DIR_REPLY]);
                /* Timer relative to confirmation time, not original
                   setting time, otherwise we'd get timer wrap in
                   weird delay cases. */
                ct->timeout.expires += jiffies;
                add_timer(&ct->timeout);
                atomic_inc(&ct->ct_general.use);
                set_bit(IPS_CONFIRMED_BIT, &ct->status);
                WRITE_UNLOCK(&ip_conntrack_lock);
                return NF_ACCEPT;
        }

        WRITE_UNLOCK(&ip_conntrack_lock);
        return NF_DROP;
}
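
/* A conntrack thus lives in two phases: init_conntrack() below creates
 * it unconfirmed while the first packet is still traversing the hooks,
 * and only here, once that packet has survived all of them, does it
 * enter the hash table and start its timer.  Losing the race against
 * an identical tuple that was confirmed first means NF_DROP. */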

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        READ_UNLOCK(&ip_conntrack_lock);

        return h != NULL;
}

/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
struct ip_conntrack *
icmp_error_track(struct sk_buff *skb,
                 enum ip_conntrack_info *ctinfo,
                 unsigned int hooknum)
{
        struct ip_conntrack_tuple innertuple, origtuple;
        struct {
                struct icmphdr icmp;
                struct iphdr ip;
        } inside;
        struct ip_conntrack_protocol *innerproto;
        struct ip_conntrack_tuple_hash *h;
        int dataoff;

        IP_NF_ASSERT(skb->nfct == NULL);

        /* Not enough header? */
        if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
                return NULL;

        if (inside.icmp.type != ICMP_DEST_UNREACH
            && inside.icmp.type != ICMP_SOURCE_QUENCH
            && inside.icmp.type != ICMP_TIME_EXCEEDED
            && inside.icmp.type != ICMP_PARAMETERPROB
            && inside.icmp.type != ICMP_REDIRECT)
                return NULL;

        /* Ignore ICMPs containing fragments (shouldn't happen) */
        if (inside.ip.frag_off & htons(IP_OFFSET)) {
                DEBUGP("icmp_error_track: fragment of proto %u\n",
                       inside.ip.protocol);
                return NULL;
        }

        innerproto = ip_ct_find_proto(inside.ip.protocol);
        dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
        /* Are they talking about one of our connections? */
        if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
                DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol);
                return NULL;
        }

        /* Ordinarily, we'd expect the inverted tupleproto, but it's
           been preserved inside the ICMP. */
        if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
                DEBUGP("icmp_error_track: Can't invert tuple\n");
                return NULL;
        }

        *ctinfo = IP_CT_RELATED;

        h = ip_conntrack_find_get(&innertuple, NULL);
        if (!h) {
                /* Locally generated ICMPs will match inverted if they
                   haven't been SNAT'ed yet */
                /* FIXME: NAT code has to handle half-done double NAT --RR */
                if (hooknum == NF_IP_LOCAL_OUT)
                        h = ip_conntrack_find_get(&origtuple, NULL);

                if (!h) {
                        DEBUGP("icmp_error_track: no match\n");
                        return NULL;
                }
                /* Reverse direction from that found */
                if (DIRECTION(h) != IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        } else {
                if (DIRECTION(h) == IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        }

        /* Update skb to refer to this connection */
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
        return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
}

static int early_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        int dropped = 0;

        READ_LOCK(&ip_conntrack_lock);
        h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        if (!h)
                return dropped;

        if (del_timer(&h->ctrack->timeout)) {
                death_by_timeout((unsigned long)h->ctrack);
                dropped = 1;
        }
        ip_conntrack_put(h->ctrack);
        return dropped;
}

static inline int helper_cmp(const struct ip_conntrack_helper *i,
                             const struct ip_conntrack_tuple *rtuple)
{
        return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
{
        return LIST_FIND(&helpers, helper_cmp,
                         struct ip_conntrack_helper *,
                         tuple);
}
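
/* A minimal sketch of the helper side (illustrative only, loosely
 * modelled on the FTP helper; the exact field values here are
 * assumptions, not copied from any real helper):
 *
 *      static struct ip_conntrack_helper example = {
 *              .name         = "example",
 *              .max_expected = 1,
 *              .timeout      = 5 * 60,  // expectation timeout, seconds
 *              .tuple        = { .src = { .u = { .tcp = { .port =
 *                                        __constant_htons(21) } } },
 *                                .dst = { .protonum = IPPROTO_TCP } },
 *              .mask         = { .src = { .u = { .tcp = { .port = 0xFFFF } } },
 *                                .dst = { .protonum = 0xFF } },
 *              .help         = example_help,  // parses the payload
 *      };
 *
 *      ip_conntrack_helper_register(&example);
 *
 * The tuple/mask pair is matched against the reply tuple of each newly
 * created connection (see init_conntrack() below). */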

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
               struct ip_conntrack_protocol *protocol,
               struct sk_buff *skb)
{
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        size_t hash;
        struct ip_conntrack_expect *expected;
        int i;
        static unsigned int drop_next;

        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
                ip_conntrack_hash_rnd_initted = 1;
        }

        hash = hash_conntrack(tuple);

        if (ip_conntrack_max &&
            atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
                /* Try dropping from a random chain, or else from the
                   chain we're about to insert into (in case they're
                   trying to bomb one hash chain). */
                unsigned int next = (drop_next++)%ip_conntrack_htable_size;

                if (!early_drop(&ip_conntrack_hash[next])
                    && !early_drop(&ip_conntrack_hash[hash])) {
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        if (!invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }

        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
                return ERR_PTR(-ENOMEM);
        }

        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
        conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
        for (i=0; i < IP_CT_NUMBER; i++)
                conntrack->infos[i].master = &conntrack->ct_general;

        if (!protocol->new(conntrack, skb)) {
                kmem_cache_free(ip_conntrack_cachep, conntrack);
                return NULL;
        }
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;

        INIT_LIST_HEAD(&conntrack->sibling_list);

        WRITE_LOCK(&ip_conntrack_lock);
        /* Need finding and deleting of expected ONLY if we win race */
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                             struct ip_conntrack_expect *, tuple);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);

        /* If the master is not in the hash table yet (ie. the packet
           hasn't left this machine yet), how can the other end know
           about the expected?  Hence these are not the droids you are
           looking for (if the master ct never got confirmed, we'd hold
           a reference to it and weird things would happen to future
           packets). */
        if (expected && !is_confirmed(expected->expectant))
                expected = NULL;

        /* Look up the conntrack helper for master connections only */
        if (!expected)
                conntrack->helper = ip_ct_find_helper(&repl_tuple);

        /* If the expectation is dying, then this is a loser. */
        if (expected
            && expected->expectant->helper->timeout
            && ! del_timer(&expected->timeout))
                expected = NULL;

        if (expected) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                        conntrack, expected);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
                conntrack->master = expected;
                expected->sibling = conntrack;
                LIST_DELETE(&ip_conntrack_expect_list, expected);
                expected->expectant->expecting--;
                nf_conntrack_get(&master_ct(conntrack)->infos[0]);
        }
        atomic_inc(&ip_conntrack_count);
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (expected && expected->expectfn)
                expected->expectfn(conntrack);
        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}
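
/* Return contract, for reference: NULL means the packet really is
 * unclassifiable, ERR_PTR(-ENOMEM) means we were too stressed to track
 * it, and otherwise the ORIGINAL-direction tuplehash is returned.
 * resolve_normal_ct() below distinguishes exactly these three cases. */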

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
                  struct ip_conntrack_protocol *proto,
                  int *set_reply,
                  unsigned int hooknum,
                  enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;

        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

        if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
                return NULL;

        /* look for tuple match */
        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, proto, skb);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }

        /* It exists; we have (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Please set reply bit if this packet OK */
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
                if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}
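
/* The resulting ctinfo values, summarized:
 *
 *      reply direction, any state    -> IP_CT_ESTABLISHED + IP_CT_IS_REPLY
 *      original, IPS_SEEN_REPLY set  -> IP_CT_ESTABLISHED
 *      original, IPS_EXPECTED set    -> IP_CT_RELATED
 *      original, otherwise           -> IP_CT_NEW
 */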

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
                             struct sk_buff **pskb,
                             const struct net_device *in,
                             const struct net_device *out,
                             int (*okfn)(struct sk_buff *))
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
        int set_reply;
        int ret;

        /* Should never happen */
        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit()) {
                        printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
                               (*pskb)->nh.iph->protocol, hooknum);
                }
                return NF_DROP;
        }

        /* FIXME: Do this right please. --RR */
        (*pskb)->nfcache |= NFC_UNKNOWN;

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
        /* Ignore broadcast: no `connection'. */
        if ((*pskb)->pkt_type == PACKET_BROADCAST) {
                printk("Broadcast packet!\n");
                return NF_ACCEPT;
        } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
                   == htonl(0x000000FF)) {
                printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
                       NIPQUAD((*pskb)->nh.iph->saddr),
                       NIPQUAD((*pskb)->nh.iph->daddr),
                       (*pskb)->sk, (*pskb)->pkt_type);
        }
#endif

        /* Previously seen (loopback or untracked)?  Ignore. */
        if ((*pskb)->nfct)
                return NF_ACCEPT;

        proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);

        /* It may be an icmp error... */
        if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
            && icmp_error_track(*pskb, &ctinfo, hooknum))
                return NF_ACCEPT;

        if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum,
                                     &ctinfo)))
                /* Not a valid part of a connection */
                return NF_ACCEPT;

        if (IS_ERR(ct))
                /* Too stressed to deal. */
                return NF_DROP;

        IP_NF_ASSERT((*pskb)->nfct);

        ret = proto->packet(ct, *pskb, ctinfo);
        if (ret == -1) {
                /* Invalid */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                return NF_ACCEPT;
        }

        if (ret != NF_DROP && ct->helper) {
                ret = ct->helper->help(*pskb, ct, ctinfo);
                if (ret == -1) {
                        /* Invalid */
                        nf_conntrack_put((*pskb)->nfct);
                        (*pskb)->nfct = NULL;
                        return NF_ACCEPT;
                }
        }
        if (set_reply)
                set_bit(IPS_SEEN_REPLY_BIT, &ct->status);

        return ret;
}
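
/* This hook is actually wired up elsewhere (ip_conntrack_standalone.c
 * in this tree); a minimal sketch of such a registration, with
 * illustrative values:
 *
 *      static struct nf_hook_ops ip_conntrack_in_ops = {
 *              .hook     = ip_conntrack_in,
 *              .owner    = THIS_MODULE,
 *              .pf       = PF_INET,
 *              .hooknum  = NF_IP_PRE_ROUTING,
 *              .priority = NF_IP_PRI_CONNTRACK,
 *      };
 *
 *      nf_register_hook(&ip_conntrack_in_ops);
 */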

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
{
        return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
}

static inline int resent_expect(const struct ip_conntrack_expect *i,
                                const struct ip_conntrack_tuple *tuple,
                                const struct ip_conntrack_tuple *mask)
{
        DEBUGP("resent_expect\n");
        DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
        DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
        DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
        return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
                 || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
                && ip_ct_tuple_equal(&i->mask, mask));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *i,
                               const struct ip_conntrack_tuple *tuple,
                               const struct ip_conntrack_tuple *mask)
{
        /* Part covered by intersection of masks must be unequal,
           otherwise they clash */
        struct ip_conntrack_tuple intersect_mask
                = { { i->mask.src.ip & mask->src.ip,
                      { i->mask.src.u.all & mask->src.u.all } },
                    { i->mask.dst.ip & mask->dst.ip,
                      { i->mask.dst.u.all & mask->dst.u.all },
                      i->mask.dst.protonum & mask->dst.protonum } };

        return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
}

inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
{
        WRITE_LOCK(&ip_conntrack_lock);
        unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

static void expectation_timed_out(unsigned long ul_expect)
{
        struct ip_conntrack_expect *expect = (void *) ul_expect;

        DEBUGP("expectation %p timed out\n", expect);
        WRITE_LOCK(&ip_conntrack_lock);
        __unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

struct ip_conntrack_expect *
ip_conntrack_expect_alloc(void)
{
        struct ip_conntrack_expect *new;

        new = (struct ip_conntrack_expect *)
                kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }

        /* tuple_cmp compares the whole union, so we have to initialize
           it cleanly */
        memset(new, 0, sizeof(struct ip_conntrack_expect));

        return new;
}

static void
ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
                           struct ip_conntrack *related_to)
{
        DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
        new->expectant = related_to;
        new->sibling = NULL;
        atomic_set(&new->use, 1);

        /* add to expected list for this connection */
        list_add_tail(&new->expected_list, &related_to->sibling_list);
        /* add to global list of expectations */
        list_prepend(&ip_conntrack_expect_list, &new->list);
        /* add and start timer if required */
        if (related_to->helper->timeout) {
                init_timer(&new->timeout);
                new->timeout.data = (unsigned long)new;
                new->timeout.function = expectation_timed_out;
                new->timeout.expires = jiffies +
                                        related_to->helper->timeout * HZ;
                add_timer(&new->timeout);
        }
        related_to->expecting++;
}

/* Add a related connection. */
int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
                                struct ip_conntrack *related_to)
{
        struct ip_conntrack_expect *old;
        int ret = 0;

        WRITE_LOCK(&ip_conntrack_lock);
        /* Because of the write lock, no reader can walk the lists,
         * so there is no need to use the tuple lock too */

        DEBUGP("ip_conntrack_expect_related %p\n", related_to);
        DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

        old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
                        struct ip_conntrack_expect *, &expect->tuple,
                        &expect->mask);
        if (old) {
                /* Helper private data may contain offsets but no pointers
                   pointing into the payload - otherwise we would have to
                   copy the data filled out by the helper over the old one */
                DEBUGP("expect_related: resent packet\n");
                if (related_to->helper->timeout) {
                        if (!del_timer(&old->timeout)) {
                                /* expectation is dying. Fall through */
                                goto out;
                        } else {
                                old->timeout.expires = jiffies +
                                        related_to->helper->timeout * HZ;
                                add_timer(&old->timeout);
                        }
                }

                WRITE_UNLOCK(&ip_conntrack_lock);
                kfree(expect);
                return -EEXIST;

        } else if (related_to->helper->max_expected &&
                   related_to->expecting >= related_to->helper->max_expected) {
                /* old == NULL */
                if (!(related_to->helper->flags &
                      IP_CT_HELPER_F_REUSE_EXPECT)) {
                        WRITE_UNLOCK(&ip_conntrack_lock);
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: max number of expected "
                                       "connections %i of %s reached for "
                                       "%u.%u.%u.%u->%u.%u.%u.%u\n",
                                       related_to->helper->max_expected,
                                       related_to->helper->name,
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
                        kfree(expect);
                        return -EPERM;
                }
                DEBUGP("ip_conntrack: max number of expected "
                       "connections %i of %s reached for "
                       "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
                       related_to->helper->max_expected,
                       related_to->helper->name,
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));

                /* choose the oldest expectation to evict */
                list_for_each_entry(old, &related_to->sibling_list,
                                    expected_list)
                        if (old->sibling == NULL)
                                break;

                /* We cannot fail since related_to->expecting is the number
                 * of unconfirmed expectations */
                IP_NF_ASSERT(old && old->sibling == NULL);

                /* newnat14 does not reuse the real allocated memory
                 * structures but rather unexpects the old and
                 * allocates a new.  unexpect_related will decrement
                 * related_to->expecting.
                 */
                unexpect_related(old);
                ret = -EPERM;
        } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                             struct ip_conntrack_expect *, &expect->tuple,
                             &expect->mask)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                DEBUGP("expect_related: busy!\n");

                kfree(expect);
                return -EBUSY;
        }

out:    ip_conntrack_expect_insert(expect, related_to);

        WRITE_UNLOCK(&ip_conntrack_lock);

        return ret;
}
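
/* A minimal sketch of how a helper's help() callback uses this API
 * after parsing, say, a port out of the payload (names and values are
 * illustrative):
 *
 *      struct ip_conntrack_expect *exp = ip_conntrack_expect_alloc();
 *      if (exp == NULL)
 *              return NF_DROP;
 *      exp->tuple = ((struct ip_conntrack_tuple)
 *              { { ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip, { 0 } },
 *                { ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip,
 *                  { .tcp = { port } }, IPPROTO_TCP } });
 *      exp->mask = ((struct ip_conntrack_tuple)
 *              { { 0xFFFFFFFF, { 0 } },
 *                { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF } });
 *      exp->expectfn = NULL;
 *      ret = ip_conntrack_expect_related(exp, ct);
 *
 * Note that on -EEXIST and -EBUSY the expectation has already been
 * freed for the caller. */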

/* Change tuple in an existing expectation */
int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
                               struct ip_conntrack_tuple *newtuple)
{
        int ret;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        WRITE_LOCK(&ip_conntrack_expect_tuple_lock);

        DEBUGP("change_expect:\n");
        DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("exp mask:  "); DUMP_TUPLE(&expect->mask);
        DEBUGP("newtuple:  "); DUMP_TUPLE(newtuple);
        if (expect->ct_tuple.dst.protonum == 0) {
                /* Never seen before */
                DEBUGP("change expect: never seen before\n");
                if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
                    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                                 struct ip_conntrack_expect *, newtuple, &expect->mask)) {
                        /* Force NAT to find an unused tuple */
                        ret = -1;
                } else {
                        memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
                        memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
                        ret = 0;
                }
        } else {
                /* Resent packet */
                DEBUGP("change expect: resent packet\n");
                if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
                        ret = 0;
                } else {
                        /* Force NAT to choose again the same port */
                        ret = -1;
                }
        }
        WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);

        return ret;
}

/* Alter reply tuple (maybe alter helper).  If it's already taken,
   return 0 and don't do alteration. */
int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
                             const struct ip_conntrack_tuple *newreply)
{
        WRITE_LOCK(&ip_conntrack_lock);
        if (__ip_conntrack_find(newreply, conntrack)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                return 0;
        }
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));

        DEBUGP("Altering reply tuple of %p to ", conntrack);
        DUMP_TUPLE(newreply);

        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (!conntrack->master && list_empty(&conntrack->sibling_list))
                conntrack->helper = ip_ct_find_helper(newreply);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 1;
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
        WRITE_LOCK(&ip_conntrack_lock);
        list_prepend(&helpers, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 0;
}

static inline int unhelp(struct ip_conntrack_tuple_hash *i,
                         const struct ip_conntrack_helper *me)
{
        if (i->ctrack->helper == me) {
                /* Get rid of any expected. */
                remove_expectations(i->ctrack, 0);
                /* And *then* set helper to NULL */
                i->ctrack->helper = NULL;
        }
        return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
        unsigned int i;

        /* Need write lock here, to delete helper. */
        WRITE_LOCK(&ip_conntrack_lock);
        LIST_DELETE(&helpers, me);

        /* Get rid of expecteds, set helpers to NULL. */
        for (i = 0; i < ip_conntrack_htable_size; i++)
                LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
                            struct ip_conntrack_tuple_hash *, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        /* Someone could still be looking at the helper in a bh. */
        synchronize_net();
}

/* Refresh conntrack for this many jiffies. */
void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
{
        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

        /* If not in hash table, timer will not be active yet */
        if (!is_confirmed(ct))
                ct->timeout.expires = extra_jiffies;
        else {
                WRITE_LOCK(&ip_conntrack_lock);
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                }
                WRITE_UNLOCK(&ip_conntrack_lock);
        }
}

/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
#ifdef CONFIG_NETFILTER_DEBUG
        unsigned int olddebug = skb->nf_debug;
#endif
        if (sk) {
                sock_hold(sk);
                skb_orphan(skb);
        }

        local_bh_disable();
        skb = ip_defrag(skb);
        local_bh_enable();

        if (!skb) {
                if (sk)
                        sock_put(sk);
                return skb;
        }

        if (sk) {
                skb_set_owner_w(skb, sk);
                sock_put(sk);
        }

        ip_send_check(skb->nh.iph);
        skb->nfcache |= NFC_ALTERED;
#ifdef CONFIG_NETFILTER_DEBUG
        /* Packet path as if nothing had happened. */
        skb->nf_debug = olddebug;
#endif
        return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* This ICMP is in reverse direction to the packet which
           caused it */
        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach new skbuff, and increment count */
        nskb->nfct = &ct->infos[ctinfo];
        atomic_inc(&ct->ct_general.use);
}

static inline int
do_kill(const struct ip_conntrack_tuple_hash *i,
        int (*kill)(const struct ip_conntrack *i, void *data),
        void *data)
{
        return kill(i->ctrack, data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
                void *data, unsigned int *bucket)
{
        struct ip_conntrack_tuple_hash *h = NULL;

        READ_LOCK(&ip_conntrack_lock);
        for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
                h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
                              struct ip_conntrack_tuple_hash *, kill, data);
        }
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

void
ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
                        void *data)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int bucket = 0;

        while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
                /* Time to push up daisies... */
                if (del_timer(&h->ctrack->timeout))
                        death_by_timeout((unsigned long)h->ctrack);
                /* ... else the timer will get him soon. */

                ip_conntrack_put(h->ctrack);
        }
}

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
        struct inet_opt *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;

        IP_CT_TUPLE_U_BLANK(&tuple);
        tuple.src.ip = inet->rcv_saddr;
        tuple.src.u.tcp.port = inet->sport;
        tuple.dst.ip = inet->daddr;
        tuple.dst.u.tcp.port = inet->dport;
        tuple.dst.protonum = IPPROTO_TCP;

        /* We only do TCP at the moment: is there a better way? */
        if (strcmp(sk->sk_prot->name, "TCP")) {
                DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
                return -ENOPROTOOPT;
        }

        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
                       *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }

        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;

                sin.sin_family = AF_INET;
                sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
                sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;

                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
                ip_conntrack_put(h->ctrack);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
                        return 0;
        }
        DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
               NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
               NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
        return -ENOENT;
}
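
/* Userspace side, for reference: a transparent proxy recovers the
 * pre-NAT destination of an accepted TCP connection roughly like this
 * (illustrative sketch, error handling omitted):
 *
 *      struct sockaddr_in orig;
 *      socklen_t len = sizeof(orig);
 *
 *      if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) == 0)
 *              printf("%s:%u\n", inet_ntoa(orig.sin_addr),
 *                     ntohs(orig.sin_port));
 */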

static struct nf_sockopt_ops so_getorigdst = {
        .pf             = PF_INET,
        .get_optmin     = SO_ORIGINAL_DST,
        .get_optmax     = SO_ORIGINAL_DST+1,
        .get            = &getorigdst,
};

static int kill_all(const struct ip_conntrack *i, void *data)
{
        return 1;
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
        ip_ct_attach = NULL;
        /* This makes sure all current packets have passed through
           netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

 i_see_dead_people:
        ip_ct_selective_cleanup(kill_all, NULL);
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }

        kmem_cache_destroy(ip_conntrack_cachep);
        vfree(ip_conntrack_hash);
        nf_unregister_sockopt(&so_getorigdst);
}

static int hashsize;
MODULE_PARM(hashsize, "i");
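
/* The table size can thus be overridden at module load time, e.g.
 * (illustrative):
 *
 *      modprobe ip_conntrack hashsize=4096
 */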

int __init ip_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
         * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
        if (hashsize) {
                ip_conntrack_htable_size = hashsize;
        } else {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        ip_conntrack_htable_size = 8192;
                if (ip_conntrack_htable_size < 16)
                        ip_conntrack_htable_size = 16;
        }
        ip_conntrack_max = 8 * ip_conntrack_htable_size;
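
        /* Worked example for the sizing above (assuming i386 with an
         * 8-byte struct list_head): a 32MB machine gives
         * (32MB / 16384) / 8 = 256 buckets, and then
         * ip_conntrack_max = 8 * 256 = 2048 tracked connections. */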

        printk("ip_conntrack version %s (%u buckets, %d max)"
               " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
               ip_conntrack_htable_size, ip_conntrack_max,
               sizeof(struct ip_conntrack));

        ret = nf_register_sockopt(&so_getorigdst);
        if (ret != 0) {
                printk(KERN_ERR "Unable to register netfilter socket option\n");
                return ret;
        }

        ip_conntrack_hash = vmalloc(sizeof(struct list_head)
                                    * ip_conntrack_htable_size);
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
        }

        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }
        /* Don't NEED lock here, but good form anyway. */
        WRITE_LOCK(&ip_conntrack_lock);
        /* Sew in builtin protocols. */
        list_append(&protocol_list, &ip_conntrack_protocol_tcp);
        list_append(&protocol_list, &ip_conntrack_protocol_udp);
        list_append(&protocol_list, &ip_conntrack_protocol_icmp);
        WRITE_UNLOCK(&ip_conntrack_lock);

        for (i = 0; i < ip_conntrack_htable_size; i++)
                INIT_LIST_HEAD(&ip_conntrack_hash[i]);

        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;

        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and make it look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
        /*  - and prepare the ctinfo field for REJECT & NAT. */
        ip_conntrack_untracked.infos[IP_CT_NEW].master =
        ip_conntrack_untracked.infos[IP_CT_RELATED].master =
        ip_conntrack_untracked.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master =
                        &ip_conntrack_untracked.ct_general;

        return ret;

err_free_hash:
        vfree(ip_conntrack_hash);
err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);

        return -ENOMEM;
}