/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *      - new API and handling of conntrack/nat helpers
 *      - now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *      - add usage/reference counts to ip_conntrack_expect
 *      - export ip_conntrack[_expect]_{find_get,put} functions
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
/* For ERR_PTR().  Yeah, I know... --RR */
#include <linux/fs.h>

/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, and conntrack timers. */
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION    "2.1"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DECLARE_RWLOCK(ip_conntrack_lock);
DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
LIST_HEAD(protocol_list);
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
struct ip_conntrack ip_conntrack_untracked;

extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;

static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
                              u_int8_t protocol)
{
        return protocol == curr->proto;
}

struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        p = LIST_FIND(&protocol_list, proto_cmpfn,
                      struct ip_conntrack_protocol *, protocol);
        if (!p)
                p = &ip_conntrack_generic_protocol;

        return p;
}

struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        READ_LOCK(&ip_conntrack_lock);
        p = __ip_ct_find_proto(protocol);
        READ_UNLOCK(&ip_conntrack_lock);
        return p;
}

inline void
ip_conntrack_put(struct ip_conntrack *ct)
{
        IP_NF_ASSERT(ct);
        IP_NF_ASSERT(ct->infos[0].master);
        /* nf_conntrack_put wants to go via an info struct, so feed it
           one at random. */
        nf_conntrack_put(&ct->infos[0]);
}

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
#if 0
        dump_tuple(tuple);
#endif
        return (jhash_3words(tuple->src.ip,
                             (tuple->dst.ip ^ tuple->dst.protonum),
                             (tuple->src.u.all | (tuple->dst.u.all << 16)),
                             ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}
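
/* Illustrative sketch (not built): how the function above would hash the
 * tuple of a TCP connection 10.0.0.1:1025 -> 10.0.0.2:80.  Field names
 * follow ip_conntrack_tuple.h; the modulo keeps the result inside the
 * table, and the random seed makes bucket choice unpredictable to
 * attackers trying to flood a single chain. */
#if 0
static void example_hash_usage(void)
{
        struct ip_conntrack_tuple t;

        memset(&t, 0, sizeof(t));
        t.src.ip = htonl(0x0A000001);           /* 10.0.0.1 */
        t.dst.ip = htonl(0x0A000002);           /* 10.0.0.2 */
        t.src.u.tcp.port = htons(1025);
        t.dst.u.tcp.port = htons(80);
        t.dst.protonum = IPPROTO_TCP;

        /* Identical tuples always land in the same bucket. */
        printk("bucket=%u\n", hash_conntrack(&t));
}
#endif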

int
get_tuple(const struct iphdr *iph,
          const struct sk_buff *skb,
          unsigned int dataoff,
          struct ip_conntrack_tuple *tuple,
          const struct ip_conntrack_protocol *protocol)
{
        /* Should never happen: fragments are reassembled before we get here. */
        if (iph->frag_off & htons(IP_OFFSET)) {
                printk("ip_conntrack_core: Frag of proto %u.\n",
                       iph->protocol);
                return 0;
        }

        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;
        tuple->src.u.all = tuple->dst.u.all = 0;

        return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

static int
invert_tuple(struct ip_conntrack_tuple *inverse,
             const struct ip_conntrack_tuple *orig,
             const struct ip_conntrack_protocol *protocol)
{
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;

        inverse->src.u.all = inverse->dst.u.all = 0;

        return protocol->invert_tuple(inverse, orig);
}
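
/* Worked example (sketch only, not built): inverting the original tuple
 * 10.0.0.1:1025 -> 10.0.0.2:80 yields the reply direction
 * 10.0.0.2:80 -> 10.0.0.1:1025.  Addresses and protocol number are
 * swapped here; the per-protocol ->invert_tuple() swaps the ports. */
#if 0
        struct ip_conntrack_tuple reply;

        if (invert_tuple(&reply, &orig, ip_ct_find_proto(IPPROTO_TCP))) {
                /* reply.src.ip == orig.dst.ip,
                   reply.dst.u.tcp.port == orig.src.u.tcp.port */
        }
#endif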


/* ip_conntrack_expect helper functions */

/* Compare tuple parts depending on mask. */
static inline int expect_cmp(const struct ip_conntrack_expect *i,
                             const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
}

static void
destroy_expect(struct ip_conntrack_expect *exp)
{
        DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
        IP_NF_ASSERT(atomic_read(&exp->use) == 0);
        IP_NF_ASSERT(!timer_pending(&exp->timeout));

        kfree(exp);
}

inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
        IP_NF_ASSERT(exp);

        if (atomic_dec_and_test(&exp->use)) {
                /* usage count dropped to zero */
                destroy_expect(exp);
        }
}
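
/* Refcounting convention for callers: every successful
 * ip_conntrack_expect_find_get() must be balanced by exactly one
 * ip_conntrack_expect_put(); the final put frees the expectation.
 * A minimal sketch, assuming a tuple built by the caller: */
#if 0
        struct ip_conntrack_expect *exp = ip_conntrack_expect_find_get(&tuple);

        if (exp) {
                /* ... examine exp ... */
                ip_conntrack_expect_put(exp);   /* may kfree exp */
        }
#endif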

static inline struct ip_conntrack_expect *
__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                         struct ip_conntrack_expect *, tuple);
}

/* Find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *exp;

        READ_LOCK(&ip_conntrack_lock);
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        exp = __ip_ct_expect_find(tuple);
        if (exp)
                atomic_inc(&exp->use);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
        READ_UNLOCK(&ip_conntrack_lock);

        return exp;
}

/* Remove one specific expectation from all lists and drop the refcount;
 * does _NOT_ delete the timer. */
static void __unexpect_related(struct ip_conntrack_expect *expect)
{
        DEBUGP("unexpect_related(%p)\n", expect);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        /* we're not allowed to unexpect a confirmed expectation! */
        IP_NF_ASSERT(!expect->sibling);

        /* delete from global and local lists */
        list_del(&expect->list);
        list_del(&expect->expected_list);

        /* decrement expect-count of master conntrack */
        if (expect->expectant)
                expect->expectant->expecting--;

        ip_conntrack_expect_put(expect);
}

/* Remove one specific expectation from all lists, drop the refcount
 * and expire the timer.
 * This function can _NOT_ be called for confirmed expects! */
static void unexpect_related(struct ip_conntrack_expect *expect)
{
        IP_NF_ASSERT(expect->expectant);
        IP_NF_ASSERT(expect->expectant->helper);
        /* if we are supposed to have a timer, but we can't delete
         * it: race condition.  __unexpect_related will
         * be called by the timeout function */
        if (expect->expectant->helper->timeout
            && !del_timer(&expect->timeout))
                return;

        __unexpect_related(expect);
}

/* delete all unconfirmed expectations for this conntrack */
static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
{
        struct list_head *exp_entry, *next;
        struct ip_conntrack_expect *exp;

        DEBUGP("remove_expectations(%p)\n", ct);

        list_for_each_safe(exp_entry, next, &ct->sibling_list) {
                exp = list_entry(exp_entry, struct ip_conntrack_expect,
                                 expected_list);

                /* we skip established expectations, as we want to delete
                 * the un-established ones only */
                if (exp->sibling) {
                        DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
                        if (drop_refcount) {
                                /* Indicate that this expectation's parent is dead */
                                ip_conntrack_put(exp->expectant);
                                exp->expectant = NULL;
                        }
                        continue;
                }

                IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
                IP_NF_ASSERT(exp->expectant == ct);

                /* delete expectation from global and private lists */
                unexpect_related(exp);
        }
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
        unsigned int ho, hr;

        DEBUGP("clean_from_lists(%p)\n", ct);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
        LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

        /* Destroy all un-established, pending expectations */
        remove_expectations(ct, 1);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
        struct ip_conntrack_protocol *proto;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
        proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);

        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* Make sure we don't leave any orphaned expectations lying around */
        if (ct->expecting)
                remove_expectations(ct, 1);

        /* Delete our master expectation */
        if (ct->master) {
                if (ct->master->expectant) {
                        /* can't call __unexpect_related here,
                         * since it would screw up expect_list */
                        list_del(&ct->master->expected_list);
                        master = ct->master->expectant;
                }
                kfree(ct->master);
        }
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (master)
                ip_conntrack_put(master);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        kmem_cache_free(ip_conntrack_cachep, ct);
        atomic_dec(&ip_conntrack_count);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
        struct ip_conntrack *ct = (void *)ul_conntrack;

        WRITE_LOCK(&ip_conntrack_lock);
        clean_from_lists(ct);
        WRITE_UNLOCK(&ip_conntrack_lock);
        ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
                    const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        return i->ctrack != ignored_conntrack
                && ip_ct_tuple_equal(tuple, &i->tuple);
}

static struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int hash = hash_conntrack(tuple);

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        h = LIST_FIND(&ip_conntrack_hash[hash],
                      conntrack_tuple_cmp,
                      struct ip_conntrack_tuple_hash *,
                      tuple, ignored_conntrack);
        return h;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
                      const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

static inline struct ip_conntrack *
__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack *ct
                = (struct ip_conntrack *)nfct->master;

        /* ctinfo is the index of the nfct inside the conntrack */
        *ctinfo = nfct - ct->infos;
        IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
        return ct;
}

/* Return conntrack and conntrack_info given skb->nfct->master */
struct ip_conntrack *
ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
        if (skb->nfct)
                return __ip_conntrack_get(skb->nfct, ctinfo);
        return NULL;
}

/* Confirm a connection given skb->nfct; places it in hash table */
int
__ip_conntrack_confirm(struct nf_ct_info *nfct)
{
        unsigned int hash, repl_hash;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in the other direction.  The actual
           packet which created the connection will be IP_CT_NEW or,
           for an expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in the hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost the race. */
        if (!LIST_FIND(&ip_conntrack_hash[hash],
                       conntrack_tuple_cmp,
                       struct ip_conntrack_tuple_hash *,
                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
            && !LIST_FIND(&ip_conntrack_hash[repl_hash],
                          conntrack_tuple_cmp,
                          struct ip_conntrack_tuple_hash *,
                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
                list_prepend(&ip_conntrack_hash[hash],
                             &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
                list_prepend(&ip_conntrack_hash[repl_hash],
                             &ct->tuplehash[IP_CT_DIR_REPLY]);
                /* Timer is relative to confirmation time, not original
                   setting time, otherwise we'd get timer wrap in
                   weird delay cases. */
                ct->timeout.expires += jiffies;
                add_timer(&ct->timeout);
                atomic_inc(&ct->ct_general.use);
                set_bit(IPS_CONFIRMED_BIT, &ct->status);
                WRITE_UNLOCK(&ip_conntrack_lock);
                return NF_ACCEPT;
        }

        WRITE_UNLOCK(&ip_conntrack_lock);
        return NF_DROP;
}
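
/* Callers normally reach this through the ip_conntrack_confirm() wrapper
 * (in ip_conntrack_core.h), which the hooks registered in
 * ip_conntrack_standalone.c run as the packet leaves conntrack's reach
 * (LOCAL_IN/POST_ROUTING).  A rough sketch of the wrapper's logic: */
#if 0
        if (skb->nfct && !is_confirmed(ct))
                return __ip_conntrack_confirm(skb->nfct);
        return NF_ACCEPT;       /* already confirmed, or no conntrack */
#endif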

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        READ_UNLOCK(&ip_conntrack_lock);

        return h != NULL;
}

/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
struct ip_conntrack *
icmp_error_track(struct sk_buff *skb,
                 enum ip_conntrack_info *ctinfo,
                 unsigned int hooknum)
{
        struct ip_conntrack_tuple innertuple, origtuple;
        struct {
                struct icmphdr icmp;
                struct iphdr ip;
        } inside;
        struct ip_conntrack_protocol *innerproto;
        struct ip_conntrack_tuple_hash *h;
        int dataoff;

        IP_NF_ASSERT(skb->nfct == NULL);

        /* Not enough header? */
        if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
                return NULL;

        if (inside.icmp.type != ICMP_DEST_UNREACH
            && inside.icmp.type != ICMP_SOURCE_QUENCH
            && inside.icmp.type != ICMP_TIME_EXCEEDED
            && inside.icmp.type != ICMP_PARAMETERPROB
            && inside.icmp.type != ICMP_REDIRECT)
                return NULL;

        /* Ignore ICMPs containing fragments (shouldn't happen) */
        if (inside.ip.frag_off & htons(IP_OFFSET)) {
                DEBUGP("icmp_error_track: fragment of proto %u\n",
                       inside.ip.protocol);
                return NULL;
        }

        innerproto = ip_ct_find_proto(inside.ip.protocol);
        dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
        /* Are they talking about one of our connections? */
        if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
                DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol);
                return NULL;
        }

        /* Ordinarily, we'd expect the inverted tupleproto, but it's
           been preserved inside the ICMP. */
        if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
                DEBUGP("icmp_error_track: Can't invert tuple\n");
                return NULL;
        }

        *ctinfo = IP_CT_RELATED;

        h = ip_conntrack_find_get(&innertuple, NULL);
        if (!h) {
                /* Locally generated ICMPs will match inverted if they
                   haven't been SNAT'ed yet */
                /* FIXME: NAT code has to handle half-done double NAT --RR */
                if (hooknum == NF_IP_LOCAL_OUT)
                        h = ip_conntrack_find_get(&origtuple, NULL);

                if (!h) {
                        DEBUGP("icmp_error_track: no match\n");
                        return NULL;
                }
                /* Reverse direction from that found */
                if (DIRECTION(h) != IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        } else {
                if (DIRECTION(h) == IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        }

        /* Update skb to refer to this connection */
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
        return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
}

static int early_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        int dropped = 0;

        READ_LOCK(&ip_conntrack_lock);
        h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        if (!h)
                return dropped;

        if (del_timer(&h->ctrack->timeout)) {
                death_by_timeout((unsigned long)h->ctrack);
                dropped = 1;
        }
        ip_conntrack_put(h->ctrack);
        return dropped;
}

static inline int helper_cmp(const struct ip_conntrack_helper *i,
                             const struct ip_conntrack_tuple *rtuple)
{
        return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
{
        return LIST_FIND(&helpers, helper_cmp,
                         struct ip_conntrack_helper *,
                         tuple);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
               struct ip_conntrack_protocol *protocol,
               struct sk_buff *skb)
{
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        size_t hash;
        struct ip_conntrack_expect *expected;
        int i;
        static unsigned int drop_next;

        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
                ip_conntrack_hash_rnd_initted = 1;
        }

        hash = hash_conntrack(tuple);

        if (ip_conntrack_max &&
            atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
                /* Try dropping from a random chain, or else from the
                   chain we're about to insert into (in case they're
                   trying to bomb one hash chain). */
                unsigned int next = (drop_next++)%ip_conntrack_htable_size;

                if (!early_drop(&ip_conntrack_hash[next])
                    && !early_drop(&ip_conntrack_hash[hash])) {
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        if (!invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }

        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
                return ERR_PTR(-ENOMEM);
        }

        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
        conntrack->xid[IP_CT_DIR_ORIGINAL] = -1;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
        conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
        conntrack->xid[IP_CT_DIR_REPLY] = -1;
        for (i = 0; i < IP_CT_NUMBER; i++)
                conntrack->infos[i].master = &conntrack->ct_general;

        if (!protocol->new(conntrack, skb)) {
                kmem_cache_free(ip_conntrack_cachep, conntrack);
                return NULL;
        }
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;

        INIT_LIST_HEAD(&conntrack->sibling_list);

        WRITE_LOCK(&ip_conntrack_lock);
        /* Need finding and deleting of expected ONLY if we win race */
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                             struct ip_conntrack_expect *, tuple);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);

        /* If the master is not in the hash table yet (ie. the packet
           hasn't left this machine yet), how could the other end know
           about the expected?  Hence these are not the droids you are
           looking for (if the master ct never got confirmed, we'd hold
           a reference to it and weird things would happen to future
           packets). */
        if (expected && !is_confirmed(expected->expectant))
                expected = NULL;

        /* Look up the conntrack helper for master connections only */
        if (!expected)
                conntrack->helper = ip_ct_find_helper(&repl_tuple);

        /* If the expectation is dying, then this is a loser. */
        if (expected
            && expected->expectant->helper->timeout
            && ! del_timer(&expected->timeout))
                expected = NULL;

        if (expected) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                        conntrack, expected);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
                conntrack->master = expected;
                expected->sibling = conntrack;
                LIST_DELETE(&ip_conntrack_expect_list, expected);
                expected->expectant->expecting--;
                nf_conntrack_get(&master_ct(conntrack)->infos[0]);
        }
        atomic_inc(&ip_conntrack_count);
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (expected && expected->expectfn)
                expected->expectfn(conntrack);
        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
                  struct ip_conntrack_protocol *proto,
                  int *set_reply,
                  unsigned int hooknum,
                  enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;

        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

        if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
                return NULL;

        /* look for tuple match */
        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, proto, skb);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }

        /* It exists; we have a (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Please set the reply bit if this packet is OK */
                *set_reply = 1;
        } else {
                /* Once we've had two-way comms, always ESTABLISHED. */
                if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}
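
/* For reference: each conntrack owns IP_CT_NUMBER nf_ct_info slots, and
 * the slot index chosen above encodes state and direction in one value
 * (constants from ip_conntrack.h), e.g.:
 *
 *   IP_CT_NEW                            original dir, first packet
 *   IP_CT_RELATED                        original dir, expected conn
 *   IP_CT_ESTABLISHED                    original dir, reply seen before
 *   IP_CT_ESTABLISHED + IP_CT_IS_REPLY   reply direction
 *
 * __ip_conntrack_get() later recovers the index by pointer arithmetic:
 * *ctinfo = nfct - ct->infos. */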

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
                             struct sk_buff **pskb,
                             const struct net_device *in,
                             const struct net_device *out,
                             int (*okfn)(struct sk_buff *))
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
        int set_reply;
        int ret;

        /* Should never happen: fragments are reassembled before this hook. */
        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit()) {
                        printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
                               (*pskb)->nh.iph->protocol, hooknum);
                }
                return NF_DROP;
        }

        /* FIXME: Do this right please. --RR */
        (*pskb)->nfcache |= NFC_UNKNOWN;

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
        /* Ignore broadcast: no `connection'. */
        if ((*pskb)->pkt_type == PACKET_BROADCAST) {
                printk("Broadcast packet!\n");
                return NF_ACCEPT;
        } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
                   == htonl(0x000000FF)) {
                printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
                       NIPQUAD((*pskb)->nh.iph->saddr),
                       NIPQUAD((*pskb)->nh.iph->daddr),
                       (*pskb)->sk, (*pskb)->pkt_type);
        }
#endif

        /* Previously seen (loopback or untracked)?  Ignore. */
        if ((*pskb)->nfct)
                return NF_ACCEPT;

        proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);

        /* It may be an ICMP error... */
        if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
            && icmp_error_track(*pskb, &ctinfo, hooknum))
                return NF_ACCEPT;

        if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo)))
                /* Not a valid part of a connection */
                return NF_ACCEPT;

        if (IS_ERR(ct))
                /* Too stressed to deal. */
                return NF_DROP;

        IP_NF_ASSERT((*pskb)->nfct);

        ret = proto->packet(ct, *pskb, ctinfo);
        if (ret == -1) {
                /* Invalid */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                return NF_ACCEPT;
        }

        if (ret != NF_DROP && ct->helper) {
                ret = ct->helper->help(*pskb, ct, ctinfo);
                if (ret == -1) {
                        /* Invalid */
                        nf_conntrack_put((*pskb)->nfct);
                        (*pskb)->nfct = NULL;
                        return NF_ACCEPT;
                }
        }
        if (set_reply)
                set_bit(IPS_SEEN_REPLY_BIT, &ct->status);

        return ret;
}
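
/* This hook is not registered in this file; ip_conntrack_standalone.c
 * wires it (directly or via a thin wrapper) into NF_IP_PRE_ROUTING and
 * NF_IP_LOCAL_OUT.  A minimal sketch of such a registration, with the
 * ops name invented for illustration: */
#if 0
static struct nf_hook_ops example_conntrack_ops = {
        .hook           = ip_conntrack_in,
        .pf             = PF_INET,
        .hooknum        = NF_IP_PRE_ROUTING,
        .priority       = NF_IP_PRI_CONNTRACK,
};

/* in module init: nf_register_hook(&example_conntrack_ops); */
#endif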

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
{
        return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
}

static inline int resent_expect(const struct ip_conntrack_expect *i,
                                const struct ip_conntrack_tuple *tuple,
                                const struct ip_conntrack_tuple *mask)
{
        DEBUGP("resent_expect\n");
        DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
        DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
        DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
        return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
                 || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
                && ip_ct_tuple_equal(&i->mask, mask));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *i,
                               const struct ip_conntrack_tuple *tuple,
                               const struct ip_conntrack_tuple *mask)
{
        /* The part covered by the intersection of the masks must be
           unequal, otherwise they clash */
        struct ip_conntrack_tuple intersect_mask
                = { { i->mask.src.ip & mask->src.ip,
                      { i->mask.src.u.all & mask->src.u.all } },
                    { i->mask.dst.ip & mask->dst.ip,
                      { i->mask.dst.u.all & mask->dst.u.all },
                      i->mask.dst.protonum & mask->dst.protonum } };

        return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
}
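
/* Worked example: an expectation whose mask ignores the source port
 * (src.u.all mask 0) clashes with one that pins a specific source port
 * for the same addresses, because the intersection of the two masks
 * drops the only field that tells them apart.  Only fields covered by
 * BOTH masks can distinguish two expectations. */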

inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
{
        WRITE_LOCK(&ip_conntrack_lock);
        unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

static void expectation_timed_out(unsigned long ul_expect)
{
        struct ip_conntrack_expect *expect = (void *) ul_expect;

        DEBUGP("expectation %p timed out\n", expect);
        WRITE_LOCK(&ip_conntrack_lock);
        __unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

struct ip_conntrack_expect *
ip_conntrack_expect_alloc(void)
{
        struct ip_conntrack_expect *new;

        new = (struct ip_conntrack_expect *)
                kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }

        /* tuple_cmp compares the whole union, so we have to initialize
           it cleanly */
        memset(new, 0, sizeof(struct ip_conntrack_expect));

        return new;
}

static void
ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
                           struct ip_conntrack *related_to)
{
        DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
        new->expectant = related_to;
        new->sibling = NULL;
        atomic_set(&new->use, 1);

        /* add to expected list for this connection */
        list_add_tail(&new->expected_list, &related_to->sibling_list);
        /* add to global list of expectations */
        list_prepend(&ip_conntrack_expect_list, &new->list);
        /* add and start timer if required */
        if (related_to->helper->timeout) {
                init_timer(&new->timeout);
                new->timeout.data = (unsigned long)new;
                new->timeout.function = expectation_timed_out;
                new->timeout.expires = jiffies +
                                        related_to->helper->timeout * HZ;
                add_timer(&new->timeout);
        }
        related_to->expecting++;
}

/* Add a related connection. */
int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
                                struct ip_conntrack *related_to)
{
        struct ip_conntrack_expect *old;
        int ret = 0;

        WRITE_LOCK(&ip_conntrack_lock);
        /* Because of the write lock, no reader can walk the lists,
         * so there is no need to use the tuple lock too */

        DEBUGP("ip_conntrack_expect_related %p\n", related_to);
        DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE_RAW(&expect->mask);

        old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
                        struct ip_conntrack_expect *, &expect->tuple,
                        &expect->mask);
        if (old) {
                /* Helper private data may contain offsets but no pointers
                   pointing into the payload - otherwise we would have to
                   copy the data filled out by the helper over the old one */
                DEBUGP("expect_related: resent packet\n");
                if (related_to->helper->timeout) {
                        if (!del_timer(&old->timeout)) {
                                /* expectation is dying. Fall through */
                                goto out;
                        } else {
                                old->timeout.expires = jiffies +
                                        related_to->helper->timeout * HZ;
                                add_timer(&old->timeout);
                        }
                }

                WRITE_UNLOCK(&ip_conntrack_lock);
                kfree(expect);
                return -EEXIST;

        } else if (related_to->helper->max_expected &&
                   related_to->expecting >= related_to->helper->max_expected) {
                /* old == NULL */
                if (!(related_to->helper->flags &
                      IP_CT_HELPER_F_REUSE_EXPECT)) {
                        WRITE_UNLOCK(&ip_conntrack_lock);
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: max number of expected "
                                       "connections %i of %s reached for "
                                       "%u.%u.%u.%u->%u.%u.%u.%u\n",
                                       related_to->helper->max_expected,
                                       related_to->helper->name,
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
                        kfree(expect);
                        return -EPERM;
                }
                DEBUGP("ip_conntrack: max number of expected "
                       "connections %i of %s reached for "
                       "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
                       related_to->helper->max_expected,
                       related_to->helper->name,
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));

                /* choose the oldest expectation to evict */
                list_for_each_entry(old, &related_to->sibling_list,
                                    expected_list)
                        if (old->sibling == NULL)
                                break;

                /* We cannot fail since related_to->expecting is the number
                 * of unconfirmed expectations */
                IP_NF_ASSERT(old && old->sibling == NULL);

                /* newnat14 does not reuse the real allocated memory
                 * structures but rather unexpects the old one and
                 * allocates a new one.  unexpect_related will decrement
                 * related_to->expecting.
                 */
                unexpect_related(old);
                ret = -EPERM;
        } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                             struct ip_conntrack_expect *, &expect->tuple,
                             &expect->mask)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                DEBUGP("expect_related: busy!\n");

                kfree(expect);
                return -EBUSY;
        }

out:    ip_conntrack_expect_insert(expect, related_to);

        WRITE_UNLOCK(&ip_conntrack_lock);

        return ret;
}
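
/* Typical caller: a protocol helper that parsed an in-band address
 * announcement (FTP PORT-style) and wants the coming data connection
 * tracked as RELATED.  A hedged sketch; the parsed tuple and the NAT
 * callback are assumptions standing in for helper-specific code: */
#if 0
        struct ip_conntrack_expect *exp = ip_conntrack_expect_alloc();

        if (exp) {
                exp->tuple = tuple_parsed_from_payload;   /* assumed */
                exp->mask.src.ip = 0xFFFFFFFF;
                exp->mask.dst.ip = 0xFFFFFFFF;
                exp->mask.dst.u.tcp.port = 0xFFFF;
                exp->mask.dst.protonum = 0xFF;
                exp->expectfn = NULL;   /* or a NAT fixup callback */
                ip_conntrack_expect_related(exp, ct);   /* consumes exp */
        }
#endif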

/* Change tuple in an existing expectation */
int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
                               struct ip_conntrack_tuple *newtuple)
{
        int ret;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
        DEBUGP("change_expect:\n");
        DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
        DEBUGP("exp mask:  "); DUMP_TUPLE_RAW(&expect->mask);
        DEBUGP("newtuple:  "); DUMP_TUPLE_RAW(newtuple);
        if (expect->ct_tuple.dst.protonum == 0) {
                /* Never seen before */
                DEBUGP("change expect: never seen before\n");
                if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask)
                    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                                 struct ip_conntrack_expect *, newtuple, &expect->mask)) {
                        /* Force NAT to find an unused tuple */
                        ret = -1;
                } else {
                        memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
                        memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
                        ret = 0;
                }
        } else {
                /* Resent packet */
                DEBUGP("change expect: resent packet\n");
                if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
                        ret = 0;
                } else {
                        /* Force NAT to choose the same port again */
                        ret = -1;
                }
        }
        WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);

        return ret;
}

/* Alter reply tuple (maybe alter helper).  If it's already taken,
   return 0 and don't do the alteration. */
int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
                             const struct ip_conntrack_tuple *newreply)
{
        WRITE_LOCK(&ip_conntrack_lock);
        if (__ip_conntrack_find(newreply, conntrack)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                return 0;
        }
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));

        DEBUGP("Altering reply tuple of %p to ", conntrack);
        DUMP_TUPLE(newreply);

        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (!conntrack->master && list_empty(&conntrack->sibling_list))
                conntrack->helper = ip_ct_find_helper(newreply);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 1;
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
        WRITE_LOCK(&ip_conntrack_lock);
        list_prepend(&helpers, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 0;
}
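
/* A helper binds to connections whose reply tuple matches its tuple/mask
 * pair (see helper_cmp above).  Hedged sketch of a definition and
 * registration, fields per ip_conntrack_helper.h; the name, port and
 * callback are illustrative, not a real helper: */
#if 0
static struct ip_conntrack_helper example_helper = {
        .name           = "example",
        .max_expected   = 1,
        .timeout        = 5 * 60,               /* seconds */
        .tuple          = { .src = { .u = { .tcp = { .port = __constant_htons(2121) } } },
                            .dst = { .protonum = IPPROTO_TCP } },
        .mask           = { .src = { .u = { .tcp = { .port = 0xFFFF } } },
                            .dst = { .protonum = 0xFF } },
        .help           = example_help,         /* assumed callback */
};

/* module init/exit:
 *      ip_conntrack_helper_register(&example_helper);
 *      ip_conntrack_helper_unregister(&example_helper);
 */
#endif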

static inline int unhelp(struct ip_conntrack_tuple_hash *i,
                         const struct ip_conntrack_helper *me)
{
        if (i->ctrack->helper == me) {
                /* Get rid of any expected. */
                remove_expectations(i->ctrack, 0);
                /* And *then* set helper to NULL */
                i->ctrack->helper = NULL;
        }
        return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
        unsigned int i;

        /* Need write lock here, to delete helper. */
        WRITE_LOCK(&ip_conntrack_lock);
        LIST_DELETE(&helpers, me);

        /* Get rid of expecteds, set helpers to NULL. */
        for (i = 0; i < ip_conntrack_htable_size; i++)
                LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
                            struct ip_conntrack_tuple_hash *, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        /* Someone could still be looking at the helper in a bh. */
        synchronize_net();
}

static inline void ct_add_counters(struct ip_conntrack *ct,
                                   enum ip_conntrack_info ctinfo,
                                   const struct sk_buff *skb)
{
#ifdef CONFIG_IP_NF_CT_ACCT
        if (skb) {
                ct->counters[CTINFO2DIR(ctinfo)].packets++;
                ct->counters[CTINFO2DIR(ctinfo)].bytes +=
                                        ntohs(skb->nh.iph->tot_len);
        }
#endif
}

/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
void ip_ct_refresh_acct(struct ip_conntrack *ct,
                        enum ip_conntrack_info ctinfo,
                        const struct sk_buff *skb,
                        unsigned long extra_jiffies)
{
        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

        /* If not in hash table, timer will not be active yet */
        if (!is_confirmed(ct)) {
                ct->timeout.expires = extra_jiffies;
                ct_add_counters(ct, ctinfo, skb);
        } else {
                WRITE_LOCK(&ip_conntrack_lock);
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                }
                ct_add_counters(ct, ctinfo, skb);
                WRITE_UNLOCK(&ip_conntrack_lock);
        }
}

/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
#ifdef CONFIG_NETFILTER_DEBUG
        unsigned int olddebug = skb->nf_debug;
#endif
        if (sk) {
                sock_hold(sk);
                skb_orphan(skb);
        }

        local_bh_disable();
        skb = ip_defrag(skb);
        local_bh_enable();

        if (!skb) {
                if (sk)
                        sock_put(sk);
                return skb;
        }

        if (sk) {
                skb_set_owner_w(skb, sk);
                sock_put(sk);
        }

        ip_send_check(skb->nh.iph);
        skb->nfcache |= NFC_ALTERED;
#ifdef CONFIG_NETFILTER_DEBUG
        /* Packet path as if nothing had happened. */
        skb->nf_debug = olddebug;
#endif
        return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* This ICMP is in the reverse direction to the packet which
           caused it */
        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach new skbuff, and increment count */
        nskb->nfct = &ct->infos[ctinfo];
        atomic_inc(&ct->ct_general.use);
}

static inline int
do_kill(const struct ip_conntrack_tuple_hash *i,
        int (*kill)(const struct ip_conntrack *i, void *data),
        void *data)
{
        return kill(i->ctrack, data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
                void *data, unsigned int *bucket)
{
        struct ip_conntrack_tuple_hash *h = NULL;

        READ_LOCK(&ip_conntrack_lock);
        for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
                h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
                              struct ip_conntrack_tuple_hash *, kill, data);
        }
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

void
ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
                        void *data)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int bucket = 0;

        while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
                /* Time to push up daisies... */
                if (del_timer(&h->ctrack->timeout))
                        death_by_timeout((unsigned long)h->ctrack);
                /* ... else the timer will get him soon. */

                ip_conntrack_put(h->ctrack);
        }
}

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
        struct inet_opt *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;

        IP_CT_TUPLE_U_BLANK(&tuple);
        tuple.src.ip = inet->rcv_saddr;
        tuple.src.u.tcp.port = inet->sport;
        tuple.dst.ip = inet->daddr;
        tuple.dst.u.tcp.port = inet->dport;
        tuple.dst.protonum = IPPROTO_TCP;

        /* We only do TCP at the moment: is there a better way? */
        if (strcmp(sk->sk_prot->name, "TCP")) {
                DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
                return -ENOPROTOOPT;
        }

        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
                       *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }

        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;

                sin.sin_family = AF_INET;
                sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
                sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;

                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
                ip_conntrack_put(h->ctrack);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
                        return 0;
        }
        DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
               NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
               NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
        return -ENOENT;
}

static struct nf_sockopt_ops so_getorigdst = {
        .pf             = PF_INET,
        .get_optmin     = SO_ORIGINAL_DST,
        .get_optmax     = SO_ORIGINAL_DST+1,
        .get            = &getorigdst,
};
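
/* Userspace view of the above: a transparent proxy retrieving the
 * pre-REDIRECT destination of an accepted TCP connection.  Sketch only;
 * built against userspace headers, not part of this module: */
#if 0
        struct sockaddr_in orig;
        socklen_t olen = sizeof(orig);

        if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &olen) == 0)
                printf("original dst %s:%u\n",
                       inet_ntoa(orig.sin_addr), ntohs(orig.sin_port));
#endif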

static int kill_all(const struct ip_conntrack *i, void *data)
{
        return 1;
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
        ip_ct_attach = NULL;
        /* This makes sure all current packets have passed through
           the netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

 i_see_dead_people:
        ip_ct_selective_cleanup(kill_all, NULL);
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }

        kmem_cache_destroy(ip_conntrack_cachep);
        vfree(ip_conntrack_hash);
        nf_unregister_sockopt(&so_getorigdst);
}

static int hashsize;
MODULE_PARM(hashsize, "i");
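/* Set at module load, e.g. "modprobe ip_conntrack hashsize=4096"; if
 * left at 0, ip_conntrack_init() below sizes the table from memory. */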

int __init ip_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: a 32MB
         * machine has 256 buckets; >= 1GB machines have 8192 buckets. */
        if (hashsize) {
                ip_conntrack_htable_size = hashsize;
        } else {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        ip_conntrack_htable_size = 8192;
                if (ip_conntrack_htable_size < 16)
                        ip_conntrack_htable_size = 16;
        }
        ip_conntrack_max = 8 * ip_conntrack_htable_size;

        printk("ip_conntrack version %s (%u buckets, %d max)"
               " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
               ip_conntrack_htable_size, ip_conntrack_max,
               sizeof(struct ip_conntrack));

        ret = nf_register_sockopt(&so_getorigdst);
        if (ret != 0) {
                printk(KERN_ERR "Unable to register netfilter socket option\n");
                return ret;
        }

        ip_conntrack_hash = vmalloc(sizeof(struct list_head)
                                    * ip_conntrack_htable_size);
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
        }

        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }
        /* Don't NEED the lock here, but good form anyway. */
        WRITE_LOCK(&ip_conntrack_lock);
        /* Sew in builtin protocols. */
        list_append(&protocol_list, &ip_conntrack_protocol_tcp);
        list_append(&protocol_list, &ip_conntrack_protocol_udp);
        list_append(&protocol_list, &ip_conntrack_protocol_icmp);
        WRITE_UNLOCK(&ip_conntrack_lock);

        for (i = 0; i < ip_conntrack_htable_size; i++)
                INIT_LIST_HEAD(&ip_conntrack_hash[i]);

        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;

        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and make it look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
        /*  - and prepare the ctinfo field for REJECT & NAT. */
        ip_conntrack_untracked.infos[IP_CT_NEW].master =
        ip_conntrack_untracked.infos[IP_CT_RELATED].master =
        ip_conntrack_untracked.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master =
                        &ip_conntrack_untracked.ct_general;

        return ret;

err_free_hash:
        vfree(ip_conntrack_hash);
err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);

        return -ENOMEM;
}