1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40 #include <linux/notifier.h>
41
42 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
43    registrations, conntrack timers */
44 #define ASSERT_READ_LOCK(x)
45 #define ASSERT_WRITE_LOCK(x)
46
47 #include <linux/netfilter_ipv4/ip_conntrack.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #include <linux/netfilter_ipv4/listhelp.h>
52
53 #define IP_CONNTRACK_VERSION    "2.4"
54
55 #if 0
56 #define DEBUGP printk
57 #else
58 #define DEBUGP(format, args...)
59 #endif
60
61 DEFINE_RWLOCK(ip_conntrack_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67 LIST_HEAD(ip_conntrack_expect_list);
68 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69 static LIST_HEAD(helpers);
70 unsigned int ip_conntrack_htable_size = 0;
71 int ip_conntrack_max;
72 struct list_head *ip_conntrack_hash;
73 static kmem_cache_t *ip_conntrack_cachep __read_mostly;
74 static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
75 struct ip_conntrack ip_conntrack_untracked;
76 unsigned int ip_ct_log_invalid;
77 static LIST_HEAD(unconfirmed);
78 static int ip_conntrack_vmalloc;
79
80 static unsigned int ip_conntrack_next_id;
81 static unsigned int ip_conntrack_expect_next_id;
82 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
84 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
85
86 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
88 /* deliver cached events and clear cache entry - must be called with locally
89  * disabled softirqs */
90 static inline void
91 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
92 {
93         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
94         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
95                 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
96                                     ecache->ct);
97         ecache->events = 0;
98         ip_conntrack_put(ecache->ct);
99         ecache->ct = NULL;
100 }
101
102 /* Deliver all cached events for a particular conntrack. This is called
103  * by code prior to async packet handling or freeing the skb */
104 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
105 {
106         struct ip_conntrack_ecache *ecache;
107         
108         local_bh_disable();
109         ecache = &__get_cpu_var(ip_conntrack_ecache);
110         if (ecache->ct == ct)
111                 __ip_ct_deliver_cached_events(ecache);
112         local_bh_enable();
113 }
114
115 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
116 {
117         struct ip_conntrack_ecache *ecache;
118
119         /* take care of delivering potentially old events */
120         ecache = &__get_cpu_var(ip_conntrack_ecache);
121         BUG_ON(ecache->ct == ct);
122         if (ecache->ct)
123                 __ip_ct_deliver_cached_events(ecache);
124         /* initialize for this conntrack/packet */
125         ecache->ct = ct;
126         nf_conntrack_get(&ct->ct_general);
127 }
128
129 /* flush the event cache - touches other CPUs' data and must not be called
130  * while packets are still passing through the code */
131 static void ip_ct_event_cache_flush(void)
132 {
133         struct ip_conntrack_ecache *ecache;
134         int cpu;
135
136         for_each_possible_cpu(cpu) {
137                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
138                 if (ecache->ct)
139                         ip_conntrack_put(ecache->ct);
140         }
141 }
142 #else
143 static inline void ip_ct_event_cache_flush(void) {}
144 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
145
146 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
147
148 static int ip_conntrack_hash_rnd_initted;
149 static unsigned int ip_conntrack_hash_rnd;
150
151 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
152                             unsigned int size, unsigned int rnd)
153 {
154         return (jhash_3words(tuple->src.ip,
155                              (tuple->dst.ip ^ tuple->dst.protonum),
156                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
157                              rnd) % size);
158 }
159
160 static u_int32_t
161 hash_conntrack(const struct ip_conntrack_tuple *tuple)
162 {
163         return __hash_conntrack(tuple, ip_conntrack_htable_size,
164                                 ip_conntrack_hash_rnd);
165 }
166
167 int
168 ip_ct_get_tuple(const struct iphdr *iph,
169                 const struct sk_buff *skb,
170                 unsigned int dataoff,
171                 struct ip_conntrack_tuple *tuple,
172                 const struct ip_conntrack_protocol *protocol)
173 {
174         /* Should never happen. */
175         if (iph->frag_off & htons(IP_OFFSET)) {
176                 printk("ip_conntrack_core: Frag of proto %u.\n",
177                        iph->protocol);
178                 return 0;
179         }
180
181         tuple->src.ip = iph->saddr;
182         tuple->dst.ip = iph->daddr;
183         tuple->dst.protonum = iph->protocol;
184         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
185
186         return protocol->pkt_to_tuple(skb, dataoff, tuple);
187 }
188
189 int
190 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
191                    const struct ip_conntrack_tuple *orig,
192                    const struct ip_conntrack_protocol *protocol)
193 {
194         inverse->src.ip = orig->dst.ip;
195         inverse->dst.ip = orig->src.ip;
196         inverse->dst.protonum = orig->dst.protonum;
197         inverse->dst.dir = !orig->dst.dir;
198
199         return protocol->invert_tuple(inverse, orig);
200 }
201
202
203 /* ip_conntrack_expect helper functions */
204 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
205 {
206         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
207         IP_NF_ASSERT(!timer_pending(&exp->timeout));
208         list_del(&exp->list);
209         CONNTRACK_STAT_INC(expect_delete);
210         exp->master->expecting--;
211         ip_conntrack_expect_put(exp);
212 }
213
214 static void expectation_timed_out(unsigned long ul_expect)
215 {
216         struct ip_conntrack_expect *exp = (void *)ul_expect;
217
218         write_lock_bh(&ip_conntrack_lock);
219         ip_ct_unlink_expect(exp);
220         write_unlock_bh(&ip_conntrack_lock);
221         ip_conntrack_expect_put(exp);
222 }
223
224 struct ip_conntrack_expect *
225 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
226 {
227         struct ip_conntrack_expect *i;
228         
229         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
230                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
231                         atomic_inc(&i->use);
232                         return i;
233                 }
234         }
235         return NULL;
236 }
237
238 /* Just find an expectation corresponding to a tuple. */
239 struct ip_conntrack_expect *
240 ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
241 {
242         struct ip_conntrack_expect *i;
243         
244         read_lock_bh(&ip_conntrack_lock);
245         i = __ip_conntrack_expect_find(tuple);
246         read_unlock_bh(&ip_conntrack_lock);
247
248         return i;
249 }
250
251 /* If an expectation for this connection is found, it is deleted from the
252  * global list and then returned. */
253 static struct ip_conntrack_expect *
254 find_expectation(const struct ip_conntrack_tuple *tuple)
255 {
256         struct ip_conntrack_expect *i;
257
258         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
259                 /* If the master isn't in the hash table yet (i.e. the packet hasn't
260                    left this machine yet), how can the other end know what to expect?
261                    Hence these are not the droids you are looking for (if
262                    master ct never got confirmed, we'd hold a reference to it
263                    and weird things would happen to future packets). */
264                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
265                     && is_confirmed(i->master)) {
266                         if (i->flags & IP_CT_EXPECT_PERMANENT) {
267                                 atomic_inc(&i->use);
268                                 return i;
269                         } else if (del_timer(&i->timeout)) {
270                                 ip_ct_unlink_expect(i);
271                                 return i;
272                         }
273                 }
274         }
275         return NULL;
276 }
277
278 /* delete all expectations for this conntrack */
279 void ip_ct_remove_expectations(struct ip_conntrack *ct)
280 {
281         struct ip_conntrack_expect *i, *tmp;
282
283         /* Optimization: most connections never expect any others. */
284         if (ct->expecting == 0)
285                 return;
286
287         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
288                 if (i->master == ct && del_timer(&i->timeout)) {
289                         ip_ct_unlink_expect(i);
290                         ip_conntrack_expect_put(i);
291                 }
292         }
293 }
294
295 static void
296 clean_from_lists(struct ip_conntrack *ct)
297 {
298         unsigned int ho, hr;
299         
300         DEBUGP("clean_from_lists(%p)\n", ct);
301         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
302
303         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
304         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
305         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
306         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
307
308         /* Destroy all pending expectations */
309         ip_ct_remove_expectations(ct);
310 }
311
312 static void
313 destroy_conntrack(struct nf_conntrack *nfct)
314 {
315         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
316         struct ip_conntrack_protocol *proto;
317
318         DEBUGP("destroy_conntrack(%p)\n", ct);
319         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
320         IP_NF_ASSERT(!timer_pending(&ct->timeout));
321
322         ip_conntrack_event(IPCT_DESTROY, ct);
323         set_bit(IPS_DYING_BIT, &ct->status);
324
325         /* To make sure we don't get any weird locking issues here:
326          * destroy_conntrack() MUST NOT be called with a write lock
327          * to ip_conntrack_lock!!! -HW */
328         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
329         if (proto && proto->destroy)
330                 proto->destroy(ct);
331
332         if (ip_conntrack_destroyed)
333                 ip_conntrack_destroyed(ct);
334
335         write_lock_bh(&ip_conntrack_lock);
336         /* Expectations will have been removed in clean_from_lists,
337          * except TFTP can create an expectation on the first packet,
338          * before the connection is in the list, so we need to clean here,
339          * too. */
340         ip_ct_remove_expectations(ct);
341
342         /* We overload first tuple to link into unconfirmed list. */
343         if (!is_confirmed(ct)) {
344                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
345                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
346         }
347
348         CONNTRACK_STAT_INC(delete);
349         write_unlock_bh(&ip_conntrack_lock);
350
351         if (ct->master)
352                 ip_conntrack_put(ct->master);
353
354         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
355         ip_conntrack_free(ct);
356 }
357
358 static void death_by_timeout(unsigned long ul_conntrack)
359 {
360         struct ip_conntrack *ct = (void *)ul_conntrack;
361
362         write_lock_bh(&ip_conntrack_lock);
363         /* Inside lock so preempt is disabled on module removal path.
364          * Otherwise we can get spurious warnings. */
365         CONNTRACK_STAT_INC(delete_list);
366         clean_from_lists(ct);
367         write_unlock_bh(&ip_conntrack_lock);
368         ip_conntrack_put(ct);
369 }
370
371 static inline int
372 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
373                     const struct ip_conntrack_tuple *tuple,
374                     const struct ip_conntrack *ignored_conntrack)
375 {
376         ASSERT_READ_LOCK(&ip_conntrack_lock);
377         return tuplehash_to_ctrack(i) != ignored_conntrack
378                 && ip_ct_tuple_equal(tuple, &i->tuple);
379 }
380
381 struct ip_conntrack_tuple_hash *
382 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
383                     const struct ip_conntrack *ignored_conntrack)
384 {
385         struct ip_conntrack_tuple_hash *h;
386         unsigned int hash = hash_conntrack(tuple);
387
388         ASSERT_READ_LOCK(&ip_conntrack_lock);
389         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
390                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
391                         CONNTRACK_STAT_INC(found);
392                         return h;
393                 }
394                 CONNTRACK_STAT_INC(searched);
395         }
396
397         return NULL;
398 }
399
400 /* Find a connection corresponding to a tuple. */
401 struct ip_conntrack_tuple_hash *
402 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
403                       const struct ip_conntrack *ignored_conntrack)
404 {
405         struct ip_conntrack_tuple_hash *h;
406
407         read_lock_bh(&ip_conntrack_lock);
408         h = __ip_conntrack_find(tuple, ignored_conntrack);
409         if (h)
410                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
411         read_unlock_bh(&ip_conntrack_lock);
412
413         return h;
414 }
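
/* Usage sketch (illustrative, not part of the original file): the typical
 * find_get/put pattern for looking up a connection by tuple, mirroring what
 * getorigdst() does further down.  The addresses and ports are invented for
 * the example. */
#if 0
static struct ip_conntrack *example_lookup(void)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;

        IP_CT_TUPLE_U_BLANK(&tuple);
        tuple.src.ip = htonl(0xc0a80001);       /* 192.168.0.1 */
        tuple.dst.ip = htonl(0xc0a80002);       /* 192.168.0.2 */
        tuple.src.u.tcp.port = htons(32768);
        tuple.dst.u.tcp.port = htons(80);
        tuple.dst.protonum = IPPROTO_TCP;

        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h)
                return NULL;

        /* we now hold a reference; drop it with ip_conntrack_put() when done */
        return tuplehash_to_ctrack(h);
}
#endif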
415
416 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
417                                         unsigned int hash,
418                                         unsigned int repl_hash) 
419 {
420         ct->id = ++ip_conntrack_next_id;
421         list_prepend(&ip_conntrack_hash[hash],
422                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
423         list_prepend(&ip_conntrack_hash[repl_hash],
424                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
425 }
426
427 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
428 {
429         unsigned int hash, repl_hash;
430
431         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
432         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
433
434         write_lock_bh(&ip_conntrack_lock);
435         __ip_conntrack_hash_insert(ct, hash, repl_hash);
436         write_unlock_bh(&ip_conntrack_lock);
437 }
438
439 /* Confirm a connection given skb; places it in hash table */
440 int
441 __ip_conntrack_confirm(struct sk_buff **pskb)
442 {
443         unsigned int hash, repl_hash;
444         struct ip_conntrack *ct;
445         enum ip_conntrack_info ctinfo;
446
447         ct = ip_conntrack_get(*pskb, &ctinfo);
448
449         /* ipt_REJECT uses ip_conntrack_attach to attach related
450            ICMP/TCP RST packets in other direction.  Actual packet
451            which created connection will be IP_CT_NEW or for an
452            expected connection, IP_CT_RELATED. */
453         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
454                 return NF_ACCEPT;
455
456         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
457         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
458
459         /* We're not in hash table, and we refuse to set up related
460            connections for unconfirmed conns.  But packet copies and
461            REJECT will give spurious warnings here. */
462         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
463
464         /* No external references means no one else could have
465            confirmed us. */
466         IP_NF_ASSERT(!is_confirmed(ct));
467         DEBUGP("Confirming conntrack %p\n", ct);
468
469         write_lock_bh(&ip_conntrack_lock);
470
471         /* See if there's one in the list already, including reverse:
472            NAT could have grabbed it without realizing, since we're
473            not in the hash.  If there is, we lost the race. */
474         if (!LIST_FIND(&ip_conntrack_hash[hash],
475                        conntrack_tuple_cmp,
476                        struct ip_conntrack_tuple_hash *,
477                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
478             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
479                           conntrack_tuple_cmp,
480                           struct ip_conntrack_tuple_hash *,
481                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
482                 /* Remove from unconfirmed list */
483                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
484
485                 __ip_conntrack_hash_insert(ct, hash, repl_hash);
486                 /* Timer relative to confirmation time, not original
487                    setting time, otherwise we'd get timer wrap in
488                    weird delay cases. */
489                 ct->timeout.expires += jiffies;
490                 add_timer(&ct->timeout);
491                 atomic_inc(&ct->ct_general.use);
492                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
493                 CONNTRACK_STAT_INC(insert);
494                 write_unlock_bh(&ip_conntrack_lock);
495                 if (ct->helper)
496                         ip_conntrack_event_cache(IPCT_HELPER, *pskb);
497 #ifdef CONFIG_IP_NF_NAT_NEEDED
498                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
499                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
500                         ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
501 #endif
502                 ip_conntrack_event_cache(master_ct(ct) ?
503                                          IPCT_RELATED : IPCT_NEW, *pskb);
504
505                 return NF_ACCEPT;
506         }
507
508         CONNTRACK_STAT_INC(insert_failed);
509         write_unlock_bh(&ip_conntrack_lock);
510
511         return NF_DROP;
512 }
513
514 /* Returns true if a connection corresponds to the tuple (required
515    for NAT). */
516 int
517 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
518                          const struct ip_conntrack *ignored_conntrack)
519 {
520         struct ip_conntrack_tuple_hash *h;
521
522         read_lock_bh(&ip_conntrack_lock);
523         h = __ip_conntrack_find(tuple, ignored_conntrack);
524         read_unlock_bh(&ip_conntrack_lock);
525
526         return h != NULL;
527 }
528
529 /* There's a small race here where we may free a just-assured
530    connection.  Too bad: we're in trouble anyway. */
531 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
532 {
533         return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
534 }
535
536 static int early_drop(struct list_head *chain)
537 {
538         /* Traverse backwards: gives us oldest, which is roughly LRU */
539         struct ip_conntrack_tuple_hash *h;
540         struct ip_conntrack *ct = NULL;
541         int dropped = 0;
542
543         read_lock_bh(&ip_conntrack_lock);
544         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
545         if (h) {
546                 ct = tuplehash_to_ctrack(h);
547                 atomic_inc(&ct->ct_general.use);
548         }
549         read_unlock_bh(&ip_conntrack_lock);
550
551         if (!ct)
552                 return dropped;
553
554         if (del_timer(&ct->timeout)) {
555                 death_by_timeout((unsigned long)ct);
556                 dropped = 1;
557                 CONNTRACK_STAT_INC(early_drop);
558         }
559         ip_conntrack_put(ct);
560         return dropped;
561 }
562
563 static inline int helper_cmp(const struct ip_conntrack_helper *i,
564                              const struct ip_conntrack_tuple *rtuple)
565 {
566         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
567 }
568
569 static struct ip_conntrack_helper *
570 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
571 {
572         return LIST_FIND(&helpers, helper_cmp,
573                          struct ip_conntrack_helper *,
574                          tuple);
575 }
576
577 struct ip_conntrack_helper *
578 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
579 {
580         struct ip_conntrack_helper *helper;
581
582         /* need ip_conntrack_lock to assure that helper exists until
583          * try_module_get() is called */
584         read_lock_bh(&ip_conntrack_lock);
585
586         helper = __ip_conntrack_helper_find(tuple);
587         if (helper) {
588                 /* need to increase module usage count to assure helper will
589                  * not go away while the caller is e.g. busy putting a
590                  * conntrack in the hash that uses the helper */
591                 if (!try_module_get(helper->me))
592                         helper = NULL;
593         }
594
595         read_unlock_bh(&ip_conntrack_lock);
596
597         return helper;
598 }
599
600 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
601 {
602         module_put(helper->me);
603 }
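
/* Usage sketch (illustrative): a successful ip_conntrack_helper_find_get()
 * takes a reference on the helper's module and must always be balanced by
 * ip_conntrack_helper_put(), otherwise the helper can never be unloaded. */
#if 0
static void example_use_helper(struct ip_conntrack *ct)
{
        struct ip_conntrack_helper *helper;

        helper = ip_conntrack_helper_find_get(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        if (helper) {
                /* ... use the helper while the reference is held ... */
                ip_conntrack_helper_put(helper);
        }
}
#endif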
604
605 struct ip_conntrack_protocol *
606 __ip_conntrack_proto_find(u_int8_t protocol)
607 {
608         return ip_ct_protos[protocol];
609 }
610
611 /* this is guaranteed to always return a valid protocol helper, since
612  * it falls back to generic_protocol */
613 struct ip_conntrack_protocol *
614 ip_conntrack_proto_find_get(u_int8_t protocol)
615 {
616         struct ip_conntrack_protocol *p;
617
618         preempt_disable();
619         p = __ip_conntrack_proto_find(protocol);
620         if (p) {
621                 if (!try_module_get(p->me))
622                         p = &ip_conntrack_generic_protocol;
623         }
624         preempt_enable();
625         
626         return p;
627 }
628
629 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
630 {
631         module_put(p->me);
632 }
633
634 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
635                                         struct ip_conntrack_tuple *repl)
636 {
637         struct ip_conntrack *conntrack;
638
639         if (!ip_conntrack_hash_rnd_initted) {
640                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
641                 ip_conntrack_hash_rnd_initted = 1;
642         }
643
644         if (ip_conntrack_max
645             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
646                 unsigned int hash = hash_conntrack(orig);
647                 /* Try dropping from this hash chain. */
648                 if (!early_drop(&ip_conntrack_hash[hash])) {
649                         if (net_ratelimit())
650                                 printk(KERN_WARNING
651                                        "ip_conntrack: table full, dropping"
652                                        " packet.\n");
653                         return ERR_PTR(-ENOMEM);
654                 }
655         }
656
657         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
658         if (!conntrack) {
659                 DEBUGP("Can't allocate conntrack.\n");
660                 return ERR_PTR(-ENOMEM);
661         }
662
663         memset(conntrack, 0, sizeof(*conntrack));
664         atomic_set(&conntrack->ct_general.use, 1);
665         conntrack->ct_general.destroy = destroy_conntrack;
666         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
667         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
668 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
669         conntrack->xid[IP_CT_DIR_ORIGINAL] = -1;
670         conntrack->xid[IP_CT_DIR_REPLY] = -1;
671         conntrack->priority = (u_int32_t)-1;
672 #endif
673         /* Don't set timer yet: wait for confirmation */
674         init_timer(&conntrack->timeout);
675         conntrack->timeout.data = (unsigned long)conntrack;
676         conntrack->timeout.function = death_by_timeout;
677
678         atomic_inc(&ip_conntrack_count);
679
680         return conntrack;
681 }
682
683 void
684 ip_conntrack_free(struct ip_conntrack *conntrack)
685 {
686         atomic_dec(&ip_conntrack_count);
687         kmem_cache_free(ip_conntrack_cachep, conntrack);
688 }
689
690 /* Allocate a new conntrack: we return -ENOMEM if classification
691  * fails due to stress.  Otherwise it really is unclassifiable. */
692 static struct ip_conntrack_tuple_hash *
693 init_conntrack(struct ip_conntrack_tuple *tuple,
694                struct ip_conntrack_protocol *protocol,
695                struct sk_buff *skb)
696 {
697         struct ip_conntrack *conntrack;
698         struct ip_conntrack_tuple repl_tuple;
699         struct ip_conntrack_expect *exp;
700
701         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
702                 DEBUGP("Can't invert tuple.\n");
703                 return NULL;
704         }
705
706         conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
707         if (conntrack == NULL || IS_ERR(conntrack))
708                 return (struct ip_conntrack_tuple_hash *)conntrack;
709
710         if (!protocol->new(conntrack, skb)) {
711                 ip_conntrack_free(conntrack);
712                 return NULL;
713         }
714
715         write_lock_bh(&ip_conntrack_lock);
716         exp = find_expectation(tuple);
717
718         if (exp) {
719                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
720                         conntrack, exp);
721                 /* Welcome, Mr. Bond.  We've been expecting you... */
722                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
723                 conntrack->master = exp->master;
724 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
725                 conntrack->mark = exp->master->mark;
726 #endif
727 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
728     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
729                 /* this is ugly, but there is no other place to put it */
730                 conntrack->nat.masq_index = exp->master->nat.masq_index;
731 #endif
732                 nf_conntrack_get(&conntrack->master->ct_general);
733                 CONNTRACK_STAT_INC(expect_new);
734         } else {
735                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
736
737                 CONNTRACK_STAT_INC(new);
738         }
739
740         /* Overload tuple linked list to put us in unconfirmed list. */
741         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
742
743         write_unlock_bh(&ip_conntrack_lock);
744
745         if (exp) {
746                 if (exp->expectfn)
747                         exp->expectfn(conntrack, exp);
748                 ip_conntrack_expect_put(exp);
749         }
750
751         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
752 }
753
754 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
755 static inline struct ip_conntrack *
756 resolve_normal_ct(struct sk_buff *skb,
757                   struct ip_conntrack_protocol *proto,
758                   int *set_reply,
759                   unsigned int hooknum,
760                   enum ip_conntrack_info *ctinfo)
761 {
762         struct ip_conntrack_tuple tuple;
763         struct ip_conntrack_tuple_hash *h;
764         struct ip_conntrack *ct;
765
766         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
767
768         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
769                                 &tuple,proto))
770                 return NULL;
771
772         /* look for tuple match */
773         h = ip_conntrack_find_get(&tuple, NULL);
774         if (!h) {
775                 h = init_conntrack(&tuple, proto, skb);
776                 if (!h)
777                         return NULL;
778                 if (IS_ERR(h))
779                         return (void *)h;
780         }
781         ct = tuplehash_to_ctrack(h);
782
783         /* It exists; we have (non-exclusive) reference. */
784         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
785                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
786                 /* Caller should set the reply bit if this packet is OK */
787                 *set_reply = 1;
788         } else {
789                 /* Once we've had two way comms, always ESTABLISHED. */
790                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
791                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
792                                ct);
793                         *ctinfo = IP_CT_ESTABLISHED;
794                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
795                         DEBUGP("ip_conntrack_in: related packet for %p\n",
796                                ct);
797                         *ctinfo = IP_CT_RELATED;
798                 } else {
799                         DEBUGP("ip_conntrack_in: new packet for %p\n",
800                                ct);
801                         *ctinfo = IP_CT_NEW;
802                 }
803                 *set_reply = 0;
804         }
805         skb->nfct = &ct->ct_general;
806         skb->nfctinfo = *ctinfo;
807         return ct;
808 }
809
810 /* Netfilter hook itself. */
811 unsigned int ip_conntrack_in(unsigned int hooknum,
812                              struct sk_buff **pskb,
813                              const struct net_device *in,
814                              const struct net_device *out,
815                              int (*okfn)(struct sk_buff *))
816 {
817         struct ip_conntrack *ct;
818         enum ip_conntrack_info ctinfo;
819         struct ip_conntrack_protocol *proto;
820         int set_reply = 0;
821         int ret;
822
823         /* Previously seen (loopback or untracked)?  Ignore. */
824         if ((*pskb)->nfct) {
825                 CONNTRACK_STAT_INC(ignore);
826                 return NF_ACCEPT;
827         }
828
829         /* Should never happen. */
830         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
831                 if (net_ratelimit()) {
832                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
833                                (*pskb)->nh.iph->protocol, hooknum);
834                 }
835                 return NF_DROP;
836         }
837
838 /* Doesn't cover locally-generated broadcast, so not worth it. */
839 #if 0
840         /* Ignore broadcast: no `connection'. */
841         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
842                 printk("Broadcast packet!\n");
843                 return NF_ACCEPT;
844         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
845                    == htonl(0x000000FF)) {
846                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
847                        NIPQUAD((*pskb)->nh.iph->saddr),
848                        NIPQUAD((*pskb)->nh.iph->daddr),
849                        (*pskb)->sk, (*pskb)->pkt_type);
850         }
851 #endif
852
853         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
854
855         /* It may be a special packet, error, unclean...
856          * the inverse of the return code tells the netfilter
857          * core what to do with the packet. */
858         if (proto->error != NULL 
859             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
860                 CONNTRACK_STAT_INC(error);
861                 CONNTRACK_STAT_INC(invalid);
862                 return -ret;
863         }
864
865         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
866                 /* Not valid part of a connection */
867                 CONNTRACK_STAT_INC(invalid);
868                 return NF_ACCEPT;
869         }
870
871         if (IS_ERR(ct)) {
872                 /* Too stressed to deal. */
873                 CONNTRACK_STAT_INC(drop);
874                 return NF_DROP;
875         }
876
877         IP_NF_ASSERT((*pskb)->nfct);
878
879         ret = proto->packet(ct, *pskb, ctinfo);
880         if (ret < 0) {
881                 /* Invalid: inverse of the return code tells
882                  * the netfilter core what to do */
883                 nf_conntrack_put((*pskb)->nfct);
884                 (*pskb)->nfct = NULL;
885                 CONNTRACK_STAT_INC(invalid);
886                 return -ret;
887         }
888
889         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
890                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
891
892         return ret;
893 }
894
895 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
896                    const struct ip_conntrack_tuple *orig)
897 {
898         return ip_ct_invert_tuple(inverse, orig, 
899                                   __ip_conntrack_proto_find(orig->dst.protonum));
900 }
901
902 /* Would two expected things clash? */
903 static inline int expect_clash(const struct ip_conntrack_expect *a,
904                                const struct ip_conntrack_expect *b)
905 {
906         /* Part covered by intersection of masks must be unequal,
907            otherwise they clash */
908         struct ip_conntrack_tuple intersect_mask
909                 = { { a->mask.src.ip & b->mask.src.ip,
910                       { a->mask.src.u.all & b->mask.src.u.all } },
911                     { a->mask.dst.ip & b->mask.dst.ip,
912                       { a->mask.dst.u.all & b->mask.dst.u.all },
913                       a->mask.dst.protonum & b->mask.dst.protonum } };
914
915         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
916 }
917
918 static inline int expect_matches(const struct ip_conntrack_expect *a,
919                                  const struct ip_conntrack_expect *b)
920 {
921         return a->master == b->master
922                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
923                 && ip_ct_tuple_equal(&a->mask, &b->mask);
924 }
925
926 /* Generally a bad idea to call this: could have matched already. */
927 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
928 {
929         struct ip_conntrack_expect *i;
930
931         write_lock_bh(&ip_conntrack_lock);
932         /* choose the oldest expectation to evict */
933         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
934                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
935                         ip_ct_unlink_expect(i);
936                         write_unlock_bh(&ip_conntrack_lock);
937                         ip_conntrack_expect_put(i);
938                         return;
939                 }
940         }
941         write_unlock_bh(&ip_conntrack_lock);
942 }
943
944 /* We don't increase the master conntrack refcount for unfulfilled
945  * expectations. During conntrack destruction, the expectations are
946  * always killed before the conntrack itself. */
947 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
948 {
949         struct ip_conntrack_expect *new;
950
951         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
952         if (!new) {
953                 DEBUGP("expect_related: OOM allocating expect\n");
954                 return NULL;
955         }
956         new->master = me;
957         atomic_set(&new->use, 1);
958         return new;
959 }
960
961 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
962 {
963         if (atomic_dec_and_test(&exp->use))
964                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
965 }
966
967 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
968 {
969         atomic_inc(&exp->use);
970         exp->master->expecting++;
971         list_add(&exp->list, &ip_conntrack_expect_list);
972
973         init_timer(&exp->timeout);
974         exp->timeout.data = (unsigned long)exp;
975         exp->timeout.function = expectation_timed_out;
976         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
977         add_timer(&exp->timeout);
978
979         exp->id = ++ip_conntrack_expect_next_id;
980         atomic_inc(&exp->use);
981         CONNTRACK_STAT_INC(expect_create);
982 }
983
984 /* Race with expectations being used means we could have none to find; OK. */
985 static void evict_oldest_expect(struct ip_conntrack *master)
986 {
987         struct ip_conntrack_expect *i;
988
989         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
990                 if (i->master == master) {
991                         if (del_timer(&i->timeout)) {
992                                 ip_ct_unlink_expect(i);
993                                 ip_conntrack_expect_put(i);
994                         }
995                         break;
996                 }
997         }
998 }
999
1000 static inline int refresh_timer(struct ip_conntrack_expect *i)
1001 {
1002         if (!del_timer(&i->timeout))
1003                 return 0;
1004
1005         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1006         add_timer(&i->timeout);
1007         return 1;
1008 }
1009
1010 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1011 {
1012         struct ip_conntrack_expect *i;
1013         int ret;
1014
1015         DEBUGP("ip_conntrack_expect_related %p\n", expect);
1016         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1017         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
1018
1019         write_lock_bh(&ip_conntrack_lock);
1020         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1021                 if (expect_matches(i, expect)) {
1022                         /* Refresh timer: if it's dying, ignore.. */
1023                         if (refresh_timer(i)) {
1024                                 ret = 0;
1025                                 goto out;
1026                         }
1027                 } else if (expect_clash(i, expect)) {
1028                         ret = -EBUSY;
1029                         goto out;
1030                 }
1031         }
1032
1033         /* Will be over limit? */
1034         if (expect->master->helper->max_expected && 
1035             expect->master->expecting >= expect->master->helper->max_expected)
1036                 evict_oldest_expect(expect->master);
1037
1038         ip_conntrack_expect_insert(expect);
1039         ip_conntrack_expect_event(IPEXP_NEW, expect);
1040         ret = 0;
1041 out:
1042         write_unlock_bh(&ip_conntrack_lock);
1043         return ret;
1044 }
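
/* Usage sketch (illustrative): how a conntrack helper typically registers an
 * expectation with the functions above.  The port and mask values are
 * invented for the example; real helpers (FTP, TFTP, ...) derive them from
 * the packet payload. */
#if 0
static int example_expect_data_channel(struct ip_conntrack *ct)
{
        struct ip_conntrack_expect *exp;
        int ret = NF_ACCEPT;

        exp = ip_conntrack_expect_alloc(ct);    /* ct is the master */
        if (exp == NULL)
                return NF_DROP;

        /* expect the peer to connect back to port 2000, from any source port */
        exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
        exp->tuple.dst.u.tcp.port = htons(2000);
        exp->mask.src.ip = 0xFFFFFFFF;
        exp->mask.src.u.tcp.port = 0;
        exp->mask.dst.ip = 0xFFFFFFFF;
        exp->mask.dst.u.tcp.port = 0xFFFF;
        exp->mask.dst.protonum = 0xFF;
        exp->expectfn = NULL;
        exp->flags = 0;

        if (ip_conntrack_expect_related(exp) != 0)
                ret = NF_DROP;

        /* drop the reference from expect_alloc(); the insert path takes its own */
        ip_conntrack_expect_put(exp);
        return ret;
}
#endif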
1045
1046 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1047    implicitly racy: see __ip_conntrack_confirm */
1048 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1049                               const struct ip_conntrack_tuple *newreply)
1050 {
1051         write_lock_bh(&ip_conntrack_lock);
1052         /* Should be unconfirmed, so not in hash table yet */
1053         IP_NF_ASSERT(!is_confirmed(conntrack));
1054
1055         DEBUGP("Altering reply tuple of %p to ", conntrack);
1056         DUMP_TUPLE(newreply);
1057
1058         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1059         if (!conntrack->master && conntrack->expecting == 0)
1060                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1061         write_unlock_bh(&ip_conntrack_lock);
1062 }
1063
1064 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1065 {
1066         BUG_ON(me->timeout == 0);
1067         write_lock_bh(&ip_conntrack_lock);
1068         list_prepend(&helpers, me);
1069         write_unlock_bh(&ip_conntrack_lock);
1070
1071         return 0;
1072 }
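
/* Usage sketch (illustrative): the skeleton of a helper module built around
 * ip_conntrack_helper_register().  The name, port, timeout and the exact
 * field layout are assumptions made for the example; see ip_conntrack_ftp.c
 * and friends for real helpers. */
#if 0
static int example_help(struct sk_buff **pskb,
                        struct ip_conntrack *ct,
                        enum ip_conntrack_info ctinfo)
{
        /* parse the payload, possibly set up expectations ... */
        return NF_ACCEPT;
}

static struct ip_conntrack_helper example_helper;

static int __init example_init(void)
{
        example_helper.tuple.src.u.tcp.port = htons(2000);     /* example port */
        example_helper.tuple.dst.protonum = IPPROTO_TCP;
        example_helper.mask.src.u.tcp.port = 0xFFFF;
        example_helper.mask.dst.protonum = 0xFF;
        example_helper.max_expected = 1;
        example_helper.timeout = 5 * 60;        /* seconds */
        example_helper.me = THIS_MODULE;
        example_helper.name = "example";
        example_helper.help = example_help;

        return ip_conntrack_helper_register(&example_helper);
}

static void __exit example_fini(void)
{
        ip_conntrack_helper_unregister(&example_helper);
}
#endif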
1073
1074 struct ip_conntrack_helper *
1075 __ip_conntrack_helper_find_byname(const char *name)
1076 {
1077         struct ip_conntrack_helper *h;
1078
1079         list_for_each_entry(h, &helpers, list) {
1080                 if (!strcmp(h->name, name))
1081                         return h;
1082         }
1083
1084         return NULL;
1085 }
1086
1087 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1088                          const struct ip_conntrack_helper *me)
1089 {
1090         if (tuplehash_to_ctrack(i)->helper == me) {
1091                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1092                 tuplehash_to_ctrack(i)->helper = NULL;
1093         }
1094         return 0;
1095 }
1096
1097 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1098 {
1099         unsigned int i;
1100         struct ip_conntrack_expect *exp, *tmp;
1101
1102         /* Need write lock here, to delete helper. */
1103         write_lock_bh(&ip_conntrack_lock);
1104         LIST_DELETE(&helpers, me);
1105
1106         /* Get rid of expectations */
1107         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1108                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1109                         ip_ct_unlink_expect(exp);
1110                         ip_conntrack_expect_put(exp);
1111                 }
1112         }
1113         /* Unset the helper on remaining unconfirmed and hashed conntracks. */
1114         LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1115         for (i = 0; i < ip_conntrack_htable_size; i++)
1116                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1117                             struct ip_conntrack_tuple_hash *, me);
1118         write_unlock_bh(&ip_conntrack_lock);
1119
1120         /* Someone could be still looking at the helper in a bh. */
1121         synchronize_net();
1122 }
1123
1124 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1125 void __ip_ct_refresh_acct(struct ip_conntrack *ct, 
1126                         enum ip_conntrack_info ctinfo,
1127                         const struct sk_buff *skb,
1128                         unsigned long extra_jiffies,
1129                         int do_acct)
1130 {
1131         int event = 0;
1132
1133         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1134         IP_NF_ASSERT(skb);
1135
1136         write_lock_bh(&ip_conntrack_lock);
1137
1138         /* If not in hash table, timer will not be active yet */
1139         if (!is_confirmed(ct)) {
1140                 ct->timeout.expires = extra_jiffies;
1141                 event = IPCT_REFRESH;
1142         } else {
1143                 /* Need del_timer for race avoidance (may already be dying). */
1144                 if (del_timer(&ct->timeout)) {
1145                         ct->timeout.expires = jiffies + extra_jiffies;
1146                         add_timer(&ct->timeout);
1147                         event = IPCT_REFRESH;
1148                 }
1149         }
1150
1151 #ifdef CONFIG_IP_NF_CT_ACCT
1152         if (do_acct) {
1153                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1154                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
1155                                                 ntohs(skb->nh.iph->tot_len);
1156                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1157                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1158                         event |= IPCT_COUNTER_FILLING;
1159         }
1160 #endif
1161
1162         write_unlock_bh(&ip_conntrack_lock);
1163
1164         /* must be unlocked when calling event cache */
1165         if (event)
1166                 ip_conntrack_event_cache(event, skb);
1167 }
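
/* Usage sketch (illustrative): per-protocol trackers extend a connection's
 * lifetime after accepting a packet, normally through the thin
 * ip_ct_refresh_acct()/ip_ct_refresh() wrappers in the header, which end up
 * here.  The 30*HZ timeout is an arbitrary example value. */
#if 0
        __ip_ct_refresh_acct(ct, ctinfo, skb, 30 * HZ, 1);      /* with accounting */
        __ip_ct_refresh_acct(ct, ctinfo, skb, 30 * HZ, 0);      /* timer only */
#endif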
1168
1169 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1170     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1171 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1172  * in ip_conntrack_core, since we don't want the protocols to autoload
1173  * or depend on ctnetlink */
1174 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1175                                const struct ip_conntrack_tuple *tuple)
1176 {
1177         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1178                 &tuple->src.u.tcp.port);
1179         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1180                 &tuple->dst.u.tcp.port);
1181         return 0;
1182
1183 nfattr_failure:
1184         return -1;
1185 }
1186
1187 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1188                                struct ip_conntrack_tuple *t)
1189 {
1190         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1191                 return -EINVAL;
1192
1193         t->src.u.tcp.port =
1194                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1195         t->dst.u.tcp.port =
1196                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1197
1198         return 0;
1199 }
1200 #endif
1201
1202 /* Returns new sk_buff, or NULL */
1203 struct sk_buff *
1204 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1205 {
1206         skb_orphan(skb);
1207
1208         local_bh_disable(); 
1209         skb = ip_defrag(skb, user);
1210         local_bh_enable();
1211
1212         if (skb)
1213                 ip_send_check(skb->nh.iph);
1214         return skb;
1215 }
1216
1217 /* Used by ipt_REJECT. */
1218 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1219 {
1220         struct ip_conntrack *ct;
1221         enum ip_conntrack_info ctinfo;
1222
1223         /* This ICMP is in reverse direction to the packet which caused it */
1224         ct = ip_conntrack_get(skb, &ctinfo);
1225         
1226         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1227                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1228         else
1229                 ctinfo = IP_CT_RELATED;
1230
1231         /* Attach to new skbuff, and increment count */
1232         nskb->nfct = &ct->ct_general;
1233         nskb->nfctinfo = ctinfo;
1234         nf_conntrack_get(nskb->nfct);
1235 }
1236
1237 static inline int
1238 do_iter(const struct ip_conntrack_tuple_hash *i,
1239         int (*iter)(struct ip_conntrack *i, void *data),
1240         void *data)
1241 {
1242         return iter(tuplehash_to_ctrack(i), data);
1243 }
1244
1245 /* Bring out ya dead! */
1246 static struct ip_conntrack_tuple_hash *
1247 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1248                 void *data, unsigned int *bucket)
1249 {
1250         struct ip_conntrack_tuple_hash *h = NULL;
1251
1252         write_lock_bh(&ip_conntrack_lock);
1253         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1254                 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1255                                 struct ip_conntrack_tuple_hash *, iter, data);
1256                 if (h)
1257                         break;
1258         }
1259         if (!h)
1260                 h = LIST_FIND_W(&unconfirmed, do_iter,
1261                                 struct ip_conntrack_tuple_hash *, iter, data);
1262         if (h)
1263                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1264         write_unlock_bh(&ip_conntrack_lock);
1265
1266         return h;
1267 }
1268
1269 void
1270 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1271 {
1272         struct ip_conntrack_tuple_hash *h;
1273         unsigned int bucket = 0;
1274
1275         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1276                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1277                 /* Time to push up daisies... */
1278                 if (del_timer(&ct->timeout))
1279                         death_by_timeout((unsigned long)ct);
1280                 /* ... else the timer will get him soon. */
1281
1282                 ip_conntrack_put(ct);
1283         }
1284 }
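
/* Usage sketch (illustrative): ip_ct_iterate_cleanup() kills every conntrack
 * for which the callback returns non-zero; kill_all() below uses it to flush
 * the whole table.  This hypothetical callback would only drop entries whose
 * original source address matches. */
#if 0
static int kill_from_saddr(struct ip_conntrack *ct, void *data)
{
        u_int32_t saddr = *(u_int32_t *)data;

        return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == saddr;
}

/* caller:
 *      u_int32_t addr = htonl(0xc0a80001);
 *      ip_ct_iterate_cleanup(kill_from_saddr, &addr);
 */
#endif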
1285
1286 /* Fast function for those who don't want to parse /proc (and I don't
1287    blame them). */
1288 /* Reversing the socket's dst/src point of view gives us the reply
1289    mapping. */
1290 static int
1291 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1292 {
1293         struct inet_sock *inet = inet_sk(sk);
1294         struct ip_conntrack_tuple_hash *h;
1295         struct ip_conntrack_tuple tuple;
1296         
1297         IP_CT_TUPLE_U_BLANK(&tuple);
1298         tuple.src.ip = inet->rcv_saddr;
1299         tuple.src.u.tcp.port = inet->sport;
1300         tuple.dst.ip = inet->daddr;
1301         tuple.dst.u.tcp.port = inet->dport;
1302         tuple.dst.protonum = IPPROTO_TCP;
1303
1304         /* We only do TCP at the moment: is there a better way? */
1305         if (strcmp(sk->sk_prot->name, "TCP")) {
1306                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1307                 return -ENOPROTOOPT;
1308         }
1309
1310         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1311                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1312                        *len, sizeof(struct sockaddr_in));
1313                 return -EINVAL;
1314         }
1315
1316         h = ip_conntrack_find_get(&tuple, NULL);
1317         if (h) {
1318                 struct sockaddr_in sin;
1319                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1320
1321                 sin.sin_family = AF_INET;
1322                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1323                         .tuple.dst.u.tcp.port;
1324                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1325                         .tuple.dst.ip;
1326                 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1327
1328                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1329                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1330                 ip_conntrack_put(ct);
1331                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1332                         return -EFAULT;
1333                 else
1334                         return 0;
1335         }
1336         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1337                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1338                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1339         return -ENOENT;
1340 }
1341
1342 static struct nf_sockopt_ops so_getorigdst = {
1343         .pf             = PF_INET,
1344         .get_optmin     = SO_ORIGINAL_DST,
1345         .get_optmax     = SO_ORIGINAL_DST+1,
1346         .get            = &getorigdst,
1347 };
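
/* Userspace sketch (illustrative): how a transparent proxy retrieves the
 * pre-NAT destination through the SO_ORIGINAL_DST socket option served by
 * getorigdst() above.  Error handling is omitted. */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv4.h>       /* SO_ORIGINAL_DST */

static int get_original_dst(int fd, struct sockaddr_in *dst)
{
        socklen_t len = sizeof(*dst);

        /* fd is a connected TCP socket accepted by the proxy */
        return getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, dst, &len);
}
#endif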
1348
1349 static int kill_all(struct ip_conntrack *i, void *data)
1350 {
1351         return 1;
1352 }
1353
1354 void ip_conntrack_flush(void)
1355 {
1356         ip_ct_iterate_cleanup(kill_all, NULL);
1357 }
1358
1359 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1360 {
1361         if (vmalloced)
1362                 vfree(hash);
1363         else
1364                 free_pages((unsigned long)hash, 
1365                            get_order(sizeof(struct list_head) * size));
1366 }
1367
1368 /* Mishearing the voices in his head, our hero wonders how he's
1369    supposed to kill the mall. */
1370 void ip_conntrack_cleanup(void)
1371 {
1372         ip_ct_attach = NULL;
1373
1374         /* This makes sure all current packets have passed through
1375            netfilter framework.  Roll on, two-stage module
1376            delete... */
1377         synchronize_net();
1378
1379         ip_ct_event_cache_flush();
1380  i_see_dead_people:
1381         ip_conntrack_flush();
1382         if (atomic_read(&ip_conntrack_count) != 0) {
1383                 schedule();
1384                 goto i_see_dead_people;
1385         }
1386         /* wait until all references to ip_conntrack_untracked are dropped */
1387         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1388                 schedule();
1389
1390         kmem_cache_destroy(ip_conntrack_cachep);
1391         kmem_cache_destroy(ip_conntrack_expect_cachep);
1392         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1393                             ip_conntrack_htable_size);
1394         nf_unregister_sockopt(&so_getorigdst);
1395 }
1396
1397 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1398 {
1399         struct list_head *hash;
1400         unsigned int i;
1401
1402         *vmalloced = 0; 
1403         hash = (void*)__get_free_pages(GFP_KERNEL, 
1404                                        get_order(sizeof(struct list_head)
1405                                                  * size));
1406         if (!hash) { 
1407                 *vmalloced = 1;
1408                 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1409                 hash = vmalloc(sizeof(struct list_head) * size);
1410         }
1411
1412         if (hash)
1413                 for (i = 0; i < size; i++)
1414                         INIT_LIST_HEAD(&hash[i]);
1415
1416         return hash;
1417 }
1418
1419 static int set_hashsize(const char *val, struct kernel_param *kp)
1420 {
1421         int i, bucket, hashsize, vmalloced;
1422         int old_vmalloced, old_size;
1423         int rnd;
1424         struct list_head *hash, *old_hash;
1425         struct ip_conntrack_tuple_hash *h;
1426
1427         /* On boot, we can set this without any fancy locking. */
1428         if (!ip_conntrack_htable_size)
1429                 return param_set_int(val, kp);
1430
1431         hashsize = simple_strtol(val, NULL, 0);
1432         if (!hashsize)
1433                 return -EINVAL;
1434
1435         hash = alloc_hashtable(hashsize, &vmalloced);
1436         if (!hash)
1437                 return -ENOMEM;
1438
1439         /* We have to rehash for the new table anyway, so we also can 
1440          * use a new random seed */
1441         get_random_bytes(&rnd, 4);
1442
1443         write_lock_bh(&ip_conntrack_lock);
1444         for (i = 0; i < ip_conntrack_htable_size; i++) {
1445                 while (!list_empty(&ip_conntrack_hash[i])) {
1446                         h = list_entry(ip_conntrack_hash[i].next,
1447                                        struct ip_conntrack_tuple_hash, list);
1448                         list_del(&h->list);
1449                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1450                         list_add_tail(&h->list, &hash[bucket]);
1451                 }
1452         }
1453         old_size = ip_conntrack_htable_size;
1454         old_vmalloced = ip_conntrack_vmalloc;
1455         old_hash = ip_conntrack_hash;
1456
1457         ip_conntrack_htable_size = hashsize;
1458         ip_conntrack_vmalloc = vmalloced;
1459         ip_conntrack_hash = hash;
1460         ip_conntrack_hash_rnd = rnd;
1461         write_unlock_bh(&ip_conntrack_lock);
1462
1463         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1464         return 0;
1465 }
1466
1467 module_param_call(hashsize, set_hashsize, param_get_uint,
1468                   &ip_conntrack_htable_size, 0600);
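
/* Runtime tuning note: because the parameter above is registered with mode
 * 0600, the hash size can be changed on the fly when conntrack is built as
 * a module, e.g.
 *
 *      echo 16384 > /sys/module/ip_conntrack/parameters/hashsize
 *
 * which lands in set_hashsize() and rehashes every existing entry into the
 * new table under ip_conntrack_lock.  The value 16384 is just an example. */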
1469
1470 int __init ip_conntrack_init(void)
1471 {
1472         unsigned int i;
1473         int ret;
1474
1475         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1476          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1477         if (!ip_conntrack_htable_size) {
1478                 ip_conntrack_htable_size
1479                         = (((num_physpages << PAGE_SHIFT) / 16384)
1480                            / sizeof(struct list_head));
1481                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1482                         ip_conntrack_htable_size = 8192;
1483                 if (ip_conntrack_htable_size < 16)
1484                         ip_conntrack_htable_size = 16;
1485         }
1486         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1487
1488         printk("ip_conntrack version %s (%u buckets, %d max)"
1489                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1490                ip_conntrack_htable_size, ip_conntrack_max,
1491                sizeof(struct ip_conntrack));
1492
1493         ret = nf_register_sockopt(&so_getorigdst);
1494         if (ret != 0) {
1495                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1496                 return ret;
1497         }
1498
1499         ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1500                                             &ip_conntrack_vmalloc);
1501         if (!ip_conntrack_hash) {
1502                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1503                 goto err_unreg_sockopt;
1504         }
1505
1506         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1507                                                 sizeof(struct ip_conntrack), 0,
1508                                                 0, NULL, NULL);
1509         if (!ip_conntrack_cachep) {
1510                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1511                 goto err_free_hash;
1512         }
1513
1514         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1515                                         sizeof(struct ip_conntrack_expect),
1516                                         0, 0, NULL, NULL);
1517         if (!ip_conntrack_expect_cachep) {
1518                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1519                 goto err_free_conntrack_slab;
1520         }
1521
1522         /* Don't NEED lock here, but good form anyway. */
1523         write_lock_bh(&ip_conntrack_lock);
1524         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1525                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1526         /* Sew in builtin protocols. */
1527         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1528         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1529         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1530         write_unlock_bh(&ip_conntrack_lock);
1531
1532         /* For use by ipt_REJECT */
1533         ip_ct_attach = ip_conntrack_attach;
1534
1535         /* Set up fake conntrack:
1536             - to never be deleted, not in any hashes */
1537         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1538         /*  - and make it look like a confirmed connection */
1539         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1540
1541         return ret;
1542
1543 err_free_conntrack_slab:
1544         kmem_cache_destroy(ip_conntrack_cachep);
1545 err_free_hash:
1546         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1547                             ip_conntrack_htable_size);
1548 err_unreg_sockopt:
1549         nf_unregister_sockopt(&so_getorigdst);
1550
1551         return -ENOMEM;
1552 }