/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *      - new API and handling of conntrack/nat helpers
 *      - now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *      - add usage/reference counts to ip_conntrack_expect
 *      - export ip_conntrack[_expect]_{find_get,put} functions
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>

/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, and conntrack timers. */
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION    "2.1"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DECLARE_RWLOCK(ip_conntrack_lock);

/* ip_conntrack_standalone needs this */
atomic_t ip_conntrack_count = ATOMIC_INIT(0);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
static kmem_cache_t *ip_conntrack_expect_cachep;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc;

DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);

void
ip_conntrack_put(struct ip_conntrack *ct)
{
        IP_NF_ASSERT(ct);
        nf_conntrack_put(&ct->ct_general);
}

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

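/* Hash a tuple into the conntrack table.  Source address,
 * destination address/protocol and both port words are mixed with a
 * boot-time random seed, so remote hosts cannot predictably collide
 * hash chains. */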
static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
#if 0
        dump_tuple(tuple);
#endif
        return (jhash_3words(tuple->src.ip,
                             (tuple->dst.ip ^ tuple->dst.protonum),
                             (tuple->src.u.all | (tuple->dst.u.all << 16)),
                             ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}

int
ip_ct_get_tuple(const struct iphdr *iph,
                const struct sk_buff *skb,
                unsigned int dataoff,
                struct ip_conntrack_tuple *tuple,
                const struct ip_conntrack_protocol *protocol)
{
        /* Should never happen: fragments are gathered before we get here. */
        if (iph->frag_off & htons(IP_OFFSET)) {
                printk("ip_conntrack_core: Frag of proto %u.\n",
                       iph->protocol);
                return 0;
        }

        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;
        tuple->src.u.all = tuple->dst.u.all = 0;
        tuple->dst.dir = IP_CT_DIR_ORIGINAL;

        return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

int
ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig,
                   const struct ip_conntrack_protocol *protocol)
{
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;
        inverse->dst.dir = !orig->dst.dir;

        inverse->src.u.all = inverse->dst.u.all = 0;

        return protocol->invert_tuple(inverse, orig);
}

/* ip_conntrack_expect helper functions */
static void destroy_expect(struct ip_conntrack_expect *exp)
{
        ip_conntrack_put(exp->master);
        IP_NF_ASSERT(!timer_pending(&exp->timeout));
        kmem_cache_free(ip_conntrack_expect_cachep, exp);
        CONNTRACK_STAT_INC(expect_delete);
}

static void unlink_expect(struct ip_conntrack_expect *exp)
{
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
        list_del(&exp->list);
        /* Logically in destroy_expect, but we hold the lock here. */
        exp->master->expecting--;
}

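/* Timer callback: the expectation expired without ever being matched
 * by an incoming connection. */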
static void expectation_timed_out(unsigned long ul_expect)
{
        struct ip_conntrack_expect *exp = (void *)ul_expect;

        WRITE_LOCK(&ip_conntrack_lock);
        unlink_expect(exp);
        WRITE_UNLOCK(&ip_conntrack_lock);
        destroy_expect(exp);
}

/* If an expectation for this connection is found, it is unlinked from
 * the global list and returned. */
static struct ip_conntrack_expect *
find_expectation(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                /* If the master is not in the hash table yet (ie. the packet
                   hasn't left this machine yet), how could the other end know
                   about the expected connection?  Hence these are not the
                   droids you are looking for (if the master ct never got
                   confirmed, we'd hold a reference to it and weird things
                   would happen to future packets). */
                if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
                    && is_confirmed(i->master)
                    && del_timer(&i->timeout)) {
                        unlink_expect(i);
                        return i;
                }
        }
        return NULL;
}

/* Delete all expectations for this conntrack */
static void remove_expectations(struct ip_conntrack *ct)
{
        struct ip_conntrack_expect *i, *tmp;

        /* Optimization: most connections never expect any others. */
        if (ct->expecting == 0)
                return;

        list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
                if (i->master == ct && del_timer(&i->timeout)) {
                        unlink_expect(i);
                        destroy_expect(i);
                }
        }
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
        unsigned int ho, hr;

        DEBUGP("clean_from_lists(%p)\n", ct);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
        LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

        /* Destroy all pending expectations */
        remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
        struct ip_conntrack_protocol *proto;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
        proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);

        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* Expectations will have been removed in clean_from_lists,
         * except TFTP can create an expectation on the first packet,
         * before connection is in the list, so we need to clean here,
         * too. */
        remove_expectations(ct);

        /* We overload first tuple to link into unconfirmed list. */
        if (!is_confirmed(ct)) {
                BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
                list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
        }

        CONNTRACK_STAT_INC(delete);
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (ct->master)
                ip_conntrack_put(ct->master);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        kmem_cache_free(ip_conntrack_cachep, ct);
        atomic_dec(&ip_conntrack_count);
}

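/* Timer callback (also called directly by early_drop and cleanup):
 * unlink the conntrack from all lists and drop the table's reference. */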
static void death_by_timeout(unsigned long ul_conntrack)
{
        struct ip_conntrack *ct = (void *)ul_conntrack;

        WRITE_LOCK(&ip_conntrack_lock);
        /* Inside lock so preempt is disabled on module removal path.
         * Otherwise we can get spurious warnings. */
        CONNTRACK_STAT_INC(delete_list);
        clean_from_lists(ct);
        WRITE_UNLOCK(&ip_conntrack_lock);
        ip_conntrack_put(ct);
}

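/* Bucket-walk comparison: true if this entry's tuple equals the one
 * sought and the entry does not belong to ignored_conntrack (used by
 * NAT to skip the conntrack it is currently setting up). */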
static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
                    const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        return tuplehash_to_ctrack(i) != ignored_conntrack
                && ip_ct_tuple_equal(tuple, &i->tuple);
}

static struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int hash = hash_conntrack(tuple);

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
                if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
                        CONNTRACK_STAT_INC(found);
                        return h;
                }
                CONNTRACK_STAT_INC(searched);
        }

        return NULL;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
                      const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

/* Confirm a connection given skb; places it in hash table */
int
__ip_conntrack_confirm(struct sk_buff **pskb)
{
        unsigned int hash, repl_hash;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = ip_conntrack_get(*pskb, &ctinfo);

        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in the other direction.  The actual packet
           which created the connection will be IP_CT_NEW or, for an
           expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in the hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        WRITE_LOCK(&ip_conntrack_lock);

        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost the race. */
        if (!LIST_FIND(&ip_conntrack_hash[hash],
                       conntrack_tuple_cmp,
                       struct ip_conntrack_tuple_hash *,
                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
            && !LIST_FIND(&ip_conntrack_hash[repl_hash],
                          conntrack_tuple_cmp,
                          struct ip_conntrack_tuple_hash *,
                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
                /* Remove from unconfirmed list */
                list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);

                list_prepend(&ip_conntrack_hash[hash],
                             &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
                list_prepend(&ip_conntrack_hash[repl_hash],
                             &ct->tuplehash[IP_CT_DIR_REPLY]);
                /* Timer relative to confirmation time, not original
                   setting time, otherwise we'd get timer wrap in
                   weird delay cases. */
                ct->timeout.expires += jiffies;
                add_timer(&ct->timeout);
                atomic_inc(&ct->ct_general.use);
                set_bit(IPS_CONFIRMED_BIT, &ct->status);
                CONNTRACK_STAT_INC(insert);
                WRITE_UNLOCK(&ip_conntrack_lock);
                return NF_ACCEPT;
        }

        CONNTRACK_STAT_INC(insert_failed);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return NF_DROP;
}

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        READ_UNLOCK(&ip_conntrack_lock);

        return h != NULL;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
        return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
}

static int early_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct = NULL;
        int dropped = 0;

        READ_LOCK(&ip_conntrack_lock);
        h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
        if (h) {
                ct = tuplehash_to_ctrack(h);
                atomic_inc(&ct->ct_general.use);
        }
        READ_UNLOCK(&ip_conntrack_lock);

        if (!ct)
                return dropped;

        if (del_timer(&ct->timeout)) {
                death_by_timeout((unsigned long)ct);
                dropped = 1;
                CONNTRACK_STAT_INC(early_drop);
        }
        ip_conntrack_put(ct);
        return dropped;
}

static inline int helper_cmp(const struct ip_conntrack_helper *i,
                             const struct ip_conntrack_tuple *rtuple)
{
        return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
{
        return LIST_FIND(&helpers, helper_cmp,
                         struct ip_conntrack_helper *,
                         tuple);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
               struct ip_conntrack_protocol *protocol,
               struct sk_buff *skb)
{
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        size_t hash;
        struct ip_conntrack_expect *exp;

        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
                ip_conntrack_hash_rnd_initted = 1;
        }

        hash = hash_conntrack(tuple);

        if (ip_conntrack_max
            && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
                /* Try dropping from this hash chain. */
                if (!early_drop(&ip_conntrack_hash[hash])) {
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }

        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
                return ERR_PTR(-ENOMEM);
        }

        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
#if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
        conntrack->xid[IP_CT_DIR_ORIGINAL] = -1;
        conntrack->xid[IP_CT_DIR_REPLY] = -1;
#endif
        if (!protocol->new(conntrack, skb)) {
                kmem_cache_free(ip_conntrack_cachep, conntrack);
                return NULL;
        }
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;

        WRITE_LOCK(&ip_conntrack_lock);
        exp = find_expectation(tuple);

        if (exp) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                        conntrack, exp);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
                conntrack->master = exp->master;
#ifdef CONFIG_IP_NF_CONNTRACK_MARK
                conntrack->mark = exp->master->mark;
#endif
                nf_conntrack_get(&conntrack->master->ct_general);
                CONNTRACK_STAT_INC(expect_new);
        } else {
                conntrack->helper = ip_ct_find_helper(&repl_tuple);

                CONNTRACK_STAT_INC(new);
        }

        /* Overload tuple linked list to put us in unconfirmed list. */
        list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);

        atomic_inc(&ip_conntrack_count);
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (exp) {
                if (exp->expectfn)
                        exp->expectfn(conntrack, exp);
                destroy_expect(exp);
        }

        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
                  struct ip_conntrack_protocol *proto,
                  int *set_reply,
                  unsigned int hooknum,
                  enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct;

        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

        if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
                             &tuple, proto))
                return NULL;

        /* look for tuple match */
        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, proto, skb);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }
        ct = tuplehash_to_ctrack(h);

        /* It exists; we have (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Please set reply bit if this packet OK */
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
                if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &ct->ct_general;
        skb->nfctinfo = *ctinfo;
        return ct;
}

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
                             struct sk_buff **pskb,
                             const struct net_device *in,
                             const struct net_device *out,
                             int (*okfn)(struct sk_buff *))
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
        int set_reply;
        int ret;

        /* Previously seen (loopback or untracked)?  Ignore. */
        if ((*pskb)->nfct) {
                CONNTRACK_STAT_INC(ignore);
                return NF_ACCEPT;
        }

        /* Should never happen: fragments are gathered before this hook. */
        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit()) {
                        printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
                               (*pskb)->nh.iph->protocol, hooknum);
                }
                return NF_DROP;
        }

        /* FIXME: Do this right please. --RR */
        (*pskb)->nfcache |= NFC_UNKNOWN;

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
        /* Ignore broadcast: no `connection'. */
        if ((*pskb)->pkt_type == PACKET_BROADCAST) {
                printk("Broadcast packet!\n");
                return NF_ACCEPT;
        } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
                   == htonl(0x000000FF)) {
                printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
                       NIPQUAD((*pskb)->nh.iph->saddr),
                       NIPQUAD((*pskb)->nh.iph->daddr),
                       (*pskb)->sk, (*pskb)->pkt_type);
        }
#endif

        proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);

        /* It may be a special packet: error, unclean...  The inverse
         * of the return code tells the netfilter core what to do with
         * the packet. */
        if (proto->error != NULL
            && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
                CONNTRACK_STAT_INC(error);
                CONNTRACK_STAT_INC(invalid);
                return -ret;
        }

        if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum,
                                     &ctinfo))) {
                /* Not valid part of a connection */
                CONNTRACK_STAT_INC(invalid);
                return NF_ACCEPT;
        }

        if (IS_ERR(ct)) {
                /* Too stressed to deal. */
                CONNTRACK_STAT_INC(drop);
                return NF_DROP;
        }

        IP_NF_ASSERT((*pskb)->nfct);

        ret = proto->packet(ct, *pskb, ctinfo);
        if (ret < 0) {
                /* Invalid: the inverse of the return code tells
                 * the netfilter core what to do */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                CONNTRACK_STAT_INC(invalid);
                return -ret;
        }

        if (set_reply)
                set_bit(IPS_SEEN_REPLY_BIT, &ct->status);

        return ret;
}

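/* Like ip_ct_invert_tuple(), but looks up the protocol handler
 * itself. */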
int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
{
        return ip_ct_invert_tuple(inverse, orig,
                                  ip_ct_find_proto(orig->dst.protonum));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *a,
                               const struct ip_conntrack_expect *b)
{
        /* Part covered by intersection of masks must be unequal,
           otherwise they clash */
        struct ip_conntrack_tuple intersect_mask
                = { { a->mask.src.ip & b->mask.src.ip,
                      { a->mask.src.u.all & b->mask.src.u.all } },
                    { a->mask.dst.ip & b->mask.dst.ip,
                      { a->mask.dst.u.all & b->mask.dst.u.all },
                      a->mask.dst.protonum & b->mask.dst.protonum } };

        return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}

static inline int expect_matches(const struct ip_conntrack_expect *a,
                                 const struct ip_conntrack_expect *b)
{
        return a->master == b->master
                && ip_ct_tuple_equal(&a->tuple, &b->tuple)
                && ip_ct_tuple_equal(&a->mask, &b->mask);
}

/* Generally a bad idea to call this: could have matched already. */
void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
{
        struct ip_conntrack_expect *i;

        WRITE_LOCK(&ip_conntrack_lock);
        /* choose the oldest expectation to evict */
        list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                if (expect_matches(i, exp) && del_timer(&i->timeout)) {
                        unlink_expect(i);
                        WRITE_UNLOCK(&ip_conntrack_lock);
                        destroy_expect(i);
                        return;
                }
        }
        WRITE_UNLOCK(&ip_conntrack_lock);
}

struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
{
        struct ip_conntrack_expect *new;

        new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }
        new->master = NULL;
        return new;
}

void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
{
        kmem_cache_free(ip_conntrack_expect_cachep, expect);
}

static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
{
        atomic_inc(&exp->master->ct_general.use);
        exp->master->expecting++;
        list_add(&exp->list, &ip_conntrack_expect_list);

        if (exp->master->helper->timeout) {
                init_timer(&exp->timeout);
                exp->timeout.data = (unsigned long)exp;
                exp->timeout.function = expectation_timed_out;
                exp->timeout.expires
                        = jiffies + exp->master->helper->timeout * HZ;
                add_timer(&exp->timeout);
        } else
                exp->timeout.function = NULL;

        CONNTRACK_STAT_INC(expect_create);
}

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct ip_conntrack *master)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                if (i->master == master) {
                        if (del_timer(&i->timeout)) {
                                unlink_expect(i);
                                destroy_expect(i);
                        }
                        break;
                }
        }
}

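/* Restart an expectation's timeout.  Returns 0 if the timer had
 * already fired (the expectation is dying), 1 otherwise. */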
static inline int refresh_timer(struct ip_conntrack_expect *i)
{
        if (!del_timer(&i->timeout))
                return 0;

        i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
        add_timer(&i->timeout);
        return 1;
}

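/* Register a new expectation for a master connection: an identical
 * existing expectation just has its timer refreshed, a clashing one
 * fails with -EBUSY, and if the helper's max_expected limit is hit
 * the oldest expectation is evicted first. */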
int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
{
        struct ip_conntrack_expect *i;
        int ret;

        DEBUGP("ip_conntrack_expect_related %p\n", expect);
        DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

        WRITE_LOCK(&ip_conntrack_lock);
        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                if (expect_matches(i, expect)) {
                        /* Refresh timer: if it's dying, ignore.. */
                        if (refresh_timer(i)) {
                                ret = 0;
                                /* We don't need the one they've given us. */
                                ip_conntrack_expect_free(expect);
                                goto out;
                        }
                } else if (expect_clash(i, expect)) {
                        ret = -EBUSY;
                        goto out;
                }
        }

        /* Will we be over the limit? */
        if (expect->master->helper->max_expected &&
            expect->master->expecting >= expect->master->helper->max_expected)
                evict_oldest_expect(expect->master);

        ip_conntrack_expect_insert(expect);
        ret = 0;
out:
        WRITE_UNLOCK(&ip_conntrack_lock);
        return ret;
}

/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __ip_conntrack_confirm */
void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
                              const struct ip_conntrack_tuple *newreply)
{
        WRITE_LOCK(&ip_conntrack_lock);
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));

        DEBUGP("Altering reply tuple of %p to ", conntrack);
        DUMP_TUPLE(newreply);

        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (!conntrack->master && conntrack->expecting == 0)
                conntrack->helper = ip_ct_find_helper(newreply);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
        BUG_ON(me->timeout == 0);
        WRITE_LOCK(&ip_conntrack_lock);
        list_prepend(&helpers, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 0;
}

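/* Hash-walk callback: detach this helper from any conntrack still
 * pointing at it. */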
static inline int unhelp(struct ip_conntrack_tuple_hash *i,
                         const struct ip_conntrack_helper *me)
{
        if (tuplehash_to_ctrack(i)->helper == me)
                tuplehash_to_ctrack(i)->helper = NULL;
        return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
        unsigned int i;
        struct ip_conntrack_expect *exp, *tmp;

        /* Need write lock here, to delete helper. */
        WRITE_LOCK(&ip_conntrack_lock);
        LIST_DELETE(&helpers, me);

        /* Get rid of expectations */
        list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
                if (exp->master->helper == me && del_timer(&exp->timeout)) {
                        unlink_expect(exp);
                        destroy_expect(exp);
                }
        }
        /* Get rid of expecteds, set helpers to NULL. */
        LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
        for (i = 0; i < ip_conntrack_htable_size; i++)
                LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
                            struct ip_conntrack_tuple_hash *, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        /* Someone could still be looking at the helper in a bh. */
        synchronize_net();
}

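/* Per-direction packet and byte accounting; compiled away unless
 * CONFIG_IP_NF_CT_ACCT is enabled. */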
static inline void ct_add_counters(struct ip_conntrack *ct,
                                   enum ip_conntrack_info ctinfo,
                                   const struct sk_buff *skb)
{
#ifdef CONFIG_IP_NF_CT_ACCT
        if (skb) {
                ct->counters[CTINFO2DIR(ctinfo)].packets++;
                ct->counters[CTINFO2DIR(ctinfo)].bytes +=
                                        ntohs(skb->nh.iph->tot_len);
        }
#endif
}

/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
void ip_ct_refresh_acct(struct ip_conntrack *ct,
                        enum ip_conntrack_info ctinfo,
                        const struct sk_buff *skb,
                        unsigned long extra_jiffies)
{
        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

        /* If not in hash table, timer will not be active yet */
        if (!is_confirmed(ct)) {
                ct->timeout.expires = extra_jiffies;
                ct_add_counters(ct, ctinfo, skb);
        } else {
                WRITE_LOCK(&ip_conntrack_lock);
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                }
                ct_add_counters(ct, ctinfo, skb);
                WRITE_UNLOCK(&ip_conntrack_lock);
        }
}

/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
#ifdef CONFIG_NETFILTER_DEBUG
        unsigned int olddebug = skb->nf_debug;
#endif

        skb_orphan(skb);

        local_bh_disable();
        skb = ip_defrag(skb, user);
        local_bh_enable();

        if (skb) {
                ip_send_check(skb->nh.iph);
                skb->nfcache |= NFC_ALTERED;
#ifdef CONFIG_NETFILTER_DEBUG
                /* Packet path as if nothing had happened. */
                skb->nf_debug = olddebug;
#endif
        }

        return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        /* This ICMP is in reverse direction to the packet which caused it */
        ct = ip_conntrack_get(skb, &ctinfo);

        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach to new skbuff, and increment count */
        nskb->nfct = &ct->ct_general;
        nskb->nfctinfo = ctinfo;
        nf_conntrack_get(nskb->nfct);
}

static inline int
do_iter(const struct ip_conntrack_tuple_hash *i,
        int (*iter)(struct ip_conntrack *i, void *data),
        void *data)
{
        return iter(tuplehash_to_ctrack(i), data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
                void *data, unsigned int *bucket)
{
        struct ip_conntrack_tuple_hash *h = NULL;

        WRITE_LOCK(&ip_conntrack_lock);
        for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
                h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
                                struct ip_conntrack_tuple_hash *, iter, data);
                if (h)
                        break;
        }
        if (!h)
                h = LIST_FIND_W(&unconfirmed, do_iter,
                                struct ip_conntrack_tuple_hash *, iter, data);
        if (h)
                atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return h;
}

void
ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int bucket = 0;

        while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
                struct ip_conntrack *ct = tuplehash_to_ctrack(h);
                /* Time to push up daisies... */
                if (del_timer(&ct->timeout))
                        death_by_timeout((unsigned long)ct);
                /* ... else the timer will get him soon. */

                ip_conntrack_put(ct);
        }
}

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;

        IP_CT_TUPLE_U_BLANK(&tuple);
        tuple.src.ip = inet->rcv_saddr;
        tuple.src.u.tcp.port = inet->sport;
        tuple.dst.ip = inet->daddr;
        tuple.dst.u.tcp.port = inet->dport;
        tuple.dst.protonum = IPPROTO_TCP;

        /* We only do TCP at the moment: is there a better way? */
        if (strcmp(sk->sk_prot->name, "TCP")) {
                DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
                return -ENOPROTOOPT;
        }

        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
                       *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }

        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;
                struct ip_conntrack *ct = tuplehash_to_ctrack(h);

                sin.sin_family = AF_INET;
                sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
                sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;

                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
                ip_conntrack_put(ct);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
                        return 0;
        }
        DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
               NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
               NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
        return -ENOENT;
}

static struct nf_sockopt_ops so_getorigdst = {
        .pf             = PF_INET,
        .get_optmin     = SO_ORIGINAL_DST,
        .get_optmax     = SO_ORIGINAL_DST+1,
        .get            = &getorigdst,
};

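/* Iterator that matches every conntrack: used with
 * ip_ct_iterate_cleanup() to flush the whole table. */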
static int kill_all(struct ip_conntrack *i, void *data)
{
        return 1;
}

static void free_conntrack_hash(void)
{
        if (ip_conntrack_vmalloc)
                vfree(ip_conntrack_hash);
        else
                free_pages((unsigned long)ip_conntrack_hash,
                           get_order(sizeof(struct list_head)
                                     * ip_conntrack_htable_size));
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
        ip_ct_attach = NULL;
        /* This makes sure all current packets have passed through
           the netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

 i_see_dead_people:
        ip_ct_iterate_cleanup(kill_all, NULL);
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }

        kmem_cache_destroy(ip_conntrack_cachep);
        kmem_cache_destroy(ip_conntrack_expect_cachep);
        free_conntrack_hash();
        nf_unregister_sockopt(&so_getorigdst);
}

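/* Hash table size; may be set at module load time and is read-only
 * afterwards (hence permission 0400). */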
static int hashsize;
module_param(hashsize, int, 0400);

int __init ip_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: a 32MB
         * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
        if (hashsize) {
                ip_conntrack_htable_size = hashsize;
        } else {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        ip_conntrack_htable_size = 8192;
                if (ip_conntrack_htable_size < 16)
                        ip_conntrack_htable_size = 16;
        }
        ip_conntrack_max = 8 * ip_conntrack_htable_size;

        printk("ip_conntrack version %s (%u buckets, %d max)"
               " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
               ip_conntrack_htable_size, ip_conntrack_max,
               sizeof(struct ip_conntrack));

        ret = nf_register_sockopt(&so_getorigdst);
        if (ret != 0) {
                printk(KERN_ERR "Unable to register netfilter socket option\n");
                return ret;
        }

        /* AK: the hash table is twice as big as needed because it
           uses list_head.  It would be much nicer for caches to use a
           single-pointer list head here. */
        ip_conntrack_vmalloc = 0;
        ip_conntrack_hash
                = (void *)__get_free_pages(GFP_KERNEL,
                                           get_order(sizeof(struct list_head)
                                                     * ip_conntrack_htable_size));
        if (!ip_conntrack_hash) {
                ip_conntrack_vmalloc = 1;
                printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
                ip_conntrack_hash = vmalloc(sizeof(struct list_head)
                                            * ip_conntrack_htable_size);
        }
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
        }

        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
                                                0, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }

        ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
                                        sizeof(struct ip_conntrack_expect),
                                        0, 0, NULL, NULL);
        if (!ip_conntrack_expect_cachep) {
                printk(KERN_ERR "Unable to create ip_expect slab cache\n");
                goto err_free_conntrack_slab;
        }

        /* Don't NEED the lock here, but good form anyway. */
        WRITE_LOCK(&ip_conntrack_lock);
        for (i = 0; i < MAX_IP_CT_PROTO; i++)
                ip_ct_protos[i] = &ip_conntrack_generic_protocol;
        /* Sew in builtin protocols. */
        ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
        ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
        ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
        WRITE_UNLOCK(&ip_conntrack_lock);

        for (i = 0; i < ip_conntrack_htable_size; i++)
                INIT_LIST_HEAD(&ip_conntrack_hash[i]);

        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;

        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and to look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);

        return ret;

err_free_conntrack_slab:
        kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
        free_conntrack_hash();
err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);

        return -ENOMEM;
}