vserver 1.9.3
[linux-2.6.git] / net / ipv4 / netfilter / ip_nat_core.c
1 /* NAT for netfilter; shared with compatibility layer. */
2
3 /* (C) 1999-2001 Paul `Rusty' Russell
4  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
10
11 #include <linux/module.h>
12 #include <linux/types.h>
13 #include <linux/timer.h>
14 #include <linux/skbuff.h>
15 #include <linux/netfilter_ipv4.h>
16 #include <linux/vmalloc.h>
17 #include <net/checksum.h>
18 #include <net/icmp.h>
19 #include <net/ip.h>
20 #include <net/tcp.h>  /* For tcp_prot in getorigdst */
21 #include <linux/icmp.h>
22 #include <linux/udp.h>
23
24 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
25 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
26
27 #include <linux/netfilter_ipv4/ip_conntrack.h>
28 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
29 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
30 #include <linux/netfilter_ipv4/ip_nat.h>
31 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
32 #include <linux/netfilter_ipv4/ip_nat_core.h>
33 #include <linux/netfilter_ipv4/ip_nat_helper.h>
34 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
35 #include <linux/netfilter_ipv4/listhelp.h>
36
37 #if 0
38 #define DEBUGP printk
39 #else
40 #define DEBUGP(format, args...)
41 #endif
42
43 DECLARE_RWLOCK(ip_nat_lock);
44 DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);
45
46 /* Calculated at init based on memory size */
47 static unsigned int ip_nat_htable_size;
48
49 static struct list_head *bysource;
50 static struct list_head *byipsproto;
51 struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
52
53
54 /* We keep extra hashes for each conntrack, for fast searching. */
55 static inline size_t
56 hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
57 {
58         /* Modified src and dst, to ensure we don't create two
59            identical streams. */
60         return (src + dst + proto) % ip_nat_htable_size;
61 }
62
63 static inline size_t
64 hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
65 {
66         /* Original src, to ensure we map it consistently if poss. */
67         return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
68 }
69
70 /* Noone using conntrack by the time this called. */
71 static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
72 {
73         struct ip_nat_info *info = &conn->nat.info;
74         unsigned int hs, hp;
75
76         if (!info->initialized)
77                 return;
78
79         hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src,
80                          conn->tuplehash[IP_CT_DIR_ORIGINAL]
81                          .tuple.dst.protonum);
82
83         hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
84                               conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
85                               conn->tuplehash[IP_CT_DIR_REPLY]
86                               .tuple.dst.protonum);
87
88         WRITE_LOCK(&ip_nat_lock);
89         list_del(&info->bysource);
90         list_del(&info->byipsproto);
91         WRITE_UNLOCK(&ip_nat_lock);
92 }
93
94 /* We do checksum mangling, so if they were wrong before they're still
95  * wrong.  Also works for incomplete packets (eg. ICMP dest
96  * unreachables.) */
97 u_int16_t
98 ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
99 {
100         u_int32_t diffs[] = { oldvalinv, newval };
101         return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
102                                       oldcheck^0xFFFF));
103 }
104
105 /* Is this tuple already taken? (not by us) */
106 int
107 ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
108                   const struct ip_conntrack *ignored_conntrack)
109 {
110         /* Conntrack tracking doesn't keep track of outgoing tuples; only
111            incoming ones.  NAT means they don't have a fixed mapping,
112            so we invert the tuple and look for the incoming reply.
113
114            We could keep a separate hash if this proves too slow. */
115         struct ip_conntrack_tuple reply;
116
117         invert_tuplepr(&reply, tuple);
118         return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
119 }
120
/* Does tuple + the source manip come within the range mr */
static int
in_range(const struct ip_conntrack_tuple *tuple,
	 const struct ip_conntrack_manip *manip,
	 const struct ip_nat_multi_range *mr)
{
	struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
	unsigned int i;
	/* The tuple as it would look with the source manip applied. */
	struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };

	/* Accept if ANY range in mr covers the manipulated source. */
	for (i = 0; i < mr->rangesize; i++) {
		/* If we are allowed to map IPs, then we must be in the
		   range specified, otherwise we must be unchanged. */
		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
			/* Compare in host byte order so min/max form a
			   numerically meaningful interval. */
			if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
			    || (ntohl(newtuple.src.ip)
				> ntohl(mr->range[i].max_ip)))
				continue;
		} else {
			if (newtuple.src.ip != tuple->src.ip)
				continue;
		}

		/* IP side matched; the proto part (e.g. port) must also
		   match, unless this range doesn't constrain it. */
		if (!(mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
		    || proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
				       &mr->range[i].min, &mr->range[i].max))
			return 1;
	}
	return 0;
}
151
152 static inline int
153 src_cmp(const struct ip_conntrack *ct,
154         const struct ip_conntrack_tuple *tuple,
155         const struct ip_nat_multi_range *mr)
156 {
157         return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
158                 == tuple->dst.protonum
159                 && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
160                 == tuple->src.ip
161                 && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
162                 == tuple->src.u.all
163                 && in_range(tuple,
164                             &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src, mr));
165 }
166
/* Only called for SRC manip */
static struct ip_conntrack_manip *
find_appropriate_src(const struct ip_conntrack_tuple *tuple,
		     const struct ip_nat_multi_range *mr)
{
	unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
	struct ip_conntrack *ct;

	/* Walk the bysource bucket looking for an existing conntrack
	   with the same original src/proto whose mapping also fits mr;
	   reusing its source manip keeps mappings consistent across
	   connections from the same endpoint.  NULL if none found. */
	MUST_BE_READ_LOCKED(&ip_nat_lock);
	list_for_each_entry(ct, &bysource[h], nat.info.bysource)
		if (src_cmp(ct, tuple, mr))
			return &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
	return NULL;
}
181
182 #ifdef CONFIG_IP_NF_NAT_LOCAL
/* If it's really a local destination manip, it may need to do a
   source manip too. */
static int
do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
{
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };
	struct rtable *rt;

	/* FIXME: IPTOS_TOS(iph->tos) --RR */
	/* Route towards the new destination to learn which local
	   source address the stack would choose; 0 if unroutable. */
	if (ip_route_output_key(&rt, &fl) != 0) {
		DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
		       NIPQUAD(var_ip));
		return 0;
	}

	/* Adopt the route's preferred source as the new source IP. */
	*other_ipp = rt->rt_src;
	ip_rt_put(rt);
	return 1;
}
202 #endif
203
204 /* Simple way to iterate through all. */
205 static inline int fake_cmp(const struct ip_conntrack *ct,
206                            u_int32_t src, u_int32_t dst, u_int16_t protonum,
207                            unsigned int *score, const struct ip_conntrack *ct2)
208 {
209         /* Compare backwards: we're dealing with OUTGOING tuples, and
210            inside the conntrack is the REPLY tuple.  Don't count this
211            conntrack. */
212         if (ct != ct2
213             && ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
214             && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
215             && (ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum == protonum))
216                 (*score)++;
217         return 0;
218 }
219
/* How many existing connections (other than conntrack itself) already
   map onto the outgoing src/dst/proto triple?  Used to pick the
   least-used mapping. */
static inline unsigned int
count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
	   const struct ip_conntrack *conntrack)
{
	struct ip_conntrack *ct;
	unsigned int score = 0;
	unsigned int h;

	/* fake_cmp() increments score for each matching entry in the
	   byipsproto bucket; its return value is always 0. */
	MUST_BE_READ_LOCKED(&ip_nat_lock);
	h = hash_by_ipsproto(src, dst, protonum);
	list_for_each_entry(ct, &byipsproto[h], nat.info.byipsproto)
		fake_cmp(ct, src, dst, protonum, &score, conntrack);

	return score;
}
235
236 /* For [FUTURE] fragmentation handling, we want the least-used
237    src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
238    if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
239    1-65535, we don't do pro-rata allocation based on ports; we choose
240    the ip with the lowest src-ip/dst-ip/proto usage.
241
242    If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
243    range), we eliminate that and try again.  This is not the most
244    efficient approach, but if you're worried about that, don't hand us
245    ranges you don't really have.  */
/* Pick, from mr, the candidate IP with the fewest existing mappings
   for this triple, writing the winning tuple into *tuple.  Returns
   the range it came from, or NULL if every range was eliminated. */
static struct ip_nat_range *
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
		    const struct ip_nat_multi_range *mr,
		    const struct ip_conntrack *conntrack,
		    unsigned int hooknum)
{
	unsigned int i;
	struct {
		const struct ip_nat_range *range;
		unsigned int score;
		struct ip_conntrack_tuple tuple;
	} best = { NULL,  0xFFFFFFFF };	/* .tuple is zero-initialized */
	u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
	static unsigned int randomness;

	/* Which half of the tuple we vary depends on whether this hook
	   does SRC or DST manip; the other half may be rewritten by
	   do_extra_mangle(), so remember it in saved_ip. */
	if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
		var_ipp = &tuple->src.ip;
		saved_ip = tuple->dst.ip;
		other_ipp = &tuple->dst.ip;
	} else {
		var_ipp = &tuple->dst.ip;
		saved_ip = tuple->src.ip;
		other_ipp = &tuple->src.ip;
	}
	/* Don't do do_extra_mangle unless necessary (overrides
	   explicit socket bindings, for example) */
	orig_dstip = tuple->dst.ip;

	IP_NF_ASSERT(mr->rangesize >= 1);
	for (i = 0; i < mr->rangesize; i++) {
		/* Host order */
		u_int32_t minip, maxip, j;

		/* Don't do ranges which are already eliminated. */
		if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
			continue;
		}

		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
			minip = ntohl(mr->range[i].min_ip);
			maxip = ntohl(mr->range[i].max_ip);
		} else
			minip = maxip = ntohl(*var_ipp);

		/* randomness varies the starting offset per call so we
		   don't always probe the range in the same order. */
		randomness++;
		for (j = 0; j < maxip - minip + 1; j++) {
			unsigned int score;

			*var_ipp = htonl(minip + (randomness + j) 
					 % (maxip - minip + 1));

			/* Reset the other ip in case it was mangled by
			 * do_extra_mangle last time. */
			*other_ipp = saved_ip;

#ifdef CONFIG_IP_NF_NAT_LOCAL
			if (hooknum == NF_IP_LOCAL_OUT
			    && *var_ipp != orig_dstip
			    && !do_extra_mangle(*var_ipp, other_ipp)) {
				DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
				       i, NIPQUAD(*var_ipp));
				/* Can't route?  This whole range part is
				 * probably screwed, but keep trying
				 * anyway. */
				continue;
			}
#endif

			/* Count how many others map onto this. */
			score = count_maps(tuple->src.ip, tuple->dst.ip,
					   tuple->dst.protonum, conntrack);
			if (score < best.score) {
				/* Optimization: doesn't get any better than
				   this. */
				if (score == 0)
					return (struct ip_nat_range *)
						&mr->range[i];

				best.score = score;
				best.tuple = *tuple;
				best.range = &mr->range[i];
			}
		}
	}
	/* If nothing was found, this copies the zeroed best.tuple and
	   we return NULL below; callers must check for NULL. */
	*tuple = best.tuple;

	/* Discard const. */
	return (struct ip_nat_range *)best.range;
}
335
/* Fast version doesn't iterate through hash chains, but only handles
   common case of single IP address (null NAT, masquerade) */
static struct ip_nat_range *
find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
			 const struct ip_nat_multi_range *mr,
			 const struct ip_conntrack *conntrack,
			 unsigned int hooknum)
{
	/* Fall back to the full search unless there is exactly one
	   usable range mapping to a single IP (or not mapping IPs). */
	if (mr->rangesize != 1
	    || (mr->range[0].flags & IP_NAT_RANGE_FULL)
	    || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
		&& mr->range[0].min_ip != mr->range[0].max_ip))
		return find_best_ips_proto(tuple, mr, conntrack, hooknum);

	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
		if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
			tuple->src.ip = mr->range[0].min_ip;
		else {
			/* Only do extra mangle when required (breaks
			   socket binding) */
#ifdef CONFIG_IP_NF_NAT_LOCAL
			/* Locally-generated packet being DNATed away
			   from its destination may need a new source
			   IP for the new route; NULL if unroutable. */
			if (tuple->dst.ip != mr->range[0].min_ip
			    && hooknum == NF_IP_LOCAL_OUT
			    && !do_extra_mangle(mr->range[0].min_ip,
						&tuple->src.ip))
				return NULL;
#endif
			tuple->dst.ip = mr->range[0].min_ip;
		}
	}

	/* Discard const. */
	return (struct ip_nat_range *)&mr->range[0];
}
370
/* Fill in *tuple (derived from orig_tuple) so that it lies within mrr
 * and is not used by any other connection, for the manip type implied
 * by hooknum.  Returns 1 on success, 0 if no unique tuple exists.
 * Temporarily sets IP_NAT_RANGE_FULL flags inside the caller's mrr
 * (via a const cast) but always clears them before returning. */
static int
get_unique_tuple(struct ip_conntrack_tuple *tuple,
		 const struct ip_conntrack_tuple *orig_tuple,
		 const struct ip_nat_multi_range *mrr,
		 struct ip_conntrack *conntrack,
		 unsigned int hooknum)
{
	struct ip_nat_protocol *proto
		= ip_nat_find_proto(orig_tuple->dst.protonum);
	struct ip_nat_range *rptr;
	unsigned int i;
	int ret;

	/* We temporarily use flags for marking full parts, but we
	   always clean up afterwards */
	struct ip_nat_multi_range *mr = (void *)mrr;

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	   and that same mapping gives a unique tuple within the given
	   range, use that.

	   This is only required for source (ie. NAT/masq) mappings.
	   So far, we don't do local source mappings, so multiple
	   manips not an issue.  */
	if (hooknum == NF_IP_POST_ROUTING) {
		struct ip_conntrack_manip *manip;

		manip = find_appropriate_src(orig_tuple, mr);
		if (manip) {
			/* Apply same source manipulation. */
			*tuple = ((struct ip_conntrack_tuple)
				  { *manip, orig_tuple->dst });
			DEBUGP("get_unique_tuple: Found current src map\n");
			if (!ip_nat_used_tuple(tuple, conntrack))
				return 1;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given
	   range.
	*/
	*tuple = *orig_tuple;
	while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
	       != NULL) {
		DEBUGP("Found best for "); DUMP_TUPLE(tuple);
		/* 3) The per-protocol part of the manip is made to
		   map into the range to make a unique tuple. */

		/* Only bother mapping if it's not already in range
		   and unique */
		if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
		     || proto->in_range(tuple, HOOK2MANIP(hooknum),
					&rptr->min, &rptr->max))
		    && !ip_nat_used_tuple(tuple, conntrack)) {
			ret = 1;
			goto clear_fulls;
		} else {
			/* Ask the protocol to alter its part (e.g.
			   pick another port) until unique. */
			if (proto->unique_tuple(tuple, rptr,
						HOOK2MANIP(hooknum),
						conntrack)) {
				/* Must be unique. */
				IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
								conntrack));
				ret = 1;
				goto clear_fulls;
			} else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
				/* Try implicit source NAT; protocol
				   may be able to play with ports to
				   make it unique. */
				struct ip_nat_range r
					= { IP_NAT_RANGE_MAP_IPS, 
					    tuple->src.ip, tuple->src.ip,
					    { 0 }, { 0 } };
				DEBUGP("Trying implicit mapping\n");
				if (proto->unique_tuple(tuple, &r,
							IP_NAT_MANIP_SRC,
							conntrack)) {
					/* Must be unique. */
					IP_NF_ASSERT(!ip_nat_used_tuple
						     (tuple, conntrack));
					ret = 1;
					goto clear_fulls;
				}
			}
			DEBUGP("Protocol can't get unique tuple %u.\n",
			       hooknum);
		}

		/* Eliminate that from range, and try again. */
		rptr->flags |= IP_NAT_RANGE_FULL;
		*tuple = *orig_tuple;
	}

	ret = 0;

 clear_fulls:
	/* Clear full flags. */
	IP_NF_ASSERT(mr->rangesize >= 1);
	for (i = 0; i < mr->rangesize; i++)
		mr->range[i].flags &= ~IP_NAT_RANGE_FULL;

	return ret;
}
474
/* Where to manip the reply packets (will be reverse manip). */
/* Indexed by the hook the original-direction manip was set up on;
   yields the hook at which replies get the reverse manip. */
static unsigned int opposite_hook[NF_IP_NUMHOOKS]
= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
    [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
#ifdef CONFIG_IP_NF_NAT_LOCAL
    [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
    [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
#endif
};
484
/* Set up NAT for one manip type (SRC or DST, derived from hooknum) on
 * this connection: choose a unique mapped tuple within mr, teach
 * conntrack to recognize the corresponding replies, and record the
 * packet manips for both directions.  Caller holds ip_nat_lock for
 * writing.  Returns NF_ACCEPT, or NF_DROP if no unique tuple exists. */
unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
		  const struct ip_nat_multi_range *mr,
		  unsigned int hooknum)
{
	struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
	struct ip_conntrack_tuple orig_tp;
	struct ip_nat_info *info = &conntrack->nat.info;
	/* Non-zero if a previous call already hashed this conntrack. */
	int in_hashes = info->initialized;

	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
		     || hooknum == NF_IP_POST_ROUTING
		     || hooknum == NF_IP_LOCAL_IN
		     || hooknum == NF_IP_LOCAL_OUT);
	IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
	/* Each manip type may be initialized at most once. */
	IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum))));

	/* What we've got will look like inverse of reply. Normally
	   this is what is in the conntrack, except for prior
	   manipulations (future optimization: if num_manips == 0,
	   orig_tp =
	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
	invert_tuplepr(&orig_tp,
		       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

#if 0
	{
	unsigned int i;

	DEBUGP("Hook %u (%s), ", hooknum,
	       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
	DUMP_TUPLE(&orig_tp);
	DEBUGP("Range %p: ", mr);
	for (i = 0; i < mr->rangesize; i++) {
		DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
		       i,
		       (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
		       ? " MAP_IPS" : "",
		       (mr->range[i].flags
			& IP_NAT_RANGE_PROTO_SPECIFIED)
		       ? " PROTO_SPECIFIED" : "",
		       (mr->range[i].flags & IP_NAT_RANGE_FULL)
		       ? " FULL" : "",
		       NIPQUAD(mr->range[i].min_ip),
		       NIPQUAD(mr->range[i].max_ip),
		       mr->range[i].min.all,
		       mr->range[i].max.all);
	}
	}
#endif

	do {
		if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
				      hooknum)) {
			DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
			       conntrack);
			return NF_DROP;
		}

#if 0
		DEBUGP("Hook %u (%s) %p\n", hooknum,
		       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
		       conntrack);
		DEBUGP("Original: ");
		DUMP_TUPLE(&orig_tp);
		DEBUGP("New: ");
		DUMP_TUPLE(&new_tuple);
#endif

		/* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
		   the original (A/B/C/D') and the mangled one (E/F/G/H').

		   We're only allowed to work with the SRC per-proto
		   part, so we create inverses of both to start, then
		   derive the other fields we need.  */

		/* Reply connection: simply invert the new tuple
		   (G/H/E/F') */
		invert_tuplepr(&reply, &new_tuple);

		/* Alter conntrack table so it recognizes replies.
		   If fail this race (reply tuple now used), repeat. */
	} while (!ip_conntrack_alter_reply(conntrack, &reply));

	/* FIXME: We can simply used existing conntrack reply tuple
	   here --RR */
	/* Create inverse of original: C/D/A/B' */
	invert_tuplepr(&inv_tuple, &orig_tp);

	/* Has source changed?. */
	if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
		/* In this direction, a source manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_ORIGINAL, hooknum,
			   IP_NAT_MANIP_SRC, new_tuple.src });

		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

		/* In the reverse direction, a destination manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
			   IP_NAT_MANIP_DST, orig_tp.src });
		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
	}

	/* Has destination changed? */
	if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
		/* In this direction, a destination manip */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_ORIGINAL, hooknum,
			   IP_NAT_MANIP_DST, reply.src });

		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

		/* In the reverse direction, a source manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
			   IP_NAT_MANIP_SRC, inv_tuple.src });
		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
	}

	/* If there's a helper, assign it; based on new tuple. */
	if (!conntrack->master)
		info->helper = __ip_nat_find_helper(&reply);

	/* It's done. */
	info->initialized |= (1 << HOOK2MANIP(hooknum));

	/* First manip inserts into the NAT hashes; a second manip on
	   the same conntrack re-keys the existing entries instead. */
	if (in_hashes)
		replace_in_hashes(conntrack, info);
	else
		place_in_hashes(conntrack, info);

	return NF_ACCEPT;
}
625
626 void replace_in_hashes(struct ip_conntrack *conntrack,
627                        struct ip_nat_info *info)
628 {
629         /* Source has changed, so replace in hashes. */
630         unsigned int srchash
631                 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
632                               .tuple.src,
633                               conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
634                               .tuple.dst.protonum);
635         /* We place packet as seen OUTGOUNG in byips_proto hash
636            (ie. reverse dst and src of reply packet. */
637         unsigned int ipsprotohash
638                 = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
639                                    .tuple.dst.ip,
640                                    conntrack->tuplehash[IP_CT_DIR_REPLY]
641                                    .tuple.src.ip,
642                                    conntrack->tuplehash[IP_CT_DIR_REPLY]
643                                    .tuple.dst.protonum);
644
645         MUST_BE_WRITE_LOCKED(&ip_nat_lock);
646         list_move(&info->bysource, &bysource[srchash]);
647         list_move(&info->byipsproto, &byipsproto[ipsprotohash]);
648 }
649
650 void place_in_hashes(struct ip_conntrack *conntrack,
651                      struct ip_nat_info *info)
652 {
653         unsigned int srchash
654                 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
655                               .tuple.src,
656                               conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
657                               .tuple.dst.protonum);
658         /* We place packet as seen OUTGOUNG in byips_proto hash
659            (ie. reverse dst and src of reply packet. */
660         unsigned int ipsprotohash
661                 = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
662                                    .tuple.dst.ip,
663                                    conntrack->tuplehash[IP_CT_DIR_REPLY]
664                                    .tuple.src.ip,
665                                    conntrack->tuplehash[IP_CT_DIR_REPLY]
666                                    .tuple.dst.protonum);
667
668         MUST_BE_WRITE_LOCKED(&ip_nat_lock);
669         list_add(&info->bysource, &bysource[srchash]);
670         list_add(&info->byipsproto, &byipsproto[ipsprotohash]);
671 }
672
673 /* Returns true if succeeded. */
674 static int
675 manip_pkt(u_int16_t proto,
676           struct sk_buff **pskb,
677           unsigned int iphdroff,
678           const struct ip_conntrack_manip *manip,
679           enum ip_nat_manip_type maniptype)
680 {
681         struct iphdr *iph;
682
683         (*pskb)->nfcache |= NFC_ALTERED;
684         if (!skb_ip_make_writable(pskb, iphdroff+sizeof(iph)))
685                 return 0;
686
687         iph = (void *)(*pskb)->data + iphdroff;
688
689         /* Manipulate protcol part. */
690         if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff + iph->ihl*4,
691                                                  manip, maniptype))
692                 return 0;
693
694         iph = (void *)(*pskb)->data + iphdroff;
695
696         if (maniptype == IP_NAT_MANIP_SRC) {
697                 iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
698                                                 iph->check);
699                 iph->saddr = manip->ip;
700         } else {
701                 iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
702                                                 iph->check);
703                 iph->daddr = manip->ip;
704         }
705         return 1;
706 }
707
708 static inline int exp_for_packet(struct ip_conntrack_expect *exp,
709                                  struct sk_buff *skb)
710 {
711         struct ip_conntrack_protocol *proto;
712         int ret = 1;
713
714         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
715         proto = ip_ct_find_proto(skb->nh.iph->protocol);
716         if (proto->exp_matches_pkt)
717                 ret = proto->exp_matches_pkt(exp, skb);
718
719         return ret;
720 }
721
/* Do packet manipulations according to binding: apply every recorded
 * manip for this direction and hook, then let the connection's NAT
 * helper (if any) mangle the payload.  Returns an NF_* verdict:
 * NF_DROP if a manip or TCP sequence adjustment fails, otherwise
 * NF_ACCEPT or whatever verdict the helper returned. */
unsigned int
do_bindings(struct ip_conntrack *ct,
	    enum ip_conntrack_info ctinfo,
	    struct ip_nat_info *info,
	    unsigned int hooknum,
	    struct sk_buff **pskb)
{
	unsigned int i;
	struct ip_nat_helper *helper;
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	int proto = (*pskb)->nh.iph->protocol;

	/* Need nat lock to protect against modification, but neither
	   conntrack (referenced) nor helper (deleted with
	   synchronize_bh()) can vanish. */
	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		/* Only apply manips recorded for this direction/hook. */
		if (info->manips[i].direction == dir
		    && info->manips[i].hooknum == hooknum) {
			DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
			       *pskb,
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip),
			       htons(info->manips[i].manip.u.all));
			if (!manip_pkt(proto, pskb, 0,
				       &info->manips[i].manip,
				       info->manips[i].maniptype)) {
				READ_UNLOCK(&ip_nat_lock);
				return NF_DROP;
			}
		}
	}
	/* Snapshot the helper pointer under the lock; it stays valid
	   after unlock (see comment above). */
	helper = info->helper;
	READ_UNLOCK(&ip_nat_lock);

	if (helper) {
		struct ip_conntrack_expect *exp = NULL;
		struct list_head *cur_item;
		int ret = NF_ACCEPT;
		int helper_called = 0;

		DEBUGP("do_bindings: helper existing for (%p)\n", ct);

		/* Always defragged for helpers */
		IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
			       & htons(IP_MF|IP_OFFSET)));

		/* Have to grab read lock before sibling_list traversal */
		READ_LOCK(&ip_conntrack_lock);
		list_for_each_prev(cur_item, &ct->sibling_list) { 
			exp = list_entry(cur_item, struct ip_conntrack_expect, 
					 expected_list);
					 
			/* if this expectation is already established, skip */
			if (exp->sibling)
				continue;

			if (exp_for_packet(exp, *pskb)) {
				/* FIXME: May be true multiple times in the
				 * case of UDP!! */
				DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
				ret = helper->help(ct, exp, info, ctinfo, 
						   hooknum, pskb);
				if (ret != NF_ACCEPT) {
					READ_UNLOCK(&ip_conntrack_lock);
					return ret;
				}
				helper_called = 1;
			}
		}
		/* Helper might want to manip the packet even when there is no
		 * matching expectation for this packet */
		if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
			DEBUGP("calling nat helper for packet without expectation\n");
			ret = helper->help(ct, NULL, info, ctinfo, 
					   hooknum, pskb);
			if (ret != NF_ACCEPT) {
				READ_UNLOCK(&ip_conntrack_lock);
				return ret;
			}
		}
		READ_UNLOCK(&ip_conntrack_lock);
		
		/* Adjust sequence number only once per packet 
		 * (helper is called at all hooks) */
		if (proto == IPPROTO_TCP
		    && (hooknum == NF_IP_POST_ROUTING
			|| hooknum == NF_IP_LOCAL_IN)) {
			DEBUGP("ip_nat_core: adjusting sequence number\n");
			/* future: put this in a l4-proto specific function,
			 * and call this function here. */
			if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
				ret = NF_DROP;
		}

		return ret;

	} else 
		return NF_ACCEPT;

	/* not reached */
}
826
/* Translate an ICMP error (e.g. port unreachable) that relates to a
 * NAT'd connection.  The embedded ("inside") packet gets each manip
 * applied with src/dst reversed (it was never direction-flipped), and
 * the outer IP header is NATed as if the error were a reply.  Returns
 * 1 on success, 0 if the packet should be dropped. */
int
icmp_reply_translation(struct sk_buff **pskb,
		       struct ip_conntrack *conntrack,
		       unsigned int hooknum,
		       int dir)
{
	/* Layout of the start of the ICMP payload area: the ICMP
	   header followed by the embedded IP header of the original
	   packet that triggered the error. */
	struct {
		struct icmphdr icmp;
		struct iphdr ip;
	} *inside;
	unsigned int i;
	struct ip_nat_info *info = &conntrack->nat.info;
	int hdrlen;

	/* Make sure at least the ICMP header + embedded IP header are
	   writable (may reallocate skb data). */
	if (!skb_ip_make_writable(pskb,(*pskb)->nh.iph->ihl*4+sizeof(*inside)))
		return 0;
	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

	/* We're actually going to mangle it beyond trivial checksum
	   adjustment, so make sure the current checksum is correct. */
	if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
		hdrlen = (*pskb)->nh.iph->ihl * 4;
		if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
						(*pskb)->len - hdrlen, 0)))
			return 0;
	}

	/* Must be RELATED */
	IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
		     (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);

	/* Redirects on non-null nats must be dropped, else they'll
	   start talking to each other without our translation, and be
	   confused... --RR */
	if (inside->icmp.type == ICMP_REDIRECT) {
		/* Don't care about races here. */
		if (info->initialized
		    != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
		    || info->num_manips != 0)
			return 0;
	}

	DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
	       *pskb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
	/* Note: May not be from a NAT'd host, but probably safest to
	   do translation always as if it came from the host itself
	   (even though a "host unreachable" coming from the host
	   itself is a bit weird).

	   More explanation: some people use NAT for anonymizing.
	   Also, CERT recommends dropping all packets from private IP
	   addresses (although ICMP errors from internal links with
	   such addresses are not too uncommon, as Alan Cox points
	   out) */

	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
		       i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
		       "ORIG" : "REPLY", info->manips[i].hooknum);

		if (info->manips[i].direction != dir)
			continue;

		/* Mapping the inner packet is just like a normal
		   packet, except it was never src/dst reversed, so
		   where we would normally apply a dst manip, we apply
		   a src, and vice versa. */
		if (info->manips[i].hooknum == hooknum) {
			DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "DST" : "SRC",
			       NIPQUAD(info->manips[i].manip.ip),
			       ntohs(info->manips[i].manip.u.udp.port));
			/* Note the inverted maniptype (!maniptype):
			   SRC<->DST swap for the embedded packet. */
			if (!manip_pkt(inside->ip.protocol, pskb,
				       (*pskb)->nh.iph->ihl*4
				       + sizeof(inside->icmp),
				       &info->manips[i].manip,
				       !info->manips[i].maniptype))
				goto unlock_fail;

			/* Outer packet needs to have IP header NATed like
			   it's a reply. */

			/* Use mapping to map outer packet: 0 give no
			   per-proto mapping */
			DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip));
			if (!manip_pkt(0, pskb, 0,
				       &info->manips[i].manip,
				       info->manips[i].maniptype))
				goto unlock_fail;
		}
	}
	READ_UNLOCK(&ip_nat_lock);

	hdrlen = (*pskb)->nh.iph->ihl * 4;

	/* Re-derive the pointer: manip_pkt may have made the skb
	   writable again, relocating its data. */
	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

	/* Recompute the ICMP checksum over the mangled payload. */
	inside->icmp.checksum = 0;
	inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
						       (*pskb)->len - hdrlen,
						       0));
	return 1;

 unlock_fail:
	READ_UNLOCK(&ip_nat_lock);
	return 0;
}
939
940 int __init ip_nat_init(void)
941 {
942         size_t i;
943
944         /* Leave them the same for the moment. */
945         ip_nat_htable_size = ip_conntrack_htable_size;
946
947         /* One vmalloc for both hash tables */
948         bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
949         if (!bysource) {
950                 return -ENOMEM;
951         }
952         byipsproto = bysource + ip_nat_htable_size;
953
954         /* Sew in builtin protocols. */
955         WRITE_LOCK(&ip_nat_lock);
956         for (i = 0; i < MAX_IP_NAT_PROTO; i++)
957                 ip_nat_protos[i] = &ip_nat_unknown_protocol;
958         ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
959         ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
960         ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
961         WRITE_UNLOCK(&ip_nat_lock);
962
963         for (i = 0; i < ip_nat_htable_size; i++) {
964                 INIT_LIST_HEAD(&bysource[i]);
965                 INIT_LIST_HEAD(&byipsproto[i]);
966         }
967
968         /* FIXME: Man, this is a hack.  <SIGH> */
969         IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
970         ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
971         
972         /* Initialize fake conntrack so that NAT will skip it */
973         ip_conntrack_untracked.nat.info.initialized |= 
974                 (1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST);
975
976         return 0;
977 }
978
/* Clear NAT section of all conntracks, in case we're loaded again.
 * Callback for ip_ct_selective_cleanup(); data is unused. */
static int clean_nat(const struct ip_conntrack *i, void *data)
{
	/* The iterator callback signature is const, so cast it away:
	   we deliberately wipe the NAT area in place. */
	memset((void *)&i->nat, 0, sizeof(i->nat));
	/* Return 0: never ask the iterator to destroy the conntrack. */
	return 0;
}
985
/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
void ip_nat_cleanup(void)
{
	/* Order matters: wipe per-conntrack NAT state first, then
	   unhook the destroy callback, and only then free the hash
	   tables that the callback would otherwise still touch. */
	ip_ct_selective_cleanup(&clean_nat, NULL);
	ip_conntrack_destroyed = NULL;
	/* bysource and byipsproto came from one vmalloc (see
	   ip_nat_init), so a single vfree releases both. */
	vfree(bysource);
}