/*
 * Source: ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-2.6.6.tar.bz2
 * Path:   net/ipv4/netfilter/ip_nat_core.c (linux-2.6.git)
 */
1 /* NAT for netfilter; shared with compatibility layer. */
2
3 /* (C) 1999-2001 Paul `Rusty' Russell
4  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
10
11 #include <linux/module.h>
12 #include <linux/types.h>
13 #include <linux/timer.h>
14 #include <linux/skbuff.h>
15 #include <linux/netfilter_ipv4.h>
16 #include <linux/vmalloc.h>
17 #include <net/checksum.h>
18 #include <net/icmp.h>
19 #include <net/ip.h>
20 #include <net/tcp.h>  /* For tcp_prot in getorigdst */
21 #include <linux/icmp.h>
22 #include <linux/udp.h>
23
24 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
25 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
26
27 #include <linux/netfilter_ipv4/ip_conntrack.h>
28 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
29 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
30 #include <linux/netfilter_ipv4/ip_nat.h>
31 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
32 #include <linux/netfilter_ipv4/ip_nat_core.h>
33 #include <linux/netfilter_ipv4/ip_nat_helper.h>
34 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
35 #include <linux/netfilter_ipv4/listhelp.h>
36
37 #if 0
38 #define DEBUGP printk
39 #else
40 #define DEBUGP(format, args...)
41 #endif
42
43 DECLARE_RWLOCK(ip_nat_lock);
44 DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);
45
46 /* Calculated at init based on memory size */
47 static unsigned int ip_nat_htable_size;
48
49 static struct list_head *bysource;
50 static struct list_head *byipsproto;
51 LIST_HEAD(protos);
52 LIST_HEAD(helpers);
53
54 extern struct ip_nat_protocol unknown_nat_protocol;
55
56 /* We keep extra hashes for each conntrack, for fast searching. */
57 static inline size_t
58 hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
59 {
60         /* Modified src and dst, to ensure we don't create two
61            identical streams. */
62         return (src + dst + proto) % ip_nat_htable_size;
63 }
64
65 static inline size_t
66 hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
67 {
68         /* Original src, to ensure we map it consistently if poss. */
69         return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
70 }
71
/* Remove a conntrack's NAT entries from the bysource and byipsproto
   hashes.  No one else is using the conntrack by the time this is
   called, so its tuples cannot change under us. */
static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
{
	struct ip_nat_info *info = &conn->nat.info;
	unsigned int hs, hp;	/* bysource / byipsproto bucket indices */

	/* NAT never initialized for this connection: nothing hashed. */
	if (!info->initialized)
		return;

	IP_NF_ASSERT(info->bysource.conntrack);
	IP_NF_ASSERT(info->byipsproto.conntrack);

	/* bysource is keyed on the ORIGINAL source manip + protocol. */
	hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src,
			 conn->tuplehash[IP_CT_DIR_ORIGINAL]
			 .tuple.dst.protonum);

	/* Argument order differs from place_in_hashes() (src/dst
	   swapped), but hash_by_ipsproto() sums src+dst+proto, so the
	   same bucket is reached either way. */
	hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
			      conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
			      conn->tuplehash[IP_CT_DIR_REPLY]
			      .tuple.dst.protonum);

	WRITE_LOCK(&ip_nat_lock);
	LIST_DELETE(&bysource[hs], &info->bysource);
	LIST_DELETE(&byipsproto[hp], &info->byipsproto);
	WRITE_UNLOCK(&ip_nat_lock);
}
98
99 /* We do checksum mangling, so if they were wrong before they're still
100  * wrong.  Also works for incomplete packets (eg. ICMP dest
101  * unreachables.) */
102 u_int16_t
103 ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
104 {
105         u_int32_t diffs[] = { oldvalinv, newval };
106         return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
107                                       oldcheck^0xFFFF));
108 }
109
110 static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
111 {
112         return i->protonum == proto;
113 }
114
115 struct ip_nat_protocol *
116 find_nat_proto(u_int16_t protonum)
117 {
118         struct ip_nat_protocol *i;
119
120         MUST_BE_READ_LOCKED(&ip_nat_lock);
121         i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
122         if (!i)
123                 i = &unknown_nat_protocol;
124         return i;
125 }
126
127 /* Is this tuple already taken? (not by us) */
128 int
129 ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
130                   const struct ip_conntrack *ignored_conntrack)
131 {
132         /* Conntrack tracking doesn't keep track of outgoing tuples; only
133            incoming ones.  NAT means they don't have a fixed mapping,
134            so we invert the tuple and look for the incoming reply.
135
136            We could keep a separate hash if this proves too slow. */
137         struct ip_conntrack_tuple reply;
138
139         invert_tuplepr(&reply, tuple);
140         return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
141 }
142
143 /* Does tuple + the source manip come within the range mr */
144 static int
145 in_range(const struct ip_conntrack_tuple *tuple,
146          const struct ip_conntrack_manip *manip,
147          const struct ip_nat_multi_range *mr)
148 {
149         struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
150         unsigned int i;
151         struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
152
153         for (i = 0; i < mr->rangesize; i++) {
154                 /* If we are allowed to map IPs, then we must be in the
155                    range specified, otherwise we must be unchanged. */
156                 if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
157                         if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
158                             || (ntohl(newtuple.src.ip)
159                                 > ntohl(mr->range[i].max_ip)))
160                                 continue;
161                 } else {
162                         if (newtuple.src.ip != tuple->src.ip)
163                                 continue;
164                 }
165
166                 if (!(mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
167                     || proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
168                                        &mr->range[i].min, &mr->range[i].max))
169                         return 1;
170         }
171         return 0;
172 }
173
174 static inline int
175 src_cmp(const struct ip_nat_hash *i,
176         const struct ip_conntrack_tuple *tuple,
177         const struct ip_nat_multi_range *mr)
178 {
179         return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
180                 == tuple->dst.protonum
181                 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
182                 == tuple->src.ip
183                 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
184                 == tuple->src.u.all
185                 && in_range(tuple,
186                             &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
187                             .tuple.src,
188                             mr));
189 }
190
191 /* Only called for SRC manip */
192 static struct ip_conntrack_manip *
193 find_appropriate_src(const struct ip_conntrack_tuple *tuple,
194                      const struct ip_nat_multi_range *mr)
195 {
196         unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
197         struct ip_nat_hash *i;
198
199         MUST_BE_READ_LOCKED(&ip_nat_lock);
200         i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
201         if (i)
202                 return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
203         else
204                 return NULL;
205 }
206
207 #ifdef CONFIG_IP_NF_NAT_LOCAL
208 /* If it's really a local destination manip, it may need to do a
209    source manip too. */
210 static int
211 do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
212 {
213         struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };
214         struct rtable *rt;
215
216         /* FIXME: IPTOS_TOS(iph->tos) --RR */
217         if (ip_route_output_key(&rt, &fl) != 0) {
218                 DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
219                        NIPQUAD(var_ip));
220                 return 0;
221         }
222
223         *other_ipp = rt->rt_src;
224         ip_rt_put(rt);
225         return 1;
226 }
227 #endif
228
229 /* Simple way to iterate through all. */
230 static inline int fake_cmp(const struct ip_nat_hash *i,
231                            u_int32_t src, u_int32_t dst, u_int16_t protonum,
232                            unsigned int *score,
233                            const struct ip_conntrack *conntrack)
234 {
235         /* Compare backwards: we're dealing with OUTGOING tuples, and
236            inside the conntrack is the REPLY tuple.  Don't count this
237            conntrack. */
238         if (i->conntrack != conntrack
239             && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
240             && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
241             && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
242                 == protonum))
243                 (*score)++;
244         return 0;
245 }
246
247 static inline unsigned int
248 count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
249            const struct ip_conntrack *conntrack)
250 {
251         unsigned int score = 0;
252         unsigned int h;
253
254         MUST_BE_READ_LOCKED(&ip_nat_lock);
255         h = hash_by_ipsproto(src, dst, protonum);
256         LIST_FIND(&byipsproto[h], fake_cmp, struct ip_nat_hash *,
257                   src, dst, protonum, &score, conntrack);
258
259         return score;
260 }
261
262 /* For [FUTURE] fragmentation handling, we want the least-used
263    src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
264    if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
265    1-65535, we don't do pro-rata allocation based on ports; we choose
266    the ip with the lowest src-ip/dst-ip/proto usage.
267
268    If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
269    range), we eliminate that and try again.  This is not the most
270    efficient approach, but if you're worried about that, don't hand us
271    ranges you don't really have.  */
static struct ip_nat_range *
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
		    const struct ip_nat_multi_range *mr,
		    const struct ip_conntrack *conntrack,
		    unsigned int hooknum)
{
	unsigned int i;
	/* Best candidate so far: its range, its usage score (lower is
	   better, 0xFFFFFFFF = none found yet) and the resulting tuple.
	   Partial initializer zeroes best.tuple. */
	struct {
		const struct ip_nat_range *range;
		unsigned int score;
		struct ip_conntrack_tuple tuple;
	} best = { NULL,  0xFFFFFFFF };
	u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
	static unsigned int randomness;	/* spreads the scan start offset */

	/* Pick which address we vary (the manip side) and which one
	   do_extra_mangle() may rewrite as a side effect. */
	if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
		var_ipp = &tuple->src.ip;
		saved_ip = tuple->dst.ip;
		other_ipp = &tuple->dst.ip;
	} else {
		var_ipp = &tuple->dst.ip;
		saved_ip = tuple->src.ip;
		other_ipp = &tuple->src.ip;
	}
	/* Don't do do_extra_mangle unless necessary (overrides
	   explicit socket bindings, for example) */
	orig_dstip = tuple->dst.ip;

	IP_NF_ASSERT(mr->rangesize >= 1);
	for (i = 0; i < mr->rangesize; i++) {
		/* Host order */
		u_int32_t minip, maxip, j;

		/* Don't do ranges which are already eliminated. */
		if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
			continue;
		}

		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
			minip = ntohl(mr->range[i].min_ip);
			maxip = ntohl(mr->range[i].max_ip);
		} else
			minip = maxip = ntohl(*var_ipp);

		randomness++;
		/* Walk every IP in [minip, maxip], starting at a
		   pseudo-random offset so load spreads over the range. */
		for (j = 0; j < maxip - minip + 1; j++) {
			unsigned int score;

			*var_ipp = htonl(minip + (randomness + j)
					 % (maxip - minip + 1));

			/* Reset the other ip in case it was mangled by
			 * do_extra_mangle last time. */
			*other_ipp = saved_ip;

#ifdef CONFIG_IP_NF_NAT_LOCAL
			if (hooknum == NF_IP_LOCAL_OUT
			    && *var_ipp != orig_dstip
			    && !do_extra_mangle(*var_ipp, other_ipp)) {
				DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
				       i, NIPQUAD(*var_ipp));
				/* Can't route?  This whole range part is
				 * probably screwed, but keep trying
				 * anyway. */
				continue;
			}
#endif

			/* Count how many others map onto this. */
			score = count_maps(tuple->src.ip, tuple->dst.ip,
					   tuple->dst.protonum, conntrack);
			if (score < best.score) {
				/* Optimization: doesn't get any better than
				   this. */
				if (score == 0)
					return (struct ip_nat_range *)
						&mr->range[i];

				best.score = score;
				best.tuple = *tuple;
				best.range = &mr->range[i];
			}
		}
	}
	/* If nothing was usable, this writes the all-zero best.tuple and
	   returns NULL; callers must check the return value. */
	*tuple = best.tuple;

	/* Discard const. */
	return (struct ip_nat_range *)best.range;
}
361
362 /* Fast version doesn't iterate through hash chains, but only handles
363    common case of single IP address (null NAT, masquerade) */
static struct ip_nat_range *
find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
			 const struct ip_nat_multi_range *mr,
			 const struct ip_conntrack *conntrack,
			 unsigned int hooknum)
{
	/* Fall back to the full search unless this is the trivial case:
	   exactly one non-full range that maps to at most a single IP. */
	if (mr->rangesize != 1
	    || (mr->range[0].flags & IP_NAT_RANGE_FULL)
	    || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
		&& mr->range[0].min_ip != mr->range[0].max_ip))
		return find_best_ips_proto(tuple, mr, conntrack, hooknum);

	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
		if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
			tuple->src.ip = mr->range[0].min_ip;
		else {
			/* Only do extra mangle when required (breaks
			   socket binding) */
#ifdef CONFIG_IP_NF_NAT_LOCAL
			/* Locally-generated packet to a rewritten dst may
			   need its source rewritten to the route's
			   preferred address; NULL if no route exists. */
			if (tuple->dst.ip != mr->range[0].min_ip
			    && hooknum == NF_IP_LOCAL_OUT
			    && !do_extra_mangle(mr->range[0].min_ip,
						&tuple->src.ip))
				return NULL;
#endif
			tuple->dst.ip = mr->range[0].min_ip;
		}
	}

	/* Discard const. */
	return (struct ip_nat_range *)&mr->range[0];
}
396
/* Find a tuple for this conntrack that is unique and lies within the
 * given multi-range, writing it to *tuple.  Returns non-zero on
 * success, 0 if every range part is exhausted.  The multi-range's
 * IP_NAT_RANGE_FULL flags are used as scratch space during the search
 * (hence the const cast) but are always cleared before returning. */
static int
get_unique_tuple(struct ip_conntrack_tuple *tuple,
		 const struct ip_conntrack_tuple *orig_tuple,
		 const struct ip_nat_multi_range *mrr,
		 struct ip_conntrack *conntrack,
		 unsigned int hooknum)
{
	struct ip_nat_protocol *proto
		= find_nat_proto(orig_tuple->dst.protonum);
	struct ip_nat_range *rptr;
	unsigned int i;
	int ret;

	/* We temporarily use flags for marking full parts, but we
	   always clean up afterwards */
	struct ip_nat_multi_range *mr = (void *)mrr;

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	   and that same mapping gives a unique tuple within the given
	   range, use that.

	   This is only required for source (ie. NAT/masq) mappings.
	   So far, we don't do local source mappings, so multiple
	   manips not an issue.  */
	if (hooknum == NF_IP_POST_ROUTING) {
		struct ip_conntrack_manip *manip;

		manip = find_appropriate_src(orig_tuple, mr);
		if (manip) {
			/* Apply same source manipulation. */
			*tuple = ((struct ip_conntrack_tuple)
				  { *manip, orig_tuple->dst });
			DEBUGP("get_unique_tuple: Found current src map\n");
			if (!ip_nat_used_tuple(tuple, conntrack))
				return 1;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given
	   range.
	*/
	*tuple = *orig_tuple;
	while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
	       != NULL) {
		DEBUGP("Found best for "); DUMP_TUPLE(tuple);
		/* 3) The per-protocol part of the manip is made to
		   map into the range to make a unique tuple. */

		/* Only bother mapping if it's not already in range
		   and unique */
		if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
		     || proto->in_range(tuple, HOOK2MANIP(hooknum),
					&rptr->min, &rptr->max))
		    && !ip_nat_used_tuple(tuple, conntrack)) {
			ret = 1;
			goto clear_fulls;
		} else {
			if (proto->unique_tuple(tuple, rptr,
						HOOK2MANIP(hooknum),
						conntrack)) {
				/* Must be unique. */
				IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
								conntrack));
				ret = 1;
				goto clear_fulls;
			} else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
				/* Try implicit source NAT; protocol
				   may be able to play with ports to
				   make it unique. */
				struct ip_nat_range r
					= { IP_NAT_RANGE_MAP_IPS,
					    tuple->src.ip, tuple->src.ip,
					    { 0 }, { 0 } };
				DEBUGP("Trying implicit mapping\n");
				if (proto->unique_tuple(tuple, &r,
							IP_NAT_MANIP_SRC,
							conntrack)) {
					/* Must be unique. */
					IP_NF_ASSERT(!ip_nat_used_tuple
						     (tuple, conntrack));
					ret = 1;
					goto clear_fulls;
				}
			}
			DEBUGP("Protocol can't get unique tuple %u.\n",
			       hooknum);
		}

		/* Eliminate that from range, and try again. */
		rptr->flags |= IP_NAT_RANGE_FULL;
		*tuple = *orig_tuple;
	}

	ret = 0;

 clear_fulls:
	/* Clear full flags. */
	IP_NF_ASSERT(mr->rangesize >= 1);
	for (i = 0; i < mr->rangesize; i++)
		mr->range[i].flags &= ~IP_NAT_RANGE_FULL;

	return ret;
}
500
501 static inline int
502 helper_cmp(const struct ip_nat_helper *helper,
503            const struct ip_conntrack_tuple *tuple)
504 {
505         return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
506 }
507
/* Where to manip the reply packets (will be reverse manip): maps the
   hook a manip was installed at to the hook the reply traverses in the
   opposite direction (PRE<->POST routing, LOCAL_OUT<->LOCAL_IN). */
static unsigned int opposite_hook[NF_IP_NUMHOOKS]
= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
    [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
#ifdef CONFIG_IP_NF_NAT_LOCAL
    [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
    [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
#endif
};
517
/* Set up NAT for this connection according to mr at the given hook.
 * Called with ip_nat_lock held for writing; runs at most once per
 * manip direction (SRC or DST) per conntrack.  Records the needed
 * manips, updates the conntrack's reply tuple and the NAT hashes.
 * Returns NF_ACCEPT on success, NF_DROP if no unique tuple exists. */
unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
		  const struct ip_nat_multi_range *mr,
		  unsigned int hooknum)
{
	struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
	struct ip_conntrack_tuple orig_tp;
	struct ip_nat_info *info = &conntrack->nat.info;
	int in_hashes = info->initialized;	/* already hashed once? */

	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
		     || hooknum == NF_IP_POST_ROUTING
		     || hooknum == NF_IP_LOCAL_OUT);
	IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
	IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum))));

	/* What we've got will look like inverse of reply. Normally
	   this is what is in the conntrack, except for prior
	   manipulations (future optimization: if num_manips == 0,
	   orig_tp =
	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
	invert_tuplepr(&orig_tp,
		       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

#if 0
	{
	unsigned int i;

	DEBUGP("Hook %u (%s), ", hooknum,
	       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
	DUMP_TUPLE(&orig_tp);
	DEBUGP("Range %p: ", mr);
	for (i = 0; i < mr->rangesize; i++) {
		DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
		       i,
		       (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
		       ? " MAP_IPS" : "",
		       (mr->range[i].flags
			& IP_NAT_RANGE_PROTO_SPECIFIED)
		       ? " PROTO_SPECIFIED" : "",
		       (mr->range[i].flags & IP_NAT_RANGE_FULL)
		       ? " FULL" : "",
		       NIPQUAD(mr->range[i].min_ip),
		       NIPQUAD(mr->range[i].max_ip),
		       mr->range[i].min.all,
		       mr->range[i].max.all);
	}
	}
#endif

	do {
		if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
				      hooknum)) {
			DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
			       conntrack);
			return NF_DROP;
		}

#if 0
		DEBUGP("Hook %u (%s) %p\n", hooknum,
		       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
		       conntrack);
		DEBUGP("Original: ");
		DUMP_TUPLE(&orig_tp);
		DEBUGP("New: ");
		DUMP_TUPLE(&new_tuple);
#endif

		/* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
		   the original (A/B/C/D') and the mangled one (E/F/G/H').

		   We're only allowed to work with the SRC per-proto
		   part, so we create inverses of both to start, then
		   derive the other fields we need.  */

		/* Reply connection: simply invert the new tuple
		   (G/H/E/F') */
		invert_tuplepr(&reply, &new_tuple);

		/* Alter conntrack table so it recognizes replies.
		   If fail this race (reply tuple now used), repeat. */
	} while (!ip_conntrack_alter_reply(conntrack, &reply));

	/* FIXME: We can simply used existing conntrack reply tuple
	   here --RR */
	/* Create inverse of original: C/D/A/B' */
	invert_tuplepr(&inv_tuple, &orig_tp);

	/* Has source changed?. */
	if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
		/* In this direction, a source manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_ORIGINAL, hooknum,
			   IP_NAT_MANIP_SRC, new_tuple.src });

		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

		/* In the reverse direction, a destination manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
			   IP_NAT_MANIP_DST, orig_tp.src });
		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
	}

	/* Has destination changed? */
	if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
		/* In this direction, a destination manip */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_ORIGINAL, hooknum,
			   IP_NAT_MANIP_DST, reply.src });

		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

		/* In the reverse direction, a source manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
			   IP_NAT_MANIP_SRC, inv_tuple.src });
		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
	}

	/* If there's a helper, assign it; based on new tuple. */
	if (!conntrack->master)
		info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
					 &reply);

	/* It's done. */
	info->initialized |= (1 << HOOK2MANIP(hooknum));

	/* A second call (other manip direction) re-hashes in place. */
	if (in_hashes) {
		IP_NF_ASSERT(info->bysource.conntrack);
		replace_in_hashes(conntrack, info);
	} else {
		place_in_hashes(conntrack, info);
	}

	return NF_ACCEPT;
}
660
/* Re-insert an already-hashed conntrack whose tuples have changed.
 * Caller holds ip_nat_lock for writing. */
void replace_in_hashes(struct ip_conntrack *conntrack,
		       struct ip_nat_info *info)
{
	/* Source has changed, so replace in hashes. */
	unsigned int srchash
		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.src,
			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.dst.protonum);
	/* We place packet as seen OUTGOING in byips_proto hash
	   (ie. reverse dst and src of reply packet). */
	unsigned int ipsprotohash
		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.src.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.protonum);

	IP_NF_ASSERT(info->bysource.conntrack == conntrack);
	MUST_BE_WRITE_LOCKED(&ip_nat_lock);

	/* Unlink from the old buckets, then prepend to the new ones. */
	list_del(&info->bysource.list);
	list_del(&info->byipsproto.list);

	list_prepend(&bysource[srchash], &info->bysource);
	list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}
689
/* First-time insertion of a conntrack into the NAT hashes.
 * Caller holds ip_nat_lock for writing. */
void place_in_hashes(struct ip_conntrack *conntrack,
		     struct ip_nat_info *info)
{
	unsigned int srchash
		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.src,
			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.dst.protonum);
	/* We place packet as seen OUTGOING in byips_proto hash
	   (ie. reverse dst and src of reply packet). */
	unsigned int ipsprotohash
		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.src.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.protonum);

	/* Must not have been hashed before. */
	IP_NF_ASSERT(!info->bysource.conntrack);

	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
	/* Back-pointers let hash entries reach their conntrack. */
	info->byipsproto.conntrack = conntrack;
	info->bysource.conntrack = conntrack;

	list_prepend(&bysource[srchash], &info->bysource);
	list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}
717
718 /* Returns true if succeeded. */
719 static int
720 manip_pkt(u_int16_t proto,
721           struct sk_buff **pskb,
722           unsigned int iphdroff,
723           const struct ip_conntrack_manip *manip,
724           enum ip_nat_manip_type maniptype)
725 {
726         struct iphdr *iph;
727
728         (*pskb)->nfcache |= NFC_ALTERED;
729         if (!skb_ip_make_writable(pskb, iphdroff+sizeof(iph)))
730                 return 0;
731
732         iph = (void *)(*pskb)->data + iphdroff;
733
734         /* Manipulate protcol part. */
735         if (!find_nat_proto(proto)->manip_pkt(pskb,
736                                               iphdroff + iph->ihl*4,
737                                               manip, maniptype))
738                 return 0;
739
740         iph = (void *)(*pskb)->data + iphdroff;
741
742         if (maniptype == IP_NAT_MANIP_SRC) {
743                 iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
744                                                 iph->check);
745                 iph->saddr = manip->ip;
746         } else {
747                 iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
748                                                 iph->check);
749                 iph->daddr = manip->ip;
750         }
751         return 1;
752 }
753
754 static inline int exp_for_packet(struct ip_conntrack_expect *exp,
755                                  struct sk_buff *skb)
756 {
757         struct ip_conntrack_protocol *proto;
758         int ret = 1;
759
760         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
761         proto = __ip_ct_find_proto(skb->nh.iph->protocol);
762         if (proto->exp_matches_pkt)
763                 ret = proto->exp_matches_pkt(exp, skb);
764
765         return ret;
766 }
767
/* Do packet manipulations according to binding.
 *
 * Applies every manip recorded in @info that matches this packet's
 * direction and the current hook; then, if the connection has a NAT
 * helper, lets the helper mangle the payload; finally adjusts TCP
 * sequence numbers (once per packet, at POST_ROUTING/LOCAL_IN only).
 *
 * Returns an NF_* verdict: NF_DROP if a manip or the sequence
 * adjustment fails, otherwise the helper's verdict (NF_ACCEPT when
 * no helper is involved).
 */
unsigned int
do_bindings(struct ip_conntrack *ct,
	    enum ip_conntrack_info ctinfo,
	    struct ip_nat_info *info,
	    unsigned int hooknum,
	    struct sk_buff **pskb)
{
	unsigned int i;
	struct ip_nat_helper *helper;
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	int proto = (*pskb)->nh.iph->protocol;

	/* Need nat lock to protect against modification, but neither
	   conntrack (referenced) and helper (deleted with
	   synchronize_bh()) can vanish. */
	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		/* Only apply manips recorded for this direction/hook. */
		if (info->manips[i].direction == dir
		    && info->manips[i].hooknum == hooknum) {
			DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
			       *pskb,
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip),
			       htons(info->manips[i].manip.u.all));
			if (!manip_pkt(proto, pskb, 0,
				       &info->manips[i].manip,
				       info->manips[i].maniptype)) {
				READ_UNLOCK(&ip_nat_lock);
				return NF_DROP;
			}
		}
	}
	/* Snapshot the helper pointer while the nat lock is held. */
	helper = info->helper;
	READ_UNLOCK(&ip_nat_lock);

	if (helper) {
		struct ip_conntrack_expect *exp = NULL;
		struct list_head *cur_item;
		int ret = NF_ACCEPT;
		int helper_called = 0;

		DEBUGP("do_bindings: helper existing for (%p)\n", ct);

		/* Always defragged for helpers */
		IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
			       & htons(IP_MF|IP_OFFSET)));

		/* Have to grab read lock before sibling_list traversal */
		READ_LOCK(&ip_conntrack_lock);
		list_for_each(cur_item, &ct->sibling_list) {
			exp = list_entry(cur_item, struct ip_conntrack_expect,
					 expected_list);

			/* if this expectation is already established, skip */
			if (exp->sibling)
				continue;

			if (exp_for_packet(exp, *pskb)) {
				/* FIXME: May be true multiple times in the
				 * case of UDP!! */
				DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
				ret = helper->help(ct, exp, info, ctinfo,
						   hooknum, pskb);
				if (ret != NF_ACCEPT) {
					/* Drop the conntrack lock taken above
					   before bailing out. */
					READ_UNLOCK(&ip_conntrack_lock);
					return ret;
				}
				helper_called = 1;
			}
		}
		/* Helper might want to manip the packet even when there is no
		 * matching expectation for this packet */
		if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
			DEBUGP("calling nat helper for packet without expectation\n");
			ret = helper->help(ct, NULL, info, ctinfo,
					   hooknum, pskb);
			if (ret != NF_ACCEPT) {
				READ_UNLOCK(&ip_conntrack_lock);
				return ret;
			}
		}
		READ_UNLOCK(&ip_conntrack_lock);

		/* Adjust sequence number only once per packet
		 * (helper is called at all hooks) */
		if (proto == IPPROTO_TCP
		    && (hooknum == NF_IP_POST_ROUTING
			|| hooknum == NF_IP_LOCAL_IN)) {
			DEBUGP("ip_nat_core: adjusting sequence number\n");
			/* future: put this in a l4-proto specific function,
			 * and call this function here. */
			if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
				ret = NF_DROP;
		}

		return ret;

	} else
		return NF_ACCEPT;

	/* not reached */
}
872
/* Translate an ICMP error packet and the offending packet embedded in
 * its payload for a NATed connection.
 *
 * @pskb:      the ICMP error packet; must already be conntracked.
 * @conntrack: the connection the embedded packet belongs to.
 * @hooknum:   netfilter hook we are being called from.
 * @dir:       direction of the embedded (original) packet.
 *
 * Returns 1 on success, 0 if the packet should be dropped (unwritable
 * skb, bad checksum, forbidden redirect, or a manip failed).
 */
int
icmp_reply_translation(struct sk_buff **pskb,
		       struct ip_conntrack *conntrack,
		       unsigned int hooknum,
		       int dir)
{
	/* Layout of the start of the ICMP payload: the ICMP header
	   followed by the IP header of the packet that triggered
	   the error. */
	struct {
		struct icmphdr icmp;
		struct iphdr ip;
	} *inside;
	unsigned int i;
	struct ip_nat_info *info = &conntrack->nat.info;
	int hdrlen;

	if (!skb_ip_make_writable(pskb,(*pskb)->nh.iph->ihl*4+sizeof(*inside)))
		return 0;
	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

	/* We're actually going to mangle it beyond trivial checksum
	   adjustment, so make sure the current checksum is correct. */
	if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
		hdrlen = (*pskb)->nh.iph->ihl * 4;
		if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
						(*pskb)->len - hdrlen, 0)))
			return 0;
	}

	/* Must be RELATED */
	/* NOTE(review): this pointer difference appears to recover the
	   ctinfo value from skb->nfct (which points into the conntrack's
	   per-ctinfo info array) -- confirm against ip_conntrack_core. */
	IP_NF_ASSERT((*pskb)->nfct
		     - (struct ip_conntrack *)(*pskb)->nfct->master
		     == IP_CT_RELATED
		     || (*pskb)->nfct
		     - (struct ip_conntrack *)(*pskb)->nfct->master
		     == IP_CT_RELATED+IP_CT_IS_REPLY);

	/* Redirects on non-null nats must be dropped, else they'll
	   start talking to each other without our translation, and be
	   confused... --RR */
	if (inside->icmp.type == ICMP_REDIRECT) {
		/* Don't care about races here. */
		if (info->initialized
		    != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
		    || info->num_manips != 0)
			return 0;
	}

	DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
	       *pskb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
	/* Note: May not be from a NAT'd host, but probably safest to
	   do translation always as if it came from the host itself
	   (even though a "host unreachable" coming from the host
	   itself is a bit weird).

	   More explanation: some people use NAT for anonymizing.
	   Also, CERT recommends dropping all packets from private IP
	   addresses (although ICMP errors from internal links with
	   such addresses are not too uncommon, as Alan Cox points
	   out) */

	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
		       i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
		       "ORIG" : "REPLY", info->manips[i].hooknum);

		if (info->manips[i].direction != dir)
			continue;

		/* Mapping the inner packet is just like a normal
		   packet, except it was never src/dst reversed, so
		   where we would normally apply a dst manip, we apply
		   a src, and vice versa. */
		if (info->manips[i].hooknum == hooknum) {
			DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "DST" : "SRC",
			       NIPQUAD(info->manips[i].manip.ip),
			       ntohs(info->manips[i].manip.u.udp.port));
			/* !maniptype flips SRC<->DST for the embedded
			   packet, per the comment above. */
			if (!manip_pkt(inside->ip.protocol, pskb,
				       (*pskb)->nh.iph->ihl*4
				       + sizeof(inside->icmp),
				       &info->manips[i].manip,
				       !info->manips[i].maniptype))
				goto unlock_fail;

			/* Outer packet needs to have IP header NATed like
			   it's a reply. */

			/* Use mapping to map outer packet: 0 give no
			   per-proto mapping */
			DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip));
			if (!manip_pkt(0, pskb, 0,
				       &info->manips[i].manip,
				       info->manips[i].maniptype))
				goto unlock_fail;
		}
	}
	READ_UNLOCK(&ip_nat_lock);

	hdrlen = (*pskb)->nh.iph->ihl * 4;

	/* Reload: manip_pkt() may have reallocated the skb. */
	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

	/* Recompute the ICMP checksum over the mangled payload. */
	inside->icmp.checksum = 0;
	inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
						       (*pskb)->len - hdrlen,
						       0));
	return 1;

 unlock_fail:
	READ_UNLOCK(&ip_nat_lock);
	return 0;
}
989
990 int __init ip_nat_init(void)
991 {
992         size_t i;
993
994         /* Leave them the same for the moment. */
995         ip_nat_htable_size = ip_conntrack_htable_size;
996
997         /* One vmalloc for both hash tables */
998         bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
999         if (!bysource) {
1000                 return -ENOMEM;
1001         }
1002         byipsproto = bysource + ip_nat_htable_size;
1003
1004         /* Sew in builtin protocols. */
1005         WRITE_LOCK(&ip_nat_lock);
1006         list_append(&protos, &ip_nat_protocol_tcp);
1007         list_append(&protos, &ip_nat_protocol_udp);
1008         list_append(&protos, &ip_nat_protocol_icmp);
1009         WRITE_UNLOCK(&ip_nat_lock);
1010
1011         for (i = 0; i < ip_nat_htable_size; i++) {
1012                 INIT_LIST_HEAD(&bysource[i]);
1013                 INIT_LIST_HEAD(&byipsproto[i]);
1014         }
1015
1016         /* FIXME: Man, this is a hack.  <SIGH> */
1017         IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
1018         ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
1019         
1020         /* Initialize fake conntrack so that NAT will skip it */
1021         ip_conntrack_untracked.nat.info.initialized |= 
1022                 (1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST);
1023
1024         return 0;
1025 }
1026
1027 /* Clear NAT section of all conntracks, in case we're loaded again. */
1028 static int clean_nat(const struct ip_conntrack *i, void *data)
1029 {
1030         memset((void *)&i->nat, 0, sizeof(i->nat));
1031         return 0;
1032 }
1033
1034 /* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
1035 void ip_nat_cleanup(void)
1036 {
1037         ip_ct_selective_cleanup(&clean_nat, NULL);
1038         ip_conntrack_destroyed = NULL;
1039         vfree(bysource);
1040 }