1 /* NAT for netfilter; shared with compatibility layer. */
3 /* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
11 #include <linux/module.h>
12 #include <linux/types.h>
13 #include <linux/timer.h>
14 #include <linux/skbuff.h>
15 #include <linux/netfilter_ipv4.h>
16 #include <linux/vmalloc.h>
17 #include <net/checksum.h>
20 #include <net/tcp.h> /* For tcp_prot in getorigdst */
21 #include <linux/icmp.h>
22 #include <linux/udp.h>
24 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
25 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
27 #include <linux/netfilter_ipv4/ip_conntrack.h>
28 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
29 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
30 #include <linux/netfilter_ipv4/ip_nat.h>
31 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
32 #include <linux/netfilter_ipv4/ip_nat_core.h>
33 #include <linux/netfilter_ipv4/ip_nat_helper.h>
34 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
35 #include <linux/netfilter_ipv4/listhelp.h>
40 #define DEBUGP(format, args...)
43 DECLARE_RWLOCK(ip_nat_lock);
44 DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);
46 /* Calculated at init based on memory size */
47 static unsigned int ip_nat_htable_size;
49 static struct list_head *bysource;
50 static struct list_head *byipsproto;
54 extern struct ip_nat_protocol unknown_nat_protocol;
56 /* We keep extra hashes for each conntrack, for fast searching. */
58 hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
60 /* Modified src and dst, to ensure we don't create two
62 return (src + dst + proto) % ip_nat_htable_size;
66 hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
68 /* Original src, to ensure we map it consistently if poss. */
69 return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
72 /* Noone using conntrack by the time this called. */
73 static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
75 struct ip_nat_info *info = &conn->nat.info;
78 if (!info->initialized)
81 IP_NF_ASSERT(info->bysource.conntrack);
82 IP_NF_ASSERT(info->byipsproto.conntrack);
84 hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src,
85 conn->tuplehash[IP_CT_DIR_ORIGINAL]
88 hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
89 conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
90 conn->tuplehash[IP_CT_DIR_REPLY]
93 WRITE_LOCK(&ip_nat_lock);
94 LIST_DELETE(&bysource[hs], &info->bysource);
95 LIST_DELETE(&byipsproto[hp], &info->byipsproto);
96 WRITE_UNLOCK(&ip_nat_lock);
99 /* We do checksum mangling, so if they were wrong before they're still
100 * wrong. Also works for incomplete packets (eg. ICMP dest
103 ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
105 u_int32_t diffs[] = { oldvalinv, newval };
106 return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
110 static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
112 return i->protonum == proto;
115 struct ip_nat_protocol *
116 find_nat_proto(u_int16_t protonum)
118 struct ip_nat_protocol *i;
120 MUST_BE_READ_LOCKED(&ip_nat_lock);
121 i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
123 i = &unknown_nat_protocol;
127 /* Is this tuple already taken? (not by us) */
129 ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
130 const struct ip_conntrack *ignored_conntrack)
132 /* Conntrack tracking doesn't keep track of outgoing tuples; only
133 incoming ones. NAT means they don't have a fixed mapping,
134 so we invert the tuple and look for the incoming reply.
136 We could keep a separate hash if this proves too slow. */
137 struct ip_conntrack_tuple reply;
139 invert_tuplepr(&reply, tuple);
140 return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
143 /* Does tuple + the source manip come within the range mr */
145 in_range(const struct ip_conntrack_tuple *tuple,
146 const struct ip_conntrack_manip *manip,
147 const struct ip_nat_multi_range *mr)
149 struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
151 struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
153 for (i = 0; i < mr->rangesize; i++) {
154 /* If we are allowed to map IPs, then we must be in the
155 range specified, otherwise we must be unchanged. */
156 if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
157 if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
158 || (ntohl(newtuple.src.ip)
159 > ntohl(mr->range[i].max_ip)))
162 if (newtuple.src.ip != tuple->src.ip)
166 if (!(mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
167 || proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
168 &mr->range[i].min, &mr->range[i].max))
175 src_cmp(const struct ip_nat_hash *i,
176 const struct ip_conntrack_tuple *tuple,
177 const struct ip_nat_multi_range *mr)
179 return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
180 == tuple->dst.protonum
181 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
183 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
186 &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
191 /* Only called for SRC manip */
192 static struct ip_conntrack_manip *
193 find_appropriate_src(const struct ip_conntrack_tuple *tuple,
194 const struct ip_nat_multi_range *mr)
196 unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
197 struct ip_nat_hash *i;
199 MUST_BE_READ_LOCKED(&ip_nat_lock);
200 i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
202 return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
#ifdef CONFIG_IP_NF_NAT_LOCAL
/* If it's really a local destination manip, it may need to do a
   source manip too: rewrite *other_ipp to the source address the
   routing table would pick for the new destination var_ip.
   Returns 1 on success, 0 if no route. */
static int
do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
{
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };
	struct rtable *rt;

	/* FIXME: IPTOS_TOS(iph->tos) --RR */
	if (ip_route_output_key(&rt, &fl) != 0) {
		DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
		       NIPQUAD(var_ip));
		return 0;
	}

	*other_ipp = rt->rt_src;
	ip_rt_put(rt);
	return 1;
}
#endif
229 /* Simple way to iterate through all. */
230 static inline int fake_cmp(const struct ip_nat_hash *i,
231 u_int32_t src, u_int32_t dst, u_int16_t protonum,
233 const struct ip_conntrack *conntrack)
235 /* Compare backwards: we're dealing with OUTGOING tuples, and
236 inside the conntrack is the REPLY tuple. Don't count this
238 if (i->conntrack != conntrack
239 && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
240 && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
241 && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
247 static inline unsigned int
248 count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
249 const struct ip_conntrack *conntrack)
251 unsigned int score = 0;
254 MUST_BE_READ_LOCKED(&ip_nat_lock);
255 h = hash_by_ipsproto(src, dst, protonum);
256 LIST_FIND(&byipsproto[h], fake_cmp, struct ip_nat_hash *,
257 src, dst, protonum, &score, conntrack);
262 /* For [FUTURE] fragmentation handling, we want the least-used
263 src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
264 if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
265 1-65535, we don't do pro-rata allocation based on ports; we choose
266 the ip with the lowest src-ip/dst-ip/proto usage.
268 If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
269 range), we eliminate that and try again. This is not the most
270 efficient approach, but if you're worried about that, don't hand us
271 ranges you don't really have. */
272 static struct ip_nat_range *
273 find_best_ips_proto(struct ip_conntrack_tuple *tuple,
274 const struct ip_nat_multi_range *mr,
275 const struct ip_conntrack *conntrack,
276 unsigned int hooknum)
280 const struct ip_nat_range *range;
282 struct ip_conntrack_tuple tuple;
283 } best = { NULL, 0xFFFFFFFF };
284 u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
285 static unsigned int randomness;
287 if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
288 var_ipp = &tuple->src.ip;
289 saved_ip = tuple->dst.ip;
290 other_ipp = &tuple->dst.ip;
292 var_ipp = &tuple->dst.ip;
293 saved_ip = tuple->src.ip;
294 other_ipp = &tuple->src.ip;
296 /* Don't do do_extra_mangle unless necessary (overrides
297 explicit socket bindings, for example) */
298 orig_dstip = tuple->dst.ip;
300 IP_NF_ASSERT(mr->rangesize >= 1);
301 for (i = 0; i < mr->rangesize; i++) {
303 u_int32_t minip, maxip, j;
305 /* Don't do ranges which are already eliminated. */
306 if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
310 if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
311 minip = ntohl(mr->range[i].min_ip);
312 maxip = ntohl(mr->range[i].max_ip);
314 minip = maxip = ntohl(*var_ipp);
317 for (j = 0; j < maxip - minip + 1; j++) {
320 *var_ipp = htonl(minip + (randomness + j)
321 % (maxip - minip + 1));
323 /* Reset the other ip in case it was mangled by
324 * do_extra_mangle last time. */
325 *other_ipp = saved_ip;
327 #ifdef CONFIG_IP_NF_NAT_LOCAL
328 if (hooknum == NF_IP_LOCAL_OUT
329 && *var_ipp != orig_dstip
330 && !do_extra_mangle(*var_ipp, other_ipp)) {
331 DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
332 i, NIPQUAD(*var_ipp));
333 /* Can't route? This whole range part is
334 * probably screwed, but keep trying
340 /* Count how many others map onto this. */
341 score = count_maps(tuple->src.ip, tuple->dst.ip,
342 tuple->dst.protonum, conntrack);
343 if (score < best.score) {
344 /* Optimization: doesn't get any better than
347 return (struct ip_nat_range *)
352 best.range = &mr->range[i];
359 return (struct ip_nat_range *)best.range;
362 /* Fast version doesn't iterate through hash chains, but only handles
363 common case of single IP address (null NAT, masquerade) */
364 static struct ip_nat_range *
365 find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
366 const struct ip_nat_multi_range *mr,
367 const struct ip_conntrack *conntrack,
368 unsigned int hooknum)
370 if (mr->rangesize != 1
371 || (mr->range[0].flags & IP_NAT_RANGE_FULL)
372 || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
373 && mr->range[0].min_ip != mr->range[0].max_ip))
374 return find_best_ips_proto(tuple, mr, conntrack, hooknum);
376 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
377 if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
378 tuple->src.ip = mr->range[0].min_ip;
380 /* Only do extra mangle when required (breaks
382 #ifdef CONFIG_IP_NF_NAT_LOCAL
383 if (tuple->dst.ip != mr->range[0].min_ip
384 && hooknum == NF_IP_LOCAL_OUT
385 && !do_extra_mangle(mr->range[0].min_ip,
389 tuple->dst.ip = mr->range[0].min_ip;
394 return (struct ip_nat_range *)&mr->range[0];
398 get_unique_tuple(struct ip_conntrack_tuple *tuple,
399 const struct ip_conntrack_tuple *orig_tuple,
400 const struct ip_nat_multi_range *mrr,
401 struct ip_conntrack *conntrack,
402 unsigned int hooknum)
404 struct ip_nat_protocol *proto
405 = find_nat_proto(orig_tuple->dst.protonum);
406 struct ip_nat_range *rptr;
410 /* We temporarily use flags for marking full parts, but we
411 always clean up afterwards */
412 struct ip_nat_multi_range *mr = (void *)mrr;
414 /* 1) If this srcip/proto/src-proto-part is currently mapped,
415 and that same mapping gives a unique tuple within the given
418 This is only required for source (ie. NAT/masq) mappings.
419 So far, we don't do local source mappings, so multiple
420 manips not an issue. */
421 if (hooknum == NF_IP_POST_ROUTING) {
422 struct ip_conntrack_manip *manip;
424 manip = find_appropriate_src(orig_tuple, mr);
426 /* Apply same source manipulation. */
427 *tuple = ((struct ip_conntrack_tuple)
428 { *manip, orig_tuple->dst });
429 DEBUGP("get_unique_tuple: Found current src map\n");
430 if (!ip_nat_used_tuple(tuple, conntrack))
435 /* 2) Select the least-used IP/proto combination in the given
438 *tuple = *orig_tuple;
439 while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
441 DEBUGP("Found best for "); DUMP_TUPLE(tuple);
442 /* 3) The per-protocol part of the manip is made to
443 map into the range to make a unique tuple. */
445 /* Only bother mapping if it's not already in range
447 if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
448 || proto->in_range(tuple, HOOK2MANIP(hooknum),
449 &rptr->min, &rptr->max))
450 && !ip_nat_used_tuple(tuple, conntrack)) {
454 if (proto->unique_tuple(tuple, rptr,
457 /* Must be unique. */
458 IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
462 } else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
463 /* Try implicit source NAT; protocol
464 may be able to play with ports to
466 struct ip_nat_range r
467 = { IP_NAT_RANGE_MAP_IPS,
468 tuple->src.ip, tuple->src.ip,
470 DEBUGP("Trying implicit mapping\n");
471 if (proto->unique_tuple(tuple, &r,
474 /* Must be unique. */
475 IP_NF_ASSERT(!ip_nat_used_tuple
481 DEBUGP("Protocol can't get unique tuple %u.\n",
485 /* Eliminate that from range, and try again. */
486 rptr->flags |= IP_NAT_RANGE_FULL;
487 *tuple = *orig_tuple;
493 /* Clear full flags. */
494 IP_NF_ASSERT(mr->rangesize >= 1);
495 for (i = 0; i < mr->rangesize; i++)
496 mr->range[i].flags &= ~IP_NAT_RANGE_FULL;
502 helper_cmp(const struct ip_nat_helper *helper,
503 const struct ip_conntrack_tuple *tuple)
505 return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
508 /* Where to manip the reply packets (will be reverse manip). */
509 static unsigned int opposite_hook[NF_IP_NUMHOOKS]
510 = { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
511 [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
512 #ifdef CONFIG_IP_NF_NAT_LOCAL
513 [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
514 [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
519 ip_nat_setup_info(struct ip_conntrack *conntrack,
520 const struct ip_nat_multi_range *mr,
521 unsigned int hooknum)
523 struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
524 struct ip_conntrack_tuple orig_tp;
525 struct ip_nat_info *info = &conntrack->nat.info;
526 int in_hashes = info->initialized;
528 MUST_BE_WRITE_LOCKED(&ip_nat_lock);
529 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
530 || hooknum == NF_IP_POST_ROUTING
531 || hooknum == NF_IP_LOCAL_OUT);
532 IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
533 IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum))));
535 /* What we've got will look like inverse of reply. Normally
536 this is what is in the conntrack, except for prior
537 manipulations (future optimization: if num_manips == 0,
539 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
540 invert_tuplepr(&orig_tp,
541 &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
547 DEBUGP("Hook %u (%s), ", hooknum,
548 HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
549 DUMP_TUPLE(&orig_tp);
550 DEBUGP("Range %p: ", mr);
551 for (i = 0; i < mr->rangesize; i++) {
552 DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
554 (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
557 & IP_NAT_RANGE_PROTO_SPECIFIED)
558 ? " PROTO_SPECIFIED" : "",
559 (mr->range[i].flags & IP_NAT_RANGE_FULL)
561 NIPQUAD(mr->range[i].min_ip),
562 NIPQUAD(mr->range[i].max_ip),
563 mr->range[i].min.all,
564 mr->range[i].max.all);
570 if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
572 DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
578 DEBUGP("Hook %u (%s) %p\n", hooknum,
579 HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
581 DEBUGP("Original: ");
582 DUMP_TUPLE(&orig_tp);
584 DUMP_TUPLE(&new_tuple);
587 /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
588 the original (A/B/C/D') and the mangled one (E/F/G/H').
590 We're only allowed to work with the SRC per-proto
591 part, so we create inverses of both to start, then
592 derive the other fields we need. */
594 /* Reply connection: simply invert the new tuple
596 invert_tuplepr(&reply, &new_tuple);
598 /* Alter conntrack table so it recognizes replies.
599 If fail this race (reply tuple now used), repeat. */
600 } while (!ip_conntrack_alter_reply(conntrack, &reply));
602 /* FIXME: We can simply used existing conntrack reply tuple
604 /* Create inverse of original: C/D/A/B' */
605 invert_tuplepr(&inv_tuple, &orig_tp);
607 /* Has source changed?. */
608 if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
609 /* In this direction, a source manip. */
610 info->manips[info->num_manips++] =
611 ((struct ip_nat_info_manip)
612 { IP_CT_DIR_ORIGINAL, hooknum,
613 IP_NAT_MANIP_SRC, new_tuple.src });
615 IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
617 /* In the reverse direction, a destination manip. */
618 info->manips[info->num_manips++] =
619 ((struct ip_nat_info_manip)
620 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
621 IP_NAT_MANIP_DST, orig_tp.src });
622 IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
625 /* Has destination changed? */
626 if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
627 /* In this direction, a destination manip */
628 info->manips[info->num_manips++] =
629 ((struct ip_nat_info_manip)
630 { IP_CT_DIR_ORIGINAL, hooknum,
631 IP_NAT_MANIP_DST, reply.src });
633 IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
635 /* In the reverse direction, a source manip. */
636 info->manips[info->num_manips++] =
637 ((struct ip_nat_info_manip)
638 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
639 IP_NAT_MANIP_SRC, inv_tuple.src });
640 IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
643 /* If there's a helper, assign it; based on new tuple. */
644 if (!conntrack->master)
645 info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
649 info->initialized |= (1 << HOOK2MANIP(hooknum));
652 IP_NF_ASSERT(info->bysource.conntrack);
653 replace_in_hashes(conntrack, info);
655 place_in_hashes(conntrack, info);
661 void replace_in_hashes(struct ip_conntrack *conntrack,
662 struct ip_nat_info *info)
664 /* Source has changed, so replace in hashes. */
666 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
668 conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
669 .tuple.dst.protonum);
670 /* We place packet as seen OUTGOUNG in byips_proto hash
671 (ie. reverse dst and src of reply packet. */
672 unsigned int ipsprotohash
673 = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
675 conntrack->tuplehash[IP_CT_DIR_REPLY]
677 conntrack->tuplehash[IP_CT_DIR_REPLY]
678 .tuple.dst.protonum);
680 IP_NF_ASSERT(info->bysource.conntrack == conntrack);
681 MUST_BE_WRITE_LOCKED(&ip_nat_lock);
683 list_del(&info->bysource.list);
684 list_del(&info->byipsproto.list);
686 list_prepend(&bysource[srchash], &info->bysource);
687 list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
690 void place_in_hashes(struct ip_conntrack *conntrack,
691 struct ip_nat_info *info)
694 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
696 conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
697 .tuple.dst.protonum);
698 /* We place packet as seen OUTGOUNG in byips_proto hash
699 (ie. reverse dst and src of reply packet. */
700 unsigned int ipsprotohash
701 = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
703 conntrack->tuplehash[IP_CT_DIR_REPLY]
705 conntrack->tuplehash[IP_CT_DIR_REPLY]
706 .tuple.dst.protonum);
708 IP_NF_ASSERT(!info->bysource.conntrack);
710 MUST_BE_WRITE_LOCKED(&ip_nat_lock);
711 info->byipsproto.conntrack = conntrack;
712 info->bysource.conntrack = conntrack;
714 list_prepend(&bysource[srchash], &info->bysource);
715 list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
718 /* Returns true if succeeded. */
720 manip_pkt(u_int16_t proto,
721 struct sk_buff **pskb,
722 unsigned int iphdroff,
723 const struct ip_conntrack_manip *manip,
724 enum ip_nat_manip_type maniptype)
728 (*pskb)->nfcache |= NFC_ALTERED;
729 if (!skb_ip_make_writable(pskb, iphdroff+sizeof(iph)))
732 iph = (void *)(*pskb)->data + iphdroff;
734 /* Manipulate protcol part. */
735 if (!find_nat_proto(proto)->manip_pkt(pskb,
736 iphdroff + iph->ihl*4,
740 iph = (void *)(*pskb)->data + iphdroff;
742 if (maniptype == IP_NAT_MANIP_SRC) {
743 iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
745 iph->saddr = manip->ip;
747 iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
749 iph->daddr = manip->ip;
754 static inline int exp_for_packet(struct ip_conntrack_expect *exp,
757 struct ip_conntrack_protocol *proto;
760 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
761 proto = __ip_ct_find_proto(skb->nh.iph->protocol);
762 if (proto->exp_matches_pkt)
763 ret = proto->exp_matches_pkt(exp, skb);
768 /* Do packet manipulations according to binding. */
770 do_bindings(struct ip_conntrack *ct,
771 enum ip_conntrack_info ctinfo,
772 struct ip_nat_info *info,
773 unsigned int hooknum,
774 struct sk_buff **pskb)
777 struct ip_nat_helper *helper;
778 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
779 int proto = (*pskb)->nh.iph->protocol;
781 /* Need nat lock to protect against modification, but neither
782 conntrack (referenced) and helper (deleted with
783 synchronize_bh()) can vanish. */
784 READ_LOCK(&ip_nat_lock);
785 for (i = 0; i < info->num_manips; i++) {
786 if (info->manips[i].direction == dir
787 && info->manips[i].hooknum == hooknum) {
788 DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
790 info->manips[i].maniptype == IP_NAT_MANIP_SRC
792 NIPQUAD(info->manips[i].manip.ip),
793 htons(info->manips[i].manip.u.all));
794 if (!manip_pkt(proto, pskb, 0,
795 &info->manips[i].manip,
796 info->manips[i].maniptype)) {
797 READ_UNLOCK(&ip_nat_lock);
802 helper = info->helper;
803 READ_UNLOCK(&ip_nat_lock);
806 struct ip_conntrack_expect *exp = NULL;
807 struct list_head *cur_item;
809 int helper_called = 0;
811 DEBUGP("do_bindings: helper existing for (%p)\n", ct);
813 /* Always defragged for helpers */
814 IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
815 & htons(IP_MF|IP_OFFSET)));
817 /* Have to grab read lock before sibling_list traversal */
818 READ_LOCK(&ip_conntrack_lock);
819 list_for_each(cur_item, &ct->sibling_list) {
820 exp = list_entry(cur_item, struct ip_conntrack_expect,
823 /* if this expectation is already established, skip */
827 if (exp_for_packet(exp, *pskb)) {
828 /* FIXME: May be true multiple times in the
830 DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
831 ret = helper->help(ct, exp, info, ctinfo,
833 if (ret != NF_ACCEPT) {
834 READ_UNLOCK(&ip_conntrack_lock);
840 /* Helper might want to manip the packet even when there is no
841 * matching expectation for this packet */
842 if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
843 DEBUGP("calling nat helper for packet without expectation\n");
844 ret = helper->help(ct, NULL, info, ctinfo,
846 if (ret != NF_ACCEPT) {
847 READ_UNLOCK(&ip_conntrack_lock);
851 READ_UNLOCK(&ip_conntrack_lock);
853 /* Adjust sequence number only once per packet
854 * (helper is called at all hooks) */
855 if (proto == IPPROTO_TCP
856 && (hooknum == NF_IP_POST_ROUTING
857 || hooknum == NF_IP_LOCAL_IN)) {
858 DEBUGP("ip_nat_core: adjusting sequence number\n");
859 /* future: put this in a l4-proto specific function,
860 * and call this function here. */
861 if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
874 icmp_reply_translation(struct sk_buff **pskb,
875 struct ip_conntrack *conntrack,
876 unsigned int hooknum,
884 struct ip_nat_info *info = &conntrack->nat.info;
887 if (!skb_ip_make_writable(pskb,(*pskb)->nh.iph->ihl*4+sizeof(*inside)))
889 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
891 /* We're actually going to mangle it beyond trivial checksum
892 adjustment, so make sure the current checksum is correct. */
893 if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
894 hdrlen = (*pskb)->nh.iph->ihl * 4;
895 if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
896 (*pskb)->len - hdrlen, 0)))
900 /* Must be RELATED */
901 IP_NF_ASSERT((*pskb)->nfct
902 - (struct ip_conntrack *)(*pskb)->nfct->master
905 - (struct ip_conntrack *)(*pskb)->nfct->master
906 == IP_CT_RELATED+IP_CT_IS_REPLY);
908 /* Redirects on non-null nats must be dropped, else they'll
909 start talking to each other without our translation, and be
911 if (inside->icmp.type == ICMP_REDIRECT) {
912 /* Don't care about races here. */
913 if (info->initialized
914 != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
915 || info->num_manips != 0)
919 DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
920 *pskb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
921 /* Note: May not be from a NAT'd host, but probably safest to
922 do translation always as if it came from the host itself
923 (even though a "host unreachable" coming from the host
924 itself is a bit weird).
926 More explanation: some people use NAT for anonymizing.
927 Also, CERT recommends dropping all packets from private IP
928 addresses (although ICMP errors from internal links with
929 such addresses are not too uncommon, as Alan Cox points
932 READ_LOCK(&ip_nat_lock);
933 for (i = 0; i < info->num_manips; i++) {
934 DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
935 i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
936 "ORIG" : "REPLY", info->manips[i].hooknum);
938 if (info->manips[i].direction != dir)
941 /* Mapping the inner packet is just like a normal
942 packet, except it was never src/dst reversed, so
943 where we would normally apply a dst manip, we apply
944 a src, and vice versa. */
945 if (info->manips[i].hooknum == hooknum) {
946 DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
947 info->manips[i].maniptype == IP_NAT_MANIP_SRC
949 NIPQUAD(info->manips[i].manip.ip),
950 ntohs(info->manips[i].manip.u.udp.port));
951 if (!manip_pkt(inside->ip.protocol, pskb,
952 (*pskb)->nh.iph->ihl*4
953 + sizeof(inside->icmp),
954 &info->manips[i].manip,
955 !info->manips[i].maniptype))
958 /* Outer packet needs to have IP header NATed like
961 /* Use mapping to map outer packet: 0 give no
963 DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
964 info->manips[i].maniptype == IP_NAT_MANIP_SRC
966 NIPQUAD(info->manips[i].manip.ip));
967 if (!manip_pkt(0, pskb, 0,
968 &info->manips[i].manip,
969 info->manips[i].maniptype))
973 READ_UNLOCK(&ip_nat_lock);
975 hdrlen = (*pskb)->nh.iph->ihl * 4;
977 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
979 inside->icmp.checksum = 0;
980 inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
981 (*pskb)->len - hdrlen,
986 READ_UNLOCK(&ip_nat_lock);
990 int __init ip_nat_init(void)
994 /* Leave them the same for the moment. */
995 ip_nat_htable_size = ip_conntrack_htable_size;
997 /* One vmalloc for both hash tables */
998 bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
1002 byipsproto = bysource + ip_nat_htable_size;
1004 /* Sew in builtin protocols. */
1005 WRITE_LOCK(&ip_nat_lock);
1006 list_append(&protos, &ip_nat_protocol_tcp);
1007 list_append(&protos, &ip_nat_protocol_udp);
1008 list_append(&protos, &ip_nat_protocol_icmp);
1009 WRITE_UNLOCK(&ip_nat_lock);
1011 for (i = 0; i < ip_nat_htable_size; i++) {
1012 INIT_LIST_HEAD(&bysource[i]);
1013 INIT_LIST_HEAD(&byipsproto[i]);
1016 /* FIXME: Man, this is a hack. <SIGH> */
1017 IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
1018 ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
1020 /* Initialize fake conntrack so that NAT will skip it */
1021 ip_conntrack_untracked.nat.info.initialized |=
1022 (1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST);
1027 /* Clear NAT section of all conntracks, in case we're loaded again. */
1028 static int clean_nat(const struct ip_conntrack *i, void *data)
1030 memset((void *)&i->nat, 0, sizeof(i->nat));
1034 /* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
1035 void ip_nat_cleanup(void)
1037 ip_ct_selective_cleanup(&clean_nat, NULL);
1038 ip_conntrack_destroyed = NULL;