/* NAT for netfilter; shared with compatibility layer. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
11 #include <linux/module.h>
12 #include <linux/types.h>
13 #include <linux/timer.h>
14 #include <linux/skbuff.h>
15 #include <linux/netfilter_ipv4.h>
16 #include <linux/vmalloc.h>
17 #include <net/checksum.h>
20 #include <net/tcp.h> /* For tcp_prot in getorigdst */
21 #include <linux/icmp.h>
22 #include <linux/udp.h>
24 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
25 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
27 #include <linux/netfilter_ipv4/ip_conntrack.h>
28 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
29 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
30 #include <linux/netfilter_ipv4/ip_nat.h>
31 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
32 #include <linux/netfilter_ipv4/ip_nat_core.h>
33 #include <linux/netfilter_ipv4/ip_nat_helper.h>
34 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
35 #include <linux/netfilter_ipv4/listhelp.h>
40 #define DEBUGP(format, args...)
/* Protects the NAT hash tables (bysource/byipsproto) and the protocol list;
 * taken for writing in ip_nat_init()/place_in_hashes()/cleanup paths below. */
43 DECLARE_RWLOCK(ip_nat_lock);
44 DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);
46 /* Calculated at init based on memory size */
47 static unsigned int ip_nat_htable_size;
/* Two hash tables carved from a single vmalloc in ip_nat_init():
 * bysource   -- conntracks keyed by original source manip (hash_by_src),
 * byipsproto -- keyed by src-ip/dst-ip/proto triple (hash_by_ipsproto). */
49 static struct list_head *bysource;
50 static struct list_head *byipsproto;
/* Fallback handler returned by find_nat_proto() when no protocol matches. */
54 extern struct ip_nat_protocol unknown_nat_protocol;
56 /* We keep extra hashes for each conntrack, for fast searching. */
58 hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
60 /* Modified src and dst, to ensure we don't create two
62 return (src + dst + proto) % ip_nat_htable_size;
66 hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
68 /* Original src, to ensure we map it consistently if poss. */
69 return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
72 /* Noone using conntrack by the time this called. */
73 static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
75 struct ip_nat_info *info = &conn->nat.info;
78 if (!info->initialized)
81 IP_NF_ASSERT(info->bysource.conntrack);
82 IP_NF_ASSERT(info->byipsproto.conntrack);
84 hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src,
85 conn->tuplehash[IP_CT_DIR_ORIGINAL]
88 hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
89 conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
90 conn->tuplehash[IP_CT_DIR_REPLY]
93 WRITE_LOCK(&ip_nat_lock);
94 LIST_DELETE(&bysource[hs], &info->bysource);
95 LIST_DELETE(&byipsproto[hp], &info->byipsproto);
96 WRITE_UNLOCK(&ip_nat_lock);
99 /* We do checksum mangling, so if they were wrong before they're still
100 * wrong. Also works for incomplete packets (eg. ICMP dest
103 ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
105 u_int32_t diffs[] = { oldvalinv, newval };
106 return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
110 static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
112 return i->protonum == proto;
115 struct ip_nat_protocol *
116 find_nat_proto(u_int16_t protonum)
118 struct ip_nat_protocol *i;
120 MUST_BE_READ_LOCKED(&ip_nat_lock);
121 i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
123 i = &unknown_nat_protocol;
127 /* Is this tuple already taken? (not by us) */
129 ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
130 const struct ip_conntrack *ignored_conntrack)
132 /* Conntrack tracking doesn't keep track of outgoing tuples; only
133 incoming ones. NAT means they don't have a fixed mapping,
134 so we invert the tuple and look for the incoming reply.
136 We could keep a separate hash if this proves too slow. */
137 struct ip_conntrack_tuple reply;
139 invert_tuplepr(&reply, tuple);
140 return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
143 /* Does tuple + the source manip come within the range mr */
145 in_range(const struct ip_conntrack_tuple *tuple,
146 const struct ip_conntrack_manip *manip,
147 const struct ip_nat_multi_range *mr)
149 struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
151 struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
153 for (i = 0; i < mr->rangesize; i++) {
154 /* If we are allowed to map IPs, then we must be in the
155 range specified, otherwise we must be unchanged. */
156 if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
157 if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
158 || (ntohl(newtuple.src.ip)
159 > ntohl(mr->range[i].max_ip)))
162 if (newtuple.src.ip != tuple->src.ip)
166 if (!(mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
167 || proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
168 &mr->range[i].min, &mr->range[i].max))
175 src_cmp(const struct ip_nat_hash *i,
176 const struct ip_conntrack_tuple *tuple,
177 const struct ip_nat_multi_range *mr)
179 return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
180 == tuple->dst.protonum
181 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
183 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
186 &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
191 /* Only called for SRC manip */
192 static struct ip_conntrack_manip *
193 find_appropriate_src(const struct ip_conntrack_tuple *tuple,
194 const struct ip_nat_multi_range *mr)
196 unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
197 struct ip_nat_hash *i;
199 MUST_BE_READ_LOCKED(&ip_nat_lock);
200 i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
202 return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
207 #ifdef CONFIG_IP_NF_NAT_LOCAL
208 /* If it's really a local destination manip, it may need to do a
211 do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
213 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };
216 /* FIXME: IPTOS_TOS(iph->tos) --RR */
217 if (ip_route_output_key(&rt, &fl) != 0) {
218 DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
223 *other_ipp = rt->rt_src;
229 /* Simple way to iterate through all. */
230 static inline int fake_cmp(const struct ip_nat_hash *i,
231 u_int32_t src, u_int32_t dst, u_int16_t protonum,
233 const struct ip_conntrack *conntrack)
235 /* Compare backwards: we're dealing with OUTGOING tuples, and
236 inside the conntrack is the REPLY tuple. Don't count this
238 if (i->conntrack != conntrack
239 && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
240 && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
241 && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
247 static inline unsigned int
248 count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
249 const struct ip_conntrack *conntrack)
251 unsigned int score = 0;
254 MUST_BE_READ_LOCKED(&ip_nat_lock);
255 h = hash_by_ipsproto(src, dst, protonum);
256 LIST_FIND(&byipsproto[h], fake_cmp, struct ip_nat_hash *,
257 src, dst, protonum, &score, conntrack);
/* Choose the least-used (src-ip/dst-ip/proto) mapping from `mr` for this
   tuple, scoring candidates via count_maps().
   NOTE(review): this block appears truncated in extraction -- declarations
   (loop counter, `score`, the struct wrapper around `best`), some else/
   break/return lines and closing braces are missing; the leading numbers
   are extraction artifacts.  Code left byte-identical. */
262 /* For [FUTURE] fragmentation handling, we want the least-used
263 src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
264 if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
265 1-65535, we don't do pro-rata allocation based on ports; we choose
266 the ip with the lowest src-ip/dst-ip/proto usage.
268 If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
269 range), we eliminate that and try again.  This is not the most
270 efficient approach, but if you're worried about that, don't hand us
271 ranges you don't really have. */
272 static struct ip_nat_range *
273 find_best_ips_proto(struct ip_conntrack_tuple *tuple,
274 const struct ip_nat_multi_range *mr,
275 const struct ip_conntrack *conntrack,
276 unsigned int hooknum)
280 const struct ip_nat_range *range;
282 struct ip_conntrack_tuple tuple;
283 } best = { NULL, 0xFFFFFFFF };
284 u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
/* Rotates the starting offset within [minip, maxip]; presumably advanced
   on a truncated line below -- TODO confirm against the full source. */
285 static unsigned int randomness;
/* SRC manip varies the source IP; otherwise the destination IP varies. */
287 if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
288 var_ipp = &tuple->src.ip;
289 saved_ip = tuple->dst.ip;
290 other_ipp = &tuple->dst.ip;
292 var_ipp = &tuple->dst.ip;
293 saved_ip = tuple->src.ip;
294 other_ipp = &tuple->src.ip;
296 /* Don't do do_extra_mangle unless necessary (overrides
297 explicit socket bindings, for example) */
298 orig_dstip = tuple->dst.ip;
300 IP_NF_ASSERT(mr->rangesize >= 1);
301 for (i = 0; i < mr->rangesize; i++) {
303 u_int32_t minip, maxip, j;
305 /* Don't do ranges which are already eliminated. */
306 if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
310 if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
311 minip = ntohl(mr->range[i].min_ip);
312 maxip = ntohl(mr->range[i].max_ip);
314 minip = maxip = ntohl(*var_ipp);
317 for (j = 0; j < maxip - minip + 1; j++) {
320 *var_ipp = htonl(minip + (randomness + j)
321 % (maxip - minip + 1));
323 /* Reset the other ip in case it was mangled by
324 * do_extra_mangle last time. */
325 *other_ipp = saved_ip;
327 #ifdef CONFIG_IP_NF_NAT_LOCAL
328 if (hooknum == NF_IP_LOCAL_OUT
329 && *var_ipp != orig_dstip
330 && !do_extra_mangle(*var_ipp, other_ipp)) {
331 DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
332 i, NIPQUAD(*var_ipp));
333 /* Can't route? This whole range part is
334 * probably screwed, but keep trying
340 /* Count how many others map onto this. */
341 score = count_maps(tuple->src.ip, tuple->dst.ip,
342 tuple->dst.protonum, conntrack);
/* Lower score == fewer existing conntracks share this triple. */
343 if (score < best.score) {
344 /* Optimization: doesn't get any better than
347 return (struct ip_nat_range *)
352 best.range = &mr->range[i];
359 return (struct ip_nat_range *)best.range;
/* NOTE(review): truncated extraction -- else-branches, some returns and
   the trailing #endif appear to be missing; leading numbers are artifacts.
   Code left byte-identical. */
362 /* Fast version doesn't iterate through hash chains, but only handles
363 common case of single IP address (null NAT, masquerade) */
364 static struct ip_nat_range *
365 find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
366 const struct ip_nat_multi_range *mr,
367 const struct ip_conntrack *conntrack,
368 unsigned int hooknum)
/* Anything other than "exactly one usable IP" falls back to the slow,
   hash-walking version above. */
370 if (mr->rangesize != 1
371 || (mr->range[0].flags & IP_NAT_RANGE_FULL)
372 || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
373 && mr->range[0].min_ip != mr->range[0].max_ip))
374 return find_best_ips_proto(tuple, mr, conntrack, hooknum);
376 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
377 if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
378 tuple->src.ip = mr->range[0].min_ip;
380 /* Only do extra mangle when required (breaks
382 #ifdef CONFIG_IP_NF_NAT_LOCAL
383 if (tuple->dst.ip != mr->range[0].min_ip
384 && hooknum == NF_IP_LOCAL_OUT
385 && !do_extra_mangle(mr->range[0].min_ip,
389 tuple->dst.ip = mr->range[0].min_ip;
394 return (struct ip_nat_range *)&mr->range[0];
/* Pick a unique NAT'd tuple for this connection within range `mrr`:
   (1) reuse an existing source mapping when possible, (2) pick the
   least-used IP/proto combination, (3) let the protocol map the
   per-proto part.  Per its callers, a zero return means failure.
   NOTE(review): the return-type line (`static int`), several branches,
   returns and closing braces are missing from this extraction; leading
   numbers are artifacts.  Code left byte-identical. */
398 get_unique_tuple(struct ip_conntrack_tuple *tuple,
399 const struct ip_conntrack_tuple *orig_tuple,
400 const struct ip_nat_multi_range *mrr,
401 struct ip_conntrack *conntrack,
402 unsigned int hooknum)
404 struct ip_nat_protocol *proto
405 = find_nat_proto(orig_tuple->dst.protonum);
406 struct ip_nat_range *rptr;
410 /* We temporarily use flags for marking full parts, but we
411 always clean up afterwards */
/* Casting away const on mrr: IP_NAT_RANGE_FULL bits are set during the
   search and cleared again before returning (see "Clear full flags"). */
412 struct ip_nat_multi_range *mr = (void *)mrr;
414 /* 1) If this srcip/proto/src-proto-part is currently mapped,
415 and that same mapping gives a unique tuple within the given
418 This is only required for source (ie. NAT/masq) mappings.
419 So far, we don't do local source mappings, so multiple
420 manips not an issue. */
421 if (hooknum == NF_IP_POST_ROUTING) {
422 struct ip_conntrack_manip *manip;
424 manip = find_appropriate_src(orig_tuple, mr);
426 /* Apply same source manipulation. */
427 *tuple = ((struct ip_conntrack_tuple)
428 { *manip, orig_tuple->dst });
429 DEBUGP("get_unique_tuple: Found current src map\n");
430 if (!ip_nat_used_tuple(tuple, conntrack))
435 /* 2) Select the least-used IP/proto combination in the given
438 *tuple = *orig_tuple;
439 while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
441 DEBUGP("Found best for "); DUMP_TUPLE(tuple);
442 /* 3) The per-protocol part of the manip is made to
443 map into the range to make a unique tuple. */
445 /* Only bother mapping if it's not already in range
447 if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
448 || proto->in_range(tuple, HOOK2MANIP(hooknum),
449 &rptr->min, &rptr->max))
450 && !ip_nat_used_tuple(tuple, conntrack)) {
454 if (proto->unique_tuple(tuple, rptr,
457 /* Must be unique. */
458 IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
462 } else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
463 /* Try implicit source NAT; protocol
464 may be able to play with ports to
466 struct ip_nat_range r
467 = { IP_NAT_RANGE_MAP_IPS,
468 tuple->src.ip, tuple->src.ip,
470 DEBUGP("Trying implicit mapping\n");
471 if (proto->unique_tuple(tuple, &r,
474 /* Must be unique. */
475 IP_NF_ASSERT(!ip_nat_used_tuple
481 DEBUGP("Protocol can't get unique tuple %u.\n",
485 /* Eliminate that from range, and try again. */
486 rptr->flags |= IP_NAT_RANGE_FULL;
487 *tuple = *orig_tuple;
493 /* Clear full flags. */
494 IP_NF_ASSERT(mr->rangesize >= 1);
495 for (i = 0; i < mr->rangesize; i++)
496 mr->range[i].flags &= ~IP_NAT_RANGE_FULL;
502 helper_cmp(const struct ip_nat_helper *helper,
503 const struct ip_conntrack_tuple *tuple)
505 return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
508 /* Where to manip the reply packets (will be reverse manip). */
509 static unsigned int opposite_hook[NF_IP_NUMHOOKS]
510 = { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
511 [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
512 #ifdef CONFIG_IP_NF_NAT_LOCAL
513 [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
514 [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
/* Establish the NAT manipulations for this conntrack according to range
   `mr` at hook `hooknum`: find a unique mapped tuple, update the
   conntrack's reply tuple, record up to four manips (forward/reverse x
   src/dst), bind a helper, and (re)insert into the NAT hashes.  Caller
   holds ip_nat_lock for writing (asserted below).
   NOTE(review): extraction is lossy -- the return-type line, `do {`
   opener for the alter-reply loop, return statements and several braces
   are missing; leading numbers are artifacts.  Code left byte-identical. */
519 ip_nat_setup_info(struct ip_conntrack *conntrack,
520 const struct ip_nat_multi_range *mr,
521 unsigned int hooknum)
523 struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
524 struct ip_conntrack_tuple orig_tp;
525 struct ip_nat_info *info = &conntrack->nat.info;
/* Nonzero when this conntrack is already in the NAT hashes (a prior
   manip initialized it) -- decides replace_in_hashes vs place_in_hashes. */
526 int in_hashes = info->initialized;
528 MUST_BE_WRITE_LOCKED(&ip_nat_lock);
529 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
530 || hooknum == NF_IP_POST_ROUTING
531 || hooknum == NF_IP_LOCAL_IN
532 || hooknum == NF_IP_LOCAL_OUT);
533 IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
534 IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum))));
536 /* What we've got will look like inverse of reply. Normally
537 this is what is in the conntrack, except for prior
538 manipulations (future optimization: if num_manips == 0,
540 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
541 invert_tuplepr(&orig_tp,
542 &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
548 DEBUGP("Hook %u (%s), ", hooknum,
549 HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
550 DUMP_TUPLE(&orig_tp);
551 DEBUGP("Range %p: ", mr);
552 for (i = 0; i < mr->rangesize; i++) {
553 DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
555 (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
558 & IP_NAT_RANGE_PROTO_SPECIFIED)
559 ? " PROTO_SPECIFIED" : "",
560 (mr->range[i].flags & IP_NAT_RANGE_FULL)
562 NIPQUAD(mr->range[i].min_ip),
563 NIPQUAD(mr->range[i].max_ip),
564 mr->range[i].min.all,
565 mr->range[i].max.all);
571 if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
573 DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
579 DEBUGP("Hook %u (%s) %p\n", hooknum,
580 HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
582 DEBUGP("Original: ");
583 DUMP_TUPLE(&orig_tp);
585 DUMP_TUPLE(&new_tuple);
588 /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
589 the original (A/B/C/D') and the mangled one (E/F/G/H').
591 We're only allowed to work with the SRC per-proto
592 part, so we create inverses of both to start, then
593 derive the other fields we need. */
595 /* Reply connection: simply invert the new tuple
597 invert_tuplepr(&reply, &new_tuple);
599 /* Alter conntrack table so it recognizes replies.
600 If fail this race (reply tuple now used), repeat. */
601 } while (!ip_conntrack_alter_reply(conntrack, &reply));
603 /* FIXME: We can simply used existing conntrack reply tuple
605 /* Create inverse of original: C/D/A/B' */
606 invert_tuplepr(&inv_tuple, &orig_tp);
608 /* Has source changed?. */
609 if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
610 /* In this direction, a source manip. */
611 info->manips[info->num_manips++] =
612 ((struct ip_nat_info_manip)
613 { IP_CT_DIR_ORIGINAL, hooknum,
614 IP_NAT_MANIP_SRC, new_tuple.src });
616 IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
618 /* In the reverse direction, a destination manip. */
619 info->manips[info->num_manips++] =
620 ((struct ip_nat_info_manip)
621 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
622 IP_NAT_MANIP_DST, orig_tp.src });
623 IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
626 /* Has destination changed? */
627 if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
628 /* In this direction, a destination manip */
629 info->manips[info->num_manips++] =
630 ((struct ip_nat_info_manip)
631 { IP_CT_DIR_ORIGINAL, hooknum,
632 IP_NAT_MANIP_DST, reply.src });
634 IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
636 /* In the reverse direction, a source manip. */
637 info->manips[info->num_manips++] =
638 ((struct ip_nat_info_manip)
639 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
640 IP_NAT_MANIP_SRC, inv_tuple.src });
641 IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
644 /* If there's a helper, assign it; based on new tuple. */
645 if (!conntrack->master)
646 info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
650 info->initialized |= (1 << HOOK2MANIP(hooknum));
653 IP_NF_ASSERT(info->bysource.conntrack);
654 replace_in_hashes(conntrack, info);
656 place_in_hashes(conntrack, info);
662 void replace_in_hashes(struct ip_conntrack *conntrack,
663 struct ip_nat_info *info)
665 /* Source has changed, so replace in hashes. */
667 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
669 conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
670 .tuple.dst.protonum);
671 /* We place packet as seen OUTGOUNG in byips_proto hash
672 (ie. reverse dst and src of reply packet. */
673 unsigned int ipsprotohash
674 = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
676 conntrack->tuplehash[IP_CT_DIR_REPLY]
678 conntrack->tuplehash[IP_CT_DIR_REPLY]
679 .tuple.dst.protonum);
681 IP_NF_ASSERT(info->bysource.conntrack == conntrack);
682 MUST_BE_WRITE_LOCKED(&ip_nat_lock);
684 list_del(&info->bysource.list);
685 list_del(&info->byipsproto.list);
687 list_prepend(&bysource[srchash], &info->bysource);
688 list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
691 void place_in_hashes(struct ip_conntrack *conntrack,
692 struct ip_nat_info *info)
695 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
697 conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
698 .tuple.dst.protonum);
699 /* We place packet as seen OUTGOUNG in byips_proto hash
700 (ie. reverse dst and src of reply packet. */
701 unsigned int ipsprotohash
702 = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
704 conntrack->tuplehash[IP_CT_DIR_REPLY]
706 conntrack->tuplehash[IP_CT_DIR_REPLY]
707 .tuple.dst.protonum);
709 IP_NF_ASSERT(!info->bysource.conntrack);
711 MUST_BE_WRITE_LOCKED(&ip_nat_lock);
712 info->byipsproto.conntrack = conntrack;
713 info->bysource.conntrack = conntrack;
715 list_prepend(&bysource[srchash], &info->bysource);
716 list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
719 /* Returns true if succeeded. */
721 manip_pkt(u_int16_t proto,
722 struct sk_buff **pskb,
723 unsigned int iphdroff,
724 const struct ip_conntrack_manip *manip,
725 enum ip_nat_manip_type maniptype)
729 (*pskb)->nfcache |= NFC_ALTERED;
730 if (!skb_ip_make_writable(pskb, iphdroff+sizeof(iph)))
733 iph = (void *)(*pskb)->data + iphdroff;
735 /* Manipulate protcol part. */
736 if (!find_nat_proto(proto)->manip_pkt(pskb,
737 iphdroff + iph->ihl*4,
741 iph = (void *)(*pskb)->data + iphdroff;
743 if (maniptype == IP_NAT_MANIP_SRC) {
744 iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
746 iph->saddr = manip->ip;
748 iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
750 iph->daddr = manip->ip;
755 static inline int exp_for_packet(struct ip_conntrack_expect *exp,
758 struct ip_conntrack_protocol *proto;
761 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
762 proto = __ip_ct_find_proto(skb->nh.iph->protocol);
763 if (proto->exp_matches_pkt)
764 ret = proto->exp_matches_pkt(exp, skb);
769 /* Do packet manipulations according to binding. */
/* Apply this conntrack's recorded manips matching (direction, hook) to
   the packet, then run the NAT helper (per-expectation, or always when
   the helper sets IP_NAT_HELPER_F_ALWAYS), and for TCP adjust sequence
   numbers once per packet.  Returns a netfilter verdict (NF_ACCEPT &c.,
   per the `ret != NF_ACCEPT` checks below).
   NOTE(review): extraction lossy -- declarations of `i`/`ret`, the
   return-type line, several braces and return statements are missing;
   leading numbers are artifacts.  Code left byte-identical. */
771 do_bindings(struct ip_conntrack *ct,
772 enum ip_conntrack_info ctinfo,
773 struct ip_nat_info *info,
774 unsigned int hooknum,
775 struct sk_buff **pskb)
778 struct ip_nat_helper *helper;
779 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
780 int proto = (*pskb)->nh.iph->protocol;
782 /* Need nat lock to protect against modification, but neither
783 conntrack (referenced) and helper (deleted with
784 synchronize_bh()) can vanish. */
785 READ_LOCK(&ip_nat_lock);
786 for (i = 0; i < info->num_manips; i++) {
787 if (info->manips[i].direction == dir
788 && info->manips[i].hooknum == hooknum) {
789 DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
791 info->manips[i].maniptype == IP_NAT_MANIP_SRC
793 NIPQUAD(info->manips[i].manip.ip),
794 htons(info->manips[i].manip.u.all))
795 if (!manip_pkt(proto, pskb, 0,
796 &info->manips[i].manip,
797 info->manips[i].maniptype)) {
798 READ_UNLOCK(&ip_nat_lock);
803 helper = info->helper;
804 READ_UNLOCK(&ip_nat_lock);
807 struct ip_conntrack_expect *exp = NULL;
808 struct list_head *cur_item;
810 int helper_called = 0;
812 DEBUGP("do_bindings: helper existing for (%p)\n", ct);
814 /* Always defragged for helpers */
815 IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
816 & htons(IP_MF|IP_OFFSET)));
818 /* Have to grab read lock before sibling_list traversal */
819 READ_LOCK(&ip_conntrack_lock);
820 list_for_each_prev(cur_item, &ct->sibling_list) {
821 exp = list_entry(cur_item, struct ip_conntrack_expect,
824 /* if this expectation is already established, skip */
828 if (exp_for_packet(exp, *pskb)) {
829 /* FIXME: May be true multiple times in the
831 DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
832 ret = helper->help(ct, exp, info, ctinfo,
834 if (ret != NF_ACCEPT) {
835 READ_UNLOCK(&ip_conntrack_lock);
841 /* Helper might want to manip the packet even when there is no
842 * matching expectation for this packet */
843 if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
844 DEBUGP("calling nat helper for packet without expectation\n");
845 ret = helper->help(ct, NULL, info, ctinfo,
847 if (ret != NF_ACCEPT) {
848 READ_UNLOCK(&ip_conntrack_lock);
852 READ_UNLOCK(&ip_conntrack_lock);
854 /* Adjust sequence number only once per packet
855 * (helper is called at all hooks) */
856 if (proto == IPPROTO_TCP
857 && (hooknum == NF_IP_POST_ROUTING
858 || hooknum == NF_IP_LOCAL_IN)) {
859 DEBUGP("ip_nat_core: adjusting sequence number\n");
860 /* future: put this in a l4-proto specific function,
861 * and call this function here. */
862 if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
/* Translate an ICMP error packet: NAT the embedded ("inner") packet that
   triggered the error with inverted manip direction (it was never
   src/dst-reversed), NAT the outer IP header normally, drop suspicious
   REDIRECTs, then recompute the ICMP checksum over the mangled payload.
   NOTE(review): extraction lossy -- the return-type line and declarations
   of `inside`, `dir`, `i`, `hdrlen`, plus continue/return lines and
   braces, are missing; leading numbers are artifacts.  Code left
   byte-identical. */
875 icmp_reply_translation(struct sk_buff **pskb,
876 struct ip_conntrack *conntrack,
877 unsigned int hooknum,
885 struct ip_nat_info *info = &conntrack->nat.info;
888 if (!skb_ip_make_writable(pskb,(*pskb)->nh.iph->ihl*4+sizeof(*inside)))
890 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
892 /* We're actually going to mangle it beyond trivial checksum
893 adjustment, so make sure the current checksum is correct. */
894 if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
895 hdrlen = (*pskb)->nh.iph->ihl * 4;
/* A nonzero fold means the incoming ICMP checksum is bad: bail out. */
896 if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
897 (*pskb)->len - hdrlen, 0)))
901 /* Must be RELATED */
902 IP_NF_ASSERT((*pskb)->nfct
903 - ((struct ip_conntrack *)(*pskb)->nfct->master)->infos
906 - ((struct ip_conntrack *)(*pskb)->nfct->master)->infos
907 == IP_CT_RELATED+IP_CT_IS_REPLY);
909 /* Redirects on non-null nats must be dropped, else they'll
910 start talking to each other without our translation, and be
912 if (inside->icmp.type == ICMP_REDIRECT) {
913 /* Don't care about races here. */
914 if (info->initialized
915 != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
916 || info->num_manips != 0)
920 DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
921 *pskb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
922 /* Note: May not be from a NAT'd host, but probably safest to
923 do translation always as if it came from the host itself
924 (even though a "host unreachable" coming from the host
925 itself is a bit weird).
927 More explanation: some people use NAT for anonymizing.
928 Also, CERT recommends dropping all packets from private IP
929 addresses (although ICMP errors from internal links with
930 such addresses are not too uncommon, as Alan Cox points
933 READ_LOCK(&ip_nat_lock);
934 for (i = 0; i < info->num_manips; i++) {
935 DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
936 i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
937 "ORIG" : "REPLY", info->manips[i].hooknum);
939 if (info->manips[i].direction != dir)
942 /* Mapping the inner packet is just like a normal
943 packet, except it was never src/dst reversed, so
944 where we would normally apply a dst manip, we apply
945 a src, and vice versa. */
946 if (info->manips[i].hooknum == hooknum) {
947 DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
948 info->manips[i].maniptype == IP_NAT_MANIP_SRC
950 NIPQUAD(info->manips[i].manip.ip),
951 ntohs(info->manips[i].manip.u.udp.port));
952 if (!manip_pkt(inside->ip.protocol, pskb,
953 (*pskb)->nh.iph->ihl*4
954 + sizeof(inside->icmp),
955 &info->manips[i].manip,
956 !info->manips[i].maniptype))
959 /* Outer packet needs to have IP header NATed like
962 /* Use mapping to map outer packet: 0 give no
964 DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
965 info->manips[i].maniptype == IP_NAT_MANIP_SRC
967 NIPQUAD(info->manips[i].manip.ip));
968 if (!manip_pkt(0, pskb, 0,
969 &info->manips[i].manip,
970 info->manips[i].maniptype))
974 READ_UNLOCK(&ip_nat_lock);
976 hdrlen = (*pskb)->nh.iph->ihl * 4;
/* Re-fetch: manip_pkt may have reallocated the skb data. */
978 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
980 inside->icmp.checksum = 0;
981 inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
982 (*pskb)->len - hdrlen,
991 int __init ip_nat_init(void)
995 /* Leave them the same for the moment. */
996 ip_nat_htable_size = ip_conntrack_htable_size;
998 /* One vmalloc for both hash tables */
999 bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
1003 byipsproto = bysource + ip_nat_htable_size;
1005 /* Sew in builtin protocols. */
1006 WRITE_LOCK(&ip_nat_lock);
1007 list_append(&protos, &ip_nat_protocol_tcp);
1008 list_append(&protos, &ip_nat_protocol_udp);
1009 list_append(&protos, &ip_nat_protocol_icmp);
1010 WRITE_UNLOCK(&ip_nat_lock);
1012 for (i = 0; i < ip_nat_htable_size; i++) {
1013 INIT_LIST_HEAD(&bysource[i]);
1014 INIT_LIST_HEAD(&byipsproto[i]);
1017 /* FIXME: Man, this is a hack. <SIGH> */
1018 IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
1019 ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
1021 /* Initialize fake conntrack so that NAT will skip it */
1022 ip_conntrack_untracked.nat.info.initialized |=
1023 (1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST);
1028 /* Clear NAT section of all conntracks, in case we're loaded again. */
1029 static int clean_nat(const struct ip_conntrack *i, void *data)
1031 memset((void *)&i->nat, 0, sizeof(i->nat));
1035 /* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
1036 void ip_nat_cleanup(void)
1038 ip_ct_selective_cleanup(&clean_nat, NULL);
1039 ip_conntrack_destroyed = NULL;