/* NAT for netfilter; shared with compatibility layer. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/tcp.h>  /* For tcp_prot in getorigdst */
#include <linux/icmp.h>
#include <linux/udp.h>
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_nat.h>
#include <linux/netfilter_ipv4/ip_nat_protocol.h>
#include <linux/netfilter_ipv4/ip_nat_core.h>
#include <linux/netfilter_ipv4/ip_nat_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/listhelp.h>
#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DECLARE_RWLOCK(ip_nat_lock);
DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);

/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;

static struct list_head *bysource;
static struct list_head *byipsproto;
struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
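/* bysource and byipsproto are the NAT-specific hash tables; ip_nat_protos
 * holds the per-protocol NAT handlers, indexed by IP protocol number.  All
 * three are updated under ip_nat_lock (see ip_nat_init() below). */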
/* We keep extra hashes for each conntrack, for fast searching. */
static inline unsigned int
hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
{
	/* Modified src and dst, to ensure we don't create two
	   identical streams. */
	return (src + dst + proto) % ip_nat_htable_size;
}

static inline unsigned int
hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
{
	/* Original src, to ensure we map it consistently if poss. */
	return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
}

/* No one is using the conntrack by the time this is called. */
static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
{
	struct ip_nat_info *info = &conn->nat.info;
	unsigned int hs, hp;

	if (!info->initialized)
		return;

	hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src,
			 conn->tuplehash[IP_CT_DIR_ORIGINAL]
			 .tuple.dst.protonum);

	hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
			      conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
			      conn->tuplehash[IP_CT_DIR_REPLY]
			      .tuple.dst.protonum);

	WRITE_LOCK(&ip_nat_lock);
	list_del(&info->bysource);
	list_del(&info->byipsproto);
	WRITE_UNLOCK(&ip_nat_lock);
}
/* We do checksum mangling, so if they were wrong before they're still
 * wrong.  Also works for incomplete packets (eg. ICMP dest
 * unreachables.) */
u_int16_t
ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
{
	u_int32_t diffs[] = { oldvalinv, newval };
	return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
				      oldcheck ^ 0xFFFF));
}
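/* Example use (see manip_pkt() below): to rewrite the source address,
 *	iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip, iph->check);
 * i.e. pass the complement of the old value, the new value and the old
 * checksum; the checksum is adjusted incrementally, with no recomputation
 * over the rest of the packet. */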
/* Is this tuple already taken? (not by us) */
int
ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
		  const struct ip_conntrack *ignored_conntrack)
{
	/* Conntrack tracking doesn't keep track of outgoing tuples; only
	   incoming ones.  NAT means they don't have a fixed mapping,
	   so we invert the tuple and look for the incoming reply.

	   We could keep a separate hash if this proves too slow. */
	struct ip_conntrack_tuple reply;

	invert_tuplepr(&reply, tuple);
	return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
}
/* Does tuple + the source manip come within the range mr? */
static int
in_range(const struct ip_conntrack_tuple *tuple,
	 const struct ip_conntrack_manip *manip,
	 const struct ip_nat_multi_range *mr)
{
	struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
	unsigned int i;
	struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };

	for (i = 0; i < mr->rangesize; i++) {
		/* If we are allowed to map IPs, then we must be in the
		   range specified, otherwise we must be unchanged. */
		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
			if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
			    || (ntohl(newtuple.src.ip)
				> ntohl(mr->range[i].max_ip)))
				continue;
		} else {
			if (newtuple.src.ip != tuple->src.ip)
				continue;
		}

		if (!(mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
		    || proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
				       &mr->range[i].min, &mr->range[i].max))
			return 1;
	}
	return 0;
}
static inline int
src_cmp(const struct ip_conntrack *ct,
	const struct ip_conntrack_tuple *tuple,
	const struct ip_nat_multi_range *mr)
{
	return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
		== tuple->dst.protonum
		&& ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
		== tuple->src.ip
		&& ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
		== tuple->src.u.all
		&& in_range(tuple,
			    &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src, mr));
}
/* Only called for SRC manip */
static struct ip_conntrack_manip *
find_appropriate_src(const struct ip_conntrack_tuple *tuple,
		     const struct ip_nat_multi_range *mr)
{
	unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
	struct ip_conntrack *ct;

	MUST_BE_READ_LOCKED(&ip_nat_lock);
	list_for_each_entry(ct, &bysource[h], nat.info.bysource)
		if (src_cmp(ct, tuple, mr))
			return &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
	return NULL;
}

#ifdef CONFIG_IP_NF_NAT_LOCAL
/* If it's really a local destination manip, it may need to do a
   source manip too. */
static int
do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
{
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };
	struct rtable *rt;

	/* FIXME: IPTOS_TOS(iph->tos) --RR */
	if (ip_route_output_key(&rt, &fl) != 0) {
		DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
		       NIPQUAD(var_ip));
		return 0;
	}

	*other_ipp = rt->rt_src;
	ip_rt_put(rt);
	return 1;
}
#endif

/* Simple way to iterate through all. */
static inline int fake_cmp(const struct ip_conntrack *ct,
			   u_int32_t src, u_int32_t dst, u_int16_t protonum,
			   unsigned int *score, const struct ip_conntrack *ct2)
{
	/* Compare backwards: we're dealing with OUTGOING tuples, and
	   inside the conntrack is the REPLY tuple.  Don't count this
	   conntrack. */
	if (ct != ct2
	    && ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
	    && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
	    && (ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum == protonum))
		(*score)++;
	return 0;
}
static inline unsigned int
count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
	   const struct ip_conntrack *conntrack)
{
	struct ip_conntrack *ct;
	unsigned int score = 0;
	unsigned int h;

	MUST_BE_READ_LOCKED(&ip_nat_lock);
	h = hash_by_ipsproto(src, dst, protonum);
	list_for_each_entry(ct, &byipsproto[h], nat.info.byipsproto)
		fake_cmp(ct, src, dst, protonum, &score, conntrack);

	return score;
}
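/* count_maps() thus returns how many existing connections already map onto a
 * given src/dst/proto triple; find_best_ips_proto() below picks the candidate
 * IP with the lowest count. */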
/* For [FUTURE] fragmentation handling, we want the least-used
   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
   1-65535, we don't do pro-rata allocation based on ports; we choose
   the ip with the lowest src-ip/dst-ip/proto usage.

   If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
   range), we eliminate that and try again.  This is not the most
   efficient approach, but if you're worried about that, don't hand us
   ranges you don't really have. */
static struct ip_nat_range *
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
		    const struct ip_nat_multi_range *mr,
		    const struct ip_conntrack *conntrack,
		    unsigned int hooknum)
{
	unsigned int i;
	struct {
		const struct ip_nat_range *range;
		unsigned int score;
		struct ip_conntrack_tuple tuple;
	} best = { NULL, 0xFFFFFFFF };
	u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
	static unsigned int randomness;

	if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
		var_ipp = &tuple->src.ip;
		saved_ip = tuple->dst.ip;
		other_ipp = &tuple->dst.ip;
	} else {
		var_ipp = &tuple->dst.ip;
		saved_ip = tuple->src.ip;
		other_ipp = &tuple->src.ip;
	}
	/* Don't do do_extra_mangle unless necessary (overrides
	   explicit socket bindings, for example) */
	orig_dstip = tuple->dst.ip;

	IP_NF_ASSERT(mr->rangesize >= 1);
	for (i = 0; i < mr->rangesize; i++) {
		/* Host order */
		u_int32_t minip, maxip, j;

		/* Don't do ranges which are already eliminated. */
		if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
			continue;
		}

		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
			minip = ntohl(mr->range[i].min_ip);
			maxip = ntohl(mr->range[i].max_ip);
		} else
			minip = maxip = ntohl(*var_ipp);

		randomness++;
		for (j = 0; j < maxip - minip + 1; j++) {
			unsigned int score;

			*var_ipp = htonl(minip + (randomness + j)
					 % (maxip - minip + 1));

			/* Reset the other ip in case it was mangled by
			 * do_extra_mangle last time. */
			*other_ipp = saved_ip;

#ifdef CONFIG_IP_NF_NAT_LOCAL
			if (hooknum == NF_IP_LOCAL_OUT
			    && *var_ipp != orig_dstip
			    && !do_extra_mangle(*var_ipp, other_ipp)) {
				DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
				       i, NIPQUAD(*var_ipp));
				/* Can't route?  This whole range part is
				 * probably screwed, but keep trying
				 * anyway. */
				continue;
			}
#endif

			/* Count how many others map onto this. */
			score = count_maps(tuple->src.ip, tuple->dst.ip,
					   tuple->dst.protonum, conntrack);
			if (score < best.score) {
				/* Optimization: doesn't get any better than
				   this. */
				if (score == 0)
					return (struct ip_nat_range *)
						&mr->range[i];

				best.score = score;
				best.tuple = *tuple;
				best.range = &mr->range[i];
			}
		}
	}
	*tuple = best.tuple;

	/* Discard const. */
	return (struct ip_nat_range *)best.range;
}
/* Fast version doesn't iterate through hash chains, but only handles
   common case of single IP address (null NAT, masquerade) */
static struct ip_nat_range *
find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
			 const struct ip_nat_multi_range *mr,
			 const struct ip_conntrack *conntrack,
			 unsigned int hooknum)
{
	if (mr->rangesize != 1
	    || (mr->range[0].flags & IP_NAT_RANGE_FULL)
	    || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
		&& mr->range[0].min_ip != mr->range[0].max_ip))
		return find_best_ips_proto(tuple, mr, conntrack, hooknum);

	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
		if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
			tuple->src.ip = mr->range[0].min_ip;
		else {
			/* Only do extra mangle when required (breaks
			   socket binding) */
#ifdef CONFIG_IP_NF_NAT_LOCAL
			if (tuple->dst.ip != mr->range[0].min_ip
			    && hooknum == NF_IP_LOCAL_OUT
			    && !do_extra_mangle(mr->range[0].min_ip,
						&tuple->src.ip))
				return NULL;
#endif
			tuple->dst.ip = mr->range[0].min_ip;
		}
	}

	/* Discard const. */
	return (struct ip_nat_range *)&mr->range[0];
}
static int
get_unique_tuple(struct ip_conntrack_tuple *tuple,
		 const struct ip_conntrack_tuple *orig_tuple,
		 const struct ip_nat_multi_range *mrr,
		 struct ip_conntrack *conntrack,
		 unsigned int hooknum)
{
	struct ip_nat_protocol *proto
		= ip_nat_find_proto(orig_tuple->dst.protonum);
	struct ip_nat_range *rptr;
	unsigned int i;
	int ret;

	/* We temporarily use flags for marking full parts, but we
	   always clean up afterwards. */
	struct ip_nat_multi_range *mr = (void *)mrr;

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	   and that same mapping gives a unique tuple within the given
	   range, use that.

	   This is only required for source (ie. NAT/masq) mappings.
	   So far, we don't do local source mappings, so multiple
	   manips are not an issue. */
	if (hooknum == NF_IP_POST_ROUTING) {
		struct ip_conntrack_manip *manip;

		manip = find_appropriate_src(orig_tuple, mr);
		if (manip) {
			/* Apply same source manipulation. */
			*tuple = ((struct ip_conntrack_tuple)
				  { *manip, orig_tuple->dst });
			DEBUGP("get_unique_tuple: Found current src map\n");
			if (!ip_nat_used_tuple(tuple, conntrack))
				return 1;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given
	   range. */
	*tuple = *orig_tuple;
	while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
	       != NULL) {
		DEBUGP("Found best for "); DUMP_TUPLE(tuple);
		/* 3) The per-protocol part of the manip is made to
		   map into the range to make a unique tuple. */

		/* Only bother mapping if it's not already in range
		   and unique. */
		if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
		     || proto->in_range(tuple, HOOK2MANIP(hooknum),
					&rptr->min, &rptr->max))
		    && !ip_nat_used_tuple(tuple, conntrack)) {
			ret = 1;
			goto clear_fulls;
		} else {
			if (proto->unique_tuple(tuple, rptr,
						HOOK2MANIP(hooknum),
						conntrack)) {
				/* Must be unique. */
				IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
								conntrack));
				ret = 1;
				goto clear_fulls;
			} else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
				/* Try implicit source NAT; protocol
				   may be able to play with ports to
				   make it unique. */
				struct ip_nat_range r
					= { IP_NAT_RANGE_MAP_IPS,
					    tuple->src.ip, tuple->src.ip,
					    { 0 }, { 0 } };
				DEBUGP("Trying implicit mapping\n");
				if (proto->unique_tuple(tuple, &r,
							IP_NAT_MANIP_SRC,
							conntrack)) {
					/* Must be unique. */
					IP_NF_ASSERT(!ip_nat_used_tuple
						     (tuple, conntrack));
					ret = 1;
					goto clear_fulls;
				}
			}
			DEBUGP("Protocol can't get unique tuple %u.\n",
			       hooknum);
		}

		/* Eliminate that from range, and try again. */
		rptr->flags |= IP_NAT_RANGE_FULL;
		*tuple = *orig_tuple;
	}
	ret = 0;

 clear_fulls:
	/* Clear full flags. */
	IP_NF_ASSERT(mr->rangesize >= 1);
	for (i = 0; i < mr->rangesize; i++)
		mr->range[i].flags &= ~IP_NAT_RANGE_FULL;

	return ret;
}
/* Where to manip the reply packets (will be reverse manip). */
static unsigned int opposite_hook[NF_IP_NUMHOOKS]
= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
    [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
#ifdef CONFIG_IP_NF_NAT_LOCAL
    [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
    [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
#endif
};
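/* Pick a binding for this connection from the given range and record the
 * resulting manips.  Called with ip_nat_lock held for writing (e.g. from the
 * SNAT/DNAT/MASQUERADE targets); returns NF_ACCEPT, or NF_DROP if no unique
 * tuple can be found. */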
unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
		  const struct ip_nat_multi_range *mr,
		  unsigned int hooknum)
{
	struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
	struct ip_conntrack_tuple orig_tp;
	struct ip_nat_info *info = &conntrack->nat.info;
	int in_hashes = info->initialized;

	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
		     || hooknum == NF_IP_POST_ROUTING
		     || hooknum == NF_IP_LOCAL_IN
		     || hooknum == NF_IP_LOCAL_OUT);
	IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
	IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum))));

	/* What we've got will look like inverse of reply.  Normally
	   this is what is in the conntrack, except for prior
	   manipulations (future optimization: if num_manips == 0,
	   orig_tp = conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
	invert_tuplepr(&orig_tp,
		       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

#if 0
	{
	unsigned int i;

	DEBUGP("Hook %u (%s), ", hooknum,
	       HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ? "SRC" : "DST");
	DUMP_TUPLE(&orig_tp);
	DEBUGP("Range %p: ", mr);
	for (i = 0; i < mr->rangesize; i++) {
		DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
		       i,
		       (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
		       ? " MAP_IPS" : "",
		       (mr->range[i].flags
			& IP_NAT_RANGE_PROTO_SPECIFIED)
		       ? " PROTO_SPECIFIED" : "",
		       (mr->range[i].flags & IP_NAT_RANGE_FULL)
		       ? " FULL" : "",
		       NIPQUAD(mr->range[i].min_ip),
		       NIPQUAD(mr->range[i].max_ip),
		       mr->range[i].min.all,
		       mr->range[i].max.all);
	}
	}
#endif

	do {
		if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
				      hooknum)) {
			DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
			       conntrack);
			return NF_DROP;
		}

#if 0
		DEBUGP("Hook %u (%s) %p\n", hooknum,
		       HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ? "SRC" : "DST",
		       conntrack);
		DEBUGP("Original: ");
		DUMP_TUPLE(&orig_tp);
		DEBUGP("New: ");
		DUMP_TUPLE(&new_tuple);
#endif
		/* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
		   the original (A/B/C/D') and the mangled one (E/F/G/H').

		   We're only allowed to work with the SRC per-proto
		   part, so we create inverses of both to start, then
		   derive the other fields we need. */

		/* Reply connection: simply invert the new tuple
		   (G/H/E/F') */
		invert_tuplepr(&reply, &new_tuple);

		/* Alter conntrack table so it recognizes replies.
		   If we lose this race (reply tuple now used), repeat. */
	} while (!ip_conntrack_alter_reply(conntrack, &reply));

	/* FIXME: We could simply use the existing conntrack reply tuple
	   here --RR */
	/* Create inverse of original: C/D/A/B' */
	invert_tuplepr(&inv_tuple, &orig_tp);
	/* Has source changed? */
	if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
		/* In this direction, a source manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_ORIGINAL, hooknum,
			   IP_NAT_MANIP_SRC, new_tuple.src });

		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

		/* In the reverse direction, a destination manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
			   IP_NAT_MANIP_DST, orig_tp.src });
		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
	}

	/* Has destination changed? */
	if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
		/* In this direction, a destination manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_ORIGINAL, hooknum,
			   IP_NAT_MANIP_DST, reply.src });

		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

		/* In the reverse direction, a source manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
			   IP_NAT_MANIP_SRC, inv_tuple.src });
		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
	}

	/* If there's a helper, assign it, based on the new tuple. */
	if (!conntrack->master)
		info->helper = __ip_nat_find_helper(&reply);

	/* It's done. */
	info->initialized |= (1 << HOOK2MANIP(hooknum));

	if (in_hashes)
		replace_in_hashes(conntrack, info);
	else
		place_in_hashes(conntrack, info);

	return NF_ACCEPT;
}
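/* The two helpers below keep the bysource/byipsproto hash tables in sync:
 * place_in_hashes() files a freshly NATed conntrack, replace_in_hashes()
 * re-files one whose source mapping changed after a further manipulation. */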
void replace_in_hashes(struct ip_conntrack *conntrack,
		       struct ip_nat_info *info)
{
	/* Source has changed, so replace in hashes. */
	unsigned int srchash
		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.src,
			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.dst.protonum);
	/* We place packet as seen OUTGOING in byipsproto hash
	   (ie. reverse dst and src of reply packet). */
	unsigned int ipsprotohash
		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.src.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.protonum);

	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
	list_move(&info->bysource, &bysource[srchash]);
	list_move(&info->byipsproto, &byipsproto[ipsprotohash]);
}
void place_in_hashes(struct ip_conntrack *conntrack,
		     struct ip_nat_info *info)
{
	unsigned int srchash
		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.src,
			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.dst.protonum);
	/* We place packet as seen OUTGOING in byipsproto hash
	   (ie. reverse dst and src of reply packet). */
	unsigned int ipsprotohash
		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.src.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.protonum);

	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
	list_add(&info->bysource, &bysource[srchash]);
	list_add(&info->byipsproto, &byipsproto[ipsprotohash]);
}
/* Returns true if succeeded. */
static int
manip_pkt(u_int16_t proto,
	  struct sk_buff **pskb,
	  unsigned int iphdroff,
	  const struct ip_conntrack_manip *manip,
	  enum ip_nat_manip_type maniptype)
{
	struct iphdr *iph;

	(*pskb)->nfcache |= NFC_ALTERED;
	if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
		return 0;

	iph = (void *)(*pskb)->data + iphdroff;

	/* Manipulate protocol part. */
	if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff + iph->ihl*4,
						 manip, maniptype))
		return 0;

	iph = (void *)(*pskb)->data + iphdroff;

	if (maniptype == IP_NAT_MANIP_SRC) {
		iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
						iph->check);
		iph->saddr = manip->ip;
	} else {
		iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
						iph->check);
		iph->daddr = manip->ip;
	}
	return 1;
}
static inline int exp_for_packet(struct ip_conntrack_expect *exp,
				 struct sk_buff *skb)
{
	struct ip_conntrack_protocol *proto;
	int ret = 1;

	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
	proto = ip_ct_find_proto(skb->nh.iph->protocol);
	if (proto->exp_matches_pkt)
		ret = proto->exp_matches_pkt(exp, skb);

	return ret;
}

/* Do packet manipulations according to the bindings. */
unsigned int
do_bindings(struct ip_conntrack *ct,
	    enum ip_conntrack_info ctinfo,
	    struct ip_nat_info *info,
	    unsigned int hooknum,
	    struct sk_buff **pskb)
{
	unsigned int i;
	struct ip_nat_helper *helper;
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	int proto = (*pskb)->nh.iph->protocol;

	/* Need nat lock to protect against modification, but neither
	   the conntrack (referenced) nor the helper (deleted with
	   synchronize_bh()) can vanish. */
	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		if (info->manips[i].direction == dir
		    && info->manips[i].hooknum == hooknum) {
			DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
			       *pskb,
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip),
			       htons(info->manips[i].manip.u.all));
			if (!manip_pkt(proto, pskb, 0,
				       &info->manips[i].manip,
				       info->manips[i].maniptype)) {
				READ_UNLOCK(&ip_nat_lock);
				return NF_DROP;
			}
		}
	}
	helper = info->helper;
	READ_UNLOCK(&ip_nat_lock);
	if (helper) {
		struct ip_conntrack_expect *exp = NULL;
		struct list_head *cur_item;
		int ret = NF_ACCEPT;
		int helper_called = 0;

		DEBUGP("do_bindings: helper existing for (%p)\n", ct);

		/* Always defragged for helpers */
		IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
			       & htons(IP_MF|IP_OFFSET)));

		/* Have to grab read lock before sibling_list traversal */
		READ_LOCK(&ip_conntrack_lock);
		list_for_each_prev(cur_item, &ct->sibling_list) {
			exp = list_entry(cur_item, struct ip_conntrack_expect,
					 expected_list);

			/* if this expectation is already established, skip */
			if (exp->sibling)
				continue;

			if (exp_for_packet(exp, *pskb)) {
				/* FIXME: May be true multiple times in the
				 * case of UDP!! */
				DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
				ret = helper->help(ct, exp, info, ctinfo,
						   hooknum, pskb);
				if (ret != NF_ACCEPT) {
					READ_UNLOCK(&ip_conntrack_lock);
					return ret;
				}
				helper_called = 1;
			}
		}
		/* Helper might want to manip the packet even when there is no
		 * matching expectation for this packet */
		if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
			DEBUGP("calling nat helper for packet without expectation\n");
			ret = helper->help(ct, NULL, info, ctinfo,
					   hooknum, pskb);
			if (ret != NF_ACCEPT) {
				READ_UNLOCK(&ip_conntrack_lock);
				return ret;
			}
		}
		READ_UNLOCK(&ip_conntrack_lock);

		/* Adjust sequence number only once per packet
		 * (helper is called at all hooks) */
		if (proto == IPPROTO_TCP
		    && (hooknum == NF_IP_POST_ROUTING
			|| hooknum == NF_IP_LOCAL_IN)) {
			DEBUGP("ip_nat_core: adjusting sequence number\n");
			/* future: put this in a l4-proto specific function,
			 * and call this function here. */
			if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
				ret = NF_DROP;
		}

		return ret;
	} else
		return NF_ACCEPT;
}
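/* Translate an ICMP error packet: the embedded (offending) header is mapped
 * with the manip sense inverted, the outer IP header is mapped normally, and
 * the ICMP checksum is recomputed.  Returns 1 on success, 0 to drop. */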
int
icmp_reply_translation(struct sk_buff **pskb,
		       struct ip_conntrack *conntrack,
		       unsigned int hooknum,
		       int dir)
{
	struct {
		struct icmphdr icmp;
		struct iphdr ip;
	} *inside;
	unsigned int i;
	struct ip_nat_info *info = &conntrack->nat.info;
	int hdrlen;

	if (!skb_ip_make_writable(pskb,(*pskb)->nh.iph->ihl*4+sizeof(*inside)))
		return 0;

	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

	/* We're actually going to mangle it beyond trivial checksum
	   adjustment, so make sure the current checksum is correct. */
	if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
		hdrlen = (*pskb)->nh.iph->ihl * 4;
		if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
						(*pskb)->len - hdrlen, 0)))
			return 0;
	}

	/* Must be RELATED */
	IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
		     (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);

	/* Redirects on non-null nats must be dropped, else they'll
	   start talking to each other without our translation, and be
	   confused... --RR */
	if (inside->icmp.type == ICMP_REDIRECT) {
		/* Don't care about races here. */
		if (info->initialized
		    != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
		    || info->num_manips != 0)
			return 0;
	}

	DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
	       *pskb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
	/* Note: May not be from a NAT'd host, but probably safest to
	   do translation always as if it came from the host itself
	   (even though a "host unreachable" coming from the host
	   itself is a bit weird).

	   More explanation: some people use NAT for anonymizing.
	   Also, CERT recommends dropping all packets from private IP
	   addresses (although ICMP errors from internal links with
	   such addresses are not too uncommon, as Alan Cox points
	   out) */
	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
		       i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
		       "ORIG" : "REPLY", info->manips[i].hooknum);

		if (info->manips[i].direction != dir)
			continue;

		/* Mapping the inner packet is just like a normal
		   packet, except it was never src/dst reversed, so
		   where we would normally apply a dst manip, we apply
		   a src, and vice versa. */
		if (info->manips[i].hooknum == hooknum) {
			DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "DST" : "SRC",
			       NIPQUAD(info->manips[i].manip.ip),
			       ntohs(info->manips[i].manip.u.udp.port));
			if (!manip_pkt(inside->ip.protocol, pskb,
				       (*pskb)->nh.iph->ihl*4
				       + sizeof(inside->icmp),
				       &info->manips[i].manip,
				       !info->manips[i].maniptype))
				goto unlock_fail;

			/* Outer packet needs to have IP header NATed like
			   it's a reply. */

			/* Use mapping to map outer packet: 0 gives no
			   per-proto mapping */
			DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip));
			if (!manip_pkt(0, pskb, 0,
				       &info->manips[i].manip,
				       info->manips[i].maniptype))
				goto unlock_fail;
		}
	}
	READ_UNLOCK(&ip_nat_lock);

	hdrlen = (*pskb)->nh.iph->ihl * 4;

	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

	inside->icmp.checksum = 0;
	inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
						       (*pskb)->len - hdrlen,
						       0));
	return 1;

 unlock_fail:
	READ_UNLOCK(&ip_nat_lock);
	return 0;
}
int __init ip_nat_init(void)
{
	size_t i;

	/* Leave them the same for the moment. */
	ip_nat_htable_size = ip_conntrack_htable_size;

	/* One vmalloc for both hash tables */
	bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
	if (!bysource)
		return -ENOMEM;
	byipsproto = bysource + ip_nat_htable_size;

	/* Sew in builtin protocols. */
	WRITE_LOCK(&ip_nat_lock);
	for (i = 0; i < MAX_IP_NAT_PROTO; i++)
		ip_nat_protos[i] = &ip_nat_unknown_protocol;
	ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
	ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
	ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
	WRITE_UNLOCK(&ip_nat_lock);

	for (i = 0; i < ip_nat_htable_size; i++) {
		INIT_LIST_HEAD(&bysource[i]);
		INIT_LIST_HEAD(&byipsproto[i]);
	}

	/* FIXME: Man, this is a hack.  <SIGH> */
	IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
	ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;

	/* Initialize fake conntrack so that NAT will skip it */
	ip_conntrack_untracked.nat.info.initialized |=
		(1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST);

	return 0;
}
/* Clear NAT section of all conntracks, in case we're loaded again. */
static int clean_nat(const struct ip_conntrack *i, void *data)
{
	memset((void *)&i->nat, 0, sizeof(i->nat));
	return 0;
}

/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
void ip_nat_cleanup(void)
{
	ip_ct_selective_cleanup(&clean_nat, NULL);
	ip_conntrack_destroyed = NULL;
	vfree(bysource);
}