fedora core 6 1.2949 + vserver 2.2.0
[linux-2.6.git] / net / ipv4 / netfilter / ip_nat_helper.c
index a49c722..ee80feb 100644 (file)
@@ -15,7 +15,6 @@
  *             - make ip_nat_resize_packet more generic (TCP and UDP)
  *             - add ip_nat_mangle_udp_packet
  */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/kmod.h>
 #include <linux/types.h>
 #include <net/tcp.h>
 #include <net/udp.h>
 
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
-
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_nat.h>
 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
 #include <linux/netfilter_ipv4/ip_nat_core.h>
 #include <linux/netfilter_ipv4/ip_nat_helper.h>
-#include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
 #define DEBUGP printk
@@ -47,7 +42,7 @@
 #define DUMP_OFFSET(x)
 #endif
 
-DECLARE_LOCK(ip_nat_seqofs_lock);
+static DEFINE_SPINLOCK(ip_nat_seqofs_lock);
 
 /* Setup TCP sequence correction given this change at this sequence */
 static inline void 
@@ -70,9 +65,9 @@ adjust_tcp_sequence(u32 seq,
        DEBUGP("ip_nat_resize_packet: Seq_offset before: ");
        DUMP_OFFSET(this_way);
 
-       LOCK_BH(&ip_nat_seqofs_lock);
+       spin_lock_bh(&ip_nat_seqofs_lock);
 
-       /* SYN adjust. If it's uninitialized, of this is after last
+       /* SYN adjust. If it's uninitialized, or this is after last
         * correction, record it: we don't handle more than one
         * adjustment in the window, but do deal with common case of a
         * retransmit */
@@ -82,7 +77,7 @@ adjust_tcp_sequence(u32 seq,
                    this_way->offset_before = this_way->offset_after;
                    this_way->offset_after += sizediff;
        }
-       UNLOCK_BH(&ip_nat_seqofs_lock);
+       spin_unlock_bh(&ip_nat_seqofs_lock);
 
        DEBUGP("ip_nat_resize_packet: Seq_offset after: ");
        DUMP_OFFSET(this_way);
@@ -142,9 +137,6 @@ static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
        /* Transfer socket to new skb. */
        if ((*pskb)->sk)
                skb_set_owner_w(nskb, (*pskb)->sk);
-#ifdef CONFIG_NETFILTER_DEBUG
-       nskb->nf_debug = (*pskb)->nf_debug;
-#endif
        kfree_skb(*pskb);
        *pskb = nskb;
        return 1;
@@ -169,9 +161,9 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
 {
        struct iphdr *iph;
        struct tcphdr *tcph;
-       int datalen;
+       int oldlen, datalen;
 
-       if (!skb_ip_make_writable(pskb, (*pskb)->len))
+       if (!skb_make_writable(pskb, (*pskb)->len))
                return 0;
 
        if (rep_len > match_len
@@ -184,19 +176,32 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
        iph = (*pskb)->nh.iph;
        tcph = (void *)iph + iph->ihl*4;
 
+       oldlen = (*pskb)->len - iph->ihl*4;
        mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4,
                        match_offset, match_len, rep_buffer, rep_len);
 
        datalen = (*pskb)->len - iph->ihl*4;
-       tcph->check = 0;
-       tcph->check = tcp_v4_check(tcph, datalen, iph->saddr, iph->daddr,
-                                  csum_partial((char *)tcph, datalen, 0));
-
-       adjust_tcp_sequence(ntohl(tcph->seq),
-                           (int)rep_len - (int)match_len,
-                           ct, ctinfo);
+       if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
+               tcph->check = 0;
+               tcph->check = tcp_v4_check(tcph, datalen,
+                                          iph->saddr, iph->daddr,
+                                          csum_partial((char *)tcph,
+                                                       datalen, 0));
+       } else
+               nf_proto_csum_replace2(&tcph->check, *pskb,
+                                       htons(oldlen), htons(datalen), 1);
+
+       if (rep_len != match_len) {
+               set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+               adjust_tcp_sequence(ntohl(tcph->seq),
+                                   (int)rep_len - (int)match_len,
+                                   ct, ctinfo);
+               /* Tell TCP window tracking about seq change */
+               ip_conntrack_tcp_update(*pskb, ct, CTINFO2DIR(ctinfo));
+       }
        return 1;
 }
+EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
                        
 /* Generic function for mangling variable-length address changes inside
  * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
@@ -219,6 +224,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
 {
        struct iphdr *iph;
        struct udphdr *udph;
+       int datalen, oldlen;
 
        /* UDP helpers might accidentally mangle the wrong packet */
        iph = (*pskb)->nh.iph;
@@ -226,7 +232,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
                               match_offset + match_len)
                return 0;
 
-       if (!skb_ip_make_writable(pskb, (*pskb)->len))
+       if (!skb_make_writable(pskb, (*pskb)->len))
                return 0;
 
        if (rep_len > match_len
@@ -236,24 +242,33 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
 
        iph = (*pskb)->nh.iph;
        udph = (void *)iph + iph->ihl*4;
+
+       oldlen = (*pskb)->len - iph->ihl*4;
        mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph),
                        match_offset, match_len, rep_buffer, rep_len);
 
        /* update the length of the UDP packet */
-       udph->len = htons((*pskb)->len - iph->ihl*4);
+       datalen = (*pskb)->len - iph->ihl*4;
+       udph->len = htons(datalen);
 
        /* fix udp checksum if udp checksum was previously calculated */
-       if (udph->check) {
-               int datalen = (*pskb)->len - iph->ihl * 4;
+       if (!udph->check && (*pskb)->ip_summed != CHECKSUM_PARTIAL)
+               return 1;
+
+       if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
                udph->check = 0;
                udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
                                                datalen, IPPROTO_UDP,
                                                csum_partial((char *)udph,
                                                             datalen, 0));
-       }
-
+               if (!udph->check)
+                       udph->check = CSUM_MANGLED_0;
+       } else
+               nf_proto_csum_replace2(&udph->check, *pskb,
+                                       htons(oldlen), htons(datalen), 1);
        return 1;
 }
+EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
 
 /* Adjust one found SACK option including checksum correction */
 static void
@@ -264,37 +279,34 @@ sack_adjust(struct sk_buff *skb,
            struct ip_nat_seq *natseq)
 {
        while (sackoff < sackend) {
-               struct tcp_sack_block *sack;
-               u_int32_t new_start_seq, new_end_seq;
+               struct tcp_sack_block_wire *sack;
+               __be32 new_start_seq, new_end_seq;
 
                sack = (void *)skb->data + sackoff;
                if (after(ntohl(sack->start_seq) - natseq->offset_before,
                          natseq->correction_pos))
-                       new_start_seq = ntohl(sack->start_seq) 
-                                       - natseq->offset_after;
+                       new_start_seq = htonl(ntohl(sack->start_seq)
+                                       - natseq->offset_after);
                else
-                       new_start_seq = ntohl(sack->start_seq) 
-                                       - natseq->offset_before;
-               new_start_seq = htonl(new_start_seq);
+                       new_start_seq = htonl(ntohl(sack->start_seq)
+                                       - natseq->offset_before);
 
                if (after(ntohl(sack->end_seq) - natseq->offset_before,
                          natseq->correction_pos))
-                       new_end_seq = ntohl(sack->end_seq)
-                                     - natseq->offset_after;
+                       new_end_seq = htonl(ntohl(sack->end_seq)
+                                     - natseq->offset_after);
                else
-                       new_end_seq = ntohl(sack->end_seq)
-                                     - natseq->offset_before;
-               new_end_seq = htonl(new_end_seq);
+                       new_end_seq = htonl(ntohl(sack->end_seq)
+                                     - natseq->offset_before);
 
                DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
                        ntohl(sack->start_seq), new_start_seq,
                        ntohl(sack->end_seq), new_end_seq);
 
-               tcph->check = 
-                       ip_nat_cheat_check(~sack->start_seq, new_start_seq,
-                                          ip_nat_cheat_check(~sack->end_seq, 
-                                                             new_end_seq,
-                                                             tcph->check));
+               nf_proto_csum_replace4(&tcph->check, skb,
+                                       sack->start_seq, new_start_seq, 0);
+               nf_proto_csum_replace4(&tcph->check, skb,
+                                       sack->end_seq, new_end_seq, 0);
                sack->start_seq = new_start_seq;
                sack->end_seq = new_end_seq;
                sackoff += sizeof(*sack);
@@ -313,7 +325,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb,
        optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
        optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
 
-       if (!skb_ip_make_writable(pskb, optend))
+       if (!skb_make_writable(pskb, optend))
                return 0;
 
        dir = CTINFO2DIR(ctinfo);
@@ -346,14 +358,15 @@ ip_nat_sack_adjust(struct sk_buff **pskb,
        return 1;
 }
 
-/* TCP sequence number adjustment.  Returns true or false.  */
+/* TCP sequence number adjustment.  Returns 1 on success, 0 on failure */
 int
 ip_nat_seq_adjust(struct sk_buff **pskb, 
                  struct ip_conntrack *ct, 
                  enum ip_conntrack_info ctinfo)
 {
        struct tcphdr *tcph;
-       int dir, newseq, newack;
+       int dir;
+       __be32 newseq, newack;
        struct ip_nat_seq *this_way, *other_way;        
 
        dir = CTINFO2DIR(ctinfo);
@@ -361,32 +374,23 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
        this_way = &ct->nat.info.seq[dir];
        other_way = &ct->nat.info.seq[!dir];
 
-       /* No adjustments to make?  Very common case. */
-       if (!this_way->offset_before && !this_way->offset_after
-           && !other_way->offset_before && !other_way->offset_after)
-               return 1;
-
-       if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+       if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
                return 0;
 
        tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
        if (after(ntohl(tcph->seq), this_way->correction_pos))
-               newseq = ntohl(tcph->seq) + this_way->offset_after;
+               newseq = htonl(ntohl(tcph->seq) + this_way->offset_after);
        else
-               newseq = ntohl(tcph->seq) + this_way->offset_before;
-       newseq = htonl(newseq);
+               newseq = htonl(ntohl(tcph->seq) + this_way->offset_before);
 
        if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
                  other_way->correction_pos))
-               newack = ntohl(tcph->ack_seq) - other_way->offset_after;
+               newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_after);
        else
-               newack = ntohl(tcph->ack_seq) - other_way->offset_before;
-       newack = htonl(newack);
+               newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_before);
 
-       tcph->check = ip_nat_cheat_check(~tcph->seq, newseq,
-                                        ip_nat_cheat_check(~tcph->ack_seq, 
-                                                           newack, 
-                                                           tcph->check));
+       nf_proto_csum_replace4(&tcph->check, *pskb, tcph->seq, newseq, 0);
+       nf_proto_csum_replace4(&tcph->check, *pskb, tcph->ack_seq, newack, 0);
 
        DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n",
                ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
@@ -395,60 +399,38 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
        tcph->seq = newseq;
        tcph->ack_seq = newack;
 
-       return ip_nat_sack_adjust(pskb, tcph, ct, ctinfo);
-}
-
-static inline int
-helper_cmp(const struct ip_nat_helper *helper,
-          const struct ip_conntrack_tuple *tuple)
-{
-       return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
-}
-
-int ip_nat_helper_register(struct ip_nat_helper *me)
-{
-       int ret = 0;
-
-       WRITE_LOCK(&ip_nat_lock);
-       if (LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,&me->tuple))
-               ret = -EBUSY;
-       else
-               list_prepend(&helpers, me);
-       WRITE_UNLOCK(&ip_nat_lock);
-
-       return ret;
-}
-
-static int
-kill_helper(const struct ip_conntrack *i, void *helper)
-{
-       int ret;
+       if (!ip_nat_sack_adjust(pskb, tcph, ct, ctinfo))
+               return 0;
 
-       READ_LOCK(&ip_nat_lock);
-       ret = (i->nat.info.helper == helper);
-       READ_UNLOCK(&ip_nat_lock);
+       ip_conntrack_tcp_update(*pskb, ct, dir);
 
-       return ret;
+       return 1;
 }
+EXPORT_SYMBOL(ip_nat_seq_adjust);
 
-void ip_nat_helper_unregister(struct ip_nat_helper *me)
+/* Setup NAT on this expected conntrack so it follows master. */
+/* If we fail to get a free NAT slot, we'll get dropped on confirm */
+void ip_nat_follow_master(struct ip_conntrack *ct,
+                         struct ip_conntrack_expect *exp)
 {
-       WRITE_LOCK(&ip_nat_lock);
-       /* Autoloading conntrack helper might have failed */
-       if (LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,&me->tuple)) {
-               LIST_DELETE(&helpers, me);
-       }
-       WRITE_UNLOCK(&ip_nat_lock);
-
-       /* Someone could be still looking at the helper in a bh. */
-       synchronize_net();
-
-       /* Find anything using it, and umm, kill them.  We can't turn
-          them into normal connections: if we've adjusted SYNs, then
-          they'll ackstorm.  So we just drop it.  We used to just
-          bump module count when a connection existed, but that
-          forces admins to gen fake RSTs or bounce box, either of
-          which is just a long-winded way of making things
-          worse. --RR */
-       ip_ct_selective_cleanup(kill_helper, me);
+       struct ip_nat_range range;
+
+       /* This must be a fresh one. */
+       BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+       /* Change src to where master sends to */
+       range.flags = IP_NAT_RANGE_MAP_IPS;
+       range.min_ip = range.max_ip
+               = ct->master->tuplehash[!exp->dir].tuple.dst.ip;
+       /* hook doesn't matter, but it has to do source manip */
+       ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
+
+       /* For DST manip, map port here to where it's expected. */
+       range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
+       range.min = range.max = exp->saved_proto;
+       range.min_ip = range.max_ip
+               = ct->master->tuplehash[!exp->dir].tuple.src.ip;
+       /* hook doesn't matter, but it has to do destination manip */
+       ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
 }
+EXPORT_SYMBOL(ip_nat_follow_master);