vserver 1.9.3
[linux-2.6.git] / net / ipv6 / ip6_output.c
index d6f0db4..33260cf 100644 (file)
@@ -54,8 +54,9 @@
 #include <net/rawv6.h>
 #include <net/icmp.h>
 #include <net/xfrm.h>
+#include <net/checksum.h>
 
-static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*));
+static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**));
 
 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
 {
@@ -87,7 +88,7 @@ static inline int ip6_output_finish(struct sk_buff *skb)
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);
 
-       IP6_INC_STATS_BH(Ip6OutNoRoutes);
+       IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
 
@@ -107,8 +108,9 @@ static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
 }
 
 
-int ip6_output2(struct sk_buff *skb)
+static int ip6_output2(struct sk_buff **pskb)
 {
+       struct sk_buff *skb = *pskb;
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;
 
@@ -132,24 +134,26 @@ int ip6_output2(struct sk_buff *skb)
                                        ip6_dev_loopback_xmit);
 
                        if (skb->nh.ipv6h->hop_limit == 0) {
-                               IP6_INC_STATS(Ip6OutDiscards);
+                               IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }
 
-               IP6_INC_STATS(Ip6OutMcastPkts);
+               IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        }
 
        return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
 }
 
-int ip6_output(struct sk_buff *skb)
+int ip6_output(struct sk_buff **pskb)
 {
+       struct sk_buff *skb = *pskb;
+
        if ((skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list))
-               return ip6_fragment(skb, ip6_output2);
+               return ip6_fragment(pskb, ip6_output2);
        else
-               return ip6_output2(skb);
+               return ip6_output2(pskb);
 }
 
 #ifdef CONFIG_NETFILTER
@@ -169,7 +173,7 @@ int ip6_route_me_harder(struct sk_buff *skb)
        dst = ip6_route_output(skb->sk, &fl);
 
        if (dst->error) {
-               IP6_INC_STATS(Ip6OutNoRoutes);
+               IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
                LIMIT_NETDEBUG(
                        printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
                dst_release(dst);
@@ -228,7 +232,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {      
-                               IP6_INC_STATS(Ip6OutDiscards);
+                               IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                return -ENOBUFS;
                        }
                        if (sk)
@@ -262,7 +266,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 
        mtu = dst_pmtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
-               IP6_INC_STATS(Ip6OutRequests);
+               IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
                return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
        }
 
@@ -270,7 +274,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-       IP6_INC_STATS(Ip6FragFails);
+       IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
 }
@@ -346,13 +350,13 @@ int ip6_forward(struct sk_buff *skb)
 {
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr = skb->nh.ipv6h;
-       struct inet6_skb_parm *opt =(struct inet6_skb_parm*)skb->cb;
+       struct inet6_skb_parm *opt = IP6CB(skb);
        
        if (ipv6_devconf.forwarding == 0)
                goto error;
 
        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
-               IP6_INC_STATS(Ip6InDiscards);
+               IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
 
@@ -391,7 +395,7 @@ int ip6_forward(struct sk_buff *skb)
        }
 
        if (!xfrm6_route_forward(skb)) {
-               IP6_INC_STATS(Ip6InDiscards);
+               IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
 
@@ -429,14 +433,14 @@ int ip6_forward(struct sk_buff *skb)
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_pmtu(dst), skb->dev);
-               IP6_INC_STATS_BH(Ip6InTooBigErrors);
-               IP6_INC_STATS_BH(Ip6FragFails);
+               IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
+               IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }
 
        if (skb_cow(skb, dst->dev->hard_header_len)) {
-               IP6_INC_STATS(Ip6OutDiscards);
+               IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }
 
@@ -446,11 +450,11 @@ int ip6_forward(struct sk_buff *skb)
  
        hdr->hop_limit--;
 
-       IP6_INC_STATS_BH(Ip6OutForwDatagrams);
+       IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
 
 error:
-       IP6_INC_STATS_BH(Ip6InAddrErrors);
+       IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
 drop:
        kfree_skb(skb);
        return -EINVAL;
@@ -473,6 +477,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
        /* Connection association is same as pre-frag packet */
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
+       to->nfctinfo = from->nfctinfo;
 #ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
@@ -513,11 +518,11 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
        return offset;
 }
 
-static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
+static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
 {
        struct net_device *dev;
+       struct sk_buff *frag, *skb = *pskb;
        struct rt6_info *rt = (struct rt6_info*)skb->dst;
-       struct sk_buff *frag;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
@@ -558,12 +563,12 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
-               skb_shinfo(skb)->frag_list = 0;
+               skb_shinfo(skb)->frag_list = NULL;
                /* BUILD HEADER */
 
                tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
-                       IP6_INC_STATS(Ip6FragFails);
+                       IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }
 
@@ -604,8 +609,8 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
                                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }
-                       err = output(skb);
-
+                       
+                       err = output(&skb);
                        if (err || !frag)
                                break;
 
@@ -618,7 +623,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
                        kfree(tmp_hdr);
 
                if (err == 0) {
-                       IP6_INC_STATS(Ip6FragOKs);
+                       IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }
 
@@ -628,7 +633,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
                        frag = skb;
                }
 
-               IP6_INC_STATS(Ip6FragFails);
+               IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }
 
@@ -661,7 +666,7 @@ slow_path:
 
                if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
-                       IP6_INC_STATS(Ip6FragFails);
+                       IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }
@@ -719,19 +724,19 @@ slow_path:
                 *      Put this fragment into the sending queue.
                 */
 
-               IP6_INC_STATS(Ip6FragCreates);
+               IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 
-               err = output(frag);
+               err = output(&frag);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
-       IP6_INC_STATS(Ip6FragOKs);
+       IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;
 
 fail:
        kfree_skb(skb); 
-       IP6_INC_STATS(Ip6FragFails);
+       IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
 }
 
@@ -793,10 +798,6 @@ int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
                        goto out_err_release;
                }
        }
-       if ((err = xfrm_lookup(dst, fl, sk, 0)) < 0) {
-               err = -ENETUNREACH;
-               goto out_err_release;
-        }
 
        return 0;
 
@@ -818,7 +819,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
        int exthdrlen;
        int hh_len;
        int mtu;
-       int copy = 0;
+       int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;
@@ -849,8 +850,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
                np->cork.hop_limit = hlimit;
                inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
                inet->cork.length = 0;
-               inet->sndmsg_page = NULL;
-               inet->sndmsg_off = 0;
+               sk->sk_sndmsg_page = NULL;
+               sk->sk_sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
@@ -876,29 +877,79 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
                }
        }
 
+       /*
+        * Let's try using as much space as possible.
+        * Use MTU if total length of the message fits into the MTU.
+        * Otherwise, we need to reserve fragment header and
+        * fragment alignment (= 8-15 octets, in total).
+        *
+        * Note that we may need to "move" the data from the tail of
+        * the buffer to the new fragment when we split
+        * the message.
+        *
+        * FIXME: It may be fragmented into multiple chunks 
+        *        at once if non-fragmentable extension headers
+        *        are too large.
+        * --yoshfuji 
+        */
+
        inet->cork.length += length;
 
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;
 
        while (length > 0) {
-               if ((copy = maxfraglen - skb->len) <= 0) {
+               /* Check if the remaining data fits into current packet. */
+               copy = mtu - skb->len;
+               if (copy < length)
+                       copy = maxfraglen - skb->len;
+
+               if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
+                       unsigned int fraggap;
                        unsigned int alloclen;
-                       BUG_TRAP(copy == 0);
+                       struct sk_buff *skb_prev;
 alloc_new_skb:
-                       datalen = maxfraglen - fragheaderlen;
-                       if (datalen > length)
-                               datalen = length;
+                       skb_prev = skb;
+
+                       /* There's no room in the current skb */
+                       if (skb_prev)
+                               fraggap = skb_prev->len - maxfraglen;
+                       else
+                               fraggap = 0;
+
+                       /*
+                        * If remaining data exceeds the mtu,
+                        * we know we need more fragment(s).
+                        */
+                       datalen = length + fraggap;
+                       if (datalen > mtu - fragheaderlen)
+                               datalen = maxfraglen - fragheaderlen;
+
                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
-                               alloclen = maxfraglen;
+                               alloclen = mtu;
                        else
-                               alloclen = fraglen;
+                               alloclen = datalen + fragheaderlen;
+
+                       /*
+                        * The last fragment gets additional space at tail.
+                        * Note: we overallocate on fragments with MSG_MORE
+                        * because we have no idea if we're the last one.
+                        */
+                       if (datalen == length + fraggap)
+                               alloclen += rt->u.dst.trailer_len;
+
+                       /*
+                        * We just reserve space for fragment header.
+                        * Note: this may be overallocation if the message 
+                        * (without MSG_MORE) fits into the MTU.
+                        */
                        alloclen += sizeof(struct frag_hdr);
+
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
@@ -920,7 +971,7 @@ alloc_new_skb:
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
-                       /* reserve 8 byte for fragmentation */
+                       /* reserve for fragmentation */
                        skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
 
                        /*
@@ -930,15 +981,29 @@ alloc_new_skb:
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;
-                       copy = datalen - transhdrlen;
-                       if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
+
+                       if (fraggap) {
+                               skb->csum = skb_copy_and_csum_bits(
+                                       skb_prev, maxfraglen,
+                                       data + transhdrlen, fraggap, 0);
+                               skb_prev->csum = csum_sub(skb_prev->csum,
+                                                         skb->csum);
+                               data += fraggap;
+                               skb_trim(skb_prev, maxfraglen);
+                       }
+                       copy = datalen - transhdrlen - fraggap;
+                       if (copy < 0) {
+                               err = -EINVAL;
+                               kfree_skb(skb);
+                               goto error;
+                       } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }
 
                        offset += copy;
-                       length -= datalen;
+                       length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;
@@ -966,8 +1031,8 @@ alloc_new_skb:
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
-                       struct page *page = inet->sndmsg_page;
-                       int off = inet->sndmsg_off;
+                       struct page *page = sk->sk_sndmsg_page;
+                       int off = sk->sk_sndmsg_off;
                        unsigned int left;
 
                        if (page && (left = PAGE_SIZE - off) > 0) {
@@ -979,7 +1044,7 @@ alloc_new_skb:
                                                goto error;
                                        }
                                        get_page(page);
-                                       skb_fill_page_desc(skb, i, page, inet->sndmsg_off, 0);
+                                       skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if(i < MAX_SKB_FRAGS) {
@@ -990,8 +1055,8 @@ alloc_new_skb:
                                        err = -ENOMEM;
                                        goto error;
                                }
-                               inet->sndmsg_page = page;
-                               inet->sndmsg_off = 0;
+                               sk->sk_sndmsg_page = page;
+                               sk->sk_sndmsg_off = 0;
 
                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
@@ -1005,7 +1070,7 @@ alloc_new_skb:
                                err = -EFAULT;
                                goto error;
                        }
-                       inet->sndmsg_off += copy;
+                       sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
@@ -1016,7 +1081,7 @@ alloc_new_skb:
        return 0;
 error:
        inet->cork.length -= length;
-       IP6_INC_STATS(Ip6OutDiscards);
+       IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
 }
 
@@ -1076,7 +1141,7 @@ int ip6_push_pending_frames(struct sock *sk)
        ipv6_addr_copy(&hdr->daddr, final_dst);
 
        skb->dst = dst_clone(&rt->u.dst);
-       IP6_INC_STATS(Ip6OutRequests);  
+       IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
@@ -1108,7 +1173,7 @@ void ip6_flush_pending_frames(struct sock *sk)
        struct sk_buff *skb;
 
        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
-               IP6_INC_STATS(Ip6OutDiscards);
+               IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }