vserver 1.9.5.x5
[linux-2.6.git] / net / ipv6 / ip6_output.c
index 0057672..fce2a87 100644 (file)
 #include <net/rawv6.h>
 #include <net/icmp.h>
 #include <net/xfrm.h>
+#include <net/checksum.h>
 
-static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**));
+static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
 
 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
 {
        static u32 ipv6_fragmentation_id = 1;
-       static spinlock_t ip6_id_lock = SPIN_LOCK_UNLOCKED;
+       static DEFINE_SPINLOCK(ip6_id_lock);
 
        spin_lock_bh(&ip6_id_lock);
        fhdr->identification = htonl(ipv6_fragmentation_id);
@@ -87,7 +88,7 @@ static inline int ip6_output_finish(struct sk_buff *skb)
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);
 
-       IP6_INC_STATS_BH(OutNoRoutes);
+       IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
 
@@ -107,9 +108,8 @@ static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
 }
 
 
-static int ip6_output2(struct sk_buff **pskb)
+static int ip6_output2(struct sk_buff *skb)
 {
-       struct sk_buff *skb = *pskb;
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;
 
@@ -133,26 +133,24 @@ static int ip6_output2(struct sk_buff **pskb)
                                        ip6_dev_loopback_xmit);
 
                        if (skb->nh.ipv6h->hop_limit == 0) {
-                               IP6_INC_STATS(OutDiscards);
+                               IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }
 
-               IP6_INC_STATS(OutMcastPkts);
+               IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        }
 
        return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
 }
 
-int ip6_output(struct sk_buff **pskb)
+int ip6_output(struct sk_buff *skb)
 {
-       struct sk_buff *skb = *pskb;
-
-       if ((skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list))
-               return ip6_fragment(pskb, ip6_output2);
+       if (skb->len > dst_pmtu(skb->dst))
+               return ip6_fragment(skb, ip6_output2);
        else
-               return ip6_output2(pskb);
+               return ip6_output2(skb);
 }
 
 #ifdef CONFIG_NETFILTER
@@ -172,7 +170,7 @@ int ip6_route_me_harder(struct sk_buff *skb)
        dst = ip6_route_output(skb->sk, &fl);
 
        if (dst->error) {
-               IP6_INC_STATS(OutNoRoutes);
+               IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
                LIMIT_NETDEBUG(
                        printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
                dst_release(dst);
@@ -231,7 +229,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {      
-                               IP6_INC_STATS(OutDiscards);
+                               IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                return -ENOBUFS;
                        }
                        if (sk)
@@ -265,7 +263,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 
        mtu = dst_pmtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
-               IP6_INC_STATS(OutRequests);
+               IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
                return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
        }
 
@@ -273,7 +271,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-       IP6_INC_STATS(FragFails);
+       IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
 }
@@ -313,7 +311,7 @@ int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
        return 0;
 }
 
-int ip6_call_ra_chain(struct sk_buff *skb, int sel)
+static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 {
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;
@@ -355,7 +353,7 @@ int ip6_forward(struct sk_buff *skb)
                goto error;
 
        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
-               IP6_INC_STATS(InDiscards);
+               IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
 
@@ -394,7 +392,7 @@ int ip6_forward(struct sk_buff *skb)
        }
 
        if (!xfrm6_route_forward(skb)) {
-               IP6_INC_STATS(InDiscards);
+               IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
 
@@ -432,14 +430,14 @@ int ip6_forward(struct sk_buff *skb)
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_pmtu(dst), skb->dev);
-               IP6_INC_STATS_BH(InTooBigErrors);
-               IP6_INC_STATS_BH(FragFails);
+               IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
+               IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }
 
        if (skb_cow(skb, dst->dev->hard_header_len)) {
-               IP6_INC_STATS(OutDiscards);
+               IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }
 
@@ -449,11 +447,11 @@ int ip6_forward(struct sk_buff *skb)
  
        hdr->hop_limit--;
 
-       IP6_INC_STATS_BH(OutForwDatagrams);
+       IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
 
 error:
-       IP6_INC_STATS_BH(InAddrErrors);
+       IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
 drop:
        kfree_skb(skb);
        return -EINVAL;
@@ -465,6 +463,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
        to->priority = from->priority;
        to->protocol = from->protocol;
        to->security = from->security;
+       dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;
 
@@ -476,6 +475,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
        /* Connection association is same as pre-frag packet */
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
+       to->nfctinfo = from->nfctinfo;
 #ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
@@ -516,10 +516,10 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
        return offset;
 }
 
-static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
+static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
        struct net_device *dev;
-       struct sk_buff *frag, *skb = *pskb;
+       struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info*)skb->dst;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
@@ -561,12 +561,12 @@ static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
-               skb_shinfo(skb)->frag_list = 0;
+               skb_shinfo(skb)->frag_list = NULL;
                /* BUILD HEADER */
 
                tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
-                       IP6_INC_STATS(FragFails);
+                       IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }
 
@@ -593,6 +593,7 @@ static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
+                               frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
                                frag->nh.raw = __skb_push(frag, hlen);
@@ -607,13 +608,11 @@ static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
                                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }
-                       err = output(pskb);
-                       if (err || !frag) {
-                               if (unlikely(skb != *pskb))
-                                       skb = *pskb;
-                               break;
-                       }
                        
+                       err = output(skb);
+                       if (err || !frag)
+                               break;
+
                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
@@ -623,7 +622,7 @@ static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
                        kfree(tmp_hdr);
 
                if (err == 0) {
-                       IP6_INC_STATS(FragOKs);
+                       IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }
 
@@ -633,7 +632,7 @@ static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
                        frag = skb;
                }
 
-               IP6_INC_STATS(FragFails);
+               IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }
 
@@ -666,7 +665,7 @@ slow_path:
 
                if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
-                       IP6_INC_STATS(FragFails);
+                       IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }
@@ -724,19 +723,19 @@ slow_path:
                 *      Put this fragment into the sending queue.
                 */
 
-               IP6_INC_STATS(FragCreates);
+               IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 
-               err = output(&frag);
+               err = output(frag);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
-       IP6_INC_STATS(FragOKs);
+       IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;
 
 fail:
        kfree_skb(skb); 
-       IP6_INC_STATS(FragFails);
+       IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
 }
 
@@ -748,7 +747,7 @@ int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
        if (sk) {
                struct ipv6_pinfo *np = inet6_sk(sk);
        
-               *dst = __sk_dst_check(sk, np->dst_cookie);
+               *dst = sk_dst_check(sk, np->dst_cookie);
                if (*dst) {
                        struct rt6_info *rt = (struct rt6_info*)*dst;
        
@@ -771,13 +770,13 @@ int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
                                 */
        
                        if (((rt->rt6i_dst.plen != 128 ||
-                             ipv6_addr_cmp(&fl->fl6_dst, &rt->rt6i_dst.addr))
+                             !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
                             && (np->daddr_cache == NULL ||
-                                ipv6_addr_cmp(&fl->fl6_dst, np->daddr_cache)))
+                                !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
                            || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
+                               dst_release(*dst);
                                *dst = NULL;
-                       } else
-                               dst_hold(*dst);
+                       }
                }
        }
 
@@ -798,10 +797,6 @@ int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
                        goto out_err_release;
                }
        }
-       if ((err = xfrm_lookup(dst, fl, sk, 0)) < 0) {
-               err = -ENETUNREACH;
-               goto out_err_release;
-        }
 
        return 0;
 
@@ -816,14 +811,14 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
                    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
                    unsigned int flags)
 {
-       struct inet_opt *inet = inet_sk(sk);
+       struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
-       int copy = 0;
+       int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;
@@ -854,8 +849,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
                np->cork.hop_limit = hlimit;
                inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
                inet->cork.length = 0;
-               inet->sndmsg_page = NULL;
-               inet->sndmsg_off = 0;
+               sk->sk_sndmsg_page = NULL;
+               sk->sk_sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
@@ -881,29 +876,79 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
                }
        }
 
+       /*
+        * Let's try using as much space as possible.
+        * Use MTU if total length of the message fits into the MTU.
+        * Otherwise, we need to reserve fragment header and
+        * fragment alignment (= 8-15 octets, in total).
+        *
+        * Note that we may need to "move" the data from the tail of
+        * the buffer to the new fragment when we split
+        * the message.
+        *
+        * FIXME: It may be fragmented into multiple chunks 
+        *        at once if non-fragmentable extension headers
+        *        are too large.
+        * --yoshfuji 
+        */
+
        inet->cork.length += length;
 
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;
 
        while (length > 0) {
-               if ((copy = maxfraglen - skb->len) <= 0) {
+               /* Check if the remaining data fits into current packet. */
+               copy = mtu - skb->len;
+               if (copy < length)
+                       copy = maxfraglen - skb->len;
+
+               if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
+                       unsigned int fraggap;
                        unsigned int alloclen;
-                       BUG_TRAP(copy == 0);
+                       struct sk_buff *skb_prev;
 alloc_new_skb:
-                       datalen = maxfraglen - fragheaderlen;
-                       if (datalen > length)
-                               datalen = length;
+                       skb_prev = skb;
+
+                       /* There's no room in the current skb */
+                       if (skb_prev)
+                               fraggap = skb_prev->len - maxfraglen;
+                       else
+                               fraggap = 0;
+
+                       /*
+                        * If remaining data exceeds the mtu,
+                        * we know we need more fragment(s).
+                        */
+                       datalen = length + fraggap;
+                       if (datalen > mtu - fragheaderlen)
+                               datalen = maxfraglen - fragheaderlen;
+
                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
-                               alloclen = maxfraglen;
+                               alloclen = mtu;
                        else
-                               alloclen = fraglen;
+                               alloclen = datalen + fragheaderlen;
+
+                       /*
+                        * The last fragment gets additional space at tail.
+                        * Note: we overallocate on fragments with MSG_MORE
+                        * because we have no idea if we're the last one.
+                        */
+                       if (datalen == length + fraggap)
+                               alloclen += rt->u.dst.trailer_len;
+
+                       /*
+                        * We just reserve space for fragment header.
+                        * Note: this may be overallocation if the message 
+                        * (without MSG_MORE) fits into the MTU.
+                        */
                        alloclen += sizeof(struct frag_hdr);
+
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
@@ -925,7 +970,7 @@ alloc_new_skb:
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
-                       /* reserve 8 byte for fragmentation */
+                       /* reserve for fragmentation */
                        skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
 
                        /*
@@ -935,15 +980,29 @@ alloc_new_skb:
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;
-                       copy = datalen - transhdrlen;
-                       if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
+
+                       if (fraggap) {
+                               skb->csum = skb_copy_and_csum_bits(
+                                       skb_prev, maxfraglen,
+                                       data + transhdrlen, fraggap, 0);
+                               skb_prev->csum = csum_sub(skb_prev->csum,
+                                                         skb->csum);
+                               data += fraggap;
+                               skb_trim(skb_prev, maxfraglen);
+                       }
+                       copy = datalen - transhdrlen - fraggap;
+                       if (copy < 0) {
+                               err = -EINVAL;
+                               kfree_skb(skb);
+                               goto error;
+                       } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }
 
                        offset += copy;
-                       length -= datalen;
+                       length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;
@@ -971,8 +1030,8 @@ alloc_new_skb:
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
-                       struct page *page = inet->sndmsg_page;
-                       int off = inet->sndmsg_off;
+                       struct page *page = sk->sk_sndmsg_page;
+                       int off = sk->sk_sndmsg_off;
                        unsigned int left;
 
                        if (page && (left = PAGE_SIZE - off) > 0) {
@@ -984,7 +1043,7 @@ alloc_new_skb:
                                                goto error;
                                        }
                                        get_page(page);
-                                       skb_fill_page_desc(skb, i, page, inet->sndmsg_off, 0);
+                                       skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if(i < MAX_SKB_FRAGS) {
@@ -995,8 +1054,8 @@ alloc_new_skb:
                                        err = -ENOMEM;
                                        goto error;
                                }
-                               inet->sndmsg_page = page;
-                               inet->sndmsg_off = 0;
+                               sk->sk_sndmsg_page = page;
+                               sk->sk_sndmsg_off = 0;
 
                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
@@ -1010,7 +1069,7 @@ alloc_new_skb:
                                err = -EFAULT;
                                goto error;
                        }
-                       inet->sndmsg_off += copy;
+                       sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
@@ -1021,7 +1080,7 @@ alloc_new_skb:
        return 0;
 error:
        inet->cork.length -= length;
-       IP6_INC_STATS(OutDiscards);
+       IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
 }
 
@@ -1030,7 +1089,7 @@ int ip6_push_pending_frames(struct sock *sk)
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
-       struct inet_opt *inet = inet_sk(sk);
+       struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
@@ -1081,7 +1140,7 @@ int ip6_push_pending_frames(struct sock *sk)
        ipv6_addr_copy(&hdr->daddr, final_dst);
 
        skb->dst = dst_clone(&rt->u.dst);
-       IP6_INC_STATS(OutRequests);     
+       IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
@@ -1108,12 +1167,12 @@ error:
 
 void ip6_flush_pending_frames(struct sock *sk)
 {
-       struct inet_opt *inet = inet_sk(sk);
+       struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
 
        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
-               IP6_INC_STATS(OutDiscards);
+               IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }