Fedora kernel-2.6.17-1.2142_FC4 patched with stable patch-2.6.17.4-vs2.0.2-rc26.diff
[linux-2.6.git] / net / ipv6 / ip6_output.c
index 0057672..e460489 100644 (file)
 #include <net/rawv6.h>
 #include <net/icmp.h>
 #include <net/xfrm.h>
+#include <net/checksum.h>
 
-static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**));
+static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
 
 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
 {
        static u32 ipv6_fragmentation_id = 1;
-       static spinlock_t ip6_id_lock = SPIN_LOCK_UNLOCKED;
+       static DEFINE_SPINLOCK(ip6_id_lock);
 
        spin_lock_bh(&ip6_id_lock);
        fhdr->identification = htonl(ipv6_fragmentation_id);
@@ -87,7 +88,7 @@ static inline int ip6_output_finish(struct sk_buff *skb)
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);
 
-       IP6_INC_STATS_BH(OutNoRoutes);
+       IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
 
@@ -107,9 +108,8 @@ static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
 }
 
 
-static int ip6_output2(struct sk_buff **pskb)
+static int ip6_output2(struct sk_buff *skb)
 {
-       struct sk_buff *skb = *pskb;
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;
 
@@ -133,71 +133,25 @@ static int ip6_output2(struct sk_buff **pskb)
                                        ip6_dev_loopback_xmit);
 
                        if (skb->nh.ipv6h->hop_limit == 0) {
-                               IP6_INC_STATS(OutDiscards);
+                               IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }
 
-               IP6_INC_STATS(OutMcastPkts);
+               IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        }
 
        return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
 }
 
-int ip6_output(struct sk_buff **pskb)
+int ip6_output(struct sk_buff *skb)
 {
-       struct sk_buff *skb = *pskb;
-
-       if ((skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list))
-               return ip6_fragment(pskb, ip6_output2);
+       if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
+                               dst_allfrag(skb->dst))
+               return ip6_fragment(skb, ip6_output2);
        else
-               return ip6_output2(pskb);
-}
-
-#ifdef CONFIG_NETFILTER
-int ip6_route_me_harder(struct sk_buff *skb)
-{
-       struct ipv6hdr *iph = skb->nh.ipv6h;
-       struct dst_entry *dst;
-       struct flowi fl = {
-               .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
-               .nl_u =
-               { .ip6_u =
-                 { .daddr = iph->daddr,
-                   .saddr = iph->saddr, } },
-               .proto = iph->nexthdr,
-       };
-
-       dst = ip6_route_output(skb->sk, &fl);
-
-       if (dst->error) {
-               IP6_INC_STATS(OutNoRoutes);
-               LIMIT_NETDEBUG(
-                       printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
-               dst_release(dst);
-               return -EINVAL;
-       }
-
-       /* Drop old route. */
-       dst_release(skb->dst);
-
-       skb->dst = dst;
-       return 0;
-}
-#endif
-
-static inline int ip6_maybe_reroute(struct sk_buff *skb)
-{
-#ifdef CONFIG_NETFILTER
-       if (skb->nfcache & NFC_ALTERED){
-               if (ip6_route_me_harder(skb) != 0){
-                       kfree_skb(skb);
-                       return -EINVAL;
-               }
-       }
-#endif /* CONFIG_NETFILTER */
-       return dst_output(skb);
+               return ip6_output2(skb);
 }
 
 /*
@@ -207,13 +161,13 @@ static inline int ip6_maybe_reroute(struct sk_buff *skb)
 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
 {
-       struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
+       struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
-       int hlimit;
+       int hlimit, tclass;
        u32 mtu;
 
        if (opt) {
@@ -231,7 +185,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {      
-                               IP6_INC_STATS(OutDiscards);
+                               IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                return -ENOBUFS;
                        }
                        if (sk)
@@ -249,12 +203,21 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
         *      Fill in the IPv6 header
         */
 
-       *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
        hlimit = -1;
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = dst_metric(dst, RTAX_HOPLIMIT);
+       if (hlimit < 0)
+               hlimit = ipv6_get_hoplimit(dst->dev);
+
+       tclass = -1;
+       if (np)
+               tclass = np->tclass;
+       if (tclass < 0)
+               tclass = 0;
+
+       *(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
 
        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
@@ -263,17 +226,20 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);
 
-       mtu = dst_pmtu(dst);
+       skb->priority = sk->sk_priority;
+
+       mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
-               IP6_INC_STATS(OutRequests);
-               return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
+               IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
+               return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
+                               dst_output);
        }
 
        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-       IP6_INC_STATS(FragFails);
+       IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
 }
@@ -313,7 +279,7 @@ int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
        return 0;
 }
 
-int ip6_call_ra_chain(struct sk_buff *skb, int sel)
+static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 {
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;
@@ -321,7 +287,9 @@ int ip6_call_ra_chain(struct sk_buff *skb, int sel)
        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
-               if (sk && ra->sel == sel) {
+               if (sk && ra->sel == sel &&
+                   (!sk->sk_bound_dev_if ||
+                    sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
@@ -355,7 +323,7 @@ int ip6_forward(struct sk_buff *skb)
                goto error;
 
        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
-               IP6_INC_STATS(InDiscards);
+               IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
 
@@ -394,9 +362,10 @@ int ip6_forward(struct sk_buff *skb)
        }
 
        if (!xfrm6_route_forward(skb)) {
-               IP6_INC_STATS(InDiscards);
+               IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
+       dst = skb->dst;
 
        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
@@ -428,18 +397,18 @@ int ip6_forward(struct sk_buff *skb)
                goto error;
        }
 
-       if (skb->len > dst_pmtu(dst)) {
+       if (skb->len > dst_mtu(dst)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
-               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_pmtu(dst), skb->dev);
-               IP6_INC_STATS_BH(InTooBigErrors);
-               IP6_INC_STATS_BH(FragFails);
+               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
+               IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
+               IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }
 
        if (skb_cow(skb, dst->dev->hard_header_len)) {
-               IP6_INC_STATS(OutDiscards);
+               IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }
 
@@ -449,11 +418,11 @@ int ip6_forward(struct sk_buff *skb)
  
        hdr->hop_limit--;
 
-       IP6_INC_STATS_BH(OutForwDatagrams);
+       IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
 
 error:
-       IP6_INC_STATS_BH(InAddrErrors);
+       IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
 drop:
        kfree_skb(skb);
        return -EINVAL;
@@ -464,7 +433,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
-       to->security = from->security;
+       dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;
 
@@ -474,16 +443,20 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 #ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
+       nf_conntrack_put(to->nfct);
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
+       to->nfctinfo = from->nfctinfo;
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+       nf_conntrack_put_reasm(to->nfct_reasm);
+       to->nfct_reasm = from->nfct_reasm;
+       nf_conntrack_get_reasm(to->nfct_reasm);
+#endif
 #ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
 #endif
-#ifdef CONFIG_NETFILTER_DEBUG
-       to->nf_debug = from->nf_debug;
-#endif
 #endif
 }
 
@@ -516,11 +489,12 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
        return offset;
 }
 
-static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
+static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
        struct net_device *dev;
-       struct sk_buff *frag, *skb = *pskb;
+       struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info*)skb->dst;
+       struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
@@ -532,7 +506,12 @@ static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;
 
-       mtu = dst_pmtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
+       mtu = dst_mtu(&rt->u.dst);
+       if (np && np->frag_size < mtu) {
+               if (np->frag_size)
+                       mtu = np->frag_size;
+       }
+       mtu -= hlen + sizeof(struct frag_hdr);
 
        if (skb_shinfo(skb)->frag_list) {
                int first_len = skb_pagelen(skb);
@@ -549,24 +528,28 @@ static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
                            skb_headroom(frag) < hlen)
                            goto slow_path;
 
-                       /* Correct socket ownership. */
-                       if (frag->sk == NULL)
-                               goto slow_path;
-
                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;
+
+                       BUG_ON(frag->sk);
+                       if (skb->sk) {
+                               sock_hold(skb->sk);
+                               frag->sk = skb->sk;
+                               frag->destructor = sock_wfree;
+                               skb->truesize -= frag->truesize;
+                       }
                }
 
                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
-               skb_shinfo(skb)->frag_list = 0;
+               skb_shinfo(skb)->frag_list = NULL;
                /* BUILD HEADER */
 
                tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
-                       IP6_INC_STATS(FragFails);
+                       IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }
 
@@ -593,6 +576,7 @@ static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
+                               frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
                                frag->nh.raw = __skb_push(frag, hlen);
@@ -607,23 +591,20 @@ static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
                                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }
-                       err = output(pskb);
-                       if (err || !frag) {
-                               if (unlikely(skb != *pskb))
-                                       skb = *pskb;
-                               break;
-                       }
                        
+                       err = output(skb);
+                       if (err || !frag)
+                               break;
+
                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }
 
-               if (tmp_hdr)
-                       kfree(tmp_hdr);
+               kfree(tmp_hdr);
 
                if (err == 0) {
-                       IP6_INC_STATS(FragOKs);
+                       IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }
 
@@ -633,7 +614,7 @@ static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
                        frag = skb;
                }
 
-               IP6_INC_STATS(FragFails);
+               IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }
 
@@ -665,8 +646,8 @@ slow_path:
                 */
 
                if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
-                       NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
-                       IP6_INC_STATS(FragFails);
+                       NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
+                       IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }
@@ -699,7 +680,7 @@ slow_path:
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
-               if (frag_id) {
+               if (!frag_id) {
                        ipv6_select_ident(skb, fh);
                        frag_id = fh->identification;
                } else
@@ -724,19 +705,19 @@ slow_path:
                 *      Put this fragment into the sending queue.
                 */
 
-               IP6_INC_STATS(FragCreates);
+               IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 
-               err = output(&frag);
+               err = output(frag);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
-       IP6_INC_STATS(FragOKs);
+       IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;
 
 fail:
        kfree_skb(skb); 
-       IP6_INC_STATS(FragFails);
+       IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
 }
 
@@ -748,36 +729,37 @@ int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
        if (sk) {
                struct ipv6_pinfo *np = inet6_sk(sk);
        
-               *dst = __sk_dst_check(sk, np->dst_cookie);
+               *dst = sk_dst_check(sk, np->dst_cookie);
                if (*dst) {
                        struct rt6_info *rt = (struct rt6_info*)*dst;
        
-                               /* Yes, checking route validity in not connected
-                                  case is not very simple. Take into account,
-                                  that we do not support routing by source, TOS,
-                                  and MSG_DONTROUTE            --ANK (980726)
-       
-                                  1. If route was host route, check that
-                                     cached destination is current.
-                                     If it is network route, we still may
-                                     check its validity using saved pointer
-                                     to the last used address: daddr_cache.
-                                     We do not want to save whole address now,
-                                     (because main consumer of this service
-                                      is tcp, which has not this problem),
-                                     so that the last trick works only on connected
-                                     sockets.
-                                  2. oif also should be the same.
-                                */
-       
+                       /* Yes, checking route validity in not connected
+                        * case is not very simple. Take into account,
+                        * that we do not support routing by source, TOS,
+                        * and MSG_DONTROUTE            --ANK (980726)
+                        *
+                        * 1. If route was host route, check that
+                        *    cached destination is current.
+                        *    If it is network route, we still may
+                        *    check its validity using saved pointer
+                        *    to the last used address: daddr_cache.
+                        *    We do not want to save whole address now,
+                        *    (because main consumer of this service
+                        *    is tcp, which has not this problem),
+                        *    so that the last trick works only on connected
+                        *    sockets.
+                        * 2. oif also should be the same.
+                        */
                        if (((rt->rt6i_dst.plen != 128 ||
-                             ipv6_addr_cmp(&fl->fl6_dst, &rt->rt6i_dst.addr))
+                             !ipv6_addr_equal(&fl->fl6_dst,
+                                              &rt->rt6i_dst.addr))
                             && (np->daddr_cache == NULL ||
-                                ipv6_addr_cmp(&fl->fl6_dst, np->daddr_cache)))
+                                !ipv6_addr_equal(&fl->fl6_dst,
+                                                 np->daddr_cache)))
                            || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
+                               dst_release(*dst);
                                *dst = NULL;
-                       } else
-                               dst_hold(*dst);
+                       }
                }
        }
 
@@ -790,18 +772,9 @@ int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
        if (ipv6_addr_any(&fl->fl6_src)) {
                err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
 
-               if (err) {
-#if IP6_DEBUG >= 2
-                       printk(KERN_DEBUG "ip6_dst_lookup: "
-                              "no available source address\n");
-#endif
+               if (err)
                        goto out_err_release;
-               }
        }
-       if ((err = xfrm_lookup(dst, fl, sk, 0)) < 0) {
-               err = -ENETUNREACH;
-               goto out_err_release;
-        }
 
        return 0;
 
@@ -811,19 +784,82 @@ out_err_release:
        return err;
 }
 
-int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
-                   void *from, int length, int transhdrlen,
-                   int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
-                   unsigned int flags)
+EXPORT_SYMBOL_GPL(ip6_dst_lookup);
+
+static inline int ip6_ufo_append_data(struct sock *sk,
+                       int getfrag(void *from, char *to, int offset, int len,
+                       int odd, struct sk_buff *skb),
+                       void *from, int length, int hh_len, int fragheaderlen,
+                       int transhdrlen, int mtu,unsigned int flags)
+
 {
-       struct inet_opt *inet = inet_sk(sk);
+       struct sk_buff *skb;
+       int err;
+
+       /* There is support for UDP large send offload by network
+        * device, so create one single skb packet containing complete
+        * udp datagram; returns 0 once queued, else the error code
+        */
+       if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
+               skb = sock_alloc_send_skb(sk,
+                       hh_len + fragheaderlen + transhdrlen + 20,
+                       (flags & MSG_DONTWAIT), &err);
+               if (skb == NULL)
+                       return err;     /* keep the allocator's error code */
+
+               /* reserve space for Hardware header */
+               skb_reserve(skb, hh_len);
+
+               /* create space for UDP/IP header */
+               skb_put(skb,fragheaderlen + transhdrlen);
+
+               /* initialize network header pointer */
+               skb->nh.raw = skb->data;
+
+               /* initialize protocol header pointer */
+               skb->h.raw = skb->data + fragheaderlen;
+
+               skb->ip_summed = CHECKSUM_HW;
+               skb->csum = 0;
+               sk->sk_sndmsg_off = 0;
+       }
+
+       err = skb_append_datato_frags(sk,skb, getfrag, from,
+                                     (length - transhdrlen));
+       if (!err) {
+               struct frag_hdr fhdr;
+
+               /* specify the length of each IP datagram fragment */
+               skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
+                                               sizeof(struct frag_hdr);
+               ipv6_select_ident(skb, &fhdr);
+               skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
+               __skb_queue_tail(&sk->sk_write_queue, skb);
+
+               return 0;
+       }
+       /* There is not enough support to do UDP LSO, so follow normal path.
+        * NOTE(review): skb may be the peeked queue tail; verify this free.
+        */
+       kfree_skb(skb);
+
+       return err;
+}
+
+int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
+       int offset, int len, int odd, struct sk_buff *skb),
+       void *from, int length, int transhdrlen,
+       int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
+       struct rt6_info *rt, unsigned int flags)
+{
+       struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
-       int copy = 0;
+       int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;
@@ -852,10 +888,18 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
                np->cork.rt = rt;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
-               inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
+               np->cork.tclass = tclass;
+               mtu = dst_mtu(rt->u.dst.path);
+               if (np->frag_size < mtu) {
+                       if (np->frag_size)
+                               mtu = np->frag_size;
+               }
+               inet->cork.fragsize = mtu;
+               if (dst_allfrag(rt->u.dst.path))
+                       inet->cork.flags |= IPCORK_ALLFRAG;
                inet->cork.length = 0;
-               inet->sndmsg_page = NULL;
-               inet->sndmsg_off = 0;
+               sk->sk_sndmsg_page = NULL;
+               sk->sk_sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
@@ -881,29 +925,89 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
                }
        }
 
+       /*
+        * Let's try using as much space as possible.
+        * Use MTU if total length of the message fits into the MTU.
+        * Otherwise, we need to reserve fragment header and
+        * fragment alignment (= 8-15 octets, in total).
+        *
+        * Note that we may need to "move" the data from the tail of
+        * the buffer to the new fragment when we split
+        * the message.
+        *
+        * FIXME: It may be fragmented into multiple chunks 
+        *        at once if non-fragmentable extension headers
+        *        are too large.
+        * --yoshfuji 
+        */
+
        inet->cork.length += length;
+       if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
+           (rt->u.dst.dev->features & NETIF_F_UFO)) {
+
+               err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
+                                         fragheaderlen, transhdrlen, mtu,
+                                         flags);
+               if (err)
+                       goto error;
+               return 0;
+       }
 
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;
 
        while (length > 0) {
-               if ((copy = maxfraglen - skb->len) <= 0) {
+               /* Check if the remaining data fits into current packet. */
+               copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
+               if (copy < length)
+                       copy = maxfraglen - skb->len;
+
+               if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
+                       unsigned int fraggap;
                        unsigned int alloclen;
-                       BUG_TRAP(copy == 0);
+                       struct sk_buff *skb_prev;
 alloc_new_skb:
-                       datalen = maxfraglen - fragheaderlen;
-                       if (datalen > length)
-                               datalen = length;
+                       skb_prev = skb;
+
+                       /* There's no room in the current skb */
+                       if (skb_prev)
+                               fraggap = skb_prev->len - maxfraglen;
+                       else
+                               fraggap = 0;
+
+                       /*
+                        * If remaining data exceeds the mtu,
+                        * we know we need more fragment(s).
+                        */
+                       datalen = length + fraggap;
+                       if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
+                               datalen = maxfraglen - fragheaderlen;
+
                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
-                               alloclen = maxfraglen;
+                               alloclen = mtu;
                        else
-                               alloclen = fraglen;
+                               alloclen = datalen + fragheaderlen;
+
+                       /*
+                        * The last fragment gets additional space at tail.
+                        * Note: we overallocate on fragments with MSG_MORE
+                        * because we have no idea if we're the last one.
+                        */
+                       if (datalen == length + fraggap)
+                               alloclen += rt->u.dst.trailer_len;
+
+                       /*
+                        * We just reserve space for fragment header.
+                        * Note: this may be overallocation if the message 
+                        * (without MSG_MORE) fits into the MTU.
+                        */
                        alloclen += sizeof(struct frag_hdr);
+
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
@@ -925,7 +1029,7 @@ alloc_new_skb:
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
-                       /* reserve 8 byte for fragmentation */
+                       /* reserve for fragmentation */
                        skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
 
                        /*
@@ -935,15 +1039,29 @@ alloc_new_skb:
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;
-                       copy = datalen - transhdrlen;
-                       if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
+
+                       if (fraggap) {
+                               skb->csum = skb_copy_and_csum_bits(
+                                       skb_prev, maxfraglen,
+                                       data + transhdrlen, fraggap, 0);
+                               skb_prev->csum = csum_sub(skb_prev->csum,
+                                                         skb->csum);
+                               data += fraggap;
+                               skb_trim(skb_prev, maxfraglen);
+                       }
+                       copy = datalen - transhdrlen - fraggap;
+                       if (copy < 0) {
+                               err = -EINVAL;
+                               kfree_skb(skb);
+                               goto error;
+                       } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }
 
                        offset += copy;
-                       length -= datalen;
+                       length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;
@@ -971,8 +1089,8 @@ alloc_new_skb:
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
-                       struct page *page = inet->sndmsg_page;
-                       int off = inet->sndmsg_off;
+                       struct page *page = sk->sk_sndmsg_page;
+                       int off = sk->sk_sndmsg_off;
                        unsigned int left;
 
                        if (page && (left = PAGE_SIZE - off) > 0) {
@@ -984,7 +1102,7 @@ alloc_new_skb:
                                                goto error;
                                        }
                                        get_page(page);
-                                       skb_fill_page_desc(skb, i, page, inet->sndmsg_off, 0);
+                                       skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if(i < MAX_SKB_FRAGS) {
@@ -995,8 +1113,8 @@ alloc_new_skb:
                                        err = -ENOMEM;
                                        goto error;
                                }
-                               inet->sndmsg_page = page;
-                               inet->sndmsg_off = 0;
+                               sk->sk_sndmsg_page = page;
+                               sk->sk_sndmsg_off = 0;
 
                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
@@ -1010,7 +1128,7 @@ alloc_new_skb:
                                err = -EFAULT;
                                goto error;
                        }
-                       inet->sndmsg_off += copy;
+                       sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
@@ -1021,7 +1139,7 @@ alloc_new_skb:
        return 0;
 error:
        inet->cork.length -= length;
-       IP6_INC_STATS(OutDiscards);
+       IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
 }
 
@@ -1030,7 +1148,7 @@ int ip6_push_pending_frames(struct sock *sk)
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
-       struct inet_opt *inet = inet_sk(sk);
+       struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
@@ -1052,12 +1170,10 @@ int ip6_push_pending_frames(struct sock *sk)
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
-#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
-#endif
        }
 
        ipv6_addr_copy(final_dst, &fl->fl6_dst);
@@ -1069,7 +1185,8 @@ int ip6_push_pending_frames(struct sock *sk)
 
        skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
        
-       *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
+       *(u32*)hdr = fl->fl6_flowlabel |
+                    htonl(0x60000000 | ((int)np->cork.tclass << 20));
 
        if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
                hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
@@ -1080,25 +1197,26 @@ int ip6_push_pending_frames(struct sock *sk)
        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, final_dst);
 
+       skb->priority = sk->sk_priority;
+
        skb->dst = dst_clone(&rt->u.dst);
-       IP6_INC_STATS(OutRequests);     
+       IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
-                       err = inet->recverr ? net_xmit_errno(err) : 0;
+                       err = np->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }
 
 out:
        inet->cork.flags &= ~IPCORK_OPT;
-       if (np->cork.opt) {
-               kfree(np->cork.opt);
-               np->cork.opt = NULL;
-       }
+       kfree(np->cork.opt);
+       np->cork.opt = NULL;
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
+               inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
        return err;
@@ -1108,24 +1226,23 @@ error:
 
 void ip6_flush_pending_frames(struct sock *sk)
 {
-       struct inet_opt *inet = inet_sk(sk);
+       struct inet_sock *inet = inet_sk(sk);   /* holds the cork flags and flow reset below */
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
 
        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
-               IP6_INC_STATS(OutDiscards);
+               IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); /* one discard per dropped skb */
                kfree_skb(skb);
        }
 
        inet->cork.flags &= ~IPCORK_OPT;
 
-       if (np->cork.opt) {
-               kfree(np->cork.opt);
-               np->cork.opt = NULL;
-       }
+       kfree(np->cork.opt);    /* unconditional: kfree() accepts NULL */
+       np->cork.opt = NULL;    /* do not leave a pointer to freed options */
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
+               inet->cork.flags &= ~IPCORK_ALLFRAG;    /* cleared together with the corked route */
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
 }