2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
10 * Based on linux/net/ipv4/ip_output.c
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 * A.N.Kuznetsov : arithmetics in fragmentation.
19 * extension headers are implemented.
20 * route changes now work.
21 * ip6_forward does not confuse sniffers.
24 * H. von Brand : Added missing #include <linux/string.h>
25 * Imran Patel : frag id should be in NBO
26 * Kazunori MIYAZAWA @USAGI
27 * : add ip6_append_data and related functions
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
57 #include <net/checksum.h>
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * ipv6_select_ident - pick the Identification value for a fragment header.
 *
 * A single global counter is shared by every flow; ip6_id_lock serializes
 * the read-and-increment.  The counter skips 0 when it wraps, so an id of
 * 0 is never emitted.  The value is stored in network byte order (htonl),
 * per the "frag id should be in NBO" fix noted in the file header.
 */
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
63 static u32 ipv6_fragmentation_id = 1;
64 static spinlock_t ip6_id_lock = SPIN_LOCK_UNLOCKED;
66 spin_lock_bh(&ip6_id_lock);
67 fhdr->identification = htonl(ipv6_fragmentation_id);
/* 0 is skipped: restart at 1 when the counter wraps. */
68 if (++ipv6_fragmentation_id == 0)
69 ipv6_fragmentation_id = 1;
70 spin_unlock_bh(&ip6_id_lock);
/*
 * ip6_output_finish - hand a routed packet to the link layer.
 *
 * If the destination carries a cached hard header (dst->hh), copy it in
 * front of the data under hh_lock and transmit via hh_output();
 * otherwise fall back to the neighbour's output routine.  With neither
 * available the packet cannot go out and OUTNOROUTES is counted.
 */
73 static inline int ip6_output_finish(struct sk_buff *skb)
76 struct dst_entry *dst = skb->dst;
77 struct hh_cache *hh = dst->hh;
/* Cached link-layer header: copy it ahead of skb->data under the lock. */
82 read_lock_bh(&hh->hh_lock);
83 hh_alen = HH_DATA_ALIGN(hh->hh_len);
84 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85 read_unlock_bh(&hh->hh_lock);
86 skb_push(skb, hh->hh_len);
87 return hh->hh_output(skb);
88 } else if (dst->neighbour)
89 return dst->neighbour->output(skb);
/* No way to resolve the next hop: account it as "no route". */
91 IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
97 /* dev_loopback_xmit for use with netfilter. */
/*
 * Reinject a cloned multicast packet back into the local stack: point
 * mac.raw at the current data, pull up to the network header, and mark
 * the skb as loopback with no checksum verification needed.  A dst must
 * already be attached (BUG_TRAP below).
 */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
100 newskb->mac.raw = newskb->data;
101 __skb_pull(newskb, newskb->nh.raw - newskb->data);
102 newskb->pkt_type = PACKET_LOOPBACK;
103 newskb->ip_summed = CHECKSUM_UNNECESSARY;
104 BUG_TRAP(newskb->dst);
/*
 * ip6_output2 - post-routing output for a single (non-fragmented) packet.
 *
 * For multicast destinations that the device has joined (and unless the
 * sending socket disabled mc_loop), a clone is looped back to the local
 * stack through the POST_ROUTING hook before the original is sent.
 * Multicast packets with hop_limit 0 are discarded.  Everything exits
 * through the NF_IP6_POST_ROUTING hook into ip6_output_finish().
 */
111 static int ip6_output2(struct sk_buff *skb)
113 struct dst_entry *dst = skb->dst;
114 struct net_device *dev = dst->dev;
116 skb->protocol = htons(ETH_P_IPV6);
119 if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
/* Loop a copy back locally when we are a member of the group. */
122 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123 ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124 &skb->nh.ipv6h->saddr)) {
125 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
127 /* Do not check for IFF_ALLMULTI; multicast routing
128 is not supported in any case.
131 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
133 ip6_dev_loopback_xmit);
/* hop_limit 0 on a multicast packet: drop rather than transmit. */
135 if (skb->nh.ipv6h->hop_limit == 0) {
136 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
142 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
145 return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
/*
 * ip6_output - dst_output handler: fragment when the packet exceeds the
 * path MTU, otherwise send it straight through ip6_output2().
 */
148 int ip6_output(struct sk_buff *skb)
150 if (skb->len > dst_pmtu(skb->dst))
151 return ip6_fragment(skb, ip6_output2);
153 return ip6_output2(skb);
156 #ifdef CONFIG_NETFILTER
/*
 * ip6_route_me_harder - redo the routing decision for a packet whose
 * addressing was changed by netfilter.  A fresh flow is built from the
 * current IPv6 header and the socket's bound interface; on lookup
 * failure OUTNOROUTES is counted.  The old dst reference is dropped
 * before the new route is installed.
 */
157 int ip6_route_me_harder(struct sk_buff *skb)
159 struct ipv6hdr *iph = skb->nh.ipv6h;
160 struct dst_entry *dst;
162 .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
165 { .daddr = iph->daddr,
166 .saddr = iph->saddr, } },
167 .proto = iph->nexthdr,
170 dst = ip6_route_output(skb->sk, &fl);
173 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
175 printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
180 /* Drop old route. */
181 dst_release(skb->dst);
/*
 * ip6_maybe_reroute - LOCAL_OUT continuation: if netfilter marked the
 * skb as altered (NFC_ALTERED), redo routing via ip6_route_me_harder()
 * before handing the packet to dst_output().
 */
188 static inline int ip6_maybe_reroute(struct sk_buff *skb)
190 #ifdef CONFIG_NETFILTER
191 if (skb->nfcache & NFC_ALTERED){
192 if (ip6_route_me_harder(skb) != 0){
197 #endif /* CONFIG_NETFILTER */
198 return dst_output(skb);
202 * xmit an sk_buff (used by TCP)
/*
 * Build the IPv6 header (and any extension headers from @opt) in front
 * of the payload already in @skb, then transmit through the
 * NF_IP6_LOCAL_OUT netfilter hook.  Headroom is grown with
 * skb_realloc_headroom() when the options plus link-layer header do not
 * fit.  If the final packet exceeds the path MTU and @ipfragok is not
 * set, an ICMPV6_PKT_TOOBIG is sent back to ourselves and the packet is
 * dropped.
 */
205 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
206 struct ipv6_txoptions *opt, int ipfragok)
208 struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
209 struct in6_addr *first_hop = &fl->fl6_dst;
210 struct dst_entry *dst = skb->dst;
212 u8 proto = fl->proto;
213 int seg_len = skb->len;
220 /* First: exthdrs may take lots of space (~8K for now)
221 MAX_HEADER is not enough.
223 head_room = opt->opt_nflen + opt->opt_flen;
224 seg_len += head_room;
225 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
/* Not enough headroom: reallocate and keep write ownership with sk. */
227 if (skb_headroom(skb) < head_room) {
228 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
232 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
236 skb_set_owner_w(skb, sk);
/* Push extension headers; proto becomes the outermost nexthdr, and a
   routing header may redirect first_hop. */
239 ipv6_push_frag_opts(skb, opt, &proto);
241 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
244 hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
247 * Fill in the IPv6 header
/* Version 6 plus the caller's flow label. */
250 *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
253 hlimit = np->hop_limit;
255 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
257 hdr->payload_len = htons(seg_len);
258 hdr->nexthdr = proto;
259 hdr->hop_limit = hlimit;
261 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
262 ipv6_addr_copy(&hdr->daddr, first_hop);
/* Fits the MTU (or the caller permits fragmentation): send it. */
265 if ((skb->len <= mtu) || ipfragok) {
266 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
267 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
/* Too big: notify ourselves so the socket learns the path MTU. */
271 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
273 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
274 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
280 * To avoid extra problems ND packets are sent through this
281 * routine. It's code duplication but I really want to avoid
282 * extra checks since ipv6_build_header is used by TCP (which
283 * is for us performance critical)
/*
 * ip6_nd_hdr - build a bare IPv6 header for neighbour-discovery
 * packets: version field, payload length, nexthdr, the socket's
 * hop_limit, and the given source/destination addresses.  No extension
 * headers and a zero flow label.
 */
286 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
287 struct in6_addr *saddr, struct in6_addr *daddr,
290 struct ipv6_pinfo *np = inet6_sk(sk);
294 skb->protocol = htons(ETH_P_IPV6);
297 totlen = len + sizeof(struct ipv6hdr);
299 hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
/* Version 6, zero traffic class and flow label. */
302 *(u32*)hdr = htonl(0x60000000);
304 hdr->payload_len = htons(len);
305 hdr->nexthdr = proto;
306 hdr->hop_limit = np->hop_limit;
308 ipv6_addr_copy(&hdr->saddr, saddr);
309 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * ip6_call_ra_chain - deliver a Router Alert packet to every raw socket
 * registered for selector @sel on ip6_ra_chain.  Each earlier match
 * receives a clone; the final match consumes @skb itself.  Returns
 * nonzero when at least one socket took the packet.
 */
314 int ip6_call_ra_chain(struct sk_buff *skb, int sel)
316 struct ip6_ra_chain *ra;
317 struct sock *last = NULL;
319 read_lock(&ip6_ra_lock);
320 for (ra = ip6_ra_chain; ra; ra = ra->next) {
321 struct sock *sk = ra->sk;
322 if (sk && ra->sel == sel) {
/* A previous match exists: feed it a clone, keep skb for the last one. */
324 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
326 rawv6_rcv(last, skb2);
/* The last matching socket gets the original skb. */
333 rawv6_rcv(last, skb);
334 read_unlock(&ip6_ra_lock);
337 read_unlock(&ip6_ra_lock);
/* Final step of forwarding: hand the packet to dst_output(). */
341 static inline int ip6_forward_finish(struct sk_buff *skb)
343 return dst_output(skb);
/*
 * ip6_forward - forward a received packet towards dst->dev.
 *
 * Checks, in order: forwarding enabled, XFRM forward policy, Router
 * Alert delivery via ip6_call_ra_chain(), hop limit > 1 (else
 * ICMPV6_TIME_EXCEED), XFRM routing, a possible redirect when the
 * packet would leave through the interface it arrived on, sanity of
 * multicast/loopback/link-local source addresses, and the path MTU
 * (else ICMPV6_PKT_TOOBIG).  The hop limit is decremented only after
 * skb_cow(), and the packet leaves through the NF_IP6_FORWARD hook.
 */
346 int ip6_forward(struct sk_buff *skb)
348 struct dst_entry *dst = skb->dst;
349 struct ipv6hdr *hdr = skb->nh.ipv6h;
350 struct inet6_skb_parm *opt = IP6CB(skb);
352 if (ipv6_devconf.forwarding == 0)
355 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
356 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
/* Any received checksum covers the old hop count; invalidate it. */
360 skb->ip_summed = CHECKSUM_NONE;
363 * We DO NOT make any processing on
364 * RA packets, pushing them to user level AS IS
365 * without any warranty that application will be able
366 * to interpret them. The reason is that we
367 * cannot make anything clever here.
369 * We are not end-node, so that if packet contains
370 * AH/ESP, we cannot make anything.
371 * Defragmentation also would be mistake, RA packets
372 * cannot be fragmented, because there is no warranty
373 * that different fragments will go along one path. --ANK
376 u8 *ptr = skb->nh.raw + opt->ra;
377 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
382 * check and decrement ttl
384 if (hdr->hop_limit <= 1) {
385 /* Force OUTPUT device used as source address */
387 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
394 if (!xfrm6_route_forward(skb)) {
395 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
399 /* IPv6 specs say nothing about it, but it is clear that we cannot
400 send redirects to source routed frames.
402 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
403 struct in6_addr *target = NULL;
405 struct neighbour *n = dst->neighbour;
408 * incoming and outgoing devices are the same
412 rt = (struct rt6_info *) dst;
/* Redirect target: the gateway for indirect routes, else the final
   destination itself. */
413 if ((rt->rt6i_flags & RTF_GATEWAY))
414 target = (struct in6_addr*)&n->primary_key;
416 target = &hdr->daddr;
418 /* Limit redirects both by destination (here)
419 and by source (inside ndisc_send_redirect)
421 if (xrlim_allow(dst, 1*HZ))
422 ndisc_send_redirect(skb, n, target);
423 } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
424 |IPV6_ADDR_LINKLOCAL)) {
425 /* This check is security critical. */
/* Packet larger than the path MTU: bounce a Packet Too Big. */
429 if (skb->len > dst_pmtu(dst)) {
430 /* Again, force OUTPUT device used as source address */
432 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_pmtu(dst), skb->dev);
433 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
434 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
439 if (skb_cow(skb, dst->dev->hard_header_len)) {
440 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
446 /* Mangling hops number delayed to point after skb COW */
450 IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
451 return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
454 IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - copy per-packet metadata from @from to @to when
 * building fragments: packet type, priority, protocol, security label,
 * dst reference, traffic-control index, and the netfilter state (mark,
 * conntrack, bridge info, debug flags).
 */
460 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
462 to->pkt_type = from->pkt_type;
463 to->priority = from->priority;
464 to->protocol = from->protocol;
465 to->security = from->security;
/* Replace any dst on the target with a fresh ref on the source's. */
466 dst_release(to->dst);
467 to->dst = dst_clone(from->dst);
470 #ifdef CONFIG_NET_SCHED
471 to->tc_index = from->tc_index;
473 #ifdef CONFIG_NETFILTER
474 to->nfmark = from->nfmark;
475 /* Connection association is same as pre-frag packet */
476 to->nfct = from->nfct;
477 nf_conntrack_get(to->nfct);
478 to->nfctinfo = from->nfctinfo;
479 #ifdef CONFIG_BRIDGE_NETFILTER
480 nf_bridge_put(to->nf_bridge);
481 to->nf_bridge = from->nf_bridge;
482 nf_bridge_get(to->nf_bridge);
484 #ifdef CONFIG_NETFILTER_DEBUG
485 to->nf_debug = from->nf_debug;
/*
 * ip6_find_1stfragopt - locate where a Fragment header must be inserted.
 *
 * Walks the extension-header chain starting after the fixed IPv6
 * header.  On return, *@nexthdr points at the nexthdr byte to patch and
 * the returned offset marks the start of the fragmentable part (a
 * Destination Options header only terminates the walk once a Routing
 * header has been seen).
 */
490 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
492 u16 offset = sizeof(struct ipv6hdr);
493 struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
494 unsigned int packet_len = skb->tail - skb->nh.raw;
496 *nexthdr = &skb->nh.ipv6h->nexthdr;
498 while (offset + 1 <= packet_len) {
503 case NEXTHDR_ROUTING:
505 if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
506 if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
/* Advance to the next extension header in the chain. */
507 offset += ipv6_optlen(exthdr);
508 *nexthdr = &exthdr->nexthdr;
509 exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
/*
 * ip6_fragment - split a too-large packet into fragments and push each
 * one through @output.
 *
 * Fast path: when the skb already carries a frag_list of properly
 * sized, unshared, correctly-owned sub-buffers, a Fragment header is
 * inserted in front of each piece in place.  Slow path: fresh skbs are
 * allocated and the payload is copied out in mtu-sized chunks with
 * skb_copy_bits().  FRAGOKS / FRAGFAILS / FRAGCREATES counters track
 * the outcome.
 */
519 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
521 struct net_device *dev;
522 struct sk_buff *frag;
523 struct rt6_info *rt = (struct rt6_info*)skb->dst;
524 struct ipv6hdr *tmp_hdr;
526 unsigned int mtu, hlen, left, len;
528 int ptr, offset = 0, err=0;
529 u8 *prevhdr, nexthdr = 0;
532 hlen = ip6_find_1stfragopt(skb, &prevhdr);
/* Payload space left per fragment once the frag header is added. */
535 mtu = dst_pmtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
537 if (skb_shinfo(skb)->frag_list) {
538 int first_len = skb_pagelen(skb);
/* Fast path only if every piece has correct geometry and ownership. */
540 if (first_len - hlen > mtu ||
541 ((first_len - hlen) & 7) ||
545 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
546 /* Correct geometry. */
547 if (frag->len > mtu ||
548 ((frag->len & 7) && frag->next) ||
549 skb_headroom(frag) < hlen)
552 /* Correct socket ownership. */
553 if (frag->sk == NULL)
556 /* Partially cloned skb? */
557 if (skb_shared(frag))
563 frag = skb_shinfo(skb)->frag_list;
564 skb_shinfo(skb)->frag_list = NULL;
/* Save the unfragmentable part so it can be replicated per fragment. */
567 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
569 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
573 *prevhdr = NEXTHDR_FRAGMENT;
574 memcpy(tmp_hdr, skb->nh.raw, hlen);
575 __skb_pull(skb, hlen);
576 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
577 skb->nh.raw = __skb_push(skb, hlen);
578 memcpy(skb->nh.raw, tmp_hdr, hlen);
580 ipv6_select_ident(skb, fh);
581 fh->nexthdr = nexthdr;
/* First fragment: offset 0, More Fragments set. */
583 fh->frag_off = htons(IP6_MF);
584 frag_id = fh->identification;
586 first_len = skb_pagelen(skb);
587 skb->data_len = first_len - skb_headlen(skb);
588 skb->len = first_len;
589 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
593 /* Prepare header of the next frame,
594 * before previous one went down. */
596 frag->h.raw = frag->data;
597 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
598 frag->nh.raw = __skb_push(frag, hlen);
599 memcpy(frag->nh.raw, tmp_hdr, hlen);
600 offset += skb->len - hlen - sizeof(struct frag_hdr);
601 fh->nexthdr = nexthdr;
603 fh->frag_off = htons(offset);
/* Every fragment except the last carries the More Fragments bit. */
604 if (frag->next != NULL)
605 fh->frag_off |= htons(IP6_MF);
606 fh->identification = frag_id;
607 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
608 ip6_copy_metadata(frag, skb);
624 IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
634 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
/* Slow path: copy the payload into freshly allocated fragments. */
639 left = skb->len - hlen; /* Space per frame */
640 ptr = hlen; /* Where to start from */
643 * Fragment the datagram.
646 *prevhdr = NEXTHDR_FRAGMENT;
649 * Keep copying data until we run out.
653 /* IF: it doesn't fit, use 'mtu' - the data space left */
656 /* IF: we are not sending upto and including the packet end
657 then align the next start on an eight byte boundary */
665 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
666 NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
667 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
673 * Set up data on packet
676 ip6_copy_metadata(frag, skb);
677 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
678 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
679 frag->nh.raw = frag->data;
680 fh = (struct frag_hdr*)(frag->data + hlen);
681 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
684 * Charge the memory for the fragment to any owner
688 skb_set_owner_w(frag, skb->sk);
691 * Copy the packet header into the new buffer.
693 memcpy(frag->nh.raw, skb->data, hlen);
696 * Build fragment header.
698 fh->nexthdr = nexthdr;
/* The first fragment picks the identification; the rest reuse it. */
701 ipv6_select_ident(skb, fh);
702 frag_id = fh->identification;
704 fh->identification = frag_id;
707 * Copy a block of the IP datagram.
709 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
713 fh->frag_off = htons(offset);
715 fh->frag_off |= htons(IP6_MF);
716 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
722 * Put this fragment into the sending queue.
725 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
732 IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
737 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
/*
 * ip6_dst_lookup - obtain a dst entry for flow @fl on socket @sk.
 *
 * A cached socket route is validated first (the host-route destination
 * or daddr_cache must still match, and oif must agree); otherwise a
 * fresh ip6_route_output() lookup is made.  If the flow has no source
 * address yet, one is selected with ipv6_get_saddr().  On error the dst
 * is released before returning.
 */
741 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
747 struct ipv6_pinfo *np = inet6_sk(sk);
749 *dst = sk_dst_check(sk, np->dst_cookie);
751 struct rt6_info *rt = (struct rt6_info*)*dst;
753 /* Yes, checking route validity in not connected
754 case is not very simple. Take into account,
755 that we do not support routing by source, TOS,
756 and MSG_DONTROUTE --ANK (980726)
758 1. If route was host route, check that
759 cached destination is current.
760 If it is network route, we still may
761 check its validity using saved pointer
762 to the last used address: daddr_cache.
763 We do not want to save whole address now,
764 (because main consumer of this service
765 is tcp, which has not this problem),
766 so that the last trick works only on connected
768 2. oif also should be the same.
771 if (((rt->rt6i_dst.plen != 128 ||
772 !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
773 && (np->daddr_cache == NULL ||
774 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
775 || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
/* No (valid) cached route for this flow: do a full lookup. */
783 *dst = ip6_route_output(sk, fl);
785 if ((err = (*dst)->error))
786 goto out_err_release;
/* No source address yet: pick one suitable for the destination. */
788 if (ipv6_addr_any(&fl->fl6_src)) {
789 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
793 printk(KERN_DEBUG "ip6_dst_lookup: "
794 "no available source address\n");
796 goto out_err_release;
/*
 * ip6_append_data - append user data to the socket's pending write
 * queue (corked output path).
 *
 * On the first call (empty write queue) the cork state — options copy,
 * route reference, hop limit, MTU — is recorded from the arguments;
 * later calls reuse it.  Data is packed into the tail skb while it
 * fits; otherwise new fragments up to maxfraglen are allocated with
 * sock_alloc_send_skb(), and page fragments are used when the device
 * advertises NETIF_F_SG.  @getfrag copies user data (and can compute a
 * checksum via the odd argument).  On error the pending length is
 * rolled back and OUTDISCARDS is counted.
 */
808 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
809 void *from, int length, int transhdrlen,
810 int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
813 struct inet_opt *inet = inet_sk(sk);
814 struct ipv6_pinfo *np = inet6_sk(sk);
816 unsigned int maxfraglen, fragheaderlen;
823 int csummode = CHECKSUM_NONE;
/* First chunk for this cork: record options and routing state. */
827 if (skb_queue_empty(&sk->sk_write_queue)) {
832 if (np->cork.opt == NULL) {
833 np->cork.opt = kmalloc(opt->tot_len,
835 if (unlikely(np->cork.opt == NULL))
837 } else if (np->cork.opt->tot_len < opt->tot_len) {
838 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
841 memcpy(np->cork.opt, opt, opt->tot_len);
842 inet->cork.flags |= IPCORK_OPT;
843 /* need source address above miyazawa*/
845 dst_hold(&rt->u.dst);
848 np->cork.hop_limit = hlimit;
849 inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
850 inet->cork.length = 0;
851 sk->sk_sndmsg_page = NULL;
852 sk->sk_sndmsg_off = 0;
853 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
855 transhdrlen += exthdrlen;
/* Subsequent chunks: reuse the state recorded at cork time. */
859 if (inet->cork.flags & IPCORK_OPT)
863 mtu = inet->cork.fragsize;
866 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
868 fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
869 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
/* Refuse to grow the datagram past the IPv6 payload limit. */
871 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
872 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
873 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
879 * Let's try using as much space as possible.
880 * Use MTU if total length of the message fits into the MTU.
881 * Otherwise, we need to reserve fragment header and
882 * fragment alignment (= 8-15 octets, in total).
884 * Note that we may need to "move" the data from the tail
885 * of the buffer to the new fragment when we split
888 * FIXME: It may be fragmented into multiple chunks
889 * at once if non-fragmentable extension headers
894 inet->cork.length += length;
896 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
900 /* Check if the remaining data fits into current packet. */
901 copy = mtu - skb->len;
903 copy = maxfraglen - skb->len;
/* Current skb is full: allocate a new fragment skb. */
907 unsigned int datalen;
908 unsigned int fraglen;
909 unsigned int fraggap;
910 unsigned int alloclen;
911 struct sk_buff *skb_prev;
915 /* There's no room in the current skb */
917 fraggap = skb_prev->len - maxfraglen;
922 * If remaining data exceeds the mtu,
923 * we know we need more fragment(s).
925 datalen = length + fraggap;
926 if (datalen > mtu - fragheaderlen)
927 datalen = maxfraglen - fragheaderlen;
929 fraglen = datalen + fragheaderlen;
930 if ((flags & MSG_MORE) &&
931 !(rt->u.dst.dev->features&NETIF_F_SG))
934 alloclen = datalen + fragheaderlen;
937 * The last fragment gets additional space at tail.
938 * Note: we overallocate on fragments with MSG_MORE
939 * because we have no idea if we're the last one.
941 if (datalen == length + fraggap)
942 alloclen += rt->u.dst.trailer_len;
945 * We just reserve space for fragment header.
946 * Note: this may be overallocation if the message
947 * (without MSG_MORE) fits into the MTU.
949 alloclen += sizeof(struct frag_hdr);
952 skb = sock_alloc_send_skb(sk,
954 (flags & MSG_DONTWAIT), &err);
957 if (atomic_read(&sk->sk_wmem_alloc) <=
959 skb = sock_wmalloc(sk,
960 alloclen + hh_len, 1,
962 if (unlikely(skb == NULL))
968 * Fill in the control structures
970 skb->ip_summed = csummode;
972 /* reserve for fragmentation */
973 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
976 * Find where to start putting bytes
978 data = skb_put(skb, fraglen);
979 skb->nh.raw = data + exthdrlen;
980 data += fragheaderlen;
981 skb->h.raw = data + exthdrlen;
/* Move the 8-byte-alignment overhang from the previous skb into this
   one, keeping both skbs' checksums consistent. */
984 skb->csum = skb_copy_and_csum_bits(
985 skb_prev, maxfraglen,
986 data + transhdrlen, fraggap, 0);
987 skb_prev->csum = csum_sub(skb_prev->csum,
990 skb_trim(skb_prev, maxfraglen);
992 copy = datalen - transhdrlen - fraggap;
997 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1004 length -= datalen - fraggap;
/* Only the first skb carries the transport/extension headers. */
1007 csummode = CHECKSUM_NONE;
1010 * Put the packet on the pending queue
1012 __skb_queue_tail(&sk->sk_write_queue, skb);
/* No scatter/gather: copy straight into the skb's linear area. */
1019 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1023 if (getfrag(from, skb_put(skb, copy),
1024 offset, copy, off, skb) < 0) {
1025 __skb_trim(skb, off);
/* Scatter/gather: append the data into page fragments. */
1030 int i = skb_shinfo(skb)->nr_frags;
1031 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1032 struct page *page = sk->sk_sndmsg_page;
1033 int off = sk->sk_sndmsg_off;
/* Reuse the cached partially-filled page when space remains. */
1036 if (page && (left = PAGE_SIZE - off) > 0) {
1039 if (page != frag->page) {
1040 if (i == MAX_SKB_FRAGS) {
1045 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1046 frag = &skb_shinfo(skb)->frags[i];
1048 } else if(i < MAX_SKB_FRAGS) {
1049 if (copy > PAGE_SIZE)
1051 page = alloc_pages(sk->sk_allocation, 0);
1056 sk->sk_sndmsg_page = page;
1057 sk->sk_sndmsg_off = 0;
1059 skb_fill_page_desc(skb, i, page, 0, 0);
1060 frag = &skb_shinfo(skb)->frags[i];
1061 skb->truesize += PAGE_SIZE;
1062 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1067 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1071 sk->sk_sndmsg_off += copy;
1074 skb->data_len += copy;
/* Error path: undo the length accounting and count the discard. */
1081 inet->cork.length -= length;
1082 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_push_pending_frames - flush the corked write queue as one IPv6
 * datagram.
 *
 * All queued skbs are chained onto the first skb's frag_list, the IPv6
 * header (plus any corked extension headers) is pushed in front, and
 * the result is sent through NF_IP6_LOCAL_OUT / dst_output.  Datagrams
 * larger than IPV6_MAXPLEN get payload_len 0.  Cork state (options,
 * route reference, flow key) is cleared on the way out.
 */
1086 int ip6_push_pending_frames(struct sock *sk)
1088 struct sk_buff *skb, *tmp_skb;
1089 struct sk_buff **tail_skb;
1090 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1091 struct inet_opt *inet = inet_sk(sk);
1092 struct ipv6_pinfo *np = inet6_sk(sk);
1093 struct ipv6hdr *hdr;
1094 struct ipv6_txoptions *opt = np->cork.opt;
1095 struct rt6_info *rt = np->cork.rt;
1096 struct flowi *fl = &inet->cork.fl;
1097 unsigned char proto = fl->proto;
1100 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1102 tail_skb = &(skb_shinfo(skb)->frag_list);
1104 /* move skb->data to ip header from ext header */
1105 if (skb->data < skb->nh.raw)
1106 __skb_pull(skb, skb->nh.raw - skb->data);
/* Link every remaining queued skb onto the head skb's frag_list. */
1107 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1108 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1109 *tail_skb = tmp_skb;
1110 tail_skb = &(tmp_skb->next);
1111 skb->len += tmp_skb->len;
1112 skb->data_len += tmp_skb->len;
1113 #if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
1114 skb->truesize += tmp_skb->truesize;
1115 __sock_put(tmp_skb->sk);
1116 tmp_skb->destructor = NULL;
1121 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1122 __skb_pull(skb, skb->h.raw - skb->nh.raw);
/* Push corked extension headers; proto and final_dst may be rewritten. */
1123 if (opt && opt->opt_flen)
1124 ipv6_push_frag_opts(skb, opt, &proto);
1125 if (opt && opt->opt_nflen)
1126 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1128 skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1130 *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
/* payload_len is 0 when the datagram exceeds IPV6_MAXPLEN. */
1132 if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1133 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1135 hdr->payload_len = 0;
1136 hdr->hop_limit = np->cork.hop_limit;
1137 hdr->nexthdr = proto;
1138 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1139 ipv6_addr_copy(&hdr->daddr, final_dst);
1141 skb->dst = dst_clone(&rt->u.dst);
1142 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1143 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1146 err = inet->recverr ? net_xmit_errno(err) : 0;
/* Tear down the cork: options, route reference, flow key. */
1152 inet->cork.flags &= ~IPCORK_OPT;
1154 kfree(np->cork.opt);
1155 np->cork.opt = NULL;
1158 dst_release(&np->cork.rt->u.dst);
1161 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_flush_pending_frames - drop everything queued on the cork,
 * counting each skb as an output discard, then release the cork's
 * options, route reference and flow key.
 */
1167 void ip6_flush_pending_frames(struct sock *sk)
1169 struct inet_opt *inet = inet_sk(sk);
1170 struct ipv6_pinfo *np = inet6_sk(sk);
1171 struct sk_buff *skb;
1173 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1174 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1178 inet->cork.flags &= ~IPCORK_OPT;
1181 kfree(np->cork.opt);
1182 np->cork.opt = NULL;
1185 dst_release(&np->cork.rt->u.dst);
1188 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));