VServer 1.9.2 (patch-2.6.8.1-vs1.9.2.diff)
[linux-2.6.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation 
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *      Based on linux/net/ipv4/ip_output.c
11  *
12  *      This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *      Changes:
18  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
19  *                              extension headers are implemented.
20  *                              route changes now work.
21  *                              ip6_forward does not confuse sniffers.
22  *                              etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *      Imran Patel     :       frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *                      :       add ip6_append_data and related functions
28  *                              for datagram xmit
29  */
30
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57
58 static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**));
59
60 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
61 {
62         static u32 ipv6_fragmentation_id = 1;
63         static spinlock_t ip6_id_lock = SPIN_LOCK_UNLOCKED;
64
65         spin_lock_bh(&ip6_id_lock);
66         fhdr->identification = htonl(ipv6_fragmentation_id);
67         if (++ipv6_fragmentation_id == 0)
68                 ipv6_fragmentation_id = 1;
69         spin_unlock_bh(&ip6_id_lock);
70 }
71
/*
 * Final output step: attach the link-layer header and hand the packet
 * to the device, either via the cached hardware header or through the
 * neighbour subsystem.
 */
static inline int ip6_output_finish(struct sk_buff *skb)
{

	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

	if (hh) {
		int hh_alen;

		/* Copy the cached hardware header in front of the data.
		 * hh_data is kept HH_DATA_ALIGN'ed, hence the aligned
		 * length for the copy but the plain hh_len for the push. */
		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		/* No header cache yet: let the neighbour code resolve
		 * and build the link-layer header. */
		return dst->neighbour->output(skb);

	/* Neither a header cache nor a neighbour entry: count and drop. */
	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;

}
95
96 /* dev_loopback_xmit for use with netfilter. */
97 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
98 {
99         newskb->mac.raw = newskb->data;
100         __skb_pull(newskb, newskb->nh.raw - newskb->data);
101         newskb->pkt_type = PACKET_LOOPBACK;
102         newskb->ip_summed = CHECKSUM_UNNECESSARY;
103         BUG_TRAP(newskb->dst);
104
105         netif_rx(newskb);
106         return 0;
107 }
108
109
/*
 * Second-stage output: handle multicast loopback, then pass the packet
 * through the NF_IP6_POST_ROUTING hook towards ip6_output_finish().
 */
static int ip6_output2(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

		/* Loop a copy back to local listeners when the socket has
		 * multicast loopback enabled (or there is no socket) and
		 * this host is a member of the destination group. */
		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
				&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip6_dev_loopback_xmit);

			/* Hop limit 0: nothing leaves the host, the
			 * looped-back copy above is all there is. */
			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
}
147
148 int ip6_output(struct sk_buff **pskb)
149 {
150         struct sk_buff *skb = *pskb;
151
152         if ((skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list))
153                 return ip6_fragment(pskb, ip6_output2);
154         else
155                 return ip6_output2(pskb);
156 }
157
#ifdef CONFIG_NETFILTER
/*
 * Redo the routing decision for a packet whose addresses may have been
 * rewritten by netfilter.  On success the old skb->dst is released and
 * replaced by the freshly looked-up route; returns 0.  Returns -EINVAL
 * (and leaves skb->dst untouched) when no route is found.
 */
int ip6_route_me_harder(struct sk_buff *skb)
{
	struct ipv6hdr *iph = skb->nh.ipv6h;
	struct dst_entry *dst;
	/* Rebuild the flow from the (possibly mangled) header. */
	struct flowi fl = {
		.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
		.nl_u =
		{ .ip6_u =
		  { .daddr = iph->daddr,
		    .saddr = iph->saddr, } },
		.proto = iph->nexthdr,
	};

	dst = ip6_route_output(skb->sk, &fl);

	if (dst->error) {
		IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
		LIMIT_NETDEBUG(
			printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
		dst_release(dst);
		return -EINVAL;
	}

	/* Drop old route. */
	dst_release(skb->dst);

	skb->dst = dst;
	return 0;
}
#endif
189
/*
 * LOCAL_OUT okfn: if netfilter marked the packet as altered, re-route
 * it first; then pass it on to dst_output().  Frees the skb and
 * returns -EINVAL when re-routing fails.
 */
static inline int ip6_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
	if (skb->nfcache & NFC_ALTERED){
		if (ip6_route_me_harder(skb) != 0){
			kfree_skb(skb);
			return -EINVAL;
		}
	}
#endif /* CONFIG_NETFILTER */
	return dst_output(skb);
}
202
/*
 *	xmit an sk_buff (used by TCP)
 *
 *	Prepends the extension headers described by @opt (if any) and the
 *	fixed IPv6 header, then sends the packet through NF_IP6_LOCAL_OUT.
 *	If the result exceeds the path MTU and @ipfragok is not set, a
 *	PKT_TOOBIG error is delivered to ourselves and -EMSGSIZE returned.
 *	The skb is consumed in all cases.
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
        struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit;
        u32 mtu;

        if (opt) {
                int head_room;

                /* First: exthdrs may take lots of space (~8K for now)
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        /* The original skb is consumed whether or not
                         * the reallocation succeeded. */
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {      
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                return -ENOBUFS;
                        }
                        if (sk)
                                skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        /* May rewrite first_hop when a routing header
                         * is present. */
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

        /*
         *      Fill in the IPv6 header
         */

        /* Version 6 plus the caller's flow label. */
        *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
        /* Hop limit: socket setting when available, else route metric. */
        hlimit = -1;
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = dst_metric(dst, RTAX_HOPLIMIT);

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        mtu = dst_pmtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
                IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
                return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
        }

        /* Too big and fragmentation forbidden: tell ourselves so the
         * socket learns the path MTU. */
        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
280
281 /*
282  *      To avoid extra problems ND packets are send through this
283  *      routine. It's code duplication but I really want to avoid
284  *      extra checks since ipv6_build_header is used by TCP (which
285  *      is for us performance critical)
286  */
287
288 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
289                struct in6_addr *saddr, struct in6_addr *daddr,
290                int proto, int len)
291 {
292         struct ipv6_pinfo *np = inet6_sk(sk);
293         struct ipv6hdr *hdr;
294         int totlen;
295
296         skb->protocol = htons(ETH_P_IPV6);
297         skb->dev = dev;
298
299         totlen = len + sizeof(struct ipv6hdr);
300
301         hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
302         skb->nh.ipv6h = hdr;
303
304         *(u32*)hdr = htonl(0x60000000);
305
306         hdr->payload_len = htons(len);
307         hdr->nexthdr = proto;
308         hdr->hop_limit = np->hop_limit;
309
310         ipv6_addr_copy(&hdr->saddr, saddr);
311         ipv6_addr_copy(&hdr->daddr, daddr);
312
313         return 0;
314 }
315
/*
 * Deliver @skb to every raw socket registered (via IPV6_ROUTER_ALERT)
 * for router-alert value @sel.  Returns 1 when the skb was consumed by
 * at least one socket, 0 when no receiver matched (caller keeps skb).
 */
int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel) {
			/* Every receiver except the last gets a clone;
			 * the final one takes the original skb below. */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
342
/* FORWARD hook okfn: simply continue down the output path. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
347
/*
 * Forward a received packet towards its destination.  Applies the
 * checks required of an IPv6 router (forwarding enabled, xfrm policy,
 * hop limit, redirects, source scope, path MTU), then decrements the
 * hop limit and hands the packet to the NF_IP6_FORWARD hook.
 * Consumes @skb on every path.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);
	
	/* Not configured as a router: refuse to forward. */
	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* The final receiver re-verifies the checksum. */
	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without ane WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		/* opt->ra is the offset of the router-alert option;
		 * bytes 2-3 hold the alert value. */
		u8 *ptr = skb->nh.raw + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
						|IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_pmtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_pmtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Private copy needed before we may write the header. */
	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow may have relocated the header. */
	hdr = skb->nh.ipv6h;

	/* Mangling hops number delayed to point after skb COW */
 
	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
461
/*
 * Copy per-packet metadata from the original skb to a fragment so
 * every fragment is routed, scheduled and connection-tracked exactly
 * like the packet it was carved from.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	to->security = from->security;
	/* Each fragment holds its own reference to the route. */
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#ifdef CONFIG_NETFILTER_DEBUG
	to->nf_debug = from->nf_debug;
#endif
#endif
}
489
/*
 * Return the length of the unfragmentable part of the packet, i.e. the
 * offset at which a fragment header must be inserted.  Walks hop-by-hop,
 * routing and destination option headers; a destination options header
 * that follows a routing header already belongs to the fragmentable
 * part, so the walk stops there.  On return *nexthdr points at the
 * "next header" byte of the last unfragmentable header, so the caller
 * can patch it to NEXTHDR_FRAGMENT.
 */
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
	unsigned int packet_len = skb->tail - skb->nh.raw;
	int found_rhdr = 0;
	*nexthdr = &skb->nh.ipv6h->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
		case NEXTHDR_ROUTING:
		case NEXTHDR_DEST:
			if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
			if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
			offset += ipv6_optlen(exthdr);
			*nexthdr = &exthdr->nexthdr;
			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
			break;
		default :
			/* Anything else (TCP, UDP, ICMPv6, AH, ESP, ...)
			 * starts the fragmentable part. */
			return offset;
		}
	}

	return offset;
}
518
519 static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
520 {
521         struct net_device *dev;
522         struct sk_buff *frag, *skb = *pskb;
523         struct rt6_info *rt = (struct rt6_info*)skb->dst;
524         struct ipv6hdr *tmp_hdr;
525         struct frag_hdr *fh;
526         unsigned int mtu, hlen, left, len;
527         u32 frag_id = 0;
528         int ptr, offset = 0, err=0;
529         u8 *prevhdr, nexthdr = 0;
530
531         dev = rt->u.dst.dev;
532         hlen = ip6_find_1stfragopt(skb, &prevhdr);
533         nexthdr = *prevhdr;
534
535         mtu = dst_pmtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
536
537         if (skb_shinfo(skb)->frag_list) {
538                 int first_len = skb_pagelen(skb);
539
540                 if (first_len - hlen > mtu ||
541                     ((first_len - hlen) & 7) ||
542                     skb_cloned(skb))
543                         goto slow_path;
544
545                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
546                         /* Correct geometry. */
547                         if (frag->len > mtu ||
548                             ((frag->len & 7) && frag->next) ||
549                             skb_headroom(frag) < hlen)
550                             goto slow_path;
551
552                         /* Correct socket ownership. */
553                         if (frag->sk == NULL)
554                                 goto slow_path;
555
556                         /* Partially cloned skb? */
557                         if (skb_shared(frag))
558                                 goto slow_path;
559                 }
560
561                 err = 0;
562                 offset = 0;
563                 frag = skb_shinfo(skb)->frag_list;
564                 skb_shinfo(skb)->frag_list = NULL;
565                 /* BUILD HEADER */
566
567                 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
568                 if (!tmp_hdr) {
569                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
570                         return -ENOMEM;
571                 }
572
573                 *prevhdr = NEXTHDR_FRAGMENT;
574                 memcpy(tmp_hdr, skb->nh.raw, hlen);
575                 __skb_pull(skb, hlen);
576                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
577                 skb->nh.raw = __skb_push(skb, hlen);
578                 memcpy(skb->nh.raw, tmp_hdr, hlen);
579
580                 ipv6_select_ident(skb, fh);
581                 fh->nexthdr = nexthdr;
582                 fh->reserved = 0;
583                 fh->frag_off = htons(IP6_MF);
584                 frag_id = fh->identification;
585
586                 first_len = skb_pagelen(skb);
587                 skb->data_len = first_len - skb_headlen(skb);
588                 skb->len = first_len;
589                 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
590  
591
592                 for (;;) {
593                         /* Prepare header of the next frame,
594                          * before previous one went down. */
595                         if (frag) {
596                                 frag->h.raw = frag->data;
597                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
598                                 frag->nh.raw = __skb_push(frag, hlen);
599                                 memcpy(frag->nh.raw, tmp_hdr, hlen);
600                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
601                                 fh->nexthdr = nexthdr;
602                                 fh->reserved = 0;
603                                 fh->frag_off = htons(offset);
604                                 if (frag->next != NULL)
605                                         fh->frag_off |= htons(IP6_MF);
606                                 fh->identification = frag_id;
607                                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
608                                 ip6_copy_metadata(frag, skb);
609                         }
610                         
611                         err = output(&skb);
612                         if (err || !frag)
613                                 break;
614
615                         skb = frag;
616                         frag = skb->next;
617                         skb->next = NULL;
618                 }
619
620                 if (tmp_hdr)
621                         kfree(tmp_hdr);
622
623                 if (err == 0) {
624                         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
625                         return 0;
626                 }
627
628                 while (frag) {
629                         skb = frag->next;
630                         kfree_skb(frag);
631                         frag = skb;
632                 }
633
634                 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
635                 return err;
636         }
637
638 slow_path:
639         left = skb->len - hlen;         /* Space per frame */
640         ptr = hlen;                     /* Where to start from */
641
642         /*
643          *      Fragment the datagram.
644          */
645
646         *prevhdr = NEXTHDR_FRAGMENT;
647
648         /*
649          *      Keep copying data until we run out.
650          */
651         while(left > 0) {
652                 len = left;
653                 /* IF: it doesn't fit, use 'mtu' - the data space left */
654                 if (len > mtu)
655                         len = mtu;
656                 /* IF: we are not sending upto and including the packet end
657                    then align the next start on an eight byte boundary */
658                 if (len < left) {
659                         len &= ~7;
660                 }
661                 /*
662                  *      Allocate buffer.
663                  */
664
665                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
666                         NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
667                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
668                         err = -ENOMEM;
669                         goto fail;
670                 }
671
672                 /*
673                  *      Set up data on packet
674                  */
675
676                 ip6_copy_metadata(frag, skb);
677                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
678                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
679                 frag->nh.raw = frag->data;
680                 fh = (struct frag_hdr*)(frag->data + hlen);
681                 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
682
683                 /*
684                  *      Charge the memory for the fragment to any owner
685                  *      it might possess
686                  */
687                 if (skb->sk)
688                         skb_set_owner_w(frag, skb->sk);
689
690                 /*
691                  *      Copy the packet header into the new buffer.
692                  */
693                 memcpy(frag->nh.raw, skb->data, hlen);
694
695                 /*
696                  *      Build fragment header.
697                  */
698                 fh->nexthdr = nexthdr;
699                 fh->reserved = 0;
700                 if (frag_id) {
701                         ipv6_select_ident(skb, fh);
702                         frag_id = fh->identification;
703                 } else
704                         fh->identification = frag_id;
705
706                 /*
707                  *      Copy a block of the IP datagram.
708                  */
709                 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
710                         BUG();
711                 left -= len;
712
713                 fh->frag_off = htons(offset);
714                 if (left > 0)
715                         fh->frag_off |= htons(IP6_MF);
716                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
717
718                 ptr += len;
719                 offset += len;
720
721                 /*
722                  *      Put this fragment into the sending queue.
723                  */
724
725                 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
726
727                 err = output(&frag);
728                 if (err)
729                         goto fail;
730         }
731         kfree_skb(skb);
732         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
733         return err;
734
735 fail:
736         kfree_skb(skb); 
737         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
738         return err;
739 }
740
/*
 * Resolve a destination entry for flow @fl.  A route cached on @sk is
 * reused when it is still valid for this flow; otherwise a fresh
 * routing lookup is performed.  When fl->fl6_src is unspecified a
 * source address is selected, and the result is run through xfrm
 * policy lookup.  On success *dst holds a referenced entry and 0 is
 * returned; on failure *dst is NULL and a negative errno is returned.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);
	
		*dst = __sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info*)*dst;
	
				/* Yes, checking route validity in not connected
				   case is not very simple. Take into account,
				   that we do not support routing by source, TOS,
				   and MSG_DONTROUTE		--ANK (980726)
	
				   1. If route was host route, check that
				      cached destination is current.
				      If it is network route, we still may
				      check its validity using saved pointer
				      to the last used address: daddr_cache.
				      We do not want to save whole address now,
				      (because main consumer of this service
				       is tcp, which has not this problem),
				      so that the last trick works only on connected
				      sockets.
				   2. oif also should be the same.
				 */
	
			if (((rt->rt6i_dst.plen != 128 ||
			      ipv6_addr_cmp(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 ipv6_addr_cmp(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				/* Cached route does not match this flow;
				 * fall through to a fresh lookup. */
				*dst = NULL;
			} else
				dst_hold(*dst);
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

		if (err) {
#if IP6_DEBUG >= 2
			printk(KERN_DEBUG "ip6_dst_lookup: "
			       "no available source address\n");
#endif
			goto out_err_release;
		}
	}
	if ((err = xfrm_lookup(dst, fl, sk, 0)) < 0) {
		err = -ENETUNREACH;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}
811
/*
 *	ip6_append_data - append user data to the socket's pending (corked)
 *	output queue, building one or more path-MTU sized skbs that are later
 *	merged and transmitted by ip6_push_pending_frames().
 *
 *	@sk:		socket owning the pending write queue
 *	@getfrag:	copy callback filling skb data from @from; returns <0
 *			on fault (e.g. bad user pointer)
 *	@from:		opaque cookie handed to @getfrag
 *	@length:	number of payload bytes to append on this call
 *	@transhdrlen:	transport header length; non-zero only on the first
 *			call for a datagram (later calls pass 0)
 *	@hlimit:	hop limit for the outgoing packets
 *	@opt:		IPv6 extension header options, may be NULL
 *	@fl:		flow (addresses/ports) for this datagram
 *	@rt:		route to use; cached in cork state on the first call
 *	@flags:		MSG_* flags (MSG_MORE, MSG_DONTWAIT, MSG_PROBE)
 *
 *	Returns 0 on success or a negative errno; on error the bytes already
 *	accounted in cork.length are backed out and OutDiscards is bumped.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
		    unsigned int flags)
{
	struct inet_opt *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy = 0;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	/* MSG_PROBE: caller only wants path-MTU discovery side effects,
	 * nothing is queued or sent. */
	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		/* First fragment of a datagram: snapshot options, route,
		 * flow, hop limit and path MTU into the cork state so that
		 * follow-up calls (and the final push) reuse them. */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				/* A previously cached buffer is too small to
				 * hold the new options; refuse rather than
				 * overflow it. */
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa*/
		}
		dst_hold(&rt->u.dst);
		np->cork.rt = rt;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		/* Extension headers (and any dst-provided header space) are
		 * counted as part of the first fragment's data. */
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* Subsequent call: everything comes from the cached cork
		 * state, the caller-supplied rt/fl/opt are ignored. */
		rt = np->cork.rt;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	/* Unfragmentable part: IPv6 header plus per-fragment extension
	 * headers.  maxfraglen is the largest skb->len for one fragment:
	 * fragmentable payload rounded down to a multiple of 8 (fragment
	 * offsets are in 8-byte units), minus room kept for the fragment
	 * header that ip6_fragment() may need to insert. */
	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	/* Without jumbograms the total payload may not exceed 64K. */
	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	inet->cork.length += length;

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* copy = free room left in the current tail skb; when it is
		 * full (<= 0) start a fresh fragment. */
		if ((copy = maxfraglen - skb->len) <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int alloclen;
			BUG_TRAP(copy == 0);
alloc_new_skb:
			datalen = maxfraglen - fragheaderlen;
			if (datalen > length)
				datalen = length;
			fraglen = datalen + fragheaderlen;
			/* With MSG_MORE on a non-SG device allocate a full
			 * fragment so later appends can fill it linearly. */
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = maxfraglen;
			else
				alloclen = fraglen;
			alloclen += sizeof(struct frag_hdr);
			if (transhdrlen) {
				/* First skb of the datagram: may sleep
				 * (subject to MSG_DONTWAIT) for send buffer
				 * space. */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* Follow-up skbs: force-allocate, but only
				 * while total wmem stays within 2*sndbuf. */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve 8 byte for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;
			/* Payload bytes to copy now; the transport header
			 * area (transhdrlen) is left for the caller. */
			copy = datalen - transhdrlen;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen;
			/* Only the first skb carries transport/extension
			 * headers or starts with hardware checksum state. */
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			/* Non-SG device: append into the skb's linear area;
			 * trim back on a copy fault. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* SG device: append into page fragments, reusing the
			 * socket's cached partially-filled page when it both
			 * has room and is already the last frag's page. */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				/* Cached page exhausted (or none): start a
				 * fresh page and account its memory to the
				 * socket. */
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	/* Undo the optimistic accounting for the bytes we failed to queue. */
	inet->cork.length -= length;
	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1025
/*
 *	ip6_push_pending_frames - combine all skbs queued by
 *	ip6_append_data() into one packet, prepend extension headers and the
 *	IPv6 header, and hand it to netfilter/dst_output for transmission.
 *
 *	Always tears down the cork state (options, route, flow) on return,
 *	whether or not transmission succeeded.
 *
 *	Returns 0 on success (or when nothing was pending), otherwise a
 *	negative errno from the output path.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_opt *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = np->cork.rt;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	/* Chain every remaining queued skb onto the head skb's frag_list,
	 * stripping their per-fragment header room and folding their length
	 * into the head skb's accounting. */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
#endif
	}

	/* Routing-header options may rewrite the destination; keep a copy of
	 * the final destination for the IPv6 header. */
	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb->h.raw - skb->nh.raw);
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
	
	/* Version 6 plus the flow label in one 32-bit store (traffic class
	 * left at zero). */
	*(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);

	/* payload_len of 0 marks a jumbogram (RFC 2675) when the payload
	 * exceeds 64K. */
	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
	else
		hdr->payload_len = 0;
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 
	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
	if (err) {
		/* Positive return is a qdisc verdict; translate it only when
		 * the socket asked for error reporting. */
		if (err > 0)
			err = inet->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	/* Tear down cork state on both success and error paths. */
	inet->cork.flags &= ~IPCORK_OPT;
	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
	return err;
error:
	goto out;
}
1106
1107 void ip6_flush_pending_frames(struct sock *sk)
1108 {
1109         struct inet_opt *inet = inet_sk(sk);
1110         struct ipv6_pinfo *np = inet6_sk(sk);
1111         struct sk_buff *skb;
1112
1113         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1114                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1115                 kfree_skb(skb);
1116         }
1117
1118         inet->cork.flags &= ~IPCORK_OPT;
1119
1120         if (np->cork.opt) {
1121                 kfree(np->cork.opt);
1122                 np->cork.opt = NULL;
1123         }
1124         if (np->cork.rt) {
1125                 dst_release(&np->cork.rt->u.dst);
1126                 np->cork.rt = NULL;
1127         }
1128         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1129 }