vserver 1.9.5.x5
[linux-2.6.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation 
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *      Based on linux/net/ipv4/ip_output.c
11  *
12  *      This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
19  *                              extension headers are implemented.
20  *                              route changes now work.
21  *                              ip6_forward does not confuse sniffers.
22  *                              etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *      Imran Patel     :       frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *                      :       add ip6_append_data and related functions
28  *                              for datagram xmit
29  */
30
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 {
63         static u32 ipv6_fragmentation_id = 1;
64         static DEFINE_SPINLOCK(ip6_id_lock);
65
66         spin_lock_bh(&ip6_id_lock);
67         fhdr->identification = htonl(ipv6_fragmentation_id);
68         if (++ipv6_fragmentation_id == 0)
69                 ipv6_fragmentation_id = 1;
70         spin_unlock_bh(&ip6_id_lock);
71 }
72
73 static inline int ip6_output_finish(struct sk_buff *skb)
74 {
75
76         struct dst_entry *dst = skb->dst;
77         struct hh_cache *hh = dst->hh;
78
79         if (hh) {
80                 int hh_alen;
81
82                 read_lock_bh(&hh->hh_lock);
83                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
84                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85                 read_unlock_bh(&hh->hh_lock);
86                 skb_push(skb, hh->hh_len);
87                 return hh->hh_output(skb);
88         } else if (dst->neighbour)
89                 return dst->neighbour->output(skb);
90
91         IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92         kfree_skb(skb);
93         return -EINVAL;
94
95 }
96
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100         newskb->mac.raw = newskb->data;
101         __skb_pull(newskb, newskb->nh.raw - newskb->data);
102         newskb->pkt_type = PACKET_LOOPBACK;
103         newskb->ip_summed = CHECKSUM_UNNECESSARY;
104         BUG_TRAP(newskb->dst);
105
106         netif_rx(newskb);
107         return 0;
108 }
109
110
/*
 * Second-stage output: set L2 metadata and run the packet through the
 * NF_IP6_POST_ROUTING hook to ip6_output_finish().
 *
 * For multicast destinations a copy is looped back to the local stack
 * when this host is a member of the group (unless the sending socket
 * cleared mc_loop), and a packet whose hop limit is already 0 is only
 * valid for that local copy and is never put on the wire.
 */
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

		/* Loop a copy back locally when we are a member of the
		 * destination group on this (non-loopback) device and the
		 * sending socket has not disabled mc_loop. */
		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
				&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip6_dev_loopback_xmit);

			/* Hop limit 0 is only meaningful for the looped-back
			 * copy; drop the on-wire original. */
			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
}
147
148 int ip6_output(struct sk_buff *skb)
149 {
150         if (skb->len > dst_pmtu(skb->dst))
151                 return ip6_fragment(skb, ip6_output2);
152         else
153                 return ip6_output2(skb);
154 }
155
156 #ifdef CONFIG_NETFILTER
/*
 * Re-route a packet after netfilter may have rewritten its addresses:
 * build a flow from the current header, look up a fresh route, and swap
 * it in for the skb's cached dst.
 *
 * Returns 0 on success; -EINVAL when no route exists (the skb is left
 * untouched for the caller to free).
 */
int ip6_route_me_harder(struct sk_buff *skb)
{
	struct ipv6hdr *iph = skb->nh.ipv6h;
	struct dst_entry *dst;
	struct flowi fl = {
		.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
		.nl_u =
		{ .ip6_u =
		  { .daddr = iph->daddr,
		    .saddr = iph->saddr, } },
		.proto = iph->nexthdr,
	};

	dst = ip6_route_output(skb->sk, &fl);

	if (dst->error) {
		IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
		LIMIT_NETDEBUG(
			printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
		dst_release(dst);
		return -EINVAL;
	}

	/* Drop old route. */
	dst_release(skb->dst);

	skb->dst = dst;
	return 0;
}
186 #endif
187
/*
 * Re-check the route if netfilter altered the packet, then hand it to
 * dst_output().  Frees the skb and returns -EINVAL when no route can be
 * found for the rewritten headers.
 */
static inline int ip6_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
	/* A mangled header may have invalidated the cached route. */
	if (skb->nfcache & NFC_ALTERED) {
		int err = ip6_route_me_harder(skb);

		if (err != 0) {
			kfree_skb(skb);
			return -EINVAL;
		}
	}
#endif /* CONFIG_NETFILTER */
	return dst_output(skb);
}
200
201 /*
202  *      xmit an sk_buff (used by TCP)
203  */
204
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit;
	u32 mtu;

	if (opt) {
		int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			/* The original skb is consumed either way; we keep
			 * going with the reallocated copy (if any). */
			kfree_skb(skb);
			skb = skb2;
			if (skb == NULL) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				return -ENOBUFS;
			}
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		/* Push fragmentable options first, then the unfragmentable
		 * part (which may rewrite the first hop for routing hdrs). */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

	/*
	 *	Fill in the IPv6 header
	 */

	/* Version 6 in the top nibble plus the caller's flow label. */
	*(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
	/* Hop limit: socket preference when set, else the route metric. */
	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	mtu = dst_pmtu(dst);
	if ((skb->len <= mtu) || ipfragok) {
		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
	}

	/* Too big and the caller forbids fragmentation: report
	 * PKT_TOOBIG back to ourselves and drop the packet. */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
278
279 /*
280  *      To avoid extra problems ND packets are send through this
281  *      routine. It's code duplication but I really want to avoid
282  *      extra checks since ipv6_build_header is used by TCP (which
283  *      is for us performance critical)
284  */
285
286 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
287                struct in6_addr *saddr, struct in6_addr *daddr,
288                int proto, int len)
289 {
290         struct ipv6_pinfo *np = inet6_sk(sk);
291         struct ipv6hdr *hdr;
292         int totlen;
293
294         skb->protocol = htons(ETH_P_IPV6);
295         skb->dev = dev;
296
297         totlen = len + sizeof(struct ipv6hdr);
298
299         hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
300         skb->nh.ipv6h = hdr;
301
302         *(u32*)hdr = htonl(0x60000000);
303
304         hdr->payload_len = htons(len);
305         hdr->nexthdr = proto;
306         hdr->hop_limit = np->hop_limit;
307
308         ipv6_addr_copy(&hdr->saddr, saddr);
309         ipv6_addr_copy(&hdr->daddr, daddr);
310
311         return 0;
312 }
313
/*
 * Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain with a matching selector.  Every matching listener but
 * the last receives its own clone; the last one consumes @skb itself.
 *
 * Returns 1 when at least one socket took the packet (caller must not
 * touch @skb again), 0 when nobody matched.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel) {
			/* Hand a clone to the previously found socket; the
			 * original is kept back for the final listener. */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
340
/* Final forwarding step, run after the NF_IP6_FORWARD hook: hand the
 * packet to its route's output function. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
345
/*
 * Forward an IPv6 packet: policy checks, Router Alert interception,
 * hop-limit handling, redirect generation, PMTU enforcement, and
 * finally the NF_IP6_FORWARD hook into ip6_forward_finish().
 * Consumes @skb on every path.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);
	
	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Any hardware checksum is no longer valid once we forward. */
	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb->nh.raw + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
						|IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_pmtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_pmtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow() may have copied the data; reload the header pointer. */
	hdr = skb->nh.ipv6h;

	/* Mangling hops number delayed to point after skb COW */
 
	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
459
/*
 * Copy per-packet metadata (packet type, priority, protocol, route,
 * traffic-control and netfilter state) from @from to @to.  Used while
 * fragmenting so every fragment carries the original's bookkeeping.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	to->security = from->security;
	/* Drop whatever dst the new skb had before taking a reference
	 * on the original's route. */
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#ifdef CONFIG_NETFILTER_DEBUG
	to->nf_debug = from->nf_debug;
#endif
#endif
}
489
490 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
491 {
492         u16 offset = sizeof(struct ipv6hdr);
493         struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
494         unsigned int packet_len = skb->tail - skb->nh.raw;
495         int found_rhdr = 0;
496         *nexthdr = &skb->nh.ipv6h->nexthdr;
497
498         while (offset + 1 <= packet_len) {
499
500                 switch (**nexthdr) {
501
502                 case NEXTHDR_HOP:
503                 case NEXTHDR_ROUTING:
504                 case NEXTHDR_DEST:
505                         if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
506                         if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
507                         offset += ipv6_optlen(exthdr);
508                         *nexthdr = &exthdr->nexthdr;
509                         exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
510                         break;
511                 default :
512                         return offset;
513                 }
514         }
515
516         return offset;
517 }
518
519 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
520 {
521         struct net_device *dev;
522         struct sk_buff *frag;
523         struct rt6_info *rt = (struct rt6_info*)skb->dst;
524         struct ipv6hdr *tmp_hdr;
525         struct frag_hdr *fh;
526         unsigned int mtu, hlen, left, len;
527         u32 frag_id = 0;
528         int ptr, offset = 0, err=0;
529         u8 *prevhdr, nexthdr = 0;
530
531         dev = rt->u.dst.dev;
532         hlen = ip6_find_1stfragopt(skb, &prevhdr);
533         nexthdr = *prevhdr;
534
535         mtu = dst_pmtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
536
537         if (skb_shinfo(skb)->frag_list) {
538                 int first_len = skb_pagelen(skb);
539
540                 if (first_len - hlen > mtu ||
541                     ((first_len - hlen) & 7) ||
542                     skb_cloned(skb))
543                         goto slow_path;
544
545                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
546                         /* Correct geometry. */
547                         if (frag->len > mtu ||
548                             ((frag->len & 7) && frag->next) ||
549                             skb_headroom(frag) < hlen)
550                             goto slow_path;
551
552                         /* Correct socket ownership. */
553                         if (frag->sk == NULL)
554                                 goto slow_path;
555
556                         /* Partially cloned skb? */
557                         if (skb_shared(frag))
558                                 goto slow_path;
559                 }
560
561                 err = 0;
562                 offset = 0;
563                 frag = skb_shinfo(skb)->frag_list;
564                 skb_shinfo(skb)->frag_list = NULL;
565                 /* BUILD HEADER */
566
567                 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
568                 if (!tmp_hdr) {
569                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
570                         return -ENOMEM;
571                 }
572
573                 *prevhdr = NEXTHDR_FRAGMENT;
574                 memcpy(tmp_hdr, skb->nh.raw, hlen);
575                 __skb_pull(skb, hlen);
576                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
577                 skb->nh.raw = __skb_push(skb, hlen);
578                 memcpy(skb->nh.raw, tmp_hdr, hlen);
579
580                 ipv6_select_ident(skb, fh);
581                 fh->nexthdr = nexthdr;
582                 fh->reserved = 0;
583                 fh->frag_off = htons(IP6_MF);
584                 frag_id = fh->identification;
585
586                 first_len = skb_pagelen(skb);
587                 skb->data_len = first_len - skb_headlen(skb);
588                 skb->len = first_len;
589                 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
590  
591
592                 for (;;) {
593                         /* Prepare header of the next frame,
594                          * before previous one went down. */
595                         if (frag) {
596                                 frag->ip_summed = CHECKSUM_NONE;
597                                 frag->h.raw = frag->data;
598                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
599                                 frag->nh.raw = __skb_push(frag, hlen);
600                                 memcpy(frag->nh.raw, tmp_hdr, hlen);
601                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
602                                 fh->nexthdr = nexthdr;
603                                 fh->reserved = 0;
604                                 fh->frag_off = htons(offset);
605                                 if (frag->next != NULL)
606                                         fh->frag_off |= htons(IP6_MF);
607                                 fh->identification = frag_id;
608                                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
609                                 ip6_copy_metadata(frag, skb);
610                         }
611                         
612                         err = output(skb);
613                         if (err || !frag)
614                                 break;
615
616                         skb = frag;
617                         frag = skb->next;
618                         skb->next = NULL;
619                 }
620
621                 if (tmp_hdr)
622                         kfree(tmp_hdr);
623
624                 if (err == 0) {
625                         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
626                         return 0;
627                 }
628
629                 while (frag) {
630                         skb = frag->next;
631                         kfree_skb(frag);
632                         frag = skb;
633                 }
634
635                 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
636                 return err;
637         }
638
639 slow_path:
640         left = skb->len - hlen;         /* Space per frame */
641         ptr = hlen;                     /* Where to start from */
642
643         /*
644          *      Fragment the datagram.
645          */
646
647         *prevhdr = NEXTHDR_FRAGMENT;
648
649         /*
650          *      Keep copying data until we run out.
651          */
652         while(left > 0) {
653                 len = left;
654                 /* IF: it doesn't fit, use 'mtu' - the data space left */
655                 if (len > mtu)
656                         len = mtu;
657                 /* IF: we are not sending upto and including the packet end
658                    then align the next start on an eight byte boundary */
659                 if (len < left) {
660                         len &= ~7;
661                 }
662                 /*
663                  *      Allocate buffer.
664                  */
665
666                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
667                         NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
668                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
669                         err = -ENOMEM;
670                         goto fail;
671                 }
672
673                 /*
674                  *      Set up data on packet
675                  */
676
677                 ip6_copy_metadata(frag, skb);
678                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
679                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
680                 frag->nh.raw = frag->data;
681                 fh = (struct frag_hdr*)(frag->data + hlen);
682                 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
683
684                 /*
685                  *      Charge the memory for the fragment to any owner
686                  *      it might possess
687                  */
688                 if (skb->sk)
689                         skb_set_owner_w(frag, skb->sk);
690
691                 /*
692                  *      Copy the packet header into the new buffer.
693                  */
694                 memcpy(frag->nh.raw, skb->data, hlen);
695
696                 /*
697                  *      Build fragment header.
698                  */
699                 fh->nexthdr = nexthdr;
700                 fh->reserved = 0;
701                 if (frag_id) {
702                         ipv6_select_ident(skb, fh);
703                         frag_id = fh->identification;
704                 } else
705                         fh->identification = frag_id;
706
707                 /*
708                  *      Copy a block of the IP datagram.
709                  */
710                 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
711                         BUG();
712                 left -= len;
713
714                 fh->frag_off = htons(offset);
715                 if (left > 0)
716                         fh->frag_off |= htons(IP6_MF);
717                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
718
719                 ptr += len;
720                 offset += len;
721
722                 /*
723                  *      Put this fragment into the sending queue.
724                  */
725
726                 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
727
728                 err = output(frag);
729                 if (err)
730                         goto fail;
731         }
732         kfree_skb(skb);
733         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
734         return err;
735
736 fail:
737         kfree_skb(skb); 
738         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
739         return err;
740 }
741
/*
 * Resolve the destination entry for flow @fl, preferring the socket's
 * cached route when it is still valid for this flow.  On success *dst
 * holds a referenced dst_entry and fl->fl6_src has been filled in if it
 * was unspecified; on failure *dst is NULL and a negative errno is
 * returned.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);
	
		*dst = sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info*)*dst;
	
				/* Yes, checking route validity in not connected
				   case is not very simple. Take into account,
				   that we do not support routing by source, TOS,
				   and MSG_DONTROUTE		--ANK (980726)
	
				   1. If route was host route, check that
				      cached destination is current.
				      If it is network route, we still may
				      check its validity using saved pointer
				      to the last used address: daddr_cache.
				      We do not want to save whole address now,
				      (because main consumer of this service
				       is tcp, which has not this problem),
				      so that the last trick works only on connected
				      sockets.
				   2. oif also should be the same.
				 */
	
			/* Invalidate the cached route when it no longer
			 * matches this flow's destination or output device. */
			if (((rt->rt6i_dst.plen != 128 ||
			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				dst_release(*dst);
				*dst = NULL;
			}
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	/* Pick a source address for wildcard sources. */
	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

		if (err) {
#if IP6_DEBUG >= 2
			printk(KERN_DEBUG "ip6_dst_lookup: "
			       "no available source address\n");
#endif
			goto out_err_release;
		}
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}
808
/*
 *	ip6_append_data - queue user data on a corked socket for later
 *	transmission by ip6_push_pending_frames().
 *
 *	@getfrag copies user data: getfrag(from, to, offset, len, odd, skb).
 *	@from:        opaque cookie passed through to @getfrag
 *	@length:      number of bytes to append on this call
 *	@transhdrlen: transport header length; non-zero only on the first
 *	              call for a packet (it is zeroed once consumed below)
 *	@hlimit:      hop limit for the outgoing packet
 *	@opt:         IPv6 extension-header options, or NULL
 *	@fl:          flow describing source/destination
 *	@rt:          route the packet will take
 *	@flags:       MSG_* flags (MSG_PROBE, MSG_MORE, MSG_DONTWAIT)
 *
 *	The data is split across sk_buffs on sk->sk_write_queue so that
 *	each queued skb can later be sent as one on-the-wire fragment.
 *	Returns 0 or a negative errno (-ENOBUFS, -EINVAL, -EMSGSIZE,
 *	-EFAULT, -ENOMEM); on error the un-queued remainder is subtracted
 *	from the corked byte count again.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
		    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;		/* running offset into the caller's data */
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)	/* probe only: queue nothing */
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking: the first append captures the
		 * options, route, flow, hop limit and mtu; subsequent
		 * calls reuse them (see the else branch below).
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				/* an existing cork buffer may be reused
				 * but never grown */
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa*/
		}
		dst_hold(&rt->u.dst);	/* cork state holds a route ref */
		np->cork.rt = rt;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		/* the first skb also carries the extension headers */
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* already corked: ignore the caller's rt/fl/opt and use
		 * the saved state; headers were accounted for earlier */
		rt = np->cork.rt;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
	/* largest skb length whose fragmentable payload is a multiple of
	 * 8 bytes while still leaving room for a fragment header */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			/* total payload would overflow the 16-bit
			 * payload_len field */
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split 
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks 
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji 
	 */

	inet->cork.length += length;

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb; any bytes in
			 * skb_prev beyond maxfraglen must migrate into the
			 * new skb to keep fragment payloads 8-byte aligned */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message 
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				/* first skb of a packet: may block (subject
				 * to MSG_DONTWAIT) for send-buffer space */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* follow-up skbs: non-blocking, allowed to
				 * overshoot sndbuf up to a factor of two */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				/* pull the misaligned tail of the previous
				 * skb into this one, fixing up both csums */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* the header space is consumed by the first skb only */
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			/* no scatter/gather: copy into the linear area */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* scatter/gather device: append into page frags,
			 * reusing the socket's current partially-filled
			 * page (sk_sndmsg_page) when possible */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	/* undo the cork byte-count bump for the part we failed to queue */
	inet->cork.length -= length;
	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1086
/*
 *	ip6_push_pending_frames - transmit everything queued by
 *	ip6_append_data().
 *
 *	Collapses the socket's write queue into one skb chain (follow-up
 *	skbs become the frag_list of the first), pushes the extension
 *	headers and the IPv6 header, and hands the result to the
 *	NF_IP6_LOCAL_OUT hook / dst_output.  The cork state (options,
 *	route reference, flow) is always released, even on error.
 *
 *	Returns 0 or a negative errno; soft transmit errors (err > 0 from
 *	NF_HOOK) are only surfaced when the socket has recverr set.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = np->cork.rt;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;	/* nothing pending: just clear cork state */
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	/* chain the remaining skbs onto the first one's frag_list,
	 * stripping the per-fragment header space from each */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
#endif
	}

	/* keep a private copy of the destination: ipv6_push_nfrag_opts()
	 * is handed &final_dst and may rewrite it (e.g. routing header) */
	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb->h.raw - skb->nh.raw);
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
	
	/* first 32-bit word: version 6 plus the caller's flow label */
	*(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);

	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
	else
		hdr->payload_len = 0;	/* payload too big for 16 bits */
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 
	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
	if (err) {
		/* err > 0 is a soft (congestion-style) verdict; report
		 * it only if the socket asked for error queueing */
		if (err > 0)
			err = inet->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	/* tear down the cork state on both the success and error paths */
	inet->cork.flags &= ~IPCORK_OPT;
	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
	return err;
error:
	goto out;
}
1167
1168 void ip6_flush_pending_frames(struct sock *sk)
1169 {
1170         struct inet_sock *inet = inet_sk(sk);
1171         struct ipv6_pinfo *np = inet6_sk(sk);
1172         struct sk_buff *skb;
1173
1174         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1175                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1176                 kfree_skb(skb);
1177         }
1178
1179         inet->cork.flags &= ~IPCORK_OPT;
1180
1181         if (np->cork.opt) {
1182                 kfree(np->cork.opt);
1183                 np->cork.opt = NULL;
1184         }
1185         if (np->cork.rt) {
1186                 dst_release(&np->cork.rt->u.dst);
1187                 np->cork.rt = NULL;
1188         }
1189         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1190 }