/* linux-2.6.6: net/ipv6/ip6_output.c */
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*));

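/*
 *      Select the fragment identification for a packet.  A single
 *      global counter, serialized by ip6_id_lock, is shared by all
 *      flows; the value is stored in network byte order and zero is
 *      skipped, so a frag_id of 0 can mean "not yet chosen" in
 *      ip6_fragment() below.
 */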
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
        static u32 ipv6_fragmentation_id = 1;
        static spinlock_t ip6_id_lock = SPIN_LOCK_UNLOCKED;

        spin_lock_bh(&ip6_id_lock);
        fhdr->identification = htonl(ipv6_fragmentation_id);
        if (++ipv6_fragmentation_id == 0)
                ipv6_fragmentation_id = 1;
        spin_unlock_bh(&ip6_id_lock);
}

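/*
 *      Last step of output: attach the link-layer header.  Use the
 *      cached hardware header (dst->hh) when one exists, otherwise go
 *      through the neighbour's output function; with neither available
 *      the packet is dropped.
 */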
static inline int ip6_output_finish(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;

        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        IP6_INC_STATS_BH(Ip6OutNoRoutes);
        kfree_skb(skb);
        return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);

        netif_rx(newskb);
        return 0;
}

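/*
 *      Push one packet out through the POST_ROUTING hook.  For a
 *      multicast destination that we are a member of, a clone is first
 *      looped back locally (unless the sending socket cleared mc_loop);
 *      a multicast packet with hop_limit 0 goes no further than that
 *      local copy.
 */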
int ip6_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
                struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

                if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
                    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
                                &skb->nh.ipv6h->saddr)) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip6_dev_loopback_xmit);

                        if (skb->nh.ipv6h->hop_limit == 0) {
                                IP6_INC_STATS(Ip6OutDiscards);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_INC_STATS(Ip6OutMcastPkts);
        }

        return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev, ip6_output_finish);
}

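/*
 *      Output entry point: fragment when the packet exceeds the path
 *      MTU or already carries a frag_list, otherwise send it directly
 *      via ip6_output2().
 */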
int ip6_output(struct sk_buff *skb)
{
        if (skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list)
                return ip6_fragment(skb, ip6_output2);
        else
                return ip6_output2(skb);
}

#ifdef CONFIG_NETFILTER
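/*
 *      Re-do the route lookup after a netfilter hook has altered the
 *      packet: the attached route may no longer match the (possibly
 *      rewritten) addresses, so look it up again and swap it in.
 */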
int ip6_route_me_harder(struct sk_buff *skb)
{
        struct ipv6hdr *iph = skb->nh.ipv6h;
        struct dst_entry *dst;
        struct flowi fl = {
                .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
                .nl_u =
                { .ip6_u =
                  { .daddr = iph->daddr,
                    .saddr = iph->saddr, } },
                .proto = iph->nexthdr,
        };

        dst = ip6_route_output(skb->sk, &fl);

        if (dst->error) {
                IP6_INC_STATS(Ip6OutNoRoutes);
                LIMIT_NETDEBUG(
                        printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
                dst_release(dst);
                return -EINVAL;
        }

        /* Drop old route. */
        dst_release(skb->dst);

        skb->dst = dst;
        return 0;
}
#endif

static inline int ip6_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
        if (skb->nfcache & NFC_ALTERED) {
                if (ip6_route_me_harder(skb) != 0) {
                        kfree_skb(skb);
                        return -EINVAL;
                }
        }
#endif /* CONFIG_NETFILTER */
        return dst_output(skb);
}

/*
 *      xmit an sk_buff (used by TCP)
 */

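/*
 *      Pushes any extension headers from 'opt' (reallocating headroom
 *      if needed) and the IPv6 header onto the skb, then hands the
 *      packet to the LOCAL_OUT hook.  When the result exceeds the path
 *      MTU and the caller did not set ipfragok, an ICMPV6_PKT_TOOBIG
 *      error is generated locally and the packet is dropped.
 */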
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
        struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit;
        u32 mtu;

        if (opt) {
                int head_room;

                /* First: exthdrs may take lots of space (~8K for now)
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {
                                IP6_INC_STATS(Ip6OutDiscards);
                                return -ENOBUFS;
                        }
                        if (sk)
                                skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

        /*
         *      Fill in the IPv6 header
         */

        *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
        hlimit = -1;
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = dst_metric(dst, RTAX_HOPLIMIT);

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        mtu = dst_pmtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
                IP6_INC_STATS(Ip6OutRequests);
                return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
        }

        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
        IP6_INC_STATS(Ip6FragFails);
        kfree_skb(skb);
        return -EMSGSIZE;
}

/*
 *      To avoid extra problems ND packets are sent through this
 *      routine.  It duplicates code, but we really want to avoid
 *      extra checks, since ipv6_build_header is used by TCP (which
 *      is performance-critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               struct in6_addr *saddr, struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        int totlen;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        totlen = len + sizeof(struct ipv6hdr);

        hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
        skb->nh.ipv6h = hdr;

        *(u32*)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        ipv6_addr_copy(&hdr->saddr, saddr);
        ipv6_addr_copy(&hdr->daddr, daddr);

        return 0;
}

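/*
 *      Deliver a packet carrying a Router Alert option to every raw
 *      socket registered for that alert value (see ip6_ra_chain, which
 *      is filled in via the IPV6_ROUTER_ALERT socket option).  Each
 *      matching socket but the last receives a clone; the last one
 *      consumes the original.  Returns 1 if the packet was delivered.
 */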
int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

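/*
 *      Forward a packet that is not addressed to us: check forwarding
 *      policy, deliver Router Alert packets to interested raw sockets,
 *      enforce the hop limit, emit redirects where appropriate, verify
 *      the path MTU, and finally decrement hop_limit and pass the
 *      packet to the FORWARD netfilter hook.
 */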
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr = skb->nh.ipv6h;
        struct inet6_skb_parm *opt = (struct inet6_skb_parm*)skb->cb;

        if (ipv6_devconf.forwarding == 0)
                goto error;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(Ip6InDiscards);
                goto drop;
        }

        skb->ip_summed = CHECKSUM_NONE;

        /*
         *      We do NOT do any processing on RA packets: they are pushed
         *      to user level AS IS, without any warranty that the
         *      application will be able to interpret them.  The reason is
         *      that we cannot do anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything.  Defragmentation would also
         *      be a mistake: RA packets must not be fragmented, because
         *      there is no guarantee that different fragments will follow
         *      the same path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb->nh.raw + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
                            0, skb->dev);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(Ip6InDiscards);
                goto drop;
        }

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
         */
        if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if ((rt->rt6i_flags & RTF_GATEWAY))
                        target = (struct in6_addr*)&n->primary_key;
                else
                        target = &hdr->daddr;

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (xrlim_allow(dst, 1*HZ))
                        ndisc_send_redirect(skb, n, target);
        } else if (ipv6_addr_type(&hdr->saddr) & (IPV6_ADDR_MULTICAST |
                   IPV6_ADDR_LOOPBACK | IPV6_ADDR_LINKLOCAL)) {
                /* This check is security critical. */
                goto error;
        }

        if (skb->len > dst_pmtu(dst)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_pmtu(dst), skb->dev);
                IP6_INC_STATS_BH(Ip6InTooBigErrors);
                IP6_INC_STATS_BH(Ip6FragFails);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(Ip6OutDiscards);
                goto drop;
        }

        hdr = skb->nh.ipv6h;

        /* Decrementing the hop limit is delayed until after the skb COW */
        hdr->hop_limit--;

        IP6_INC_STATS_BH(Ip6OutForwDatagrams);
        return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);

error:
        IP6_INC_STATS_BH(Ip6InAddrErrors);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

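/*
 *      Copy the per-packet metadata (packet type, priority, routing,
 *      scheduling and netfilter state) from the original skb to a
 *      freshly allocated fragment.
 */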
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        to->security = from->security;
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#ifdef CONFIG_NETFILTER_DEBUG
        to->nf_debug = from->nf_debug;
#endif
#endif
}

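/*
 *      Find where a fragment header must be inserted: the unfragmentable
 *      part ends after any hop-by-hop, routing and destination options
 *      headers, except that a destination options header following a
 *      routing header (i.e. one meant for the final destination) stays
 *      in the fragmentable part.  Returns the length of the
 *      unfragmentable part and leaves *nexthdr pointing at the nexthdr
 *      byte that the fragment header will take over.
 */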
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
        unsigned int packet_len = skb->tail - skb->nh.raw;
        int found_rhdr = 0;
        *nexthdr = &skb->nh.ipv6h->nexthdr;

        while (offset + 1 <= packet_len) {
                switch (**nexthdr) {
                case NEXTHDR_HOP:
                case NEXTHDR_ROUTING:
                case NEXTHDR_DEST:
                        if (**nexthdr == NEXTHDR_ROUTING)
                                found_rhdr = 1;
                        if (**nexthdr == NEXTHDR_DEST && found_rhdr)
                                return offset;
                        offset += ipv6_optlen(exthdr);
                        *nexthdr = &exthdr->nexthdr;
                        exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
                        break;
                default:
                        return offset;
                }
        }

        return offset;
}

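/*
 *      Fragment an oversized packet and feed each fragment to 'output'.
 *      Fast path: when the packet already comes as a chain of properly
 *      sized, unshared buffers on frag_list, each buffer becomes one
 *      fragment in place.  Otherwise fall back to the slow path, which
 *      copies the payload into freshly allocated fragments of at most
 *      'mtu' bytes (the data space left after the unfragmentable part
 *      and the fragment header).
 */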
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
        struct net_device *dev;
        struct rt6_info *rt = (struct rt6_info*)skb->dst;
        struct sk_buff *frag;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        u32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        dev = rt->u.dst.dev;
        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = dst_pmtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

        if (skb_shinfo(skb)->frag_list) {
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Correct socket ownership. */
                        if (frag->sk == NULL)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                /* BUILD HEADER */

                tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(Ip6FragFails);
                        return -ENOMEM;
                }

                *prevhdr = NEXTHDR_FRAGMENT;
                memcpy(tmp_hdr, skb->nh.raw, hlen);
                __skb_pull(skb, hlen);
                fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
                skb->nh.raw = __skb_push(skb, hlen);
                memcpy(skb->nh.raw, tmp_hdr, hlen);

                ipv6_select_ident(skb, fh);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->h.raw = frag->data;
                                fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, tmp_hdr, hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }
                        err = output(skb);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(Ip6FragOKs);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(Ip6FragFails);
                return err;
        }
slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
                        IP6_INC_STATS(Ip6FragFails);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                frag->nh.raw = frag->data;
                fh = (struct frag_hdr*)(frag->data + hlen);
                frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                memcpy(frag->nh.raw, skb->data, hlen);

                /*
                 *      Build fragment header.  Select a new id for the
                 *      first fragment and reuse it for the rest.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(skb, fh);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, frag->h.raw, len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                IP6_INC_STATS(Ip6FragCreates);

                err = output(frag);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP6_INC_STATS(Ip6FragOKs);
        return err;

fail:
        kfree_skb(skb);
        IP6_INC_STATS(Ip6FragFails);
        return err;
}

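/*
 *      Resolve the dst entry for a flow.  A route cached on the socket
 *      is reused when it is still valid for this destination and
 *      output interface; otherwise a fresh route lookup is done.  A
 *      missing source address is filled in from the chosen route, and
 *      the result is passed through xfrm_lookup() for IPsec policy.
 */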
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        int err = 0;

        *dst = NULL;
        if (sk) {
                struct ipv6_pinfo *np = inet6_sk(sk);

                *dst = __sk_dst_check(sk, np->dst_cookie);
                if (*dst) {
                        struct rt6_info *rt = (struct rt6_info*)*dst;

                        /* Yes, checking route validity in the not connected
                           case is not very simple.  Take into account that
                           we do not support routing by source, TOS, or
                           MSG_DONTROUTE.               --ANK (980726)

                           1. If the route was a host route, check that the
                              cached destination is current.
                              If it is a network route, we still may check
                              its validity using the saved pointer to the
                              last used address: daddr_cache.
                              We do not want to save the whole address now
                              (because the main consumer of this service
                              is TCP, which does not have this problem),
                              so the last trick works only on connected
                              sockets.
                           2. oif must also be the same.
                         */
                        if (((rt->rt6i_dst.plen != 128 ||
                              ipv6_addr_cmp(&fl->fl6_dst, &rt->rt6i_dst.addr))
                             && (np->daddr_cache == NULL ||
                                 ipv6_addr_cmp(&fl->fl6_dst, np->daddr_cache)))
                            || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
                                *dst = NULL;
                        } else
                                dst_hold(*dst);
                }
        }

        if (*dst == NULL)
                *dst = ip6_route_output(sk, fl);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl->fl6_src)) {
                err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
                if (err) {
#if IP6_DEBUG >= 2
                        printk(KERN_DEBUG "ip6_dst_lookup: "
                               "no available source address\n");
#endif
                        goto out_err_release;
                }
        }
        if ((err = xfrm_lookup(dst, fl, sk, 0)) < 0) {
                err = -ENETUNREACH;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;
        return err;
}

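/*
 *      Queue data on the socket write queue for later transmission by
 *      ip6_push_pending_frames().  The first call sets up the cork
 *      state (options, route, flow, fragment size); subsequent calls
 *      under the same cork append to it.  Data is packed into skbs of
 *      at most maxfraglen bytes, with room reserved for the fragment
 *      header, and is copied in via the caller's getfrag() callback,
 *      using page frags when the device supports scatter/gather.
 */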
int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, int length, int transhdrlen,
                    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl,
                    struct rt6_info *rt, unsigned int flags)
{
        struct inet_opt *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
        int copy = 0;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;

        if (flags & MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (np->cork.opt == NULL) {
                                np->cork.opt = kmalloc(opt->tot_len,
                                                       sk->sk_allocation);
                                if (unlikely(np->cork.opt == NULL))
                                        return -ENOBUFS;
                        } else if (np->cork.opt->tot_len < opt->tot_len) {
                                printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
                                return -EINVAL;
                        }
                        memcpy(np->cork.opt, opt, opt->tot_len);
                        inet->cork.flags |= IPCORK_OPT;
                        /* need source address above --miyazawa */
                }
                dst_hold(&rt->u.dst);
                np->cork.rt = rt;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
                inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
                inet->cork.length = 0;
                inet->sndmsg_page = NULL;
                inet->sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                rt = np->cork.rt;
                fl = &inet->cork.fl;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
                        return -EMSGSIZE;
                }
        }

        inet->cork.length += length;

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                if ((copy = maxfraglen - skb->len) <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int alloclen;
                        BUG_TRAP(copy == 0);
alloc_new_skb:
                        datalen = maxfraglen - fragheaderlen;
                        if (datalen > length)
                                datalen = length;
                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features & NETIF_F_SG))
                                alloclen = maxfraglen;
                        else
                                alloclen = fraglen;
                        alloclen += sizeof(struct frag_hdr);
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve 8 bytes for the fragment header */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;
                        copy = datalen - transhdrlen;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = inet->sndmsg_page;
                        int off = inet->sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, inet->sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                inet->sndmsg_page = page;
                                inet->sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from,
                                    page_address(frag->page) + frag->page_offset + frag->size,
                                    offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        inet->sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }
        return 0;
error:
        inet->cork.length -= length;
        IP6_INC_STATS(Ip6OutDiscards);
        return err;
}

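/*
 *      Transmit everything queued by ip6_append_data(): splice the
 *      queued skbs into one packet via frag_list, push the corked
 *      extension headers and the IPv6 header, then send the result
 *      through the LOCAL_OUT hook and release the cork state.
 */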
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct inet_opt *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
        struct rt6_info *rt = np->cork.rt;
        struct flowi *fl = &inet->cork.fl;
        unsigned char proto = fl->proto;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
#endif
        }

        ipv6_addr_copy(final_dst, &fl->fl6_dst);
        __skb_pull(skb, skb->h.raw - skb->nh.raw);
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));

        *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);

        if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
                hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
        else
                hdr->payload_len = 0;
        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, final_dst);

        skb->dst = dst_clone(&rt->u.dst);
        IP6_INC_STATS(Ip6OutRequests);
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = inet->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
        return err;
error:
        goto out;
}

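/*
 *      Abort a corked transmission: discard everything queued on the
 *      socket write queue and release the cork state.
 */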
void ip6_flush_pending_frames(struct sock *sk)
{
        struct inet_opt *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
                IP6_INC_STATS(Ip6OutDiscards);
                kfree_skb(skb);
        }

        inet->cork.flags &= ~IPCORK_OPT;

        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}