ed31c5320eafc7d6135d3a2cb8332289a8a12db0
[linux-2.6.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation 
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *      Based on linux/net/ipv4/ip_output.c
11  *
12  *      This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *      Changes:
18  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
19  *                              extension headers are implemented.
20  *                              route changes now work.
21  *                              ip6_forward does not confuse sniffers.
22  *                              etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *      Imran Patel     :       frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *                      :       add ip6_append_data and related functions
28  *                              for datagram xmit
29  */
30
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
/*
 * Assign the next global IPv6 fragmentation ID to @fhdr.
 * The counter deliberately skips 0 (ip6_fragment() uses 0 to mean
 * "no ID chosen yet") and is guarded by a local spinlock because
 * fragmentation may run concurrently in BH context.
 */
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static spinlock_t ip6_id_lock = SPIN_LOCK_UNLOCKED;

	spin_lock_bh(&ip6_id_lock);
	/* Stored in network byte order (see "frag id should be in NBO"
	 * in the file changelog). */
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;	/* wrap, but never hand out 0 */
	spin_unlock_bh(&ip6_id_lock);
}
72
/*
 * Final output step: hand the packet to the link layer.
 * If the dst carries a cached hardware header (hh_cache), prepend it
 * and transmit via hh_output(); otherwise fall back to the neighbour's
 * output method (which resolves/builds the link-layer header itself).
 * With neither available the packet cannot be sent and is dropped.
 */
static inline int ip6_output_finish(struct sk_buff *skb)
{

	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

	if (hh) {
		int hh_alen;

		/* hh_data is stored HH_DATA_ALIGN()-padded: copy the whole
		 * aligned block in front of skb->data, but only push the
		 * real header length. */
		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	/* No cached header and no neighbour: count as no-route and drop. */
	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;

}
96
/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	/* Rewind the skb so it looks like a freshly received frame:
	 * mac header at the current data pointer, data re-pointed at
	 * the network header, then re-inject into the receive path. */
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	/* We built this packet ourselves, no need to verify checksums. */
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}
109
110
/*
 * Second-stage output: loop multicast back to local listeners when
 * required, then pass the packet through the POST_ROUTING netfilter
 * hook on its way to ip6_output_finish().
 */
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

		/* Loop a copy back to ourselves when: not already on the
		 * loopback device, the socket has multicast loopback
		 * enabled (or there is no socket), and this host is a
		 * member of the destination group on this device. */
		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
				&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip6_dev_loopback_xmit);

			/* Hop limit 0: deliver locally only, never put it
			 * on the wire. */
			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
}
147
148 int ip6_output(struct sk_buff *skb)
149 {
150         if (skb->len > dst_pmtu(skb->dst))
151                 return ip6_fragment(skb, ip6_output2);
152         else
153                 return ip6_output2(skb);
154 }
155
#ifdef CONFIG_NETFILTER
/*
 * Re-do the routing decision for a packet whose header may have been
 * rewritten by netfilter.  On success the packet's old dst is replaced
 * by the freshly looked-up one; on failure the packet keeps its (now
 * possibly stale) dst and -EINVAL is returned.
 */
int ip6_route_me_harder(struct sk_buff *skb)
{
	struct ipv6hdr *iph = skb->nh.ipv6h;
	struct dst_entry *dst;
	/* Rebuild a flow description from the packet's current header. */
	struct flowi fl = {
		.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
		.nl_u =
		{ .ip6_u =
		  { .daddr = iph->daddr,
		    .saddr = iph->saddr, } },
		.proto = iph->nexthdr,
	};

	dst = ip6_route_output(skb->sk, &fl);

	if (dst->error) {
		IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
		LIMIT_NETDEBUG(
			printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
		dst_release(dst);
		return -EINVAL;
	}

	/* Drop old route. */
	dst_release(skb->dst);

	skb->dst = dst;
	return 0;
}
#endif
187
/*
 * LOCAL_OUT continuation: if netfilter flagged the packet as altered,
 * re-route it before handing it to dst_output().  Frees the skb and
 * returns -EINVAL when rerouting fails.
 */
static inline int ip6_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
	if (skb->nfcache & NFC_ALTERED){
		if (ip6_route_me_harder(skb) != 0){
			kfree_skb(skb);
			return -EINVAL;
		}
	}
#endif /* CONFIG_NETFILTER */
	return dst_output(skb);
}
200
/*
 *	xmit an sk_buff (used by TCP)
 *
 *	Prepends the extension headers described by @opt (reallocating
 *	headroom if needed) and the IPv6 header itself, then sends the
 *	packet through the LOCAL_OUT netfilter hook.  When the result
 *	exceeds the path MTU and @ipfragok is 0, an ICMPV6_PKT_TOOBIG
 *	is generated towards ourselves (so the owning socket learns the
 *	MTU) and -EMSGSIZE is returned.  Consumes @skb on every path.
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit;
	u32 mtu;

	if (opt) {
		int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			/* The original skb is always consumed here; on
			 * allocation failure the packet is dropped. */
			kfree_skb(skb);
			skb = skb2;
			if (skb == NULL) {	
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				return -ENOBUFS;
			}
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			/* May rewrite first_hop (e.g. via a routing header). */
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

	/*
	 *	Fill in the IPv6 header
	 */

	/* Version 6 in the top nibble, plus the flow label from @fl. */
	*(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
	/* Hop limit: per-socket value when set, else the route metric. */
	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	mtu = dst_pmtu(dst);
	if ((skb->len <= mtu) || ipfragok) {
		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
	}

	/* Too big and fragmentation not permitted: notify ourselves. */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
278
/*
 *	To avoid extra problems ND packets are send through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is for us performance critical)
 */

/*
 *	Build an IPv6 header for a neighbour-discovery packet directly
 *	into @skb: version word, payload length @len, next header
 *	@proto, the socket's hop limit and the supplied addresses.
 *	Always returns 0.
 */
int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       struct in6_addr *saddr, struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	/* NOTE(review): totlen is computed but never used below. */
	totlen = len + sizeof(struct ipv6hdr);

	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
	skb->nh.ipv6h = hdr;

	/* Version 6, zero traffic class and flow label. */
	*(u32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}
313
/*
 *	Deliver @skb to raw sockets registered on the IPv6 Router Alert
 *	chain whose selector matches @sel.  Every matching socket except
 *	the last receives a clone; the last match gets the original skb.
 *	Returns 1 when the skb was consumed by at least one socket,
 *	0 when nothing matched (caller keeps ownership).
 */
int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel) {
			if (last) {
				/* The previous match gets a clone so the
				 * original stays available for the last. */
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
340
/* Last step of the forward path: hand the packet to the dst layer. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	int err = dst_output(skb);

	return err;
}
345
/*
 *	Forward a received IPv6 packet towards its next hop: xfrm policy
 *	check, optional Router Alert delivery, hop-limit and source
 *	address sanity checks, redirect generation, MTU check, and
 *	finally the hop_limit decrement and the FORWARD netfilter hook.
 *	Consumes @skb on every path.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);
	
	/* Forwarding must be enabled globally. */
	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Forwarded payload is not checksum-verified by us. */
	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without ane WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		/* ptr[2..3] carries the Router Alert option value. */
		u8 *ptr = skb->nh.raw + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
						|IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_pmtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_pmtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* Reload: skb_cow() may have copied the header data. */
	hdr = skb->nh.ipv6h;

	/* Mangling hops number delayed to point after skb COW */
 
	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
459
/*
 *	Propagate per-packet metadata from the original packet @from to
 *	a freshly created fragment @to: packet type, priority, protocol,
 *	dst (with its own reference), device, and the traffic-control /
 *	netfilter state (conntrack and bridge refcounts are taken).
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	to->security = from->security;
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
	/* Drop whatever bridge info @to may have held, then share @from's. */
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#ifdef CONFIG_NETFILTER_DEBUG
	to->nf_debug = from->nf_debug;
#endif
#endif
}
488
/*
 *	Find the offset (from the network header) at which a Fragment
 *	header must be inserted, i.e. just past the per-fragment
 *	extension headers: Hop-by-Hop, Routing, and any Destination
 *	Options header that follows a Routing header.  On return
 *	*nexthdr points at the nexthdr byte that has to be overwritten
 *	with NEXTHDR_FRAGMENT.
 */
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
	unsigned int packet_len = skb->tail - skb->nh.raw;
	int found_rhdr = 0;
	*nexthdr = &skb->nh.ipv6h->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
		case NEXTHDR_ROUTING:
		case NEXTHDR_DEST:
			/* A DEST header after a ROUTING header already
			 * belongs to the fragmentable part: stop here. */
			if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
			if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
			offset += ipv6_optlen(exthdr);
			*nexthdr = &exthdr->nexthdr;
			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
			break;
		default :
			/* Any other header starts the fragmentable part. */
			return offset;
		}
	}

	return offset;
}
517
518 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
519 {
520         struct net_device *dev;
521         struct sk_buff *frag;
522         struct rt6_info *rt = (struct rt6_info*)skb->dst;
523         struct ipv6hdr *tmp_hdr;
524         struct frag_hdr *fh;
525         unsigned int mtu, hlen, left, len;
526         u32 frag_id = 0;
527         int ptr, offset = 0, err=0;
528         u8 *prevhdr, nexthdr = 0;
529
530         dev = rt->u.dst.dev;
531         hlen = ip6_find_1stfragopt(skb, &prevhdr);
532         nexthdr = *prevhdr;
533
534         mtu = dst_pmtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
535
536         if (skb_shinfo(skb)->frag_list) {
537                 int first_len = skb_pagelen(skb);
538
539                 if (first_len - hlen > mtu ||
540                     ((first_len - hlen) & 7) ||
541                     skb_cloned(skb))
542                         goto slow_path;
543
544                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
545                         /* Correct geometry. */
546                         if (frag->len > mtu ||
547                             ((frag->len & 7) && frag->next) ||
548                             skb_headroom(frag) < hlen)
549                             goto slow_path;
550
551                         /* Correct socket ownership. */
552                         if (frag->sk == NULL)
553                                 goto slow_path;
554
555                         /* Partially cloned skb? */
556                         if (skb_shared(frag))
557                                 goto slow_path;
558                 }
559
560                 err = 0;
561                 offset = 0;
562                 frag = skb_shinfo(skb)->frag_list;
563                 skb_shinfo(skb)->frag_list = NULL;
564                 /* BUILD HEADER */
565
566                 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
567                 if (!tmp_hdr) {
568                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
569                         return -ENOMEM;
570                 }
571
572                 *prevhdr = NEXTHDR_FRAGMENT;
573                 memcpy(tmp_hdr, skb->nh.raw, hlen);
574                 __skb_pull(skb, hlen);
575                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
576                 skb->nh.raw = __skb_push(skb, hlen);
577                 memcpy(skb->nh.raw, tmp_hdr, hlen);
578
579                 ipv6_select_ident(skb, fh);
580                 fh->nexthdr = nexthdr;
581                 fh->reserved = 0;
582                 fh->frag_off = htons(IP6_MF);
583                 frag_id = fh->identification;
584
585                 first_len = skb_pagelen(skb);
586                 skb->data_len = first_len - skb_headlen(skb);
587                 skb->len = first_len;
588                 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
589  
590
591                 for (;;) {
592                         /* Prepare header of the next frame,
593                          * before previous one went down. */
594                         if (frag) {
595                                 frag->h.raw = frag->data;
596                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
597                                 frag->nh.raw = __skb_push(frag, hlen);
598                                 memcpy(frag->nh.raw, tmp_hdr, hlen);
599                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
600                                 fh->nexthdr = nexthdr;
601                                 fh->reserved = 0;
602                                 fh->frag_off = htons(offset);
603                                 if (frag->next != NULL)
604                                         fh->frag_off |= htons(IP6_MF);
605                                 fh->identification = frag_id;
606                                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
607                                 ip6_copy_metadata(frag, skb);
608                         }
609                         
610                         err = output(skb);
611                         if (err || !frag)
612                                 break;
613
614                         skb = frag;
615                         frag = skb->next;
616                         skb->next = NULL;
617                 }
618
619                 if (tmp_hdr)
620                         kfree(tmp_hdr);
621
622                 if (err == 0) {
623                         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
624                         return 0;
625                 }
626
627                 while (frag) {
628                         skb = frag->next;
629                         kfree_skb(frag);
630                         frag = skb;
631                 }
632
633                 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
634                 return err;
635         }
636
637 slow_path:
638         left = skb->len - hlen;         /* Space per frame */
639         ptr = hlen;                     /* Where to start from */
640
641         /*
642          *      Fragment the datagram.
643          */
644
645         *prevhdr = NEXTHDR_FRAGMENT;
646
647         /*
648          *      Keep copying data until we run out.
649          */
650         while(left > 0) {
651                 len = left;
652                 /* IF: it doesn't fit, use 'mtu' - the data space left */
653                 if (len > mtu)
654                         len = mtu;
655                 /* IF: we are not sending upto and including the packet end
656                    then align the next start on an eight byte boundary */
657                 if (len < left) {
658                         len &= ~7;
659                 }
660                 /*
661                  *      Allocate buffer.
662                  */
663
664                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
665                         NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
666                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
667                         err = -ENOMEM;
668                         goto fail;
669                 }
670
671                 /*
672                  *      Set up data on packet
673                  */
674
675                 ip6_copy_metadata(frag, skb);
676                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
677                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
678                 frag->nh.raw = frag->data;
679                 fh = (struct frag_hdr*)(frag->data + hlen);
680                 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
681
682                 /*
683                  *      Charge the memory for the fragment to any owner
684                  *      it might possess
685                  */
686                 if (skb->sk)
687                         skb_set_owner_w(frag, skb->sk);
688
689                 /*
690                  *      Copy the packet header into the new buffer.
691                  */
692                 memcpy(frag->nh.raw, skb->data, hlen);
693
694                 /*
695                  *      Build fragment header.
696                  */
697                 fh->nexthdr = nexthdr;
698                 fh->reserved = 0;
699                 if (frag_id) {
700                         ipv6_select_ident(skb, fh);
701                         frag_id = fh->identification;
702                 } else
703                         fh->identification = frag_id;
704
705                 /*
706                  *      Copy a block of the IP datagram.
707                  */
708                 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
709                         BUG();
710                 left -= len;
711
712                 fh->frag_off = htons(offset);
713                 if (left > 0)
714                         fh->frag_off |= htons(IP6_MF);
715                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
716
717                 ptr += len;
718                 offset += len;
719
720                 /*
721                  *      Put this fragment into the sending queue.
722                  */
723
724                 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
725
726                 err = output(frag);
727                 if (err)
728                         goto fail;
729         }
730         kfree_skb(skb);
731         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
732         return err;
733
734 fail:
735         kfree_skb(skb); 
736         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
737         return err;
738 }
739
/*
 *	Resolve a dst entry for flow @fl on behalf of @sk (may be NULL).
 *	A route cached on the socket is reused when it is still valid
 *	for the flow's destination and oif; otherwise a fresh routing
 *	lookup is done.  Also fills in fl->fl6_src when the caller left
 *	it unspecified.  On success *dst holds a reference the caller
 *	must release; on error *dst is NULL and a negative errno is
 *	returned.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);
	
		*dst = sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info*)*dst;
	
				/* Yes, checking route validity in not connected
				   case is not very simple. Take into account,
				   that we do not support routing by source, TOS,
				   and MSG_DONTROUTE		--ANK (980726)
	
				   1. If route was host route, check that
				      cached destination is current.
				      If it is network route, we still may
				      check its validity using saved pointer
				      to the last used address: daddr_cache.
				      We do not want to save whole address now,
				      (because main consumer of this service
				       is tcp, which has not this problem),
				      so that the last trick works only on connected
				      sockets.
				   2. oif also should be the same.
				 */
	
			/* Drop the cached route if it no longer matches the
			 * flow's destination or output interface. */
			if (((rt->rt6i_dst.plen != 128 ||
			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				dst_release(*dst);
				*dst = NULL;
			}
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	/* Pick a source address for the flow if none was given. */
	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

		if (err) {
#if IP6_DEBUG >= 2
			printk(KERN_DEBUG "ip6_dst_lookup: "
			       "no available source address\n");
#endif
			goto out_err_release;
		}
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}
806
/*
 *	ip6_append_data - queue payload on sk->sk_write_queue so that
 *	ip6_push_pending_frames() can later assemble and transmit the
 *	IPv6 packet(s).  On the first call for a packet it sets up the
 *	corking state (options, route, flow, hop limit, mtu); subsequent
 *	calls reuse that state.  Data that does not fit in the current
 *	skb is split across new skbs sized for fragmentation.
 *
 *	getfrag:	caller-supplied copy routine; copies "len" payload
 *			bytes at "offset" from the opaque cookie "from"
 *			into "to", returns < 0 on failure.
 *	from, length:	payload source cookie and byte count.
 *	transhdrlen:	transport header length; non-zero only on the
 *			first append for a packet (selects the blocking
 *			sock_alloc_send_skb() path below).
 *	hlimit, opt, fl, rt:	hop limit, IPv6 tx options, flow and
 *			route captured into the cork on the first call.
 *	flags:		MSG_* flags (MSG_PROBE, MSG_MORE, MSG_DONTWAIT).
 *
 *	Returns 0 on success or a negative errno.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
		    unsigned int flags)
{
	struct inet_opt *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;		/* running offset into the caller's data */
	int csummode = CHECKSUM_NONE;

	/* MSG_PROBE: caller is only probing the path, queue nothing. */
	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking: first append for this packet, so
		 * capture options, route, flow, hop limit and mtu.
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				/* existing cork buffer too small for the new options */
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* TODO (miyazawa): source address selection is needed above */
		}
		/* hold the route for the lifetime of the cork */
		dst_hold(&rt->u.dst);
		np->cork.rt = rt;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		/* first skb also carries IPsec/extension headers */
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* packet already corked: reuse the saved state */
		rt = np->cork.rt;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
	/* largest 8-byte-aligned payload boundary when fragmenting */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		/* total corked length must fit the 16-bit payload_len field */
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split 
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks 
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji 
	 */

	inet->cork.length += length;

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb: data beyond
			 * maxfraglen must migrate to the new fragment. */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message 
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				/* first skb of the packet: may block for memory */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* later skbs: non-blocking, bounded by 2 * sndbuf */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for link-layer header and fragment header */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				/* move the overhang from the previous skb into
				 * this fragment, keeping checksums consistent */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* headers only go in the first skb of the packet */
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			/* no scatter-gather: copy into the skb's linear area */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				/* undo the skb_put() on copy failure */
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* scatter-gather: append into page fragments */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				/* the cached per-socket page still has room */
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				/* allocate a fresh page and cache it on the socket */
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	/* back out the length we optimistically accounted above */
	inet->cork.length -= length;
	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1084
/*
 *	ip6_push_pending_frames - assemble everything queued on
 *	sk->sk_write_queue by ip6_append_data() into one packet
 *	(head skb with the rest chained on its frag_list), prepend
 *	options and the IPv6 header, and hand it to netfilter /
 *	dst_output().  Always clears the cork state on exit.
 *
 *	Returns 0 on success (or when the queue is empty), otherwise
 *	an error code from the output path.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_opt *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = np->cork.rt;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	/* nothing queued: just fall through to the cork cleanup */
	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	/* chain the remaining skbs onto the head skb's frag_list,
	 * stripping their network/extension headers first */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
#endif
	}

	/* final_dst may be rewritten below when routing-header options are
	 * pushed, so take a private copy of the flow's destination */
	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb->h.raw - skb->nh.raw);
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
	
	/* first 32 bits: version 6 (0x60000000 in network order) | flow label */
	*(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);

	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
	else
		/* payload too big for the 16-bit field; left as 0
		 * (NOTE(review): presumably relies on a jumbo payload
		 * option having been set up — confirm with callers) */
		hdr->payload_len = 0;
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 
	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
	if (err) {
		/* positive value: qdisc verdict, map per sockopt; else error */
		if (err > 0)
			err = inet->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	/* release cork state in every case (success, empty queue, error) */
	inet->cork.flags &= ~IPCORK_OPT;
	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
	return err;
error:
	goto out;
}
1165
1166 void ip6_flush_pending_frames(struct sock *sk)
1167 {
1168         struct inet_opt *inet = inet_sk(sk);
1169         struct ipv6_pinfo *np = inet6_sk(sk);
1170         struct sk_buff *skb;
1171
1172         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1173                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1174                 kfree_skb(skb);
1175         }
1176
1177         inet->cork.flags &= ~IPCORK_OPT;
1178
1179         if (np->cork.opt) {
1180                 kfree(np->cork.opt);
1181                 np->cork.opt = NULL;
1182         }
1183         if (np->cork.rt) {
1184                 dst_release(&np->cork.rt->u.dst);
1185                 np->cork.rt = NULL;
1186         }
1187         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1188 }