linux-2.6.git (vserver 1.9.3): net/ipv6/ip6_output.c
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**));

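/*
 *	Assign the next value of a global 32-bit counter (skipping zero)
 *	to the fragment header's identification field.
 */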
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
        static u32 ipv6_fragmentation_id = 1;
        static spinlock_t ip6_id_lock = SPIN_LOCK_UNLOCKED;

        spin_lock_bh(&ip6_id_lock);
        fhdr->identification = htonl(ipv6_fragmentation_id);
        if (++ipv6_fragmentation_id == 0)
                ipv6_fragmentation_id = 1;
        spin_unlock_bh(&ip6_id_lock);
}

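/*
 *	Hand the packet to the link layer: use the cached hardware header
 *	if one exists, otherwise go through the neighbour output function.
 */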
static inline int ip6_output_finish(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;

        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);

        netif_rx(newskb);
        return 0;
}

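/*
 *	Post-routing output: loop back multicast packets to local listeners
 *	when required, then pass the packet through NF_IP6_POST_ROUTING.
 */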
static int ip6_output2(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
                struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

                if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
                    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
                                &skb->nh.ipv6h->saddr)) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip6_dev_loopback_xmit);

                        if (skb->nh.ipv6h->hop_limit == 0) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        }

        return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
                       ip6_output_finish);
}

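/*
 *	Fragment the packet if it exceeds the path MTU (or already carries
 *	a fragment list), otherwise send it out directly.
 */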
int ip6_output(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;

        if (skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list)
                return ip6_fragment(pskb, ip6_output2);
        else
                return ip6_output2(pskb);
}

#ifdef CONFIG_NETFILTER
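/*
 *	Re-route a packet whose header was rewritten by netfilter, so that
 *	the cached route matches the possibly new addresses.
 */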
int ip6_route_me_harder(struct sk_buff *skb)
{
        struct ipv6hdr *iph = skb->nh.ipv6h;
        struct dst_entry *dst;
        struct flowi fl = {
                .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
                .nl_u =
                { .ip6_u =
                  { .daddr = iph->daddr,
                    .saddr = iph->saddr, } },
                .proto = iph->nexthdr,
        };

        dst = ip6_route_output(skb->sk, &fl);

        if (dst->error) {
                IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
                LIMIT_NETDEBUG(
                        printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
                dst_release(dst);
                return -EINVAL;
        }

        /* Drop old route. */
        dst_release(skb->dst);

        skb->dst = dst;
        return 0;
}
#endif

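/* Re-do the route lookup if netfilter altered the packet, then transmit. */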
static inline int ip6_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
        if (skb->nfcache & NFC_ALTERED) {
                if (ip6_route_me_harder(skb) != 0) {
                        kfree_skb(skb);
                        return -EINVAL;
                }
        }
#endif /* CONFIG_NETFILTER */
        return dst_output(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
        struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit;
        u32 mtu;

        if (opt) {
                int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                return -ENOBUFS;
                        }
                        if (sk)
                                skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

        /*
         *	Fill in the IPv6 header
         */

        *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
        hlimit = -1;
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = dst_metric(dst, RTAX_HOPLIMIT);

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        mtu = dst_pmtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
                IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
                return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
                               ip6_maybe_reroute);
        }

        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}


/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It is code duplication, but we really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               struct in6_addr *saddr, struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        int totlen;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        totlen = len + sizeof(struct ipv6hdr);

        hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
        skb->nh.ipv6h = hdr;

        *(u32*)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        ipv6_addr_copy(&hdr->saddr, saddr);
        ipv6_addr_copy(&hdr->daddr, daddr);

        return 0;
}

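/*
 *	Deliver a packet carrying a Router Alert option to every raw socket
 *	registered for the given selector. Returns 1 if the skb was consumed
 *	by at least one socket, 0 otherwise.
 */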
int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

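/*
 *	Forward a packet that is not addressed to us: validate hop limit,
 *	policy and MTU, send redirects where appropriate, decrement the
 *	hop limit and hand the packet to NF_IP6_FORWARD.
 */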
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr = skb->nh.ipv6h;
        struct inet6_skb_parm *opt = IP6CB(skb);

        if (ipv6_devconf.forwarding == 0)
                goto error;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb->ip_summed = CHECKSUM_NONE;

        /*
         *	We do no processing on RA packets, pushing them to
         *	user level AS IS without any guarantee that the
         *	application will be able to interpret them. The reason
         *	is that we cannot make anything clever here.
         *
         *	We are not an end node, so if the packet contains
         *	AH/ESP we cannot do anything. Defragmentation would
         *	also be a mistake; RA packets cannot be fragmented,
         *	because there is no guarantee that different fragments
         *	will go along one path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb->nh.raw + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *	check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
                            0, skb->dev);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
         */
        if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;

                /*
                 *	incoming and outgoing devices are the same:
                 *	send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if ((rt->rt6i_flags & RTF_GATEWAY))
                        target = (struct in6_addr*)&n->primary_key;
                else
                        target = &hdr->daddr;

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (xrlim_allow(dst, 1*HZ))
                        ndisc_send_redirect(skb, n, target);
        } else if (ipv6_addr_type(&hdr->saddr) & (IPV6_ADDR_MULTICAST |
                   IPV6_ADDR_LOOPBACK | IPV6_ADDR_LINKLOCAL)) {
                /* This check is security critical. */
                goto error;
        }

        if (skb->len > dst_pmtu(dst)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_pmtu(dst), skb->dev);
                IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = skb->nh.ipv6h;

        /* Decrementing the hop limit is delayed until after the skb COW. */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

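/* Copy per-packet metadata from the original skb to a fragment. */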
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        to->security = from->security;
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
        to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#ifdef CONFIG_NETFILTER_DEBUG
        to->nf_debug = from->nf_debug;
#endif
#endif
}

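/*
 *	Walk the extension header chain and return the offset at which the
 *	fragment header must be inserted; *nexthdr is left pointing at the
 *	nexthdr byte to be overwritten with NEXTHDR_FRAGMENT.
 */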
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
        unsigned int packet_len = skb->tail - skb->nh.raw;
        int found_rhdr = 0;
        *nexthdr = &skb->nh.ipv6h->nexthdr;

        while (offset + 1 <= packet_len) {
                switch (**nexthdr) {
                case NEXTHDR_HOP:
                case NEXTHDR_ROUTING:
                case NEXTHDR_DEST:
                        if (**nexthdr == NEXTHDR_ROUTING)
                                found_rhdr = 1;
                        if (**nexthdr == NEXTHDR_DEST && found_rhdr)
                                return offset;
                        offset += ipv6_optlen(exthdr);
                        *nexthdr = &exthdr->nexthdr;
                        exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
                        break;
                default:
                        return offset;
                }
        }

        return offset;
}

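/*
 *	Fragment an oversized packet. The fast path reuses an existing
 *	frag_list when its geometry already matches the MTU; otherwise the
 *	slow path copies the payload into freshly allocated fragments.
 */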
static int ip6_fragment(struct sk_buff **pskb, int (*output)(struct sk_buff**))
{
        struct net_device *dev;
        struct sk_buff *frag, *skb = *pskb;
        struct rt6_info *rt = (struct rt6_info*)skb->dst;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        u32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        dev = rt->u.dst.dev;
        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

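        /* Payload available per fragment: path MTU minus the unfragmentable
         * part (hlen) and the fragment header itself. For example, with a
         * 1500 byte path MTU and no extension headers (hlen == 40), each
         * fragment can carry 1500 - 40 - 8 = 1452 bytes of payload.
         */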
        mtu = dst_pmtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

        if (skb_shinfo(skb)->frag_list) {
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Correct socket ownership. */
                        if (frag->sk == NULL)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                /* BUILD HEADER */

                tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                *prevhdr = NEXTHDR_FRAGMENT;
                memcpy(tmp_hdr, skb->nh.raw, hlen);
                __skb_pull(skb, hlen);
                fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
                skb->nh.raw = __skb_push(skb, hlen);
                memcpy(skb->nh.raw, tmp_hdr, hlen);

                ipv6_select_ident(skb, fh);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->h.raw = frag->data;
                                fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, tmp_hdr, hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(&skb);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (tmp_hdr)
                        kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *	Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;

        /*
         *	Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *	Allocate buffer.
                 */

                if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *	Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                frag->nh.raw = frag->data;
                fh = (struct frag_hdr*)(frag->data + hlen);
                frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

                /*
                 *	Charge the memory for the fragment to any owner
                 *	it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *	Copy the packet header into the new buffer.
                 */
                memcpy(frag->nh.raw, skb->data, hlen);

                /*
                 *	Build fragment header. Select an identification on
                 *	the first fragment and reuse it for all the
                 *	following ones.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(skb, fh);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *	Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, frag->h.raw, len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *	Put this fragment into the sending queue.
                 */

                IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                err = output(&frag);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

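/*
 *	Find the outgoing route for a flow, reusing the socket's cached
 *	destination when it is still valid, and pick a source address if
 *	the caller did not supply one.
 */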
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        int err = 0;

        *dst = NULL;
        if (sk) {
                struct ipv6_pinfo *np = inet6_sk(sk);

                *dst = __sk_dst_check(sk, np->dst_cookie);
                if (*dst) {
                        struct rt6_info *rt = (struct rt6_info*)*dst;

                        /* Yes, checking route validity in the not connected
                           case is not very simple. Take into account that
                           we do not support routing by source, TOS, and
                           MSG_DONTROUTE.		--ANK (980726)

                           1. If the route was a host route, check that the
                              cached destination is current.
                              If it is a network route, we still may check
                              its validity using the saved pointer to the
                              last used address: daddr_cache.
                              We do not want to save the whole address now
                              (because the main consumer of this service is
                              TCP, which does not have this problem), so
                              the last trick works only on connected sockets.
                           2. oif also should be the same.
                         */

                        if (((rt->rt6i_dst.plen != 128 ||
                              ipv6_addr_cmp(&fl->fl6_dst, &rt->rt6i_dst.addr))
                             && (np->daddr_cache == NULL ||
                                 ipv6_addr_cmp(&fl->fl6_dst, np->daddr_cache)))
                            || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
                                *dst = NULL;
                        } else
                                dst_hold(*dst);
                }
        }

        if (*dst == NULL)
                *dst = ip6_route_output(sk, fl);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl->fl6_src)) {
                err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

                if (err) {
#if IP6_DEBUG >= 2
                        printk(KERN_DEBUG "ip6_dst_lookup: "
                               "no available source address\n");
#endif
                        goto out_err_release;
                }
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;
        return err;
}

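/*
 *	Append data to the socket's write queue, growing the last skb or
 *	allocating new ones whose geometry matches the MTU, so that
 *	ip6_fragment()'s fast path can later split the queued data
 *	without copying.
 */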
int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, int length, int transhdrlen,
                    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl,
                    struct rt6_info *rt, unsigned int flags)
{
        struct inet_opt *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (np->cork.opt == NULL) {
                                np->cork.opt = kmalloc(opt->tot_len,
                                                       sk->sk_allocation);
                                if (unlikely(np->cork.opt == NULL))
                                        return -ENOBUFS;
                        } else if (np->cork.opt->tot_len < opt->tot_len) {
                                printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
                                return -EINVAL;
                        }
                        memcpy(np->cork.opt, opt, opt->tot_len);
                        inet->cork.flags |= IPCORK_OPT;
                        /* need source address above miyazawa */
                }
                dst_hold(&rt->u.dst);
                np->cork.rt = rt;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
                inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                rt = np->cork.rt;
                fl = &inet->cork.fl;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

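        /* maxfraglen is the largest length a queued skb may reach so that,
         * once the 8-byte fragment header is added, the packet fits the MTU
         * with an 8-byte aligned fragmentable part. E.g. mtu == 1500,
         * fragheaderlen == 40: ((1500 - 40) & ~7) + 40 - 8 == 1488.
         */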
        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        inet->cork.length += length;

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = mtu - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;

                        /* There's no room in the current skb */
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;

                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /*
                         * The last fragment gets additional space at tail.
                         * Note: we overallocate on fragments with MSG_MORE
                         * because we have no idea if we're the last one.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         *	Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation */
                        skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

                        /*
                         *	Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                skb_trim(skb_prev, maxfraglen);
                        }
                        copy = datalen - transhdrlen - fraggap;
                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }
        return 0;
error:
        inet->cork.length -= length;
        IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

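/*
 *	Merge the queued skbs into one packet (tail skbs become the
 *	frag_list), prepend the IPv6 header and send the result via
 *	NF_IP6_LOCAL_OUT.
 */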
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct inet_opt *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
        struct rt6_info *rt = np->cork.rt;
        struct flowi *fl = &inet->cork.fl;
        unsigned char proto = fl->proto;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
#endif
        }

        ipv6_addr_copy(final_dst, &fl->fl6_dst);
        __skb_pull(skb, skb->h.raw - skb->nh.raw);
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));

        *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);

        if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
                hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
        else
                hdr->payload_len = 0;
        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, final_dst);

        skb->dst = dst_clone(&rt->u.dst);
        IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = inet->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
        return err;
error:
        goto out;
}

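/*
 *	Throw away everything queued by ip6_append_data() and reset the
 *	cork state.
 */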
void ip6_flush_pending_frames(struct sock *sk)
{
        struct inet_opt *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        inet->cork.flags &= ~IPCORK_OPT;

        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}