/* linux-2.6 / net/ipv4/ip_output.c — VServer 1.9.2 (patch-2.6.8.1-vs1.9.2.diff) */
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *      See ip_input.c for original log
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case a packet is not accepted by
 *                                      output firewall rules)
 *              Mike McLagan    :       Routing by source
 *              Alexey Kuznetsov:       use new route cache
 *              Andi Kleen:             Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *              Vitaly E. Lavrov:       Transparent proxy revived after year coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit path
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
 *                                      silently drop skb instead of failing with -EPERM.
 *              Detlev Wengorz  :       Copy protocol for fragments.
 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
 *                                      datagrams.
 *              Hirokazu Takahashi:     sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>

/*
 *      Shall we try to damage output packets if routing dev changes?
 */

int sysctl_ip_dynaddr;
int sysctl_ip_default_ttl = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);

#ifdef CONFIG_NETFILTER_DEBUG
        nf_debug_ip_loopback_xmit(newskb);
#endif
        netif_rx(newskb);
        return 0;
}

static inline int ip_select_ttl(struct inet_opt *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = dst_metric(dst, RTAX_HOPLIMIT);
        return ttl;
}

/*
 *      Add an IP header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          u32 saddr, u32 daddr, struct ip_options *opt)
{
        struct inet_opt *inet = inet_sk(sk);
        struct rtable *rt = (struct rtable *)skb->dst;
        struct iphdr *iph;

        /* Build the IP header. */
        if (opt)
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
        else
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = inet->tos;
        if (ip_dont_fragment(sk, &rt->u.dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        iph->tot_len  = htons(skb->len);
        ip_select_ident(iph, &rt->u.dst, sk);
        skb->nh.iph   = iph;

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        /* Send it out. */
        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);
}

static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;
        struct net_device *dev = dst->dev;
        int hh_len = LL_RESERVED_SPACE(dev);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

#ifdef CONFIG_NETFILTER_DEBUG
        nf_debug_ip_finish_output2(skb);
#endif /*CONFIG_NETFILTER_DEBUG*/

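        /* Fast path: if the neighbour's hardware header is cached in
         * dst->hh, copy it in front of the IP header under hh_lock and
         * hand the frame straight to hh_output(); otherwise fall back
         * to resolving via the neighbour entry.
         */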
        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}

int ip_finish_output(struct sk_buff *skb)
{
        struct net_device *dev = skb->dst->dev;

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
                       ip_finish_output2);
}

int ip_mc_output(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct sock *sk = skb->sk;
        struct rtable *rt = (struct rtable*)skb->dst;
        struct net_device *dev = rt->u.dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */
        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that have returned after forwarding; ip_mr_input will
                   drop them in any case.
                   Note that local frames are looped back so they can be
                   delivered to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
                ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host */

                if (skb->nh.iph->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                newskb->dev, ip_dev_loopback_xmit);
        }

        if (skb->len > dst_pmtu(&rt->u.dst) || skb_shinfo(skb)->frag_list)
                return ip_fragment(skb, ip_finish_output);
        else
                return ip_finish_output(skb);
}

int ip_output(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;

        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

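        /* Oversized skbs marked for TSO are passed through intact: the
         * device segments them in hardware, so software fragmentation is
         * needed only for non-TSO skbs that exceed the path MTU or that
         * already carry a frag_list.
         */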
        if ((skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list) &&
            !skb_shinfo(skb)->tso_size)
                return ip_fragment(skb, ip_finish_output);
        else
                return ip_finish_output(skb);
}

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
        struct sock *sk = skb->sk;
        struct inet_opt *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;
        struct rtable *rt;
        struct iphdr *iph;
        u32 mtu;

        /* Skip all of this if the packet is already routed,
         * e.g. by something like SCTP.
         */
        rt = (struct rtable *) skb->dst;
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                u32 daddr;

                /* Use correct destination address if we have options. */
                daddr = inet->daddr;
                if (opt && opt->srr)
                        daddr = opt->faddr;

                {
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .nl_u = { .ip4_u =
                                                      { .daddr = daddr,
                                                        .saddr = inet->saddr,
                                                        .tos = RT_CONN_FLAGS(sk) } },
                                            .proto = sk->sk_protocol,
                                            .uli_u = { .ports =
                                                       { .sport = inet->sport,
                                                         .dport = inet->dport } } };

                        /* If this fails, the retransmission mechanism of the
                         * transport layer will keep trying until the route
                         * appears or the connection times out.
                         */
                        if (ip_route_output_flow(&rt, &fl, sk, 0))
                                goto no_route;
                }
                __sk_dst_set(sk, &rt->u.dst);
                tcp_v4_setup_caps(sk, &rt->u.dst);
        }
        skb->dst = dst_clone(&rt->u.dst);

packet_routed:
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        iph->tot_len = htons(skb->len);
        if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = rt->rt_src;
        iph->daddr    = rt->rt_dst;
        skb->nh.iph   = iph;
        /* The transport layer sets skb->h.foo itself. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->daddr, rt, 0);
        }

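        /* If the payload exceeds the path MTU and the route supports TSO,
         * size the hardware segments here: tso_size is the payload per
         * segment, and tso_segs holds the number of additional segments
         * beyond the first, which ip_select_ident_more() below uses to
         * reserve a contiguous range of IP IDs.
         */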
        mtu = dst_pmtu(&rt->u.dst);
        if (skb->len > mtu && (sk->sk_route_caps & NETIF_F_TSO)) {
                unsigned int hlen;

                /* Hack zone: all this must be done by TCP. */
                hlen = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
                skb_shinfo(skb)->tso_size = mtu - hlen;
                skb_shinfo(skb)->tso_segs =
                        (skb->len - hlen + skb_shinfo(skb)->tso_size - 1)/
                                skb_shinfo(skb)->tso_size - 1;
        }

        ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);

        /* Add an IP checksum. */
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);

no_route:
        IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}


static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        to->security = from->security;
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        to->nfcache = from->nfcache;
        /* Connection association is same as pre-frag packet */
        nf_conntrack_put(to->nfct);
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#ifdef CONFIG_NETFILTER_DEBUG
        to->nf_debug = from->nf_debug;
#endif
#endif
}

/*
 *      This IP datagram is too large to be sent in one piece.  Break it up
 *      into smaller pieces (each of size equal to the IP header plus a block
 *      of the data of the original IP data part) so that each piece fits in
 *      a single device frame, and queue such frames for sending.
 */

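/* Two strategies: if the skb already carries a suitably shaped frag_list
 * (e.g. one built up by ip_append_data()), each list member is turned into
 * a fragment in place; otherwise the slow path below allocates a fresh skb
 * per fragment and copies the data into it.
 */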
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
        struct iphdr *iph;
        int raw = 0;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs;
        int offset;
        int not_last_frag;
        struct rtable *rt = (struct rtable*)skb->dst;
        int err = 0;

        dev = rt->u.dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        iph = skb->nh.iph;

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(dst_pmtu(&rt->u.dst)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /*
         *      Setup starting values.
         */

        hlen = iph->ihl * 4;
        mtu = dst_pmtu(&rt->u.dst) - hlen;      /* Size of data space */

        /* When a frag_list is given, use it. First, check its validity:
         * some transformers could create a wrong frag_list or break an
         * existing one; that is not prohibited. In that case fall back
         * to copying.
         *
         * LATER: this step can be merged into the real generation of
         * fragments; we can switch to copying when we see the first bad
         * fragment.
         */
        if (skb_shinfo(skb)->frag_list) {
                struct sk_buff *frag;
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                            goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;
                }

                /* Everything is OK. Generate! */

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off |= htons(IP_MF);
                ip_send_check(iph);

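                /* The head skb becomes the first fragment; each queued frag
                 * gets a copy of the IP header pushed into its headroom and
                 * its offset/MF fields filled in while the previous fragment
                 * goes out.
                 */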
                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one is sent. */
                        if (frag) {
                                frag->h.raw = frag->data;
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, iph, hlen);
                                iph = frag->nh.iph;
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

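        /* Slow path: allocate a fresh skb for every fragment, copy the IP
         * header and an 8-byte-aligned block of payload into it, and hand
         * each one to the supplied output callback.
         */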
slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = raw + hlen;               /* Where to start from */

#ifdef CONFIG_BRIDGE_NETFILTER
        /* for bridged IP traffic encapsulated inside e.g. a vlan header,
         * we need to make room for the encapsulating header */
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
        mtu -= nf_bridge_pad(skb);
#else
        ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
#endif
        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb2->nh.raw = skb2->data;
                skb2->h.raw = skb2->data + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                memcpy(skb2->nh.raw, skb->data, hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
                        BUG();
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = skb2->nh.iph;
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and do it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC: if we are fragmenting a fragment that's not
                 *                the last fragment, then keep the MF bit set
                 *                on each fragment
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

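/* Generic getfrag callback for ip_append_data(): copies user iovec data
 * into the skb and, unless the hardware will checksum (ip_summed ==
 * CHECKSUM_HW), folds a partial checksum of the copied block into
 * skb->csum; 'odd' is the byte offset of the block within the packet so
 * the 16-bit checksum halves stay aligned.
 */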
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_HW) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                unsigned int csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}

static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        unsigned int csum;
        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}

/*
 *      ip_append_data() and ip_append_page() can make one large IP datagram
 *      from many pieces of data. Each piece is held on the socket
 *      until ip_push_pending_frames() is called. Each piece can be a page
 *      or non-page data.
 *
 *      Transport protocols other than UDP - e.g. raw sockets - can
 *      potentially use this interface as well.
 *
 *      LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable *rt,
                   unsigned int flags)
{
        struct inet_opt *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = NULL;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking.
                 */
                opt = ipc->opt;
                if (opt) {
                        if (inet->cork.opt == NULL) {
                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                                if (unlikely(inet->cork.opt == NULL))
                                        return -ENOBUFS;
                        }
                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
                        inet->cork.flags |= IPCORK_OPT;
                        inet->cork.addr = ipc->addr;
                }
                dst_hold(&rt->u.dst);
                inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
                inet->cork.rt = rt;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                if ((exthdrlen = rt->u.dst.header_len) != 0) {
                        length += exthdrlen;
                        transhdrlen += exthdrlen;
                }
        } else {
                rt = inet->cork.rt;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = inet->cork.opt;

                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }
        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

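        /* The fragment payload must be a multiple of 8 bytes (the unit of
         * the IP fragment offset field), so round the usable space per
         * fragment down to a multiple of 8 before adding the header length
         * back in.
         */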
        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment and we wish
         * that it will not be fragmented later.
         */
        if (transhdrlen &&
            length + fragheaderlen <= maxfraglen &&
            rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
            !exthdrlen)
                csummode = CHECKSUM_HW;

        inet->cork.length += length;

        /* So, what's going on in the loop below?
         *
         * We use the calculated fragment length to generate a chain of skbs;
         * each segment is an IP fragment ready for sending to the network
         * after an appropriate IP header is added.
         *
         * Known mistake:
         *
         *    If mtu-fragheaderlen is not 0 modulo 8, we generate an
         *    additional small fragment of length (mtu-fragheaderlen)%8,
         *    even though it is not necessary. Not a big bug, but needs
         *    a fix.
         */

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                if ((copy = maxfraglen - skb->len) <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int alloclen;
                        BUG_TRAP(copy == 0);

alloc_new_skb:
                        datalen = maxfraglen - fragheaderlen;
                        if (datalen > length)
                                datalen = length;

                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = maxfraglen;
                        else
                                alloclen = datalen + fragheaderlen;

                        /* The last fragment gets additional space at the tail.
                         * Note, with MSG_MORE we overallocate on fragments,
                         * because we have no idea what fragment will be
                         * the last.
                         */
                        if (datalen == length)
                                alloclen += rt->u.dst.trailer_len;

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        copy = datalen - transhdrlen;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        inet->cork.length -= length;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_opt *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen;

        if (inet->hdrincl)
                return -EPERM;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        rt = inet->cork.rt;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->u.dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        inet->cork.length += size;

        while (size > 0) {
                int i;
                if ((len = maxfraglen - skb->len) <= 0) {
                        char *data;
                        struct iphdr *iph;
                        BUG_TRAP(len == 0);

                        skb = sock_wmalloc(sk, fragheaderlen + hh_len + 15, 1,
                                           sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fragheaderlen);
                        skb->nh.iph = iph = (struct iphdr *)data;
                        data += fragheaderlen;
                        skb->h.raw = data;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

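                /* Reference the caller's page directly: extend the last page
                 * fragment when it is contiguous (skb_can_coalesce), else
                 * attach the page as a new fragment, taking a reference on it.
                 */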
                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        unsigned int csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                offset += len;
                size -= len;
        }
        return 0;

error:
        inet->cork.length -= size;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

/*
 *      Combine all pending IP fragments on the socket into one IP datagram
 *      and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_opt *inet = inet_sk(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = inet->cork.rt;
        struct iphdr *iph;
        int df = 0;
        __u8 ttl;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

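        /* Splice all queued skbs onto the first skb's frag_list so the whole
         * pending queue is handed down as a single datagram; each spliced
         * skb is pulled up to the transport header and detached from the
         * socket's write-memory accounting.
         */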
        /* Move skb->data to the IP header, past any extension header. */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
         * allow fragmenting the frame generated here. No matter how
         * transforms change the size of the packet, it will come out.
         */
        if (inet->pmtudisc != IP_PMTUDISC_DO)
                skb->local_df = 1;

        /* The DF bit is set when we want to see DF on outgoing frames.
         * If local_df is set too, we still allow fragmenting this frame
         * locally. */
        if (inet->pmtudisc == IP_PMTUDISC_DO ||
            (!skb_shinfo(skb)->frag_list && ip_dont_fragment(sk, &rt->u.dst)))
                df = htons(IP_DF);

        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (rt->rt_type == RTN_MULTICAST)
                ttl = inet->mc_ttl;
        else
                ttl = ip_select_ttl(inet, &rt->u.dst);

        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        if (opt) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
        }
        iph->tos = inet->tos;
        iph->tot_len = htons(skb->len);
        iph->frag_off = df;
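        /* With DF set the datagram will never be fragmented, so a simple
         * per-socket ID counter suffices; otherwise pick the ID via the
         * peer-based generator so fragments stay distinguishable.
         */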
        if (!df) {
                __ip_select_ident(iph, &rt->u.dst, 0);
        } else {
                iph->id = htons(inet->id++);
        }
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;
        ip_send_check(iph);

        skb->priority = sk->sk_priority;
        skb->dst = dst_clone(&rt->u.dst);

        /* Netfilter gets the whole, not yet fragmented skb. */
        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
                      skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = inet->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        if (inet->cork.opt) {
                kfree(inet->cork.opt);
                inet->cork.opt = NULL;
        }
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
        return err;

error:
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        goto out;
}

/*
 *      Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
        struct inet_opt *inet = inet_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
                kfree_skb(skb);

        inet->cork.flags &= ~IPCORK_OPT;
        if (inet->cork.opt) {
                kfree(inet->cork.opt);
                inet->cork.opt = NULL;
        }
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
}


/*
 *      Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        unsigned int csum;

        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
        return 0;
}

/*
 *      Generic function to send a packet as a reply to another packet.
 *      Used to send TCP resets so far. ICMP should use this function too.
 *
 *      Should run single threaded per socket because it uses the sock
 *      structure to pass arguments.
 *
 *      LATER: switch from ip_build_xmit to ip_append_*
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct inet_opt *inet = inet_sk(sk);
        struct {
                struct ip_options       opt;
                char                    data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        u32 daddr;
        struct rtable *rt = (struct rtable*)skb->dst;

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = NULL;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (ipc.opt->srr)
                        daddr = replyopts.opt.faddr;
        }

        {
                struct flowi fl = { .nl_u = { .ip4_u =
                                              { .daddr = daddr,
                                                .saddr = rt->rt_spec_dst,
                                                .tos = RT_TOS(skb->nh.iph->tos) } },
                                    /* Not quite clean, but right. */
                                    .uli_u = { .ports =
                                               { .sport = skb->h.th->dest,
                                                 .dport = skb->h.th->source } },
                                    .proto = sk->sk_protocol };
                if (ip_route_output_key(&rt, &fl))
                        return;
        }

        /* And let IP do all the hard work.

           This chunk is not re-entrant, hence the spinlock.
           Note that it relies on the fact that this function is called
           with BHs locally disabled and that sk cannot already be
           spinlocked.
         */
        bh_lock_sock(sk);
        inet->tos = skb->nh.iph->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = skb->nh.iph->protocol;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk);
        }

        bh_unlock_sock(sk);

        ip_rt_put(rt);
}

/*
 *      IP protocol layer initialiser
 */

static struct packet_type ip_packet_type = {
        .type = __constant_htons(ETH_P_IP),
        .func = ip_rcv,
};

/*
 *      IP registers the packet type and then calls the subprotocol initialisers
 */

void __init ip_init(void)
{
        dev_add_pack(&ip_packet_type);

        ip_rt_init();
        inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_finish_output);
EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);

#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_ip_default_ttl);
#endif