net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
  38  *                                      and more readability.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/sched.h>
  53 #include <linux/mm.h>
  54 #include <linux/string.h>
  55 #include <linux/errno.h>
  56 #include <linux/config.h>
  57
  58 #include <linux/socket.h>
  59 #include <linux/sockios.h>
  60 #include <linux/in.h>
  61 #include <linux/inet.h>
  62 #include <linux/netdevice.h>
  63 #include <linux/etherdevice.h>
  64 #include <linux/proc_fs.h>
  65 #include <linux/stat.h>
  66 #include <linux/init.h>
  67
  68 #include <net/snmp.h>
  69 #include <net/ip.h>
  70 #include <net/protocol.h>
  71 #include <net/route.h>
  72 #include <net/tcp.h>
  73 #include <net/udp.h>
  74 #include <linux/skbuff.h>
  75 #include <net/sock.h>
  76 #include <net/arp.h>
  77 #include <net/icmp.h>
  78 #include <net/raw.h>
  79 #include <net/checksum.h>
  80 #include <net/inetpeer.h>
  81 #include <net/checksum.h>
  82 #include <linux/igmp.h>
  83 #include <linux/netfilter_ipv4.h>
  84 #include <linux/netfilter_bridge.h>
  85 #include <linux/mroute.h>
  86 #include <linux/netlink.h>
  87
/*
 *      Shall we try to damage output packets if routing dev changes?
 *
 *      NOTE(review): sysctl_ip_dynaddr is not read anywhere in this part of
 *      the file; presumably it is the switch behind the question above and
 *      is consumed elsewhere -- confirm against its users.
 */

int sysctl_ip_dynaddr;
/*
 *      Initialised to IPDEFTTL.  NOTE(review): not referenced in this part of
 *      the file (ip_select_ttl() falls back to the route's RTAX_HOPLIMIT
 *      metric instead); presumably the system-wide default TTL -- confirm.
 */
int sysctl_ip_default_ttl = IPDEFTTL;
  94
  95 /* Generate a checksum for an outgoing IP datagram. */
  96 __inline__ void ip_send_check(struct iphdr *iph)
  97 {
  98         iph->check = 0;
  99         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 100 }
 101
 102 /* dev_loopback_xmit for use with netfilter. */
 103 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 104 {
 105         newskb->mac.raw = newskb->data;
 106         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 107         newskb->pkt_type = PACKET_LOOPBACK;
 108         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 109         BUG_TRAP(newskb->dst);
 110
 111 #ifdef CONFIG_NETFILTER_DEBUG
 112         nf_debug_ip_loopback_xmit(newskb);
 113 #endif
 114         netif_rx(newskb);
 115         return 0;
 116 }
 117
 118 static inline int ip_select_ttl(struct inet_opt *inet, struct dst_entry *dst)
 119 {
 120         int ttl = inet->uc_ttl;
 121
 122         if (ttl < 0)
 123                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 124         return ttl;
 125 }
 126
 127 /*
 128  *              Add an ip header to a skbuff and send it out.
 129  *
 130  */
 131 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 132                           u32 saddr, u32 daddr, struct ip_options *opt)
 133 {
 134         struct inet_opt *inet = inet_sk(sk);
 135         struct rtable *rt = (struct rtable *)skb->dst;
 136         struct iphdr *iph;
 137
 138         /* Build the IP header. */
 139         if (opt)
 140                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
 141         else
 142                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
 143
 144         iph->version  = 4;
 145         iph->ihl      = 5;
 146         iph->tos      = inet->tos;
 147         if (ip_dont_fragment(sk, &rt->u.dst))
 148                 iph->frag_off = htons(IP_DF);
 149         else
 150                 iph->frag_off = 0;
 151         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 152         iph->daddr    = rt->rt_dst;
 153         iph->saddr    = rt->rt_src;
 154         iph->protocol = sk->sk_protocol;
 155         iph->tot_len  = htons(skb->len);
 156         ip_select_ident(iph, &rt->u.dst, sk);
 157         skb->nh.iph   = iph;
 158
 159         if (opt && opt->optlen) {
 160                 iph->ihl += opt->optlen>>2;
 161                 ip_options_build(skb, opt, daddr, rt, 0);
 162         }
 163         ip_send_check(iph);
 164
 165         skb->priority = sk->sk_priority;
 166
 167         /* Send it out. */
 168         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 169                        dst_output);
 170 }
 171
 172 static inline int ip_finish_output2(struct sk_buff *skb)
 173 {
 174         struct dst_entry *dst = skb->dst;
 175         struct hh_cache *hh = dst->hh;
 176         struct net_device *dev = dst->dev;
 177         int hh_len = LL_RESERVED_SPACE(dev);
 178
 179         /* Be paranoid, rather than too clever. */
 180         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 181                 struct sk_buff *skb2;
 182
 183                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 184                 if (skb2 == NULL) {
 185                         kfree_skb(skb);
 186                         return -ENOMEM;
 187                 }
 188                 if (skb->sk)
 189                         skb_set_owner_w(skb2, skb->sk);
 190                 kfree_skb(skb);
 191                 skb = skb2;
 192         }
 193
 194 #ifdef CONFIG_NETFILTER_DEBUG
 195         nf_debug_ip_finish_output2(skb);
 196 #endif /*CONFIG_NETFILTER_DEBUG*/
 197
 198         if (hh) {
 199                 int hh_alen;
 200
 201                 read_lock_bh(&hh->hh_lock);
 202                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
 203                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
 204                 read_unlock_bh(&hh->hh_lock);
 205                 skb_push(skb, hh->hh_len);
 206                 return hh->hh_output(skb);
 207         } else if (dst->neighbour)
 208                 return dst->neighbour->output(skb);
 209
 210         if (net_ratelimit())
 211                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 212         kfree_skb(skb);
 213         return -EINVAL;
 214 }
 215
 216 int ip_finish_output(struct sk_buff *skb)
 217 {
 218         struct net_device *dev = skb->dst->dev;
 219
 220         skb->dev = dev;
 221         skb->protocol = htons(ETH_P_IP);
 222
 223         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 224                        ip_finish_output2);
 225 }
 226
 227 int ip_mc_output(struct sk_buff *skb)
 228 {
 229         struct sock *sk = skb->sk;
 230         struct rtable *rt = (struct rtable*)skb->dst;
 231         struct net_device *dev = rt->u.dst.dev;
 232
 233         /*
 234          *      If the indicated interface is up and running, send the packet.
 235          */
 236         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 237
 238         skb->dev = dev;
 239         skb->protocol = htons(ETH_P_IP);
 240
 241         /*
 242          *      Multicasts are looped back for other local users
 243          */
 244
 245         if (rt->rt_flags&RTCF_MULTICAST) {
 246                 if ((!sk || inet_sk(sk)->mc_loop)
 247 #ifdef CONFIG_IP_MROUTE
 248                 /* Small optimization: do not loopback not local frames,
 249                    which returned after forwarding; they will be  dropped
 250                    by ip_mr_input in any case.
 251                    Note, that local frames are looped back to be delivered
 252                    to local recipients.
 253
 254                    This check is duplicated in ip_mr_input at the moment.
 255                  */
 256                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 257 #endif
 258                 ) {
 259                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 260                         if (newskb)
 261                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 262                                         newskb->dev,
 263                                         ip_dev_loopback_xmit);
 264                 }
 265
 266                 /* Multicasts with ttl 0 must not go beyond the host */
 267
 268                 if (skb->nh.iph->ttl == 0) {
 269                         kfree_skb(skb);
 270                         return 0;
 271                 }
 272         }
 273
 274         if (rt->rt_flags&RTCF_BROADCAST) {
 275                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 276                 if (newskb)
 277                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 278                                 newskb->dev, ip_dev_loopback_xmit);
 279         }
 280
 281         if (skb->len > dst_pmtu(&rt->u.dst))
 282                 return ip_fragment(skb, ip_finish_output);
 283         else
 284                 return ip_finish_output(skb);
 285 }
 286
 287 int ip_output(struct sk_buff *skb)
 288 {
 289         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 290
 291         if (skb->len > dst_pmtu(skb->dst) && !skb_shinfo(skb)->tso_size)
 292                 return ip_fragment(skb, ip_finish_output);
 293         else
 294                 return ip_finish_output(skb);
 295 }
 296
 297 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 298 {
 299         struct sock *sk = skb->sk;
 300         struct inet_opt *inet = inet_sk(sk);
 301         struct ip_options *opt = inet->opt;
 302         struct rtable *rt;
 303         struct iphdr *iph;
 304
 305         /* Skip all of this if the packet is already routed,
 306          * f.e. by something like SCTP.
 307          */
 308         rt = (struct rtable *) skb->dst;
 309         if (rt != NULL)
 310                 goto packet_routed;
 311
 312         /* Make sure we can route this packet. */
 313         rt = (struct rtable *)__sk_dst_check(sk, 0);
 314         if (rt == NULL) {
 315                 u32 daddr;
 316
 317                 /* Use correct destination address if we have options. */
 318                 daddr = inet->daddr;
 319                 if(opt && opt->srr)
 320                         daddr = opt->faddr;
 321
 322                 {
 323                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 324                                             .nl_u = { .ip4_u =
 325                                                       { .daddr = daddr,
 326                                                         .saddr = inet->saddr,
 327                                                         .tos = RT_CONN_FLAGS(sk) } },
 328                                             .proto = sk->sk_protocol,
 329                                             .uli_u = { .ports =
 330                                                        { .sport = inet->sport,
 331                                                          .dport = inet->dport } } };
 332
 333                         /* If this fails, retransmit mechanism of transport layer will
 334                          * keep trying until route appears or the connection times
 335                          * itself out.
 336                          */
 337                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 338                                 goto no_route;
 339                 }
 340                 __sk_dst_set(sk, &rt->u.dst);
 341                 tcp_v4_setup_caps(sk, &rt->u.dst);
 342         }
 343         skb->dst = dst_clone(&rt->u.dst);
 344
 345 packet_routed:
 346         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 347                 goto no_route;
 348
 349         /* OK, we know where to send it, allocate and build IP header. */
 350         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 351         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 352         iph->tot_len = htons(skb->len);
 353         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 354                 iph->frag_off = htons(IP_DF);
 355         else
 356                 iph->frag_off = 0;
 357         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 358         iph->protocol = sk->sk_protocol;
 359         iph->saddr    = rt->rt_src;
 360         iph->daddr    = rt->rt_dst;
 361         skb->nh.iph   = iph;
 362         /* Transport layer set skb->h.foo itself. */
 363
 364         if (opt && opt->optlen) {
 365                 iph->ihl += opt->optlen >> 2;
 366                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 367         }
 368
 369         ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
 370
 371         /* Add an IP checksum. */
 372         ip_send_check(iph);
 373
 374         skb->priority = sk->sk_priority;
 375
 376         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 377                        dst_output);
 378
 379 no_route:
 380         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 381         kfree_skb(skb);
 382         return -EHOSTUNREACH;
 383 }
 384
 385
 386 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 387 {
 388         to->pkt_type = from->pkt_type;
 389         to->priority = from->priority;
 390         to->protocol = from->protocol;
 391         to->security = from->security;
 392         to->dst = dst_clone(from->dst);
 393         to->dev = from->dev;
 394
 395         /* Copy the flags to each fragment. */
 396         IPCB(to)->flags = IPCB(from)->flags;
 397
 398 #ifdef CONFIG_NET_SCHED
 399         to->tc_index = from->tc_index;
 400 #endif
 401 #ifdef CONFIG_NETFILTER
 402         to->nfmark = from->nfmark;
 403         to->nfcache = from->nfcache;
 404         /* Connection association is same as pre-frag packet */
 405         nf_conntrack_put(to->nfct);
 406         to->nfct = from->nfct;
 407         nf_conntrack_get(to->nfct);
 408         to->nfctinfo = from->nfctinfo;
 409 #ifdef CONFIG_BRIDGE_NETFILTER
 410         nf_bridge_put(to->nf_bridge);
 411         to->nf_bridge = from->nf_bridge;
 412         nf_bridge_get(to->nf_bridge);
 413 #endif
 414 #ifdef CONFIG_NETFILTER_DEBUG
 415         to->nf_debug = from->nf_debug;
 416 #endif
 417 #endif
 418 }
 419
 420 /*
 421  *      This IP datagram is too large to be sent in one piece.  Break it up into
 422  *      smaller pieces (each of size equal to IP header plus
 423  *      a block of the data of the original IP data part) that will yet fit in a
 424  *      single device frame, and queue such a frame for sending.
 425  */
 426
 427 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 428 {
 429         struct iphdr *iph;
 430         int raw = 0;
 431         int ptr;
 432         struct net_device *dev;
 433         struct sk_buff *skb2;
 434         unsigned int mtu, hlen, left, len, ll_rs;
 435         int offset;
 436         int not_last_frag;
 437         struct rtable *rt = (struct rtable*)skb->dst;
 438         int err = 0;
 439
 440         dev = rt->u.dst.dev;
 441
 442         /*
 443          *      Point into the IP datagram header.
 444          */
 445
 446         iph = skb->nh.iph;
 447
 448         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 449                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 450                           htonl(dst_pmtu(&rt->u.dst)));
 451                 kfree_skb(skb);
 452                 return -EMSGSIZE;
 453         }
 454
 455         /*
 456          *      Setup starting values.
 457          */
 458
 459         hlen = iph->ihl * 4;
 460         mtu = dst_pmtu(&rt->u.dst) - hlen;      /* Size of data space */
 461
 462         /* When frag_list is given, use it. First, check its validity:
 463          * some transformers could create wrong frag_list or break existing
 464          * one, it is not prohibited. In this case fall back to copying.
 465          *
 466          * LATER: this step can be merged to real generation of fragments,
 467          * we can switch to copy when see the first bad fragment.
 468          */
 469         if (skb_shinfo(skb)->frag_list) {
 470                 struct sk_buff *frag;
 471                 int first_len = skb_pagelen(skb);
 472
 473                 if (first_len - hlen > mtu ||
 474                     ((first_len - hlen) & 7) ||
 475                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 476                     skb_cloned(skb))
 477                         goto slow_path;
 478
 479                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 480                         /* Correct geometry. */
 481                         if (frag->len > mtu ||
 482                             ((frag->len & 7) && frag->next) ||
 483                             skb_headroom(frag) < hlen)
 484                             goto slow_path;
 485
 486                         /* Partially cloned skb? */
 487                         if (skb_shared(frag))
 488                                 goto slow_path;
 489                 }
 490
 491                 /* Everything is OK. Generate! */
 492
 493                 err = 0;
 494                 offset = 0;
 495                 frag = skb_shinfo(skb)->frag_list;
 496                 skb_shinfo(skb)->frag_list = NULL;
 497                 skb->data_len = first_len - skb_headlen(skb);
 498                 skb->len = first_len;
 499                 iph->tot_len = htons(first_len);
 500                 iph->frag_off |= htons(IP_MF);
 501                 ip_send_check(iph);
 502
 503                 for (;;) {
 504                         /* Prepare header of the next frame,
 505                          * before previous one went down. */
 506                         if (frag) {
 507                                 frag->h.raw = frag->data;
 508                                 frag->nh.raw = __skb_push(frag, hlen);
 509                                 memcpy(frag->nh.raw, iph, hlen);
 510                                 iph = frag->nh.iph;
 511                                 iph->tot_len = htons(frag->len);
 512                                 ip_copy_metadata(frag, skb);
 513                                 if (offset == 0)
 514                                         ip_options_fragment(frag);
 515                                 offset += skb->len - hlen;
 516                                 iph->frag_off = htons(offset>>3);
 517                                 if (frag->next != NULL)
 518                                         iph->frag_off |= htons(IP_MF);
 519                                 /* Ready, complete checksum */
 520                                 ip_send_check(iph);
 521                         }
 522
 523                         err = output(skb);
 524
 525                         if (err || !frag)
 526                                 break;
 527
 528                         skb = frag;
 529                         frag = skb->next;
 530                         skb->next = NULL;
 531                 }
 532
 533                 if (err == 0) {
 534                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 535                         return 0;
 536                 }
 537
 538                 while (frag) {
 539                         skb = frag->next;
 540                         kfree_skb(frag);
 541                         frag = skb;
 542                 }
 543                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 544                 return err;
 545         }
 546
 547 slow_path:
 548         left = skb->len - hlen;         /* Space per frame */
 549         ptr = raw + hlen;               /* Where to start from */
 550
 551 #ifdef CONFIG_BRIDGE_NETFILTER
 552         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 553          * we need to make room for the encapsulating header */
 554         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
 555         mtu -= nf_bridge_pad(skb);
 556 #else
 557         ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
 558 #endif
 559         /*
 560          *      Fragment the datagram.
 561          */
 562
 563         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 564         not_last_frag = iph->frag_off & htons(IP_MF);
 565
 566         /*
 567          *      Keep copying data until we run out.
 568          */
 569
 570         while(left > 0) {
 571                 len = left;
 572                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 573                 if (len > mtu)
 574                         len = mtu;
 575                 /* IF: we are not sending upto and including the packet end
 576                    then align the next start on an eight byte boundary */
 577                 if (len < left) {
 578                         len &= ~7;
 579                 }
 580                 /*
 581                  *      Allocate buffer.
 582                  */
 583
 584                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 585                         NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
 586                         err = -ENOMEM;
 587                         goto fail;
 588                 }
 589
 590                 /*
 591                  *      Set up data on packet
 592                  */
 593
 594                 ip_copy_metadata(skb2, skb);
 595                 skb_reserve(skb2, ll_rs);
 596                 skb_put(skb2, len + hlen);
 597                 skb2->nh.raw = skb2->data;
 598                 skb2->h.raw = skb2->data + hlen;
 599
 600                 /*
 601                  *      Charge the memory for the fragment to any owner
 602                  *      it might possess
 603                  */
 604
 605                 if (skb->sk)
 606                         skb_set_owner_w(skb2, skb->sk);
 607
 608                 /*
 609                  *      Copy the packet header into the new buffer.
 610                  */
 611
 612                 memcpy(skb2->nh.raw, skb->data, hlen);
 613
 614                 /*
 615                  *      Copy a block of the IP datagram.
 616                  */
 617                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
 618                         BUG();
 619                 left -= len;
 620
 621                 /*
 622                  *      Fill in the new header fields.
 623                  */
 624                 iph = skb2->nh.iph;
 625                 iph->frag_off = htons((offset >> 3));
 626
 627                 /* ANK: dirty, but effective trick. Upgrade options only if
 628                  * the segment to be fragmented was THE FIRST (otherwise,
 629                  * options are already fixed) and make it ONCE
 630                  * on the initial skb, so that all the following fragments
 631                  * will inherit fixed options.
 632                  */
 633                 if (offset == 0)
 634                         ip_options_fragment(skb);
 635
 636                 /*
 637                  *      Added AC : If we are fragmenting a fragment that's not the
 638                  *                 last fragment then keep MF on each bit
 639                  */
 640                 if (left > 0 || not_last_frag)
 641                         iph->frag_off |= htons(IP_MF);
 642                 ptr += len;
 643                 offset += len;
 644
 645                 /*
 646                  *      Put this fragment into the sending queue.
 647                  */
 648
 649                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 650
 651                 iph->tot_len = htons(len + hlen);
 652
 653                 ip_send_check(iph);
 654
 655                 err = output(skb2);
 656                 if (err)
 657                         goto fail;
 658         }
 659         kfree_skb(skb);
 660         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 661         return err;
 662
 663 fail:
 664         kfree_skb(skb);
 665         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 666         return err;
 667 }
 668
 669 int
 670 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 671 {
 672         struct iovec *iov = from;
 673
 674         if (skb->ip_summed == CHECKSUM_HW) {
 675                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 676                         return -EFAULT;
 677         } else {
 678                 unsigned int csum = 0;
 679                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 680                         return -EFAULT;
 681                 skb->csum = csum_block_add(skb->csum, csum, odd);
 682         }
 683         return 0;
 684 }
 685
 686 static inline unsigned int
 687 csum_page(struct page *page, int offset, int copy)
 688 {
 689         char *kaddr;
 690         unsigned int csum;
 691         kaddr = kmap(page);
 692         csum = csum_partial(kaddr + offset, copy, 0);
 693         kunmap(page);
 694         return csum;
 695 }
 696
 697 /*
 698  *      ip_append_data() and ip_append_page() can make one large IP datagram
 699  *      from many pieces of data. Each piece is held on the socket
 700  *      until ip_push_pending_frames() is called. Each piece can be a page
 701  *      or non-page data.
 702  *
 703  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 704  *      this interface potentially.
 705  *
 706  *      LATER: length must be adjusted by pad at tail, when it is required.
 707  */
 708 int ip_append_data(struct sock *sk,
 709                    int getfrag(void *from, char *to, int offset, int len,
 710                                int odd, struct sk_buff *skb),
 711                    void *from, int length, int transhdrlen,
 712                    struct ipcm_cookie *ipc, struct rtable *rt,
 713                    unsigned int flags)
 714 {
 715         struct inet_opt *inet = inet_sk(sk);
 716         struct sk_buff *skb;
 717
 718         struct ip_options *opt = NULL;
 719         int hh_len;
 720         int exthdrlen;
 721         int mtu;
 722         int copy;
 723         int err;
 724         int offset = 0;
 725         unsigned int maxfraglen, fragheaderlen;
 726         int csummode = CHECKSUM_NONE;
 727
 728         if (flags&MSG_PROBE)
 729                 return 0;
 730
 731         if (skb_queue_empty(&sk->sk_write_queue)) {
 732                 /*
 733                  * setup for corking.
 734                  */
 735                 opt = ipc->opt;
 736                 if (opt) {
 737                         if (inet->cork.opt == NULL) {
 738                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 739                                 if (unlikely(inet->cork.opt == NULL))
 740                                         return -ENOBUFS;
 741                         }
 742                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 743                         inet->cork.flags |= IPCORK_OPT;
 744                         inet->cork.addr = ipc->addr;
 745                 }
 746                 dst_hold(&rt->u.dst);
 747                 inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
 748                 inet->cork.rt = rt;
 749                 inet->cork.length = 0;
 750                 sk->sk_sndmsg_page = NULL;
 751                 sk->sk_sndmsg_off = 0;
 752                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 753                         length += exthdrlen;
 754                         transhdrlen += exthdrlen;
 755                 }
 756         } else {
 757                 rt = inet->cork.rt;
 758                 if (inet->cork.flags & IPCORK_OPT)
 759                         opt = inet->cork.opt;
 760
 761                 transhdrlen = 0;
 762                 exthdrlen = 0;
 763                 mtu = inet->cork.fragsize;
 764         }
 765         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 766
 767         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 768         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 769
 770         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 771                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 772                 return -EMSGSIZE;
 773         }
 774
 775         /*
 776          * transhdrlen > 0 means that this is the first fragment and we wish
 777          * it won't be fragmented in the future.
 778          */
 779         if (transhdrlen &&
 780             length + fragheaderlen <= mtu &&
 781             rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
 782             !exthdrlen)
 783                 csummode = CHECKSUM_HW;
 784
 785         inet->cork.length += length;
 786
 787         /* So, what's going on in the loop below?
 788          *
 789          * We use calculated fragment length to generate chained skb,
 790          * each of segments is IP fragment ready for sending to network after
 791          * adding appropriate IP header.
 792          */
 793
 794         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 795                 goto alloc_new_skb;
 796
 797         while (length > 0) {
 798                 /* Check if the remaining data fits into current packet. */
 799                 copy = mtu - skb->len;
 800                 if (copy < length)
 801                         copy = maxfraglen - skb->len;
 802                 if (copy <= 0) {
 803                         char *data;
 804                         unsigned int datalen;
 805                         unsigned int fraglen;
 806                         unsigned int fraggap;
 807                         unsigned int alloclen;
 808                         struct sk_buff *skb_prev;
 809 alloc_new_skb:
 810                         skb_prev = skb;
 811                         if (skb_prev)
 812                                 fraggap = skb_prev->len - maxfraglen;
 813                         else
 814                                 fraggap = 0;
 815
 816                         /*
 817                          * If remaining data exceeds the mtu,
 818                          * we know we need more fragment(s).
 819                          */
 820                         datalen = length + fraggap;
 821                         if (datalen > mtu - fragheaderlen)
 822                                 datalen = maxfraglen - fragheaderlen;
 823                         fraglen = datalen + fragheaderlen;
 824
 825                         if ((flags & MSG_MORE) &&
 826                             !(rt->u.dst.dev->features&NETIF_F_SG))
 827                                 alloclen = mtu;
 828                         else
 829                                 alloclen = datalen + fragheaderlen;
 830
 831                         /* The last fragment gets additional space at tail.
 832                          * Note, with MSG_MORE we overallocate on fragments,
 833                          * because we have no idea what fragment will be
 834                          * the last.
 835                          */
 836                         if (datalen == length)
 837                                 alloclen += rt->u.dst.trailer_len;
 838
 839                         if (transhdrlen) {
 840                                 skb = sock_alloc_send_skb(sk,
 841                                                 alloclen + hh_len + 15,
 842                                                 (flags & MSG_DONTWAIT), &err);
 843                         } else {
 844                                 skb = NULL;
 845                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 846                                     2 * sk->sk_sndbuf)
 847                                         skb = sock_wmalloc(sk,
 848                                                            alloclen + hh_len + 15, 1,
 849                                                            sk->sk_allocation);
 850                                 if (unlikely(skb == NULL))
 851                                         err = -ENOBUFS;
 852                         }
 853                         if (skb == NULL)
 854                                 goto error;
 855
 856                         /*
 857                          *      Fill in the control structures
 858                          */
 859                         skb->ip_summed = csummode;
 860                         skb->csum = 0;
 861                         skb_reserve(skb, hh_len);
 862
 863                         /*
 864                          *      Find where to start putting bytes.
 865                          */
 866                         data = skb_put(skb, fraglen);
 867                         skb->nh.raw = data + exthdrlen;
 868                         data += fragheaderlen;
 869                         skb->h.raw = data + exthdrlen;
 870
 871                         if (fraggap) {
 872                                 skb->csum = skb_copy_and_csum_bits(
 873                                         skb_prev, maxfraglen,
 874                                         data + transhdrlen, fraggap, 0);
 875                                 skb_prev->csum = csum_sub(skb_prev->csum,
 876                                                           skb->csum);
 877                                 data += fraggap;
 878                                 skb_trim(skb_prev, maxfraglen);
 879                         }
 880
 881                         copy = datalen - transhdrlen - fraggap;
 882                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 883                                 err = -EFAULT;
 884                                 kfree_skb(skb);
 885                                 goto error;
 886                         }
 887
 888                         offset += copy;
 889                         length -= datalen - fraggap;
 890                         transhdrlen = 0;
 891                         exthdrlen = 0;
 892                         csummode = CHECKSUM_NONE;
 893
 894                         /*
 895                          * Put the packet on the pending queue.
 896                          */
 897                         __skb_queue_tail(&sk->sk_write_queue, skb);
 898                         continue;
 899                 }
 900
 901                 if (copy > length)
 902                         copy = length;
 903
 904                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 905                         unsigned int off;
 906
 907                         off = skb->len;
 908                         if (getfrag(from, skb_put(skb, copy),
 909                                         offset, copy, off, skb) < 0) {
 910                                 __skb_trim(skb, off);
 911                                 err = -EFAULT;
 912                                 goto error;
 913                         }
 914                 } else {
 915                         int i = skb_shinfo(skb)->nr_frags;
 916                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 917                         struct page *page = sk->sk_sndmsg_page;
 918                         int off = sk->sk_sndmsg_off;
 919                         unsigned int left;
 920
 921                         if (page && (left = PAGE_SIZE - off) > 0) {
 922                                 if (copy >= left)
 923                                         copy = left;
 924                                 if (page != frag->page) {
 925                                         if (i == MAX_SKB_FRAGS) {
 926                                                 err = -EMSGSIZE;
 927                                                 goto error;
 928                                         }
 929                                         get_page(page);
 930                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
 931                                         frag = &skb_shinfo(skb)->frags[i];
 932                                 }
 933                         } else if (i < MAX_SKB_FRAGS) {
 934                                 if (copy > PAGE_SIZE)
 935                                         copy = PAGE_SIZE;
 936                                 page = alloc_pages(sk->sk_allocation, 0);
 937                                 if (page == NULL)  {
 938                                         err = -ENOMEM;
 939                                         goto error;
 940                                 }
 941                                 sk->sk_sndmsg_page = page;
 942                                 sk->sk_sndmsg_off = 0;
 943
 944                                 skb_fill_page_desc(skb, i, page, 0, 0);
 945                                 frag = &skb_shinfo(skb)->frags[i];
 946                                 skb->truesize += PAGE_SIZE;
 947                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
 948                         } else {
 949                                 err = -EMSGSIZE;
 950                                 goto error;
 951                         }
 952                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
 953                                 err = -EFAULT;
 954                                 goto error;
 955                         }
 956                         sk->sk_sndmsg_off += copy;
 957                         frag->size += copy;
 958                         skb->len += copy;
 959                         skb->data_len += copy;
 960                 }
 961                 offset += copy;
 962                 length -= copy;
 963         }
 964
 965         return 0;
 966
 967 error:
 968         inet->cork.length -= length;
 969         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 970         return err;
 971 }
 972
 973 ssize_t ip_append_page(struct sock *sk, struct page *page,
 974                        int offset, size_t size, int flags)
 975 {
 976         struct inet_opt *inet = inet_sk(sk);
 977         struct sk_buff *skb;
 978         struct rtable *rt;
 979         struct ip_options *opt = NULL;
 980         int hh_len;
 981         int mtu;
 982         int len;
 983         int err;
 984         unsigned int maxfraglen, fragheaderlen, fraggap;
 985
 986         if (inet->hdrincl)
 987                 return -EPERM;
 988
 989         if (flags&MSG_PROBE)
 990                 return 0;
 991
 992         if (skb_queue_empty(&sk->sk_write_queue))
 993                 return -EINVAL;
 994
 995         rt = inet->cork.rt;
 996         if (inet->cork.flags & IPCORK_OPT)
 997                 opt = inet->cork.opt;
 998
 999         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1000                 return -EOPNOTSUPP;
1001
1002         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1003         mtu = inet->cork.fragsize;
1004
1005         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1006         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1007
1008         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1009                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1010                 return -EMSGSIZE;
1011         }
1012
1013         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1014                 return -EINVAL;
1015
1016         inet->cork.length += size;
1017
1018         while (size > 0) {
1019                 int i;
1020
1021                 /* Check if the remaining data fits into current packet. */
1022                 len = mtu - skb->len;
1023                 if (len < size)
1024                         len = maxfraglen - skb->len;
1025                 if (len <= 0) {
1026                         struct sk_buff *skb_prev;
1027                         char *data;
1028                         struct iphdr *iph;
1029                         int alloclen;
1030
1031                         skb_prev = skb;
1032                         if (skb_prev)
1033                                 fraggap = skb_prev->len - maxfraglen;
1034                         else
1035                                 fraggap = 0;
1036
1037                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1038                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1039                         if (unlikely(!skb)) {
1040                                 err = -ENOBUFS;
1041                                 goto error;
1042                         }
1043
1044                         /*
1045                          *      Fill in the control structures
1046                          */
1047                         skb->ip_summed = CHECKSUM_NONE;
1048                         skb->csum = 0;
1049                         skb_reserve(skb, hh_len);
1050
1051                         /*
1052                          *      Find where to start putting bytes.
1053                          */
1054                         data = skb_put(skb, fragheaderlen + fraggap);
1055                         skb->nh.iph = iph = (struct iphdr *)data;
1056                         data += fragheaderlen;
1057                         skb->h.raw = data;
1058
1059                         if (fraggap) {
1060                                 skb->csum = skb_copy_and_csum_bits(
1061                                         skb_prev, maxfraglen,
1062                                         data, fraggap, 0);
1063                                 skb_prev->csum = csum_sub(skb_prev->csum,
1064                                                           skb->csum);
1065                                 skb_trim(skb_prev, maxfraglen);
1066                         }
1067
1068                         /*
1069                          * Put the packet on the pending queue.
1070                          */
1071                         __skb_queue_tail(&sk->sk_write_queue, skb);
1072                         continue;
1073                 }
1074
1075                 i = skb_shinfo(skb)->nr_frags;
1076                 if (len > size)
1077                         len = size;
1078                 if (skb_can_coalesce(skb, i, page, offset)) {
1079                         skb_shinfo(skb)->frags[i-1].size += len;
1080                 } else if (i < MAX_SKB_FRAGS) {
1081                         get_page(page);
1082                         skb_fill_page_desc(skb, i, page, offset, len);
1083                 } else {
1084                         err = -EMSGSIZE;
1085                         goto error;
1086                 }
1087
1088                 if (skb->ip_summed == CHECKSUM_NONE) {
1089                         unsigned int csum;
1090                         csum = csum_page(page, offset, len);
1091                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1092                 }
1093
1094                 skb->len += len;
1095                 skb->data_len += len;
1096                 offset += len;
1097                 size -= len;
1098         }
1099         return 0;
1100
1101 error:
1102         inet->cork.length -= size;
1103         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1104         return err;
1105 }
1106
1107 /*
1108  *      Combined all pending IP fragments on the socket as one IP datagram
1109  *      and push them out.
1110  */
1111 int ip_push_pending_frames(struct sock *sk)
1112 {
1113         struct sk_buff *skb, *tmp_skb;
1114         struct sk_buff **tail_skb;
1115         struct inet_opt *inet = inet_sk(sk);
1116         struct ip_options *opt = NULL;
1117         struct rtable *rt = inet->cork.rt;
1118         struct iphdr *iph;
1119         int df = 0;
1120         __u8 ttl;
1121         int err = 0;
1122
1123         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1124                 goto out;
1125         tail_skb = &(skb_shinfo(skb)->frag_list);
1126
1127         /* move skb->data to ip header from ext header */
1128         if (skb->data < skb->nh.raw)
1129                 __skb_pull(skb, skb->nh.raw - skb->data);
1130         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1131                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1132                 *tail_skb = tmp_skb;
1133                 tail_skb = &(tmp_skb->next);
1134                 skb->len += tmp_skb->len;
1135                 skb->data_len += tmp_skb->len;
1136                 skb->truesize += tmp_skb->truesize;
1137                 __sock_put(tmp_skb->sk);
1138                 tmp_skb->destructor = NULL;
1139                 tmp_skb->sk = NULL;
1140         }
1141
1142         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1143          * to fragment the frame generated here. No matter, what transforms
1144          * how transforms change size of the packet, it will come out.
1145          */
1146         if (inet->pmtudisc != IP_PMTUDISC_DO)
1147                 skb->local_df = 1;
1148
1149         /* DF bit is set when we want to see DF on outgoing frames.
1150          * If local_df is set too, we still allow to fragment this frame
1151          * locally. */
1152         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1153             (!skb_shinfo(skb)->frag_list && ip_dont_fragment(sk, &rt->u.dst)))
1154                 df = htons(IP_DF);
1155
1156         if (inet->cork.flags & IPCORK_OPT)
1157                 opt = inet->cork.opt;
1158
1159         if (rt->rt_type == RTN_MULTICAST)
1160                 ttl = inet->mc_ttl;
1161         else
1162                 ttl = ip_select_ttl(inet, &rt->u.dst);
1163
1164         iph = (struct iphdr *)skb->data;
1165         iph->version = 4;
1166         iph->ihl = 5;
1167         if (opt) {
1168                 iph->ihl += opt->optlen>>2;
1169                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1170         }
1171         iph->tos = inet->tos;
1172         iph->tot_len = htons(skb->len);
1173         iph->frag_off = df;
1174         if (!df) {
1175                 __ip_select_ident(iph, &rt->u.dst, 0);
1176         } else {
1177                 iph->id = htons(inet->id++);
1178         }
1179         iph->ttl = ttl;
1180         iph->protocol = sk->sk_protocol;
1181         iph->saddr = rt->rt_src;
1182         iph->daddr = rt->rt_dst;
1183         ip_send_check(iph);
1184
1185         skb->priority = sk->sk_priority;
1186         skb->dst = dst_clone(&rt->u.dst);
1187
1188         /* Netfilter gets whole the not fragmented skb. */
1189         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1190                       skb->dst->dev, dst_output);
1191         if (err) {
1192                 if (err > 0)
1193                         err = inet->recverr ? net_xmit_errno(err) : 0;
1194                 if (err)
1195                         goto error;
1196         }
1197
1198 out:
1199         inet->cork.flags &= ~IPCORK_OPT;
1200         if (inet->cork.opt) {
1201                 kfree(inet->cork.opt);
1202                 inet->cork.opt = NULL;
1203         }
1204         if (inet->cork.rt) {
1205                 ip_rt_put(inet->cork.rt);
1206                 inet->cork.rt = NULL;
1207         }
1208         return err;
1209
1210 error:
1211         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1212         goto out;
1213 }
1214
1215 /*
1216  *      Throw away all pending data on the socket.
1217  */
1218 void ip_flush_pending_frames(struct sock *sk)
1219 {
1220         struct inet_opt *inet = inet_sk(sk);
1221         struct sk_buff *skb;
1222
1223         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1224                 kfree_skb(skb);
1225
1226         inet->cork.flags &= ~IPCORK_OPT;
1227         if (inet->cork.opt) {
1228                 kfree(inet->cork.opt);
1229                 inet->cork.opt = NULL;
1230         }
1231         if (inet->cork.rt) {
1232                 ip_rt_put(inet->cork.rt);
1233                 inet->cork.rt = NULL;
1234         }
1235 }
1236
1237
1238 /*
1239  *      Fetch data from kernel space and fill in checksum if needed.
1240  */
1241 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1242                               int len, int odd, struct sk_buff *skb)
1243 {
1244         unsigned int csum;
1245
1246         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1247         skb->csum = csum_block_add(skb->csum, csum, odd);
1248         return 0;
1249 }
1250
1251 /*
1252  *      Generic function to send a packet as reply to another packet.
1253  *      Used to send TCP resets so far. ICMP should use this function too.
1254  *
1255  *      Should run single threaded per socket because it uses the sock
1256  *      structure to pass arguments.
1257  *
1258  *      LATER: switch from ip_build_xmit to ip_append_*
1259  */
1260 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1261                    unsigned int len)
1262 {
1263         struct inet_opt *inet = inet_sk(sk);
1264         struct {
1265                 struct ip_options       opt;
1266                 char                    data[40];
1267         } replyopts;
1268         struct ipcm_cookie ipc;
1269         u32 daddr;
1270         struct rtable *rt = (struct rtable*)skb->dst;
1271
1272         if (ip_options_echo(&replyopts.opt, skb))
1273                 return;
1274
1275         daddr = ipc.addr = rt->rt_src;
1276         ipc.opt = NULL;
1277
1278         if (replyopts.opt.optlen) {
1279                 ipc.opt = &replyopts.opt;
1280
1281                 if (ipc.opt->srr)
1282                         daddr = replyopts.opt.faddr;
1283         }
1284
1285         {
1286                 struct flowi fl = { .nl_u = { .ip4_u =
1287                                               { .daddr = daddr,
1288                                                 .saddr = rt->rt_spec_dst,
1289                                                 .tos = RT_TOS(skb->nh.iph->tos) } },
1290                                     /* Not quite clean, but right. */
1291                                     .uli_u = { .ports =
1292                                                { .sport = skb->h.th->dest,
1293                                                  .dport = skb->h.th->source } },
1294                                     .proto = sk->sk_protocol };
1295                 if (ip_route_output_key(&rt, &fl))
1296                         return;
1297         }
1298
1299         /* And let IP do all the hard work.
1300
1301            This chunk is not reenterable, hence spinlock.
1302            Note that it uses the fact, that this function is called
1303            with locally disabled BH and that sk cannot be already spinlocked.
1304          */
1305         bh_lock_sock(sk);
1306         inet->tos = skb->nh.iph->tos;
1307         sk->sk_priority = skb->priority;
1308         sk->sk_protocol = skb->nh.iph->protocol;
1309         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1310                        &ipc, rt, MSG_DONTWAIT);
1311         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1312                 if (arg->csumoffset >= 0)
1313                         *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1314                 skb->ip_summed = CHECKSUM_NONE;
1315                 ip_push_pending_frames(sk);
1316         }
1317
1318         bh_unlock_sock(sk);
1319
1320         ip_rt_put(rt);
1321 }
1322
1323 /*
1324  *      IP protocol layer initialiser
1325  */
1326
1327 static struct packet_type ip_packet_type = {
1328         .type = __constant_htons(ETH_P_IP),
1329         .func = ip_rcv,
1330 };
1331
1332 /*
1333  *      IP registers the packet type and then calls the subprotocol initialisers
1334  */
1335
1336 void __init ip_init(void)
1337 {
1338         dev_add_pack(&ip_packet_type);
1339
1340         ip_rt_init();
1341         inet_initpeers();
1342
1343 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1344         igmp_mc_proc_init();
1345 #endif
1346 }
1347
/*
 *	Symbols of the IP output path that are exported.
 */
EXPORT_SYMBOL(ip_finish_output);
EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);

/* The default TTL variable is exported only when sysctl support is configured. */
#if defined(CONFIG_SYSCTL)
EXPORT_SYMBOL(sysctl_ip_default_ttl);
#endif