net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
  38  *                                      and more readability.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/sched.h>
  53 #include <linux/mm.h>
  54 #include <linux/string.h>
  55 #include <linux/errno.h>
  56 #include <linux/config.h>
  57
  58 #include <linux/socket.h>
  59 #include <linux/sockios.h>
  60 #include <linux/in.h>
  61 #include <linux/inet.h>
  62 #include <linux/netdevice.h>
  63 #include <linux/etherdevice.h>
  64 #include <linux/proc_fs.h>
  65 #include <linux/stat.h>
  66 #include <linux/init.h>
  67
  68 #include <net/snmp.h>
  69 #include <net/ip.h>
  70 #include <net/protocol.h>
  71 #include <net/route.h>
  72 #include <net/tcp.h>
  73 #include <net/udp.h>
  74 #include <linux/skbuff.h>
  75 #include <net/sock.h>
  76 #include <net/arp.h>
  77 #include <net/icmp.h>
  78 #include <net/raw.h>
  79 #include <net/checksum.h>
  80 #include <net/inetpeer.h>
  81 #include <net/checksum.h>
  82 #include <linux/igmp.h>
  83 #include <linux/netfilter_ipv4.h>
  84 #include <linux/netfilter_bridge.h>
  85 #include <linux/mroute.h>
  86 #include <linux/netlink.h>
  87
  88 /*
  89  *      Shall we try to damage output packets if routing dev changes?
      *
      *      NOTE(review): presumably the question above is answered by
      *      sysctl_ip_dynaddr, and the sysctl_ prefix suggests both variables
      *      are runtime tunables registered elsewhere -- confirm.  Neither
      *      variable is read in the part of the file that follows directly.
  90  */
  91
  92 int sysctl_ip_dynaddr;                  /* no initializer: starts out as 0 */
  93 int sysctl_ip_default_ttl = IPDEFTTL;   /* starts out as IPDEFTTL */
  94
  95 /* Generate a checksum for an outgoing IP datagram. */
  96 __inline__ void ip_send_check(struct iphdr *iph)
  97 {
  98         iph->check = 0;
  99         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 100 }
 101
 102 /* dev_loopback_xmit for use with netfilter. */
 103 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 104 {
 105         newskb->mac.raw = newskb->data;
 106         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 107         newskb->pkt_type = PACKET_LOOPBACK;
 108         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 109         BUG_TRAP(newskb->dst);
 110
 111 #ifdef CONFIG_NETFILTER_DEBUG
 112         nf_debug_ip_loopback_xmit(newskb);
 113 #endif
 114         netif_rx(newskb);
 115         return 0;
 116 }
 117
 118 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 119 {
 120         int ttl = inet->uc_ttl;
 121
 122         if (ttl < 0)
 123                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 124         return ttl;
 125 }
 126
 127 /*
 128  *              Add an ip header to a skbuff and send it out.
 129  *
 130  */
 131 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 132                           u32 saddr, u32 daddr, struct ip_options *opt)
 133 {
 134         struct inet_sock *inet = inet_sk(sk);
 135         struct rtable *rt = (struct rtable *)skb->dst;
 136         struct iphdr *iph;
 137
 138         /* Build the IP header. */
 139         if (opt)
 140                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
 141         else
 142                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
 143
 144         iph->version  = 4;
 145         iph->ihl      = 5;
 146         iph->tos      = inet->tos;
 147         if (ip_dont_fragment(sk, &rt->u.dst))
 148                 iph->frag_off = htons(IP_DF);
 149         else
 150                 iph->frag_off = 0;
 151         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 152         iph->daddr    = rt->rt_dst;
 153         iph->saddr    = rt->rt_src;
 154         iph->protocol = sk->sk_protocol;
 155         iph->tot_len  = htons(skb->len);
 156         ip_select_ident(iph, &rt->u.dst, sk);
 157         skb->nh.iph   = iph;
 158
 159         if (opt && opt->optlen) {
 160                 iph->ihl += opt->optlen>>2;
 161                 ip_options_build(skb, opt, daddr, rt, 0);
 162         }
 163         ip_send_check(iph);
 164
 165         skb->priority = sk->sk_priority;
 166
 167         /* Send it out. */
 168         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 169                        dst_output);
 170 }
 171
 172 static inline int ip_finish_output2(struct sk_buff *skb)
 173 {
 174         struct dst_entry *dst = skb->dst;
 175         struct hh_cache *hh = dst->hh;
 176         struct net_device *dev = dst->dev;
 177         int hh_len = LL_RESERVED_SPACE(dev);
 178
 179         /* Be paranoid, rather than too clever. */
 180         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 181                 struct sk_buff *skb2;
 182
 183                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 184                 if (skb2 == NULL) {
 185                         kfree_skb(skb);
 186                         return -ENOMEM;
 187                 }
 188                 if (skb->sk)
 189                         skb_set_owner_w(skb2, skb->sk);
 190                 kfree_skb(skb);
 191                 skb = skb2;
 192         }
 193
 194 #ifdef CONFIG_NETFILTER_DEBUG
 195         nf_debug_ip_finish_output2(skb);
 196 #endif /*CONFIG_NETFILTER_DEBUG*/
 197
 198         if (hh) {
 199                 int hh_alen;
 200
 201                 read_lock_bh(&hh->hh_lock);
 202                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
 203                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
 204                 read_unlock_bh(&hh->hh_lock);
 205                 skb_push(skb, hh->hh_len);
 206                 return hh->hh_output(skb);
 207         } else if (dst->neighbour)
 208                 return dst->neighbour->output(skb);
 209
 210         if (net_ratelimit())
 211                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 212         kfree_skb(skb);
 213         return -EINVAL;
 214 }
 215
 216 int ip_finish_output(struct sk_buff *skb)
 217 {
 218         struct net_device *dev = skb->dst->dev;
 219
 220         skb->dev = dev;
 221         skb->protocol = htons(ETH_P_IP);
 222
 223         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 224                        ip_finish_output2);
 225 }
 226
 227 int ip_mc_output(struct sk_buff *skb)
 228 {
 229         struct sock *sk = skb->sk;
 230         struct rtable *rt = (struct rtable*)skb->dst;
 231         struct net_device *dev = rt->u.dst.dev;
 232
 233         /*
 234          *      If the indicated interface is up and running, send the packet.
 235          */
 236         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 237
 238         skb->dev = dev;
 239         skb->protocol = htons(ETH_P_IP);
 240
 241         /*
 242          *      Multicasts are looped back for other local users
 243          */
 244
 245         if (rt->rt_flags&RTCF_MULTICAST) {
 246                 if ((!sk || inet_sk(sk)->mc_loop)
 247 #ifdef CONFIG_IP_MROUTE
 248                 /* Small optimization: do not loopback not local frames,
 249                    which returned after forwarding; they will be  dropped
 250                    by ip_mr_input in any case.
 251                    Note, that local frames are looped back to be delivered
 252                    to local recipients.
 253
 254                    This check is duplicated in ip_mr_input at the moment.
 255                  */
 256                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 257 #endif
 258                 ) {
 259                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 260                         if (newskb)
 261                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 262                                         newskb->dev,
 263                                         ip_dev_loopback_xmit);
 264                 }
 265
 266                 /* Multicasts with ttl 0 must not go beyond the host */
 267
 268                 if (skb->nh.iph->ttl == 0) {
 269                         kfree_skb(skb);
 270                         return 0;
 271                 }
 272         }
 273
 274         if (rt->rt_flags&RTCF_BROADCAST) {
 275                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 276                 if (newskb)
 277                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 278                                 newskb->dev, ip_dev_loopback_xmit);
 279         }
 280
 281         if (skb->len > dst_mtu(&rt->u.dst))
 282                 return ip_fragment(skb, ip_finish_output);
 283         else
 284                 return ip_finish_output(skb);
 285 }
 286
 287 int ip_output(struct sk_buff *skb)
 288 {
 289         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 290
 291         if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
 292                 return ip_fragment(skb, ip_finish_output);
 293         else
 294                 return ip_finish_output(skb);
 295 }
 296
 297 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 298 {
 299         struct sock *sk = skb->sk;
 300         struct inet_sock *inet = inet_sk(sk);
 301         struct ip_options *opt = inet->opt;
 302         struct rtable *rt;
 303         struct iphdr *iph;
 304
 305         /* Skip all of this if the packet is already routed,
 306          * f.e. by something like SCTP.
 307          */
 308         rt = (struct rtable *) skb->dst;
 309         if (rt != NULL)
 310                 goto packet_routed;
 311
 312         /* Make sure we can route this packet. */
 313         rt = (struct rtable *)__sk_dst_check(sk, 0);
 314         if (rt == NULL) {
 315                 u32 daddr;
 316
 317                 /* Use correct destination address if we have options. */
 318                 daddr = inet->daddr;
 319                 if(opt && opt->srr)
 320                         daddr = opt->faddr;
 321
 322                 {
 323                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 324                                             .nl_u = { .ip4_u =
 325                                                       { .daddr = daddr,
 326                                                         .saddr = inet->saddr,
 327                                                         .tos = RT_CONN_FLAGS(sk) } },
 328                                             .proto = sk->sk_protocol,
 329                                             .uli_u = { .ports =
 330                                                        { .sport = inet->sport,
 331                                                          .dport = inet->dport } } };
 332
 333                         /* If this fails, retransmit mechanism of transport layer will
 334                          * keep trying until route appears or the connection times
 335                          * itself out.
 336                          */
 337                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 338                                 goto no_route;
 339                 }
 340                 __sk_dst_set(sk, &rt->u.dst);
 341                 tcp_v4_setup_caps(sk, &rt->u.dst);
 342         }
 343         skb->dst = dst_clone(&rt->u.dst);
 344
 345 packet_routed:
 346         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 347                 goto no_route;
 348
 349         /* OK, we know where to send it, allocate and build IP header. */
 350         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 351         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 352         iph->tot_len = htons(skb->len);
 353         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 354                 iph->frag_off = htons(IP_DF);
 355         else
 356                 iph->frag_off = 0;
 357         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 358         iph->protocol = sk->sk_protocol;
 359         iph->saddr    = rt->rt_src;
 360         iph->daddr    = rt->rt_dst;
 361         skb->nh.iph   = iph;
 362         /* Transport layer set skb->h.foo itself. */
 363
 364         if (opt && opt->optlen) {
 365                 iph->ihl += opt->optlen >> 2;
 366                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 367         }
 368
 369         ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
 370
 371         /* Add an IP checksum. */
 372         ip_send_check(iph);
 373
 374         skb->priority = sk->sk_priority;
 375
 376         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 377                        dst_output);
 378
 379 no_route:
 380         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 381         kfree_skb(skb);
 382         return -EHOSTUNREACH;
 383 }
 384
 385
 386 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 387 {
 388         to->pkt_type = from->pkt_type;
 389         to->priority = from->priority;
 390         to->protocol = from->protocol;
 391         to->security = from->security;
 392         dst_release(to->dst);
 393         to->dst = dst_clone(from->dst);
 394         to->dev = from->dev;
 395
 396         /* Copy the flags to each fragment. */
 397         IPCB(to)->flags = IPCB(from)->flags;
 398
 399 #ifdef CONFIG_NET_SCHED
 400         to->tc_index = from->tc_index;
 401 #endif
 402 #ifdef CONFIG_NETFILTER
 403         to->nfmark = from->nfmark;
 404         to->nfcache = from->nfcache;
 405         /* Connection association is same as pre-frag packet */
 406         nf_conntrack_put(to->nfct);
 407         to->nfct = from->nfct;
 408         nf_conntrack_get(to->nfct);
 409         to->nfctinfo = from->nfctinfo;
 410 #ifdef CONFIG_BRIDGE_NETFILTER
 411         nf_bridge_put(to->nf_bridge);
 412         to->nf_bridge = from->nf_bridge;
 413         nf_bridge_get(to->nf_bridge);
 414 #endif
 415 #ifdef CONFIG_NETFILTER_DEBUG
 416         to->nf_debug = from->nf_debug;
 417 #endif
 418 #endif
 419 }
 420
 421 /*
 422  *      This IP datagram is too large to be sent in one piece.  Break it up into
 423  *      smaller pieces (each of size equal to IP header plus
 424  *      a block of the data of the original IP data part) that will yet fit in a
 425  *      single device frame, and queue such a frame for sending.
 426  */
 427
 428 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 429 {
 430         struct iphdr *iph;
 431         int raw = 0;
 432         int ptr;
 433         struct net_device *dev;
 434         struct sk_buff *skb2;
 435         unsigned int mtu, hlen, left, len, ll_rs;
 436         int offset;
 437         int not_last_frag;
 438         struct rtable *rt = (struct rtable*)skb->dst;
 439         int err = 0;
 440
 441         dev = rt->u.dst.dev;
 442
 443         /*
 444          *      Point into the IP datagram header.
 445          */
 446
 447         iph = skb->nh.iph;
 448
 449         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 450                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 451                           htonl(dst_mtu(&rt->u.dst)));
 452                 kfree_skb(skb);
 453                 return -EMSGSIZE;
 454         }
 455
 456         /*
 457          *      Setup starting values.
 458          */
 459
 460         hlen = iph->ihl * 4;
 461         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 462
 463         /* When frag_list is given, use it. First, check its validity:
 464          * some transformers could create wrong frag_list or break existing
 465          * one, it is not prohibited. In this case fall back to copying.
 466          *
 467          * LATER: this step can be merged to real generation of fragments,
 468          * we can switch to copy when see the first bad fragment.
 469          */
 470         if (skb_shinfo(skb)->frag_list) {
 471                 struct sk_buff *frag;
 472                 int first_len = skb_pagelen(skb);
 473
 474                 if (first_len - hlen > mtu ||
 475                     ((first_len - hlen) & 7) ||
 476                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 477                     skb_cloned(skb))
 478                         goto slow_path;
 479
 480                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 481                         /* Correct geometry. */
 482                         if (frag->len > mtu ||
 483                             ((frag->len & 7) && frag->next) ||
 484                             skb_headroom(frag) < hlen)
 485                             goto slow_path;
 486
 487                         /* Partially cloned skb? */
 488                         if (skb_shared(frag))
 489                                 goto slow_path;
 490
 491                         BUG_ON(frag->sk);
 492                         if (skb->sk) {
 493                                 sock_hold(skb->sk);
 494                                 frag->sk = skb->sk;
 495                                 frag->destructor = sock_wfree;
 496                                 skb->truesize -= frag->truesize;
 497                         }
 498                 }
 499
 500                 /* Everything is OK. Generate! */
 501
 502                 err = 0;
 503                 offset = 0;
 504                 frag = skb_shinfo(skb)->frag_list;
 505                 skb_shinfo(skb)->frag_list = NULL;
 506                 skb->data_len = first_len - skb_headlen(skb);
 507                 skb->len = first_len;
 508                 iph->tot_len = htons(first_len);
 509                 iph->frag_off = htons(IP_MF);
 510                 ip_send_check(iph);
 511
 512                 for (;;) {
 513                         /* Prepare header of the next frame,
 514                          * before previous one went down. */
 515                         if (frag) {
 516                                 frag->ip_summed = CHECKSUM_NONE;
 517                                 frag->h.raw = frag->data;
 518                                 frag->nh.raw = __skb_push(frag, hlen);
 519                                 memcpy(frag->nh.raw, iph, hlen);
 520                                 iph = frag->nh.iph;
 521                                 iph->tot_len = htons(frag->len);
 522                                 ip_copy_metadata(frag, skb);
 523                                 if (offset == 0)
 524                                         ip_options_fragment(frag);
 525                                 offset += skb->len - hlen;
 526                                 iph->frag_off = htons(offset>>3);
 527                                 if (frag->next != NULL)
 528                                         iph->frag_off |= htons(IP_MF);
 529                                 /* Ready, complete checksum */
 530                                 ip_send_check(iph);
 531                         }
 532
 533                         err = output(skb);
 534
 535                         if (err || !frag)
 536                                 break;
 537
 538                         skb = frag;
 539                         frag = skb->next;
 540                         skb->next = NULL;
 541                 }
 542
 543                 if (err == 0) {
 544                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 545                         return 0;
 546                 }
 547
 548                 while (frag) {
 549                         skb = frag->next;
 550                         kfree_skb(frag);
 551                         frag = skb;
 552                 }
 553                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 554                 return err;
 555         }
 556
 557 slow_path:
 558         left = skb->len - hlen;         /* Space per frame */
 559         ptr = raw + hlen;               /* Where to start from */
 560
 561 #ifdef CONFIG_BRIDGE_NETFILTER
 562         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 563          * we need to make room for the encapsulating header */
 564         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
 565         mtu -= nf_bridge_pad(skb);
 566 #else
 567         ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
 568 #endif
 569         /*
 570          *      Fragment the datagram.
 571          */
 572
 573         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 574         not_last_frag = iph->frag_off & htons(IP_MF);
 575
 576         /*
 577          *      Keep copying data until we run out.
 578          */
 579
 580         while(left > 0) {
 581                 len = left;
 582                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 583                 if (len > mtu)
 584                         len = mtu;
 585                 /* IF: we are not sending upto and including the packet end
 586                    then align the next start on an eight byte boundary */
 587                 if (len < left) {
 588                         len &= ~7;
 589                 }
 590                 /*
 591                  *      Allocate buffer.
 592                  */
 593
 594                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 595                         NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
 596                         err = -ENOMEM;
 597                         goto fail;
 598                 }
 599
 600                 /*
 601                  *      Set up data on packet
 602                  */
 603
 604                 ip_copy_metadata(skb2, skb);
 605                 skb_reserve(skb2, ll_rs);
 606                 skb_put(skb2, len + hlen);
 607                 skb2->nh.raw = skb2->data;
 608                 skb2->h.raw = skb2->data + hlen;
 609
 610                 /*
 611                  *      Charge the memory for the fragment to any owner
 612                  *      it might possess
 613                  */
 614
 615                 if (skb->sk)
 616                         skb_set_owner_w(skb2, skb->sk);
 617
 618                 /*
 619                  *      Copy the packet header into the new buffer.
 620                  */
 621
 622                 memcpy(skb2->nh.raw, skb->data, hlen);
 623
 624                 /*
 625                  *      Copy a block of the IP datagram.
 626                  */
 627                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
 628                         BUG();
 629                 left -= len;
 630
 631                 /*
 632                  *      Fill in the new header fields.
 633                  */
 634                 iph = skb2->nh.iph;
 635                 iph->frag_off = htons((offset >> 3));
 636
 637                 /* ANK: dirty, but effective trick. Upgrade options only if
 638                  * the segment to be fragmented was THE FIRST (otherwise,
 639                  * options are already fixed) and make it ONCE
 640                  * on the initial skb, so that all the following fragments
 641                  * will inherit fixed options.
 642                  */
 643                 if (offset == 0)
 644                         ip_options_fragment(skb);
 645
 646                 /*
 647                  *      Added AC : If we are fragmenting a fragment that's not the
 648                  *                 last fragment then keep MF on each bit
 649                  */
 650                 if (left > 0 || not_last_frag)
 651                         iph->frag_off |= htons(IP_MF);
 652                 ptr += len;
 653                 offset += len;
 654
 655                 /*
 656                  *      Put this fragment into the sending queue.
 657                  */
 658
 659                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 660
 661                 iph->tot_len = htons(len + hlen);
 662
 663                 ip_send_check(iph);
 664
 665                 err = output(skb2);
 666                 if (err)
 667                         goto fail;
 668         }
 669         kfree_skb(skb);
 670         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 671         return err;
 672
 673 fail:
 674         kfree_skb(skb);
 675         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 676         return err;
 677 }
 678
 679 int
 680 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 681 {
 682         struct iovec *iov = from;
 683
 684         if (skb->ip_summed == CHECKSUM_HW) {
 685                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 686                         return -EFAULT;
 687         } else {
 688                 unsigned int csum = 0;
 689                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 690                         return -EFAULT;
 691                 skb->csum = csum_block_add(skb->csum, csum, odd);
 692         }
 693         return 0;
 694 }
 695
 /*
  * csum_page - checksum @copy bytes of @page starting at @offset.
  *
  * The page is mapped with kmap() only for the duration of the
  * csum_partial() call, which is started with an initial sum of 0.
  */
 static inline unsigned int
 csum_page(struct page *page, int offset, int copy)
 {
         char *kaddr = kmap(page);
         unsigned int csum = csum_partial(kaddr + offset, copy, 0);

         kunmap(page);
         return csum;
 }
 706
 707 /*
 708  *      ip_append_data() and ip_append_page() can make one large IP datagram
 709  *      from many pieces of data. Each piece will be held on the socket
 710  *      until ip_push_pending_frames() is called. Each piece can be a page
 711  *      or non-page data.
 712  *
 713  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 714  *      this interface potentially.
 715  *
 716  *      LATER: length must be adjusted by pad at tail, when it is required.
 717  */
 718 int ip_append_data(struct sock *sk,
 719                    int getfrag(void *from, char *to, int offset, int len,
 720                                int odd, struct sk_buff *skb),
 721                    void *from, int length, int transhdrlen,
 722                    struct ipcm_cookie *ipc, struct rtable *rt,
 723                    unsigned int flags)
 724 {
 725         struct inet_sock *inet = inet_sk(sk);
 726         struct sk_buff *skb;
 727
 728         struct ip_options *opt = NULL;
 729         int hh_len;
 730         int exthdrlen;
 731         int mtu;
 732         int copy;
 733         int err;
 734         int offset = 0;
 735         unsigned int maxfraglen, fragheaderlen;
 736         int csummode = CHECKSUM_NONE;
 737
 738         if (flags&MSG_PROBE)
 739                 return 0;
 740
 741         if (skb_queue_empty(&sk->sk_write_queue)) {
 742                 /*
 743                  * setup for corking.
 744                  */
 745                 opt = ipc->opt;
 746                 if (opt) {
 747                         if (inet->cork.opt == NULL) {
 748                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 749                                 if (unlikely(inet->cork.opt == NULL))
 750                                         return -ENOBUFS;
 751                         }
 752                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 753                         inet->cork.flags |= IPCORK_OPT;
 754                         inet->cork.addr = ipc->addr;
 755                 }
 756                 dst_hold(&rt->u.dst);
 757                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 758                 inet->cork.rt = rt;
 759                 inet->cork.length = 0;
 760                 sk->sk_sndmsg_page = NULL;
 761                 sk->sk_sndmsg_off = 0;
 762                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 763                         length += exthdrlen;
 764                         transhdrlen += exthdrlen;
 765                 }
 766         } else {
 767                 rt = inet->cork.rt;
 768                 if (inet->cork.flags & IPCORK_OPT)
 769                         opt = inet->cork.opt;
 770
 771                 transhdrlen = 0;
 772                 exthdrlen = 0;
 773                 mtu = inet->cork.fragsize;
 774         }
 775         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 776
 777         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 778         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 779
 780         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 781                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 782                 return -EMSGSIZE;
 783         }
 784
 785         /*
 786          * transhdrlen > 0 means that this is the first fragment and we wish
 787          * it won't be fragmented in the future.
 788          */
 789         if (transhdrlen &&
 790             length + fragheaderlen <= mtu &&
 791             rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
 792             !exthdrlen)
 793                 csummode = CHECKSUM_HW;
 794
 795         inet->cork.length += length;
 796
 797         /* So, what's going on in the loop below?
 798          *
 799          * We use calculated fragment length to generate chained skb,
 800          * each of segments is IP fragment ready for sending to network after
 801          * adding appropriate IP header.
 802          */
 803
 804         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 805                 goto alloc_new_skb;
 806
 807         while (length > 0) {
 808                 /* Check if the remaining data fits into current packet. */
 809                 copy = mtu - skb->len;
 810                 if (copy < length)
 811                         copy = maxfraglen - skb->len;
 812                 if (copy <= 0) {
 813                         char *data;
 814                         unsigned int datalen;
 815                         unsigned int fraglen;
 816                         unsigned int fraggap;
 817                         unsigned int alloclen;
 818                         struct sk_buff *skb_prev;
 819 alloc_new_skb:
 820                         skb_prev = skb;
 821                         if (skb_prev)
 822                                 fraggap = skb_prev->len - maxfraglen;
 823                         else
 824                                 fraggap = 0;
 825
 826                         /*
 827                          * If remaining data exceeds the mtu,
 828                          * we know we need more fragment(s).
 829                          */
 830                         datalen = length + fraggap;
 831                         if (datalen > mtu - fragheaderlen)
 832                                 datalen = maxfraglen - fragheaderlen;
 833                         fraglen = datalen + fragheaderlen;
 834
 835                         if ((flags & MSG_MORE) &&
 836                             !(rt->u.dst.dev->features&NETIF_F_SG))
 837                                 alloclen = mtu;
 838                         else
 839                                 alloclen = datalen + fragheaderlen;
 840
 841                         /* The last fragment gets additional space at tail.
 842                          * Note, with MSG_MORE we overallocate on fragments,
 843                          * because we have no idea what fragment will be
 844                          * the last.
 845                          */
 846                         if (datalen == length)
 847                                 alloclen += rt->u.dst.trailer_len;
 848
 849                         if (transhdrlen) {
 850                                 skb = sock_alloc_send_skb(sk,
 851                                                 alloclen + hh_len + 15,
 852                                                 (flags & MSG_DONTWAIT), &err);
 853                         } else {
 854                                 skb = NULL;
 855                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 856                                     2 * sk->sk_sndbuf)
 857                                         skb = sock_wmalloc(sk,
 858                                                            alloclen + hh_len + 15, 1,
 859                                                            sk->sk_allocation);
 860                                 if (unlikely(skb == NULL))
 861                                         err = -ENOBUFS;
 862                         }
 863                         if (skb == NULL)
 864                                 goto error;
 865
 866                         /*
 867                          *      Fill in the control structures
 868                          */
 869                         skb->ip_summed = csummode;
 870                         skb->csum = 0;
 871                         skb_reserve(skb, hh_len);
 872
 873                         /*
 874                          *      Find where to start putting bytes.
 875                          */
 876                         data = skb_put(skb, fraglen);
 877                         skb->nh.raw = data + exthdrlen;
 878                         data += fragheaderlen;
 879                         skb->h.raw = data + exthdrlen;
 880
 881                         if (fraggap) {
 882                                 skb->csum = skb_copy_and_csum_bits(
 883                                         skb_prev, maxfraglen,
 884                                         data + transhdrlen, fraggap, 0);
 885                                 skb_prev->csum = csum_sub(skb_prev->csum,
 886                                                           skb->csum);
 887                                 data += fraggap;
 888                                 skb_trim(skb_prev, maxfraglen);
 889                         }
 890
 891                         copy = datalen - transhdrlen - fraggap;
 892                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 893                                 err = -EFAULT;
 894                                 kfree_skb(skb);
 895                                 goto error;
 896                         }
 897
 898                         offset += copy;
 899                         length -= datalen - fraggap;
 900                         transhdrlen = 0;
 901                         exthdrlen = 0;
 902                         csummode = CHECKSUM_NONE;
 903
 904                         /*
 905                          * Put the packet on the pending queue.
 906                          */
 907                         __skb_queue_tail(&sk->sk_write_queue, skb);
 908                         continue;
 909                 }
 910
 911                 if (copy > length)
 912                         copy = length;
 913
 914                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 915                         unsigned int off;
 916
 917                         off = skb->len;
 918                         if (getfrag(from, skb_put(skb, copy),
 919                                         offset, copy, off, skb) < 0) {
 920                                 __skb_trim(skb, off);
 921                                 err = -EFAULT;
 922                                 goto error;
 923                         }
 924                 } else {
 925                         int i = skb_shinfo(skb)->nr_frags;
 926                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 927                         struct page *page = sk->sk_sndmsg_page;
 928                         int off = sk->sk_sndmsg_off;
 929                         unsigned int left;
 930
 931                         if (page && (left = PAGE_SIZE - off) > 0) {
 932                                 if (copy >= left)
 933                                         copy = left;
 934                                 if (page != frag->page) {
 935                                         if (i == MAX_SKB_FRAGS) {
 936                                                 err = -EMSGSIZE;
 937                                                 goto error;
 938                                         }
 939                                         get_page(page);
 940                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
 941                                         frag = &skb_shinfo(skb)->frags[i];
 942                                 }
 943                         } else if (i < MAX_SKB_FRAGS) {
 944                                 if (copy > PAGE_SIZE)
 945                                         copy = PAGE_SIZE;
 946                                 page = alloc_pages(sk->sk_allocation, 0);
 947                                 if (page == NULL)  {
 948                                         err = -ENOMEM;
 949                                         goto error;
 950                                 }
 951                                 sk->sk_sndmsg_page = page;
 952                                 sk->sk_sndmsg_off = 0;
 953
 954                                 skb_fill_page_desc(skb, i, page, 0, 0);
 955                                 frag = &skb_shinfo(skb)->frags[i];
 956                                 skb->truesize += PAGE_SIZE;
 957                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
 958                         } else {
 959                                 err = -EMSGSIZE;
 960                                 goto error;
 961                         }
 962                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
 963                                 err = -EFAULT;
 964                                 goto error;
 965                         }
 966                         sk->sk_sndmsg_off += copy;
 967                         frag->size += copy;
 968                         skb->len += copy;
 969                         skb->data_len += copy;
 970                 }
 971                 offset += copy;
 972                 length -= copy;
 973         }
 974
 975         return 0;
 976
 977 error:
 978         inet->cork.length -= length;
 979         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 980         return err;
 981 }
 982
 983 ssize_t ip_append_page(struct sock *sk, struct page *page,
 984                        int offset, size_t size, int flags)
 985 {
 986         struct inet_sock *inet = inet_sk(sk);
 987         struct sk_buff *skb;
 988         struct rtable *rt;
 989         struct ip_options *opt = NULL;
 990         int hh_len;
 991         int mtu;
 992         int len;
 993         int err;
 994         unsigned int maxfraglen, fragheaderlen, fraggap;
 995
 996         if (inet->hdrincl)
 997                 return -EPERM;
 998
 999         if (flags&MSG_PROBE)
1000                 return 0;
1001
1002         if (skb_queue_empty(&sk->sk_write_queue))
1003                 return -EINVAL;
1004
1005         rt = inet->cork.rt;
1006         if (inet->cork.flags & IPCORK_OPT)
1007                 opt = inet->cork.opt;
1008
1009         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1010                 return -EOPNOTSUPP;
1011
1012         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1013         mtu = inet->cork.fragsize;
1014
1015         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1016         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1017
1018         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1019                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1020                 return -EMSGSIZE;
1021         }
1022
1023         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1024                 return -EINVAL;
1025
1026         inet->cork.length += size;
1027
1028         while (size > 0) {
1029                 int i;
1030
1031                 /* Check if the remaining data fits into current packet. */
1032                 len = mtu - skb->len;
1033                 if (len < size)
1034                         len = maxfraglen - skb->len;
1035                 if (len <= 0) {
1036                         struct sk_buff *skb_prev;
1037                         char *data;
1038                         struct iphdr *iph;
1039                         int alloclen;
1040
1041                         skb_prev = skb;
1042                         if (skb_prev)
1043                                 fraggap = skb_prev->len - maxfraglen;
1044                         else
1045                                 fraggap = 0;
1046
1047                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1048                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1049                         if (unlikely(!skb)) {
1050                                 err = -ENOBUFS;
1051                                 goto error;
1052                         }
1053
1054                         /*
1055                          *      Fill in the control structures
1056                          */
1057                         skb->ip_summed = CHECKSUM_NONE;
1058                         skb->csum = 0;
1059                         skb_reserve(skb, hh_len);
1060
1061                         /*
1062                          *      Find where to start putting bytes.
1063                          */
1064                         data = skb_put(skb, fragheaderlen + fraggap);
1065                         skb->nh.iph = iph = (struct iphdr *)data;
1066                         data += fragheaderlen;
1067                         skb->h.raw = data;
1068
1069                         if (fraggap) {
1070                                 skb->csum = skb_copy_and_csum_bits(
1071                                         skb_prev, maxfraglen,
1072                                         data, fraggap, 0);
1073                                 skb_prev->csum = csum_sub(skb_prev->csum,
1074                                                           skb->csum);
1075                                 skb_trim(skb_prev, maxfraglen);
1076                         }
1077
1078                         /*
1079                          * Put the packet on the pending queue.
1080                          */
1081                         __skb_queue_tail(&sk->sk_write_queue, skb);
1082                         continue;
1083                 }
1084
1085                 i = skb_shinfo(skb)->nr_frags;
1086                 if (len > size)
1087                         len = size;
1088                 if (skb_can_coalesce(skb, i, page, offset)) {
1089                         skb_shinfo(skb)->frags[i-1].size += len;
1090                 } else if (i < MAX_SKB_FRAGS) {
1091                         get_page(page);
1092                         skb_fill_page_desc(skb, i, page, offset, len);
1093                 } else {
1094                         err = -EMSGSIZE;
1095                         goto error;
1096                 }
1097
1098                 if (skb->ip_summed == CHECKSUM_NONE) {
1099                         unsigned int csum;
1100                         csum = csum_page(page, offset, len);
1101                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1102                 }
1103
1104                 skb->len += len;
1105                 skb->data_len += len;
1106                 offset += len;
1107                 size -= len;
1108         }
1109         return 0;
1110
1111 error:
1112         inet->cork.length -= size;
1113         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1114         return err;
1115 }
1116
1117 /*
1118  *      Combined all pending IP fragments on the socket as one IP datagram
1119  *      and push them out.
1120  */
1121 int ip_push_pending_frames(struct sock *sk)
1122 {
1123         struct sk_buff *skb, *tmp_skb;
1124         struct sk_buff **tail_skb;
1125         struct inet_sock *inet = inet_sk(sk);
1126         struct ip_options *opt = NULL;
1127         struct rtable *rt = inet->cork.rt;
1128         struct iphdr *iph;
1129         int df = 0;
1130         __u8 ttl;
1131         int err = 0;
1132
1133         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1134                 goto out;
1135         tail_skb = &(skb_shinfo(skb)->frag_list);
1136
1137         /* move skb->data to ip header from ext header */
1138         if (skb->data < skb->nh.raw)
1139                 __skb_pull(skb, skb->nh.raw - skb->data);
1140         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1141                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1142                 *tail_skb = tmp_skb;
1143                 tail_skb = &(tmp_skb->next);
1144                 skb->len += tmp_skb->len;
1145                 skb->data_len += tmp_skb->len;
1146                 skb->truesize += tmp_skb->truesize;
1147                 __sock_put(tmp_skb->sk);
1148                 tmp_skb->destructor = NULL;
1149                 tmp_skb->sk = NULL;
1150         }
1151
1152         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1153          * to fragment the frame generated here. No matter, what transforms
1154          * how transforms change size of the packet, it will come out.
1155          */
1156         if (inet->pmtudisc != IP_PMTUDISC_DO)
1157                 skb->local_df = 1;
1158
1159         /* DF bit is set when we want to see DF on outgoing frames.
1160          * If local_df is set too, we still allow to fragment this frame
1161          * locally. */
1162         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1163             (skb->len <= dst_mtu(&rt->u.dst) &&
1164              ip_dont_fragment(sk, &rt->u.dst)))
1165                 df = htons(IP_DF);
1166
1167         if (inet->cork.flags & IPCORK_OPT)
1168                 opt = inet->cork.opt;
1169
1170         if (rt->rt_type == RTN_MULTICAST)
1171                 ttl = inet->mc_ttl;
1172         else
1173                 ttl = ip_select_ttl(inet, &rt->u.dst);
1174
1175         iph = (struct iphdr *)skb->data;
1176         iph->version = 4;
1177         iph->ihl = 5;
1178         if (opt) {
1179                 iph->ihl += opt->optlen>>2;
1180                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1181         }
1182         iph->tos = inet->tos;
1183         iph->tot_len = htons(skb->len);
1184         iph->frag_off = df;
1185         if (!df) {
1186                 __ip_select_ident(iph, &rt->u.dst, 0);
1187         } else {
1188                 iph->id = htons(inet->id++);
1189         }
1190         iph->ttl = ttl;
1191         iph->protocol = sk->sk_protocol;
1192         iph->saddr = rt->rt_src;
1193         iph->daddr = rt->rt_dst;
1194         ip_send_check(iph);
1195
1196         skb->priority = sk->sk_priority;
1197         skb->dst = dst_clone(&rt->u.dst);
1198
1199         /* Netfilter gets whole the not fragmented skb. */
1200         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1201                       skb->dst->dev, dst_output);
1202         if (err) {
1203                 if (err > 0)
1204                         err = inet->recverr ? net_xmit_errno(err) : 0;
1205                 if (err)
1206                         goto error;
1207         }
1208
1209 out:
1210         inet->cork.flags &= ~IPCORK_OPT;
1211         if (inet->cork.opt) {
1212                 kfree(inet->cork.opt);
1213                 inet->cork.opt = NULL;
1214         }
1215         if (inet->cork.rt) {
1216                 ip_rt_put(inet->cork.rt);
1217                 inet->cork.rt = NULL;
1218         }
1219         return err;
1220
1221 error:
1222         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1223         goto out;
1224 }
1225
1226 /*
1227  *      Throw away all pending data on the socket.
1228  */
1229 void ip_flush_pending_frames(struct sock *sk)
1230 {
1231         struct inet_sock *inet = inet_sk(sk);
1232         struct sk_buff *skb;
1233
1234         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1235                 kfree_skb(skb);
1236
1237         inet->cork.flags &= ~IPCORK_OPT;
1238         if (inet->cork.opt) {
1239                 kfree(inet->cork.opt);
1240                 inet->cork.opt = NULL;
1241         }
1242         if (inet->cork.rt) {
1243                 ip_rt_put(inet->cork.rt);
1244                 inet->cork.rt = NULL;
1245         }
1246 }
1247
1248
1249 /*
1250  *      Fetch data from kernel space and fill in checksum if needed.
1251  */
1252 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1253                               int len, int odd, struct sk_buff *skb)
1254 {
1255         unsigned int csum;
1256
1257         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1258         skb->csum = csum_block_add(skb->csum, csum, odd);
1259         return 0;
1260 }
1261
1262 /*
1263  *      Generic function to send a packet as reply to another packet.
1264  *      Used to send TCP resets so far. ICMP should use this function too.
1265  *
1266  *      Should run single threaded per socket because it uses the sock
1267  *      structure to pass arguments.
1268  *
1269  *      LATER: switch from ip_build_xmit to ip_append_*
1270  */
1271 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1272                    unsigned int len)
1273 {
1274         struct inet_sock *inet = inet_sk(sk);
1275         struct {
1276                 struct ip_options       opt;
1277                 char                    data[40];
1278         } replyopts;
1279         struct ipcm_cookie ipc;
1280         u32 daddr;
1281         struct rtable *rt = (struct rtable*)skb->dst;
1282
1283         if (ip_options_echo(&replyopts.opt, skb))
1284                 return;
1285
1286         daddr = ipc.addr = rt->rt_src;
1287         ipc.opt = NULL;
1288
1289         if (replyopts.opt.optlen) {
1290                 ipc.opt = &replyopts.opt;
1291
1292                 if (ipc.opt->srr)
1293                         daddr = replyopts.opt.faddr;
1294         }
1295
1296         {
1297                 struct flowi fl = { .nl_u = { .ip4_u =
1298                                               { .daddr = daddr,
1299                                                 .saddr = rt->rt_spec_dst,
1300                                                 .tos = RT_TOS(skb->nh.iph->tos) } },
1301                                     /* Not quite clean, but right. */
1302                                     .uli_u = { .ports =
1303                                                { .sport = skb->h.th->dest,
1304                                                  .dport = skb->h.th->source } },
1305                                     .proto = sk->sk_protocol };
1306                 if (ip_route_output_key(&rt, &fl))
1307                         return;
1308         }
1309
1310         /* And let IP do all the hard work.
1311
1312            This chunk is not reenterable, hence spinlock.
1313            Note that it uses the fact, that this function is called
1314            with locally disabled BH and that sk cannot be already spinlocked.
1315          */
1316         bh_lock_sock(sk);
1317         inet->tos = skb->nh.iph->tos;
1318         sk->sk_priority = skb->priority;
1319         sk->sk_protocol = skb->nh.iph->protocol;
1320         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1321                        &ipc, rt, MSG_DONTWAIT);
1322         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1323                 if (arg->csumoffset >= 0)
1324                         *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1325                 skb->ip_summed = CHECKSUM_NONE;
1326                 ip_push_pending_frames(sk);
1327         }
1328
1329         bh_unlock_sock(sk);
1330
1331         ip_rt_put(rt);
1332 }
1333
1334 /*
1335  *      IP protocol layer initialiser
1336  */
1337
1338 static struct packet_type ip_packet_type = {
1339         .type = __constant_htons(ETH_P_IP),
1340         .func = ip_rcv,
1341 };
1342
1343 /*
1344  *      IP registers the packet type and then calls the subprotocol initialisers
1345  */
1346
1347 void __init ip_init(void)
1348 {
1349         dev_add_pack(&ip_packet_type);
1350
1351         ip_rt_init();
1352         inet_initpeers();
1353
1354 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1355         igmp_mc_proc_init();
1356 #endif
1357 }
1358
/* Symbols exported from the IP output module. */
EXPORT_SYMBOL(ip_finish_output);
EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);

/* The default TTL sysctl variable is exported only with sysctl support. */
#if defined(CONFIG_SYSCTL)
EXPORT_SYMBOL(sysctl_ip_default_ttl);
#endif