X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;ds=sidebyside;f=net%2Fipv4%2Fip_output.c;h=a0f2008584bc6b24eb771b169cf33ca797f1c5b8;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=5a853aac2e95955a47dc975125f34397ccea25f1;hpb=9bf4aaab3e101692164d49b7ca357651eb691cb6;p=linux-2.6.git diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5a853aac2..a0f200858 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -7,7 +7,7 @@ * * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $ * - * Authors: Ross Biro, + * Authors: Ross Biro * Fred N. van Kempen, * Donald Becker, * Alan Cox, @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include @@ -69,27 +69,22 @@ #include #include #include -#include -#include +#include #include #include #include #include -#include #include #include +#include #include #include #include #include #include +#include -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr; -int sysctl_ip_default_ttl = IPDEFTTL; +int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; /* Generate a checksum for an outgoing IP datagram. */ __inline__ void ip_send_check(struct iphdr *iph) @@ -106,15 +101,11 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; BUG_TRAP(newskb->dst); - -#ifdef CONFIG_NETFILTER_DEBUG - nf_debug_ip_loopback_xmit(newskb); -#endif netif_rx(newskb); return 0; } -static inline int ip_select_ttl(struct inet_opt *inet, struct dst_entry *dst) +static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { int ttl = inet->uc_ttl; @@ -128,9 +119,9 @@ static inline int ip_select_ttl(struct inet_opt *inet, struct dst_entry *dst) * */ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, - u32 saddr, u32 daddr, struct ip_options *opt) + __be32 saddr, __be32 daddr, struct ip_options *opt) { - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct rtable *rt = (struct rtable *)skb->dst; struct iphdr *iph; @@ -168,10 +159,11 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, dst_output); } +EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); + static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; - struct hh_cache *hh = dst->hh; struct net_device *dev = dst->dev; int hh_len = LL_RESERVED_SPACE(dev); @@ -190,20 +182,9 @@ static inline int ip_finish_output2(struct sk_buff *skb) skb = skb2; } -#ifdef CONFIG_NETFILTER_DEBUG - nf_debug_ip_finish_output2(skb); -#endif /*CONFIG_NETFILTER_DEBUG*/ - - if (hh) { - int hh_alen; - - read_lock_bh(&hh->hh_lock); - hh_alen = HH_DATA_ALIGN(hh->hh_len); - memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); - read_unlock_bh(&hh->hh_lock); - skb_push(skb, hh->hh_len); - return hh->hh_output(skb); - } else if (dst->neighbour) + if (dst->hh) + return neigh_hh_output(dst->hh, skb); + else if (dst->neighbour) return dst->neighbour->output(skb); if (net_ratelimit()) @@ -212,20 +193,23 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } -int ip_finish_output(struct sk_buff *skb) +static inline int ip_finish_output(struct sk_buff *skb) { - struct net_device *dev = skb->dst->dev; - - skb->dev = dev; - skb->protocol = htons(ETH_P_IP); - - return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, - ip_finish_output2); +#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) + /* Policy lookup after SNAT yielded a new policy */ + if (skb->dst->xfrm != 
NULL) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif + if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) + return ip_fragment(skb, ip_finish_output2); + else + return ip_finish_output2(skb); } -int ip_mc_output(struct sk_buff **pskb) +int ip_mc_output(struct sk_buff *skb) { - struct sk_buff *skb = *pskb; struct sock *sk = skb->sk; struct rtable *rt = (struct rtable*)skb->dst; struct net_device *dev = rt->u.dst.dev; @@ -278,33 +262,32 @@ int ip_mc_output(struct sk_buff **pskb) newskb->dev, ip_dev_loopback_xmit); } - if (skb->len > dst_pmtu(&rt->u.dst) || skb_shinfo(skb)->frag_list) - return ip_fragment(skb, ip_finish_output); - else - return ip_finish_output(skb); + return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sk_buff **pskb) +int ip_output(struct sk_buff *skb) { - struct sk_buff *skb = *pskb; + struct net_device *dev = skb->dst->dev; IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - if ((skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list) && - !skb_shinfo(skb)->tso_size) - return ip_fragment(skb, ip_finish_output); - else - return ip_finish_output(skb); + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + + return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } int ip_queue_xmit(struct sk_buff *skb, int ipfragok) { struct sock *sk = skb->sk; - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct ip_options *opt = inet->opt; struct rtable *rt; struct iphdr *iph; - u32 mtu; /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. @@ -316,7 +299,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) /* Make sure we can route this packet. */ rt = (struct rtable *)__sk_dst_check(sk, 0); if (rt == NULL) { - u32 daddr; + __be32 daddr; /* Use correct destination address if we have options. */ daddr = inet->daddr; @@ -338,11 +321,11 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) * keep trying until route appears or the connection times * itself out. */ + security_sk_classify_flow(sk, &fl); if (ip_route_output_flow(&rt, &fl, sk, 0)) goto no_route; } - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); } skb->dst = dst_clone(&rt->u.dst); @@ -352,7 +335,7 @@ packet_routed: /* OK, we know where to send it, allocate and build IP header. */ iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); - *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); + *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); iph->tot_len = htons(skb->len); if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) iph->frag_off = htons(IP_DF); @@ -365,24 +348,13 @@ packet_routed: skb->nh.iph = iph; /* Transport layer set skb->h.foo itself. */ - if(opt && opt->optlen) { + if (opt && opt->optlen) { iph->ihl += opt->optlen >> 2; ip_options_build(skb, opt, inet->daddr, rt, 0); } - mtu = dst_pmtu(&rt->u.dst); - if (skb->len > mtu && (sk->sk_route_caps & NETIF_F_TSO)) { - unsigned int hlen; - - /* Hack zone: all this must be done by TCP. 
*/ - hlen = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2)); - skb_shinfo(skb)->tso_size = mtu - hlen; - skb_shinfo(skb)->tso_segs = - (skb->len - hlen + skb_shinfo(skb)->tso_size - 1)/ - skb_shinfo(skb)->tso_size - 1; - } - - ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs); + ip_select_ident_more(iph, &rt->u.dst, sk, + (skb_shinfo(skb)->gso_segs ?: 1) - 1); /* Add an IP checksum. */ ip_send_check(iph); @@ -404,9 +376,10 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - to->security = from->security; + dst_release(to->dst); to->dst = dst_clone(from->dst); to->dev = from->dev; + to->mark = from->mark; /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; @@ -415,21 +388,21 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif #ifdef CONFIG_NETFILTER - to->nfmark = from->nfmark; - to->nfcache = from->nfcache; /* Connection association is same as pre-frag packet */ nf_conntrack_put(to->nfct); to->nfct = from->nfct; nf_conntrack_get(to->nfct); + to->nfctinfo = from->nfctinfo; +#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) + to->ipvs_property = from->ipvs_property; +#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(to->nf_bridge); to->nf_bridge = from->nf_bridge; nf_bridge_get(to->nf_bridge); #endif -#ifdef CONFIG_NETFILTER_DEBUG - to->nf_debug = from->nf_debug; -#endif #endif + skb_copy_secmark(to, from); } /* @@ -446,9 +419,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) int ptr; struct net_device *dev; struct sk_buff *skb2; - unsigned int mtu, hlen, left, len, ll_rs; + unsigned int mtu, hlen, left, len, ll_rs, pad; int offset; - int not_last_frag; + __be16 not_last_frag; struct rtable *rt = (struct rtable*)skb->dst; int err = 0; @@ -461,8 +434,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) iph = skb->nh.iph; if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { + IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_pmtu(&rt->u.dst))); + htonl(dst_mtu(&rt->u.dst))); kfree_skb(skb); return -EMSGSIZE; } @@ -472,7 +446,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) */ hlen = iph->ihl * 4; - mtu = dst_pmtu(&rt->u.dst) - hlen; /* Size of data space */ + mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing @@ -501,6 +476,14 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) /* Partially cloned skb? */ if (skb_shared(frag)) goto slow_path; + + BUG_ON(frag->sk); + if (skb->sk) { + sock_hold(skb->sk); + frag->sk = skb->sk; + frag->destructor = sock_wfree; + skb->truesize -= frag->truesize; + } } /* Everything is OK. Generate! */ @@ -512,13 +495,14 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) skb->data_len = first_len - skb_headlen(skb); skb->len = first_len; iph->tot_len = htons(first_len); - iph->frag_off |= htons(IP_MF); + iph->frag_off = htons(IP_MF); ip_send_check(iph); for (;;) { /* Prepare header of the next frame, * before previous one went down. 
*/ if (frag) { + frag->ip_summed = CHECKSUM_NONE; frag->h.raw = frag->data; frag->nh.raw = __skb_push(frag, hlen); memcpy(frag->nh.raw, iph, hlen); @@ -537,6 +521,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) err = output(skb); + if (!err) + IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -563,14 +549,13 @@ slow_path: left = skb->len - hlen; /* Space per frame */ ptr = raw + hlen; /* Where to start from */ -#ifdef CONFIG_BRIDGE_NETFILTER /* for bridged IP traffic encapsulated inside f.e. a vlan header, - * we need to make room for the encapsulating header */ - ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb)); - mtu -= nf_bridge_pad(skb); -#else - ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev); -#endif + * we need to make room for the encapsulating header + */ + pad = nf_bridge_pad(skb); + ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad); + mtu -= pad; + /* * Fragment the datagram. */ @@ -597,7 +582,7 @@ slow_path: */ if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { - NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); + NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); err = -ENOMEM; goto fail; } @@ -660,9 +645,6 @@ slow_path: /* * Put this fragment into the sending queue. */ - - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); - iph->tot_len = htons(len + hlen); ip_send_check(iph); @@ -670,6 +652,8 @@ slow_path: err = output(skb2); if (err) goto fail; + + IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); } kfree_skb(skb); IP_INC_STATS(IPSTATS_MIB_FRAGOKS); @@ -681,16 +665,18 @@ fail: return err; } +EXPORT_SYMBOL(ip_fragment); + int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct iovec *iov = from; - if (skb->ip_summed == CHECKSUM_HW) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { if (memcpy_fromiovecend(to, iov, offset, len) < 0) return -EFAULT; } else { - unsigned int csum = 0; + __wsum csum = 0; if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, odd); @@ -698,21 +684,76 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk return 0; } -static inline unsigned int +static inline __wsum csum_page(struct page *page, int offset, int copy) { char *kaddr; - unsigned int csum; + __wsum csum; kaddr = kmap(page); csum = csum_partial(kaddr + offset, copy, 0); kunmap(page); return csum; } +static inline int ip_ufo_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int hh_len, int fragheaderlen, + int transhdrlen, int mtu,unsigned int flags) +{ + struct sk_buff *skb; + int err; + + /* There is support for UDP fragmentation offload by network + * device, so create one single skb packet containing complete + * udp datagram + */ + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { + skb = sock_alloc_send_skb(sk, + hh_len + fragheaderlen + transhdrlen + 20, + (flags & MSG_DONTWAIT), &err); + + if (skb == NULL) + return err; + + /* reserve space for Hardware header */ + skb_reserve(skb, hh_len); + + /* create space for UDP/IP header */ + skb_put(skb,fragheaderlen + transhdrlen); + + /* initialize network header pointer */ + skb->nh.raw = skb->data; + + /* initialize protocol header pointer */ + skb->h.raw = skb->data + fragheaderlen; + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum = 0; + sk->sk_sndmsg_off = 0; + } + + err = skb_append_datato_frags(sk,skb, 
getfrag, from, + (length - transhdrlen)); + if (!err) { + /* specify the length of each IP datagram fragment*/ + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + __skb_queue_tail(&sk->sk_write_queue, skb); + + return 0; + } + /* There is not enough support do UFO , + * so follow normal path + */ + kfree_skb(skb); + return err; +} + /* * ip_append_data() and ip_append_page() can make one large IP datagram * from many pieces of data. Each pieces will be holded on the socket - * until ip_push_pending_frames() is called. Eache pieces can be a page + * until ip_push_pending_frames() is called. Each piece can be a page * or non-page data. * * Not only UDP, other transport protocols - e.g. raw sockets - can use @@ -727,7 +768,7 @@ int ip_append_data(struct sock *sk, struct ipcm_cookie *ipc, struct rtable *rt, unsigned int flags) { - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; struct ip_options *opt = NULL; @@ -759,7 +800,7 @@ int ip_append_data(struct sock *sk, inet->cork.addr = ipc->addr; } dst_hold(&rt->u.dst); - inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst); + inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); inet->cork.rt = rt; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; @@ -780,7 +821,7 @@ int ip_append_data(struct sock *sk, hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); - maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen; + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; if (inet->cork.length + length > 0xFFFF - fragheaderlen) { ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen); @@ -792,46 +833,64 @@ int ip_append_data(struct sock *sk, * it won't be fragmented in the future. */ if (transhdrlen && - length + fragheaderlen <= maxfraglen && - rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && + length + fragheaderlen <= mtu && + rt->u.dst.dev->features & NETIF_F_ALL_CSUM && !exthdrlen) - csummode = CHECKSUM_HW; + csummode = CHECKSUM_PARTIAL; inet->cork.length += length; + if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && + (rt->u.dst.dev->features & NETIF_F_UFO)) { + + err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, + fragheaderlen, transhdrlen, mtu, + flags); + if (err) + goto error; + return 0; + } /* So, what's going on in the loop below? * * We use calculated fragment length to generate chained skb, * each of segments is IP fragment ready for sending to network after * adding appropriate IP header. - * - * Mistake is: - * - * If mtu-fragheaderlen is not 0 modulo 8, we generate additional - * small fragment of length (mtu-fragheaderlen)%8, even though - * it is not necessary. Not a big bug, but needs a fix. */ if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) goto alloc_new_skb; while (length > 0) { - if ((copy = maxfraglen - skb->len) <= 0) { + /* Check if the remaining data fits into current packet. */ + copy = mtu - skb->len; + if (copy < length) + copy = maxfraglen - skb->len; + if (copy <= 0) { char *data; unsigned int datalen; unsigned int fraglen; + unsigned int fraggap; unsigned int alloclen; - BUG_TRAP(copy == 0); - + struct sk_buff *skb_prev; alloc_new_skb: - datalen = maxfraglen - fragheaderlen; - if (datalen > length) - datalen = length; + skb_prev = skb; + if (skb_prev) + fraggap = skb_prev->len - maxfraglen; + else + fraggap = 0; + /* + * If remaining data exceeds the mtu, + * we know we need more fragment(s). 
+ */ + datalen = length + fraggap; + if (datalen > mtu - fragheaderlen) + datalen = maxfraglen - fragheaderlen; fraglen = datalen + fragheaderlen; + if ((flags & MSG_MORE) && !(rt->u.dst.dev->features&NETIF_F_SG)) - alloclen = maxfraglen; + alloclen = mtu; else alloclen = datalen + fragheaderlen; @@ -840,7 +899,7 @@ alloc_new_skb: * because we have no idea what fragment will be * the last. */ - if (datalen == length) + if (datalen == length + fraggap) alloclen += rt->u.dst.trailer_len; if (transhdrlen) { @@ -875,15 +934,25 @@ alloc_new_skb: data += fragheaderlen; skb->h.raw = data + exthdrlen; - copy = datalen - transhdrlen; - if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) { + if (fraggap) { + skb->csum = skb_copy_and_csum_bits( + skb_prev, maxfraglen, + data + transhdrlen, fraggap, 0); + skb_prev->csum = csum_sub(skb_prev->csum, + skb->csum); + data += fraggap; + pskb_trim_unique(skb_prev, maxfraglen); + } + + copy = datalen - transhdrlen - fraggap; + if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; } offset += copy; - length -= datalen; + length -= datalen - fraggap; transhdrlen = 0; exthdrlen = 0; csummode = CHECKSUM_NONE; @@ -970,7 +1039,7 @@ error: ssize_t ip_append_page(struct sock *sk, struct page *page, int offset, size_t size, int flags) { - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; struct rtable *rt; struct ip_options *opt = NULL; @@ -978,7 +1047,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, int mtu; int len; int err; - unsigned int maxfraglen, fragheaderlen; + unsigned int maxfraglen, fragheaderlen, fraggap; if (inet->hdrincl) return -EPERM; @@ -1000,7 +1069,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, mtu = inet->cork.fragsize; fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); - maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen; + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; if (inet->cork.length + size > 0xFFFF - fragheaderlen) { ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu); @@ -1011,16 +1080,36 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, return -EINVAL; inet->cork.length += size; + if ((sk->sk_protocol == IPPROTO_UDP) && + (rt->u.dst.dev->features & NETIF_F_UFO)) { + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + } + while (size > 0) { int i; - if ((len = maxfraglen - skb->len) <= 0) { + + if (skb_is_gso(skb)) + len = size; + else { + + /* Check if the remaining data fits into current packet. */ + len = mtu - skb->len; + if (len < size) + len = maxfraglen - skb->len; + } + if (len <= 0) { + struct sk_buff *skb_prev; char *data; struct iphdr *iph; - BUG_TRAP(len == 0); + int alloclen; - skb = sock_wmalloc(sk, fragheaderlen + hh_len + 15, 1, - sk->sk_allocation); + skb_prev = skb; + fraggap = skb_prev->len - maxfraglen; + + alloclen = fragheaderlen + hh_len + fraggap + 15; + skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); if (unlikely(!skb)) { err = -ENOBUFS; goto error; @@ -1036,11 +1125,20 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, /* * Find where to start putting bytes. 
*/ - data = skb_put(skb, fragheaderlen); + data = skb_put(skb, fragheaderlen + fraggap); skb->nh.iph = iph = (struct iphdr *)data; data += fragheaderlen; skb->h.raw = data; + if (fraggap) { + skb->csum = skb_copy_and_csum_bits( + skb_prev, maxfraglen, + data, fraggap, 0); + skb_prev->csum = csum_sub(skb_prev->csum, + skb->csum); + pskb_trim_unique(skb_prev, maxfraglen); + } + /* * Put the packet on the pending queue. */ @@ -1062,7 +1160,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, } if (skb->ip_summed == CHECKSUM_NONE) { - unsigned int csum; + __wsum csum; csum = csum_page(page, offset, len); skb->csum = csum_block_add(skb->csum, csum, skb->len); } @@ -1088,11 +1186,11 @@ int ip_push_pending_frames(struct sock *sk) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct ip_options *opt = NULL; struct rtable *rt = inet->cork.rt; struct iphdr *iph; - int df = 0; + __be16 df = 0; __u8 ttl; int err = 0; @@ -1126,7 +1224,8 @@ int ip_push_pending_frames(struct sock *sk) * If local_df is set too, we still allow to fragment this frame * locally. */ if (inet->pmtudisc == IP_PMTUDISC_DO || - (!skb_shinfo(skb)->frag_list && ip_dont_fragment(sk, &rt->u.dst))) + (skb->len <= dst_mtu(&rt->u.dst) && + ip_dont_fragment(sk, &rt->u.dst))) df = htons(IP_DF); if (inet->cork.flags & IPCORK_OPT) @@ -1147,11 +1246,7 @@ int ip_push_pending_frames(struct sock *sk) iph->tos = inet->tos; iph->tot_len = htons(skb->len); iph->frag_off = df; - if (!df) { - __ip_select_ident(iph, &rt->u.dst, 0); - } else { - iph->id = htons(inet->id++); - } + ip_select_ident(iph, &rt->u.dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; @@ -1173,10 +1268,8 @@ int ip_push_pending_frames(struct sock *sk) out: inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; @@ -1193,17 +1286,15 @@ error: */ void ip_flush_pending_frames(struct sock *sk) { - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; @@ -1217,7 +1308,7 @@ void ip_flush_pending_frames(struct sock *sk) static int ip_reply_glue_bits(void *dptr, char *to, int offset, int len, int odd, struct sk_buff *skb) { - unsigned int csum; + __wsum csum; csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0); skb->csum = csum_block_add(skb->csum, csum, odd); @@ -1236,13 +1327,13 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, unsigned int len) { - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct { struct ip_options opt; char data[40]; } replyopts; struct ipcm_cookie ipc; - u32 daddr; + __be32 daddr; struct rtable *rt = (struct rtable*)skb->dst; if (ip_options_echo(&replyopts.opt, skb)) @@ -1268,6 +1359,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar { .sport = skb->h.th->dest, .dport = skb->h.th->source } }, .proto = sk->sk_protocol }; + 
security_skb_classify_flow(skb, &fl); if (ip_route_output_key(&rt, &fl)) return; } @@ -1286,7 +1378,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar &ipc, rt, MSG_DONTWAIT); if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { if (arg->csumoffset >= 0) - *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum)); + *((__sum16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum)); skb->ip_summed = CHECKSUM_NONE; ip_push_pending_frames(sk); } @@ -1296,23 +1388,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar ip_rt_put(rt); } -/* - * IP protocol layer initialiser - */ - -static struct packet_type ip_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = ip_rcv, -}; - -/* - * IP registers the packet type and then calls the subprotocol initialisers - */ - void __init ip_init(void) { - dev_add_pack(&ip_packet_type); - ip_rt_init(); inet_initpeers(); @@ -1321,12 +1398,6 @@ void __init ip_init(void) #endif } -EXPORT_SYMBOL(ip_finish_output); -EXPORT_SYMBOL(ip_fragment); EXPORT_SYMBOL(ip_generic_getfrag); EXPORT_SYMBOL(ip_queue_xmit); EXPORT_SYMBOL(ip_send_check); - -#ifdef CONFIG_SYSCTL -EXPORT_SYMBOL(sysctl_ip_default_ttl); -#endif
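
The fragmentation paths in this patch (ip_fragment() and the ip_append_data() rewrite) size every fragment except the last to a payload that is a multiple of 8 bytes, because the IPv4 fragment-offset field counts 8-byte units; that is what maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen computes. A minimal user-space sketch of that arithmetic, with illustrative MTU/header values and no kernel code:

#include <stdio.h>

/* Fragment sizing as in ip_fragment()/ip_append_data(): every fragment
 * except the last carries a payload that is a multiple of 8 bytes, since
 * the IP fragment offset is expressed in 8-byte blocks. */
int main(void)
{
	unsigned int mtu = 1500;           /* illustrative link MTU */
	unsigned int fragheaderlen = 24;   /* 20-byte header + 4 bytes of options (illustrative) */
	unsigned int datalen = 4000;       /* illustrative datagram payload */

	/* largest legal fragment: header plus payload rounded down to 8 bytes */
	unsigned int maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
	unsigned int per_frag = maxfraglen - fragheaderlen;
	unsigned int offset = 0;

	while (datalen > 0) {
		unsigned int len = datalen > per_frag ? per_frag : datalen;

		printf("frag: offset=%u len=%u MF=%d\n",
		       offset, len, datalen > per_frag);
		offset += len;
		datalen -= len;
	}
	return 0;
}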
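
ip_send_check() and the csum_fold(csum_add(...)) call in ip_send_reply() are optimized forms of the standard Internet checksum: a one's-complement sum of 16-bit words with the carries folded back in, then inverted. A standalone sketch of that calculation over an example IPv4 header (illustrative field values, not kernel code):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Internet checksum in plain form: sum 16-bit big-endian words, fold the
 * carries back into the low 16 bits (what csum_fold() does), complement. */
static uint16_t inet_checksum(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint32_t sum = 0;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)                      /* odd trailing byte */
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)             /* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* 20-byte IPv4 header with the checksum field zeroed (illustrative) */
	uint8_t iph[20] = {
		0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00,
		0x40, 0x01, 0x00, 0x00, 0xc0, 0xa8, 0x00, 0x01,
		0xc0, 0xa8, 0x00, 0xc7
	};

	printf("header checksum: 0x%04x\n", inet_checksum(iph, sizeof(iph)));
	return 0;
}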
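
The ip_queue_xmit() hunk drops the open-coded TSO setup and instead passes (skb_shinfo(skb)->gso_segs ?: 1) - 1 to ip_select_ident_more(), which appears to reserve one extra IP ID for each additional on-wire segment a GSO packet will become. A sketch of that accounting, assuming the usual ceiling-division segment count (illustrative values only, not kernel code):

#include <stdio.h>

/* Segment accounting behind the ip_select_ident_more() call: a GSO skb
 * that the device will split into N segments needs N consecutive IP IDs,
 * so (gso_segs ?: 1) - 1 extra ID slots are requested. */
int main(void)
{
	unsigned int payload = 23000;   /* illustrative super-packet payload */
	unsigned int gso_size = 1448;   /* illustrative MSS */

	unsigned int gso_segs = (payload + gso_size - 1) / gso_size; /* ceiling division */
	unsigned int extra_ids = (gso_segs ? gso_segs : 1) - 1;

	printf("gso_segs=%u, extra IP IDs reserved=%u\n", gso_segs, extra_ids);
	return 0;
}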