X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fipv4%2Fip_output.c;h=a2ede167e045b32340e3a1346b8ccde641cb6b68;hb=16c70f8c1b54b61c3b951b6fb220df250fe09b32;hp=47054f8a3a33f449e93f6f8bd96603641d07ade0;hpb=6a77f38946aaee1cd85eeec6cf4229b204c15071;p=linux-2.6.git diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 47054f8a3..a2ede167e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -7,7 +7,7 @@ * * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $ * - * Authors: Ross Biro, + * Authors: Ross Biro * Fred N. van Kempen, * Donald Becker, * Alan Cox, @@ -53,7 +53,6 @@ #include #include #include -#include #include #include @@ -69,13 +68,11 @@ #include #include #include -#include -#include +#include #include #include #include #include -#include #include #include #include @@ -84,12 +81,8 @@ #include #include #include +#include -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr; int sysctl_ip_default_ttl = IPDEFTTL; /* Generate a checksum for an outgoing IP datagram. */ @@ -107,10 +100,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; BUG_TRAP(newskb->dst); - -#ifdef CONFIG_NETFILTER_DEBUG - nf_debug_ip_loopback_xmit(newskb); -#endif netif_rx(newskb); return 0; } @@ -169,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, dst_output); } +EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); + static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; @@ -191,10 +182,6 @@ static inline int ip_finish_output2(struct sk_buff *skb) skb = skb2; } -#ifdef CONFIG_NETFILTER_DEBUG - nf_debug_ip_finish_output2(skb); -#endif /*CONFIG_NETFILTER_DEBUG*/ - if (hh) { int hh_alen; @@ -213,15 +200,19 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } -int ip_finish_output(struct sk_buff *skb) +static inline int ip_finish_output(struct sk_buff *skb) { - struct net_device *dev = skb->dst->dev; - - skb->dev = dev; - skb->protocol = htons(ETH_P_IP); - - return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, - ip_finish_output2); +#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) + /* Policy lookup after SNAT yielded a new policy */ + if (skb->dst->xfrm != NULL) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif + if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) + return ip_fragment(skb, ip_finish_output2); + else + return ip_finish_output2(skb); } int ip_mc_output(struct sk_buff *skb) @@ -278,20 +269,23 @@ int ip_mc_output(struct sk_buff *skb) newskb->dev, ip_dev_loopback_xmit); } - if (skb->len > dst_pmtu(&rt->u.dst)) - return ip_fragment(skb, ip_finish_output); - else - return ip_finish_output(skb); + return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } int ip_output(struct sk_buff *skb) { + struct net_device *dev = skb->dst->dev; + IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - if (skb->len > dst_pmtu(skb->dst) && !skb_shinfo(skb)->tso_size) - return ip_fragment(skb, ip_finish_output); - else - return ip_finish_output(skb); + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + + return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } int ip_queue_xmit(struct sk_buff *skb, int ipfragok) @@ -337,8 +331,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) if (ip_route_output_flow(&rt, &fl, sk, 0)) goto no_route; } - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); } skb->dst = dst_clone(&rt->u.dst); @@ -366,7 +359,8 @@ packet_routed: ip_options_build(skb, opt, inet->daddr, rt, 0); } - ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs); + ip_select_ident_more(iph, &rt->u.dst, sk, + (skb_shinfo(skb)->gso_segs ?: 1) - 1); /* Add an IP checksum. */ ip_send_check(iph); @@ -388,7 +382,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - to->security = from->security; dst_release(to->dst); to->dst = dst_clone(from->dst); to->dev = from->dev; @@ -401,21 +394,21 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) #endif #ifdef CONFIG_NETFILTER to->nfmark = from->nfmark; - to->nfcache = from->nfcache; /* Connection association is same as pre-frag packet */ nf_conntrack_put(to->nfct); to->nfct = from->nfct; nf_conntrack_get(to->nfct); to->nfctinfo = from->nfctinfo; +#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) + to->ipvs_property = from->ipvs_property; +#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(to->nf_bridge); to->nf_bridge = from->nf_bridge; nf_bridge_get(to->nf_bridge); #endif -#ifdef CONFIG_NETFILTER_DEBUG - to->nf_debug = from->nf_debug; -#endif #endif + skb_copy_secmark(to, from); } /* @@ -434,7 +427,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) struct sk_buff *skb2; unsigned int mtu, hlen, left, len, ll_rs; int offset; - int not_last_frag; + __be16 not_last_frag; struct rtable *rt = (struct rtable*)skb->dst; int err = 0; @@ -447,8 +440,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) iph = skb->nh.iph; if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { + IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_pmtu(&rt->u.dst))); + htonl(dst_mtu(&rt->u.dst))); kfree_skb(skb); return -EMSGSIZE; } @@ -458,7 +452,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) */ hlen = iph->ihl * 4; - mtu = dst_pmtu(&rt->u.dst) - hlen; /* Size of data space */ + mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing @@ -487,6 +482,14 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) /* Partially cloned skb? */ if (skb_shared(frag)) goto slow_path; + + BUG_ON(frag->sk); + if (skb->sk) { + sock_hold(skb->sk); + frag->sk = skb->sk; + frag->destructor = sock_wfree; + skb->truesize -= frag->truesize; + } } /* Everything is OK. Generate! */ @@ -498,7 +501,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) skb->data_len = first_len - skb_headlen(skb); skb->len = first_len; iph->tot_len = htons(first_len); - iph->frag_off |= htons(IP_MF); + iph->frag_off = htons(IP_MF); ip_send_check(iph); for (;;) { @@ -524,6 +527,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) err = output(skb); + if (!err) + IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -584,7 +589,7 @@ slow_path: */ if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { - NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); + NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); err = -ENOMEM; goto fail; } @@ -647,9 +652,6 @@ slow_path: /* * Put this fragment into the sending queue. */ - - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); - iph->tot_len = htons(len + hlen); ip_send_check(iph); @@ -657,6 +659,8 @@ slow_path: err = output(skb2); if (err) goto fail; + + IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); } kfree_skb(skb); IP_INC_STATS(IPSTATS_MIB_FRAGOKS); @@ -668,6 +672,8 @@ fail: return err; } +EXPORT_SYMBOL(ip_fragment); + int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { @@ -696,6 +702,61 @@ csum_page(struct page *page, int offset, int copy) return csum; } +static inline int ip_ufo_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int hh_len, int fragheaderlen, + int transhdrlen, int mtu,unsigned int flags) +{ + struct sk_buff *skb; + int err; + + /* There is support for UDP fragmentation offload by network + * device, so create one single skb packet containing complete + * udp datagram + */ + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { + skb = sock_alloc_send_skb(sk, + hh_len + fragheaderlen + transhdrlen + 20, + (flags & MSG_DONTWAIT), &err); + + if (skb == NULL) + return err; + + /* reserve space for Hardware header */ + skb_reserve(skb, hh_len); + + /* create space for UDP/IP header */ + skb_put(skb,fragheaderlen + transhdrlen); + + /* initialize network header pointer */ + skb->nh.raw = skb->data; + + /* initialize protocol header pointer */ + skb->h.raw = skb->data + fragheaderlen; + + skb->ip_summed = CHECKSUM_HW; + skb->csum = 0; + sk->sk_sndmsg_off = 0; + } + + err = skb_append_datato_frags(sk,skb, getfrag, from, + (length - transhdrlen)); + if (!err) { + /* specify the length of each IP datagram fragment*/ + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + __skb_queue_tail(&sk->sk_write_queue, skb); + + return 0; + } + /* There is not enough support do UFO , + * so follow normal path + */ + kfree_skb(skb); + return err; +} + /* * ip_append_data() and ip_append_page() can make one large IP datagram * from many pieces of data. Each pieces will be holded on the socket @@ -746,7 +807,7 @@ int ip_append_data(struct sock *sk, inet->cork.addr = ipc->addr; } dst_hold(&rt->u.dst); - inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst); + inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); inet->cork.rt = rt; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; @@ -780,11 +841,21 @@ int ip_append_data(struct sock *sk, */ if (transhdrlen && length + fragheaderlen <= mtu && - rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && + rt->u.dst.dev->features & NETIF_F_ALL_CSUM && !exthdrlen) csummode = CHECKSUM_HW; inet->cork.length += length; + if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && + (rt->u.dst.dev->features & NETIF_F_UFO)) { + + err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, + fragheaderlen, transhdrlen, mtu, + flags); + if (err) + goto error; + return 0; + } /* So, what's going on in the loop below? * @@ -835,7 +906,7 @@ alloc_new_skb: * because we have no idea what fragment will be * the last. */ - if (datalen == length) + if (datalen == length + fraggap) alloclen += rt->u.dst.trailer_len; if (transhdrlen) { @@ -877,7 +948,7 @@ alloc_new_skb: skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); data += fraggap; - skb_trim(skb_prev, maxfraglen); + pskb_trim_unique(skb_prev, maxfraglen); } copy = datalen - transhdrlen - fraggap; @@ -1016,14 +1087,25 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, return -EINVAL; inet->cork.length += size; + if ((sk->sk_protocol == IPPROTO_UDP) && + (rt->u.dst.dev->features & NETIF_F_UFO)) { + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + } + while (size > 0) { int i; - /* Check if the remaining data fits into current packet. */ - len = mtu - skb->len; - if (len < size) - len = maxfraglen - skb->len; + if (skb_is_gso(skb)) + len = size; + else { + + /* Check if the remaining data fits into current packet. */ + len = mtu - skb->len; + if (len < size) + len = maxfraglen - skb->len; + } if (len <= 0) { struct sk_buff *skb_prev; char *data; @@ -1031,10 +1113,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, int alloclen; skb_prev = skb; - if (skb_prev) - fraggap = skb_prev->len - maxfraglen; - else - fraggap = 0; + fraggap = skb_prev->len - maxfraglen; alloclen = fragheaderlen + hh_len + fraggap + 15; skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); @@ -1064,7 +1143,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, data, fraggap, 0); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); - skb_trim(skb_prev, maxfraglen); + pskb_trim_unique(skb_prev, maxfraglen); } /* @@ -1118,7 +1197,7 @@ int ip_push_pending_frames(struct sock *sk) struct ip_options *opt = NULL; struct rtable *rt = inet->cork.rt; struct iphdr *iph; - int df = 0; + __be16 df = 0; __u8 ttl; int err = 0; @@ -1152,7 +1231,8 @@ int ip_push_pending_frames(struct sock *sk) * If local_df is set too, we still allow to fragment this frame * locally. */ if (inet->pmtudisc == IP_PMTUDISC_DO || - (!skb_shinfo(skb)->frag_list && ip_dont_fragment(sk, &rt->u.dst))) + (skb->len <= dst_mtu(&rt->u.dst) && + ip_dont_fragment(sk, &rt->u.dst))) df = htons(IP_DF); if (inet->cork.flags & IPCORK_OPT) @@ -1173,11 +1253,7 @@ int ip_push_pending_frames(struct sock *sk) iph->tos = inet->tos; iph->tot_len = htons(skb->len); iph->frag_off = df; - if (!df) { - __ip_select_ident(iph, &rt->u.dst, 0); - } else { - iph->id = htons(inet->id++); - } + ip_select_ident(iph, &rt->u.dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; @@ -1199,10 +1275,8 @@ int ip_push_pending_frames(struct sock *sk) out: inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; @@ -1226,10 +1300,8 @@ void ip_flush_pending_frames(struct sock *sk) kfree_skb(skb); inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; @@ -1322,23 +1394,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar ip_rt_put(rt); } -/* - * IP protocol layer initialiser - */ - -static struct packet_type ip_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = ip_rcv, -}; - -/* - * IP registers the packet type and then calls the subprotocol initialisers - */ - void __init ip_init(void) { - dev_add_pack(&ip_packet_type); - ip_rt_init(); inet_initpeers(); @@ -1347,12 +1404,6 @@ void __init ip_init(void) #endif } -EXPORT_SYMBOL(ip_finish_output); -EXPORT_SYMBOL(ip_fragment); EXPORT_SYMBOL(ip_generic_getfrag); EXPORT_SYMBOL(ip_queue_xmit); EXPORT_SYMBOL(ip_send_check); - -#ifdef CONFIG_SYSCTL -EXPORT_SYMBOL(sysctl_ip_default_ttl); -#endif