X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fipv4%2Fip_output.c;h=cff9c3a72daf254013b7ae80cb5761e04a733a2f;hb=43bc926fffd92024b46cafaf7350d669ba9ca884;hp=8ef2b82630a28a6d8029b3c08c8785e677794bd5;hpb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;p=linux-2.6.git diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8ef2b8263..cff9c3a72 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -7,7 +7,7 @@ * * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $ * - * Authors: Ross Biro, + * Authors: Ross Biro * Fred N. van Kempen, * Donald Becker, * Alan Cox, @@ -69,13 +69,11 @@ #include #include #include -#include -#include +#include #include #include #include #include -#include #include #include #include @@ -84,12 +82,8 @@ #include #include #include +#include -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr; int sysctl_ip_default_ttl = IPDEFTTL; /* Generate a checksum for an outgoing IP datagram. */ @@ -107,15 +101,11 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; BUG_TRAP(newskb->dst); - -#ifdef CONFIG_NETFILTER_DEBUG - nf_debug_ip_loopback_xmit(newskb); -#endif netif_rx(newskb); return 0; } -static inline int ip_select_ttl(struct inet_opt *inet, struct dst_entry *dst) +static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { int ttl = inet->uc_ttl; @@ -131,7 +121,7 @@ static inline int ip_select_ttl(struct inet_opt *inet, struct dst_entry *dst) int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, struct ip_options *opt) { - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct rtable *rt = (struct rtable *)skb->dst; struct iphdr *iph; @@ -169,6 +159,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, dst_output); } +EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); + static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; @@ -191,10 +183,6 @@ static inline int ip_finish_output2(struct sk_buff *skb) skb = skb2; } -#ifdef CONFIG_NETFILTER_DEBUG - nf_debug_ip_finish_output2(skb); -#endif /*CONFIG_NETFILTER_DEBUG*/ - if (hh) { int hh_alen; @@ -213,20 +201,24 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } -int ip_finish_output(struct sk_buff *skb) +static inline int ip_finish_output(struct sk_buff *skb) { - struct net_device *dev = skb->dst->dev; - - skb->dev = dev; - skb->protocol = htons(ETH_P_IP); - - return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, - ip_finish_output2); +#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) + /* Policy lookup after SNAT yielded a new policy */ + if (skb->dst->xfrm != NULL) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif + if (skb->len > dst_mtu(skb->dst) && + !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) + return ip_fragment(skb, ip_finish_output2); + else + return ip_finish_output2(skb); } -int ip_mc_output(struct sk_buff **pskb) +int ip_mc_output(struct sk_buff *skb) { - struct sk_buff *skb = *pskb; struct sock *sk = skb->sk; struct rtable *rt = (struct rtable*)skb->dst; struct net_device *dev = rt->u.dst.dev; @@ -279,29 +271,29 @@ int ip_mc_output(struct sk_buff **pskb) newskb->dev, ip_dev_loopback_xmit); } - if (skb->len > dst_pmtu(&rt->u.dst) || skb_shinfo(skb)->frag_list) - return ip_fragment(skb, ip_finish_output); - else - return ip_finish_output(skb); + return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sk_buff **pskb) +int ip_output(struct sk_buff *skb) { - struct sk_buff *skb = *pskb; + struct net_device *dev = skb->dst->dev; IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - if ((skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list) && - !skb_shinfo(skb)->tso_size) - return ip_fragment(skb, ip_finish_output); - else - return ip_finish_output(skb); + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + + return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } int ip_queue_xmit(struct sk_buff *skb, int ipfragok) { struct sock *sk = skb->sk; - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct ip_options *opt = inet->opt; struct rtable *rt; struct iphdr *iph; @@ -341,8 +333,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) if (ip_route_output_flow(&rt, &fl, sk, 0)) goto no_route; } - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); } skb->dst = dst_clone(&rt->u.dst); @@ -370,7 +361,8 @@ packet_routed: ip_options_build(skb, opt, inet->daddr, rt, 0); } - ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs); + ip_select_ident_more(iph, &rt->u.dst, sk, + (skb_shinfo(skb)->tso_segs ?: 1) - 1); /* Add an IP checksum. */ ip_send_check(iph); @@ -392,7 +384,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - to->security = from->security; + dst_release(to->dst); to->dst = dst_clone(from->dst); to->dev = from->dev; @@ -404,20 +396,19 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) #endif #ifdef CONFIG_NETFILTER to->nfmark = from->nfmark; - to->nfcache = from->nfcache; /* Connection association is same as pre-frag packet */ nf_conntrack_put(to->nfct); to->nfct = from->nfct; nf_conntrack_get(to->nfct); to->nfctinfo = from->nfctinfo; +#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) + to->ipvs_property = from->ipvs_property; +#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(to->nf_bridge); to->nf_bridge = from->nf_bridge; nf_bridge_get(to->nf_bridge); #endif -#ifdef CONFIG_NETFILTER_DEBUG - to->nf_debug = from->nf_debug; -#endif #endif } @@ -437,7 +428,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) struct sk_buff *skb2; unsigned int mtu, hlen, left, len, ll_rs; int offset; - int not_last_frag; + __be16 not_last_frag; struct rtable *rt = (struct rtable*)skb->dst; int err = 0; @@ -451,7 +442,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_pmtu(&rt->u.dst))); + htonl(dst_mtu(&rt->u.dst))); kfree_skb(skb); return -EMSGSIZE; } @@ -461,7 +452,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) */ hlen = iph->ihl * 4; - mtu = dst_pmtu(&rt->u.dst) - hlen; /* Size of data space */ + mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing @@ -490,6 +482,14 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) /* Partially cloned skb? */ if (skb_shared(frag)) goto slow_path; + + BUG_ON(frag->sk); + if (skb->sk) { + sock_hold(skb->sk); + frag->sk = skb->sk; + frag->destructor = sock_wfree; + skb->truesize -= frag->truesize; + } } /* Everything is OK. Generate! */ @@ -501,13 +501,14 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) skb->data_len = first_len - skb_headlen(skb); skb->len = first_len; iph->tot_len = htons(first_len); - iph->frag_off |= htons(IP_MF); + iph->frag_off = htons(IP_MF); ip_send_check(iph); for (;;) { /* Prepare header of the next frame, * before previous one went down. */ if (frag) { + frag->ip_summed = CHECKSUM_NONE; frag->h.raw = frag->data; frag->nh.raw = __skb_push(frag, hlen); memcpy(frag->nh.raw, iph, hlen); @@ -586,7 +587,7 @@ slow_path: */ if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { - NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); + NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); err = -ENOMEM; goto fail; } @@ -670,6 +671,8 @@ fail: return err; } +EXPORT_SYMBOL(ip_fragment); + int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { @@ -698,6 +701,60 @@ csum_page(struct page *page, int offset, int copy) return csum; } +static inline int ip_ufo_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int hh_len, int fragheaderlen, + int transhdrlen, int mtu,unsigned int flags) +{ + struct sk_buff *skb; + int err; + + /* There is support for UDP fragmentation offload by network + * device, so create one single skb packet containing complete + * udp datagram + */ + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { + skb = sock_alloc_send_skb(sk, + hh_len + fragheaderlen + transhdrlen + 20, + (flags & MSG_DONTWAIT), &err); + + if (skb == NULL) + return err; + + /* reserve space for Hardware header */ + skb_reserve(skb, hh_len); + + /* create space for UDP/IP header */ + skb_put(skb,fragheaderlen + transhdrlen); + + /* initialize network header pointer */ + skb->nh.raw = skb->data; + + /* initialize protocol header pointer */ + skb->h.raw = skb->data + fragheaderlen; + + skb->ip_summed = CHECKSUM_HW; + skb->csum = 0; + sk->sk_sndmsg_off = 0; + } + + err = skb_append_datato_frags(sk,skb, getfrag, from, + (length - transhdrlen)); + if (!err) { + /* specify the length of each IP datagram fragment*/ + skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + __skb_queue_tail(&sk->sk_write_queue, skb); + + return 0; + } + /* There is not enough support do UFO , + * so follow normal path + */ + kfree_skb(skb); + return err; +} + /* * ip_append_data() and ip_append_page() can make one large IP datagram * from many pieces of data. Each pieces will be holded on the socket @@ -716,7 +773,7 @@ int ip_append_data(struct sock *sk, struct ipcm_cookie *ipc, struct rtable *rt, unsigned int flags) { - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; struct ip_options *opt = NULL; @@ -748,7 +805,7 @@ int ip_append_data(struct sock *sk, inet->cork.addr = ipc->addr; } dst_hold(&rt->u.dst); - inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst); + inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); inet->cork.rt = rt; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; @@ -787,6 +844,16 @@ int ip_append_data(struct sock *sk, csummode = CHECKSUM_HW; inet->cork.length += length; + if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && + (rt->u.dst.dev->features & NETIF_F_UFO)) { + + err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, + fragheaderlen, transhdrlen, mtu, + flags); + if (err) + goto error; + return 0; + } /* So, what's going on in the loop below? * @@ -837,7 +904,7 @@ alloc_new_skb: * because we have no idea what fragment will be * the last. */ - if (datalen == length) + if (datalen == length + fraggap) alloclen += rt->u.dst.trailer_len; if (transhdrlen) { @@ -977,7 +1044,7 @@ error: ssize_t ip_append_page(struct sock *sk, struct page *page, int offset, size_t size, int flags) { - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; struct rtable *rt; struct ip_options *opt = NULL; @@ -1018,14 +1085,23 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, return -EINVAL; inet->cork.length += size; + if ((sk->sk_protocol == IPPROTO_UDP) && + (rt->u.dst.dev->features & NETIF_F_UFO)) + skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + while (size > 0) { int i; - /* Check if the remaining data fits into current packet. */ - len = mtu - skb->len; - if (len < size) - len = maxfraglen - skb->len; + if (skb_shinfo(skb)->ufo_size) + len = size; + else { + + /* Check if the remaining data fits into current packet. */ + len = mtu - skb->len; + if (len < size) + len = maxfraglen - skb->len; + } if (len <= 0) { struct sk_buff *skb_prev; char *data; @@ -1033,10 +1109,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, int alloclen; skb_prev = skb; - if (skb_prev) - fraggap = skb_prev->len - maxfraglen; - else - fraggap = 0; + fraggap = skb_prev->len - maxfraglen; alloclen = fragheaderlen + hh_len + fraggap + 15; skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); @@ -1116,11 +1189,11 @@ int ip_push_pending_frames(struct sock *sk) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct ip_options *opt = NULL; struct rtable *rt = inet->cork.rt; struct iphdr *iph; - int df = 0; + __be16 df = 0; __u8 ttl; int err = 0; @@ -1154,7 +1227,8 @@ int ip_push_pending_frames(struct sock *sk) * If local_df is set too, we still allow to fragment this frame * locally. */ if (inet->pmtudisc == IP_PMTUDISC_DO || - (!skb_shinfo(skb)->frag_list && ip_dont_fragment(sk, &rt->u.dst))) + (skb->len <= dst_mtu(&rt->u.dst) && + ip_dont_fragment(sk, &rt->u.dst))) df = htons(IP_DF); if (inet->cork.flags & IPCORK_OPT) @@ -1175,11 +1249,7 @@ int ip_push_pending_frames(struct sock *sk) iph->tos = inet->tos; iph->tot_len = htons(skb->len); iph->frag_off = df; - if (!df) { - __ip_select_ident(iph, &rt->u.dst, 0); - } else { - iph->id = htons(inet->id++); - } + ip_select_ident(iph, &rt->u.dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; @@ -1201,10 +1271,8 @@ int ip_push_pending_frames(struct sock *sk) out: inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; @@ -1221,17 +1289,15 @@ error: */ void ip_flush_pending_frames(struct sock *sk) { - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; @@ -1264,7 +1330,7 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, unsigned int len) { - struct inet_opt *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct { struct ip_options opt; char data[40]; @@ -1324,23 +1390,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar ip_rt_put(rt); } -/* - * IP protocol layer initialiser - */ - -static struct packet_type ip_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = ip_rcv, -}; - -/* - * IP registers the packet type and then calls the subprotocol initialisers - */ - void __init ip_init(void) { - dev_add_pack(&ip_packet_type); - ip_rt_init(); inet_initpeers(); @@ -1349,12 +1400,6 @@ void __init ip_init(void) #endif } -EXPORT_SYMBOL(ip_finish_output); -EXPORT_SYMBOL(ip_fragment); EXPORT_SYMBOL(ip_generic_getfrag); EXPORT_SYMBOL(ip_queue_xmit); EXPORT_SYMBOL(ip_send_check); - -#ifdef CONFIG_SYSCTL -EXPORT_SYMBOL(sysctl_ip_default_ttl); -#endif