diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index bc5fba4a3..a29ac2763 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -45,15 +45,20 @@
 /* People can turn this off for buggy TCP's found in printers etc. */
 int sysctl_tcp_retrans_collapse = 1;
 
-static __inline__
-void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
+/* This limits the percentage of the congestion window which we
+ * will allow a single TSO frame to consume. Building TSO frames
+ * which are too large can cause TCP streams to be bursty.
+ */
+int sysctl_tcp_tso_win_divisor = 8;
+
+static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
+				    struct sk_buff *skb)
 {
-	tp->send_head = skb->next;
-	if (tp->send_head == (struct sk_buff *)&sk->sk_write_queue)
-		tp->send_head = NULL;
+	sk->sk_send_head = skb->next;
+	if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
+		sk->sk_send_head = NULL;
 	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
-	if (tp->packets_out++ == 0)
-		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+	tcp_packets_out_inc(sk, tp, skb);
 }
 
 /* SND.NXT, if window was not shrunk.
@@ -62,7 +67,7 @@ void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
  * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
  * invalid. OK, let's make this for now:
  */
-static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
+static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp)
 {
 	if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
 		return tp->snd_nxt;
@@ -86,7 +91,7 @@ static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
  */
 static __u16 tcp_advertise_mss(struct sock *sk)
 {
-	struct tcp_opt *tp = tcp_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
 	int mss = tp->advmss;
 
@@ -100,7 +105,7 @@ static __u16 tcp_advertise_mss(struct sock *sk)
 /* RFC2861. Reset CWND after idle period longer than RTO to "restart window".
  * This is the first part of cwnd validation mechanism.
  */
-static void tcp_cwnd_restart(struct tcp_opt *tp, struct dst_entry *dst)
+static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
 {
 	s32 delta = tcp_time_stamp - tp->lsndtime;
 	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
@@ -119,7 +124,8 @@ static void tcp_cwnd_restart(struct tcp_opt *tp, struct dst_entry *dst)
 	tp->snd_cwnd_used = 0;
 }
 
-static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb, struct sock *sk)
+static inline void tcp_event_data_sent(struct tcp_sock *tp,
+				       struct sk_buff *skb, struct sock *sk)
 {
 	u32 now = tcp_time_stamp;
 
@@ -137,20 +143,79 @@ static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *s
 
 static __inline__ void tcp_event_ack_sent(struct sock *sk)
 {
-	struct tcp_opt *tp = tcp_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 
 	tcp_dec_quickack_mode(tp);
 	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
 }
 
-/* Choose a new window to advertise, update state in tcp_opt for the
+/* Determine a window scaling and initial window to offer.
+ * Based on the assumption that the given amount of space
+ * will be offered. Store the results in the tp structure.
+ * NOTE: for smooth operation initial space offering should
+ * be a multiple of mss if possible. We assume here that mss >= 1.
+ * This MUST be enforced by all callers.
+ */
+void tcp_select_initial_window(int __space, __u32 mss,
+			       __u32 *rcv_wnd, __u32 *window_clamp,
+			       int wscale_ok, __u8 *rcv_wscale)
+{
+	unsigned int space = (__space < 0 ? 0 : __space);
+
+	/* If no clamp set the clamp to the max possible scaled window */
+	if (*window_clamp == 0)
+		(*window_clamp) = (65535 << 14);
+	space = min(*window_clamp, space);
+
+	/* Quantize space offering to a multiple of mss if possible. */
+	if (space > mss)
+		space = (space / mss) * mss;
+
+	/* NOTE: offering an initial window larger than 32767
+	 * will break some buggy TCP stacks. We try to be nice.
+	 * If we are not window scaling, then this truncates
+	 * our initial window offering to 32k. There should also
+	 * be a sysctl option to stop being nice.
+	 */
+	(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+	(*rcv_wscale) = 0;
+	if (wscale_ok) {
+		/* Set window scaling on max possible window
+		 * See RFC1323 for an explanation of the limit to 14
+		 */
+		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
+		while (space > 65535 && (*rcv_wscale) < 14) {
+			space >>= 1;
+			(*rcv_wscale)++;
+		}
+	}
+
+	/* Set initial window to a value large enough for senders,
+	 * following RFC2414. Senders, not following this RFC,
+	 * will be satisfied with 2.
+	 */
+	if (mss > (1<<*rcv_wscale)) {
+		int init_cwnd = 4;
+		if (mss > 1460*3)
+			init_cwnd = 2;
+		else if (mss > 1460)
+			init_cwnd = 3;
+		if (*rcv_wnd > init_cwnd*mss)
+			*rcv_wnd = init_cwnd*mss;
+	}
+
+	/* Set the clamp no higher than max representable value */
+	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
+}
+
+/* Choose a new window to advertise, update state in tcp_sock for the
  * socket, and return result with RFC1323 scaling applied. The return
  * value can be stuffed directly into th->window for an outgoing
  * frame.
  */
 static __inline__ u16 tcp_select_window(struct sock *sk)
 {
-	struct tcp_opt *tp = tcp_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	u32 cur_win = tcp_receive_window(tp);
 	u32 new_win = __tcp_select_window(sk);
 
@@ -168,8 +233,16 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
 	tp->rcv_wnd = new_win;
 	tp->rcv_wup = tp->rcv_nxt;
 
+	/* Make sure we do not exceed the maximum possible
+	 * scaled window.
+	 */
+	if (!tp->rx_opt.rcv_wscale)
+		new_win = min(new_win, MAX_TCP_WINDOW);
+	else
+		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
+
 	/* RFC1323 scaling applied */
-	new_win >>= tp->rcv_wscale;
+	new_win >>= tp->rx_opt.rcv_wscale;
 
 	/* If we advertise zero window, disable fast path. */
 	if (new_win == 0)
@@ -190,17 +263,19 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
  * We are working here with either a clone of the original
  * SKB, or a fresh unique copy made by the retransmit engine.
*/ -int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) { - if(skb != NULL) { - struct inet_opt *inet = inet_sk(sk); - struct tcp_opt *tp = tcp_sk(sk); + if (skb != NULL) { + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); int tcp_header_size = tp->tcp_header_len; struct tcphdr *th; int sysctl_flags; int err; + BUG_ON(!tcp_skb_pcount(skb)); + #define SYSCTL_FLAG_TSTAMPS 0x1 #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 @@ -221,12 +296,12 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; } - } else if (tp->eff_sacks) { + } else if (tp->rx_opt.eff_sacks) { /* A SACK is 2 pad bytes, a 2 byte header, plus * 2 32-bit sequence numbers for each SACK block. */ tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + - (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK)); + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); } /* @@ -274,9 +349,9 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) (sysctl_flags & SYSCTL_FLAG_TSTAMPS), (sysctl_flags & SYSCTL_FLAG_SACK), (sysctl_flags & SYSCTL_FLAG_WSCALE), - tp->rcv_wscale, + tp->rx_opt.rcv_wscale, tcb->when, - tp->ts_recent); + tp->rx_opt.ts_recent); } else { tcp_build_and_update_options((__u32 *)(th + 1), tp, tcb->when); @@ -291,7 +366,7 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) if (skb->len != tcp_header_size) tcp_event_data_sent(tp, skb, sk); - TCP_INC_STATS(TcpOutSegs); + TCP_INC_STATS(TCP_MIB_OUTSEGS); err = tp->af_specific->queue_xmit(skb, 0); if (err <= 0) @@ -321,16 +396,26 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) */ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); /* Advance write_seq and place onto the write_queue. */ tp->write_seq = TCP_SKB_CB(skb)->end_seq; __skb_queue_tail(&sk->sk_write_queue, skb); - tcp_charge_skb(sk, skb); + sk_charge_skb(sk, skb); /* Queue it, remembering where we must start sending. */ - if (tp->send_head == NULL) - tp->send_head = skb; + if (sk->sk_send_head == NULL) + sk->sk_send_head = skb; +} + +static inline void tcp_tso_set_push(struct sk_buff *skb) +{ + /* Force push to be on for any TSO frames to workaround + * problems with busted implementations like Mac OS-X that + * hold off socket receive wakeups until push is seen. + */ + if (tcp_skb_pcount(skb) > 1) + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; } /* Send _single_ skb sitting at the send head. This function requires @@ -338,83 +423,37 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) */ void tcp_push_one(struct sock *sk, unsigned cur_mss) { - struct tcp_opt *tp = tcp_sk(sk); - struct sk_buff *skb = tp->send_head; + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = sk->sk_send_head; if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) { /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) { - tp->send_head = NULL; + sk->sk_send_head = NULL; tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - if (tp->packets_out++ == 0) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + tcp_packets_out_inc(sk, tp, skb); return; } } } -/* Split fragmented skb to two parts at length len. 
- */
-
-static void skb_split(struct sk_buff *skb, struct sk_buff *skb1, u32 len)
+void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std)
 {
-	int i;
-	int pos = skb_headlen(skb);
-
-	if (len < pos) {
-		/* Split line is inside header. */
-		memcpy(skb_put(skb1, pos-len), skb->data + len, pos-len);
-
-		/* And move data appendix as is. */
-		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-			skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
-
-		skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
-		skb_shinfo(skb)->nr_frags = 0;
-
-		skb1->data_len = skb->data_len;
-		skb1->len += skb1->data_len;
-		skb->data_len = 0;
-		skb->len = len;
-		skb->tail = skb->data+len;
+	if (skb->len <= mss_std) {
+		/* Avoid the costly divide in the normal
+		 * non-TSO case.
+		 */
+		skb_shinfo(skb)->tso_segs = 1;
+		skb_shinfo(skb)->tso_size = 0;
 	} else {
-		int k = 0;
-		int nfrags = skb_shinfo(skb)->nr_frags;
-
-		/* Second chunk has no header, nothing to copy. */
-
-		skb_shinfo(skb)->nr_frags = 0;
-		skb1->len = skb1->data_len = skb->len - len;
-		skb->len = len;
-		skb->data_len = len - pos;
-
-		for (i=0; i<nfrags; i++) {
-			int size = skb_shinfo(skb)->frags[i].size;
-			if (pos + size > len) {
-				skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
-
-				if (pos < len) {
-					/* Split frag.
-					 * We have to variants in this case:
-					 * 1. Move all the frag to the second
-					 *    part, if it is possible. F.e.
-					 *    this approach is mandatory for TUX,
-					 *    where splitting is expensive.
-					 * 2. Split is accurately. We make this.
-					 */
-					get_page(skb_shinfo(skb)->frags[i].page);
-					skb_shinfo(skb1)->frags[0].page_offset += (len-pos);
-					skb_shinfo(skb1)->frags[0].size -= (len-pos);
-					skb_shinfo(skb)->frags[i].size = len-pos;
-					skb_shinfo(skb)->nr_frags++;
-				}
-				k++;
-			} else {
-				skb_shinfo(skb)->nr_frags++;
-			}
-			pos += size;
-		}
-		skb_shinfo(skb1)->nr_frags = k;
+		unsigned int factor;
+
+		factor = skb->len + (mss_std - 1);
+		factor /= mss_std;
+		skb_shinfo(skb)->tso_segs = factor;
+		skb_shinfo(skb)->tso_size = mss_std;
 	}
 }
 
@@ -425,21 +464,25 @@ static void skb_split(struct sk_buff *skb, struct sk_buff *skb1, u32 len)
  */
 static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 {
-	struct tcp_opt *tp = tcp_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *buff;
-	int nsize = skb->len - len;
+	int nsize;
 	u16 flags;
 
+	nsize = skb_headlen(skb) - len;
+	if (nsize < 0)
+		nsize = 0;
+
 	if (skb_cloned(skb) &&
 	    skb_is_nonlinear(skb) &&
 	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 		return -ENOMEM;
 
 	/* Get a new skb... force flag on. */
-	buff = tcp_alloc_skb(sk, nsize, GFP_ATOMIC);
+	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
 	if (buff == NULL)
 		return -ENOMEM; /* We'll just try again later. */
-	tcp_charge_skb(sk, buff);
+	sk_charge_skb(sk, buff);
 
 	/* Correct the sequence numbers.
 	 */
 	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
@@ -450,11 +493,9 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 	flags = TCP_SKB_CB(skb)->flags;
 	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
 	TCP_SKB_CB(buff)->flags = flags;
-	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
-	if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
-		tp->lost_out++;
-		tp->left_out++;
-	}
+	TCP_SKB_CB(buff)->sacked =
+		(TCP_SKB_CB(skb)->sacked &
+		 (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
 	TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
 
 	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
@@ -477,6 +518,25 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 	 */
 	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
 
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
+		tp->lost_out -= tcp_skb_pcount(skb);
+		tp->left_out -= tcp_skb_pcount(skb);
+	}
+
+	/* Fix up tso_factor for both original and new SKB. */
+	tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
+	tcp_set_skb_tso_segs(buff, tp->mss_cache_std);
+
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
+		tp->lost_out += tcp_skb_pcount(skb);
+		tp->left_out += tcp_skb_pcount(skb);
+	}
+
+	if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
+		tp->lost_out += tcp_skb_pcount(buff);
+		tp->left_out += tcp_skb_pcount(buff);
+	}
+
 	/* Link BUFF into the send queue. */
 	__skb_append(skb, buff);
 
@@ -487,7 +547,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
  * eventually). The difference is that pulled data is not copied, but
  * immediately discarded.
  */
-unsigned char * __pskb_trim_head(struct sk_buff *skb, int len)
+static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
 {
 	int i, k, eat;
 
@@ -515,7 +575,7 @@ unsigned char * __pskb_trim_head(struct sk_buff *skb, int len)
 	return skb->tail;
 }
 
-static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
+int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 {
 	if (skb_cloned(skb) &&
 	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
@@ -530,15 +590,27 @@ static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	TCP_SKB_CB(skb)->seq += len;
 	skb->ip_summed = CHECKSUM_HW;
+
+	skb->truesize -= len;
+	sk->sk_queue_shrunk = 1;
+	sk->sk_wmem_queued -= len;
+	sk->sk_forward_alloc += len;
+
+	/* Any change of skb->len requires recalculation of tso
+	 * factor and mss.
+	 */
+	if (tcp_skb_pcount(skb) > 1)
+		tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
+
 	return 0;
 }
 
 /* This function synchronizes snd mss to current pmtu/exthdr set.
 
-   tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT count
+   tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count
    for TCP options, but includes only bare TCP header.
 
-   tp->mss_clamp is mss negotiated at connection setup.
+   tp->rx_opt.mss_clamp is mss negotiated at connection setup.
    It is minimum of user_mss and mss received with SYN.
    It also does not include TCP options.
 
@@ -547,7 +619,7 @@ static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
-   tp->mss_clamp.
+   tp->rx_opt.mss_clamp.
 
   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.
 
@@ -556,9 +628,9 @@ static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
   this function.
--ANK (980731) */ -int tcp_sync_mss(struct sock *sk, u32 pmtu) +unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); int mss_now; @@ -571,8 +643,8 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); /* Clamp it (mss_clamp does not include tcp options) */ - if (mss_now > tp->mss_clamp) - mss_now = tp->mss_clamp; + if (mss_now > tp->rx_opt.mss_clamp) + mss_now = tp->rx_opt.mss_clamp; /* Now subtract optional transport overhead */ mss_now -= tp->ext_header_len + tp->ext2_header_len; @@ -592,23 +664,71 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) tp->pmtu_cookie = pmtu; tp->mss_cache = tp->mss_cache_std = mss_now; - if (sk->sk_route_caps & NETIF_F_TSO) { - int large_mss; + return mss_now; +} + +/* Compute the current effective MSS, taking SACKs and IP options, + * and even PMTU discovery events into account. + * + * LARGESEND note: !urg_mode is overkill, only frames up to snd_up + * cannot be large. However, taking into account rare use of URG, this + * is not a big flaw. + */ + +unsigned int tcp_current_mss(struct sock *sk, int large) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + unsigned int do_large, mss_now; + + mss_now = tp->mss_cache_std; + if (dst) { + u32 mtu = dst_pmtu(dst); + if (mtu != tp->pmtu_cookie || + tp->ext2_header_len != dst->header_len) + mss_now = tcp_sync_mss(sk, mtu); + } + + do_large = (large && + (sk->sk_route_caps & NETIF_F_TSO) && + !tp->urg_mode); + + if (do_large) { + unsigned int large_mss, factor, limit; large_mss = 65535 - tp->af_specific->net_header_len - - tp->ext_header_len - tp->ext2_header_len - tp->tcp_header_len; + tp->ext_header_len - tp->ext2_header_len - + tp->tcp_header_len; if (tp->max_window && large_mss > (tp->max_window>>1)) - large_mss = max((tp->max_window>>1), 68U - tp->tcp_header_len); + large_mss = max((tp->max_window>>1), + 68U - tp->tcp_header_len); + + factor = large_mss / mss_now; + + /* Always keep large mss multiple of real mss, but + * do not exceed 1/tso_win_divisor of the congestion window + * so we can keep the ACK clock ticking and minimize + * bursting. + */ + limit = tp->snd_cwnd; + if (sysctl_tcp_tso_win_divisor) + limit /= sysctl_tcp_tso_win_divisor; + limit = max(1U, limit); + if (factor > limit) + factor = limit; - /* Always keep large mss multiple of real mss. */ - tp->mss_cache = mss_now*(large_mss/mss_now); + tp->mss_cache = mss_now * factor; + + mss_now = tp->mss_cache; } + if (tp->rx_opt.eff_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); return mss_now; } - /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -618,7 +738,7 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) */ int tcp_write_xmit(struct sock *sk, int nonagle) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); unsigned int mss_now; /* If we are closed, the bytes will have to remain here. @@ -636,18 +756,25 @@ int tcp_write_xmit(struct sock *sk, int nonagle) */ mss_now = tcp_current_mss(sk, 1); - while((skb = tp->send_head) && - tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)) { + while ((skb = sk->sk_send_head) && + tcp_snd_test(tp, skb, mss_now, + tcp_skb_is_last(sk, skb) ? 
nonagle :
+					       TCP_NAGLE_PUSH)) {
 		if (skb->len > mss_now) {
 			if (tcp_fragment(sk, skb, mss_now))
 				break;
 		}
 
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
+		tcp_tso_set_push(skb);
 		if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
 			break;
-		/* Advance the send_head. This one is sent out. */
+
+		/* Advance the send_head. This one is sent out.
+		 * This call will increment packets_out.
+		 */
 		update_send_head(sk, tp, skb);
+
 		tcp_minshall_update(tp, mss_now, skb);
 		sent_pkts = 1;
 	}
@@ -657,7 +784,7 @@ int tcp_write_xmit(struct sock *sk, int nonagle)
 			return 0;
 		}
 
-		return !tp->packets_out && tp->send_head;
+		return !tp->packets_out && sk->sk_send_head;
 	}
 	return 0;
 }
@@ -716,7 +843,7 @@ int tcp_write_xmit(struct sock *sk, int nonagle)
  */
 u32 __tcp_select_window(struct sock *sk)
 {
-	struct tcp_opt *tp = tcp_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	/* MSS for the peer's data. Previous versions used mss_clamp
 	 * here. I don't know if the value based on our guesses
 	 * of peer's MSS is better for the performance. It's more correct
@@ -744,17 +871,32 @@ u32 __tcp_select_window(struct sock *sk)
 	if (free_space > tp->rcv_ssthresh)
 		free_space = tp->rcv_ssthresh;
 
-	/* Get the largest window that is a nice multiple of mss.
-	 * Window clamp already applied above.
-	 * If our current window offering is within 1 mss of the
-	 * free space we just keep it. This prevents the divide
-	 * and multiply from happening most of the time.
-	 * We also don't do any window rounding when the free space
-	 * is too small.
+	/* Don't do rounding if we are using window scaling, since the
+	 * scaled window will not line up with the MSS boundary anyway.
 	 */
 	window = tp->rcv_wnd;
-	if (window <= free_space - mss || window > free_space)
-		window = (free_space/mss)*mss;
+	if (tp->rx_opt.rcv_wscale) {
+		window = free_space;
+
+		/* Advertise enough space so that it won't get scaled away.
+		 * Important case: prevent zero window announcement if
+		 * 1<<rcv_wscale > mss.
+		 */
+		if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
+			window = (((window >> tp->rx_opt.rcv_wscale) + 1)
+				  << tp->rx_opt.rcv_wscale);
+	} else {
+		/* Get the largest window that is a nice multiple of mss.
+		 * Window clamp already applied above.
+		 * If our current window offering is within 1 mss of the
+		 * free space we just keep it. This prevents the divide
+		 * and multiply from happening most of the time.
+		 * We also don't do any window rounding when the free space
+		 * is too small.
+		 */
+		if (window <= free_space - mss || window > free_space)
+			window = (free_space/mss)*mss;
+	}
 
 	return window;
 }
@@ -762,13 +904,13 @@ u32 __tcp_select_window(struct sock *sk)
 /* Attempt to collapse two adjacent SKB's during retransmission. */
 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
 {
-	struct tcp_opt *tp = tcp_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *next_skb = skb->next;
 
 	/* The first test we must make is that neither of these two
 	 * SKB's are still referenced by someone else.
 	 */
-	if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
+	if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
 		int skb_size = skb->len, next_skb_size = next_skb->len;
 		u16 flags = TCP_SKB_CB(skb)->flags;
 
@@ -788,6 +930,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
 		    ((skb_size + next_skb_size) > mss_now))
 			return;
 
+		BUG_ON(tcp_skb_pcount(skb) != 1 ||
+		       tcp_skb_pcount(next_skb) != 1);
+
 		/* Ok. We will be able to collapse the packet.
*/ __skb_unlink(next_skb, next_skb->list); @@ -811,24 +956,23 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m */ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS) - tp->retrans_out--; + tp->retrans_out -= tcp_skb_pcount(next_skb); if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) { - tp->lost_out--; - tp->left_out--; + tp->lost_out -= tcp_skb_pcount(next_skb); + tp->left_out -= tcp_skb_pcount(next_skb); } /* Reno case is special. Sigh... */ - if (!tp->sack_ok && tp->sacked_out) { - tp->sacked_out--; - tp->left_out--; + if (!tp->rx_opt.sack_ok && tp->sacked_out) { + tcp_dec_pcount_approx(&tp->sacked_out, next_skb); + tp->left_out -= tcp_skb_pcount(next_skb); } /* Not quite right: it can be > snd.fack, but * it is better to underestimate fackets. */ - if (tp->fackets_out) - tp->fackets_out--; - tcp_free_skb(sk, next_skb); - tp->packets_out--; + tcp_dec_pcount_approx(&tp->fackets_out, next_skb); + tcp_packets_out_dec(tp, next_skb); + sk_stream_free_skb(sk, next_skb); } } @@ -838,21 +982,21 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m */ void tcp_simple_retransmit(struct sock *sk) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int mss = tcp_current_mss(sk, 0); int lost = 0; - for_retrans_queue(skb, sk, tp) { + sk_stream_for_retrans_queue(skb, sk) { if (skb->len > mss && !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out--; + tp->retrans_out -= tcp_skb_pcount(skb); } if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out++; + tp->lost_out += tcp_skb_pcount(skb); lost = 1; } } @@ -884,7 +1028,7 @@ void tcp_simple_retransmit(struct sock *sk) */ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); unsigned int cur_mss = tcp_current_mss(sk, 0); int err; @@ -918,20 +1062,26 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) && TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; - if(skb->len > cur_mss) { - if(tcp_fragment(sk, skb, cur_mss)) + if (skb->len > cur_mss) { + int old_factor = tcp_skb_pcount(skb); + int new_factor; + + if (tcp_fragment(sk, skb, cur_mss)) return -ENOMEM; /* We'll try again later. */ /* New SKB created, account for it. */ - tp->packets_out++; + new_factor = tcp_skb_pcount(skb); + tp->packets_out -= old_factor - new_factor; + tp->packets_out += tcp_skb_pcount(skb->next); } /* Collapse two adjacent packets if worthwhile and we can. 
*/ if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && (skb->len < (cur_mss >> 1)) && - (skb->next != tp->send_head) && + (skb->next != sk->sk_send_head) && (skb->next != (struct sk_buff *)&sk->sk_write_queue) && (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) && + (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(skb->next) == 1) && (sysctl_tcp_retrans_collapse != 0)) tcp_retrans_try_collapse(sk, skb, cur_mss); @@ -947,6 +1097,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; skb->ip_summed = CHECKSUM_NONE; skb->csum = 0; } @@ -956,6 +1108,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, (skb_cloned(skb) ? pskb_copy(skb, GFP_ATOMIC): @@ -963,7 +1116,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (err == 0) { /* Update global TCP statistics. */ - TCP_INC_STATS(TcpRetransSegs); + TCP_INC_STATS(TCP_MIB_RETRANSSEGS); + + tp->total_retrans++; #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { @@ -972,7 +1127,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) } #endif TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; - tp->retrans_out++; + tp->retrans_out += tcp_skb_pcount(skb); /* Save stamp of the first retransmit. */ if (!tp->retrans_stamp) @@ -998,15 +1153,22 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ void tcp_xmit_retransmit_queue(struct sock *sk) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int packet_cnt = tp->lost_out; /* First pass: retransmit lost packets. */ if (packet_cnt) { - for_retrans_queue(skb, sk, tp) { + sk_stream_for_retrans_queue(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + /* Assume this retransmit will generate + * only one packet for congestion window + * calculation purposes. This works because + * tcp_retransmit_skb() will chop up the + * packet to be MSS sized and all the + * packet counting works out. + */ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return; @@ -1015,16 +1177,17 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_retransmit_skb(sk, skb)) return; if (tp->ca_state != TCP_CA_Loss) - NET_INC_STATS_BH(TCPFastRetrans); + NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); else - NET_INC_STATS_BH(TCPSlowStartRetrans); + NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); if (skb == skb_peek(&sk->sk_write_queue)) tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } - if (--packet_cnt <= 0) + packet_cnt -= tcp_skb_pcount(skb); + if (packet_cnt <= 0) break; } } @@ -1037,13 +1200,13 @@ void tcp_xmit_retransmit_queue(struct sock *sk) return; /* No forward retransmissions in Reno are possible. */ - if (!tp->sack_ok) + if (!tp->rx_opt.sack_ok) return; /* Yeah, we have to make difficult choice between forward transmission * and retransmission... Both ways have their merits... * - * For now we do not retrnamsit anything, while we have some new + * For now we do not retransmit anything, while we have some new * segments to send. 
*/ @@ -1052,24 +1215,30 @@ void tcp_xmit_retransmit_queue(struct sock *sk) packet_cnt = 0; - for_retrans_queue(skb, sk, tp) { - if(++packet_cnt > tp->fackets_out) + sk_stream_for_retrans_queue(skb, sk) { + /* Similar to the retransmit loop above we + * can pretend that the retransmitted SKB + * we send out here will be composed of one + * real MSS sized packet because tcp_retransmit_skb() + * will fragment it if necessary. + */ + if (++packet_cnt > tp->fackets_out) break; if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) break; - if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) + if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) continue; /* Ok, retransmit it. */ - if(tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) break; if (skb == skb_peek(&sk->sk_write_queue)) tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); - NET_INC_STATS_BH(TCPForwardRetrans); + NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); } } @@ -1079,17 +1248,17 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ void tcp_send_fin(struct sock *sk) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue); - unsigned int mss_now; + int mss_now; /* Optimization, tack on the FIN if we have a queue of * unsent frames. But be careful about outgoing SACKS * and IP options. */ - mss_now = tcp_current_mss(sk, 1); + mss_now = tcp_current_mss(sk, 1); - if(tp->send_head != NULL) { + if (sk->sk_send_head != NULL) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; TCP_SKB_CB(skb)->end_seq++; tp->write_seq++; @@ -1107,6 +1276,8 @@ void tcp_send_fin(struct sock *sk) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); TCP_SKB_CB(skb)->sacked = 0; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ TCP_SKB_CB(skb)->seq = tp->write_seq; @@ -1123,13 +1294,13 @@ void tcp_send_fin(struct sock *sk) */ void tcp_send_active_reset(struct sock *sk, int priority) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; /* NOTE: No TCP options attached and we never retransmit this. */ skb = alloc_skb(MAX_TCP_HEADER, priority); if (!skb) { - NET_INC_STATS(TCPAbortFailed); + NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); return; } @@ -1138,13 +1309,15 @@ void tcp_send_active_reset(struct sock *sk, int priority) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); TCP_SKB_CB(skb)->sacked = 0; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; /* Send it off. 
*/ TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb)) - NET_INC_STATS(TCPAbortFailed); + NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); } /* WARNING: This routine must only be called when we have already sent @@ -1168,8 +1341,8 @@ int tcp_send_synack(struct sock *sk) return -ENOMEM; __skb_unlink(skb, &sk->sk_write_queue); __skb_queue_head(&sk->sk_write_queue, nskb); - tcp_free_skb(sk, skb); - tcp_charge_skb(sk, nskb); + sk_stream_free_skb(sk, skb); + sk_charge_skb(sk, nskb); skb = nskb; } @@ -1186,7 +1359,7 @@ int tcp_send_synack(struct sock *sk) struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct open_request *req) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th; int tcp_header_size; struct sk_buff *skb; @@ -1217,6 +1390,9 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->dest = req->rmt_port; TCP_SKB_CB(skb)->seq = req->snt_isn; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + TCP_SKB_CB(skb)->sacked = 0; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(req->rcv_isn + 1); if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ @@ -1244,7 +1420,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb->csum = 0; th->doff = (tcp_header_size >> 2); - TCP_INC_STATS(TcpOutSegs); + TCP_INC_STATS(TCP_MIB_OUTSEGS); return skb; } @@ -1254,7 +1430,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, static inline void tcp_connect_init(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. @@ -1263,8 +1439,8 @@ static inline void tcp_connect_init(struct sock *sk) (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); /* If user gave his TCP_MAXSEG, record it to clamp */ - if (tp->user_mss) - tp->mss_clamp = tp->user_mss; + if (tp->rx_opt.user_mss) + tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; tp->max_window = 0; tcp_sync_mss(sk, dst_pmtu(dst)); @@ -1272,14 +1448,14 @@ static inline void tcp_connect_init(struct sock *sk) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(sk); - tcp_vegas_init(tp); + tcp_ca_init(tp); tcp_select_initial_window(tcp_full_space(sk), - tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), + tp->advmss - (tp->rx_opt.ts_recent_stamp ? 
tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, sysctl_tcp_window_scaling, - &tp->rcv_wscale); + &tp->rx_opt.rcv_wscale); tp->rcv_ssthresh = tp->rcv_wnd; @@ -1303,7 +1479,7 @@ static inline void tcp_connect_init(struct sock *sk) */ int tcp_connect(struct sock *sk) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; tcp_connect_init(sk); @@ -1318,21 +1494,23 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; TCP_ECN_send_syn(sk, tp, buff); TCP_SKB_CB(buff)->sacked = 0; + skb_shinfo(buff)->tso_segs = 1; + skb_shinfo(buff)->tso_size = 0; buff->csum = 0; TCP_SKB_CB(buff)->seq = tp->write_seq++; TCP_SKB_CB(buff)->end_seq = tp->write_seq; tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; - tcp_vegas_init(tp); + tcp_ca_init(tp); /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; tp->retrans_stamp = TCP_SKB_CB(buff)->when; __skb_queue_tail(&sk->sk_write_queue, buff); - tcp_charge_skb(sk, buff); - tp->packets_out++; + sk_charge_skb(sk, buff); + tp->packets_out += tcp_skb_pcount(buff); tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); - TCP_INC_STATS(TcpActiveOpens); + TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); @@ -1345,7 +1523,7 @@ int tcp_connect(struct sock *sk) */ void tcp_send_delayed_ack(struct sock *sk) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); int ato = tp->ack.ato; unsigned long timeout; @@ -1397,7 +1575,7 @@ void tcp_send_ack(struct sock *sk) { /* If we have been reset, we may not send again. */ if (sk->sk_state != TCP_CLOSE) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; /* We are not putting this on the write queue, so @@ -1417,6 +1595,8 @@ void tcp_send_ack(struct sock *sk) buff->csum = 0; TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(buff)->sacked = 0; + skb_shinfo(buff)->tso_segs = 1; + skb_shinfo(buff)->tso_size = 0; /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); @@ -1438,7 +1618,7 @@ void tcp_send_ack(struct sock *sk) */ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; /* We don't queue it, tcp_transmit_skb() sets ownership. */ @@ -1451,6 +1631,8 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) skb->csum = 0; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = urgent; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; /* Use a previous sequence. This should cause the other * end to send an ack. 
Don't queue or clone SKB, just @@ -1465,14 +1647,14 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) int tcp_write_wakeup(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if ((skb = tp->send_head) != NULL && + if ((skb = sk->sk_send_head) != NULL && before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { int err; - int mss = tcp_current_mss(sk, 0); - int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; + unsigned int mss = tcp_current_mss(sk, 0); + unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; @@ -1494,9 +1676,12 @@ int tcp_write_wakeup(struct sock *sk) sk->sk_route_caps &= ~NETIF_F_TSO; tp->mss_cache = tp->mss_cache_std; } - } + } else if (!tcp_skb_pcount(skb)) + tcp_set_skb_tso_segs(skb, tp->mss_cache_std); + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); if (!err) { update_send_head(sk, tp, skb); @@ -1517,12 +1702,12 @@ int tcp_write_wakeup(struct sock *sk) */ void tcp_send_probe0(struct sock *sk) { - struct tcp_opt *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); int err; err = tcp_write_wakeup(sk); - if (tp->packets_out || !tp->send_head) { + if (tp->packets_out || !sk->sk_send_head) { /* Cancel probe timer, if it is not required. */ tp->probes_out = 0; tp->backoff = 0; @@ -1549,13 +1734,7 @@ void tcp_send_probe0(struct sock *sk) } } -EXPORT_SYMBOL(tcp_acceptable_seq); EXPORT_SYMBOL(tcp_connect); -EXPORT_SYMBOL(tcp_connect_init); EXPORT_SYMBOL(tcp_make_synack); -EXPORT_SYMBOL(tcp_send_synack); EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_sync_mss); -EXPORT_SYMBOL(tcp_transmit_skb); -EXPORT_SYMBOL(tcp_write_wakeup); -EXPORT_SYMBOL(tcp_write_xmit);
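
Editor's note: the two sketches below are illustrative additions by the editor, not part of the patch above.

The initial-window logic this patch adds in tcp_select_initial_window() is pure arithmetic and can be exercised outside the kernel. The following userspace C sketch mirrors it under stated assumptions: MAX_TCP_WINDOW is the usual 32767 constant from tcp.h, the kernel's sysctl_tcp_rmem[2]/sysctl_rmem_max inputs are collapsed into one rmem_max parameter, and the function name select_initial_window is hypothetical.

/* Illustrative userspace sketch (not kernel code) of the
 * tcp_select_initial_window() arithmetic added by this patch.
 */
#include <stdio.h>

#define MAX_TCP_WINDOW 32767U	/* assumed stand-in for the tcp.h constant */

static void select_initial_window(int space_in, unsigned int mss,
				  unsigned int rmem_max, int wscale_ok,
				  unsigned int *rcv_wnd,
				  unsigned int *window_clamp,
				  unsigned int *rcv_wscale)
{
	unsigned int space = space_in < 0 ? 0 : (unsigned int)space_in;

	/* If no clamp is set, allow the largest scalable window. */
	if (*window_clamp == 0)
		*window_clamp = 65535U << 14;
	if (space > *window_clamp)
		space = *window_clamp;

	/* Quantize the offering to a multiple of mss. */
	if (space > mss)
		space = (space / mss) * mss;

	/* Without scaling, never offer more than 32767. */
	*rcv_wnd = space < MAX_TCP_WINDOW ? space : MAX_TCP_WINDOW;

	/* Smallest shift (capped at 14 per RFC 1323) that makes the
	 * largest receive buffer representable in 16 bits. */
	*rcv_wscale = 0;
	if (wscale_ok) {
		while (rmem_max > 65535U && *rcv_wscale < 14) {
			rmem_max >>= 1;
			(*rcv_wscale)++;
		}
	}

	/* RFC 2414 initial window: 2, 3 or 4 segments depending on mss. */
	if (mss > (1U << *rcv_wscale)) {
		unsigned int init_cwnd = 4;

		if (mss > 1460U * 3)
			init_cwnd = 2;
		else if (mss > 1460U)
			init_cwnd = 3;
		if (*rcv_wnd > init_cwnd * mss)
			*rcv_wnd = init_cwnd * mss;
	}

	/* Clamp to the largest value the chosen shift can carry. */
	if (*window_clamp > (65535U << *rcv_wscale))
		*window_clamp = 65535U << *rcv_wscale;
}

int main(void)
{
	unsigned int rcv_wnd, clamp = 0, wscale;

	select_initial_window(174760, 1460, 174760, 1,
			      &rcv_wnd, &clamp, &wscale);
	printf("rcv_wnd=%u rcv_wscale=%u window_clamp=%u\n",
	       rcv_wnd, wscale, clamp);
	return 0;
}

For a 170 KB receive buffer and an Ethernet MSS this prints a 4*mss initial window with a scale factor of 2, which matches the intent stated in the patch comments.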
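The other recurring theme of the patch is TSO segment accounting: tcp_set_skb_tso_segs() records how many real MSS-sized packets a super-frame stands for (so counters like packets_out move by tcp_skb_pcount() rather than by one), and tcp_current_mss() caps a TSO frame at 1/sysctl_tcp_tso_win_divisor of the congestion window to keep the ACK clock ticking. The sketch below is a minimal standalone rendering of both calculations; struct toy_skb and the function names are invented here, not kernel API.

/* Illustrative sketch (not kernel code) of the TSO accounting this
 * patch threads through tcp_output.c.
 */
#include <assert.h>
#include <stdio.h>

struct toy_skb {
	unsigned int len;	/* bytes carried by the frame */
	unsigned int tso_segs;	/* packets this frame counts as */
	unsigned int tso_size;	/* mss the frame was built with */
};

static void set_tso_segs(struct toy_skb *skb, unsigned int mss_std)
{
	if (skb->len <= mss_std) {
		/* Common non-TSO case: skip the divide. */
		skb->tso_segs = 1;
		skb->tso_size = 0;
	} else {
		/* ceil(len / mss_std), as in the patch. */
		skb->tso_segs = (skb->len + mss_std - 1) / mss_std;
		skb->tso_size = mss_std;
	}
}

/* Mirror of the sysctl_tcp_tso_win_divisor clamp in tcp_current_mss():
 * a TSO frame may consume at most 1/divisor of snd_cwnd, and always
 * at least one real mss. */
static unsigned int tso_factor(unsigned int large_mss, unsigned int mss_now,
			       unsigned int snd_cwnd, unsigned int divisor)
{
	unsigned int factor = large_mss / mss_now;
	unsigned int limit = snd_cwnd;

	if (divisor)
		limit /= divisor;
	if (limit < 1)
		limit = 1;
	return factor > limit ? limit : factor;
}

int main(void)
{
	struct toy_skb small = { .len = 512 };
	struct toy_skb large = { .len = 9000 };

	set_tso_segs(&small, 1460);
	set_tso_segs(&large, 1460);
	assert(small.tso_segs == 1);
	assert(large.tso_segs == 7);	/* ceil(9000/1460) */

	/* With cwnd 32 and the default divisor of 8, a super-frame is
	 * limited to 4 segments even though 64000/1460 would allow 43. */
	assert(tso_factor(64000, 1460, 32, 8) == 4);

	printf("small=%u segs, large=%u segs\n",
	       small.tso_segs, large.tso_segs);
	return 0;
}

This is why tcp_fragment() and tcp_trim_head() in the patch must call tcp_set_skb_tso_segs() after any change to skb->len: the pcount would otherwise go stale and the lost_out/left_out adjustments around it would drift.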