X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fipv4%2Ftcp_output.c;h=39214126cf9efc0af485fd836a58e5ffd00c35f8;hb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;hp=bc5fba4a39fd0bbd578d21bc0aefd1c2be7f326d;hpb=9213980e6a70d8473e0ffd4b39ab5b6caaba9ff5;p=linux-2.6.git diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bc5fba4a3..39214126c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -45,15 +45,20 @@ /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse = 1; +/* This limits the percentage of the congestion window which we + * will allow a single TSO frame to consume. Building TSO frames + * which are too large can cause TCP streams to be bursty. + */ +int sysctl_tcp_tso_win_divisor = 8; + static __inline__ void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb) { - tp->send_head = skb->next; - if (tp->send_head == (struct sk_buff *)&sk->sk_write_queue) - tp->send_head = NULL; + sk->sk_send_head = skb->next; + if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) + sk->sk_send_head = NULL; tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - if (tp->packets_out++ == 0) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + tcp_packets_out_inc(sk, tp, skb); } /* SND.NXT, if window was not shrunk. @@ -123,7 +128,8 @@ static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *s { u32 now = tcp_time_stamp; - if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) + if (!tcp_get_pcount(&tp->packets_out) && + (s32)(now - tp->lsndtime) > tp->rto) tcp_cwnd_restart(tp, __sk_dst_get(sk)); tp->lsndtime = now; @@ -143,6 +149,65 @@ static __inline__ void tcp_event_ack_sent(struct sock *sk) tcp_clear_xmit_timer(sk, TCP_TIME_DACK); } +/* Determine a window scaling and initial window to offer. + * Based on the assumption that the given amount of space + * will be offered. Store the results in the tp structure. + * NOTE: for smooth operation initial space offering should + * be a multiple of mss if possible. We assume here that mss >= 1. + * This MUST be enforced by all callers. + */ +void tcp_select_initial_window(int __space, __u32 mss, + __u32 *rcv_wnd, __u32 *window_clamp, + int wscale_ok, __u8 *rcv_wscale) +{ + unsigned int space = (__space < 0 ? 0 : __space); + + /* If no clamp set the clamp to the max possible scaled window */ + if (*window_clamp == 0) + (*window_clamp) = (65535 << 14); + space = min(*window_clamp, space); + + /* Quantize space offering to a multiple of mss if possible. */ + if (space > mss) + space = (space / mss) * mss; + + /* NOTE: offering an initial window larger than 32767 + * will break some buggy TCP stacks. We try to be nice. + * If we are not window scaling, then this truncates + * our initial window offering to 32k. There should also + * be a sysctl option to stop being nice. + */ + (*rcv_wnd) = min(space, MAX_TCP_WINDOW); + (*rcv_wscale) = 0; + if (wscale_ok) { + /* Set window scaling on max possible window + * See RFC1323 for an explanation of the limit to 14 + */ + space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max); + while (space > 65535 && (*rcv_wscale) < 14) { + space >>= 1; + (*rcv_wscale)++; + } + } + + /* Set initial window to value enough for senders, + * following RFC1414. Senders, not following this RFC, + * will be satisfied with 2. 
+ */ + if (mss > (1<<*rcv_wscale)) { + int init_cwnd = 4; + if (mss > 1460*3) + init_cwnd = 2; + else if (mss > 1460) + init_cwnd = 3; + if (*rcv_wnd > init_cwnd*mss) + *rcv_wnd = init_cwnd*mss; + } + + /* Set the clamp no higher than max representable value */ + (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); +} + /* Chose a new window to advertise, update state in tcp_opt for the * socket, and return result with RFC1323 scaling applied. The return * value can be stuffed directly into th->window for an outgoing @@ -168,6 +233,14 @@ static __inline__ u16 tcp_select_window(struct sock *sk) tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; + /* Make sure we do not exceed the maximum possible + * scaled window. + */ + if (!tp->rcv_wscale) + new_win = min(new_win, MAX_TCP_WINDOW); + else + new_win = min(new_win, (65535U << tp->rcv_wscale)); + /* RFC1323 scaling applied */ new_win >>= tp->rcv_wscale; @@ -190,9 +263,9 @@ static __inline__ u16 tcp_select_window(struct sock *sk) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ -int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) { - if(skb != NULL) { + if (skb != NULL) { struct inet_opt *inet = inet_sk(sk); struct tcp_opt *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -201,6 +274,8 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) int sysctl_flags; int err; + BUG_ON(!tcp_skb_pcount(skb)); + #define SYSCTL_FLAG_TSTAMPS 0x1 #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 @@ -291,7 +366,7 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) if (skb->len != tcp_header_size) tcp_event_data_sent(tp, skb, sk); - TCP_INC_STATS(TcpOutSegs); + TCP_INC_STATS(TCP_MIB_OUTSEGS); err = tp->af_specific->queue_xmit(skb, 0); if (err <= 0) @@ -326,11 +401,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Advance write_seq and place onto the write_queue. */ tp->write_seq = TCP_SKB_CB(skb)->end_seq; __skb_queue_tail(&sk->sk_write_queue, skb); - tcp_charge_skb(sk, skb); + sk_charge_skb(sk, skb); /* Queue it, remembering where we must start sending. */ - if (tp->send_head == NULL) - tp->send_head = skb; + if (sk->sk_send_head == NULL) + sk->sk_send_head = skb; } /* Send _single_ skb sitting at the send head. This function requires @@ -339,82 +414,35 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) void tcp_push_one(struct sock *sk, unsigned cur_mss) { struct tcp_opt *tp = tcp_sk(sk); - struct sk_buff *skb = tp->send_head; + struct sk_buff *skb = sk->sk_send_head; if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) { /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) { - tp->send_head = NULL; + sk->sk_send_head = NULL; tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - if (tp->packets_out++ == 0) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + tcp_packets_out_inc(sk, tp, skb); return; } } } -/* Split fragmented skb to two parts at length len. */ - -static void skb_split(struct sk_buff *skb, struct sk_buff *skb1, u32 len) +void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std) { - int i; - int pos = skb_headlen(skb); - - if (len < pos) { - /* Split line is inside header. */ - memcpy(skb_put(skb1, pos-len), skb->data + len, pos-len); - - /* And move data appendix as is. 
*/ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; - - skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; - skb_shinfo(skb)->nr_frags = 0; - - skb1->data_len = skb->data_len; - skb1->len += skb1->data_len; - skb->data_len = 0; - skb->len = len; - skb->tail = skb->data+len; + if (skb->len <= mss_std) { + /* Avoid the costly divide in the normal + * non-TSO case. + */ + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; } else { - int k = 0; - int nfrags = skb_shinfo(skb)->nr_frags; - - /* Second chunk has no header, nothing to copy. */ - - skb_shinfo(skb)->nr_frags = 0; - skb1->len = skb1->data_len = skb->len - len; - skb->len = len; - skb->data_len = len - pos; - - for (i=0; ifrags[i].size; - if (pos + size > len) { - skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; - - if (pos < len) { - /* Split frag. - * We have to variants in this case: - * 1. Move all the frag to the second - * part, if it is possible. F.e. - * this approach is mandatory for TUX, - * where splitting is expensive. - * 2. Split is accurately. We make this. - */ - get_page(skb_shinfo(skb)->frags[i].page); - skb_shinfo(skb1)->frags[0].page_offset += (len-pos); - skb_shinfo(skb1)->frags[0].size -= (len-pos); - skb_shinfo(skb)->frags[i].size = len-pos; - skb_shinfo(skb)->nr_frags++; - } - k++; - } else { - skb_shinfo(skb)->nr_frags++; - } - pos += size; - } - skb_shinfo(skb1)->nr_frags = k; + unsigned int factor; + + factor = skb->len + (mss_std - 1); + factor /= mss_std; + skb_shinfo(skb)->tso_segs = factor; + skb_shinfo(skb)->tso_size = mss_std; } } @@ -436,10 +464,10 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) return -ENOMEM; /* Get a new skb... force flag on. */ - buff = tcp_alloc_skb(sk, nsize, GFP_ATOMIC); + buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); if (buff == NULL) return -ENOMEM; /* We'll just try again later. */ - tcp_charge_skb(sk, buff); + sk_charge_skb(sk, buff); /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; @@ -450,11 +478,9 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) flags = TCP_SKB_CB(skb)->flags; TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); TCP_SKB_CB(buff)->flags = flags; - TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); - if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) { - tp->lost_out++; - tp->left_out++; - } + TCP_SKB_CB(buff)->sacked = + (TCP_SKB_CB(skb)->sacked & + (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL)); TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL; if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) { @@ -477,6 +503,25 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { + tcp_dec_pcount(&tp->lost_out, skb); + tcp_dec_pcount(&tp->left_out, skb); + } + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(skb, tp->mss_cache_std); + tcp_set_skb_tso_segs(buff, tp->mss_cache_std); + + if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { + tcp_inc_pcount(&tp->lost_out, skb); + tcp_inc_pcount(&tp->left_out, skb); + } + + if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) { + tcp_inc_pcount(&tp->lost_out, buff); + tcp_inc_pcount(&tp->left_out, buff); + } + /* Link BUFF into the send queue. 
*/ __skb_append(skb, buff); @@ -487,7 +532,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) * eventually). The difference is that pulled data not copied, but * immediately discarded. */ -unsigned char * __pskb_trim_head(struct sk_buff *skb, int len) +static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len) { int i, k, eat; @@ -515,8 +560,10 @@ unsigned char * __pskb_trim_head(struct sk_buff *skb, int len) return skb->tail; } -static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) +int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { + struct tcp_opt *tp = tcp_sk(sk); + if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) return -ENOMEM; @@ -530,6 +577,17 @@ static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) TCP_SKB_CB(skb)->seq += len; skb->ip_summed = CHECKSUM_HW; + + skb->truesize -= len; + sk->sk_queue_shrunk = 1; + sk->sk_wmem_queued -= len; + sk->sk_forward_alloc += len; + + /* Any change of skb->len requires recalculation of tso + * factor and mss. + */ + tcp_set_skb_tso_segs(skb, tp->mss_cache_std); + return 0; } @@ -556,7 +614,7 @@ static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) this function. --ANK (980731) */ -int tcp_sync_mss(struct sock *sk, u32 pmtu) +unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_opt *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); @@ -592,23 +650,71 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) tp->pmtu_cookie = pmtu; tp->mss_cache = tp->mss_cache_std = mss_now; - if (sk->sk_route_caps & NETIF_F_TSO) { - int large_mss; + return mss_now; +} + +/* Compute the current effective MSS, taking SACKs and IP options, + * and even PMTU discovery events into account. + * + * LARGESEND note: !urg_mode is overkill, only frames up to snd_up + * cannot be large. However, taking into account rare use of URG, this + * is not a big flaw. + */ + +unsigned int tcp_current_mss(struct sock *sk, int large) +{ + struct tcp_opt *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + unsigned int do_large, mss_now; + + mss_now = tp->mss_cache_std; + if (dst) { + u32 mtu = dst_pmtu(dst); + if (mtu != tp->pmtu_cookie || + tp->ext2_header_len != dst->header_len) + mss_now = tcp_sync_mss(sk, mtu); + } + + do_large = (large && + (sk->sk_route_caps & NETIF_F_TSO) && + !tp->urg_mode); + + if (do_large) { + unsigned int large_mss, factor, limit; large_mss = 65535 - tp->af_specific->net_header_len - - tp->ext_header_len - tp->ext2_header_len - tp->tcp_header_len; + tp->ext_header_len - tp->ext2_header_len - + tp->tcp_header_len; if (tp->max_window && large_mss > (tp->max_window>>1)) - large_mss = max((tp->max_window>>1), 68U - tp->tcp_header_len); + large_mss = max((tp->max_window>>1), + 68U - tp->tcp_header_len); + + factor = large_mss / mss_now; + + /* Always keep large mss multiple of real mss, but + * do not exceed 1/tso_win_divisor of the congestion window + * so we can keep the ACK clock ticking and minimize + * bursting. + */ + limit = tp->snd_cwnd; + if (sysctl_tcp_tso_win_divisor) + limit /= sysctl_tcp_tso_win_divisor; + limit = max(1U, limit); + if (factor > limit) + factor = limit; - /* Always keep large mss multiple of real mss. */ - tp->mss_cache = mss_now*(large_mss/mss_now); + tp->mss_cache = mss_now * factor; + + mss_now = tp->mss_cache; } + if (tp->eff_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK)); return mss_now; } - /* This routine writes packets to the network. 
It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -636,8 +742,10 @@ int tcp_write_xmit(struct sock *sk, int nonagle) */ mss_now = tcp_current_mss(sk, 1); - while((skb = tp->send_head) && - tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)) { + while ((skb = sk->sk_send_head) && + tcp_snd_test(tp, skb, mss_now, + tcp_skb_is_last(sk, skb) ? nonagle : + TCP_NAGLE_PUSH)) { if (skb->len > mss_now) { if (tcp_fragment(sk, skb, mss_now)) break; @@ -646,8 +754,12 @@ int tcp_write_xmit(struct sock *sk, int nonagle) TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) break; - /* Advance the send_head. This one is sent out. */ + + /* Advance the send_head. This one is sent out. + * This call will increment packets_out. + */ update_send_head(sk, tp, skb); + tcp_minshall_update(tp, mss_now, skb); sent_pkts = 1; } @@ -657,7 +769,7 @@ int tcp_write_xmit(struct sock *sk, int nonagle) return 0; } - return !tp->packets_out && tp->send_head; + return !tcp_get_pcount(&tp->packets_out) && sk->sk_send_head; } return 0; } @@ -744,17 +856,32 @@ u32 __tcp_select_window(struct sock *sk) if (free_space > tp->rcv_ssthresh) free_space = tp->rcv_ssthresh; - /* Get the largest window that is a nice multiple of mss. - * Window clamp already applied above. - * If our current window offering is within 1 mss of the - * free space we just keep it. This prevents the divide - * and multiply from happening most of the time. - * We also don't do any window rounding when the free space - * is too small. + /* Don't do rounding if we are using window scaling, since the + * scaled window will not line up with the MSS boundary anyway. */ window = tp->rcv_wnd; - if (window <= free_space - mss || window > free_space) - window = (free_space/mss)*mss; + if (tp->rcv_wscale) { + window = free_space; + + /* Advertise enough space so that it won't get scaled away. + * Import case: prevent zero window announcement if + * 1< mss. + */ + if (((window >> tp->rcv_wscale) << tp->rcv_wscale) != window) + window = (((window >> tp->rcv_wscale) + 1) + << tp->rcv_wscale); + } else { + /* Get the largest window that is a nice multiple of mss. + * Window clamp already applied above. + * If our current window offering is within 1 mss of the + * free space we just keep it. This prevents the divide + * and multiply from happening most of the time. + * We also don't do any window rounding when the free space + * is too small. + */ + if (window <= free_space - mss || window > free_space) + window = (free_space/mss)*mss; + } return window; } @@ -768,7 +895,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m /* The first test we must make is that neither of these two * SKB's are still referenced by someone else. */ - if(!skb_cloned(skb) && !skb_cloned(next_skb)) { + if (!skb_cloned(skb) && !skb_cloned(next_skb)) { int skb_size = skb->len, next_skb_size = next_skb->len; u16 flags = TCP_SKB_CB(skb)->flags; @@ -788,6 +915,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m ((skb_size + next_skb_size) > mss_now)) return; + BUG_ON(tcp_skb_pcount(skb) != 1 || + tcp_skb_pcount(next_skb) != 1); + /* Ok. We will be able to collapse the packet. 
*/ __skb_unlink(next_skb, next_skb->list); @@ -811,24 +941,23 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m */ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS) - tp->retrans_out--; + tcp_dec_pcount(&tp->retrans_out, next_skb); if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) { - tp->lost_out--; - tp->left_out--; + tcp_dec_pcount(&tp->lost_out, next_skb); + tcp_dec_pcount(&tp->left_out, next_skb); } /* Reno case is special. Sigh... */ - if (!tp->sack_ok && tp->sacked_out) { - tp->sacked_out--; - tp->left_out--; + if (!tp->sack_ok && tcp_get_pcount(&tp->sacked_out)) { + tcp_dec_pcount_approx(&tp->sacked_out, next_skb); + tcp_dec_pcount(&tp->left_out, next_skb); } /* Not quite right: it can be > snd.fack, but * it is better to underestimate fackets. */ - if (tp->fackets_out) - tp->fackets_out--; - tcp_free_skb(sk, next_skb); - tp->packets_out--; + tcp_dec_pcount_approx(&tp->fackets_out, next_skb); + tcp_packets_out_dec(tp, next_skb); + sk_stream_free_skb(sk, next_skb); } } @@ -843,16 +972,16 @@ void tcp_simple_retransmit(struct sock *sk) unsigned int mss = tcp_current_mss(sk, 0); int lost = 0; - for_retrans_queue(skb, sk, tp) { + sk_stream_for_retrans_queue(skb, sk) { if (skb->len > mss && !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out--; + tcp_dec_pcount(&tp->retrans_out, skb); } if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out++; + tcp_inc_pcount(&tp->lost_out, skb); lost = 1; } } @@ -918,18 +1047,24 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) && TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; - if(skb->len > cur_mss) { - if(tcp_fragment(sk, skb, cur_mss)) + if (skb->len > cur_mss) { + int old_factor = tcp_skb_pcount(skb); + int new_factor; + + if (tcp_fragment(sk, skb, cur_mss)) return -ENOMEM; /* We'll try again later. */ /* New SKB created, account for it. */ - tp->packets_out++; + new_factor = tcp_skb_pcount(skb); + tcp_dec_pcount_explicit(&tp->packets_out, + old_factor - new_factor); + tcp_inc_pcount(&tp->packets_out, skb->next); } /* Collapse two adjacent packets if worthwhile and we can. */ if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && (skb->len < (cur_mss >> 1)) && - (skb->next != tp->send_head) && + (skb->next != sk->sk_send_head) && (skb->next != (struct sk_buff *)&sk->sk_write_queue) && (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) && (sysctl_tcp_retrans_collapse != 0)) @@ -947,6 +1082,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; skb->ip_summed = CHECKSUM_NONE; skb->csum = 0; } @@ -963,7 +1100,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (err == 0) { /* Update global TCP statistics. */ - TCP_INC_STATS(TcpRetransSegs); + TCP_INC_STATS(TCP_MIB_RETRANSSEGS); #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { @@ -972,7 +1109,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) } #endif TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; - tp->retrans_out++; + tcp_inc_pcount(&tp->retrans_out, skb); /* Save stamp of the first retransmit. 
*/ if (!tp->retrans_stamp) @@ -1000,13 +1137,20 @@ void tcp_xmit_retransmit_queue(struct sock *sk) { struct tcp_opt *tp = tcp_sk(sk); struct sk_buff *skb; - int packet_cnt = tp->lost_out; + int packet_cnt = tcp_get_pcount(&tp->lost_out); /* First pass: retransmit lost packets. */ if (packet_cnt) { - for_retrans_queue(skb, sk, tp) { + sk_stream_for_retrans_queue(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + /* Assume this retransmit will generate + * only one packet for congestion window + * calculation purposes. This works because + * tcp_retransmit_skb() will chop up the + * packet to be MSS sized and all the + * packet counting works out. + */ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return; @@ -1015,16 +1159,17 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_retransmit_skb(sk, skb)) return; if (tp->ca_state != TCP_CA_Loss) - NET_INC_STATS_BH(TCPFastRetrans); + NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); else - NET_INC_STATS_BH(TCPSlowStartRetrans); + NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); if (skb == skb_peek(&sk->sk_write_queue)) tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } - if (--packet_cnt <= 0) + packet_cnt -= tcp_skb_pcount(skb); + if (packet_cnt <= 0) break; } } @@ -1043,7 +1188,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) /* Yeah, we have to make difficult choice between forward transmission * and retransmission... Both ways have their merits... * - * For now we do not retrnamsit anything, while we have some new + * For now we do not retransmit anything, while we have some new * segments to send. */ @@ -1052,24 +1197,30 @@ void tcp_xmit_retransmit_queue(struct sock *sk) packet_cnt = 0; - for_retrans_queue(skb, sk, tp) { - if(++packet_cnt > tp->fackets_out) + sk_stream_for_retrans_queue(skb, sk) { + /* Similar to the retransmit loop above we + * can pretend that the retransmitted SKB + * we send out here will be composed of one + * real MSS sized packet because tcp_retransmit_skb() + * will fragment it if necessary. + */ + if (++packet_cnt > tcp_get_pcount(&tp->fackets_out)) break; if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) break; - if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) + if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) continue; /* Ok, retransmit it. */ - if(tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) break; if (skb == skb_peek(&sk->sk_write_queue)) tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); - NET_INC_STATS_BH(TCPForwardRetrans); + NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); } } @@ -1081,15 +1232,15 @@ void tcp_send_fin(struct sock *sk) { struct tcp_opt *tp = tcp_sk(sk); struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue); - unsigned int mss_now; + int mss_now; /* Optimization, tack on the FIN if we have a queue of * unsent frames. But be careful about outgoing SACKS * and IP options. */ - mss_now = tcp_current_mss(sk, 1); + mss_now = tcp_current_mss(sk, 1); - if(tp->send_head != NULL) { + if (sk->sk_send_head != NULL) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; TCP_SKB_CB(skb)->end_seq++; tp->write_seq++; @@ -1107,6 +1258,8 @@ void tcp_send_fin(struct sock *sk) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); TCP_SKB_CB(skb)->sacked = 0; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ TCP_SKB_CB(skb)->seq = tp->write_seq; @@ -1129,7 +1282,7 @@ void tcp_send_active_reset(struct sock *sk, int priority) /* NOTE: No TCP options attached and we never retransmit this. 
*/ skb = alloc_skb(MAX_TCP_HEADER, priority); if (!skb) { - NET_INC_STATS(TCPAbortFailed); + NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); return; } @@ -1138,13 +1291,15 @@ void tcp_send_active_reset(struct sock *sk, int priority) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); TCP_SKB_CB(skb)->sacked = 0; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; /* Send it off. */ TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb)) - NET_INC_STATS(TCPAbortFailed); + NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); } /* WARNING: This routine must only be called when we have already sent @@ -1168,8 +1323,8 @@ int tcp_send_synack(struct sock *sk) return -ENOMEM; __skb_unlink(skb, &sk->sk_write_queue); __skb_queue_head(&sk->sk_write_queue, nskb); - tcp_free_skb(sk, skb); - tcp_charge_skb(sk, nskb); + sk_stream_free_skb(sk, skb); + sk_charge_skb(sk, nskb); skb = nskb; } @@ -1217,6 +1372,9 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->dest = req->rmt_port; TCP_SKB_CB(skb)->seq = req->snt_isn; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + TCP_SKB_CB(skb)->sacked = 0; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(req->rcv_isn + 1); if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ @@ -1244,7 +1402,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb->csum = 0; th->doff = (tcp_header_size >> 2); - TCP_INC_STATS(TcpOutSegs); + TCP_INC_STATS(TCP_MIB_OUTSEGS); return skb; } @@ -1272,7 +1430,7 @@ static inline void tcp_connect_init(struct sock *sk) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(sk); - tcp_vegas_init(tp); + tcp_ca_init(tp); tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), @@ -1318,21 +1476,23 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; TCP_ECN_send_syn(sk, tp, buff); TCP_SKB_CB(buff)->sacked = 0; + skb_shinfo(buff)->tso_segs = 1; + skb_shinfo(buff)->tso_size = 0; buff->csum = 0; TCP_SKB_CB(buff)->seq = tp->write_seq++; TCP_SKB_CB(buff)->end_seq = tp->write_seq; tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; - tcp_vegas_init(tp); + tcp_ca_init(tp); /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; tp->retrans_stamp = TCP_SKB_CB(buff)->when; __skb_queue_tail(&sk->sk_write_queue, buff); - tcp_charge_skb(sk, buff); - tp->packets_out++; + sk_charge_skb(sk, buff); + tcp_inc_pcount(&tp->packets_out, buff); tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); - TCP_INC_STATS(TcpActiveOpens); + TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); @@ -1417,6 +1577,8 @@ void tcp_send_ack(struct sock *sk) buff->csum = 0; TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(buff)->sacked = 0; + skb_shinfo(buff)->tso_segs = 1; + skb_shinfo(buff)->tso_size = 0; /* Send it off, this clears delayed acks for us. 
*/ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); @@ -1451,6 +1613,8 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) skb->csum = 0; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = urgent; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just @@ -1468,11 +1632,11 @@ int tcp_write_wakeup(struct sock *sk) struct tcp_opt *tp = tcp_sk(sk); struct sk_buff *skb; - if ((skb = tp->send_head) != NULL && + if ((skb = sk->sk_send_head) != NULL && before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { int err; - int mss = tcp_current_mss(sk, 0); - int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; + unsigned int mss = tcp_current_mss(sk, 0); + unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; @@ -1494,7 +1658,9 @@ int tcp_write_wakeup(struct sock *sk) sk->sk_route_caps &= ~NETIF_F_TSO; tp->mss_cache = tp->mss_cache_std; } - } + } else if (!tcp_skb_pcount(skb)) + tcp_set_skb_tso_segs(skb, tp->mss_cache_std); + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); @@ -1522,7 +1688,7 @@ void tcp_send_probe0(struct sock *sk) err = tcp_write_wakeup(sk); - if (tp->packets_out || !tp->send_head) { + if (tcp_get_pcount(&tp->packets_out) || !sk->sk_send_head) { /* Cancel probe timer, if it is not required. */ tp->probes_out = 0; tp->backoff = 0; @@ -1556,6 +1722,5 @@ EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_send_synack); EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_sync_mss); -EXPORT_SYMBOL(tcp_transmit_skb); EXPORT_SYMBOL(tcp_write_wakeup); EXPORT_SYMBOL(tcp_write_xmit);
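
Note on the TSO accounting added above: the new sysctl_tcp_tso_win_divisor and tcp_set_skb_tso_segs() together decide how many real MSS-sized segments a queued skb represents and how large a single TSO "super-packet" may grow relative to the congestion window. The following standalone sketch (not kernel code) mirrors that arithmetic; the struct tso_example type, the tso_segs_for_len()/tso_frame_size() helpers, and the sample MSS/cwnd values are illustrative assumptions, not part of the patch.

/*
 * Sketch of the patch's TSO segment accounting.  Compile with any C
 * compiler; all names and values here are illustrative only.
 */
#include <stdio.h>

struct tso_example {
	unsigned int mss_std;	/* real per-segment MSS */
	unsigned int snd_cwnd;	/* congestion window, in packets */
};

static unsigned int tso_win_divisor = 8;	/* mirrors the new sysctl's default */

/* How many real MSS-sized segments a buffer of 'len' bytes represents;
 * this is the quantity tcp_set_skb_tso_segs() records for an skb.
 */
static unsigned int tso_segs_for_len(unsigned int len, unsigned int mss_std)
{
	if (len <= mss_std)
		return 1;			/* avoid the divide in the common case */
	return (len + mss_std - 1) / mss_std;	/* round up */
}

/* Largest TSO frame the sender will build: a multiple of the real MSS,
 * capped at snd_cwnd / tso_win_divisor segments so that one frame cannot
 * burst out a large share of the congestion window.
 */
static unsigned int tso_frame_size(const struct tso_example *tp,
				   unsigned int large_mss)
{
	unsigned int factor = large_mss / tp->mss_std;
	unsigned int limit = tp->snd_cwnd;

	if (tso_win_divisor)
		limit /= tso_win_divisor;
	if (limit < 1)
		limit = 1;
	if (factor > limit)
		factor = limit;
	return tp->mss_std * factor;
}

int main(void)
{
	struct tso_example tp = { .mss_std = 1448, .snd_cwnd = 40 };
	unsigned int large_mss = 65535 - 40 - 20;	/* hypothetical header overhead */
	unsigned int frame = tso_frame_size(&tp, large_mss);

	printf("TSO frame: %u bytes = %u segments (cwnd %u, divisor %u)\n",
	       frame, tso_segs_for_len(frame, tp.mss_std),
	       tp.snd_cwnd, tso_win_divisor);
	return 0;
}

With the sample values this prints a 7240-byte frame, i.e. 5 segments out of a 40-packet window. The real tcp_current_mss() additionally caps large_mss at half of max_window, which is omitted here for brevity, and the patch re-runs tcp_set_skb_tso_segs() whenever skb->len changes (see tcp_fragment() and tcp_trim_head() above) so the per-skb segment count stays consistent with packets_out.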
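A second sketch (again not kernel code, and simplified) illustrates the receive-window setup performed by tcp_select_initial_window() in the patch: round the offered space to a multiple of the MSS, keep the unscaled advertisement under 32 KB, and pick the smallest RFC 1323 shift that can represent the buffer in a 16-bit window field. The select_initial_window() helper, the 256 KB buffer, and the MSS of 1448 are assumptions for the example; unlike the patch, which derives the shift from the system-wide maximum receive buffer (sysctl_tcp_rmem[2] / sysctl_rmem_max), this sketch derives it from the offered space itself.

#include <stdio.h>

#define MAX_TCP_WINDOW	32767U	/* largest window offered without scaling */

static void select_initial_window(unsigned int space, unsigned int mss,
				  unsigned int *rcv_wnd, unsigned int *rcv_wscale)
{
	/* Quantize the offered space to a multiple of the MSS. */
	if (space > mss)
		space = (space / mss) * mss;

	/* Be nice to buggy stacks: no more than 32 KB unscaled. */
	*rcv_wnd = space < MAX_TCP_WINDOW ? space : MAX_TCP_WINDOW;

	/* Smallest shift that fits the space into 16 bits; RFC 1323
	 * limits the window scale option to 14.
	 */
	*rcv_wscale = 0;
	while (space > 65535 && *rcv_wscale < 14) {
		space >>= 1;
		(*rcv_wscale)++;
	}
}

int main(void)
{
	unsigned int rcv_wnd, wscale;

	select_initial_window(256 * 1024, 1448, &rcv_wnd, &wscale);
	printf("initial rcv_wnd=%u, wscale=%u\n", rcv_wnd, wscale);
	return 0;
}

Later advertisements are chosen by tcp_select_window()/__tcp_select_window(); the hunks above add the corresponding clamp to 65535 << rcv_wscale and, when scaling is in use, round the advertised window up to a scale-aligned value so a non-zero window cannot be scaled away to zero.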