X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fipv4%2Ftcp_output.c;h=340bcdda6d7a11c636c7860816b844317010881b;hb=refs%2Fheads%2Fvserver;hp=5dc3118dffef4162e22e5f148a415dddf1851f9f;hpb=16c70f8c1b54b61c3b951b6fb220df250fe09b32;p=linux-2.6.git diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5dc3118df..340bcdda6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -43,24 +43,24 @@ #include /* People can turn this off for buggy TCP's found in printers etc. */ -int sysctl_tcp_retrans_collapse = 1; +int sysctl_tcp_retrans_collapse __read_mostly = 1; /* People can turn this on to work with those rare, broken TCPs that * interpret the window field as a signed quantity. */ -int sysctl_tcp_workaround_signed_windows = 0; +int sysctl_tcp_workaround_signed_windows __read_mostly = 0; /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. */ -int sysctl_tcp_tso_win_divisor = 3; +int sysctl_tcp_tso_win_divisor __read_mostly = 3; -int sysctl_tcp_mtu_probing = 0; -int sysctl_tcp_base_mss = 512; +int sysctl_tcp_mtu_probing __read_mostly = 0; +int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ -int sysctl_tcp_slow_start_after_idle = 1; +int sysctl_tcp_slow_start_after_idle __read_mostly = 1; static void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) @@ -269,14 +269,14 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } -static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, - __u32 tstamp) +static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, + __u32 tstamp, __u8 **md5_hash) { if (tp->rx_opt.tstamp_ok) { - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | - TCPOLEN_TIMESTAMP); + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); *ptr++ = htonl(tstamp); *ptr++ = htonl(tp->rx_opt.ts_recent); } @@ -298,16 +298,29 @@ static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, tp->rx_opt.eff_sacks--; } } +#ifdef CONFIG_TCP_MD5SIG + if (md5_hash) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | + TCPOLEN_MD5SIG); + *md5_hash = (__u8 *)ptr; + } +#endif } /* Construct a tcp options header for a SYN or SYN_ACK packet. * If this is every changed make sure to change the definition of * MAX_SYN_SIZE to match the new maximum number of options that you * can generate. + * + * Note - that with the RFC2385 TCP option, we make room for the + * 16 byte MD5 hash. This will be filled in later, so the pointer for the + * location to be filled is passed back up. */ -static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack, +static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, int offer_wscale, int wscale, __u32 tstamp, - __u32 ts_recent) + __u32 ts_recent, __u8 **md5_hash) { /* We always get an MSS option. * The option bytes which will be seen in normal data @@ -325,18 +338,41 @@ static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack, *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); if (ts) { if(sack) - *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | - (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | + (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); else - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); *ptr++ = htonl(tstamp); /* TSVAL */ *ptr++ = htonl(ts_recent); /* TSECR */ } else if(sack) - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | - (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | + TCPOLEN_SACK_PERM); if (offer_wscale) - *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale)); + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | + (wscale)); +#ifdef CONFIG_TCP_MD5SIG + /* + * If MD5 is enabled, then we set the option, and include the size + * (always 18). The actual MD5 hash is added just before the + * packet is sent. + */ + if (md5_hash) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | + TCPOLEN_MD5SIG); + *md5_hash = (__u8 *) ptr; + } +#endif } /* This routine actually transmits TCP packets queued in by @@ -357,6 +393,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, struct tcp_sock *tp; struct tcp_skb_cb *tcb; int tcp_header_size; +#ifdef CONFIG_TCP_MD5SIG + struct tcp_md5sig_key *md5; + __u8 *md5_hash_location; +#endif struct tcphdr *th; int sysctl_flags; int err; @@ -415,6 +455,16 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); +#ifdef CONFIG_TCP_MD5SIG + /* + * Are we doing MD5 on this segment? If so - make + * room for it. + */ + md5 = tp->af_specific->md5_lookup(sk, sk); + if (md5) + tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; +#endif + th = (struct tcphdr *) skb_push(skb, tcp_header_size); skb->h.th = th; skb_set_owner_w(skb, sk); @@ -424,14 +474,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->dest = inet->dport; th->seq = htonl(tcb->seq); th->ack_seq = htonl(tp->rcv_nxt); - *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | + *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. */ - th->window = htons(tp->rcv_wnd); + th->window = htons(min(tp->rcv_wnd, 65535U)); } else { th->window = htons(tcp_select_window(sk)); } @@ -445,20 +495,41 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { - tcp_syn_build_options((__u32 *)(th + 1), + tcp_syn_build_options((__be32 *)(th + 1), tcp_advertise_mss(sk), (sysctl_flags & SYSCTL_FLAG_TSTAMPS), (sysctl_flags & SYSCTL_FLAG_SACK), (sysctl_flags & SYSCTL_FLAG_WSCALE), tp->rx_opt.rcv_wscale, tcb->when, - tp->rx_opt.ts_recent); + tp->rx_opt.ts_recent, + +#ifdef CONFIG_TCP_MD5SIG + md5 ? &md5_hash_location : +#endif + NULL); } else { - tcp_build_and_update_options((__u32 *)(th + 1), - tp, tcb->when); + tcp_build_and_update_options((__be32 *)(th + 1), + tp, tcb->when, +#ifdef CONFIG_TCP_MD5SIG + md5 ? &md5_hash_location : +#endif + NULL); TCP_ECN_send(sk, tp, skb, tcp_header_size); } +#ifdef CONFIG_TCP_MD5SIG + /* Calculate the MD5 hash, as we have all we need now */ + if (md5) { + tp->af_specific->calc_md5_hash(md5_hash_location, + md5, + sk, NULL, NULL, + skb->h.th, + sk->sk_protocol, + skb->len); + } +#endif + icsk->icsk_af_ops->send_check(sk, skb->len, skb); if (likely(tcb->flags & TCPCB_FLAG_ACK)) @@ -476,13 +547,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_enter_cwr(sk); - /* NET_XMIT_CN is special. It does not guarantee, - * that this packet is lost. It tells that device - * is about to start to drop packets or already - * drops some packets of the same priority and - * invokes us to send less aggressively. - */ - return err == NET_XMIT_CN ? 0 : err; + return net_xmit_eval(err); #undef SYSCTL_FLAG_TSTAMPS #undef SYSCTL_FLAG_WSCALE @@ -577,7 +642,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL; - if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) { + if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { /* Copy and checksum data tail into the new buffer. */ buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize), nsize, 0); @@ -586,7 +651,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss skb->csum = csum_block_sub(skb->csum, buff->csum, len); } else { - skb->ip_summed = CHECKSUM_HW; + skb->ip_summed = CHECKSUM_PARTIAL; skb_split(skb, buff, len); } @@ -689,7 +754,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) __pskb_trim_head(skb, len - skb_headlen(skb)); TCP_SKB_CB(skb)->seq += len; - skb->ip_summed = CHECKSUM_HW; + skb->ip_summed = CHECKSUM_PARTIAL; skb->truesize -= len; sk->sk_wmem_queued -= len; @@ -838,6 +903,11 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); +#ifdef CONFIG_TCP_MD5SIG + if (tp->af_specific->md5_lookup(sk, sk)) + mss_now -= TCPOLEN_MD5SIG_ALIGNED; +#endif + xmit_size_goal = mss_now; if (doing_tso) { @@ -858,8 +928,6 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) return mss_now; } -EXPORT_SYMBOL_GPL(tcp_current_mss); - /* Congestion window validation. (RFC2861) */ static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) @@ -875,7 +943,8 @@ static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) + if (sysctl_tcp_slow_start_after_idle && + (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) tcp_cwnd_application_limited(sk); } } @@ -897,7 +966,8 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk u32 in_flight, cwnd; /* Don't be strict about the congestion window for the final FIN. */ - if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + tcp_skb_pcount(skb) == 1) return 1; in_flight = tcp_packets_in_flight(tp); @@ -1064,7 +1134,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* This packet was never sent out yet, so no SACK bits. */ TCP_SKB_CB(buff)->sacked = 0; - buff->ip_summed = skb->ip_summed = CHECKSUM_HW; + buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL; skb_split(skb, buff, len); /* Fix up tso_factor for both original and new SKB. */ @@ -1089,10 +1159,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ u32 send_win, cong_win, limit, in_flight; if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) - return 0; + goto send_now; if (icsk->icsk_ca_state != TCP_CA_Open) - return 0; + goto send_now; + + /* Defer for less than two clock ticks. */ + if (!tp->tso_deferred && ((jiffies<<1)>>1) - (tp->tso_deferred>>1) > 1) + goto send_now; in_flight = tcp_packets_in_flight(tp); @@ -1108,7 +1182,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ /* If a full-sized TSO skb can be sent, do it. */ if (limit >= 65536) - return 0; + goto send_now; if (sysctl_tcp_tso_win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); @@ -1118,7 +1192,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ */ chunk /= sysctl_tcp_tso_win_divisor; if (limit >= chunk) - return 0; + goto send_now; } else { /* Different approach, try not to defer past a single * ACK. Receiver should ACK every other full sized @@ -1126,11 +1200,17 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ * then send now. */ if (limit > tcp_max_burst(tp) * tp->mss_cache) - return 0; + goto send_now; } /* Ok, it looks like it is advisable to defer. */ + tp->tso_deferred = 1 | (jiffies<<1); + return 1; + +send_now: + tp->tso_deferred = 0; + return 0; } /* Create a new MTU probe if we are ready. @@ -1208,8 +1288,7 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(nskb)->sacked = 0; nskb->csum = 0; - if (skb->ip_summed == CHECKSUM_HW) - nskb->ip_summed = CHECKSUM_HW; + nskb->ip_summed = skb->ip_summed; len = 0; while (len < probe_size) { @@ -1233,7 +1312,7 @@ static int tcp_mtu_probe(struct sock *sk) ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); if (!skb_shinfo(skb)->nr_frags) { skb_pull(skb, copy); - if (skb->ip_summed != CHECKSUM_HW) + if (skb->ip_summed != CHECKSUM_PARTIAL) skb->csum = csum_partial(skb->data, skb->len, 0); } else { __pskb_trim_head(skb, copy); @@ -1373,7 +1452,6 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, tcp_check_probe_timer(sk, tp); } } -EXPORT_SYMBOL_GPL(__tcp_push_pending_frames); /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. @@ -1530,6 +1608,9 @@ u32 __tcp_select_window(struct sock *sk) */ if (window <= free_space - mss || window > free_space) window = (free_space/mss)*mss; + else if (mss == full_space && + free_space > window + full_space/2) + window = free_space; } return window; @@ -1575,10 +1656,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); - if (next_skb->ip_summed == CHECKSUM_HW) - skb->ip_summed = CHECKSUM_HW; + if (next_skb->ip_summed == CHECKSUM_PARTIAL) + skb->ip_summed = CHECKSUM_PARTIAL; - if (skb->ip_summed != CHECKSUM_HW) + if (skb->ip_summed != CHECKSUM_PARTIAL) skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); /* Update sequence range on original skb. */ @@ -2026,6 +2107,10 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct tcphdr *th; int tcp_header_size; struct sk_buff *skb; +#ifdef CONFIG_TCP_MD5SIG + struct tcp_md5sig_key *md5; + __u8 *md5_hash_location; +#endif skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (skb == NULL) @@ -2041,6 +2126,13 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + /* SACK_PERM is in the place of NOP NOP of TS */ ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); + +#ifdef CONFIG_TCP_MD5SIG + /* Are we doing MD5 on this segment? If so - make room for it */ + md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); + if (md5) + tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; +#endif skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); memset(th, 0, sizeof(struct tcphdr)); @@ -2072,17 +2164,35 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, } /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ - th->window = htons(req->rcv_wnd); + th->window = htons(min(req->rcv_wnd, 65535U)); TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok, + tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok, ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale, TCP_SKB_CB(skb)->when, - req->ts_recent); + req->ts_recent, + ( +#ifdef CONFIG_TCP_MD5SIG + md5 ? &md5_hash_location : +#endif + NULL) + ); skb->csum = 0; th->doff = (tcp_header_size >> 2); TCP_INC_STATS(TCP_MIB_OUTSEGS); + +#ifdef CONFIG_TCP_MD5SIG + /* Okay, we have all we need - do the md5 hash if needed */ + if (md5) { + tp->af_specific->calc_md5_hash(md5_hash_location, + md5, + NULL, dst, req, + skb->h.th, sk->sk_protocol, + skb->len); + } +#endif + return skb; } @@ -2101,6 +2211,11 @@ static void tcp_connect_init(struct sock *sk) tp->tcp_header_len = sizeof(struct tcphdr) + (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); +#ifdef CONFIG_TCP_MD5SIG + if (tp->af_specific->md5_lookup(sk, sk) != NULL) + tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; +#endif + /* If user gave his TCP_MAXSEG, record it to clamp */ if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;