X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fipv4%2Ftcp_input.c;h=c26076fb890e5f935631dc4e45f96b90cee15521;hb=refs%2Fheads%2Fvserver;hp=0d3f31f6f16cf31f508530db7495b5843507a2ec;hpb=76828883507a47dae78837ab5dec5a5b4513c667;p=linux-2.6.git

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0d3f31f6f..c26076fb8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -63,7 +63,6 @@
  *	Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
  */

-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
@@ -71,25 +70,26 @@
 #include <net/inet_common.h>
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
-
-int sysctl_tcp_timestamps = 1;
-int sysctl_tcp_window_scaling = 1;
-int sysctl_tcp_sack = 1;
-int sysctl_tcp_fack = 1;
-int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
-int sysctl_tcp_ecn;
-int sysctl_tcp_dsack = 1;
-int sysctl_tcp_app_win = 31;
-int sysctl_tcp_adv_win_scale = 2;
-
-int sysctl_tcp_stdurg;
-int sysctl_tcp_rfc1337;
-int sysctl_tcp_max_orphans = NR_FILE;
-int sysctl_tcp_frto;
-int sysctl_tcp_nometrics_save;
-
-int sysctl_tcp_moderate_rcvbuf = 1;
-int sysctl_tcp_abc = 1;
+#include <net/netdma.h>
+
+int sysctl_tcp_timestamps __read_mostly = 1;
+int sysctl_tcp_window_scaling __read_mostly = 1;
+int sysctl_tcp_sack __read_mostly = 1;
+int sysctl_tcp_fack __read_mostly = 1;
+int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
+int sysctl_tcp_ecn __read_mostly;
+int sysctl_tcp_dsack __read_mostly = 1;
+int sysctl_tcp_app_win __read_mostly = 31;
+int sysctl_tcp_adv_win_scale __read_mostly = 2;
+
+int sysctl_tcp_stdurg __read_mostly;
+int sysctl_tcp_rfc1337 __read_mostly;
+int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
+int sysctl_tcp_frto __read_mostly;
+int sysctl_tcp_nometrics_save __read_mostly;
+
+int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
+int sysctl_tcp_abc __read_mostly;

 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
@@ -127,7 +127,7 @@ static void tcp_measure_rcv_mss(struct sock *sk,
 	/* skb->len may jitter because of SACKs, even if peer
 	 * sends good full-sized frames.
 	 */
-	len = skb->len;
+	len = skb_shinfo(skb)->gso_size ?: skb->len;
 	if (len >= icsk->icsk_ack.rcv_mss) {
 		icsk->icsk_ack.rcv_mss = len;
 	} else {
@@ -156,6 +156,8 @@ static void tcp_measure_rcv_mss(struct sock *sk,
 				return;
 			}
 		}
+		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
+			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
 		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
 	}
 }
@@ -933,7 +935,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
-	struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
+	struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2);
 	int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
 	int reord = tp->packets_out;
 	int prior_fackets;
@@ -1009,10 +1011,11 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 			for (j = 0; j < i; j++){
 				if (after(ntohl(sp[j].start_seq),
 					  ntohl(sp[j+1].start_seq))){
-					sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq);
-					sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq);
-					sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq);
-					sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq);
+					struct tcp_sack_block_wire tmp;
+
+					tmp = sp[j];
+					sp[j] = sp[j+1];
+					sp[j+1] = tmp;
 				}
 			}

@@ -1072,7 +1075,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 				else
 					pkt_len = (end_seq -
 						   TCP_SKB_CB(skb)->seq);
-				if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size))
+				if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
 					break;
 				pcount = tcp_skb_pcount(skb);
 			}
@@ -1649,7 +1652,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
 	 * Hence, we can detect timed out packets during fast
 	 * retransmit without falling to slow start.
 	 */
-	if (tcp_head_timedout(sk, tp)) {
+	if (!IsReno(tp) && tcp_head_timedout(sk, tp)) {
 		struct sk_buff *skb;

 		skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
@@ -1688,17 +1691,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }

+/* Lower bound on congestion window is slow start threshold
+ * unless congestion avoidance choice decides to overide it.
+ */
+static inline u32 tcp_cwnd_min(const struct sock *sk)
+{
+	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+
+	return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
+}
+
 /* Decrease cwnd each second ack. */
 static void tcp_cwnd_down(struct sock *sk)
 {
-	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int decr = tp->snd_cwnd_cnt + 1;

 	tp->snd_cwnd_cnt = decr&1;
 	decr >>= 1;

-	if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
+	if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
 		tp->snd_cwnd -= decr;

 	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1891,6 +1903,34 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
 	}
 }

+static void tcp_mtup_probe_failed(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
+	icsk->icsk_mtup.probe_size = 0;
+}
+
+static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	/* FIXME: breaks with very large cwnd */
+	tp->prior_ssthresh = tcp_current_ssthresh(sk);
+	tp->snd_cwnd = tp->snd_cwnd *
+		       tcp_mss_to_mtu(sk, tp->mss_cache) /
+		       icsk->icsk_mtup.probe_size;
+	tp->snd_cwnd_cnt = 0;
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+
+	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
+	icsk->icsk_mtup.probe_size = 0;
+	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+}
+
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -2023,6 +2063,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		return;
 	}

+	/* MTU probe failure: don't reduce cwnd */
+	if (icsk->icsk_ca_state < TCP_CA_CWR &&
+	    icsk->icsk_mtup.probe_size &&
+	    tp->snd_una == tp->mtu_probe.probe_seq_start) {
+		tcp_mtup_probe_failed(sk);
+		/* Restores the reduction we did in tcp_mtup_probe() */
+		tp->snd_cwnd++;
+		tcp_simple_retransmit(sk);
+		return;
+	}
+
 	/* Otherwise enter Recovery state */

 	if (IsReno(tp))
@@ -2209,7 +2260,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 	u32 pkts_acked = 0;
 	void (*rtt_sample)(struct sock *sk, u32 usrtt)
 		= icsk->icsk_ca_ops->rtt_sample;
-	struct timeval tv;
+	struct timeval tv = { .tv_sec = 0, .tv_usec = 0 };

 	while ((skb = skb_peek(&sk->sk_write_queue)) &&
 	       skb != sk->sk_send_head) {
@@ -2243,6 +2294,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 				tp->retrans_stamp = 0;
 		}

+		/* MTU probing checks */
+		if (icsk->icsk_mtup.probe_size) {
+			if (!after(tp->mtu_probe.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
+				tcp_mtup_probe_success(sk, skb);
+			}
+		}
+
 		if (sacked) {
 			if (sacked & TCPCB_RETRANS) {
 				if(sacked & TCPCB_SACKED_RETRANS)
@@ -2450,8 +2508,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	if (before(ack, prior_snd_una))
 		goto old_ack;

-	if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR)
-		tp->bytes_acked += ack - prior_snd_una;
+	if (sysctl_tcp_abc) {
+		if (icsk->icsk_ca_state < TCP_CA_CWR)
+			tp->bytes_acked += ack - prior_snd_una;
+		else if (icsk->icsk_ca_state == TCP_CA_Loss)
+			/* we assume just one segment left network */
+			tp->bytes_acked += min(ack - prior_snd_una, tp->mss_cache);
+	}

 	if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
 		/* Window is constant, pure forward advance.
@@ -2567,7 +2630,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 		switch(opcode) {
 		case TCPOPT_MSS:
 			if(opsize==TCPOLEN_MSS && th->syn && !estab) {
-				u16 in_mss = ntohs(get_unaligned((__u16 *)ptr));
+				u16 in_mss = ntohs(get_unaligned((__be16 *)ptr));
 				if (in_mss) {
 					if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
 						in_mss = opt_rx->user_mss;
@@ -2595,8 +2658,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 			if ((estab && opt_rx->tstamp_ok) ||
 			    (!estab && sysctl_tcp_timestamps)) {
 				opt_rx->saw_tstamp = 1;
-				opt_rx->rcv_tsval = ntohl(get_unaligned((__u32 *)ptr));
-				opt_rx->rcv_tsecr = ntohl(get_unaligned((__u32 *)(ptr+4)));
+				opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr));
+				opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4)));
 			}
 		}
 		break;
@@ -2615,6 +2678,14 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 			    opt_rx->sack_ok) {
 				TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
 			}
+#ifdef CONFIG_TCP_MD5SIG
+		case TCPOPT_MD5SIG:
+			/*
+			 * The MD5 Hash has already been
+			 * checked (see tcp_v{4,6}_do_rcv()).
+			 */
+			break;
+#endif
 		};
 		ptr+=opsize-2;
 		length-=opsize;
@@ -2633,8 +2704,8 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
 		return 0;
 	} else if (tp->rx_opt.tstamp_ok &&
 		   th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
-		__u32 *ptr = (__u32 *)(th + 1);
-		if (*ptr == ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+		__be32 *ptr = (__be32 *)(th + 1);
+		if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
 				  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
 			tp->rx_opt.saw_tstamp = 1;
 			++ptr;
@@ -3486,7 +3557,8 @@ void tcp_cwnd_application_limited(struct sock *sk)
 	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
 	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
 		/* Limited by application or receiver window. */
-		u32 win_used = max(tp->snd_cwnd_used, 2U);
+		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+		u32 win_used = max(tp->snd_cwnd_used, init_win);
 		if (win_used < tp->snd_cwnd) {
 			tp->snd_ssthresh = tcp_current_ssthresh(sk);
 			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
@@ -3719,9 +3791,9 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 	return err;
 }

-static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
+static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
 {
-	int result;
+	__sum16 result;

 	if (sock_owned_by_user(sk)) {
 		local_bh_enable();
@@ -3739,6 +3811,50 @@ static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *sk
 		__tcp_checksum_complete_user(sk, skb);
 }

+#ifdef CONFIG_NET_DMA
+static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int chunk = skb->len - hlen;
+	int dma_cookie;
+	int copied_early = 0;
+
+	if (tp->ucopy.wakeup)
+		return 0;
+
+	if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+		tp->ucopy.dma_chan = get_softnet_dma();
+
+	if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) {
+
+		dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
+			skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list);
+
+		if (dma_cookie < 0)
+			goto out;
+
+		tp->ucopy.dma_cookie = dma_cookie;
+		copied_early = 1;
+
+		tp->ucopy.len -= chunk;
+		tp->copied_seq += chunk;
+		tcp_rcv_space_adjust(sk);
+
+		if ((tp->ucopy.len == 0) ||
+		    (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) ||
+		    (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
+			tp->ucopy.wakeup = 1;
+			sk->sk_data_ready(sk, 0);
+		}
+	} else if (chunk > 0) {
+		tp->ucopy.wakeup = 1;
+		sk->sk_data_ready(sk, 0);
+	}
+out:
+	return copied_early;
+}
+#endif /* CONFIG_NET_DMA */
+
 /*
  *	TCP receive function for the ESTABLISHED state.
  *
@@ -3804,10 +3920,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,

 		/* Check timestamp */
 		if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
-			__u32 *ptr = (__u32 *)(th + 1);
+			__be32 *ptr = (__be32 *)(th + 1);

 			/* No? Slow path! */
-			if (*ptr != ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+			if (*ptr != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
 					  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
 				goto slow_path;

@@ -3840,8 +3956,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			    tp->rcv_nxt == tp->rcv_wup)
 				tcp_store_ts_recent(tp);

-			tcp_rcv_rtt_measure_ts(sk, skb);
-
 			/* We know that such packets are checksummed
 			 * on entry.
 			 */
@@ -3855,14 +3969,23 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			}
 		} else {
 			int eaten = 0;
+			int copied_early = 0;

-			if (tp->ucopy.task == current &&
-			    tp->copied_seq == tp->rcv_nxt &&
-			    len - tcp_header_len <= tp->ucopy.len &&
-			    sock_owned_by_user(sk)) {
-				__set_current_state(TASK_RUNNING);
+			if (tp->copied_seq == tp->rcv_nxt &&
+			    len - tcp_header_len <= tp->ucopy.len) {
+#ifdef CONFIG_NET_DMA
+				if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
+					copied_early = 1;
+					eaten = 1;
+				}
+#endif
+				if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) {
+					__set_current_state(TASK_RUNNING);

-				if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
+					if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
+						eaten = 1;
+				}
+				if (eaten) {
 					/* Predicted packet is in window by definition.
 					 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
 					 * Hence, check seq<=rcv_wup reduces to:
@@ -3878,8 +4001,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 					__skb_pull(skb, tcp_header_len);
 					tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 					NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
-					eaten = 1;
 				}
+				if (copied_early)
+					tcp_cleanup_rbuf(sk, skb->len);
 			}
 			if (!eaten) {
 				if (tcp_checksum_complete_user(sk, skb))
@@ -3920,6 +4044,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				__tcp_ack_snd_check(sk, 0);

 no_ack:
+#ifdef CONFIG_NET_DMA
+			if (copied_early)
+				__skb_queue_tail(&sk->sk_async_wait_queue, skb);
+			else
+#endif
 			if (eaten)
 				__kfree_skb(skb);
 			else
@@ -4065,8 +4194,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 */

 		TCP_ECN_rcv_synack(tp, th);
-		if (tp->ecn_flags&TCP_ECN_OK)
-			sock_set_flag(sk, SOCK_NO_LARGESEND);

 		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
 		tcp_ack(sk, skb, FLAG_SLOWPATH);
@@ -4101,6 +4228,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
 			tp->rx_opt.sack_ok |= 2;

+		tcp_mtup_init(sk);
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);

@@ -4108,9 +4236,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 * Change state from SYN-SENT only after copied_seq
 		 * is initialized. */
 		tp->copied_seq = tp->rcv_nxt;
-		mb();
+		smp_mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);

+		security_inet_conn_established(sk, skb);
+
 		/* Make sure socket is routed, for correct metrics.  */
 		icsk->icsk_af_ops->rebuild_header(sk);

@@ -4208,9 +4338,8 @@ discard:
 		tp->max_window = tp->snd_wnd;

 		TCP_ECN_rcv_syn(tp, th);
-		if (tp->ecn_flags&TCP_ECN_OK)
-			sock_set_flag(sk, SOCK_NO_LARGESEND);

+		tcp_mtup_init(sk);
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);

@@ -4292,9 +4421,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			 * But, this leaves one open to an easy denial of
 			 * service attack, and SYN cookies can't defend
 			 * against this problem. So, we drop the data
-			 * in the interest of security over speed.
+			 * in the interest of security over speed unless
+			 * it's still in use.
 			 */
-			goto discard;
+			kfree_skb(skb);
+			return 0;
 		}

 		goto discard;
@@ -4355,7 +4486,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	case TCP_SYN_RECV:
 		if (acceptable) {
 			tp->copied_seq = tp->rcv_nxt;
-			mb();
+			smp_mb();
 			tcp_set_state(sk, TCP_ESTABLISHED);
 			sk->sk_state_change(sk);

@@ -4399,6 +4530,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			 */
 			tp->lsndtime = tcp_time_stamp;

+			tcp_mtup_init(sk);
 			tcp_initialize_rcv_mss(sk);
 			tcp_init_buffer_space(sk);
 			tcp_fast_path_on(tp);
@@ -4510,7 +4642,6 @@ discard:

 EXPORT_SYMBOL(sysctl_tcp_ecn);
 EXPORT_SYMBOL(sysctl_tcp_reordering);
-EXPORT_SYMBOL(sysctl_tcp_abc);
 EXPORT_SYMBOL(tcp_parse_options);
 EXPORT_SYMBOL(tcp_rcv_established);
 EXPORT_SYMBOL(tcp_rcv_state_process);