linux 2.6.16.38 w/ vs2.0.3-rc1
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 39dca51..0d3f31f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -63,6 +63,7 @@
  *             Pasi Sarolahti:         F-RTO for dealing with spurious RTOs
  */
 
+#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
@@ -70,7 +71,6 @@
 #include <net/inet_common.h>
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
-#include <net/netdma.h>
 
 int sysctl_tcp_timestamps = 1;
 int sysctl_tcp_window_scaling = 1;
@@ -89,7 +89,7 @@ int sysctl_tcp_frto;
 int sysctl_tcp_nometrics_save;
 
 int sysctl_tcp_moderate_rcvbuf = 1;
-int sysctl_tcp_abc;
+int sysctl_tcp_abc = 1;
 
 #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
 #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
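
The hunk above enables Appropriate Byte Counting (RFC 3465) by default in this tree. A minimal userspace sketch of the idea, with cwnd, ssthresh, bytes_acked and mss as local stand-ins for the struct tcp_sock fields: cwnd grows once per MSS of newly acknowledged data rather than once per ACK, so delayed or stretch ACKs neither starve nor inflate slow start. (The RFC also caps the per-ACK credit at 2*SMSS; that cap is omitted here for brevity.)

#include <stdio.h>

static unsigned int abc_slow_start(unsigned int cwnd, unsigned int ssthresh,
                                   unsigned int *bytes_acked, unsigned int mss)
{
        /* spend whole-MSS credits accumulated in bytes_acked */
        while (cwnd < ssthresh && *bytes_acked >= mss) {
                *bytes_acked -= mss;
                cwnd++;
        }
        return cwnd;
}

int main(void)
{
        unsigned int bytes_acked = 2920;   /* one stretch ACK covering 2 MSS */
        unsigned int cwnd = abc_slow_start(4, 100, &bytes_acked, 1460);

        printf("cwnd=%u leftover=%u\n", cwnd, bytes_acked);   /* cwnd=6 */
        return 0;
}
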
@@ -1072,7 +1072,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
                                else
                                        pkt_len = (end_seq -
                                                   TCP_SKB_CB(skb)->seq);
-                               if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
+                               if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size))
                                        break;
                                pcount = tcp_skb_pcount(skb);
                        }
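
This hunk retargets the SACK fragmentation call from the later gso_size field to the tso_size field that 2.6.16 uses: it holds the per-segment payload size of a TSO super-packet, and tcp_fragment() needs it when splitting an skb at pkt_len. A hedged sketch of the segment-count arithmetic behind tcp_skb_pcount(); skb_len and seg_size are stand-ins, and the real function reads cached skb state instead of recomputing:

static unsigned int pcount(unsigned int skb_len, unsigned int seg_size)
{
        if (seg_size == 0)
                return 1;               /* plain frame: a single segment */
        return (skb_len + seg_size - 1) / seg_size;   /* TSO: round up */
}
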
@@ -1649,7 +1649,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
         * Hence, we can detect timed out packets during fast
         * retransmit without falling to slow start.
         */
-       if (!IsReno(tp) && tcp_head_timedout(sk, tp)) {
+       if (tcp_head_timedout(sk, tp)) {
                struct sk_buff *skb;
 
                skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
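
2.6.16 also runs the timed-out scan for plain Reno connections, hence the dropped !IsReno() guard. The underlying per-segment test is simple; a hedged restatement of what tcp_head_timedout() checks, with now, sent_when and rto standing in for tcp_time_stamp, the skb's send timestamp and icsk_rto:

static int head_timed_out(unsigned int now, unsigned int sent_when,
                          unsigned int rto)
{
        /* the oldest unacked segment has outlived a full RTO, so it can
         * be marked lost without waiting for the retransmit timer */
        return now - sent_when > rto;
}
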
@@ -1688,26 +1688,17 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
        tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-/* Lower bound on congestion window is slow start threshold
- * unless congestion avoidance choice decides to overide it.
- */
-static inline u32 tcp_cwnd_min(const struct sock *sk)
-{
-       const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
-
-       return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
-}
-
 /* Decrease cwnd each second ack. */
 static void tcp_cwnd_down(struct sock *sk)
 {
+       const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int decr = tp->snd_cwnd_cnt + 1;
 
        tp->snd_cwnd_cnt = decr&1;
        decr >>= 1;
 
-       if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
+       if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
                tp->snd_cwnd -= decr;
 
        tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
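
With the tcp_cwnd_min() wrapper gone, tcp_cwnd_down() calls icsk_ca_ops->min_cwnd(sk) directly; in this era the hook is expected of every congestion-control module, so the NULL fallback to snd_ssthresh is unnecessary. A self-contained model of the rate-halving step above, assuming plain counters in place of the socket fields:

static void cwnd_down(unsigned int *cwnd, unsigned int *cwnd_cnt,
                      unsigned int min_cwnd, unsigned int in_flight)
{
        unsigned int decr = *cwnd_cnt + 1;

        *cwnd_cnt = decr & 1;   /* carry the odd half-step to the next ACK */
        decr >>= 1;             /* net effect: -1 segment per two ACKs */

        if (decr && *cwnd > min_cwnd)
                *cwnd -= decr;

        /* never keep cwnd above what is actually in flight (+1) */
        if (*cwnd > in_flight + 1)
                *cwnd = in_flight + 1;
}
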
@@ -1900,34 +1891,6 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
        }
 }
 
-static void tcp_mtup_probe_failed(struct sock *sk)
-{
-       struct inet_connection_sock *icsk = inet_csk(sk);
-
-       icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
-       icsk->icsk_mtup.probe_size = 0;
-}
-
-static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
-{
-       struct tcp_sock *tp = tcp_sk(sk);
-       struct inet_connection_sock *icsk = inet_csk(sk);
-
-       /* FIXME: breaks with very large cwnd */
-       tp->prior_ssthresh = tcp_current_ssthresh(sk);
-       tp->snd_cwnd = tp->snd_cwnd *
-                      tcp_mss_to_mtu(sk, tp->mss_cache) /
-                      icsk->icsk_mtup.probe_size;
-       tp->snd_cwnd_cnt = 0;
-       tp->snd_cwnd_stamp = tcp_time_stamp;
-       tp->rcv_ssthresh = tcp_current_ssthresh(sk);
-
-       icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
-       icsk->icsk_mtup.probe_size = 0;
-       tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
-}
-
-
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
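
The two helpers removed above (and every icsk_mtup/mtu_probe reference in the later hunks, including the tcp_mtup_init() call sites) belong to packetization-layer path MTU discovery (RFC 4821), which 2.6.16 predates. For reference, the deleted success path kept the congestion window roughly constant in bytes while the effective MTU grew; a hedged rendering of that arithmetic with illustrative values:

static unsigned int rescale_cwnd(unsigned int cwnd_segs, unsigned int cur_mtu,
                                 unsigned int probe_mtu)
{
        /* e.g. 10 segments * 1500 / 3000 -> 5 larger segments: the same
         * byte volume stays in flight after the probe succeeds */
        return cwnd_segs * cur_mtu / probe_mtu;
}
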
@@ -2060,17 +2023,6 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
                        return;
                }
 
-               /* MTU probe failure: don't reduce cwnd */
-               if (icsk->icsk_ca_state < TCP_CA_CWR &&
-                   icsk->icsk_mtup.probe_size &&
-                   tp->snd_una == tp->mtu_probe.probe_seq_start) {
-                       tcp_mtup_probe_failed(sk);
-                       /* Restores the reduction we did in tcp_mtup_probe() */
-                       tp->snd_cwnd++;
-                       tcp_simple_retransmit(sk);
-                       return;
-               }
-
                /* Otherwise enter Recovery state */
 
                if (IsReno(tp))
@@ -2291,13 +2243,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
                        tp->retrans_stamp = 0;
                }
 
-               /* MTU probing checks */
-               if (icsk->icsk_mtup.probe_size) {
-                       if (!after(tp->mtu_probe.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
-                               tcp_mtup_probe_success(sk, skb);
-                       }
-               }
-
                if (sacked) {
                        if (sacked & TCPCB_RETRANS) {
                                if(sacked & TCPCB_SACKED_RETRANS)
@@ -2505,13 +2450,8 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
        if (before(ack, prior_snd_una))
                goto old_ack;
 
-       if (sysctl_tcp_abc) {
-               if (icsk->icsk_ca_state < TCP_CA_CWR)
-                       tp->bytes_acked += ack - prior_snd_una;
-               else if (icsk->icsk_ca_state == TCP_CA_Loss)
-                       /* we assume just one segment left network */
-                       tp->bytes_acked += min(ack - prior_snd_una, tp->mss_cache);
-       }
+       if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR)
+               tp->bytes_acked += ack - prior_snd_una;
 
        if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
                /* Window is constant, pure forward advance.
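
The simplified accounting above also drops the CA_Loss branch with which later kernels cap the ABC byte credit during loss recovery. A hedged one-liner for what that dropped clamp computed; ack_delta stands for (ack - prior_snd_una):

static unsigned int abc_credit(unsigned int ack_delta, unsigned int mss,
                               int in_loss)
{
        /* in loss state, assume just one segment left the network */
        return (in_loss && ack_delta > mss) ? mss : ack_delta;
}
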
@@ -3534,7 +3474,6 @@ static int tcp_prune_queue(struct sock *sk)
        return -1;
 }
 
-EXPORT_SYMBOL_GPL(tcp_cwnd_application_limited);
 
 /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
  * As additional protections, we do not touch cwnd in retransmission phases,
@@ -3547,8 +3486,7 @@ void tcp_cwnd_application_limited(struct sock *sk)
        if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
            sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
                /* Limited by application or receiver window. */
-               u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
-               u32 win_used = max(tp->snd_cwnd_used, init_win);
+               u32 win_used = max(tp->snd_cwnd_used, 2U);
                if (win_used < tp->snd_cwnd) {
                        tp->snd_ssthresh = tcp_current_ssthresh(sk);
                        tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
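
Without tcp_init_cwnd() in this tree, the floor for the window actually used becomes a literal 2 segments. A compact model of the RFC 2861 moderation above, in which cwnd decays halfway toward what the application really consumed:

static void app_limited_moderate(unsigned int *cwnd, unsigned int *ssthresh,
                                 unsigned int cwnd_used,
                                 unsigned int cur_ssthresh)
{
        unsigned int win_used = cwnd_used > 2 ? cwnd_used : 2;

        if (win_used < *cwnd) {
                *ssthresh = cur_ssthresh;
                *cwnd = (*cwnd + win_used) >> 1;   /* halfway down */
        }
}
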
@@ -3801,50 +3739,6 @@ static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *sk
                __tcp_checksum_complete_user(sk, skb);
 }
 
-#ifdef CONFIG_NET_DMA
-static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
-{
-       struct tcp_sock *tp = tcp_sk(sk);
-       int chunk = skb->len - hlen;
-       int dma_cookie;
-       int copied_early = 0;
-
-       if (tp->ucopy.wakeup)
-               return 0;
-
-       if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-               tp->ucopy.dma_chan = get_softnet_dma();
-
-       if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) {
-
-               dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
-                       skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list);
-
-               if (dma_cookie < 0)
-                       goto out;
-
-               tp->ucopy.dma_cookie = dma_cookie;
-               copied_early = 1;
-
-               tp->ucopy.len -= chunk;
-               tp->copied_seq += chunk;
-               tcp_rcv_space_adjust(sk);
-
-               if ((tp->ucopy.len == 0) ||
-                   (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) ||
-                   (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
-                       tp->ucopy.wakeup = 1;
-                       sk->sk_data_ready(sk, 0);
-               }
-       } else if (chunk > 0) {
-               tp->ucopy.wakeup = 1;
-               sk->sk_data_ready(sk, 0);
-       }
-out:
-       return copied_early;
-}
-#endif /* CONFIG_NET_DMA */
-
 /*
  *     TCP receive function for the ESTABLISHED state. 
  *
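
The block deleted above is the NET_DMA (Intel I/OAT) early-copy path, which hands the copy into the user iovec to a DMA engine and completes it asynchronously; 2.6.16 has neither CONFIG_NET_DMA nor sk_async_wait_queue, so the fast path below reverts to the synchronous tcp_copy_to_iovec() route. A hedged condensation of the gate that remains once this patch is applied:

static int can_copy_early(int task_is_current, int sock_owned_by_user,
                          unsigned int copied_seq, unsigned int rcv_nxt,
                          unsigned int payload, unsigned int ucopy_len)
{
        /* copy straight to userspace only when the receiving task itself
         * holds the socket, nothing is pending, and the data fits */
        return task_is_current && sock_owned_by_user &&
               copied_seq == rcv_nxt && payload <= ucopy_len;
}
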
@@ -3946,6 +3840,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                    tp->rcv_nxt == tp->rcv_wup)
                                        tcp_store_ts_recent(tp);
 
+                               tcp_rcv_rtt_measure_ts(sk, skb);
+
                                /* We know that such packets are checksummed
                                 * on entry.
                                 */
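
Restoring tcp_rcv_rtt_measure_ts() on this header-prediction path keeps the receiver sampling round-trip time from echoed timestamps, which feeds receive-buffer autotuning. A deliberately bare sketch of such a sample; now and tsecr stand in for tcp_time_stamp and the peer's echoed value, and the real code filters raw samples rather than using them directly:

static unsigned int rcv_rtt_sample(unsigned int now, unsigned int tsecr)
{
        return now - tsecr;     /* jiffies-resolution RTT estimate */
}
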
@@ -3959,23 +3855,14 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                        }
                } else {
                        int eaten = 0;
-                       int copied_early = 0;
 
-                       if (tp->copied_seq == tp->rcv_nxt &&
-                           len - tcp_header_len <= tp->ucopy.len) {
-#ifdef CONFIG_NET_DMA
-                               if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
-                                       copied_early = 1;
-                                       eaten = 1;
-                               }
-#endif
-                               if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) {
-                                       __set_current_state(TASK_RUNNING);
+                       if (tp->ucopy.task == current &&
+                           tp->copied_seq == tp->rcv_nxt &&
+                           len - tcp_header_len <= tp->ucopy.len &&
+                           sock_owned_by_user(sk)) {
+                               __set_current_state(TASK_RUNNING);
 
-                                       if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
-                                               eaten = 1;
-                               }
-                               if (eaten) {
+                               if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
                                        /* Predicted packet is in window by definition.
                                         * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                                         * Hence, check seq<=rcv_wup reduces to:
@@ -3991,9 +3878,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                        __skb_pull(skb, tcp_header_len);
                                        tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                                        NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
+                                       eaten = 1;
                                }
-                               if (copied_early)
-                                       tcp_cleanup_rbuf(sk, skb->len);
                        }
                        if (!eaten) {
                                if (tcp_checksum_complete_user(sk, skb))
@@ -4034,11 +3920,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 
                        __tcp_ack_snd_check(sk, 0);
 no_ack:
-#ifdef CONFIG_NET_DMA
-                       if (copied_early)
-                               __skb_queue_tail(&sk->sk_async_wait_queue, skb);
-                       else
-#endif
                        if (eaten)
                                __kfree_skb(skb);
                        else
@@ -4184,6 +4065,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                 */
 
                TCP_ECN_rcv_synack(tp, th);
+               if (tp->ecn_flags&TCP_ECN_OK)
+                       sock_set_flag(sk, SOCK_NO_LARGESEND);
 
                tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
                tcp_ack(sk, skb, FLAG_SLOWPATH);
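
Setting SOCK_NO_LARGESEND when ECN is negotiated is the 2.6.16-era workaround for TSO: segmentation offload in this tree cannot manage the ECN/CWR bits on a per-segment basis, so large sends are simply disabled on such connections (the passive-open path below gets the same guard). A trivial hedged restatement, with no_largesend standing in for the socket flag:

#define TCP_ECN_OK      1       /* mirrors the flag tested above */

static void maybe_disable_tso(unsigned int ecn_flags, int *no_largesend)
{
        if (ecn_flags & TCP_ECN_OK)
                *no_largesend = 1;   /* TSO can't do per-segment CWR here */
}
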
@@ -4218,7 +4101,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
                        tp->rx_opt.sack_ok |= 2;
 
-               tcp_mtup_init(sk);
                tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
                tcp_initialize_rcv_mss(sk);
 
@@ -4326,8 +4208,9 @@ discard:
                tp->max_window = tp->snd_wnd;
 
                TCP_ECN_rcv_syn(tp, th);
+               if (tp->ecn_flags&TCP_ECN_OK)
+                       sock_set_flag(sk, SOCK_NO_LARGESEND);
 
-               tcp_mtup_init(sk);
                tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
                tcp_initialize_rcv_mss(sk);
 
@@ -4516,7 +4399,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                                 */
                                tp->lsndtime = tcp_time_stamp;
 
-                               tcp_mtup_init(sk);
                                tcp_initialize_rcv_mss(sk);
                                tcp_init_buffer_space(sk);
                                tcp_fast_path_on(tp);
@@ -4628,6 +4510,7 @@ discard:
 
 EXPORT_SYMBOL(sysctl_tcp_ecn);
 EXPORT_SYMBOL(sysctl_tcp_reordering);
+EXPORT_SYMBOL(sysctl_tcp_abc);
 EXPORT_SYMBOL(tcp_parse_options);
 EXPORT_SYMBOL(tcp_rcv_established);
 EXPORT_SYMBOL(tcp_rcv_state_process);