patch-2_6_7-vs1_9_1_12
[linux-2.6.git] net/ipv4/tcp_input.c
index 1890be6..c0a0b65 100644
@@ -90,6 +90,8 @@ int sysctl_tcp_nometrics_save;
 int sysctl_tcp_westwood;
 int sysctl_tcp_vegas_cong_avoid;
 
+int sysctl_tcp_moderate_rcvbuf;
+
 /* Default values of the Vegas variables, in fixed-point representation
  * with V_PARAM_SHIFT bits to the right of the binary point.
  */
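
The hunk above only declares the new knob, defaulting to 0. Its userspace wiring is not part of this file; presumably net/ipv4/sysctl_net_ipv4.c gains an entry along the lines of the sketch below, exposing the flag as /proc/sys/net/ipv4/tcp_moderate_rcvbuf. The array name and the NET_TCP_MODERATE_RCVBUF id are assumptions here, not shown by this diff.

/* Sketch only, not part of this hunk: plausible sysctl wiring for the
 * flag declared above, assuming a NET_TCP_MODERATE_RCVBUF id exists in
 * <linux/sysctl.h>. */
#include <linux/sysctl.h>

extern int sysctl_tcp_moderate_rcvbuf;

static struct ctl_table tcp_moderate_rcvbuf_table[] = {
	{
		.ctl_name	= NET_TCP_MODERATE_RCVBUF,
		.procname	= "tcp_moderate_rcvbuf",
		.data		= &sysctl_tcp_moderate_rcvbuf,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};
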
@@ -305,6 +307,8 @@ static void tcp_init_buffer_space(struct sock *sk)
        if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
                tcp_fixup_sndbuf(sk);
 
+       tp->rcvq_space.space = tp->rcv_wnd;
+
        maxwin = tcp_full_space(sk);
 
        if (tp->window_clamp >= maxwin) {
@@ -364,6 +368,130 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_opt *tp)
        }
 }
 
+/* Receiver "autotuning" code.
+ *
+ * The algorithm for RTT estimation w/o timestamps is based on
+ * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
+ * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
+ *
+ * More detail on this code can be found at
+ * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
+ * though this reference is out of date.  A new paper
+ * is pending.
+ */
+static void tcp_rcv_rtt_update(struct tcp_opt *tp, u32 sample, int win_dep)
+{
+       u32 new_sample = tp->rcv_rtt_est.rtt;
+       long m = sample;
+
+       if (m == 0)
+               m = 1;
+
+       if (new_sample != 0) {
+               /* If we took larger samples in the non-timestamp
+                * case, we could grossly overestimate the RTT,
+                * especially with chatty applications or bulk
+                * transfer apps which are stalled on filesystem I/O.
+                *
+                * Also, since we are only going for a minimum in the
+                * non-timestamp case, we do not smooth things out;
+                * otherwise, with timestamps disabled, convergence
+                * would take too long.
+                */
+               if (!win_dep) {
+                       m -= (new_sample >> 3);
+                       new_sample += m;
+               } else if (m < new_sample)
+                       new_sample = m << 3;
+       } else {
+               /* No previous measure. */
+               new_sample = m << 3;
+       }
+
+       if (tp->rcv_rtt_est.rtt != new_sample)
+               tp->rcv_rtt_est.rtt = new_sample;
+}
+
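
tp->rcv_rtt_est.rtt is kept in the sender's srtt-style fixed point: the stored value is eight times the RTT. With timestamps (win_dep == 0) each sample feeds an EWMA with gain 1/8; without timestamps (win_dep == 1) the code only keeps a minimum, since timing a whole receive window can overshoot the true RTT but should not undershoot it. A minimal user-space model of the timestamp path, with hypothetical names, just to make the fixed-point arithmetic concrete:

/* Hypothetical stand-alone model of the win_dep == 0 branch above:
 * rtt_scaled holds 8 * RTT and is updated with gain 1/8, i.e.
 * rtt += (sample - rtt) / 8 in unscaled terms. */
#include <stdio.h>

static long rtt_scaled;			/* 8 * smoothed RTT */

static void rcv_rtt_ewma(long sample)
{
	long m = sample ? sample : 1;	/* never feed a zero sample */

	if (rtt_scaled == 0) {
		rtt_scaled = m << 3;	/* first sample seeds the estimate */
		return;
	}
	m -= rtt_scaled >> 3;		/* error against the current estimate */
	rtt_scaled += m;
}

int main(void)
{
	long samples[] = { 40, 48, 40, 44 };
	int i;

	for (i = 0; i < 4; i++)
		rcv_rtt_ewma(samples[i]);
	printf("smoothed rtt ~= %ld\n", rtt_scaled >> 3);
	return 0;
}
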
+static inline void tcp_rcv_rtt_measure(struct tcp_opt *tp)
+{
+       if (tp->rcv_rtt_est.time == 0)
+               goto new_measure;
+       if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
+               return;
+       tcp_rcv_rtt_update(tp,
+                          jiffies - tp->rcv_rtt_est.time,
+                          1);
+
+new_measure:
+       tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
+       tp->rcv_rtt_est.time = tcp_time_stamp;
+}
+
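
Without timestamps there is no echoed clock, so tcp_rcv_rtt_measure() uses the DRS trigger: remember rcv_nxt + rcv_wnd and the current time, and take a sample once that much sequence space has actually arrived. When the sender keeps the window full, that elapsed time is roughly one RTT; when the flow stalls it is longer, which is why only the minimum is kept (win_dep == 1 above). A compilable stand-alone model of the trigger, with hypothetical names and a plain unsigned compare where the kernel uses the wrap-safe before():

/* Hypothetical model of the windowed measurement: returns 1 and stores
 * an RTT sample once a full receive window has been consumed, then
 * re-arms one window ahead. */
static unsigned int est_seq;	/* sequence mark one window ahead */
static unsigned int est_time;	/* time at which the mark was armed */

int rcv_rtt_sample(unsigned int rcv_nxt, unsigned int rcv_wnd,
		   unsigned int now, unsigned int *sample)
{
	int got = 0;

	if (est_time != 0) {
		if (rcv_nxt < est_seq)	/* window not yet consumed */
			return 0;
		*sample = now - est_time;
		got = 1;
	}
	est_seq = rcv_nxt + rcv_wnd;	/* re-arm one window ahead */
	est_time = now;
	return got;
}
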
+static inline void tcp_rcv_rtt_measure_ts(struct tcp_opt *tp, struct sk_buff *skb)
+{
+       if (tp->rcv_tsecr &&
+           (TCP_SKB_CB(skb)->end_seq -
+            TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss))
+               tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_tsecr, 0);
+}
+
+/*
+ * This function should be called every time data is copied to user space.
+ * It calculates the appropriate TCP receive buffer space.
+ */
+void tcp_rcv_space_adjust(struct sock *sk)
+{
+       struct tcp_opt *tp = tcp_sk(sk);
+       int time;
+       int space;
+
+       if (tp->rcvq_space.time == 0)
+               goto new_measure;
+
+       time = tcp_time_stamp - tp->rcvq_space.time;
+       if (time < (tp->rcv_rtt_est.rtt >> 3) ||
+           tp->rcv_rtt_est.rtt == 0)
+               return;
+
+       space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
+
+       space = max(tp->rcvq_space.space, space);
+
+       if (tp->rcvq_space.space != space) {
+               int rcvmem;
+
+               tp->rcvq_space.space = space;
+
+               if (sysctl_tcp_moderate_rcvbuf) {
+                       int new_clamp = space;
+
+                       /* Receive space grows, normalize in order to
+                        * take into account packet headers and sk_buff
+                        * structure overhead.
+                        */
+                       space /= tp->advmss;
+                       if (!space)
+                               space = 1;
+                       rcvmem = (tp->advmss + MAX_TCP_HEADER +
+                                 16 + sizeof(struct sk_buff));
+                       while (tcp_win_from_space(rcvmem) < tp->advmss)
+                               rcvmem += 128;
+                       space *= rcvmem;
+                       space = min(space, sysctl_tcp_rmem[2]);
+                       if (space > sk->sk_rcvbuf) {
+                               sk->sk_rcvbuf = space;
+
+                               /* Make the window clamp follow along.  */
+                               tp->window_clamp = new_clamp;
+                       }
+               }
+       }
+
+new_measure:
+       tp->rcvq_space.seq = tp->copied_seq;
+       tp->rcvq_space.time = tcp_time_stamp;
+}
+
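
Read in plain terms, tcp_rcv_space_adjust() targets twice the bytes the application consumed in the last measured RTT (leaving room for the window to keep growing), converts that to a count of advmss-sized segments, charges each segment its true memory footprint (advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), padded until tcp_win_from_space() of it covers one advmss), and clamps the result to sysctl_tcp_rmem[2] before raising sk_rcvbuf and window_clamp. A rough user-space sketch of that arithmetic with made-up numbers; the overhead constants are illustrative rather than the kernel's exact values, and the padding loop is omitted:

/* Hypothetical worked example of the sizing done above. */
#include <stdio.h>

int main(void)
{
	int copied_per_rtt = 120000;		/* bytes the app read in one RTT */
	int advmss = 1460;			/* advertised MSS */
	int per_skb = 1460 + 128 + 16 + 256;	/* advmss + headers + skb overhead, roughly */
	int rmem_max = 4 * 1024 * 1024;		/* stands in for sysctl_tcp_rmem[2] */
	int space, segs;

	space = 2 * copied_per_rtt;		/* headroom for another RTT of growth */
	segs = space / advmss;			/* normalize to full-sized segments */
	if (!segs)
		segs = 1;

	space = segs * per_skb;			/* charge true per-segment memory */
	if (space > rmem_max)
		space = rmem_max;

	printf("suggested sk_rcvbuf: %d bytes (%d segments)\n", space, segs);
	return 0;
}
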
 /* There is something which you must keep in mind when you analyze the
  * behavior of the tp->ato delayed ack timeout interval.  When a
  * connection starts up, we want to ack as quickly as possible.  The
@@ -382,6 +510,8 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_opt *tp, struct sk_b
 
        tcp_measure_rcv_mss(tp, skb);
 
+       tcp_rcv_rtt_measure(tp);
+
        now = tcp_time_stamp;
 
        if (!tp->ack.ato) {
@@ -3318,6 +3448,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
                                tp->ucopy.len -= chunk;
                                tp->copied_seq += chunk;
                                eaten = (chunk == skb->len && !th->fin);
+                               tcp_rcv_space_adjust(sk);
                        }
                        local_bh_disable();
                }
@@ -3918,6 +4049,7 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
        if (!err) {
                tp->ucopy.len -= chunk;
                tp->copied_seq += chunk;
+               tcp_rcv_space_adjust(sk);
        }
 
        local_bh_disable();
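
With the calls added above in tcp_data_queue() and tcp_copy_to_iovec() (and presumably a matching call in the tcp_recvmsg() path of net/ipv4/tcp.c, which this file does not show), the buffer is re-evaluated whenever data is copied to userspace. The effect is visible from an application: with the moderate_rcvbuf sysctl enabled, the value getsockopt() reports for SO_RCVBUF grows over a long bulk transfer instead of staying at its initial size. A small hedged probe, assuming an already-connected TCP socket descriptor:

/* Hypothetical user-space probe: print the kernel's current receive
 * buffer for a connected socket.  Called periodically during a bulk
 * transfer, it shows sk_rcvbuf being raised by tcp_rcv_space_adjust(). */
#include <stdio.h>
#include <sys/socket.h>

void print_rcvbuf(int fd)
{
	int val = 0;
	socklen_t len = sizeof(val);

	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len) == 0)
		printf("SO_RCVBUF = %d bytes\n", val);
}
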
@@ -4045,6 +4177,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
                                    tp->rcv_nxt == tp->rcv_wup)
                                        tcp_store_ts_recent(tp);
+
+                               tcp_rcv_rtt_measure_ts(tp, skb);
+
                                /* We know that such packets are checksummed
                                 * on entry.
                                 */
@@ -4076,6 +4211,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                            tp->rcv_nxt == tp->rcv_wup)
                                                tcp_store_ts_recent(tp);
 
+                                       tcp_rcv_rtt_measure_ts(tp, skb);
+
                                        __skb_pull(skb, tcp_header_len);
                                        tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                                        NET_INC_STATS_BH(TCPHPHitsToUser);
@@ -4095,6 +4232,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                    tp->rcv_nxt == tp->rcv_wup)
                                        tcp_store_ts_recent(tp);
 
+                               tcp_rcv_rtt_measure_ts(tp, skb);
+
                                if ((int)skb->truesize > sk->sk_forward_alloc)
                                        goto step5;
 
@@ -4191,6 +4330,8 @@ step5:
        if(th->ack)
                tcp_ack(sk, skb, FLAG_SLOWPATH);
 
+       tcp_rcv_rtt_measure_ts(tp, skb);
+
        /* Process urgent data. */
        tcp_urg(sk, skb, th);