net/ipv4/tcp_timer.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  */
  22
  23 #include <linux/module.h>
  24 #include <net/tcp.h>
  25
  26 int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
  27 int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
  28 int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
  29 int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
  30 int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
  31 int sysctl_tcp_retries1 = TCP_RETR1;
  32 int sysctl_tcp_retries2 = TCP_RETR2;
  33 int sysctl_tcp_orphan_retries;
  34
  35 static void tcp_write_timer(unsigned long);
  36 static void tcp_delack_timer(unsigned long);
  37 static void tcp_keepalive_timer (unsigned long data);
  38
  39 #ifdef TCP_DEBUG
  40 const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
  41 #endif
  42
  43 /*
  44  * Using different timers for retransmit, delayed acks and probes
  45  * We may wish use just one timer maintaining a list of expire jiffies
  46  * to optimize.
  47  */
  48
  49 void tcp_init_xmit_timers(struct sock *sk)
  50 {
  51         struct tcp_opt *tp = tcp_sk(sk);
  52
  53         init_timer(&tp->retransmit_timer);
  54         tp->retransmit_timer.function=&tcp_write_timer;
  55         tp->retransmit_timer.data = (unsigned long) sk;
  56         tp->pending = 0;
  57
  58         init_timer(&tp->delack_timer);
  59         tp->delack_timer.function=&tcp_delack_timer;
  60         tp->delack_timer.data = (unsigned long) sk;
  61         tp->ack.pending = 0;
  62
  63         init_timer(&sk->sk_timer);
  64         sk->sk_timer.function   = &tcp_keepalive_timer;
  65         sk->sk_timer.data       = (unsigned long)sk;
  66 }
  67
  68 void tcp_clear_xmit_timers(struct sock *sk)
  69 {
  70         struct tcp_opt *tp = tcp_sk(sk);
  71
  72         tp->pending = 0;
  73         sk_stop_timer(sk, &tp->retransmit_timer);
  74
  75         tp->ack.pending = 0;
  76         tp->ack.blocked = 0;
  77         sk_stop_timer(sk, &tp->delack_timer);
  78
  79         sk_stop_timer(sk, &sk->sk_timer);
  80 }
  81
  82 static void tcp_write_err(struct sock *sk)
  83 {
  84         sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
  85         sk->sk_error_report(sk);
  86
  87         tcp_done(sk);
  88         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
  89 }
  90
  91 /* Do not allow orphaned sockets to eat all our resources.
  92  * This is direct violation of TCP specs, but it is required
  93  * to prevent DoS attacks. It is called when a retransmission timeout
  94  * or zero probe timeout occurs on orphaned socket.
  95  *
  96  * Criterium is still not confirmed experimentally and may change.
  97  * We kill the socket, if:
  98  * 1. If number of orphaned sockets exceeds an administratively configured
  99  *    limit.
 100  * 2. If we have strong memory pressure.
 101  */
 102 static int tcp_out_of_resources(struct sock *sk, int do_reset)
 103 {
 104         struct tcp_opt *tp = tcp_sk(sk);
 105         int orphans = atomic_read(&tcp_orphan_count);
 106
 107         /* If peer does not open window for long time, or did not transmit
 108          * anything for long time, penalize it. */
 109         if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
 110                 orphans <<= 1;
 111
 112         /* If some dubious ICMP arrived, penalize even more. */
 113         if (sk->sk_err_soft)
 114                 orphans <<= 1;
 115
 116         if (orphans >= sysctl_tcp_max_orphans ||
 117             (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
 118              atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
 119                 if (net_ratelimit())
 120                         printk(KERN_INFO "Out of socket memory\n");
 121
 122                 /* Catch exceptional cases, when connection requires reset.
 123                  *      1. Last segment was sent recently. */
 124                 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
 125                     /*  2. Window is closed. */
 126                     (!tp->snd_wnd && !tcp_get_pcount(&tp->packets_out)))
 127                         do_reset = 1;
 128                 if (do_reset)
 129                         tcp_send_active_reset(sk, GFP_ATOMIC);
 130                 tcp_done(sk);
 131                 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
 132                 return 1;
 133         }
 134         return 0;
 135 }
 136
 137 /* Calculate maximal number or retries on an orphaned socket. */
 138 static int tcp_orphan_retries(struct sock *sk, int alive)
 139 {
 140         int retries = sysctl_tcp_orphan_retries; /* May be zero. */
 141
 142         /* We know from an ICMP that something is wrong. */
 143         if (sk->sk_err_soft && !alive)
 144                 retries = 0;
 145
 146         /* However, if socket sent something recently, select some safe
 147          * number of retries. 8 corresponds to >100 seconds with minimal
 148          * RTO of 200msec. */
 149         if (retries == 0 && alive)
 150                 retries = 8;
 151         return retries;
 152 }
 153
 154 /* A write timeout has occurred. Process the after effects. */
 155 static int tcp_write_timeout(struct sock *sk)
 156 {
 157         struct tcp_opt *tp = tcp_sk(sk);
 158         int retry_until;
 159
 160         if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 161                 if (tp->retransmits)
 162                         dst_negative_advice(&sk->sk_dst_cache);
 163                 retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
 164         } else {
 165                 if (tp->retransmits >= sysctl_tcp_retries1) {
 166                         /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
 167                            hole detection. :-(
 168
 169                            It is place to make it. It is not made. I do not want
 170                            to make it. It is disguisting. It does not work in any
 171                            case. Let me to cite the same draft, which requires for
 172                            us to implement this:
 173
 174    "The one security concern raised by this memo is that ICMP black holes
 175    are often caused by over-zealous security administrators who block
 176    all ICMP messages.  It is vitally important that those who design and
 177    deploy security systems understand the impact of strict filtering on
 178    upper-layer protocols.  The safest web site in the world is worthless
 179    if most TCP implementations cannot transfer data from it.  It would
 180    be far nicer to have all of the black holes fixed rather than fixing
 181    all of the TCP implementations."
 182
 183                            Golden words :-).
 184                    */
 185
 186                         dst_negative_advice(&sk->sk_dst_cache);
 187                 }
 188
 189                 retry_until = sysctl_tcp_retries2;
 190                 if (sock_flag(sk, SOCK_DEAD)) {
 191                         int alive = (tp->rto < TCP_RTO_MAX);
 192
 193                         retry_until = tcp_orphan_retries(sk, alive);
 194
 195                         if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
 196                                 return 1;
 197                 }
 198         }
 199
 200         if (tp->retransmits >= retry_until) {
 201                 /* Has it gone just too far? */
 202                 tcp_write_err(sk);
 203                 return 1;
 204         }
 205         return 0;
 206 }
 207
 208 static void tcp_delack_timer(unsigned long data)
 209 {
 210         struct sock *sk = (struct sock*)data;
 211         struct tcp_opt *tp = tcp_sk(sk);
 212
 213         bh_lock_sock(sk);
 214         if (sock_owned_by_user(sk)) {
 215                 /* Try again later. */
 216                 tp->ack.blocked = 1;
 217                 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
 218                 sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
 219                 goto out_unlock;
 220         }
 221
 222         sk_stream_mem_reclaim(sk);
 223
 224         if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
 225                 goto out;
 226
 227         if (time_after(tp->ack.timeout, jiffies)) {
 228                 sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
 229                 goto out;
 230         }
 231         tp->ack.pending &= ~TCP_ACK_TIMER;
 232
 233         if (skb_queue_len(&tp->ucopy.prequeue)) {
 234                 struct sk_buff *skb;
 235
 236                 NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED,
 237                                  skb_queue_len(&tp->ucopy.prequeue));
 238
 239                 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
 240                         sk->sk_backlog_rcv(sk, skb);
 241
 242                 tp->ucopy.memory = 0;
 243         }
 244
 245         if (tcp_ack_scheduled(tp)) {
 246                 if (!tp->ack.pingpong) {
 247                         /* Delayed ACK missed: inflate ATO. */
 248                         tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
 249                 } else {
 250                         /* Delayed ACK missed: leave pingpong mode and
 251                          * deflate ATO.
 252                          */
 253                         tp->ack.pingpong = 0;
 254                         tp->ack.ato = TCP_ATO_MIN;
 255                 }
 256                 tcp_send_ack(sk);
 257                 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
 258         }
 259         TCP_CHECK_TIMER(sk);
 260
 261 out:
 262         if (tcp_memory_pressure)
 263                 sk_stream_mem_reclaim(sk);
 264 out_unlock:
 265         bh_unlock_sock(sk);
 266         sock_put(sk);
 267 }
 268
 269 static void tcp_probe_timer(struct sock *sk)
 270 {
 271         struct tcp_opt *tp = tcp_sk(sk);
 272         int max_probes;
 273
 274         if (tcp_get_pcount(&tp->packets_out) || !sk->sk_send_head) {
 275                 tp->probes_out = 0;
 276                 return;
 277         }
 278
 279         /* *WARNING* RFC 1122 forbids this
 280          *
 281          * It doesn't AFAIK, because we kill the retransmit timer -AK
 282          *
 283          * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
 284          * this behaviour in Solaris down as a bug fix. [AC]
 285          *
 286          * Let me to explain. probes_out is zeroed by incoming ACKs
 287          * even if they advertise zero window. Hence, connection is killed only
 288          * if we received no ACKs for normal connection timeout. It is not killed
 289          * only because window stays zero for some time, window may be zero
 290          * until armageddon and even later. We are in full accordance
 291          * with RFCs, only probe timer combines both retransmission timeout
 292          * and probe timeout in one bottle.                             --ANK
 293          */
 294         max_probes = sysctl_tcp_retries2;
 295
 296         if (sock_flag(sk, SOCK_DEAD)) {
 297                 int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
 298
 299                 max_probes = tcp_orphan_retries(sk, alive);
 300
 301                 if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
 302                         return;
 303         }
 304
 305         if (tp->probes_out > max_probes) {
 306                 tcp_write_err(sk);
 307         } else {
 308                 /* Only send another probe if we didn't close things up. */
 309                 tcp_send_probe0(sk);
 310         }
 311 }
 312
 313 /*
 314  *      The TCP retransmit timer.
 315  */
 316
 317 static void tcp_retransmit_timer(struct sock *sk)
 318 {
 319         struct tcp_opt *tp = tcp_sk(sk);
 320
 321         if (!tcp_get_pcount(&tp->packets_out))
 322                 goto out;
 323
 324         BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));
 325
 326         if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
 327             !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
 328                 /* Receiver dastardly shrinks window. Our retransmits
 329                  * become zero probes, but we should not timeout this
 330                  * connection. If the socket is an orphan, time it out,
 331                  * we cannot allow such beasts to hang infinitely.
 332                  */
 333 #ifdef TCP_DEBUG
 334                 if (net_ratelimit()) {
 335                         struct inet_opt *inet = inet_sk(sk);
 336                         printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
 337                                NIPQUAD(inet->daddr), htons(inet->dport),
 338                                inet->num, tp->snd_una, tp->snd_nxt);
 339                 }
 340 #endif
 341                 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
 342                         tcp_write_err(sk);
 343                         goto out;
 344                 }
 345                 tcp_enter_loss(sk, 0);
 346                 tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
 347                 __sk_dst_reset(sk);
 348                 goto out_reset_timer;
 349         }
 350
 351         if (tcp_write_timeout(sk))
 352                 goto out;
 353
 354         if (tp->retransmits == 0) {
 355                 if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
 356                         if (tp->sack_ok) {
 357                                 if (tp->ca_state == TCP_CA_Recovery)
 358                                         NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
 359                                 else
 360                                         NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
 361                         } else {
 362                                 if (tp->ca_state == TCP_CA_Recovery)
 363                                         NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
 364                                 else
 365                                         NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
 366                         }
 367                 } else if (tp->ca_state == TCP_CA_Loss) {
 368                         NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
 369                 } else {
 370                         NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
 371                 }
 372         }
 373
 374         if (tcp_use_frto(sk)) {
 375                 tcp_enter_frto(sk);
 376         } else {
 377                 tcp_enter_loss(sk, 0);
 378         }
 379
 380         if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
 381                 /* Retransmission failed because of local congestion,
 382                  * do not backoff.
 383                  */
 384                 if (!tp->retransmits)
 385                         tp->retransmits=1;
 386                 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
 387                                      min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
 388                 goto out;
 389         }
 390
 391         /* Increase the timeout each time we retransmit.  Note that
 392          * we do not increase the rtt estimate.  rto is initialized
 393          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 394          * that doubling rto each time is the least we can get away with.
 395          * In KA9Q, Karn uses this for the first few times, and then
 396          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 397          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 398          * defined in the protocol as the maximum possible RTT.  I guess
 399          * we'll have to use something other than TCP to talk to the
 400          * University of Mars.
 401          *
 402          * PAWS allows us longer timeouts and large windows, so once
 403          * implemented ftp to mars will work nicely. We will have to fix
 404          * the 120 second clamps though!
 405          */
 406         tp->backoff++;
 407         tp->retransmits++;
 408
 409 out_reset_timer:
 410         tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
 411         tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
 412         if (tp->retransmits > sysctl_tcp_retries1)
 413                 __sk_dst_reset(sk);
 414
 415 out:;
 416 }
 417
 418 static void tcp_write_timer(unsigned long data)
 419 {
 420         struct sock *sk = (struct sock*)data;
 421         struct tcp_opt *tp = tcp_sk(sk);
 422         int event;
 423
 424         bh_lock_sock(sk);
 425         if (sock_owned_by_user(sk)) {
 426                 /* Try again later */
 427                 sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
 428                 goto out_unlock;
 429         }
 430
 431         if (sk->sk_state == TCP_CLOSE || !tp->pending)
 432                 goto out;
 433
 434         if (time_after(tp->timeout, jiffies)) {
 435                 sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
 436                 goto out;
 437         }
 438
 439         event = tp->pending;
 440         tp->pending = 0;
 441
 442         switch (event) {
 443         case TCP_TIME_RETRANS:
 444                 tcp_retransmit_timer(sk);
 445                 break;
 446         case TCP_TIME_PROBE0:
 447                 tcp_probe_timer(sk);
 448                 break;
 449         }
 450         TCP_CHECK_TIMER(sk);
 451
 452 out:
 453         sk_stream_mem_reclaim(sk);
 454 out_unlock:
 455         bh_unlock_sock(sk);
 456         sock_put(sk);
 457 }
 458
 459 /*
 460  *      Timer for listening sockets
 461  */
 462
 463 static void tcp_synack_timer(struct sock *sk)
 464 {
 465         struct tcp_opt *tp = tcp_sk(sk);
 466         struct tcp_listen_opt *lopt = tp->listen_opt;
 467         int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
 468         int thresh = max_retries;
 469         unsigned long now = jiffies;
 470         struct open_request **reqp, *req;
 471         int i, budget;
 472
 473         if (lopt == NULL || lopt->qlen == 0)
 474                 return;
 475
 476         /* Normally all the openreqs are young and become mature
 477          * (i.e. converted to established socket) for first timeout.
 478          * If synack was not acknowledged for 3 seconds, it means
 479          * one of the following things: synack was lost, ack was lost,
 480          * rtt is high or nobody planned to ack (i.e. synflood).
 481          * When server is a bit loaded, queue is populated with old
 482          * open requests, reducing effective size of queue.
 483          * When server is well loaded, queue size reduces to zero
 484          * after several minutes of work. It is not synflood,
 485          * it is normal operation. The solution is pruning
 486          * too old entries overriding normal timeout, when
 487          * situation becomes dangerous.
 488          *
 489          * Essentially, we reserve half of room for young
 490          * embrions; and abort old ones without pity, if old
 491          * ones are about to clog our table.
 492          */
 493         if (lopt->qlen>>(lopt->max_qlen_log-1)) {
 494 #ifdef CONFIG_ACCEPT_QUEUES
 495                 int young = 0;
 496
 497                 for(i=0; i < NUM_ACCEPT_QUEUES; i++)
 498                         young += lopt->qlen_young[i];
 499
 500                 young <<= 1;
 501 #else
 502                 int young = (lopt->qlen_young<<1);
 503 #endif
 504
 505                 while (thresh > 2) {
 506                         if (lopt->qlen < young)
 507                                 break;
 508                         thresh--;
 509                         young <<= 1;
 510                 }
 511         }
 512
 513         if (tp->defer_accept)
 514                 max_retries = tp->defer_accept;
 515
 516         budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
 517         i = lopt->clock_hand;
 518
 519         do {
 520                 reqp=&lopt->syn_table[i];
 521                 while ((req = *reqp) != NULL) {
 522                         if (time_after_eq(now, req->expires)) {
 523                                 if ((req->retrans < thresh ||
 524                                      (req->acked && req->retrans < max_retries))
 525                                     && !req->class->rtx_syn_ack(sk, req, NULL)) {
 526                                         unsigned long timeo;
 527
 528                                         if (req->retrans++ == 0)
 529 #ifdef CONFIG_ACCEPT_QUEUES
 530                                                 lopt->qlen_young[req->acceptq_class]--;
 531 #else
 532                                                 lopt->qlen_young--;
 533 #endif
 534                                         timeo = min((TCP_TIMEOUT_INIT << req->retrans), TCP_RTO_MAX);
 535                                         req->expires = now + timeo;
 536                                         reqp = &req->dl_next;
 537                                         continue;
 538                                 }
 539
 540                                 /* Drop this request */
 541                                 write_lock(&tp->syn_wait_lock);
 542                                 *reqp = req->dl_next;
 543                                 write_unlock(&tp->syn_wait_lock);
 544                                 lopt->qlen--;
 545                                 if (req->retrans == 0)
 546 #ifdef CONFIG_ACCEPT_QUEUES
 547                                                 lopt->qlen_young[req->acceptq_class]--;
 548 #else
 549                                         lopt->qlen_young--;
 550 #endif
 551                                 tcp_openreq_free(req);
 552                                 continue;
 553                         }
 554                         reqp = &req->dl_next;
 555                 }
 556
 557                 i = (i+1)&(TCP_SYNQ_HSIZE-1);
 558
 559         } while (--budget > 0);
 560
 561         lopt->clock_hand = i;
 562
 563         if (lopt->qlen)
 564                 tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
 565 }
 566
 567 void tcp_delete_keepalive_timer (struct sock *sk)
 568 {
 569         sk_stop_timer(sk, &sk->sk_timer);
 570 }
 571
 572 void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
 573 {
 574         sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
 575 }
 576
 577 void tcp_set_keepalive(struct sock *sk, int val)
 578 {
 579         if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
 580                 return;
 581
 582         if (val && !sock_flag(sk, SOCK_KEEPOPEN))
 583                 tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
 584         else if (!val)
 585                 tcp_delete_keepalive_timer(sk);
 586 }
 587
 588
 589 static void tcp_keepalive_timer (unsigned long data)
 590 {
 591         struct sock *sk = (struct sock *) data;
 592         struct tcp_opt *tp = tcp_sk(sk);
 593         __u32 elapsed;
 594
 595         /* Only process if socket is not in use. */
 596         bh_lock_sock(sk);
 597         if (sock_owned_by_user(sk)) {
 598                 /* Try again later. */
 599                 tcp_reset_keepalive_timer (sk, HZ/20);
 600                 goto out;
 601         }
 602
 603         if (sk->sk_state == TCP_LISTEN) {
 604                 tcp_synack_timer(sk);
 605                 goto out;
 606         }
 607
 608         if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
 609                 if (tp->linger2 >= 0) {
 610                         int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
 611
 612                         if (tmo > 0) {
 613                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
 614                                 goto out;
 615                         }
 616                 }
 617                 tcp_send_active_reset(sk, GFP_ATOMIC);
 618                 goto death;
 619         }
 620
 621         if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
 622                 goto out;
 623
 624         elapsed = keepalive_time_when(tp);
 625
 626         /* It is alive without keepalive 8) */
 627         if (tcp_get_pcount(&tp->packets_out) || sk->sk_send_head)
 628                 goto resched;
 629
 630         elapsed = tcp_time_stamp - tp->rcv_tstamp;
 631
 632         if (elapsed >= keepalive_time_when(tp)) {
 633                 if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
 634                      (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
 635                         tcp_send_active_reset(sk, GFP_ATOMIC);
 636                         tcp_write_err(sk);
 637                         goto out;
 638                 }
 639                 if (tcp_write_wakeup(sk) <= 0) {
 640                         tp->probes_out++;
 641                         elapsed = keepalive_intvl_when(tp);
 642                 } else {
 643                         /* If keepalive was lost due to local congestion,
 644                          * try harder.
 645                          */
 646                         elapsed = TCP_RESOURCE_PROBE_INTERVAL;
 647                 }
 648         } else {
 649                 /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
 650                 elapsed = keepalive_time_when(tp) - elapsed;
 651         }
 652
 653         TCP_CHECK_TIMER(sk);
 654         sk_stream_mem_reclaim(sk);
 655
 656 resched:
 657         tcp_reset_keepalive_timer (sk, elapsed);
 658         goto out;
 659
 660 death:
 661         tcp_done(sk);
 662
 663 out:
 664         bh_unlock_sock(sk);
 665         sock_put(sk);
 666 }
 667
 668 EXPORT_SYMBOL(tcp_clear_xmit_timers);
 669 EXPORT_SYMBOL(tcp_delete_keepalive_timer);
 670 EXPORT_SYMBOL(tcp_init_xmit_timers);
 671 EXPORT_SYMBOL(tcp_reset_keepalive_timer);
 672 #ifdef TCP_DEBUG
 673 EXPORT_SYMBOL(tcp_timer_bug_msg);
 674 #endif