/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */
#include <linux/module.h>
#include <net/tcp.h>
int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
int sysctl_tcp_orphan_retries;
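/*
 * The defaults above come from include/net/tcp.h (for reference, roughly:
 * 5 SYN retries, 5 SYN-ACK retries, a 2 hour keepalive idle time, a 75 second
 * keepalive interval, 9 keepalive probes, retries1 = 3 and retries2 = 15).
 * At run time they are normally tuned through the matching
 * /proc/sys/net/ipv4/tcp_* sysctl entries rather than edited here.
 */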
static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer(unsigned long data);

const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
/*
 * Using different timers for retransmit, delayed acks and probes.
 * We may wish to use just one timer, maintaining a list of expiry jiffies.
 */
void tcp_init_xmit_timers(struct sock *sk)
    struct tcp_opt *tp = tcp_sk(sk);

    init_timer(&tp->retransmit_timer);
    tp->retransmit_timer.function = &tcp_write_timer;
    tp->retransmit_timer.data = (unsigned long)sk;

    init_timer(&tp->delack_timer);
    tp->delack_timer.function = &tcp_delack_timer;
    tp->delack_timer.data = (unsigned long)sk;

    init_timer(&sk->sk_timer);
    sk->sk_timer.function = &tcp_keepalive_timer;
    sk->sk_timer.data = (unsigned long)sk;
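/*
 * Summary of the three per-socket timers set up above: retransmit_timer runs
 * tcp_write_timer() (which dispatches to the retransmit or zero-window-probe
 * handler via tp->pending), delack_timer runs tcp_delack_timer(), and
 * sk_timer runs tcp_keepalive_timer(), which further below also doubles as
 * the SYN-ACK retransmit timer for listening sockets and the FIN_WAIT2
 * timeout.
 */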
void tcp_clear_xmit_timers(struct sock *sk)
    struct tcp_opt *tp = tcp_sk(sk);

    if (timer_pending(&tp->retransmit_timer) &&
        del_timer(&tp->retransmit_timer))
        __sock_put(sk);

    if (timer_pending(&tp->delack_timer) &&
        del_timer(&tp->delack_timer))
        __sock_put(sk);

    if (timer_pending(&sk->sk_timer) && del_timer(&sk->sk_timer))
        __sock_put(sk);
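/*
 * tcp_write_err() gives up on the connection.  Note that
 * "sk->sk_err_soft ? : ETIMEDOUT" uses the GNU "x ?: y" shorthand: report a
 * previously recorded soft error (e.g. from an ICMP message) if there is
 * one, otherwise plain ETIMEDOUT.
 */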
static void tcp_write_err(struct sock *sk)
    sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
    sk->sk_error_report(sk);

    NET_INC_STATS_BH(TCPAbortOnTimeout);
/* Do not allow orphaned sockets to eat all our resources.
 * This is a direct violation of the TCP specs, but it is required
 * to prevent DoS attacks.  It is called when a retransmission timeout
 * or zero probe timeout occurs on an orphaned socket.
 *
 * The criteria are still not confirmed experimentally and may change.
 * We kill the socket if:
 * 1. the number of orphaned sockets exceeds an administratively configured
 *    limit (sysctl_tcp_max_orphans), or
 * 2. we are under strong memory pressure.
 */
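/*
 * A nonzero return below means the socket has been aborted and the calling
 * timer handler must stop touching it; tcp_write_timeout() and
 * tcp_probe_timer() treat a nonzero return as fatal for the connection.
 */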
static int tcp_out_of_resources(struct sock *sk, int do_reset)
    struct tcp_opt *tp = tcp_sk(sk);
    int orphans = atomic_read(&tcp_orphan_count);

    /* If the peer does not open its window for a long time, or did not
     * transmit anything for a long time, penalize it. */
    if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
    /* If some dubious ICMP arrived, penalize even more. */

    if (orphans >= sysctl_tcp_max_orphans ||
        (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
         atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
        printk(KERN_INFO "Out of socket memory\n");

        /* Catch exceptional cases, when the connection requires a reset:
         *	1. the last segment was sent recently, or */
        if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
            /*	2. the window is closed. */
            (!tp->snd_wnd && !tp->packets_out))

        tcp_send_active_reset(sk, GFP_ATOMIC);

        NET_INC_STATS_BH(TCPAbortOnMemory);
/* Calculate the maximal number of retries on an orphaned socket. */
static int tcp_orphan_retries(struct sock *sk, int alive)
    int retries = sysctl_tcp_orphan_retries; /* May be zero. */

    /* We know from an ICMP that something is wrong. */
    if (sk->sk_err_soft && !alive)
        retries = 0;

    /* However, if the socket sent something recently, select some safe
     * number of retries.  8 corresponds to >100 seconds with the minimal
     * RTO of 200 msec. */
    if (retries == 0 && alive)
        retries = 8;
    return retries;
/* A write timeout has occurred.  Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
    struct tcp_opt *tp = tcp_sk(sk);

    if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
        dst_negative_advice(&sk->sk_dst_cache);
        retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;

    if (tp->retransmits >= sysctl_tcp_retries1) {
        /* NOTE: draft-ietf-tcpimpl-pmtud-01.txt requires PMTU black
           hole detection. :-(

           This is the place to implement it.  It is not implemented.
           I do not want to implement it.  It is disgusting.  It does
           not work in any case.  Let me cite the same draft, which
           requires us to implement this:
           "The one security concern raised by this memo is that ICMP black holes
           are often caused by over-zealous security administrators who block
           all ICMP messages.  It is vitally important that those who design and
           deploy security systems understand the impact of strict filtering on
           upper-layer protocols.  The safest web site in the world is worthless
           if most TCP implementations cannot transfer data from it.  It would
           be far nicer to have all of the black holes fixed rather than fixing
           all of the TCP implementations."
         */
            dst_negative_advice(&sk->sk_dst_cache);

        retry_until = sysctl_tcp_retries2;
        if (sock_flag(sk, SOCK_DEAD)) {
            int alive = (tp->rto < TCP_RTO_MAX);

            retry_until = tcp_orphan_retries(sk, alive);
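            /* "alive" here means the exponential backoff has not yet pushed
             * the RTO up to the TCP_RTO_MAX ceiling of 120 seconds; for such
             * orphaned sockets tcp_orphan_retries() may still grant a few
             * more retries before the connection is given up.
             */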
            if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
                return 1;

    if (tp->retransmits >= retry_until) {
        /* Has it gone just too far? */
        tcp_write_err(sk);
static void tcp_delack_timer(unsigned long data)
    struct sock *sk = (struct sock *)data;
    struct tcp_opt *tp = tcp_sk(sk);

    if (sock_owned_by_user(sk)) {
        /* Try again later. */
        NET_INC_STATS_BH(DelayedACKLocked);
        if (!mod_timer(&tp->delack_timer, jiffies + TCP_DELACK_MIN))
            sock_hold(sk);

    if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))

    if (time_after(tp->ack.timeout, jiffies)) {
        if (!mod_timer(&tp->delack_timer, tp->ack.timeout))
            sock_hold(sk);

    tp->ack.pending &= ~TCP_ACK_TIMER;

    if (skb_queue_len(&tp->ucopy.prequeue)) {
        struct sk_buff *skb;

        NET_ADD_STATS_BH(TCPSchedulerFailed,
                         skb_queue_len(&tp->ucopy.prequeue));

        while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
            sk->sk_backlog_rcv(sk, skb);

        tp->ucopy.memory = 0;
    }
    if (tcp_ack_scheduled(tp)) {
        if (!tp->ack.pingpong) {
            /* Delayed ACK missed: inflate ATO. */
            tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
        } else {
            /* Delayed ACK missed: leave pingpong mode and
             * deflate the ATO.
             */
            tp->ack.pingpong = 0;
            tp->ack.ato = TCP_ATO_MIN;
        }
        NET_INC_STATS_BH(DelayedACKs);

    if (tcp_memory_pressure)
static void tcp_probe_timer(struct sock *sk)
    struct tcp_opt *tp = tcp_sk(sk);
    int max_probes;

    if (tp->packets_out || !tp->send_head) {
    /* *WARNING* RFC 1122 forbids this
     *
     * It doesn't AFAIK, because we kill the retransmit timer -AK
     *
     * FIXME: We ought not to do this; Solaris 2.5 actually lists fixing
     * this behaviour as a bug fix. [AC]
     *
     * Let me explain.  probes_out is zeroed by incoming ACKs even if they
     * advertise a zero window.  Hence, the connection is killed only if we
     * received no ACKs for the normal connection timeout.  It is not killed
     * just because the window stays zero for some time; the window may stay
     * zero until armageddon and even later.  We are in full accordance with
     * the RFCs; only the probe timer combines both the retransmission
     * timeout and the probe timeout in one bottle. --ANK
     */
    max_probes = sysctl_tcp_retries2;

    if (sock_flag(sk, SOCK_DEAD)) {
        int alive = ((tp->rto << tp->backoff) < TCP_RTO_MAX);

        max_probes = tcp_orphan_retries(sk, alive);
        if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))

    if (tp->probes_out > max_probes) {
        tcp_write_err(sk);
    } else {
        /* Only send another probe if we didn't close things up. */
        tcp_send_probe0(sk);
    }
/*
 * The TCP retransmit timer.
 */
static void tcp_retransmit_timer(struct sock *sk)
    struct tcp_opt *tp = tcp_sk(sk);

    if (tp->packets_out == 0)

    BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));

    if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
        !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
        /* Receiver dastardly shrinks window.  Our retransmits
         * become zero probes, but we should not time out this
         * connection.  If the socket is an orphan, time it out;
         * we cannot allow such beasts to hang infinitely.
         */
        if (net_ratelimit()) {
            struct inet_opt *inet = inet_sk(sk);
            printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
                   NIPQUAD(inet->daddr), htons(inet->dport),
                   inet->num, tp->snd_una, tp->snd_nxt);
        }
        if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {

        tcp_enter_loss(sk, 0);
        tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
        goto out_reset_timer;
    if (tcp_write_timeout(sk))

    if (tp->retransmits == 0) {
        if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
            if (tp->sack_ok) {
                if (tp->ca_state == TCP_CA_Recovery)
                    NET_INC_STATS_BH(TCPSackRecoveryFail);
                else
                    NET_INC_STATS_BH(TCPSackFailures);
            } else {
                if (tp->ca_state == TCP_CA_Recovery)
                    NET_INC_STATS_BH(TCPRenoRecoveryFail);
                else
                    NET_INC_STATS_BH(TCPRenoFailures);
            }
        } else if (tp->ca_state == TCP_CA_Loss) {
            NET_INC_STATS_BH(TCPLossFailures);
        } else {
            NET_INC_STATS_BH(TCPTimeouts);
        }
    }
    if (tcp_use_frto(sk)) {
        tcp_enter_frto(sk);
    } else {
        tcp_enter_loss(sk, 0);
    }

    if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
        /* Retransmission failed because of local congestion;
         * do not back off.
         */
        if (!tp->retransmits)
            tp->retransmits = 1;
        tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
                             min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
    /* Increase the timeout each time we retransmit.  Note that
     * we do not increase the rtt estimate.  rto is initialized
     * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
     * that doubling rto each time is the least we can get away with.
     * In KA9Q, Karn uses this for the first few times, and then
     * goes to quadratic.  NetBSD doubles, but only goes up to *64,
     * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
     * defined in the protocol as the maximum possible RTT.  I guess
     * we'll have to use something other than TCP to talk to the
     * University of Mars.
     *
     * PAWS allows us longer timeouts and large windows, so once
     * implemented ftp to mars will work nicely.  We will have to fix
     * the 120 second clamps though!
     */
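    /* Worked example (values for illustration only): with a current rto of
     * 0.5 s, successive expiries of this timer reschedule the retransmit at
     * roughly 0.5, 1, 2, 4, 8, 16, 32, 64 and then 120 seconds, since
     * tp->rto is doubled on each timeout and clamped at TCP_RTO_MAX (120 s).
     */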
out_reset_timer:
    tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
    tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
    if (tp->retransmits > sysctl_tcp_retries1)
        __sk_dst_reset(sk);
static void tcp_write_timer(unsigned long data)
    struct sock *sk = (struct sock *)data;
    struct tcp_opt *tp = tcp_sk(sk);

    if (sock_owned_by_user(sk)) {
        /* Try again later. */
        if (!mod_timer(&tp->retransmit_timer, jiffies + (HZ/20)))
            sock_hold(sk);

    if (sk->sk_state == TCP_CLOSE || !tp->pending)

    if (time_after(tp->timeout, jiffies)) {
        if (!mod_timer(&tp->retransmit_timer, tp->timeout))
            sock_hold(sk);

    case TCP_TIME_RETRANS:
        tcp_retransmit_timer(sk);
        break;
    case TCP_TIME_PROBE0:
        tcp_probe_timer(sk);
        break;
/*
 * Timer for listening sockets.
 */
static void tcp_synack_timer(struct sock *sk)
    struct tcp_opt *tp = tcp_sk(sk);
    struct tcp_listen_opt *lopt = tp->listen_opt;
    int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
    int thresh = max_retries;
    unsigned long now = jiffies;
    struct open_request **reqp, *req;
    int i, budget;

    if (lopt == NULL || lopt->qlen == 0)
        return;
    /* Normally all the openreqs are young and become mature
     * (i.e. converted to an established socket) within the first timeout.
     * If a SYN-ACK was not acknowledged for 3 seconds, it means one of the
     * following: the SYN-ACK was lost, the ACK was lost, the RTT is high,
     * or nobody planned to ACK (i.e. a synflood).
     * When the server is a bit loaded, the queue is populated with old
     * open requests, reducing the effective size of the queue.
     * When the server is well loaded, the queue size can reduce to zero
     * after several minutes of work.  That is not a synflood, it is
     * normal operation.  The solution is to prune too-old entries,
     * overriding the normal timeout, when the situation becomes dangerous.
     *
     * Essentially, we reserve half of the room for young
     * embryos and abort old ones without pity, if the old
     * ones are about to clog our table.
     */
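    /* Note: lopt->max_qlen_log is the log2 of the maximum queue length, so
     * "qlen >> (max_qlen_log - 1)" below is simply a cheap test for "the
     * SYN backlog is at least half full"; only then do we start tightening
     * the retransmission threshold for old entries.
     */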
    if (lopt->qlen >> (lopt->max_qlen_log - 1)) {
        int young = (lopt->qlen_young << 1);

        if (lopt->qlen < young)

    if (tp->defer_accept)
        max_retries = tp->defer_accept;

    budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
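    /* With the usual constants (TCP_SYNQ_HSIZE = 512 buckets, an interval of
     * HZ/5 and TCP_TIMEOUT_INIT = 3*HZ) this budget works out to about 68
     * hash buckets per run, i.e. the whole SYN table is walked roughly twice
     * per initial SYN-ACK timeout.
     */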
    i = lopt->clock_hand;

    do {
        reqp = &lopt->syn_table[i];
        while ((req = *reqp) != NULL) {
            if (time_after_eq(now, req->expires)) {
                if ((req->retrans < thresh ||
                     (req->acked && req->retrans < max_retries))
                    && !req->class->rtx_syn_ack(sk, req, NULL)) {
                    unsigned long timeo;

                    if (req->retrans++ == 0)
                        lopt->qlen_young--;
                    timeo = min((TCP_TIMEOUT_INIT << req->retrans),
                                TCP_RTO_MAX);
                    req->expires = now + timeo;
                    reqp = &req->dl_next;
                    continue;
                }

                /* Drop this request. */
                write_lock(&tp->syn_wait_lock);
                *reqp = req->dl_next;
                write_unlock(&tp->syn_wait_lock);
                if (req->retrans == 0)
                    lopt->qlen_young--;
                tcp_openreq_free(req);
                continue;
            }
            reqp = &req->dl_next;
        }

        i = (i + 1) & (TCP_SYNQ_HSIZE - 1);

    } while (--budget > 0);

    lopt->clock_hand = i;

    tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
void tcp_delete_keepalive_timer(struct sock *sk)
    if (timer_pending(&sk->sk_timer) && del_timer(&sk->sk_timer))
        __sock_put(sk);

void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len)
    if (!mod_timer(&sk->sk_timer, jiffies + len))
        sock_hold(sk);
void tcp_set_keepalive(struct sock *sk, int val)
    if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
        return;

    if (val && !sock_flag(sk, SOCK_KEEPOPEN))
        tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
    else if (!val)
        tcp_delete_keepalive_timer(sk);
static void tcp_keepalive_timer(unsigned long data)
    struct sock *sk = (struct sock *)data;
    struct tcp_opt *tp = tcp_sk(sk);
    __u32 elapsed;

    /* Only process if the socket is not in use. */
    if (sock_owned_by_user(sk)) {
        /* Try again later. */
        tcp_reset_keepalive_timer(sk, HZ/20);

    if (sk->sk_state == TCP_LISTEN) {
        tcp_synack_timer(sk);

    if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
        if (tp->linger2 >= 0) {
            int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;

            tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);

        tcp_send_active_reset(sk, GFP_ATOMIC);

    if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)

    elapsed = keepalive_time_when(tp);
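    /* keepalive_time_when(tp) resolves to the per-socket tp->keepalive_time
     * (set via the TCP_KEEPIDLE socket option) when present, and otherwise
     * falls back to sysctl_tcp_keepalive_time (2 hours by default).
     */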
    /* It is alive without keepalive 8) */
    if (tp->packets_out || tp->send_head)

    elapsed = tcp_time_stamp - tp->rcv_tstamp;

    if (elapsed >= keepalive_time_when(tp)) {
        if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
            (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
            tcp_send_active_reset(sk, GFP_ATOMIC);

        if (tcp_write_wakeup(sk) <= 0) {
            tp->probes_out++;
            elapsed = keepalive_intvl_when(tp);
            /* If the keepalive probe was lost due to local congestion,
             * retry sooner.
             */
            elapsed = TCP_RESOURCE_PROBE_INTERVAL;

        /* The timer should fire at tp->rcv_tstamp + keepalive_time_when(tp). */
        elapsed = keepalive_time_when(tp) - elapsed;
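    /* Example with the default 2 hour keepalive time: if the connection has
     * been idle for only 30 minutes when this timer fires, "elapsed" above
     * becomes the remaining 90 minutes and the timer below is simply
     * re-armed for that long instead of a probe being sent.
     */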
    tcp_reset_keepalive_timer(sk, elapsed);
EXPORT_SYMBOL(tcp_clear_xmit_timers);
EXPORT_SYMBOL(tcp_delete_keepalive_timer);
EXPORT_SYMBOL(tcp_init_xmit_timers);
EXPORT_SYMBOL(tcp_reset_keepalive_timer);