2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
23 #include <linux/module.h>
/*
 * Tunable TCP timer parameters, exposed via /proc/sys/net/ipv4/*.
 * Each is initialized from the corresponding TCP_* compile-time default.
 */
26 int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
27 int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
28 int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
29 int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
30 int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
31 int sysctl_tcp_retries1 = TCP_RETR1;
32 int sysctl_tcp_retries2 = TCP_RETR2;
/* Zero by default: 0 makes tcp_orphan_retries() pick a heuristic value. */
33 int sysctl_tcp_orphan_retries;
/*
 * Timer callbacks, installed on the socket by tcp_init_xmit_timers().
 * Each receives the struct sock * cast to unsigned long.
 */
35 static void tcp_write_timer(unsigned long);
36 static void tcp_delack_timer(unsigned long);
37 static void tcp_keepalive_timer (unsigned long data);
/* Diagnostic string printed if a timer fires with an unknown type tag. */
39 const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
42 * Using different timers for retransmit, delayed acks and probes
43 * We may wish use just one timer maintaining a list of expire jiffies
/*
 * Wire up the three per-socket TCP timers: retransmit, delayed-ACK,
 * and keepalive (the latter reuses the generic sk->sk_timer slot).
 * Each timer's data argument is the owning socket.
 */
47 void tcp_init_xmit_timers(struct sock *sk)
49 	struct tcp_opt *tp = tcp_sk(sk);
51 	init_timer(&tp->retransmit_timer);
52 	tp->retransmit_timer.function=&tcp_write_timer;
53 	tp->retransmit_timer.data = (unsigned long) sk;
56 	init_timer(&tp->delack_timer);
57 	tp->delack_timer.function=&tcp_delack_timer;
58 	tp->delack_timer.data = (unsigned long) sk;
61 	init_timer(&sk->sk_timer);
62 	sk->sk_timer.function = &tcp_keepalive_timer;
63 	sk->sk_timer.data = (unsigned long)sk;
/*
 * Stop all three TCP timers on a socket (retransmit, delayed-ACK,
 * keepalive).  sk_stop_timer() also drops the timer's socket reference.
 */
66 void tcp_clear_xmit_timers(struct sock *sk)
68 	struct tcp_opt *tp = tcp_sk(sk);
71 	sk_stop_timer(sk, &tp->retransmit_timer);
75 	sk_stop_timer(sk, &tp->delack_timer);
77 	sk_stop_timer(sk, &sk->sk_timer);
/*
 * Kill the connection after a fatal write timeout: report a pending
 * soft error if one was recorded, otherwise ETIMEDOUT, then wake the
 * error reporter and count the abort.
 */
80 static void tcp_write_err(struct sock *sk)
/* Prefer the ICMP-derived soft error over a generic timeout. */
82 	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
83 	sk->sk_error_report(sk);
86 	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
89 /* Do not allow orphaned sockets to eat all our resources.
90 * This is direct violation of TCP specs, but it is required
91 * to prevent DoS attacks. It is called when a retransmission timeout
92 * or zero probe timeout occurs on orphaned socket.
94 * Criterion is still not confirmed experimentally and may change.
95 * We kill the socket, if:
96 * 1. If number of orphaned sockets exceeds an administratively configured
98 * 2. If we have strong memory pressure.
/*
 * Decide whether an orphaned socket must be killed to free resources.
 * Returns nonzero when the socket was destroyed (and, when appropriate,
 * a RST was sent).  @do_reset hints whether a reset should accompany
 * the kill; it is overridden for long-idle peers.
 */
100 static int tcp_out_of_resources(struct sock *sk, int do_reset)
102 	struct tcp_opt *tp = tcp_sk(sk);
103 	int orphans = atomic_read(&tcp_orphan_count);
105 	/* If peer does not open window for long time, or did not transmit
106 	 * anything for long time, penalize it. */
107 	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
110 	/* If some dubious ICMP arrived, penalize even more. */
/* Kill if the orphan limit is exceeded, or under memory pressure while
 * this socket still holds send buffer space.
 */
114 	if (orphans >= sysctl_tcp_max_orphans ||
115 	    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
116 	     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
118 			printk(KERN_INFO "Out of socket memory\n");
120 		/* Catch exceptional cases, when connection requires reset.
121 		 *      1. Last segment was sent recently. */
122 		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
123 		    /*  2. Window is closed. */
124 		    (!tp->snd_wnd && !tp->packets_out))
127 			tcp_send_active_reset(sk, GFP_ATOMIC);
129 		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
135 /* Calculate maximal number of retries on an orphaned socket.
 * @alive: nonzero while the backed-off RTO is still below TCP_RTO_MAX,
 * i.e. the connection may still be viable.
 */
136 static int tcp_orphan_retries(struct sock *sk, int alive)
138 	int retries = sysctl_tcp_orphan_retries; /* May be zero. */
140 	/* We know from an ICMP that something is wrong. */
141 	if (sk->sk_err_soft && !alive)
144 	/* However, if socket sent something recently, select some safe
145 	 * number of retries. 8 corresponds to >100 seconds with minimal
147 	if (retries == 0 && alive)
152 /* A write timeout has occurred. Process the after effects.
 * Returns nonzero when the connection has been aborted; the caller
 * (tcp_retransmit_timer) then stops retransmitting.
 */
153 static int tcp_write_timeout(struct sock *sk)
155 	struct tcp_opt *tp = tcp_sk(sk);
/* Connection-establishment states use the SYN retry limit. */
158 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
160 			dst_negative_advice(&sk->sk_dst_cache);
/* Per-socket TCP_SYNCNT setting overrides the sysctl default. */
161 		retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
163 		if (tp->retransmits >= sysctl_tcp_retries1) {
164 			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
167 			   It is place to make it. It is not made. I do not want
168 			   to make it. It is disgusting. It does not work in any
169 			   case. Let me cite the same draft, which requires for
170 			   us to implement this:
172    "The one security concern raised by this memo is that ICMP black holes
173    are often caused by over-zealous security administrators who block
174    all ICMP messages.  It is vitally important that those who design and
175    deploy security systems understand the impact of strict filtering on
176    upper-layer protocols.  The safest web site in the world is worthless
177    if most TCP implementations cannot transfer data from it.  It would
178    be far nicer to have all of the black holes fixed rather than fixing
179    all of the TCP implementations."
184 			dst_negative_advice(&sk->sk_dst_cache);
187 		retry_until = sysctl_tcp_retries2;
/* Orphaned (closed-by-user) sockets get a tighter retry budget and
 * are subject to the out-of-resources killer.
 */
188 		if (sock_flag(sk, SOCK_DEAD)) {
189 			int alive = (tp->rto < TCP_RTO_MAX);
191 			retry_until = tcp_orphan_retries(sk, alive);
193 			if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
198 	if (tp->retransmits >= retry_until) {
199 		/* Has it gone just too far? */
/*
 * Delayed-ACK timer callback.  If the socket is locked by the user the
 * work is deferred; otherwise any prequeued segments are processed and
 * a pending delayed ACK is sent, adjusting the ACK-timeout estimate.
 */
206 static void tcp_delack_timer(unsigned long data)
208 	struct sock *sk = (struct sock*)data;
209 	struct tcp_opt *tp = tcp_sk(sk);
212 	if (sock_owned_by_user(sk)) {
213 		/* Try again later. */
215 		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
/* Retry shortly; TCP_DELACK_MIN is the minimum delayed-ACK interval. */
216 		sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
220 	sk_stream_mem_reclaim(sk);
/* Nothing to do if the connection is closed or no ACK timer is armed. */
222 	if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
/* Fired early: re-arm for the real deadline. */
225 	if (time_after(tp->ack.timeout, jiffies)) {
226 		sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
229 	tp->ack.pending &= ~TCP_ACK_TIMER;
/* Drain segments the user never picked up from the prequeue; they are
 * fed through the normal backlog receive path instead.
 */
231 	if (skb_queue_len(&tp->ucopy.prequeue)) {
234 		NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED,
235 				 skb_queue_len(&tp->ucopy.prequeue));
237 		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
238 			sk->sk_backlog_rcv(sk, skb);
240 		tp->ucopy.memory = 0;
243 	if (tcp_ack_scheduled(tp)) {
244 		if (!tp->ack.pingpong) {
245 			/* Delayed ACK missed: inflate ATO, capped at the RTO. */
246 			tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
248 			/* Delayed ACK missed: leave pingpong mode and
251 			tp->ack.pingpong = 0;
252 			tp->ack.ato = TCP_ATO_MIN;
255 		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
260 	if (tcp_memory_pressure)
261 		sk_stream_mem_reclaim(sk);
/*
 * Zero-window probe timer.  Sends another window probe unless the
 * probe budget is exhausted, in which case the connection is killed
 * (orphans via the out-of-resources path).
 */
267 static void tcp_probe_timer(struct sock *sk)
269 	struct tcp_opt *tp = tcp_sk(sk);
/* Probing is only meaningful when nothing is in flight but data waits. */
272 	if (tp->packets_out || !sk->sk_send_head) {
277 	/* *WARNING* RFC 1122 forbids this
279 	 * It doesn't AFAIK, because we kill the retransmit timer -AK
281 	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
282 	 * this behaviour in Solaris down as a bug fix. [AC]
284 	 * Let me explain. probes_out is zeroed by incoming ACKs
285 	 * even if they advertise zero window. Hence, connection is killed only
286 	 * if we received no ACKs for normal connection timeout. It is not killed
287 	 * only because window stays zero for some time, window may be zero
288 	 * until armageddon and even later. We are in full accordance
289 	 * with RFCs, only probe timer combines both retransmission timeout
290 	 * and probe timeout in one bottle.				--ANK
292 	max_probes = sysctl_tcp_retries2;
/* Orphans: tighter budget, and may be reaped under resource pressure. */
294 	if (sock_flag(sk, SOCK_DEAD)) {
295 		int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
297 		max_probes = tcp_orphan_retries(sk, alive);
299 		if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
303 	if (tp->probes_out > max_probes) {
306 		/* Only send another probe if we didn't close things up. */
312 * The TCP retransmit timer.
/*
 * Retransmission timer: retransmit the head of the write queue, enter
 * loss recovery, back off the RTO exponentially (capped at TCP_RTO_MAX),
 * and give up via tcp_write_timeout() when the retry budget is spent.
 */
315 static void tcp_retransmit_timer(struct sock *sk)
317 	struct tcp_opt *tp = tcp_sk(sk);
/* Spurious fire: nothing in flight means nothing to retransmit. */
319 	if (tp->packets_out == 0)
322 	BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));
324 	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
325 	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
326 		/* Receiver dastardly shrinks window. Our retransmits
327 		 * become zero probes, but we should not timeout this
328 		 * connection. If the socket is an orphan, time it out,
329 		 * we cannot allow such beasts to hang infinitely.
332 		if (net_ratelimit()) {
333 			struct inet_opt *inet = inet_sk(sk);
334 			printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
335 			       NIPQUAD(inet->daddr), htons(inet->dport),
336 			       inet->num, tp->snd_una, tp->snd_nxt);
/* Window shrunk and no ACK for a full TCP_RTO_MAX: treat as timeout. */
339 		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
343 		tcp_enter_loss(sk, 0);
344 		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
346 		goto out_reset_timer;
/* Nonzero return means the connection was aborted; stop here. */
349 	if (tcp_write_timeout(sk))
/* First timeout of this episode: account which recovery mode failed. */
352 	if (tp->retransmits == 0) {
353 		if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
355 				if (tp->ca_state == TCP_CA_Recovery)
356 					NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
358 					NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
360 				if (tp->ca_state == TCP_CA_Recovery)
361 					NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
363 					NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
365 		} else if (tp->ca_state == TCP_CA_Loss) {
366 			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
368 			NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
372 	if (tcp_use_frto(sk)) {
375 		tcp_enter_loss(sk, 0);
378 	if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
379 		/* Retransmission failed because of local congestion,
382 		if (!tp->retransmits)
/* Re-probe soon without backing off; the drop was local, not the path. */
384 		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
385 				     min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
389 	/* Increase the timeout each time we retransmit.  Note that
390 	 * we do not increase the rtt estimate.  rto is initialized
391 	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
392 	 * that doubling rto each time is the least we can get away with.
393 	 * In KA9Q, Karn uses this for the first few times, and then
394 	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
395 	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
396 	 * defined in the protocol as the maximum possible RTT.  I guess
397 	 * we'll have to use something other than TCP to talk to the
398 	 * University of Mars.
400 	 * PAWS allows us longer timeouts and large windows, so once
401 	 * implemented ftp to mars will work nicely. We will have to fix
402 	 * the 120 second clamps though!
408 	tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
409 	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
410 	if (tp->retransmits > sysctl_tcp_retries1)
/*
 * Shared write-timer callback: dispatches on tp->pending to either the
 * retransmission handler or the zero-window probe handler.  Defers (with
 * a short HZ/20 retry) when the socket is locked by the user.
 */
416 static void tcp_write_timer(unsigned long data)
418 	struct sock *sk = (struct sock*)data;
419 	struct tcp_opt *tp = tcp_sk(sk);
423 	if (sock_owned_by_user(sk)) {
424 		/* Try again later */
425 		sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
429 	if (sk->sk_state == TCP_CLOSE || !tp->pending)
/* Fired before the deadline: re-arm for the real timeout. */
432 	if (time_after(tp->timeout, jiffies)) {
433 		sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
441 	case TCP_TIME_RETRANS:
442 		tcp_retransmit_timer(sk);
444 	case TCP_TIME_PROBE0:
451 	sk_stream_mem_reclaim(sk);
458 * Timer for listening sockets
/*
 * SYN-ACK retransmission timer for listening sockets.  Walks a budgeted
 * slice of the SYN hash table each tick: retransmits SYN-ACKs for young
 * requests, and prunes requests that exceeded their retry threshold
 * (tightened when the queue nears overflow).  Re-arms itself via the
 * keepalive timer at TCP_SYNQ_INTERVAL.
 */
461 static void tcp_synack_timer(struct sock *sk)
463 	struct tcp_opt *tp = tcp_sk(sk);
464 	struct tcp_listen_opt *lopt = tp->listen_opt;
/* TCP_DEFER_ACCEPT reuses syn_retries as its per-socket override. */
465 	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
466 	int thresh = max_retries;
467 	unsigned long now = jiffies;
468 	struct open_request **reqp, *req;
471 	if (lopt == NULL || lopt->qlen == 0)
474 	/* Normally all the openreqs are young and become mature
475 	 * (i.e. converted to established socket) for first timeout.
476 	 * If synack was not acknowledged for 3 seconds, it means
477 	 * one of the following things: synack was lost, ack was lost,
478 	 * rtt is high or nobody planned to ack (i.e. synflood).
479 	 * When server is a bit loaded, queue is populated with old
480 	 * open requests, reducing effective size of queue.
481 	 * When server is well loaded, queue size reduces to zero
482 	 * after several minutes of work. It is not synflood,
483 	 * it is normal operation. The solution is pruning
484 	 * too old entries overriding normal timeout, when
485 	 * situation becomes dangerous.
487 	 * Essentially, we reserve half of room for young
488 	 * embrions; and abort old ones without pity, if old
489 	 * ones are about to clog our table.
/* Queue at least half full: shrink the retry threshold for old entries. */
491 	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
492 #ifdef CONFIG_ACCEPT_QUEUES
495 		for(i=0; i < NUM_ACCEPT_QUEUES; i++)
496 			young += lopt->qlen_young[i];
500 		int young = (lopt->qlen_young<<1);
504 			if (lopt->qlen < young)
511 	if (tp->defer_accept)
512 		max_retries = tp->defer_accept;
/* Visit the whole hash table once per TCP_TIMEOUT_INIT, in slices. */
514 	budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
515 	i = lopt->clock_hand;
518 		reqp=&lopt->syn_table[i];
519 		while ((req = *reqp) != NULL) {
520 			if (time_after_eq(now, req->expires)) {
/* Still within budget (acked requests get the defer-accept budget):
 * retransmit the SYN-ACK and back off its expiry exponentially.
 */
521 				if ((req->retrans < thresh ||
522 				     (req->acked && req->retrans < max_retries))
523 				    && !req->class->rtx_syn_ack(sk, req, NULL)) {
526 					if (req->retrans++ == 0)
527 #ifdef CONFIG_ACCEPT_QUEUES
528 						lopt->qlen_young[req->acceptq_class]--;
532 					timeo = min((TCP_TIMEOUT_INIT << req->retrans), TCP_RTO_MAX);
533 					req->expires = now + timeo;
534 					reqp = &req->dl_next;
538 				/* Drop this request */
539 				write_lock(&tp->syn_wait_lock);
540 				*reqp = req->dl_next;
541 				write_unlock(&tp->syn_wait_lock);
543 				if (req->retrans == 0)
544 #ifdef CONFIG_ACCEPT_QUEUES
545 					lopt->qlen_young[req->acceptq_class]--;
549 				tcp_openreq_free(req);
552 			reqp = &req->dl_next;
555 		i = (i+1)&(TCP_SYNQ_HSIZE-1);
557 	} while (--budget > 0);
/* Remember where we stopped so the next tick resumes from here. */
559 	lopt->clock_hand = i;
562 		tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
/* Cancel the keepalive timer (sk->sk_timer) and drop its socket ref. */
565 void tcp_delete_keepalive_timer (struct sock *sk)
567 	sk_stop_timer(sk, &sk->sk_timer);
/* (Re)arm the keepalive timer to fire @len jiffies from now. */
570 void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
572 	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
/*
 * Handle SO_KEEPALIVE toggling: start the keepalive timer when the
 * option is switched on, stop it when switched off.  No-op in CLOSE
 * and LISTEN states (LISTEN uses sk_timer for the SYN-ACK timer).
 */
575 void tcp_set_keepalive(struct sock *sk, int val)
577 	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
580 	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
581 		tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
583 		tcp_delete_keepalive_timer(sk);
/*
 * Multiplexed sk_timer callback.  Depending on socket state it runs:
 * the listen-socket SYN-ACK timer, the FIN_WAIT2 orphan timeout, or
 * keepalive probing; then re-arms itself with the computed delay.
 */
587 static void tcp_keepalive_timer (unsigned long data)
589 	struct sock *sk = (struct sock *) data;
590 	struct tcp_opt *tp = tcp_sk(sk);
593 	/* Only process if socket is not in use. */
595 	if (sock_owned_by_user(sk)) {
596 		/* Try again later. */
597 		tcp_reset_keepalive_timer (sk, HZ/20);
601 	if (sk->sk_state == TCP_LISTEN) {
602 		tcp_synack_timer(sk);
/* Orphaned FIN_WAIT2: either move to timewait (linger2 >= 0) or reset. */
606 	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
607 		if (tp->linger2 >= 0) {
608 			int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
611 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
615 		tcp_send_active_reset(sk, GFP_ATOMIC);
619 	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
622 	elapsed = keepalive_time_when(tp);
624 	/* It is alive without keepalive 8) */
625 	if (tp->packets_out || sk->sk_send_head)
628 	elapsed = tcp_time_stamp - tp->rcv_tstamp;
630 	if (elapsed >= keepalive_time_when(tp)) {
/* Probe budget exhausted (per-socket setting overrides the sysctl):
 * reset the connection.
 */
631 		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
632 		     (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
633 			tcp_send_active_reset(sk, GFP_ATOMIC);
637 		if (tcp_write_wakeup(sk) <= 0) {
639 			elapsed = keepalive_intvl_when(tp);
641 			/* If keepalive was lost due to local congestion,
644 			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
647 		/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
648 		elapsed = keepalive_time_when(tp) - elapsed;
652 	sk_stream_mem_reclaim(sk);
655 	tcp_reset_keepalive_timer (sk, elapsed);
/* Exported for other in-tree protocol code (e.g. IPv6 TCP). */
666 EXPORT_SYMBOL(tcp_clear_xmit_timers);
667 EXPORT_SYMBOL(tcp_delete_keepalive_timer);
668 EXPORT_SYMBOL(tcp_init_xmit_timers);
669 EXPORT_SYMBOL(tcp_reset_keepalive_timer);