2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed where wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
57 * Alan Cox : Tidied tcp_data to avoid a potential
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
99 * Matt Dillon : Yet more small nasties remove from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFC's for other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up of retrying without
154 * (even a no space) answer.
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
205 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
206 * lingertime == 0 (RFC 793 ABORT Call)
207 * Hirokazu Takahashi : Use copy_from_user() instead of
208 * csum_and_copy_from_user() if possible.
210 * This program is free software; you can redistribute it and/or
211 * modify it under the terms of the GNU General Public License
212 * as published by the Free Software Foundation; either version
213 * 2 of the License, or(at your option) any later version.
215 * Description of States:
217 * TCP_SYN_SENT sent a connection request, waiting for ack
219 * TCP_SYN_RECV received a connection request, sent ack,
220 * waiting for final ack in three-way handshake.
222 * TCP_ESTABLISHED connection established
224 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
225 * transmission of remaining buffered data
227 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
230 * TCP_CLOSING both sides have shutdown but we still have
231 * data we have to finish sending
233 * TCP_TIME_WAIT timeout to catch resent junk before entering
234 * closed, can only be entered from FIN_WAIT2
235 * or CLOSING. Required because the other end
236 * may not have gotten our last ACK causing it
237 * to retransmit the data packet (which we ignore)
239 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
240 * us to finish writing our data and to shutdown
241 * (we have to close() to move on to LAST_ACK)
243 * TCP_LAST_ACK out side has shutdown after remote has
244 * shutdown. There may still be data in our
245 * buffer that we have to finish sending
247 * TCP_CLOSE socket is finished
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
261 #include <linux/ckrm.h>
264 #include <net/icmp.h>
266 #include <net/xfrm.h>
270 #include <asm/uaccess.h>
271 #include <asm/ioctls.h>
273 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
275 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
277 kmem_cache_t *tcp_openreq_cachep;
278 kmem_cache_t *tcp_bucket_cachep;
279 kmem_cache_t *tcp_timewait_cachep;
281 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
283 int sysctl_tcp_default_win_scale = 7;
284 int sysctl_tcp_mem[3];
285 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
286 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
288 atomic_t tcp_memory_allocated; /* Current allocated memory. */
289 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
291 /* Pressure flag: try to collapse.
292 * Technical note: it is used by multiple contexts non atomically.
293 * All the tcp_mem_schedule() is of this nature: accounting
294 * is strict, actions are advisory and have some latency. */
295 int tcp_memory_pressure;
297 #define TCP_PAGES(amt) (((amt) + TCP_MEM_QUANTUM - 1) / TCP_MEM_QUANTUM)
299 int tcp_mem_schedule(struct sock *sk, int size, int kind)
301 int amt = TCP_PAGES(size);
303 sk->sk_forward_alloc += amt * TCP_MEM_QUANTUM;
304 atomic_add(amt, &tcp_memory_allocated);
307 if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
308 if (tcp_memory_pressure)
309 tcp_memory_pressure = 0;
313 /* Over hard limit. */
314 if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
315 tcp_enter_memory_pressure();
316 goto suppress_allocation;
319 /* Under pressure. */
320 if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
321 tcp_enter_memory_pressure();
324 if (atomic_read(&sk->sk_rmem_alloc) < sysctl_tcp_rmem[0])
326 } else if (sk->sk_wmem_queued < sysctl_tcp_wmem[0])
329 if (!tcp_memory_pressure ||
330 sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) *
331 TCP_PAGES(sk->sk_wmem_queued +
332 atomic_read(&sk->sk_rmem_alloc) +
333 sk->sk_forward_alloc))
339 tcp_moderate_sndbuf(sk);
341 /* Fail only if socket is _under_ its sndbuf.
342 * In this case we cannot block, so that we have to fail.
344 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
348 /* Alas. Undo changes. */
349 sk->sk_forward_alloc -= amt * TCP_MEM_QUANTUM;
350 atomic_sub(amt, &tcp_memory_allocated);
354 void __tcp_mem_reclaim(struct sock *sk)
356 if (sk->sk_forward_alloc >= TCP_MEM_QUANTUM) {
357 atomic_sub(sk->sk_forward_alloc / TCP_MEM_QUANTUM,
358 &tcp_memory_allocated);
359 sk->sk_forward_alloc &= TCP_MEM_QUANTUM - 1;
360 if (tcp_memory_pressure &&
361 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
362 tcp_memory_pressure = 0;
366 void tcp_rfree(struct sk_buff *skb)
368 struct sock *sk = skb->sk;
370 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
371 sk->sk_forward_alloc += skb->truesize;
375 * LISTEN is a special case for poll..
377 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
380 return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
384 * Wait for a TCP event.
386 * Note that we don't need to lock the socket, as the upper poll layers
387 * take care of normal races (between the test and the event) and we don't
388 * go look at any of the socket buffers directly.
390 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
393 struct sock *sk = sock->sk;
394 struct tcp_opt *tp = tcp_sk(sk);
396 poll_wait(file, sk->sk_sleep, wait);
397 if (sk->sk_state == TCP_LISTEN)
398 return tcp_listen_poll(sk, wait);
400 /* Socket is not locked. We are protected from async events
401 by poll logic and correct handling of state changes
402 made by another threads is impossible in any case.
410 * POLLHUP is certainly not done right. But poll() doesn't
411 * have a notion of HUP in just one direction, and for a
412 * socket the read side is more interesting.
414 * Some poll() documentation says that POLLHUP is incompatible
415 * with the POLLOUT/POLLWR flags, so somebody should check this
416 * all. But careful, it tends to be safer to return too many
417 * bits than too few, and you can easily break real applications
418 * if you don't tell them that something has hung up!
422 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
423 * our fs/select.c). It means that after we received EOF,
424 * poll always returns immediately, making impossible poll() on write()
425 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
426 * if and only if shutdown has been made in both directions.
427 * Actually, it is interesting to look how Solaris and DUX
428 * solve this dilemma. I would prefer, if PULLHUP were maskable,
429 * then we could set it on SND_SHUTDOWN. BTW examples given
430 * in Stevens' books assume exactly this behaviour, it explains
431 * why PULLHUP is incompatible with POLLOUT. --ANK
433 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
434 * blocking on fresh not-connected or disconnected socket. --ANK
436 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
438 if (sk->sk_shutdown & RCV_SHUTDOWN)
439 mask |= POLLIN | POLLRDNORM;
442 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
443 /* Potential race condition. If read of tp below will
444 * escape above sk->sk_state, we can be illegally awaken
445 * in SYN_* states. */
446 if ((tp->rcv_nxt != tp->copied_seq) &&
447 (tp->urg_seq != tp->copied_seq ||
448 tp->rcv_nxt != tp->copied_seq + 1 ||
449 sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
450 mask |= POLLIN | POLLRDNORM;
452 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
453 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
454 mask |= POLLOUT | POLLWRNORM;
455 } else { /* send SIGIO later */
456 set_bit(SOCK_ASYNC_NOSPACE,
457 &sk->sk_socket->flags);
458 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
460 /* Race breaker. If space is freed after
461 * wspace test but before the flags are set,
462 * IO signal will be lost.
464 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
465 mask |= POLLOUT | POLLWRNORM;
469 if (tp->urg_data & TCP_URG_VALID)
475 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
477 struct tcp_opt *tp = tcp_sk(sk);
482 if (sk->sk_state == TCP_LISTEN)
486 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
488 else if (sock_flag(sk, SOCK_URGINLINE) ||
490 before(tp->urg_seq, tp->copied_seq) ||
491 !before(tp->urg_seq, tp->rcv_nxt)) {
492 answ = tp->rcv_nxt - tp->copied_seq;
494 /* Subtract 1, if FIN is in queue. */
495 if (answ && !skb_queue_empty(&sk->sk_receive_queue))
497 ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
499 answ = tp->urg_seq - tp->copied_seq;
503 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
506 if (sk->sk_state == TCP_LISTEN)
509 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
512 answ = tp->write_seq - tp->snd_una;
518 return put_user(answ, (int __user *)arg);
522 int tcp_listen_start(struct sock *sk)
524 #ifdef CONFIG_ACCEPT_QUEUES
527 struct inet_opt *inet = inet_sk(sk);
528 struct tcp_opt *tp = tcp_sk(sk);
529 struct tcp_listen_opt *lopt;
531 sk->sk_max_ack_backlog = 0;
532 sk->sk_ack_backlog = 0;
533 #ifdef CONFIG_ACCEPT_QUEUES
534 tp->accept_queue = NULL;
536 tp->accept_queue = tp->accept_queue_tail = NULL;
538 tp->syn_wait_lock = RW_LOCK_UNLOCKED;
541 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
545 memset(lopt, 0, sizeof(struct tcp_listen_opt));
546 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
547 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
549 get_random_bytes(&lopt->hash_rnd, 4);
551 #ifdef CONFIG_ACCEPT_QUEUES
553 for (i=0; i < NUM_ACCEPT_QUEUES; i++) {
554 tp->acceptq[i].aq_tail = NULL;
555 tp->acceptq[i].aq_head = NULL;
556 tp->acceptq[i].aq_wait_time = 0;
557 tp->acceptq[i].aq_qcount = 0;
558 tp->acceptq[i].aq_count = 0;
560 tp->acceptq[i].aq_ratio = 1;
563 tp->acceptq[i].aq_ratio = 0;
568 write_lock_bh(&tp->syn_wait_lock);
569 tp->listen_opt = lopt;
570 write_unlock_bh(&tp->syn_wait_lock);
572 /* There is race window here: we announce ourselves listening,
573 * but this transition is still not validated by get_port().
574 * It is OK, because this socket enters to hash table only
575 * after validation is complete.
577 sk->sk_state = TCP_LISTEN;
578 if (!sk->sk_prot->get_port(sk, inet->num)) {
579 inet->sport = htons(inet->num);
582 sk->sk_prot->hash(sk);
585 ckrm_cb_listen_start(sk);
591 sk->sk_state = TCP_CLOSE;
592 write_lock_bh(&tp->syn_wait_lock);
593 tp->listen_opt = NULL;
594 write_unlock_bh(&tp->syn_wait_lock);
600 * This routine closes sockets which have been at least partially
601 * opened, but not yet accepted.
604 static void tcp_listen_stop (struct sock *sk)
606 struct tcp_opt *tp = tcp_sk(sk);
607 struct tcp_listen_opt *lopt = tp->listen_opt;
608 struct open_request *acc_req = tp->accept_queue;
609 struct open_request *req;
612 tcp_delete_keepalive_timer(sk);
614 /* make all the listen_opt local to us */
615 write_lock_bh(&tp->syn_wait_lock);
616 tp->listen_opt = NULL;
617 write_unlock_bh(&tp->syn_wait_lock);
620 ckrm_cb_listen_stop(sk);
623 #ifdef CONFIG_ACCEPT_QUEUES
624 for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
625 tp->acceptq[i].aq_head = tp->acceptq[i].aq_tail = NULL;
627 tp->accept_queue_tail = NULL;
629 tp->accept_queue = NULL;
632 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
633 while ((req = lopt->syn_table[i]) != NULL) {
634 lopt->syn_table[i] = req->dl_next;
636 tcp_openreq_free(req);
638 /* Following specs, it would be better either to send FIN
639 * (and enter FIN-WAIT-1, it is normal close)
640 * or to send active reset (abort).
641 * Certainly, it is pretty dangerous while synflood, but it is
642 * bad justification for our negligence 8)
643 * To be honest, we are not able to make either
644 * of the variants now. --ANK
649 BUG_TRAP(!lopt->qlen);
653 while ((req = acc_req) != NULL) {
654 struct sock *child = req->sk;
656 acc_req = req->dl_next;
660 BUG_TRAP(!sock_owned_by_user(child));
663 tcp_disconnect(child, O_NONBLOCK);
667 atomic_inc(&tcp_orphan_count);
669 tcp_destroy_sock(child);
671 bh_unlock_sock(child);
675 #ifdef CONFIG_ACCEPT_QUEUES
676 sk_acceptq_removed(sk, req->acceptq_class);
678 sk_acceptq_removed(sk);
680 tcp_openreq_fastfree(req);
682 BUG_TRAP(!sk->sk_ack_backlog);
686 * Wait for a socket to get into the connected state
688 * Note: Must be called with the socket locked.
690 static int wait_for_tcp_connect(struct sock *sk, int flags, long *timeo_p)
692 struct tcp_opt *tp = tcp_sk(sk);
693 struct task_struct *tsk = current;
696 while ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
698 return sock_error(sk);
699 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
703 if (signal_pending(tsk))
704 return sock_intr_errno(*timeo_p);
706 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
710 *timeo_p = schedule_timeout(*timeo_p);
713 finish_wait(sk->sk_sleep, &wait);
719 static inline int tcp_memory_free(struct sock *sk)
721 return sk->sk_wmem_queued < sk->sk_sndbuf;
725 * Wait for more memory for a socket
727 static int wait_for_tcp_memory(struct sock *sk, long *timeo)
729 struct tcp_opt *tp = tcp_sk(sk);
732 long current_timeo = *timeo;
735 if (tcp_memory_free(sk))
736 current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
739 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
741 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
743 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
747 if (signal_pending(current))
749 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
750 if (tcp_memory_free(sk) && !vm_wait)
753 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
756 if (!tcp_memory_free(sk) || vm_wait)
757 current_timeo = schedule_timeout(current_timeo);
762 vm_wait -= current_timeo;
763 current_timeo = *timeo;
764 if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
765 (current_timeo -= vm_wait) < 0)
769 *timeo = current_timeo;
772 finish_wait(sk->sk_sleep, &wait);
782 err = sock_intr_errno(*timeo);
786 static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
790 skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
791 return page == frag->page &&
792 off == frag->page_offset + frag->size;
797 static inline void fill_page_desc(struct sk_buff *skb, int i,
798 struct page *page, int off, int size)
800 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
802 frag->page_offset = off;
804 skb_shinfo(skb)->nr_frags = i + 1;
807 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
809 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
810 tp->pushed_seq = tp->write_seq;
813 static inline int forced_push(struct tcp_opt *tp)
815 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
818 static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
822 TCP_SKB_CB(skb)->seq = tp->write_seq;
823 TCP_SKB_CB(skb)->end_seq = tp->write_seq;
824 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
825 TCP_SKB_CB(skb)->sacked = 0;
826 __skb_queue_tail(&sk->sk_write_queue, skb);
827 sk_charge_skb(sk, skb);
830 else if (tp->nonagle&TCP_NAGLE_PUSH)
831 tp->nonagle &= ~TCP_NAGLE_PUSH;
834 static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
837 if (flags & MSG_OOB) {
839 tp->snd_up = tp->write_seq;
840 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
844 static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
845 int mss_now, int nonagle)
848 struct sk_buff *skb = sk->sk_write_queue.prev;
849 if (!(flags & MSG_MORE) || forced_push(tp))
850 tcp_mark_push(tp, skb);
851 tcp_mark_urg(tp, flags, skb);
852 __tcp_push_pending_frames(sk, tp, mss_now,
853 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
857 static int tcp_error(struct sock *sk, int flags, int err)
860 err = sock_error(sk) ? : -EPIPE;
861 if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
862 send_sig(SIGPIPE, current, 0);
866 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
867 size_t psize, int flags)
869 struct tcp_opt *tp = tcp_sk(sk);
873 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
875 /* Wait for a connection to finish. */
876 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
877 if ((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
880 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
882 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
886 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
890 struct sk_buff *skb = sk->sk_write_queue.prev;
891 struct page *page = pages[poffset / PAGE_SIZE];
893 int offset = poffset % PAGE_SIZE;
894 int size = min_t(size_t, psize, PAGE_SIZE - offset);
896 if (!tp->send_head || (copy = mss_now - skb->len) <= 0) {
898 if (!tcp_memory_free(sk))
899 goto wait_for_sndbuf;
901 skb = tcp_alloc_pskb(sk, 0, tp->mss_cache,
904 goto wait_for_memory;
906 skb_entail(sk, tp, skb);
913 i = skb_shinfo(skb)->nr_frags;
914 if (can_coalesce(skb, i, page, offset)) {
915 skb_shinfo(skb)->frags[i - 1].size += copy;
916 } else if (i < MAX_SKB_FRAGS) {
918 fill_page_desc(skb, i, page, offset, copy);
920 tcp_mark_push(tp, skb);
925 skb->data_len += copy;
926 skb->ip_summed = CHECKSUM_HW;
927 tp->write_seq += copy;
928 TCP_SKB_CB(skb)->end_seq += copy;
931 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
935 if (!(psize -= copy))
938 if (skb->len != mss_now || (flags & MSG_OOB))
941 if (forced_push(tp)) {
942 tcp_mark_push(tp, skb);
943 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
944 } else if (skb == tp->send_head)
945 tcp_push_one(sk, mss_now);
949 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
952 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
954 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
957 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
962 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
969 return tcp_error(sk, flags, err);
972 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
973 size_t size, int flags)
976 struct sock *sk = sock->sk;
978 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
980 if (!(sk->sk_route_caps & NETIF_F_SG) ||
981 !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
982 return sock_no_sendpage(sock, page, offset, size, flags);
984 #undef TCP_ZC_CSUM_FLAGS
988 res = do_tcp_sendpages(sk, &page, offset, size, flags);
994 #define TCP_PAGE(sk) (inet_sk(sk)->sndmsg_page)
995 #define TCP_OFF(sk) (inet_sk(sk)->sndmsg_off)
997 static inline int tcp_copy_to_page(struct sock *sk, char __user *from,
998 struct sk_buff *skb, struct page *page,
1004 if (skb->ip_summed == CHECKSUM_NONE) {
1005 csum = csum_and_copy_from_user(from, page_address(page) + off,
1007 if (err) return err;
1008 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1010 if (copy_from_user(page_address(page) + off, from, copy))
1015 skb->data_len += copy;
1016 skb->truesize += copy;
1017 sk->sk_wmem_queued += copy;
1018 sk->sk_forward_alloc -= copy;
1022 static inline int skb_add_data(struct sk_buff *skb, char __user *from, int copy)
1028 if (skb->ip_summed == CHECKSUM_NONE) {
1029 csum = csum_and_copy_from_user(from, skb_put(skb, copy),
1032 skb->csum = csum_block_add(skb->csum, csum, off);
1036 if (!copy_from_user(skb_put(skb, copy), from, copy))
1040 __skb_trim(skb, off);
1044 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1046 int tmp = tp->mss_cache_std;
1048 if (sk->sk_route_caps & NETIF_F_SG) {
1049 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1051 if (tmp >= pgbreak &&
1052 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1058 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1062 struct tcp_opt *tp = tcp_sk(sk);
1063 struct sk_buff *skb;
1070 TCP_CHECK_TIMER(sk);
1072 flags = msg->msg_flags;
1073 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1075 /* Wait for a connection to finish. */
1076 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1077 if ((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1080 /* This should be in poll */
1081 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1083 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1085 /* Ok commence sending. */
1086 iovlen = msg->msg_iovlen;
1091 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1094 while (--iovlen >= 0) {
1095 int seglen = iov->iov_len;
1096 unsigned char __user *from = iov->iov_base;
1100 while (seglen > 0) {
1103 skb = sk->sk_write_queue.prev;
1105 if (!tp->send_head ||
1106 (copy = mss_now - skb->len) <= 0) {
1109 /* Allocate new segment. If the interface is SG,
1110 * allocate skb fitting to single page.
1112 if (!tcp_memory_free(sk))
1113 goto wait_for_sndbuf;
1115 skb = tcp_alloc_pskb(sk, select_size(sk, tp),
1116 0, sk->sk_allocation);
1118 goto wait_for_memory;
1121 * Check whether we can use HW checksum.
1123 if (sk->sk_route_caps &
1124 (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
1126 skb->ip_summed = CHECKSUM_HW;
1128 skb_entail(sk, tp, skb);
1132 /* Try to append data to the end of skb. */
1136 /* Where to copy to? */
1137 if (skb_tailroom(skb) > 0) {
1138 /* We have some space in skb head. Superb! */
1139 if (copy > skb_tailroom(skb))
1140 copy = skb_tailroom(skb);
1141 if ((err = skb_add_data(skb, from, copy)) != 0)
1145 int i = skb_shinfo(skb)->nr_frags;
1146 struct page *page = TCP_PAGE(sk);
1147 int off = TCP_OFF(sk);
1149 if (can_coalesce(skb, i, page, off) &&
1151 /* We can extend the last page
1154 } else if (i == MAX_SKB_FRAGS ||
1156 !(sk->sk_route_caps & NETIF_F_SG))) {
1157 /* Need to add new fragment and cannot
1158 * do this because interface is non-SG,
1159 * or because all the page slots are
1161 tcp_mark_push(tp, skb);
1164 /* If page is cached, align
1165 * offset to L1 cache boundary
1167 off = (off + L1_CACHE_BYTES - 1) &
1168 ~(L1_CACHE_BYTES - 1);
1169 if (off == PAGE_SIZE) {
1171 TCP_PAGE(sk) = page = NULL;
1176 /* Allocate new cache page. */
1177 if (!(page = tcp_alloc_page(sk)))
1178 goto wait_for_memory;
1182 if (copy > PAGE_SIZE - off)
1183 copy = PAGE_SIZE - off;
1185 /* Time to copy data. We are close to
1187 err = tcp_copy_to_page(sk, from, skb, page,
1190 /* If this page was new, give it to the
1191 * socket so it does not get leaked.
1193 if (!TCP_PAGE(sk)) {
1194 TCP_PAGE(sk) = page;
1200 /* Update the skb. */
1202 skb_shinfo(skb)->frags[i - 1].size +=
1205 fill_page_desc(skb, i, page, off, copy);
1208 } else if (off + copy < PAGE_SIZE) {
1210 TCP_PAGE(sk) = page;
1214 TCP_OFF(sk) = off + copy;
1218 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1220 tp->write_seq += copy;
1221 TCP_SKB_CB(skb)->end_seq += copy;
1225 if ((seglen -= copy) == 0 && iovlen == 0)
1228 if (skb->len != mss_now || (flags & MSG_OOB))
1231 if (forced_push(tp)) {
1232 tcp_mark_push(tp, skb);
1233 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1234 } else if (skb == tp->send_head)
1235 tcp_push_one(sk, mss_now);
1239 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1242 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1244 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1247 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1253 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1254 TCP_CHECK_TIMER(sk);
1260 if (tp->send_head == skb)
1261 tp->send_head = NULL;
1262 __skb_unlink(skb, skb->list);
1263 tcp_free_skb(sk, skb);
1270 err = tcp_error(sk, flags, err);
1271 TCP_CHECK_TIMER(sk);
1277 * Handle reading urgent data. BSD has very simple semantics for
1278 * this, no blocking and very strange errors 8)
1281 static int tcp_recv_urg(struct sock *sk, long timeo,
1282 struct msghdr *msg, int len, int flags,
1285 struct tcp_opt *tp = tcp_sk(sk);
1287 /* No URG data to read. */
1288 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1289 tp->urg_data == TCP_URG_READ)
1290 return -EINVAL; /* Yes this is right ! */
1292 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1295 if (tp->urg_data & TCP_URG_VALID) {
1297 char c = tp->urg_data;
1299 if (!(flags & MSG_PEEK))
1300 tp->urg_data = TCP_URG_READ;
1302 /* Read urgent data. */
1303 msg->msg_flags |= MSG_OOB;
1306 if (!(flags & MSG_TRUNC))
1307 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1310 msg->msg_flags |= MSG_TRUNC;
1312 return err ? -EFAULT : len;
1315 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1318 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1319 * the available implementations agree in this case:
1320 * this call should never block, independent of the
1321 * blocking state of the socket.
1322 * Mike <pall@rz.uni-karlsruhe.de>
1327 /* Clean up the receive buffer for full frames taken by the user,
1328 * then send an ACK if necessary. COPIED is the number of bytes
1329 * tcp_recvmsg has given to the user so far, it speeds up the
1330 * calculation of whether or not we must ACK for the sake of
1333 void cleanup_rbuf(struct sock *sk, int copied)
1335 struct tcp_opt *tp = tcp_sk(sk);
1336 int time_to_ack = 0;
1339 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1341 BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1344 if (tcp_ack_scheduled(tp)) {
1345 /* Delayed ACKs frequently hit locked sockets during bulk
1347 if (tp->ack.blocked ||
1348 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1349 tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1351 * If this read emptied read buffer, we send ACK, if
1352 * connection is not bidirectional, user drained
1353 * receive buffer and there was a small segment
1356 (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1357 !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1361 /* We send an ACK if we can now advertise a non-zero window
1362 * which has been raised "significantly".
1364 * Even if window raised up to infinity, do not send window open ACK
1365 * in states, where we will not receive more. It is useless.
1367 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1368 __u32 rcv_window_now = tcp_receive_window(tp);
1370 /* Optimize, __tcp_select_window() is not cheap. */
1371 if (2*rcv_window_now <= tp->window_clamp) {
1372 __u32 new_window = __tcp_select_window(sk);
1374 /* Send ACK now, if this read freed lots of space
1375 * in our buffer. Certainly, new_window is new window.
1376 * We can advertise it now, if it is not less than current one.
1377 * "Lots" means "at least twice" here.
1379 if (new_window && new_window >= 2 * rcv_window_now)
1387 static void tcp_prequeue_process(struct sock *sk)
1389 struct sk_buff *skb;
1390 struct tcp_opt *tp = tcp_sk(sk);
1392 NET_ADD_STATS_USER(TCPPrequeued, skb_queue_len(&tp->ucopy.prequeue));
1394 /* RX process wants to run with disabled BHs, though it is not
1397 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1398 sk->sk_backlog_rcv(sk, skb);
1401 /* Clear memory counter. */
1402 tp->ucopy.memory = 0;
1405 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1407 struct sk_buff *skb;
1410 skb_queue_walk(&sk->sk_receive_queue, skb) {
1411 offset = seq - TCP_SKB_CB(skb)->seq;
1414 if (offset < skb->len || skb->h.th->fin) {
1423 * This routine provides an alternative to tcp_recvmsg() for routines
1424 * that would like to handle copying from skbuffs directly in 'sendfile'
1427 * - It is assumed that the socket was locked by the caller.
1428 * - The routine does not block.
1429 * - At present, there is no support for reading OOB data
1430 * or for 'peeking' the socket using this routine
1431 * (although both would be easy to implement).
1433 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1434 sk_read_actor_t recv_actor)
1436 struct sk_buff *skb;
1437 struct tcp_opt *tp = tcp_sk(sk);
1438 u32 seq = tp->copied_seq;
1442 if (sk->sk_state == TCP_LISTEN)
1444 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1445 if (offset < skb->len) {
1448 len = skb->len - offset;
1449 /* Stop reading if we hit a patch of urgent data */
1451 u32 urg_offset = tp->urg_seq - seq;
1452 if (urg_offset < len)
1457 used = recv_actor(desc, skb, offset, len);
1463 if (offset != skb->len)
1466 if (skb->h.th->fin) {
1467 sk_eat_skb(sk, skb);
1471 sk_eat_skb(sk, skb);
1475 tp->copied_seq = seq;
1477 tcp_rcv_space_adjust(sk);
1479 /* Clean up data we have read: This will do ACK frames. */
1481 cleanup_rbuf(sk, copied);
1486 * This routine copies from a sock struct into the user buffer.
1488 * Technical note: in 2.3 we work on _locked_ socket, so that
1489 * tricks with *seq access order and skb->users are not required.
1490 * Probably, code can be easily improved even more.
1493 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1494 size_t len, int nonblock, int flags, int *addr_len)
1496 struct tcp_opt *tp = tcp_sk(sk);
1502 int target; /* Read at least this many bytes */
1504 struct task_struct *user_recv = NULL;
1508 TCP_CHECK_TIMER(sk);
1511 if (sk->sk_state == TCP_LISTEN)
1514 timeo = sock_rcvtimeo(sk, nonblock);
1516 /* Urgent data needs to be handled specially. */
1517 if (flags & MSG_OOB)
1520 seq = &tp->copied_seq;
1521 if (flags & MSG_PEEK) {
1522 peek_seq = tp->copied_seq;
1526 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1529 struct sk_buff *skb;
1532 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1533 if (tp->urg_data && tp->urg_seq == *seq) {
1536 if (signal_pending(current)) {
1537 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1542 /* Next get a buffer. */
1544 skb = skb_peek(&sk->sk_receive_queue);
1549 /* Now that we have two receive queues this
1552 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1553 printk(KERN_INFO "recvmsg bug: copied %X "
1554 "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1557 offset = *seq - TCP_SKB_CB(skb)->seq;
1560 if (offset < skb->len)
1564 BUG_TRAP(flags & MSG_PEEK);
1566 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1568 /* Well, if we have backlog, try to process it now yet. */
1570 if (copied >= target && !sk->sk_backlog.tail)
1575 sk->sk_state == TCP_CLOSE ||
1576 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1578 signal_pending(current) ||
1582 if (sock_flag(sk, SOCK_DONE))
1586 copied = sock_error(sk);
1590 if (sk->sk_shutdown & RCV_SHUTDOWN)
1593 if (sk->sk_state == TCP_CLOSE) {
1594 if (!sock_flag(sk, SOCK_DONE)) {
1595 /* This occurs when user tries to read
1596 * from never connected socket.
1609 if (signal_pending(current)) {
1610 copied = sock_intr_errno(timeo);
1615 cleanup_rbuf(sk, copied);
1617 if (tp->ucopy.task == user_recv) {
1618 /* Install new reader */
1619 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1620 user_recv = current;
1621 tp->ucopy.task = user_recv;
1622 tp->ucopy.iov = msg->msg_iov;
1625 tp->ucopy.len = len;
1627 BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1628 (flags & (MSG_PEEK | MSG_TRUNC)));
1630 /* Ugly... If prequeue is not empty, we have to
1631 * process it before releasing socket, otherwise
1632 * order will be broken at second iteration.
1633 * More elegant solution is required!!!
1635 * Look: we have the following (pseudo)queues:
1637 * 1. packets in flight
1642 * Each queue can be processed only if the next ones
1643 * are empty. At this point we have empty receive_queue.
1644 * But prequeue _can_ be not empty after 2nd iteration,
1645 * when we jumped to start of loop because backlog
1646 * processing added something to receive_queue.
1647 * We cannot release_sock(), because backlog contains
1648 * packets arrived _after_ prequeued ones.
1650 * Shortly, algorithm is clear --- to process all
1651 * the queues in order. We could make it more directly,
1652 * requeueing packets from backlog to prequeue, if
1653 * is not empty. It is more elegant, but eats cycles,
1656 if (skb_queue_len(&tp->ucopy.prequeue))
1659 /* __ Set realtime policy in scheduler __ */
1662 if (copied >= target) {
1663 /* Do not sleep, just process backlog. */
1667 sk_wait_data(sk, &timeo);
1672 /* __ Restore normal policy in scheduler __ */
1674 if ((chunk = len - tp->ucopy.len) != 0) {
1675 NET_ADD_STATS_USER(TCPDirectCopyFromBacklog, chunk);
1680 if (tp->rcv_nxt == tp->copied_seq &&
1681 skb_queue_len(&tp->ucopy.prequeue)) {
1683 tcp_prequeue_process(sk);
1685 if ((chunk = len - tp->ucopy.len) != 0) {
1686 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1692 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1693 if (net_ratelimit())
1694 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1695 current->comm, current->pid);
1696 peek_seq = tp->copied_seq;
1701 /* Ok so how much can we use? */
1702 used = skb->len - offset;
1706 /* Do we have urgent data here? */
1708 u32 urg_offset = tp->urg_seq - *seq;
1709 if (urg_offset < used) {
1711 if (!sock_flag(sk, SOCK_URGINLINE)) {
1723 if (!(flags & MSG_TRUNC)) {
1724 err = skb_copy_datagram_iovec(skb, offset,
1725 msg->msg_iov, used);
1727 /* Exception. Bailout! */
1738 tcp_rcv_space_adjust(sk);
1741 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1743 tcp_fast_path_check(sk, tp);
1745 if (used + offset < skb->len)
1750 if (!(flags & MSG_PEEK))
1751 sk_eat_skb(sk, skb);
1755 /* Process the FIN. */
1757 if (!(flags & MSG_PEEK))
1758 sk_eat_skb(sk, skb);
1763 if (skb_queue_len(&tp->ucopy.prequeue)) {
1766 tp->ucopy.len = copied > 0 ? len : 0;
1768 tcp_prequeue_process(sk);
1770 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1771 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1777 tp->ucopy.task = NULL;
1781 /* According to UNIX98, msg_name/msg_namelen are ignored
1782 * on connected socket. I was just happy when found this 8) --ANK
1785 /* Clean up data we have read: This will do ACK frames. */
1786 cleanup_rbuf(sk, copied);
1788 TCP_CHECK_TIMER(sk);
1793 TCP_CHECK_TIMER(sk);
1798 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1803 * State processing on a close. This implements the state shift for
1804 * sending our FIN frame. Note that we only send a FIN for some
1805 * states. A shutdown() may have already sent the FIN, or we may be
1809 static unsigned char new_state[16] = {
1810 /* current state: new state: action: */
1811 /* (Invalid) */ TCP_CLOSE,
1812 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1813 /* TCP_SYN_SENT */ TCP_CLOSE,
1814 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1815 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1816 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1817 /* TCP_TIME_WAIT */ TCP_CLOSE,
1818 /* TCP_CLOSE */ TCP_CLOSE,
1819 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1820 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1821 /* TCP_LISTEN */ TCP_CLOSE,
1822 /* TCP_CLOSING */ TCP_CLOSING,
1825 static int tcp_close_state(struct sock *sk)
1827 int next = (int)new_state[sk->sk_state];
1828 int ns = next & TCP_STATE_MASK;
1830 tcp_set_state(sk, ns);
1832 return next & TCP_ACTION_FIN;
1836 * Shutdown the sending side of a connection. Much like close except
1837 * that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1840 void tcp_shutdown(struct sock *sk, int how)
1842 /* We need to grab some memory, and put together a FIN,
1843 * and then put it into the queue to be sent.
1844 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1846 if (!(how & SEND_SHUTDOWN))
1849 /* If we've already sent a FIN, or it's a closed state, skip this. */
1850 if ((1 << sk->sk_state) &
1851 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1852 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1853 /* Clear out any half completed packets. FIN if needed. */
1854 if (tcp_close_state(sk))
1861 * Return 1 if we still have things to send in our buffers.
1864 static inline int closing(struct sock *sk)
1866 return (1 << sk->sk_state) &
1867 (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
1870 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1872 /* First the read buffer. */
1873 __skb_queue_purge(&sk->sk_receive_queue);
1875 /* Next, the error queue. */
1876 __skb_queue_purge(&sk->sk_error_queue);
1878 /* Next, the write queue. */
1879 BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
1881 /* Account for returned memory. */
1882 tcp_mem_reclaim(sk);
1884 BUG_TRAP(!sk->sk_wmem_queued);
1885 BUG_TRAP(!sk->sk_forward_alloc);
1887 /* It is _impossible_ for the backlog to contain anything
1888 * when we get here. All user references to this socket
1889 * have gone away, only the net layer knows can touch it.
1894 * At this point, there should be no process reference to this
1895 * socket, and thus no user references at all. Therefore we
1896 * can assume the socket waitqueue is inactive and nobody will
1897 * try to jump onto it.
1899 void tcp_destroy_sock(struct sock *sk)
1901 BUG_TRAP(sk->sk_state == TCP_CLOSE);
1902 BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1904 /* It cannot be in hash table! */
1905 BUG_TRAP(sk_unhashed(sk));
1907 /* If it has not 0 inet_sk(sk)->num, it must be bound */
1908 BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1911 if (sk->sk_zapped) {
1912 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1918 sk->sk_prot->destroy(sk);
1920 tcp_kill_sk_queues(sk);
1922 xfrm_sk_free_policy(sk);
1924 #ifdef INET_REFCNT_DEBUG
1925 if (atomic_read(&sk->sk_refcnt) != 1) {
1926 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1927 sk, atomic_read(&sk->sk_refcnt));
1931 atomic_dec(&tcp_orphan_count);
1935 void tcp_close(struct sock *sk, long timeout)
1937 struct sk_buff *skb;
1938 int data_was_unread = 0;
1941 sk->sk_shutdown = SHUTDOWN_MASK;
1943 if (sk->sk_state == TCP_LISTEN) {
1944 tcp_set_state(sk, TCP_CLOSE);
1947 tcp_listen_stop(sk);
1949 goto adjudge_to_death;
1952 /* We need to flush the recv. buffs. We do this only on the
1953 * descriptor close, not protocol-sourced closes, because the
1954 * reader process may not have drained the data yet!
1956 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1957 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1959 data_was_unread += len;
1963 tcp_mem_reclaim(sk);
1965 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1966 * 3.10, we send a RST here because data was lost. To
1967 * witness the awful effects of the old behavior of always
1968 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1969 * a bulk GET in an FTP client, suspend the process, wait
1970 * for the client to advertise a zero window, then kill -9
1971 * the FTP client, wheee... Note: timeout is always zero
1974 if (data_was_unread) {
1975 /* Unread data was tossed, zap the connection. */
1976 NET_INC_STATS_USER(TCPAbortOnClose);
1977 tcp_set_state(sk, TCP_CLOSE);
1978 tcp_send_active_reset(sk, GFP_KERNEL);
1979 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1980 /* Check zero linger _after_ checking for unread data. */
1981 sk->sk_prot->disconnect(sk, 0);
1982 NET_INC_STATS_USER(TCPAbortOnData);
1983 } else if (tcp_close_state(sk)) {
1984 /* We FIN if the application ate all the data before
1985 * zapping the connection.
1988 /* RED-PEN. Formally speaking, we have broken TCP state
1989 * machine. State transitions:
1991 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1992 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1993 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1995 * are legal only when FIN has been sent (i.e. in window),
1996 * rather than queued out of window. Purists blame.
1998 * F.e. "RFC state" is ESTABLISHED,
1999 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
2001 * The visible declinations are that sometimes
2002 * we enter time-wait state, when it is not required really
2003 * (harmless), do not send active resets, when they are
2004 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
2005 * they look as CLOSING or LAST_ACK for Linux)
2006 * Probably, I missed some more holelets.
2013 struct task_struct *tsk = current;
2017 prepare_to_wait(sk->sk_sleep, &wait,
2018 TASK_INTERRUPTIBLE);
2022 timeout = schedule_timeout(timeout);
2024 } while (!signal_pending(tsk) && timeout);
2026 finish_wait(sk->sk_sleep, &wait);
2030 /* It is the last release_sock in its life. It will remove backlog. */
2034 /* Now socket is owned by kernel and we acquire BH lock
2035 to finish close. No need to check for user refs.
2039 BUG_TRAP(!sock_owned_by_user(sk));
2044 /* This is a (useful) BSD violating of the RFC. There is a
2045 * problem with TCP as specified in that the other end could
2046 * keep a socket open forever with no application left this end.
2047 * We use a 3 minute timeout (about the same as BSD) then kill
2048 * our end. If they send after that then tough - BUT: long enough
2049 * that we won't make the old 4*rto = almost no time - whoops
2052 * Nope, it was not mistake. It is really desired behaviour
2053 * f.e. on http servers, when such sockets are useless, but
2054 * consume significant resources. Let's do it with special
2055 * linger2 option. --ANK
2058 if (sk->sk_state == TCP_FIN_WAIT2) {
2059 struct tcp_opt *tp = tcp_sk(sk);
2060 if (tp->linger2 < 0) {
2061 tcp_set_state(sk, TCP_CLOSE);
2062 tcp_send_active_reset(sk, GFP_ATOMIC);
2063 NET_INC_STATS_BH(TCPAbortOnLinger);
2065 int tmo = tcp_fin_time(tp);
2067 if (tmo > TCP_TIMEWAIT_LEN) {
2068 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2070 atomic_inc(&tcp_orphan_count);
2071 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2076 if (sk->sk_state != TCP_CLOSE) {
2077 tcp_mem_reclaim(sk);
2078 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2079 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
2080 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2081 if (net_ratelimit())
2082 printk(KERN_INFO "TCP: too many of orphaned "
2084 tcp_set_state(sk, TCP_CLOSE);
2085 tcp_send_active_reset(sk, GFP_ATOMIC);
2086 NET_INC_STATS_BH(TCPAbortOnMemory);
2089 atomic_inc(&tcp_orphan_count);
2091 if (sk->sk_state == TCP_CLOSE)
2092 tcp_destroy_sock(sk);
2093 /* Otherwise, socket is reprieved until protocol close. */
2101 /* These states need RST on ABORT according to RFC793 */
2103 static inline int tcp_need_reset(int state)
2105 return (1 << state) &
2106 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2107 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2110 int tcp_disconnect(struct sock *sk, int flags)
2112 struct inet_opt *inet = inet_sk(sk);
2113 struct tcp_opt *tp = tcp_sk(sk);
2115 int old_state = sk->sk_state;
2117 if (old_state != TCP_CLOSE)
2118 tcp_set_state(sk, TCP_CLOSE);
2120 /* ABORT function of RFC793 */
2121 if (old_state == TCP_LISTEN) {
2122 tcp_listen_stop(sk);
2123 } else if (tcp_need_reset(old_state) ||
2124 (tp->snd_nxt != tp->write_seq &&
2125 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2126 /* The last check adjusts for discrepance of Linux wrt. RFC
2129 tcp_send_active_reset(sk, gfp_any());
2130 sk->sk_err = ECONNRESET;
2131 } else if (old_state == TCP_SYN_SENT)
2132 sk->sk_err = ECONNRESET;
2134 tcp_clear_xmit_timers(sk);
2135 __skb_queue_purge(&sk->sk_receive_queue);
2136 tcp_writequeue_purge(sk);
2137 __skb_queue_purge(&tp->out_of_order_queue);
2141 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2142 inet_reset_saddr(sk);
2144 sk->sk_shutdown = 0;
2145 sock_reset_flag(sk, SOCK_DONE);
2147 if ((tp->write_seq += tp->max_window + 2) == 0)
2152 tp->packets_out = 0;
2153 tp->snd_ssthresh = 0x7fffffff;
2154 tp->snd_cwnd_cnt = 0;
2155 tcp_set_ca_state(tp, TCP_CA_Open);
2156 tcp_clear_retrans(tp);
2157 tcp_delack_init(tp);
2158 tp->send_head = NULL;
2163 BUG_TRAP(!inet->num || tp->bind_hash);
2165 sk->sk_error_report(sk);
2170 * Wait for an incoming connection, avoid race
2171 * conditions. This must be called with the socket locked.
2173 static int wait_for_connect(struct sock *sk, long timeo)
2175 struct tcp_opt *tp = tcp_sk(sk);
2180 * True wake-one mechanism for incoming connections: only
2181 * one process gets woken up, not the 'whole herd'.
2182 * Since we do not 'race & poll' for established sockets
2183 * anymore, the common case will execute the loop only once.
2185 * Subtle issue: "add_wait_queue_exclusive()" will be added
2186 * after any current non-exclusive waiters, and we know that
2187 * it will always _stay_ after any new non-exclusive waiters
2188 * because all non-exclusive waiters are added at the
2189 * beginning of the wait-queue. As such, it's ok to "drop"
2190 * our exclusiveness temporarily when we get woken up without
2191 * having to remove and re-insert us on the wait queue.
2194 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
2195 TASK_INTERRUPTIBLE);
2197 if (!tp->accept_queue)
2198 timeo = schedule_timeout(timeo);
2201 if (tp->accept_queue)
2204 if (sk->sk_state != TCP_LISTEN)
2206 err = sock_intr_errno(timeo);
2207 if (signal_pending(current))
2213 finish_wait(sk->sk_sleep, &wait);
2218 * This will accept the next outstanding connection.
2221 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2223 struct tcp_opt *tp = tcp_sk(sk);
2224 struct open_request *req;
2227 #ifdef CONFIG_ACCEPT_QUEUES
2234 /* We need to make sure that this socket is listening,
2235 * and that it has something pending.
2238 if (sk->sk_state != TCP_LISTEN)
2241 /* Find already established connection */
2242 if (!tp->accept_queue) {
2243 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2244 /* If this is a non blocking socket don't sleep */
2249 error = wait_for_connect(sk, timeo);
2254 #ifndef CONFIG_ACCEPT_QUEUES
2255 req = tp->accept_queue;
2256 if ((tp->accept_queue = req->dl_next) == NULL)
2257 tp->accept_queue_tail = NULL;
2259 sk_acceptq_removed(sk);
2261 first = tp->class_index;
2262 /* We should always have request queued here. The accept_queue
2263 * is already checked for NULL above.
2265 while(!tp->acceptq[first].aq_head) {
2266 tp->acceptq[first].aq_cnt = 0;
2267 first = (first+1) & ~NUM_ACCEPT_QUEUES;
2269 req = tp->acceptq[first].aq_head;
2270 tp->acceptq[first].aq_qcount--;
2271 tp->acceptq[first].aq_count++;
2272 tp->acceptq[first].aq_wait_time+=(jiffies - req->acceptq_time_stamp);
2274 for (prev_class= first-1 ; prev_class >=0; prev_class--)
2275 if (tp->acceptq[prev_class].aq_tail)
2278 tp->acceptq[prev_class].aq_tail->dl_next = req->dl_next;
2280 tp->accept_queue = req->dl_next;
2282 if (req == tp->acceptq[first].aq_tail)
2283 tp->acceptq[first].aq_head = tp->acceptq[first].aq_tail = NULL;
2285 tp->acceptq[first].aq_head = req->dl_next;
2287 if((++(tp->acceptq[first].aq_cnt)) >= tp->acceptq[first].aq_ratio){
2288 tp->acceptq[first].aq_cnt = 0;
2289 tp->class_index = ++first & (NUM_ACCEPT_QUEUES-1);
2292 sk_acceptq_removed(sk, req->acceptq_class);
2294 tcp_openreq_fastfree(req);
2295 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2307 * Socket option code for TCP.
2309 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2312 struct tcp_opt *tp = tcp_sk(sk);
2316 if (level != SOL_TCP)
2317 return tp->af_specific->setsockopt(sk, level, optname,
2320 if (optlen < sizeof(int))
2323 if (get_user(val, (int __user *)optval))
2330 /* Values greater than interface MTU won't take effect. However
2331 * at the point when this call is done we typically don't yet
2332 * know which interface is going to be used */
2333 if (val < 8 || val > MAX_TCP_WINDOW) {
2342 /* TCP_NODELAY is weaker than TCP_CORK, so that
2343 * this option on corked socket is remembered, but
2344 * it is not activated until cork is cleared.
2346 * However, when TCP_NODELAY is set we make
2347 * an explicit push, which overrides even TCP_CORK
2348 * for currently queued segments.
2350 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2351 tcp_push_pending_frames(sk, tp);
2353 tp->nonagle &= ~TCP_NAGLE_OFF;
2358 /* When set indicates to always queue non-full frames.
2359 * Later the user clears this option and we transmit
2360 * any pending partial frames in the queue. This is
2361 * meant to be used alongside sendfile() to get properly
2362 * filled frames when the user (for example) must write
2363 * out headers with a write() call first and then use
2364 * sendfile to send out the data parts.
2366 * TCP_CORK can be set together with TCP_NODELAY and it is
2367 * stronger than TCP_NODELAY.
2370 tp->nonagle |= TCP_NAGLE_CORK;
2372 tp->nonagle &= ~TCP_NAGLE_CORK;
2373 if (tp->nonagle&TCP_NAGLE_OFF)
2374 tp->nonagle |= TCP_NAGLE_PUSH;
2375 tcp_push_pending_frames(sk, tp);
2380 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2383 tp->keepalive_time = val * HZ;
2384 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2385 !((1 << sk->sk_state) &
2386 (TCPF_CLOSE | TCPF_LISTEN))) {
2387 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2388 if (tp->keepalive_time > elapsed)
2389 elapsed = tp->keepalive_time - elapsed;
2392 tcp_reset_keepalive_timer(sk, elapsed);
2397 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2400 tp->keepalive_intvl = val * HZ;
2403 if (val < 1 || val > MAX_TCP_KEEPCNT)
2406 tp->keepalive_probes = val;
2409 if (val < 1 || val > MAX_TCP_SYNCNT)
2412 tp->syn_retries = val;
2418 else if (val > sysctl_tcp_fin_timeout / HZ)
2421 tp->linger2 = val * HZ;
2424 case TCP_DEFER_ACCEPT:
2425 tp->defer_accept = 0;
2427 /* Translate value in seconds to number of
2429 while (tp->defer_accept < 32 &&
2430 val > ((TCP_TIMEOUT_INIT / HZ) <<
2437 case TCP_WINDOW_CLAMP:
2439 if (sk->sk_state != TCP_CLOSE) {
2443 tp->window_clamp = 0;
2445 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2446 SOCK_MIN_RCVBUF / 2 : val;
2451 tp->ack.pingpong = 1;
2453 tp->ack.pingpong = 0;
2454 if ((1 << sk->sk_state) &
2455 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2456 tcp_ack_scheduled(tp)) {
2457 tp->ack.pending |= TCP_ACK_PUSHED;
2458 cleanup_rbuf(sk, 1);
2460 tp->ack.pingpong = 1;
2465 #ifdef CONFIG_ACCEPT_QUEUES
2466 case TCP_ACCEPTQ_SHARE:
2468 // If CKRM is set then the shares are set through rcfs.
2469 // Get shares will still succeed.
2474 char share_wt[NUM_ACCEPT_QUEUES];
2477 if (sk->sk_state != TCP_LISTEN)
2480 if (copy_from_user(share_wt,optval, optlen)) {
2485 for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
2489 else if (share_wt[i] < j) {
2494 tp->acceptq[i].aq_ratio = 0;
2498 /* Class 0 is always valid. If nothing is
2499 * specified set class 0 as 1.
2504 for (i=0; i < NUM_ACCEPT_QUEUES; i++) {
2505 tp->acceptq[i].aq_ratio = share_wt[i]/j;
2506 tp->acceptq[i].aq_cnt = 0;
2520 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2523 struct tcp_opt *tp = tcp_sk(sk);
2526 if (level != SOL_TCP)
2527 return tp->af_specific->getsockopt(sk, level, optname,
2530 if (get_user(len, optlen))
2533 len = min_t(unsigned int, len, sizeof(int));
2540 val = tp->mss_cache_std;
2541 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2545 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2548 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2551 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2554 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2557 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2560 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2565 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2567 case TCP_DEFER_ACCEPT:
2568 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2569 (tp->defer_accept - 1));
2571 case TCP_WINDOW_CLAMP:
2572 val = tp->window_clamp;
2575 struct tcp_info info;
2577 if (get_user(len, optlen))
2580 tcp_get_info(sk, &info);
2582 len = min_t(unsigned int, len, sizeof(info));
2583 if (put_user(len, optlen))
2585 if (copy_to_user(optval, &info, len))
2590 val = !tp->ack.pingpong;
2593 #ifdef CONFIG_ACCEPT_QUEUES
2594 case TCP_ACCEPTQ_SHARE:
2596 struct tcp_acceptq_info tinfo[NUM_ACCEPT_QUEUES];
2599 if (sk->sk_state != TCP_LISTEN)
2602 if (get_user(len, optlen))
2605 memset(tinfo, 0, sizeof(tinfo));
2607 for(i=0; i < NUM_ACCEPT_QUEUES; i++) {
2608 tinfo[i].acceptq_wait_time =
2609 jiffies_to_msecs(tp->acceptq[i].aq_wait_time);
2610 tinfo[i].acceptq_qcount = tp->acceptq[i].aq_qcount;
2611 tinfo[i].acceptq_count = tp->acceptq[i].aq_count;
2612 tinfo[i].acceptq_shares=tp->acceptq[i].aq_ratio;
2615 len = min_t(unsigned int, len, sizeof(tinfo));
2616 if (put_user(len, optlen))
2619 if (copy_to_user(optval, (char *)tinfo, len))
2627 return -ENOPROTOOPT;
2630 if (put_user(len, optlen))
2632 if (copy_to_user(optval, &val, len))
2638 extern void __skb_cb_too_small_for_tcp(int, int);
2639 extern void tcpdiag_init(void);
2641 static __initdata unsigned long thash_entries;
2642 static int __init set_thash_entries(char *str)
2646 thash_entries = simple_strtoul(str, &str, 0);
2649 __setup("thash_entries=", set_thash_entries);
2651 void __init tcp_init(void)
2653 struct sk_buff *skb = NULL;
2657 if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2658 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2661 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2662 sizeof(struct open_request),
2663 0, SLAB_HWCACHE_ALIGN,
2665 if (!tcp_openreq_cachep)
2666 panic("tcp_init: Cannot alloc open_request cache.");
2668 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2669 sizeof(struct tcp_bind_bucket),
2670 0, SLAB_HWCACHE_ALIGN,
2672 if (!tcp_bucket_cachep)
2673 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2675 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2676 sizeof(struct tcp_tw_bucket),
2677 0, SLAB_HWCACHE_ALIGN,
2679 if (!tcp_timewait_cachep)
2680 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2682 /* Size and allocate the main established and bind bucket
2685 * The methodology is similar to that of the buffer cache.
2687 if (num_physpages >= (128 * 1024))
2688 goal = num_physpages >> (21 - PAGE_SHIFT);
2690 goal = num_physpages >> (23 - PAGE_SHIFT);
2693 goal = (thash_entries * sizeof(struct tcp_ehash_bucket)) >> PAGE_SHIFT;
2694 for (order = 0; (1UL << order) < goal; order++)
2697 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2698 sizeof(struct tcp_ehash_bucket);
2699 tcp_ehash_size >>= 1;
2700 while (tcp_ehash_size & (tcp_ehash_size - 1))
2702 tcp_ehash = (struct tcp_ehash_bucket *)
2703 __get_free_pages(GFP_ATOMIC, order);
2704 } while (!tcp_ehash && --order > 0);
2707 panic("Failed to allocate TCP established hash table\n");
2708 for (i = 0; i < (tcp_ehash_size << 1); i++) {
2709 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2710 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2714 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2715 sizeof(struct tcp_bind_hashbucket);
2716 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2718 tcp_bhash = (struct tcp_bind_hashbucket *)
2719 __get_free_pages(GFP_ATOMIC, order);
2720 } while (!tcp_bhash && --order >= 0);
2723 panic("Failed to allocate TCP bind hash table\n");
2724 for (i = 0; i < tcp_bhash_size; i++) {
2725 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2726 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2729 /* Try to be a bit smarter and adjust defaults depending
2730 * on available memory.
2733 sysctl_local_port_range[0] = 32768;
2734 sysctl_local_port_range[1] = 61000;
2735 sysctl_tcp_max_tw_buckets = 180000;
2736 sysctl_tcp_max_orphans = 4096 << (order - 4);
2737 sysctl_max_syn_backlog = 1024;
2738 } else if (order < 3) {
2739 sysctl_local_port_range[0] = 1024 * (3 - order);
2740 sysctl_tcp_max_tw_buckets >>= (3 - order);
2741 sysctl_tcp_max_orphans >>= (3 - order);
2742 sysctl_max_syn_backlog = 128;
2744 tcp_port_rover = sysctl_local_port_range[0] - 1;
2746 sysctl_tcp_mem[0] = 768 << order;
2747 sysctl_tcp_mem[1] = 1024 << order;
2748 sysctl_tcp_mem[2] = 1536 << order;
2751 sysctl_tcp_wmem[2] = 64 * 1024;
2752 sysctl_tcp_rmem[0] = PAGE_SIZE;
2753 sysctl_tcp_rmem[1] = 43689;
2754 sysctl_tcp_rmem[2] = 2 * 43689;
2757 printk(KERN_INFO "TCP: Hash tables configured "
2758 "(established %d bind %d)\n",
2759 tcp_ehash_size << 1, tcp_bhash_size);
2764 EXPORT_SYMBOL(__tcp_mem_reclaim);
2765 EXPORT_SYMBOL(sysctl_tcp_rmem);
2766 EXPORT_SYMBOL(sysctl_tcp_wmem);
2767 EXPORT_SYMBOL(tcp_accept);
2768 EXPORT_SYMBOL(tcp_close);
2769 EXPORT_SYMBOL(tcp_close_state);
2770 EXPORT_SYMBOL(tcp_destroy_sock);
2771 EXPORT_SYMBOL(tcp_disconnect);
2772 EXPORT_SYMBOL(tcp_getsockopt);
2773 EXPORT_SYMBOL(tcp_ioctl);
2774 EXPORT_SYMBOL(tcp_openreq_cachep);
2775 EXPORT_SYMBOL(tcp_poll);
2776 EXPORT_SYMBOL(tcp_read_sock);
2777 EXPORT_SYMBOL(tcp_recvmsg);
2778 EXPORT_SYMBOL(tcp_sendmsg);
2779 EXPORT_SYMBOL(tcp_sendpage);
2780 EXPORT_SYMBOL(tcp_setsockopt);
2781 EXPORT_SYMBOL(tcp_shutdown);
2782 EXPORT_SYMBOL(tcp_sockets_allocated);
2783 EXPORT_SYMBOL(tcp_statistics);
2784 EXPORT_SYMBOL(tcp_timewait_cachep);
2785 EXPORT_SYMBOL_GPL(cleanup_rbuf);