/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *              Alan Cox        :       Numerous verify_area() calls
 *              Alan Cox        :       Set the ACK bit on a reset
 *              Alan Cox        :       Stopped it crashing if it closed while
 *                                      sk->inuse=1 and was trying to connect
 *                                      (tcp_err()).
 *              Alan Cox        :       All icmp error handling was broken:
 *                                      pointers passed were wrong and the
 *                                      socket was looked up backwards. Nobody
 *                                      tested any icmp error code, obviously.
 *              Alan Cox        :       tcp_err() now handled properly. It
 *                                      wakes people on errors. poll
 *                                      behaves and the icmp error race
 *                                      has gone by moving it into sock.c
 *              Alan Cox        :       tcp_send_reset() fixed to work for
 *                                      everything not just packets for
 *                                      unknown sockets.
 *              Alan Cox        :       tcp option processing.
 *              Alan Cox        :       Reset tweaked (still not 100%) [Had
 *                                      syn rule wrong]
 *              Herp Rosmanith  :       More reset fixes
 *              Alan Cox        :       No longer acks invalid rst frames.
 *                                      Acking any kind of RST is right out.
 *              Alan Cox        :       Sets an ignore me flag on an rst
 *                                      receive otherwise odd bits of prattle
 *                                      escape still
 *              Alan Cox        :       Fixed another acking RST frame bug.
 *                                      Should stop LAN workplace lockups.
 *              Alan Cox        :       Some tidyups using the new skb list
 *                                      facilities
 *              Alan Cox        :       sk->keepopen now seems to work
 *              Alan Cox        :       Pulls options out correctly on accepts
 *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
 *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
 *                                      bit to skb ops.
 *              Alan Cox        :       Tidied tcp_data to avoid a potential
 *                                      nasty.
 *              Alan Cox        :       Added some better commenting, as the
 *                                      tcp is hard to follow
 *              Alan Cox        :       Removed incorrect check for 20 * psh
 *      Michael O'Reilly        :       ack < copied bug fix.
 *      Johannes Stille         :       Misc tcp fixes (not all in yet).
 *              Alan Cox        :       FIN with no memory -> CRASH
 *              Alan Cox        :       Added socket option proto entries.
 *                                      Also added awareness of them to accept.
 *              Alan Cox        :       Added TCP options (SOL_TCP)
 *              Alan Cox        :       Switched wakeup calls to callbacks,
 *                                      so the kernel can layer network
 *                                      sockets.
 *              Alan Cox        :       Use ip_tos/ip_ttl settings.
 *              Alan Cox        :       Handle FIN (more) properly (we hope).
 *              Alan Cox        :       RST frames sent on unsynchronised
 *                                      state ack error.
 *              Alan Cox        :       Put in missing check for SYN bit.
 *              Alan Cox        :       Added tcp_select_window() aka NET2E
 *                                      window non shrink trick.
 *              Alan Cox        :       Added a couple of small NET2E timer
 *                                      fixes
 *              Charles Hedrick :       TCP fixes
 *              Toomas Tamm     :       TCP window fixes
 *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
 *              Charles Hedrick :       Rewrote most of it to actually work
 *              Linus           :       Rewrote tcp_read() and URG handling
 *                                      completely
 *              Gerhard Koerting:       Fixed some missing timer handling
 *              Matthew Dillon  :       Reworked TCP machine states as per RFC
 *              Gerhard Koerting:       PC/TCP workarounds
 *              Adam Caldwell   :       Assorted timer/timing errors
 *              Matthew Dillon  :       Fixed another RST bug
 *              Alan Cox        :       Move to kernel side addressing changes.
 *              Alan Cox        :       Beginning work on TCP fastpathing
 *                                      (not yet usable)
 *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
 *              Alan Cox        :       TCP fast path debugging
 *              Alan Cox        :       Window clamping
 *              Michael Riepe   :       Bug in tcp_check()
 *              Matt Dillon     :       More TCP improvements and RST bug fixes
 *              Matt Dillon     :       Yet more small nasties removed from the
 *                                      TCP code (Be very nice to this man if
 *                                      tcp finally works 100%) 8)
 *              Alan Cox        :       BSD accept semantics.
 *              Alan Cox        :       Reset on closedown bug.
 *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 *              Michael Pall    :       Handle poll() after URG properly in
 *                                      all cases.
 *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 *                                      (multi URG PUSH broke rlogin).
 *              Michael Pall    :       Fix the multi URG PUSH problem in
 *                                      tcp_readable(), poll() after URG
 *                                      works now.
 *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 *                                      BSD api.
 *              Alan Cox        :       Changed the semantics of sk->socket to
 *                                      fix a race and a signal problem with
 *                                      accept() and async I/O.
 *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 *                                      clients/servers which listen in on
 *                                      fixed ports.
 *              Alan Cox        :       Cleaned the above up and shrank it to
 *                                      a sensible code size.
 *              Alan Cox        :       Self connect lockup fix.
 *              Alan Cox        :       No connect to multicast.
 *              Ross Biro       :       Close unaccepted children on master
 *                                      socket close.
 *              Alan Cox        :       Reset tracing code.
 *              Alan Cox        :       Spurious resets on shutdown.
 *              Alan Cox        :       Giant 15 minute/60 second timer error
 *              Alan Cox        :       Small whoops in polling before an
 *                                      accept.
 *              Alan Cox        :       Kept the state trace facility since
 *                                      it's handy for debugging.
 *              Alan Cox        :       More reset handler fixes.
 *              Alan Cox        :       Started rewriting the code based on
 *                                      the RFC's for other useful protocol
 *                                      references see: Comer, KA9Q NOS, and
 *                                      for a reference on the difference
 *                                      between specifications and how BSD
 *                                      works see the 4.4lite source.
 *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 *                                      close.
 *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 *              Alan Cox        :       Reimplemented timers as per the RFC
 *                                      and using multiple timers for sanity.
 *              Alan Cox        :       Small bug fixes, and a lot of new
 *                                      comments.
 *              Alan Cox        :       Fixed dual reader crash by locking
 *                                      the buffers (much like datagram.c)
 *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 *                                      now gets fed up of retrying without
 *                                      (even a no space) answer.
 *              Alan Cox        :       Extracted closing code better
 *              Alan Cox        :       Fixed the closing state machine to
 *                                      resemble the RFC.
 *              Alan Cox        :       More 'per spec' fixes.
 *              Jorge Cwik      :       Even faster checksumming.
 *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 *                                      only frames. At least one pc tcp stack
 *                                      generates them.
 *              Alan Cox        :       Cache last socket.
 *              Alan Cox        :       Per route irtt.
 *              Matt Day        :       poll()->select() match BSD precisely on error
 *              Alan Cox        :       New buffers
 *              Marc Tamsky     :       Various sk->prot->retransmits and
 *                                      sk->retransmits misupdating fixed.
 *                                      Fixed tcp_write_timeout: stuck close,
 *                                      and TCP syn retries gets used now.
 *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 *                                      ack if state is TCP_CLOSED.
 *              Alan Cox        :       Look up device on a retransmit - routes may
 *                                      change. Doesn't yet cope with MSS shrink right
 *                                      but it's a start!
 *              Marc Tamsky     :       Closing in closing fixes.
 *              Mike Shaver     :       RFC1122 verifications.
 *              Alan Cox        :       rcv_saddr errors.
 *              Alan Cox        :       Block double connect().
 *              Alan Cox        :       Small hooks for enSKIP.
 *              Alexey Kuznetsov:       Path MTU discovery.
 *              Alan Cox        :       Support soft errors.
 *              Alan Cox        :       Fix MTU discovery pathological case
 *                                      when the remote claims no mtu!
 *              Marc Tamsky     :       TCP_CLOSE fix.
 *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 *                                      window but wrong (fixes NT lpd problems)
 *              Pedro Roque     :       Better TCP window handling, delayed ack.
 *              Joerg Reuter    :       No modification of locked buffers in
 *                                      tcp_do_retransmit()
 *              Eric Schenk     :       Changed receiver side silly window
 *                                      avoidance algorithm to BSD style
 *                                      algorithm. This doubles throughput
 *                                      against machines running Solaris,
 *                                      and seems to result in general
 *                                      improvement.
 *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 *      Willy Konynenberg       :       Transparent proxying support.
 *      Mike McLagan            :       Routing by source
 *              Keith Owens     :       Do proper merging with partial SKB's in
 *                                      tcp_do_sendmsg to avoid burstiness.
 *              Eric Schenk     :       Fix fast close down bug with
 *                                      shutdown() followed by close().
 *              Andi Kleen      :       Make poll agree with SIGIO
 *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
 *                                      lingertime == 0 (RFC 793 ABORT Call)
 *      Hirokazu Takahashi      :       Use copy_from_user() instead of
 *                                      csum_and_copy_from_user() if possible.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *      TCP_SYN_SENT            sent a connection request, waiting for ack
 *
 *      TCP_SYN_RECV            received a connection request, sent ack,
 *                              waiting for final ack in three-way handshake.
 *
 *      TCP_ESTABLISHED         connection established
 *
 *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 *                              transmission of remaining buffered data
 *
 *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 *                              to shutdown
 *
 *      TCP_CLOSING             both sides have shutdown but we still have
 *                              data we have to finish sending
 *
 *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 *                              closed, can only be entered from FIN_WAIT2
 *                              or CLOSING.  Required because the other end
 *                              may not have gotten our last ACK causing it
 *                              to retransmit the data packet (which we ignore)
 *
 *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 *                              us to finish writing our data and to shutdown
 *                              (we have to close() to move on to LAST_ACK)
 *
 *      TCP_LAST_ACK            our side has shutdown after remote has
 *                              shutdown.  There may still be data in our
 *                              buffer that we have to finish sending
 *
 *      TCP_CLOSE               socket is finished
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/smp_lock.h>
#include <linux/fs.h>
#include <linux/random.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>


#include <asm/uaccess.h>
#include <asm/ioctls.h>

int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;

DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);

kmem_cache_t *tcp_openreq_cachep;
kmem_cache_t *tcp_bucket_cachep;
kmem_cache_t *tcp_timewait_cachep;

atomic_t tcp_orphan_count = ATOMIC_INIT(0);

int sysctl_tcp_default_win_scale;

int sysctl_tcp_mem[3];
int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };

atomic_t tcp_memory_allocated;  /* Current allocated memory. */
atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */

/* Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the tcp_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
int tcp_memory_pressure;

#define TCP_PAGES(amt) (((amt) + TCP_MEM_QUANTUM - 1) / TCP_MEM_QUANTUM)

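/*
 * Charge @size bytes of buffer space to @sk, rounding up to whole
 * TCP_MEM_QUANTUM units and updating the global tcp_memory_allocated
 * counter.  @kind is non-zero for receive space and zero for send
 * space.  Returns 1 if the charge is accepted; if the limits in
 * sysctl_tcp_mem cannot accommodate it, the charge is undone and 0
 * is returned.
 */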
int tcp_mem_schedule(struct sock *sk, int size, int kind)
{
        int amt = TCP_PAGES(size);

        sk->sk_forward_alloc += amt * TCP_MEM_QUANTUM;
        atomic_add(amt, &tcp_memory_allocated);

        /* Under limit. */
        if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
                if (tcp_memory_pressure)
                        tcp_memory_pressure = 0;
                return 1;
        }

        /* Over hard limit. */
        if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
                tcp_enter_memory_pressure();
                goto suppress_allocation;
        }

        /* Under pressure. */
        if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
                tcp_enter_memory_pressure();

        if (kind) {
                if (atomic_read(&sk->sk_rmem_alloc) < sysctl_tcp_rmem[0])
                        return 1;
        } else if (sk->sk_wmem_queued < sysctl_tcp_wmem[0])
                return 1;

        if (!tcp_memory_pressure ||
            sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) *
                                TCP_PAGES(sk->sk_wmem_queued +
                                          atomic_read(&sk->sk_rmem_alloc) +
                                          sk->sk_forward_alloc))
                return 1;

suppress_allocation:

        if (!kind) {
                tcp_moderate_sndbuf(sk);

                /* Fail only if socket is _under_ its sndbuf.
                 * In this case we cannot block, so that we have to fail.
                 */
                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
                        return 1;
        }

        /* Alas. Undo changes. */
        sk->sk_forward_alloc -= amt * TCP_MEM_QUANTUM;
        atomic_sub(amt, &tcp_memory_allocated);
        return 0;
}

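/*
 * Return the socket's unused forward-allocated quanta to the global
 * pool, keeping only the sub-quantum remainder, and clear the memory
 * pressure flag once the global allocation drops below the low
 * threshold.
 */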
void __tcp_mem_reclaim(struct sock *sk)
{
        if (sk->sk_forward_alloc >= TCP_MEM_QUANTUM) {
                atomic_sub(sk->sk_forward_alloc / TCP_MEM_QUANTUM,
                           &tcp_memory_allocated);
                sk->sk_forward_alloc &= TCP_MEM_QUANTUM - 1;
                if (tcp_memory_pressure &&
                    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
                        tcp_memory_pressure = 0;
        }
}

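/*
 * Destructor for receive skbs: uncharge the skb's true size from
 * sk_rmem_alloc and hand the space back to the socket's forward
 * allocation.
 */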
void tcp_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
        sk->sk_forward_alloc += skb->truesize;
}

/*
 * LISTEN is a special case for poll..
 */
static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
                                               poll_table *wait)
{
        return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
}

/*
 *      Wait for a TCP event.
 *
 *      Note that we don't need to lock the socket, as the upper poll layers
 *      take care of normal races (between the test and the event) and we don't
 *      go look at any of the socket buffers directly.
 */
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
        unsigned int mask;
        struct sock *sk = sock->sk;
        struct tcp_opt *tp = tcp_sk(sk);

        poll_wait(file, sk->sk_sleep, wait);
        if (sk->sk_state == TCP_LISTEN)
                return tcp_listen_poll(sk, wait);

        /* Socket is not locked. We are protected from async events
         * by poll logic and correct handling of state changes
         * made by other threads is impossible in any case.
         */

        mask = 0;
        if (sk->sk_err)
                mask = POLLERR;

        /*
         * POLLHUP is certainly not done right. But poll() doesn't
         * have a notion of HUP in just one direction, and for a
         * socket the read side is more interesting.
         *
         * Some poll() documentation says that POLLHUP is incompatible
         * with the POLLOUT/POLLWR flags, so somebody should check this
         * all. But careful, it tends to be safer to return too many
         * bits than too few, and you can easily break real applications
         * if you don't tell them that something has hung up!
         *
         * Check-me.
         *
         * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
         * our fs/select.c). It means that after we received EOF,
         * poll always returns immediately, making it impossible to poll()
         * for write() in state CLOSE_WAIT. One solution is evident --- to set
         * POLLHUP if and only if shutdown has been made in both directions.
         * Actually, it is interesting to look at how Solaris and DUX
         * solve this dilemma. I would prefer, if POLLHUP were maskable,
         * then we could set it on SND_SHUTDOWN. BTW examples given
         * in Stevens' books assume exactly this behaviour, it explains
         * why POLLHUP is incompatible with POLLOUT.    --ANK
         *
         * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
         * blocking on fresh not-connected or disconnected socket. --ANK
         */
        if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
                mask |= POLLHUP;
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                mask |= POLLIN | POLLRDNORM;

        /* Connected? */
        if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                /* Potential race condition. If the read of tp below escapes
                 * above sk->sk_state, we can be illegally awakened in
                 * SYN_* states. */
                if ((tp->rcv_nxt != tp->copied_seq) &&
                    (tp->urg_seq != tp->copied_seq ||
                     tp->rcv_nxt != tp->copied_seq + 1 ||
                     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
                        mask |= POLLIN | POLLRDNORM;

                if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
                        if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
                                mask |= POLLOUT | POLLWRNORM;
                        } else {  /* send SIGIO later */
                                set_bit(SOCK_ASYNC_NOSPACE,
                                        &sk->sk_socket->flags);
                                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

                                /* Race breaker. If space is freed after
                                 * wspace test but before the flags are set,
                                 * IO signal will be lost.
                                 */
                                if (tcp_wspace(sk) >= tcp_min_write_space(sk))
                                        mask |= POLLOUT | POLLWRNORM;
                        }
                }

                if (tp->urg_data & TCP_URG_VALID)
                        mask |= POLLPRI;
        }
        return mask;
}

/*
 *      TCP socket write_space callback.
 */
void tcp_write_space(struct sock *sk)
{
        struct socket *sock = sk->sk_socket;

        if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
                clear_bit(SOCK_NOSPACE, &sock->flags);

                if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                        wake_up_interruptible(sk->sk_sleep);

                if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
                        sock_wake_async(sock, 2, POLL_OUT);
        }
}

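/*
 * Socket-level ioctl handler: SIOCINQ returns the number of bytes
 * queued for reading (stopping short of urgent data and not counting
 * a queued FIN), SIOCATMARK reports whether the read pointer sits at
 * the urgent mark, and SIOCOUTQ returns the amount of data in the
 * write queue not yet acknowledged.
 */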
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
        struct tcp_opt *tp = tcp_sk(sk);
        int answ;

        switch (cmd) {
        case SIOCINQ:
                if (sk->sk_state == TCP_LISTEN)
                        return -EINVAL;

                lock_sock(sk);
                if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
                        answ = 0;
                else if (sock_flag(sk, SOCK_URGINLINE) ||
                         !tp->urg_data ||
                         before(tp->urg_seq, tp->copied_seq) ||
                         !before(tp->urg_seq, tp->rcv_nxt)) {
                        answ = tp->rcv_nxt - tp->copied_seq;

                        /* Subtract 1, if FIN is in queue. */
                        if (answ && !skb_queue_empty(&sk->sk_receive_queue))
                                answ -= ((struct sk_buff *)
                                         sk->sk_receive_queue.prev)->h.th->fin;
                } else
                        answ = tp->urg_seq - tp->copied_seq;
                release_sock(sk);
                break;
        case SIOCATMARK:
                answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
                break;
        case SIOCOUTQ:
                if (sk->sk_state == TCP_LISTEN)
                        return -EINVAL;

                if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
                        answ = 0;
                else
                        answ = tp->write_seq - tp->snd_una;
                break;
        default:
                return -ENOIOCTLCMD;
        }

        return put_user(answ, (int __user *)arg);
}

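/*
 * Move a socket into the LISTEN state: initialise the accept queue
 * and SYN table, pick a hash secret, and bind the port via
 * get_port().  On a port clash the listen state is rolled back and
 * -EADDRINUSE is returned.
 */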
int tcp_listen_start(struct sock *sk)
{
        struct inet_opt *inet = inet_sk(sk);
        struct tcp_opt *tp = tcp_sk(sk);
        struct tcp_listen_opt *lopt;

        sk->sk_max_ack_backlog = 0;
        sk->sk_ack_backlog = 0;
        tp->accept_queue = tp->accept_queue_tail = NULL;
        tp->syn_wait_lock = RW_LOCK_UNLOCKED;
        tcp_delack_init(tp);

        lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
        if (!lopt)
                return -ENOMEM;

        memset(lopt, 0, sizeof(struct tcp_listen_opt));
        for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
                if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
                        break;
        get_random_bytes(&lopt->hash_rnd, 4);

        write_lock_bh(&tp->syn_wait_lock);
        tp->listen_opt = lopt;
        write_unlock_bh(&tp->syn_wait_lock);

        /* There is a race window here: we announce ourselves listening,
         * but this transition is still not validated by get_port().
         * It is OK, because this socket enters the hash table only
         * after validation is complete.
         */
        sk->sk_state = TCP_LISTEN;
        if (!sk->sk_prot->get_port(sk, inet->num)) {
                inet->sport = htons(inet->num);

                sk_dst_reset(sk);
                sk->sk_prot->hash(sk);

                return 0;
        }

        sk->sk_state = TCP_CLOSE;
        write_lock_bh(&tp->syn_wait_lock);
        tp->listen_opt = NULL;
        write_unlock_bh(&tp->syn_wait_lock);
        kfree(lopt);
        return -EADDRINUSE;
}

/*
 *      This routine closes sockets which have been at least partially
 *      opened, but not yet accepted.
 */

static void tcp_listen_stop(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct tcp_listen_opt *lopt = tp->listen_opt;
        struct open_request *acc_req = tp->accept_queue;
        struct open_request *req;
        int i;

        tcp_delete_keepalive_timer(sk);

        /* make all the listen_opt local to us */
        write_lock_bh(&tp->syn_wait_lock);
        tp->listen_opt = NULL;
        write_unlock_bh(&tp->syn_wait_lock);
        tp->accept_queue = tp->accept_queue_tail = NULL;

        if (lopt->qlen) {
                for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
                        while ((req = lopt->syn_table[i]) != NULL) {
                                lopt->syn_table[i] = req->dl_next;
                                lopt->qlen--;
                                tcp_openreq_free(req);

                                /* Following the specs, it would be better either
                                 * to send a FIN (and enter FIN-WAIT-1, it is a
                                 * normal close) or to send an active reset (abort).
                                 * Certainly, it is pretty dangerous during a
                                 * synflood, but that is a bad justification for
                                 * our negligence 8)
                                 * To be honest, we are not able to make either
                                 * of the variants now.                 --ANK
                                 */
                        }
                }
        }
        BUG_TRAP(!lopt->qlen);

        kfree(lopt);

        while ((req = acc_req) != NULL) {
                struct sock *child = req->sk;

                acc_req = req->dl_next;

                local_bh_disable();
                bh_lock_sock(child);
                BUG_TRAP(!sock_owned_by_user(child));
                sock_hold(child);

                tcp_disconnect(child, O_NONBLOCK);

                sock_orphan(child);

                atomic_inc(&tcp_orphan_count);

                tcp_destroy_sock(child);

                bh_unlock_sock(child);
                local_bh_enable();
                sock_put(child);

                tcp_acceptq_removed(sk);
                tcp_openreq_fastfree(req);
        }
        BUG_TRAP(!sk->sk_ack_backlog);
}

/*
 *      Wait for a socket to get into the connected state
 *
 *      Note: Must be called with the socket locked.
 */
static int wait_for_tcp_connect(struct sock *sk, int flags, long *timeo_p)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct task_struct *tsk = current;
        DEFINE_WAIT(wait);

        while ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
                if (sk->sk_err)
                        return sock_error(sk);
                if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
                        return -EPIPE;
                if (!*timeo_p)
                        return -EAGAIN;
                if (signal_pending(tsk))
                        return sock_intr_errno(*timeo_p);

                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
                tp->write_pending++;

                release_sock(sk);
                *timeo_p = schedule_timeout(*timeo_p);
                lock_sock(sk);

                finish_wait(sk->sk_sleep, &wait);
                tp->write_pending--;
        }
        return 0;
}

static inline int tcp_memory_free(struct sock *sk)
{
        return sk->sk_wmem_queued < sk->sk_sndbuf;
}

/*
 *      Wait for more memory for a socket
 */
static int wait_for_tcp_memory(struct sock *sk, long *timeo)
{
        struct tcp_opt *tp = tcp_sk(sk);
        int err = 0;
        long vm_wait = 0;
        long current_timeo = *timeo;
        DEFINE_WAIT(wait);

        if (tcp_memory_free(sk))
                current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;

        for (;;) {
                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);

                if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
                        goto do_error;
                if (!*timeo)
                        goto do_nonblock;
                if (signal_pending(current))
                        goto do_interrupted;
                clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
                if (tcp_memory_free(sk) && !vm_wait)
                        break;

                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                tp->write_pending++;
                release_sock(sk);
                if (!tcp_memory_free(sk) || vm_wait)
                        current_timeo = schedule_timeout(current_timeo);
                lock_sock(sk);
                tp->write_pending--;

                if (vm_wait) {
                        vm_wait -= current_timeo;
                        current_timeo = *timeo;
                        if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
                            (current_timeo -= vm_wait) < 0)
                                current_timeo = 0;
                        vm_wait = 0;
                }
                *timeo = current_timeo;
        }
out:
        finish_wait(sk->sk_sleep, &wait);
        return err;

do_error:
        err = -EPIPE;
        goto out;
do_nonblock:
        err = -EAGAIN;
        goto out;
do_interrupted:
        err = sock_intr_errno(*timeo);
        goto out;
}

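/*
 * Return true if the data at @page/@off continues the last page
 * fragment already attached to @skb, so the copy can simply extend
 * that fragment instead of consuming a new slot.
 */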
static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
                               int off)
{
        if (i) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
                return page == frag->page &&
                       off == frag->page_offset + frag->size;
        }
        return 0;
}

static inline void fill_page_desc(struct sk_buff *skb, int i,
                                  struct page *page, int off, int size)
{
        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
        frag->page = page;
        frag->page_offset = off;
        frag->size = size;
        skb_shinfo(skb)->nr_frags = i + 1;
}

static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
{
        TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
        tp->pushed_seq = tp->write_seq;
}

static inline int forced_push(struct tcp_opt *tp)
{
        return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

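/*
 * Tail a freshly allocated skb onto the write queue: stamp its
 * sequence numbers and control flags, charge its memory to the
 * socket, and make it the send head if the queue was previously
 * empty.
 */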
static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
                              struct sk_buff *skb)
{
        skb->csum = 0;
        TCP_SKB_CB(skb)->seq = tp->write_seq;
        TCP_SKB_CB(skb)->end_seq = tp->write_seq;
        TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
        TCP_SKB_CB(skb)->sacked = 0;
        __skb_queue_tail(&sk->sk_write_queue, skb);
        tcp_charge_skb(sk, skb);
        if (!tp->send_head)
                tp->send_head = skb;
        else if (tp->nonagle & TCP_NAGLE_PUSH)
                tp->nonagle &= ~TCP_NAGLE_PUSH;
}

static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
                                struct sk_buff *skb)
{
        if (flags & MSG_OOB) {
                tp->urg_mode = 1;
                tp->snd_up = tp->write_seq;
                TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
        }
}

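/*
 * Push out any pending data on the write queue, marking the last skb
 * with PSH (and URG when MSG_OOB is set) as required, and let
 * __tcp_push_pending_frames() apply the Nagle/cork policy.
 */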
static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
                            int mss_now, int nonagle)
{
        if (tp->send_head) {
                struct sk_buff *skb = sk->sk_write_queue.prev;
                if (!(flags & MSG_MORE) || forced_push(tp))
                        tcp_mark_push(tp, skb);
                tcp_mark_urg(tp, flags, skb);
                __tcp_push_pending_frames(sk, tp, mss_now,
                                          (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
        }
}

static int tcp_error(struct sock *sk, int flags, int err)
{
        if (err == -EPIPE)
                err = sock_error(sk) ? : -EPIPE;
        if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
                send_sig(SIGPIPE, current, 0);
        return err;
}

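/*
 * Zero-copy sendpage worker: attach the caller's pages to write-queue
 * skbs as page fragments (coalescing with the previous fragment when
 * possible), leave checksumming to the hardware, and push segments
 * out as they fill.  Blocks in wait_for_tcp_memory() when the send
 * buffer is full.
 */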
static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
                                size_t psize, int flags)
{
        struct tcp_opt *tp = tcp_sk(sk);
        int mss_now;
        int err;
        ssize_t copied;
        long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

        /* Wait for a connection to finish. */
        if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
                if ((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
                        goto out_err;

        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

        mss_now = tcp_current_mss(sk, !(flags & MSG_OOB));
        copied = 0;

        err = -EPIPE;
        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
                goto do_error;

        while (psize > 0) {
                struct sk_buff *skb = sk->sk_write_queue.prev;
                struct page *page = pages[poffset / PAGE_SIZE];
                int copy, i;
                int offset = poffset % PAGE_SIZE;
                int size = min_t(size_t, psize, PAGE_SIZE - offset);

                if (!tp->send_head || (copy = mss_now - skb->len) <= 0) {
new_segment:
                        if (!tcp_memory_free(sk))
                                goto wait_for_sndbuf;

                        skb = tcp_alloc_pskb(sk, 0, tp->mss_cache,
                                             sk->sk_allocation);
                        if (!skb)
                                goto wait_for_memory;

                        skb_entail(sk, tp, skb);
                        copy = mss_now;
                }

                if (copy > size)
                        copy = size;

                i = skb_shinfo(skb)->nr_frags;
                if (can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i - 1].size += copy;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        fill_page_desc(skb, i, page, offset, copy);
                } else {
                        tcp_mark_push(tp, skb);
                        goto new_segment;
                }

                skb->len += copy;
                skb->data_len += copy;
                skb->ip_summed = CHECKSUM_HW;
                tp->write_seq += copy;
                TCP_SKB_CB(skb)->end_seq += copy;

                if (!copied)
                        TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

                copied += copy;
                poffset += copy;
                if (!(psize -= copy))
                        goto out;

                if (skb->len != mss_now || (flags & MSG_OOB))
                        continue;

                if (forced_push(tp)) {
                        tcp_mark_push(tp, skb);
                        __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
                } else if (skb == tp->send_head)
                        tcp_push_one(sk, mss_now);
                continue;

wait_for_sndbuf:
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
                if (copied)
                        tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

                if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
                        goto do_error;

                mss_now = tcp_current_mss(sk, !(flags & MSG_OOB));
        }

out:
        if (copied)
                tcp_push(sk, tp, flags, mss_now, tp->nonagle);
        return copied;

do_error:
        if (copied)
                goto out;
out_err:
        return tcp_error(sk, flags, err);
}

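/*
 * sendpage() entry point.  Falls back to sock_no_sendpage() when the
 * route's device cannot do scatter-gather plus hardware checksumming;
 * otherwise hands the page to do_tcp_sendpages() under the socket
 * lock.
 */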
ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
                     size_t size, int flags)
{
        ssize_t res;
        struct sock *sk = sock->sk;

#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)

        if (!(sk->sk_route_caps & NETIF_F_SG) ||
            !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
                return sock_no_sendpage(sock, page, offset, size, flags);

#undef TCP_ZC_CSUM_FLAGS

        lock_sock(sk);
        TCP_CHECK_TIMER(sk);
        res = do_tcp_sendpages(sk, &page, offset, size, flags);
        TCP_CHECK_TIMER(sk);
        release_sock(sk);
        return res;
}

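/*
 * Per-socket cache of the partially filled page used by tcp_sendmsg()
 * and the offset of its first free byte.
 */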
#define TCP_PAGE(sk)    (inet_sk(sk)->sndmsg_page)
#define TCP_OFF(sk)     (inet_sk(sk)->sndmsg_off)

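/*
 * Copy @copy bytes of user data into @page at @off, checksumming on
 * the fly when the hardware cannot, and account the bytes against
 * @skb and the socket's send-buffer charges.
 */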
static inline int tcp_copy_to_page(struct sock *sk, char __user *from,
                                   struct sk_buff *skb, struct page *page,
                                   int off, int copy)
{
        int err = 0;
        unsigned int csum;

        if (skb->ip_summed == CHECKSUM_NONE) {
                csum = csum_and_copy_from_user(from, page_address(page) + off,
                                               copy, 0, &err);
                if (err)
                        return err;
                skb->csum = csum_block_add(skb->csum, csum, skb->len);
        } else {
                if (copy_from_user(page_address(page) + off, from, copy))
                        return -EFAULT;
        }

        skb->len += copy;
        skb->data_len += copy;
        skb->truesize += copy;
        sk->sk_wmem_queued += copy;
        sk->sk_forward_alloc -= copy;
        return 0;
}

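/*
 * Append @copy bytes of user data to the linear area of @skb,
 * checksumming during the copy when needed; on fault the skb is
 * trimmed back to its original length and -EFAULT is returned.
 */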
static inline int skb_add_data(struct sk_buff *skb, char __user *from, int copy)
{
        int err = 0;
        unsigned int csum;
        int off = skb->len;

        if (skb->ip_summed == CHECKSUM_NONE) {
                csum = csum_and_copy_from_user(from, skb_put(skb, copy),
                                               copy, 0, &err);
                if (!err) {
                        skb->csum = csum_block_add(skb->csum, csum, off);
                        return 0;
                }
        } else {
                if (!copy_from_user(skb_put(skb, copy), from, copy))
                        return 0;
        }

        __skb_trim(skb, off);
        return -EFAULT;
}

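/*
 * Choose how much linear space to reserve in a new skb: normally the
 * cached MSS, but on scatter-gather devices clamp it to what fits in
 * the skb head so the rest can be carried in page fragments.
 */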
static inline int select_size(struct sock *sk, struct tcp_opt *tp)
{
        int tmp = tp->mss_cache_std;

        if (sk->sk_route_caps & NETIF_F_SG) {
                int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);

                if (tmp >= pgbreak &&
                    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
                        tmp = pgbreak;
        }
        return tmp;
}

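/*
 * Copy data from the user's iovec into write-queue skbs, filling the
 * linear area first and then per-socket cached pages on
 * scatter-gather devices, and push segments out as each one reaches
 * the current MSS.  Blocks waiting for the connection to be
 * established or for send buffer space unless the caller asked for
 * non-blocking behaviour.
 */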
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                size_t size)
{
        struct iovec *iov;
        struct tcp_opt *tp = tcp_sk(sk);
        struct sk_buff *skb;
        int iovlen, flags;
        int mss_now;
        int err, copied;
        long timeo;

        lock_sock(sk);
        TCP_CHECK_TIMER(sk);

        flags = msg->msg_flags;
        timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

        /* Wait for a connection to finish. */
        if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
                if ((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
                        goto out_err;

        /* This should be in poll */
        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

        mss_now = tcp_current_mss(sk, !(flags & MSG_OOB));

        /* Ok, commence sending. */
        iovlen = msg->msg_iovlen;
        iov = msg->msg_iov;
        copied = 0;

        err = -EPIPE;
        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
                goto do_error;

        while (--iovlen >= 0) {
                int seglen = iov->iov_len;
                unsigned char __user *from = iov->iov_base;

                iov++;

                while (seglen > 0) {
                        int copy;

                        skb = sk->sk_write_queue.prev;

                        if (!tp->send_head ||
                            (copy = mss_now - skb->len) <= 0) {

new_segment:
                                /* Allocate a new segment. If the interface is
                                 * SG, allocate an skb that fits into a single
                                 * page.
                                 */
                                if (!tcp_memory_free(sk))
                                        goto wait_for_sndbuf;

                                skb = tcp_alloc_pskb(sk, select_size(sk, tp),
                                                     0, sk->sk_allocation);
                                if (!skb)
                                        goto wait_for_memory;

                                /*
                                 * Check whether we can use HW checksum.
                                 */
                                if (sk->sk_route_caps &
                                    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
                                     NETIF_F_HW_CSUM))
                                        skb->ip_summed = CHECKSUM_HW;

                                skb_entail(sk, tp, skb);
                                copy = mss_now;
                        }

                        /* Try to append data to the end of skb. */
                        if (copy > seglen)
                                copy = seglen;

                        /* Where to copy to? */
                        if (skb_tailroom(skb) > 0) {
                                /* We have some space in skb head. Superb! */
                                if (copy > skb_tailroom(skb))
                                        copy = skb_tailroom(skb);
                                if ((err = skb_add_data(skb, from, copy)) != 0)
                                        goto do_fault;
                        } else {
                                int merge = 0;
                                int i = skb_shinfo(skb)->nr_frags;
                                struct page *page = TCP_PAGE(sk);
                                int off = TCP_OFF(sk);

                                if (can_coalesce(skb, i, page, off) &&
                                    off != PAGE_SIZE) {
                                        /* We can extend the last page
                                         * fragment. */
                                        merge = 1;
                                } else if (i == MAX_SKB_FRAGS ||
                                           (!i &&
                                           !(sk->sk_route_caps & NETIF_F_SG))) {
                                        /* Need to add new fragment and cannot
                                         * do this because interface is non-SG,
                                         * or because all the page slots are
                                         * busy. */
                                        tcp_mark_push(tp, skb);
                                        goto new_segment;
                                } else if (page) {
                                        /* If page is cached, align
                                         * offset to L1 cache boundary
                                         */
                                        off = (off + L1_CACHE_BYTES - 1) &
                                              ~(L1_CACHE_BYTES - 1);
                                        if (off == PAGE_SIZE) {
                                                put_page(page);
                                                TCP_PAGE(sk) = page = NULL;
                                        }
                                }

                                if (!page) {
                                        /* Allocate new cache page. */
                                        if (!(page = tcp_alloc_page(sk)))
                                                goto wait_for_memory;
                                        off = 0;
                                }

                                if (copy > PAGE_SIZE - off)
                                        copy = PAGE_SIZE - off;

                                /* Time to copy data. We are close to
                                 * the end! */
                                err = tcp_copy_to_page(sk, from, skb, page,
                                                       off, copy);
                                if (err) {
                                        /* If this page was new, give it to the
                                         * socket so it does not get leaked.
                                         */
                                        if (!TCP_PAGE(sk)) {
                                                TCP_PAGE(sk) = page;
                                                TCP_OFF(sk) = 0;
                                        }
                                        goto do_error;
                                }

                                /* Update the skb. */
                                if (merge) {
                                        skb_shinfo(skb)->frags[i - 1].size +=
                                                                        copy;
                                } else {
                                        fill_page_desc(skb, i, page, off, copy);
                                        if (TCP_PAGE(sk)) {
                                                get_page(page);
                                        } else if (off + copy < PAGE_SIZE) {
                                                get_page(page);
                                                TCP_PAGE(sk) = page;
                                        }
                                }

                                TCP_OFF(sk) = off + copy;
                        }

                        if (!copied)
                                TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

                        tp->write_seq += copy;
                        TCP_SKB_CB(skb)->end_seq += copy;

                        from += copy;
                        copied += copy;
                        if ((seglen -= copy) == 0 && iovlen == 0)
                                goto out;

                        if (skb->len != mss_now || (flags & MSG_OOB))
                                continue;

                        if (forced_push(tp)) {
                                tcp_mark_push(tp, skb);
                                __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
                        } else if (skb == tp->send_head)
                                tcp_push_one(sk, mss_now);
                        continue;

wait_for_sndbuf:
                        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
                        if (copied)
                                tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

                        if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
                                goto do_error;

                        mss_now = tcp_current_mss(sk, !(flags & MSG_OOB));
                }
        }

out:
        if (copied)
                tcp_push(sk, tp, flags, mss_now, tp->nonagle);
        TCP_CHECK_TIMER(sk);
        release_sock(sk);
        return copied;

do_fault:
        if (!skb->len) {
                if (tp->send_head == skb)
                        tp->send_head = NULL;
                __skb_unlink(skb, skb->list);
                tcp_free_skb(sk, skb);
        }

do_error:
        if (copied)
                goto out;
out_err:
        err = tcp_error(sk, flags, err);
        TCP_CHECK_TIMER(sk);
        release_sock(sk);
        return err;
}
1247
1248 /*
1249  *      Handle reading urgent data. BSD has very simple semantics for
1250  *      this, no blocking and very strange errors 8)
1251  */
1252
1253 static int tcp_recv_urg(struct sock *sk, long timeo,
1254                         struct msghdr *msg, int len, int flags,
1255                         int *addr_len)
1256 {
1257         struct tcp_opt *tp = tcp_sk(sk);
1258
1259         /* No URG data to read. */
1260         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1261             tp->urg_data == TCP_URG_READ)
1262                 return -EINVAL; /* Yes this is right ! */
1263
1264         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1265                 return -ENOTCONN;
1266
1267         if (tp->urg_data & TCP_URG_VALID) {
1268                 int err = 0;
1269                 char c = tp->urg_data;
1270
1271                 if (!(flags & MSG_PEEK))
1272                         tp->urg_data = TCP_URG_READ;
1273
1274                 /* Read urgent data. */
1275                 msg->msg_flags |= MSG_OOB;
1276
1277                 if (len > 0) {
1278                         if (!(flags & MSG_TRUNC))
1279                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1280                         len = 1;
1281                 } else
1282                         msg->msg_flags |= MSG_TRUNC;
1283
1284                 return err ? -EFAULT : len;
1285         }
1286
1287         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1288                 return 0;
1289
1290         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1291          * the available implementations agree in this case:
1292          * this call should never block, independent of the
1293          * blocking state of the socket.
1294          * Mike <pall@rz.uni-karlsruhe.de>
1295          */
1296         return -EAGAIN;
1297 }
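/* A minimal userspace sketch of the MSG_OOB semantics implemented by
 * tcp_recv_urg() above.  The helper name and the omitted socket setup and
 * error handling are illustrative only, not part of this file.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>

/* Fetch the single urgent ("out-of-band") byte from a connected TCP
 * socket.  As described above this never blocks: if no valid urgent byte
 * is pending the call fails instead of waiting, and with SO_OOBINLINE set
 * (SOCK_URGINLINE here) MSG_OOB reads are rejected with EINVAL.
 */
static int read_urgent_byte(int fd, char *out)
{
	ssize_t n = recv(fd, out, 1, MSG_OOB);

	return n == 1 ? 0 : -1;
}
#endif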
1298
1299 /*
1300  *      Release an skb if it is no longer needed. This routine
1301  *      must be called with interrupts disabled or with the
1302  *      socket locked so that the sk_buff queue operation is safe.
1303  */
1304
1305 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
1306 {
1307         __skb_unlink(skb, &sk->sk_receive_queue);
1308         __kfree_skb(skb);
1309 }
1310
1311 /* Clean up the receive buffer for full frames taken by the user,
1312  * then send an ACK if necessary.  COPIED is the number of bytes
1313  * tcp_recvmsg has given to the user so far; it speeds up the
1314  * calculation of whether or not we must ACK for the sake of
1315  * a window update.
1316  */
1317 void cleanup_rbuf(struct sock *sk, int copied)
1318 {
1319         struct tcp_opt *tp = tcp_sk(sk);
1320         int time_to_ack = 0;
1321
1322 #if TCP_DEBUG
1323         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1324
1325         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1326 #endif
1327
1328         if (tcp_ack_scheduled(tp)) {
1329                 /* Delayed ACKs frequently hit locked sockets during bulk
1330                  * receive. */
1331                 if (tp->ack.blocked ||
1332                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1333                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1334                     /*
1335                      * If this read emptied read buffer, we send ACK, if
1336                      * connection is not bidirectional, user drained
1337                      * receive buffer and there was a small segment
1338                      * in queue.
1339                      */
1340                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1341                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1342                         time_to_ack = 1;
1343         }
1344
1345         /* We send an ACK if we can now advertise a non-zero window
1346          * which has been raised "significantly".
1347          *
1348          * Even if the window was raised up to infinity, do not send a window-open
1349          * ACK in states where we will not receive more data. It is useless.
1350          */
1351         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1352                 __u32 rcv_window_now = tcp_receive_window(tp);
1353
1354                 /* Optimize, __tcp_select_window() is not cheap. */
1355                 if (2*rcv_window_now <= tp->window_clamp) {
1356                         __u32 new_window = __tcp_select_window(sk);
1357
1358                         /* Send an ACK now if this read freed lots of space
1359                          * in our buffer.  new_window is the window we could
1360                          * advertise now; do so only if it is not less than the
1361                          * current one.  "Lots" means "at least twice" here.
1362                          */
1363                         if (new_window && new_window >= 2 * rcv_window_now)
1364                                 time_to_ack = 1;
1365                 }
1366         }
1367         if (time_to_ack)
1368                 tcp_send_ack(sk);
1369 }
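/* A standalone sketch of the "raised significantly" test above; the
 * function, its parameters and the sample numbers are illustrative and do
 * not correspond to kernel APIs.
 */
#if 0
#include <stdio.h>

/* Return 1 when a pure window-update ACK is worthwhile: mirroring
 * cleanup_rbuf(), the window we could advertise now must be at least
 * twice the window the peer currently sees, and the current window must
 * itself be no more than half of window_clamp.
 */
static int window_update_worthwhile(unsigned int rcv_window_now,
				    unsigned int window_clamp,
				    unsigned int new_window)
{
	if (2 * rcv_window_now > window_clamp)
		return 0;	/* window is already large, don't bother */
	return new_window && new_window >= 2 * rcv_window_now;
}

int main(void)
{
	/* e.g. 8 KB currently advertised, 64 KB clamp, 32 KB free now */
	printf("%d\n", window_update_worthwhile(8192, 65536, 32768)); /* 1 */
	return 0;
}
#endif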
1370
1371 /* Socket state, including sk->sk_err, is now changed only under the lock,
1372  * hence we may omit checks after joining the wait queue.
1373  * We check the receive queue before schedule() only as an optimization;
1374  * it is very likely that release_sock() added new data.
1375  */
1376
1377 static long tcp_data_wait(struct sock *sk, long timeo)
1378 {
1379         DEFINE_WAIT(wait);
1380
1381         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1382
1383         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1384         release_sock(sk);
1385
1386         if (skb_queue_empty(&sk->sk_receive_queue))
1387                 timeo = schedule_timeout(timeo);
1388
1389         lock_sock(sk);
1390         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1391
1392         finish_wait(sk->sk_sleep, &wait);
1393         return timeo;
1394 }
1395
1396 static void tcp_prequeue_process(struct sock *sk)
1397 {
1398         struct sk_buff *skb;
1399         struct tcp_opt *tp = tcp_sk(sk);
1400
1401         NET_ADD_STATS_USER(TCPPrequeued, skb_queue_len(&tp->ucopy.prequeue));
1402
1403         /* The RX process wants to run with BHs disabled, though it is not
1404          * strictly necessary. */
1405         local_bh_disable();
1406         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1407                 sk->sk_backlog_rcv(sk, skb);
1408         local_bh_enable();
1409
1410         /* Clear memory counter. */
1411         tp->ucopy.memory = 0;
1412 }
1413
1414 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1415 {
1416         struct sk_buff *skb;
1417         u32 offset;
1418
1419         skb_queue_walk(&sk->sk_receive_queue, skb) {
1420                 offset = seq - TCP_SKB_CB(skb)->seq;
1421                 if (skb->h.th->syn)
1422                         offset--;
1423                 if (offset < skb->len || skb->h.th->fin) {
1424                         *off = offset;
1425                         return skb;
1426                 }
1427         }
1428         return NULL;
1429 }
1430
1431 /*
1432  * This routine provides an alternative to tcp_recvmsg() for routines
1433  * that would like to handle copying from skbuffs directly in 'sendfile'
1434  * fashion.
1435  * Note:
1436  *      - It is assumed that the socket was locked by the caller.
1437  *      - The routine does not block.
1438  *      - At present, there is no support for reading OOB data
1439  *        or for 'peeking' the socket using this routine
1440  *        (although both would be easy to implement).
1441  */
1442 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1443                   sk_read_actor_t recv_actor)
1444 {
1445         struct sk_buff *skb;
1446         struct tcp_opt *tp = tcp_sk(sk);
1447         u32 seq = tp->copied_seq;
1448         u32 offset;
1449         int copied = 0;
1450
1451         if (sk->sk_state == TCP_LISTEN)
1452                 return -ENOTCONN;
1453         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1454                 if (offset < skb->len) {
1455                         size_t used, len;
1456
1457                         len = skb->len - offset;
1458                         /* Stop reading if we hit a patch of urgent data */
1459                         if (tp->urg_data) {
1460                                 u32 urg_offset = tp->urg_seq - seq;
1461                                 if (urg_offset < len)
1462                                         len = urg_offset;
1463                                 if (!len)
1464                                         break;
1465                         }
1466                         used = recv_actor(desc, skb, offset, len);
1467                         if (used <= len) {
1468                                 seq += used;
1469                                 copied += used;
1470                                 offset += used;
1471                         }
1472                         if (offset != skb->len)
1473                                 break;
1474                 }
1475                 if (skb->h.th->fin) {
1476                         tcp_eat_skb(sk, skb);
1477                         ++seq;
1478                         break;
1479                 }
1480                 tcp_eat_skb(sk, skb);
1481                 if (!desc->count)
1482                         break;
1483         }
1484         tp->copied_seq = seq;
1485
1486         tcp_rcv_space_adjust(sk);
1487
1488         /* Clean up data we have read: This will do ACK frames. */
1489         if (copied)
1490                 cleanup_rbuf(sk, copied);
1491         return copied;
1492 }
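/* A minimal sketch of an sk_read_actor_t callback for tcp_read_sock().
 * It only counts bytes and relies on nothing beyond the desc->count field
 * already used above; the callback name is illustrative.
 */
#if 0
/* Consume up to 'len' bytes at 'offset' within 'skb' without copying them
 * anywhere, merely accounting for how much was "used".  tcp_read_sock()
 * advances copied_seq by the return value and stops when the actor
 * consumes less than it was offered or desc->count reaches zero.
 */
static int count_bytes_actor(read_descriptor_t *desc, struct sk_buff *skb,
			     unsigned int offset, size_t len)
{
	size_t used = len < desc->count ? len : desc->count;

	desc->count -= used;
	return used;
}
#endif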
1493
1494 /*
1495  *      This routine copies from a sock struct into the user buffer.
1496  *
1497  *      Technical note: in 2.3 we work on a _locked_ socket, so that
1498  *      tricks with *seq access order and skb->users are not required.
1499  *      The code can probably be improved even more.
1500  */
1501
1502 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1503                 size_t len, int nonblock, int flags, int *addr_len)
1504 {
1505         struct tcp_opt *tp = tcp_sk(sk);
1506         int copied = 0;
1507         u32 peek_seq;
1508         u32 *seq;
1509         unsigned long used;
1510         int err;
1511         int target;             /* Read at least this many bytes */
1512         long timeo;
1513         struct task_struct *user_recv = NULL;
1514
1515         lock_sock(sk);
1516
1517         TCP_CHECK_TIMER(sk);
1518
1519         err = -ENOTCONN;
1520         if (sk->sk_state == TCP_LISTEN)
1521                 goto out;
1522
1523         timeo = sock_rcvtimeo(sk, nonblock);
1524
1525         /* Urgent data needs to be handled specially. */
1526         if (flags & MSG_OOB)
1527                 goto recv_urg;
1528
1529         seq = &tp->copied_seq;
1530         if (flags & MSG_PEEK) {
1531                 peek_seq = tp->copied_seq;
1532                 seq = &peek_seq;
1533         }
1534
1535         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1536
1537         do {
1538                 struct sk_buff *skb;
1539                 u32 offset;
1540
1541                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1542                 if (tp->urg_data && tp->urg_seq == *seq) {
1543                         if (copied)
1544                                 break;
1545                         if (signal_pending(current)) {
1546                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1547                                 break;
1548                         }
1549                 }
1550
1551                 /* Next get a buffer. */
1552
1553                 skb = skb_peek(&sk->sk_receive_queue);
1554                 do {
1555                         if (!skb)
1556                                 break;
1557
1558                         /* Now that we have two receive queues this
1559                          * shouldn't happen.
1560                          */
1561                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1562                                 printk(KERN_INFO "recvmsg bug: copied %X "
1563                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1564                                 break;
1565                         }
1566                         offset = *seq - TCP_SKB_CB(skb)->seq;
1567                         if (skb->h.th->syn)
1568                                 offset--;
1569                         if (offset < skb->len)
1570                                 goto found_ok_skb;
1571                         if (skb->h.th->fin)
1572                                 goto found_fin_ok;
1573                         BUG_TRAP(flags & MSG_PEEK);
1574                         skb = skb->next;
1575                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1576
1577                 /* Well, if we have backlog, try to process it now. */
1578
1579                 if (copied >= target && !sk->sk_backlog.tail)
1580                         break;
1581
1582                 if (copied) {
1583                         if (sk->sk_err ||
1584                             sk->sk_state == TCP_CLOSE ||
1585                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1586                             !timeo ||
1587                             signal_pending(current) ||
1588                             (flags & MSG_PEEK))
1589                                 break;
1590                 } else {
1591                         if (sock_flag(sk, SOCK_DONE))
1592                                 break;
1593
1594                         if (sk->sk_err) {
1595                                 copied = sock_error(sk);
1596                                 break;
1597                         }
1598
1599                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1600                                 break;
1601
1602                         if (sk->sk_state == TCP_CLOSE) {
1603                                 if (!sock_flag(sk, SOCK_DONE)) {
1604                                         /* This occurs when the user tries to read
1605                                          * from a never-connected socket.
1606                                          */
1607                                         copied = -ENOTCONN;
1608                                         break;
1609                                 }
1610                                 break;
1611                         }
1612
1613                         if (!timeo) {
1614                                 copied = -EAGAIN;
1615                                 break;
1616                         }
1617
1618                         if (signal_pending(current)) {
1619                                 copied = sock_intr_errno(timeo);
1620                                 break;
1621                         }
1622                 }
1623
1624                 cleanup_rbuf(sk, copied);
1625
1626                 if (tp->ucopy.task == user_recv) {
1627                         /* Install new reader */
1628                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1629                                 user_recv = current;
1630                                 tp->ucopy.task = user_recv;
1631                                 tp->ucopy.iov = msg->msg_iov;
1632                         }
1633
1634                         tp->ucopy.len = len;
1635
1636                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1637                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1638
1639                         /* Ugly... If the prequeue is not empty, we have to
1640                          * process it before releasing the socket, otherwise
1641                          * ordering will be broken at the second iteration.
1642                          * A more elegant solution is required!!!
1643                          *
1644                          * Look: we have the following (pseudo)queues:
1645                          *
1646                          * 1. packets in flight
1647                          * 2. backlog
1648                          * 3. prequeue
1649                          * 4. receive_queue
1650                          *
1651                          * Each queue can be processed only if the next ones
1652                          * are empty. At this point we have an empty receive_queue.
1653                          * But the prequeue _can_ be non-empty after the 2nd iteration,
1654                          * when we jumped to the start of the loop because backlog
1655                          * processing added something to receive_queue.
1656                          * We cannot release_sock(), because the backlog contains
1657                          * packets that arrived _after_ the prequeued ones.
1658                          *
1659                          * In short, the algorithm is clear --- process all
1660                          * the queues in order. We could do it more directly,
1661                          * requeueing packets from the backlog to the prequeue if
1662                          * it is not empty. That would be more elegant, but it eats
1663                          * cycles, unfortunately.
1664                          */
1665                         if (skb_queue_len(&tp->ucopy.prequeue))
1666                                 goto do_prequeue;
1667
1668                         /* __ Set realtime policy in scheduler __ */
1669                 }
1670
1671                 if (copied >= target) {
1672                         /* Do not sleep, just process backlog. */
1673                         release_sock(sk);
1674                         lock_sock(sk);
1675                 } else {
1676                         timeo = tcp_data_wait(sk, timeo);
1677                 }
1678
1679                 if (user_recv) {
1680                         int chunk;
1681
1682                         /* __ Restore normal policy in scheduler __ */
1683
1684                         if ((chunk = len - tp->ucopy.len) != 0) {
1685                                 NET_ADD_STATS_USER(TCPDirectCopyFromBacklog, chunk);
1686                                 len -= chunk;
1687                                 copied += chunk;
1688                         }
1689
1690                         if (tp->rcv_nxt == tp->copied_seq &&
1691                             skb_queue_len(&tp->ucopy.prequeue)) {
1692 do_prequeue:
1693                                 tcp_prequeue_process(sk);
1694
1695                                 if ((chunk = len - tp->ucopy.len) != 0) {
1696                                         NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1697                                         len -= chunk;
1698                                         copied += chunk;
1699                                 }
1700                         }
1701                 }
1702                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1703                         if (net_ratelimit())
1704                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1705                                        current->comm, current->pid);
1706                         peek_seq = tp->copied_seq;
1707                 }
1708                 continue;
1709
1710         found_ok_skb:
1711                 /* Ok so how much can we use? */
1712                 used = skb->len - offset;
1713                 if (len < used)
1714                         used = len;
1715
1716                 /* Do we have urgent data here? */
1717                 if (tp->urg_data) {
1718                         u32 urg_offset = tp->urg_seq - *seq;
1719                         if (urg_offset < used) {
1720                                 if (!urg_offset) {
1721                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1722                                                 ++*seq;
1723                                                 offset++;
1724                                                 used--;
1725                                                 if (!used)
1726                                                         goto skip_copy;
1727                                         }
1728                                 } else
1729                                         used = urg_offset;
1730                         }
1731                 }
1732
1733                 if (!(flags & MSG_TRUNC)) {
1734                         err = skb_copy_datagram_iovec(skb, offset,
1735                                                       msg->msg_iov, used);
1736                         if (err) {
1737                                 /* Exception. Bailout! */
1738                                 if (!copied)
1739                                         copied = -EFAULT;
1740                                 break;
1741                         }
1742                 }
1743
1744                 *seq += used;
1745                 copied += used;
1746                 len -= used;
1747
1748                 tcp_rcv_space_adjust(sk);
1749
1750 skip_copy:
1751                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1752                         tp->urg_data = 0;
1753                         tcp_fast_path_check(sk, tp);
1754                 }
1755                 if (used + offset < skb->len)
1756                         continue;
1757
1758                 if (skb->h.th->fin)
1759                         goto found_fin_ok;
1760                 if (!(flags & MSG_PEEK))
1761                         tcp_eat_skb(sk, skb);
1762                 continue;
1763
1764         found_fin_ok:
1765                 /* Process the FIN. */
1766                 ++*seq;
1767                 if (!(flags & MSG_PEEK))
1768                         tcp_eat_skb(sk, skb);
1769                 break;
1770         } while (len > 0);
1771
1772         if (user_recv) {
1773                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1774                         int chunk;
1775
1776                         tp->ucopy.len = copied > 0 ? len : 0;
1777
1778                         tcp_prequeue_process(sk);
1779
1780                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1781                                 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1782                                 len -= chunk;
1783                                 copied += chunk;
1784                         }
1785                 }
1786
1787                 tp->ucopy.task = NULL;
1788                 tp->ucopy.len = 0;
1789         }
1790
1791         /* According to UNIX98, msg_name/msg_namelen are ignored
1792  * on a connected socket. I was just happy when I found this 8) --ANK
1793          */
1794
1795         /* Clean up data we have read: This will do ACK frames. */
1796         cleanup_rbuf(sk, copied);
1797
1798         TCP_CHECK_TIMER(sk);
1799         release_sock(sk);
1800         return copied;
1801
1802 out:
1803         TCP_CHECK_TIMER(sk);
1804         release_sock(sk);
1805         return err;
1806
1807 recv_urg:
1808         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1809         goto out;
1810 }
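/* An illustrative userspace sketch of two receive paths above: MSG_PEEK
 * copies data without consuming it (the peek_seq case), while MSG_WAITALL
 * raises 'target' to the full request so the call keeps waiting until the
 * whole record, an error, or EOF arrives.  Names are illustrative only.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>

/* Peek at the first byte without consuming it, then block until an entire
 * fixed-size record has been received.
 */
static ssize_t read_record(int fd, void *buf, size_t len)
{
	char first;

	if (recv(fd, &first, 1, MSG_PEEK) <= 0)
		return -1;
	return recv(fd, buf, len, MSG_WAITALL);
}
#endif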
1811
1812 /*
1813  *      State processing on a close. This implements the state shift for
1814  *      sending our FIN frame. Note that we only send a FIN for some
1815  *      states. A shutdown() may have already sent the FIN, or we may be
1816  *      closed.
1817  */
1818
1819 static unsigned char new_state[16] = {
1820   /* current state:        new state:      action:      */
1821   /* (Invalid)          */ TCP_CLOSE,
1822   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1823   /* TCP_SYN_SENT       */ TCP_CLOSE,
1824   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1825   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1826   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1827   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1828   /* TCP_CLOSE          */ TCP_CLOSE,
1829   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1830   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1831   /* TCP_LISTEN         */ TCP_CLOSE,
1832   /* TCP_CLOSING        */ TCP_CLOSING,
1833 };
1834
1835 static int tcp_close_state(struct sock *sk)
1836 {
1837         int next = (int)new_state[sk->sk_state];
1838         int ns = next & TCP_STATE_MASK;
1839
1840         tcp_set_state(sk, ns);
1841
1842         return next & TCP_ACTION_FIN;
1843 }
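/* Worked example of the table above: close() on an ESTABLISHED socket
 * reads new_state[TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, so
 * tcp_close_state() moves the socket to FIN-WAIT-1 and returns non-zero,
 * telling the caller to transmit a FIN.  A socket still in SYN_SENT maps
 * straight to TCP_CLOSE, with no FIN sent at all.
 */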
1844
1845 /*
1846  *      Shutdown the sending side of a connection. Much like close except
1847  *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1848  */
1849
1850 void tcp_shutdown(struct sock *sk, int how)
1851 {
1852         /*      We need to grab some memory, and put together a FIN,
1853          *      and then put it into the queue to be sent.
1854          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1855          */
1856         if (!(how & SEND_SHUTDOWN))
1857                 return;
1858
1859         /* If we've already sent a FIN, or it's a closed state, skip this. */
1860         if ((1 << sk->sk_state) &
1861             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1862              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1863                 /* Clear out any half completed packets.  FIN if needed. */
1864                 if (tcp_close_state(sk))
1865                         tcp_send_fin(sk);
1866         }
1867 }
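/* A small userspace sketch of the half-close implemented above: shutting
 * down only the send side emits a FIN while the receive side stays open.
 * The helper name and the missing error handling are illustrative.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

/* Tell the peer we are done sending (the SEND_SHUTDOWN/tcp_send_fin()
 * path above) but keep reading until the peer closes its side too.
 */
static void finish_sending(int fd)
{
	char buf[4096];

	shutdown(fd, SHUT_WR);		/* the FIN goes out here */
	while (read(fd, buf, sizeof(buf)) > 0)
		;			/* drain the peer's remaining data */
	close(fd);
}
#endif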
1868
1869
1870 /*
1871  *      Return 1 if we still have things to send in our buffers.
1872  */
1873
1874 static inline int closing(struct sock *sk)
1875 {
1876         return (1 << sk->sk_state) &
1877                (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
1878 }
1879
1880 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1881 {
1882         /* First the read buffer. */
1883         __skb_queue_purge(&sk->sk_receive_queue);
1884
1885         /* Next, the error queue. */
1886         __skb_queue_purge(&sk->sk_error_queue);
1887
1888         /* Next, the write queue. */
1889         BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
1890
1891         /* Account for returned memory. */
1892         tcp_mem_reclaim(sk);
1893
1894         BUG_TRAP(!sk->sk_wmem_queued);
1895         BUG_TRAP(!sk->sk_forward_alloc);
1896
1897         /* It is _impossible_ for the backlog to contain anything
1898          * when we get here.  All user references to this socket
1899          * have gone away; only the net layer can touch it.
1900          */
1901 }
1902
1903 /*
1904  * At this point, there should be no process reference to this
1905  * socket, and thus no user references at all.  Therefore we
1906  * can assume the socket waitqueue is inactive and nobody will
1907  * try to jump onto it.
1908  */
1909 void tcp_destroy_sock(struct sock *sk)
1910 {
1911         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1912         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1913
1914         /* It cannot be in the hash table! */
1915         BUG_TRAP(sk_unhashed(sk));
1916
1917         /* If inet_sk(sk)->num is non-zero, it must be bound. */
1918         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1919
1920 #ifdef TCP_DEBUG
1921         if (sk->sk_zapped) {
1922                 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1923                 sock_hold(sk);
1924         }
1925         sk->sk_zapped = 1;
1926 #endif
1927
1928         sk->sk_prot->destroy(sk);
1929
1930         tcp_kill_sk_queues(sk);
1931
1932         xfrm_sk_free_policy(sk);
1933
1934 #ifdef INET_REFCNT_DEBUG
1935         if (atomic_read(&sk->sk_refcnt) != 1) {
1936                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1937                        sk, atomic_read(&sk->sk_refcnt));
1938         }
1939 #endif
1940
1941         atomic_dec(&tcp_orphan_count);
1942         sock_put(sk);
1943 }
1944
1945 void tcp_close(struct sock *sk, long timeout)
1946 {
1947         struct sk_buff *skb;
1948         int data_was_unread = 0;
1949
1950         lock_sock(sk);
1951         sk->sk_shutdown = SHUTDOWN_MASK;
1952
1953         if (sk->sk_state == TCP_LISTEN) {
1954                 tcp_set_state(sk, TCP_CLOSE);
1955
1956                 /* Special case. */
1957                 tcp_listen_stop(sk);
1958
1959                 goto adjudge_to_death;
1960         }
1961
1962         /*  We need to flush the recv. buffs.  We do this only on the
1963          *  descriptor close, not protocol-sourced closes, because the
1964          *  reader process may not have drained the data yet!
1965          */
1966         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1967                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1968                           skb->h.th->fin;
1969                 data_was_unread += len;
1970                 __kfree_skb(skb);
1971         }
1972
1973         tcp_mem_reclaim(sk);
1974
1975         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1976          * 3.10, we send a RST here because data was lost.  To
1977          * witness the awful effects of the old behavior of always
1978          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1979          * a bulk GET in an FTP client, suspend the process, wait
1980          * for the client to advertise a zero window, then kill -9
1981          * the FTP client, wheee...  Note: timeout is always zero
1982          * in such a case.
1983          */
1984         if (data_was_unread) {
1985                 /* Unread data was tossed, zap the connection. */
1986                 NET_INC_STATS_USER(TCPAbortOnClose);
1987                 tcp_set_state(sk, TCP_CLOSE);
1988                 tcp_send_active_reset(sk, GFP_KERNEL);
1989         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1990                 /* Check zero linger _after_ checking for unread data. */
1991                 sk->sk_prot->disconnect(sk, 0);
1992                 NET_INC_STATS_USER(TCPAbortOnData);
1993         } else if (tcp_close_state(sk)) {
1994                 /* We FIN if the application ate all the data before
1995                  * zapping the connection.
1996                  */
1997
1998                 /* RED-PEN. Formally speaking, we have broken TCP state
1999                  * machine. State transitions:
2000                  *
2001                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
2002                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
2003                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
2004                  *
2005                  * are legal only when the FIN has been sent (i.e. is in the
2006                  * window), rather than queued out of window. Purists blame us.
2007                  *
2008                  * E.g. the "RFC state" is ESTABLISHED if the Linux state is
2009                  * FIN-WAIT-1 but the FIN has still not been sent.
2010                  *
2011                  * The visible deviations are that sometimes we enter the
2012                  * time-wait state when it is not really required (harmless),
2013                  * and do not send active resets when they are required by the
2014                  * specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when they look
2015                  * like CLOSING or LAST_ACK to Linux).
2016                  * I probably missed some more small holes.
2017                  *                                              --ANK
2018                  */
2019                 tcp_send_fin(sk);
2020         }
2021
2022         if (timeout) {
2023                 struct task_struct *tsk = current;
2024                 DEFINE_WAIT(wait);
2025
2026                 do {
2027                         prepare_to_wait(sk->sk_sleep, &wait,
2028                                         TASK_INTERRUPTIBLE);
2029                         if (!closing(sk))
2030                                 break;
2031                         release_sock(sk);
2032                         timeout = schedule_timeout(timeout);
2033                         lock_sock(sk);
2034                 } while (!signal_pending(tsk) && timeout);
2035
2036                 finish_wait(sk->sk_sleep, &wait);
2037         }
2038
2039 adjudge_to_death:
2040         /* This is the last release_sock in this socket's life. It will remove the backlog. */
2041         release_sock(sk);
2042
2043
2044         /* Now the socket is owned by the kernel and we acquire the BH lock
2045          * to finish the close. No need to check for user refs.
2046          */
2047         local_bh_disable();
2048         bh_lock_sock(sk);
2049         BUG_TRAP(!sock_owned_by_user(sk));
2050
2051         sock_hold(sk);
2052         sock_orphan(sk);
2053
2054         /*      This is a (useful) BSD violation of the RFC. There is a
2055          *      problem with TCP as specified in that the other end could
2056          *      keep a socket open forever with no application left at this end.
2057          *      We use a 3 minute timeout (about the same as BSD) then kill
2058          *      our end. If they send after that then tough - BUT it is long
2059          *      enough that we won't repeat the old 4*rto = almost-no-time
2060          *      "whoops, reset" mistake.
2061          *
2062          *      Nope, it was not a mistake. It is really the desired behaviour,
2063          *      e.g. on http servers, where such sockets are useless but
2064          *      consume significant resources. Let's do it with the special
2065          *      linger2 option.                                 --ANK
2066          */
2067
2068         if (sk->sk_state == TCP_FIN_WAIT2) {
2069                 struct tcp_opt *tp = tcp_sk(sk);
2070                 if (tp->linger2 < 0) {
2071                         tcp_set_state(sk, TCP_CLOSE);
2072                         tcp_send_active_reset(sk, GFP_ATOMIC);
2073                         NET_INC_STATS_BH(TCPAbortOnLinger);
2074                 } else {
2075                         int tmo = tcp_fin_time(tp);
2076
2077                         if (tmo > TCP_TIMEWAIT_LEN) {
2078                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2079                         } else {
2080                                 atomic_inc(&tcp_orphan_count);
2081                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2082                                 goto out;
2083                         }
2084                 }
2085         }
2086         if (sk->sk_state != TCP_CLOSE) {
2087                 tcp_mem_reclaim(sk);
2088                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2089                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
2090                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2091                         if (net_ratelimit())
2092                                 printk(KERN_INFO "TCP: too many orphaned "
2093                                        "sockets\n");
2094                         tcp_set_state(sk, TCP_CLOSE);
2095                         tcp_send_active_reset(sk, GFP_ATOMIC);
2096                         NET_INC_STATS_BH(TCPAbortOnMemory);
2097                 }
2098         }
2099         atomic_inc(&tcp_orphan_count);
2100
2101         if (sk->sk_state == TCP_CLOSE)
2102                 tcp_destroy_sock(sk);
2103         /* Otherwise, socket is reprieved until protocol close. */
2104
2105 out:
2106         bh_unlock_sock(sk);
2107         local_bh_enable();
2108         sock_put(sk);
2109 }
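/* One userspace path into the abortive branch above (the SOCK_LINGER &&
 * !sk_lingertime case) is a zero-timeout SO_LINGER followed by close();
 * the sketch below is illustrative and omits error handling.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

/* With l_onoff=1 and l_linger=0, the subsequent close() takes the "zero
 * linger" branch in tcp_close(): the connection is torn down immediately
 * and the peer sees a RST rather than an orderly FIN.
 */
static void abortive_close(int fd)
{
	struct linger lin = { 1, 0 };	/* l_onoff, l_linger */

	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
	close(fd);
}
#endif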
2110
2111 /* These states need RST on ABORT according to RFC793 */
2112
2113 static inline int tcp_need_reset(int state)
2114 {
2115         return (1 << state) &
2116                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2117                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2118 }
2119
2120 int tcp_disconnect(struct sock *sk, int flags)
2121 {
2122         struct inet_opt *inet = inet_sk(sk);
2123         struct tcp_opt *tp = tcp_sk(sk);
2124         int err = 0;
2125         int old_state = sk->sk_state;
2126
2127         if (old_state != TCP_CLOSE)
2128                 tcp_set_state(sk, TCP_CLOSE);
2129
2130         /* ABORT function of RFC793 */
2131         if (old_state == TCP_LISTEN) {
2132                 tcp_listen_stop(sk);
2133         } else if (tcp_need_reset(old_state) ||
2134                    (tp->snd_nxt != tp->write_seq &&
2135                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2136                 /* The last check adjusts for the discrepancy of Linux wrt. the
2137                  * RFC states.
2138                  */
2139                 tcp_send_active_reset(sk, gfp_any());
2140                 sk->sk_err = ECONNRESET;
2141         } else if (old_state == TCP_SYN_SENT)
2142                 sk->sk_err = ECONNRESET;
2143
2144         tcp_clear_xmit_timers(sk);
2145         __skb_queue_purge(&sk->sk_receive_queue);
2146         tcp_writequeue_purge(sk);
2147         __skb_queue_purge(&tp->out_of_order_queue);
2148
2149         inet->dport = 0;
2150
2151         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2152                 inet_reset_saddr(sk);
2153
2154         sk->sk_shutdown = 0;
2155         sock_reset_flag(sk, SOCK_DONE);
2156         tp->srtt = 0;
2157         if ((tp->write_seq += tp->max_window + 2) == 0)
2158                 tp->write_seq = 1;
2159         tp->backoff = 0;
2160         tp->snd_cwnd = 2;
2161         tp->probes_out = 0;
2162         tp->packets_out = 0;
2163         tp->snd_ssthresh = 0x7fffffff;
2164         tp->snd_cwnd_cnt = 0;
2165         tcp_set_ca_state(tp, TCP_CA_Open);
2166         tcp_clear_retrans(tp);
2167         tcp_delack_init(tp);
2168         tp->send_head = NULL;
2169         tp->saw_tstamp = 0;
2170         tcp_sack_reset(tp);
2171         __sk_dst_reset(sk);
2172
2173         BUG_TRAP(!inet->num || tp->bind_hash);
2174
2175         sk->sk_error_report(sk);
2176         return err;
2177 }
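/* From userspace the usual way to reach tcp_disconnect() on a connected
 * socket is a connect() with an AF_UNSPEC address, which the stream
 * connect path hands to the protocol's disconnect hook.  The sketch below
 * is illustrative and assumes that convention.
 */
#if 0
#include <string.h>
#include <sys/socket.h>

/* "Dissolve" an existing connection without destroying the socket: the
 * old connection is aborted (a RST is sent where RFC793 requires one) and
 * the socket returns to a clean, unconnected state.
 */
static int tcp_dissolve(int fd)
{
	struct sockaddr sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_UNSPEC;
	return connect(fd, &sa, sizeof(sa));
}
#endif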
2178
2179 /*
2180  *      Wait for an incoming connection, avoid race
2181  *      conditions. This must be called with the socket locked.
2182  */
2183 static int wait_for_connect(struct sock *sk, long timeo)
2184 {
2185         struct tcp_opt *tp = tcp_sk(sk);
2186         DEFINE_WAIT(wait);
2187         int err;
2188
2189         /*
2190          * True wake-one mechanism for incoming connections: only
2191          * one process gets woken up, not the 'whole herd'.
2192          * Since we do not 'race & poll' for established sockets
2193          * anymore, the common case will execute the loop only once.
2194          *
2195          * Subtle issue: "add_wait_queue_exclusive()" will be added
2196          * after any current non-exclusive waiters, and we know that
2197          * it will always _stay_ after any new non-exclusive waiters
2198          * because all non-exclusive waiters are added at the
2199          * beginning of the wait-queue. As such, it's ok to "drop"
2200          * our exclusiveness temporarily when we get woken up without
2201          * having to remove and re-insert us on the wait queue.
2202          */
2203         for (;;) {
2204                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
2205                                           TASK_INTERRUPTIBLE);
2206                 release_sock(sk);
2207                 if (!tp->accept_queue)
2208                         timeo = schedule_timeout(timeo);
2209                 lock_sock(sk);
2210                 err = 0;
2211                 if (tp->accept_queue)
2212                         break;
2213                 err = -EINVAL;
2214                 if (sk->sk_state != TCP_LISTEN)
2215                         break;
2216                 err = sock_intr_errno(timeo);
2217                 if (signal_pending(current))
2218                         break;
2219                 err = -EAGAIN;
2220                 if (!timeo)
2221                         break;
2222         }
2223         finish_wait(sk->sk_sleep, &wait);
2224         return err;
2225 }
2226
2227 /*
2228  *      This will accept the next outstanding connection.
2229  */
2230
2231 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2232 {
2233         struct tcp_opt *tp = tcp_sk(sk);
2234         struct open_request *req;
2235         struct sock *newsk;
2236         int error;
2237
2238         lock_sock(sk);
2239
2240         /* We need to make sure that this socket is listening,
2241          * and that it has something pending.
2242          */
2243         error = -EINVAL;
2244         if (sk->sk_state != TCP_LISTEN)
2245                 goto out;
2246
2247         /* Find already established connection */
2248         if (!tp->accept_queue) {
2249                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2250
2251                 /* If this is a non-blocking socket, don't sleep. */
2252                 error = -EAGAIN;
2253                 if (!timeo)
2254                         goto out;
2255
2256                 error = wait_for_connect(sk, timeo);
2257                 if (error)
2258                         goto out;
2259         }
2260
2261         req = tp->accept_queue;
2262         if ((tp->accept_queue = req->dl_next) == NULL)
2263                 tp->accept_queue_tail = NULL;
2264
2265         newsk = req->sk;
2266         tcp_acceptq_removed(sk);
2267         tcp_openreq_fastfree(req);
2268         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2269         release_sock(sk);
2270         return newsk;
2271
2272 out:
2273         release_sock(sk);
2274         *err = error;
2275         return NULL;
2276 }
2277
2278 /*
2279  *      Socket option code for TCP.
2280  */
2281 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2282                    int optlen)
2283 {
2284         struct tcp_opt *tp = tcp_sk(sk);
2285         int val;
2286         int err = 0;
2287
2288         if (level != SOL_TCP)
2289                 return tp->af_specific->setsockopt(sk, level, optname,
2290                                                    optval, optlen);
2291
2292         if (optlen < sizeof(int))
2293                 return -EINVAL;
2294
2295         if (get_user(val, (int __user *)optval))
2296                 return -EFAULT;
2297
2298         lock_sock(sk);
2299
2300         switch (optname) {
2301         case TCP_MAXSEG:
2302                 /* Values greater than the interface MTU won't take effect. However,
2303                  * at the point when this call is made we typically don't yet
2304                  * know which interface is going to be used. */
2305                 if (val < 8 || val > MAX_TCP_WINDOW) {
2306                         err = -EINVAL;
2307                         break;
2308                 }
2309                 tp->user_mss = val;
2310                 break;
2311
2312         case TCP_NODELAY:
2313                 if (val) {
2314                         /* TCP_NODELAY is weaker than TCP_CORK, so that
2315                          * this option on corked socket is remembered, but
2316                          * it is not activated until cork is cleared.
2317                          *
2318                          * However, when TCP_NODELAY is set we make
2319                          * an explicit push, which overrides even TCP_CORK
2320                          * for currently queued segments.
2321                          */
2322                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2323                         tcp_push_pending_frames(sk, tp);
2324                 } else {
2325                         tp->nonagle &= ~TCP_NAGLE_OFF;
2326                 }
2327                 break;
2328
2329         case TCP_CORK:
2330                 /* When set, this indicates that non-full frames should always
2331                  * be queued.  Later the user clears this option and we transmit
2332                  * any pending partial frames in the queue.  This is
2333                  * meant to be used alongside sendfile() to get properly
2334                  * filled frames when the user (for example) must write
2335                  * out headers with a write() call first and then use
2336                  * sendfile to send out the data parts; see the usage sketch after this function.
2337                  *
2338                  * TCP_CORK can be set together with TCP_NODELAY and it is
2339                  * stronger than TCP_NODELAY.
2340                  */
2341                 if (val) {
2342                         tp->nonagle |= TCP_NAGLE_CORK;
2343                 } else {
2344                         tp->nonagle &= ~TCP_NAGLE_CORK;
2345                         if (tp->nonagle&TCP_NAGLE_OFF)
2346                                 tp->nonagle |= TCP_NAGLE_PUSH;
2347                         tcp_push_pending_frames(sk, tp);
2348                 }
2349                 break;
2350
2351         case TCP_KEEPIDLE:
2352                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2353                         err = -EINVAL;
2354                 else {
2355                         tp->keepalive_time = val * HZ;
2356                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2357                             !((1 << sk->sk_state) &
2358                               (TCPF_CLOSE | TCPF_LISTEN))) {
2359                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2360                                 if (tp->keepalive_time > elapsed)
2361                                         elapsed = tp->keepalive_time - elapsed;
2362                                 else
2363                                         elapsed = 0;
2364                                 tcp_reset_keepalive_timer(sk, elapsed);
2365                         }
2366                 }
2367                 break;
2368         case TCP_KEEPINTVL:
2369                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2370                         err = -EINVAL;
2371                 else
2372                         tp->keepalive_intvl = val * HZ;
2373                 break;
2374         case TCP_KEEPCNT:
2375                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2376                         err = -EINVAL;
2377                 else
2378                         tp->keepalive_probes = val;
2379                 break;
2380         case TCP_SYNCNT:
2381                 if (val < 1 || val > MAX_TCP_SYNCNT)
2382                         err = -EINVAL;
2383                 else
2384                         tp->syn_retries = val;
2385                 break;
2386
2387         case TCP_LINGER2:
2388                 if (val < 0)
2389                         tp->linger2 = -1;
2390                 else if (val > sysctl_tcp_fin_timeout / HZ)
2391                         tp->linger2 = 0;
2392                 else
2393                         tp->linger2 = val * HZ;
2394                 break;
2395
2396         case TCP_DEFER_ACCEPT:
2397                 tp->defer_accept = 0;
2398                 if (val > 0) {
2399                         /* Translate value in seconds to number of
2400                          * retransmits */
2401                         while (tp->defer_accept < 32 &&
2402                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2403                                        tp->defer_accept))
2404                                 tp->defer_accept++;
2405                         tp->defer_accept++;
2406                 }
2407                 break;
2408
2409         case TCP_WINDOW_CLAMP:
2410                 if (!val) {
2411                         if (sk->sk_state != TCP_CLOSE) {
2412                                 err = -EINVAL;
2413                                 break;
2414                         }
2415                         tp->window_clamp = 0;
2416                 } else
2417                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2418                                                 SOCK_MIN_RCVBUF / 2 : val;
2419                 break;
2420
2421         case TCP_QUICKACK:
2422                 if (!val) {
2423                         tp->ack.pingpong = 1;
2424                 } else {
2425                         tp->ack.pingpong = 0;
2426                         if ((1 << sk->sk_state) &
2427                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2428                             tcp_ack_scheduled(tp)) {
2429                                 tp->ack.pending |= TCP_ACK_PUSHED;
2430                                 cleanup_rbuf(sk, 1);
2431                                 if (!(val & 1))
2432                                         tp->ack.pingpong = 1;
2433                         }
2434                 }
2435                 break;
2436
2437         default:
2438                 err = -ENOPROTOOPT;
2439                 break;
2440         };
2441         release_sock(sk);
2442         return err;
2443 }
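/* A userspace sketch of the TCP_CORK pattern described above: cork the
 * socket, write the small headers, stream the body, then uncork so the
 * final partial frame is pushed out.  Descriptor names are illustrative
 * and error handling is omitted.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sendfile.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <unistd.h>

/* Send an HTTP-style response as full frames.  Clearing TCP_CORK at the
 * end triggers the tcp_push_pending_frames() call seen above.
 */
static void send_response(int sock, int filefd, const char *hdr,
			  size_t hdrlen, off_t filelen)
{
	int on = 1, off = 0;

	setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	write(sock, hdr, hdrlen);
	sendfile(sock, filefd, NULL, filelen);
	setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
}
#endif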
2444
2445 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2446                    int __user *optlen)
2447 {
2448         struct tcp_opt *tp = tcp_sk(sk);
2449         int val, len;
2450
2451         if (level != SOL_TCP)
2452                 return tp->af_specific->getsockopt(sk, level, optname,
2453                                                    optval, optlen);
2454
2455         if (get_user(len, optlen))
2456                 return -EFAULT;
2457
2458         len = min_t(unsigned int, len, sizeof(int));
2459
2460         if (len < 0)
2461                 return -EINVAL;
2462
2463         switch (optname) {
2464         case TCP_MAXSEG:
2465                 val = tp->mss_cache_std;
2466                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2467                         val = tp->user_mss;
2468                 break;
2469         case TCP_NODELAY:
2470                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2471                 break;
2472         case TCP_CORK:
2473                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2474                 break;
2475         case TCP_KEEPIDLE:
2476                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2477                 break;
2478         case TCP_KEEPINTVL:
2479                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2480                 break;
2481         case TCP_KEEPCNT:
2482                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2483                 break;
2484         case TCP_SYNCNT:
2485                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2486                 break;
2487         case TCP_LINGER2:
2488                 val = tp->linger2;
2489                 if (val >= 0)
2490                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2491                 break;
2492         case TCP_DEFER_ACCEPT:
2493                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2494                                                (tp->defer_accept - 1));
2495                 break;
2496         case TCP_WINDOW_CLAMP:
2497                 val = tp->window_clamp;
2498                 break;
2499         case TCP_INFO: {
2500                 struct tcp_info info;
2501                 u32 now = tcp_time_stamp;
2502
2503                 if (get_user(len, optlen))
2504                         return -EFAULT;
2505                 info.tcpi_state = sk->sk_state;
2506                 info.tcpi_ca_state = tp->ca_state;
2507                 info.tcpi_retransmits = tp->retransmits;
2508                 info.tcpi_probes = tp->probes_out;
2509                 info.tcpi_backoff = tp->backoff;
2510                 info.tcpi_options = 0;
2511                 if (tp->tstamp_ok)
2512                         info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2513                 if (tp->sack_ok)
2514                         info.tcpi_options |= TCPI_OPT_SACK;
2515                 if (tp->wscale_ok) {
2516                         info.tcpi_options |= TCPI_OPT_WSCALE;
2517                         info.tcpi_snd_wscale = tp->snd_wscale;
2518                         info.tcpi_rcv_wscale = tp->rcv_wscale;
2519                 } else {
2520                         info.tcpi_snd_wscale = 0;
2521                         info.tcpi_rcv_wscale = 0;
2522                 }
2523                 if (tp->ecn_flags & TCP_ECN_OK)
2524                         info.tcpi_options |= TCPI_OPT_ECN;
2525
2526                 info.tcpi_rto = (1000000 * tp->rto) / HZ;
2527                 info.tcpi_ato = (1000000 * tp->ack.ato) / HZ;
2528                 info.tcpi_snd_mss = tp->mss_cache_std;
2529                 info.tcpi_rcv_mss = tp->ack.rcv_mss;
2530
2531                 info.tcpi_unacked = tp->packets_out;
2532                 info.tcpi_sacked = tp->sacked_out;
2533                 info.tcpi_lost = tp->lost_out;
2534                 info.tcpi_retrans = tp->retrans_out;
2535                 info.tcpi_fackets = tp->fackets_out;
2536
2537                 info.tcpi_last_data_sent = ((now - tp->lsndtime) * 1000) / HZ;
2538                 info.tcpi_last_ack_sent = 0;
2539                 info.tcpi_last_data_recv = ((now -
2540                                              tp->ack.lrcvtime) * 1000) / HZ;
2541                 info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp) * 1000) / HZ;
2542
2543                 info.tcpi_pmtu = tp->pmtu_cookie;
2544                 info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2545                 info.tcpi_rtt = ((1000000 * tp->srtt) / HZ) >> 3;
2546                 info.tcpi_rttvar = ((1000000 * tp->mdev) / HZ) >> 2;
2547                 info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2548                 info.tcpi_snd_cwnd = tp->snd_cwnd;
2549                 info.tcpi_advmss = tp->advmss;
2550                 info.tcpi_reordering = tp->reordering;
2551
2552                 len = min_t(unsigned int, len, sizeof(info));
2553                 if (put_user(len, optlen))
2554                         return -EFAULT;
2555                 if (copy_to_user(optval, &info, len))
2556                         return -EFAULT;
2557                 return 0;
2558         }
2559         case TCP_QUICKACK:
2560                 val = !tp->ack.pingpong;
2561                 break;
2562         default:
2563                 return -ENOPROTOOPT;
2564         };
2565
2566         if (put_user(len, optlen))
2567                 return -EFAULT;
2568         if (copy_to_user(optval, &val, len))
2569                 return -EFAULT;
2570         return 0;
2571 }
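/* A userspace sketch of consuming the TCP_INFO block filled in above.
 * Note the units chosen there: tcpi_rtt/tcpi_rttvar in microseconds and
 * the tcpi_last_* fields in milliseconds.  Error handling is omitted.
 */
#if 0
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* Dump a few of the fields exported by the TCP_INFO case above. */
static void dump_tcp_info(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
		printf("rtt=%uus rttvar=%uus cwnd=%u retrans=%u\n",
		       info.tcpi_rtt, info.tcpi_rttvar,
		       info.tcpi_snd_cwnd, info.tcpi_retrans);
}
#endif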
2572
2573
2574 extern void __skb_cb_too_small_for_tcp(int, int);
2575 extern void tcpdiag_init(void);
2576
2577 static __initdata unsigned long thash_entries;
2578 static int __init set_thash_entries(char *str)
2579 {
2580         if (!str)
2581                 return 0;
2582         thash_entries = simple_strtoul(str, &str, 0);
2583         return 1;
2584 }
2585 __setup("thash_entries=", set_thash_entries);
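/* Example: booting with "thash_entries=131072" on the kernel command line
 * asks tcp_init() below to size the established hash for roughly that many
 * buckets instead of deriving the size from num_physpages.
 */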
2586
2587 void __init tcp_init(void)
2588 {
2589         struct sk_buff *skb = NULL;
2590         unsigned long goal;
2591         int order, i;
2592
2593         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2594                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2595                                            sizeof(skb->cb));
2596
2597         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2598                                                    sizeof(struct open_request),
2599                                                0, SLAB_HWCACHE_ALIGN,
2600                                                NULL, NULL);
2601         if (!tcp_openreq_cachep)
2602                 panic("tcp_init: Cannot alloc open_request cache.");
2603
2604         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2605                                               sizeof(struct tcp_bind_bucket),
2606                                               0, SLAB_HWCACHE_ALIGN,
2607                                               NULL, NULL);
2608         if (!tcp_bucket_cachep)
2609                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2610
2611         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2612                                                 sizeof(struct tcp_tw_bucket),
2613                                                 0, SLAB_HWCACHE_ALIGN,
2614                                                 NULL, NULL);
2615         if (!tcp_timewait_cachep)
2616                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2617
2618         /* Size and allocate the main established and bind bucket
2619          * hash tables.
2620          *
2621          * The methodology is similar to that of the buffer cache.
2622          */
2623         if (num_physpages >= (128 * 1024))
2624                 goal = num_physpages >> (21 - PAGE_SHIFT);
2625         else
2626                 goal = num_physpages >> (23 - PAGE_SHIFT);
2627
2628         if (thash_entries)
2629                 goal = (thash_entries * sizeof(struct tcp_ehash_bucket)) >> PAGE_SHIFT;
2630         for (order = 0; (1UL << order) < goal; order++)
2631                 ;
2632         do {
2633                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2634                         sizeof(struct tcp_ehash_bucket);
2635                 tcp_ehash_size >>= 1;
2636                 while (tcp_ehash_size & (tcp_ehash_size - 1))
2637                         tcp_ehash_size--;
2638                 tcp_ehash = (struct tcp_ehash_bucket *)
2639                         __get_free_pages(GFP_ATOMIC, order);
2640         } while (!tcp_ehash && --order > 0);
2641
2642         if (!tcp_ehash)
2643                 panic("Failed to allocate TCP established hash table\n");
2644         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2645                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2646                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2647         }
2648
2649         do {
2650                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2651                         sizeof(struct tcp_bind_hashbucket);
2652                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2653                         continue;
2654                 tcp_bhash = (struct tcp_bind_hashbucket *)
2655                         __get_free_pages(GFP_ATOMIC, order);
2656         } while (!tcp_bhash && --order >= 0);
2657
2658         if (!tcp_bhash)
2659                 panic("Failed to allocate TCP bind hash table\n");
2660         for (i = 0; i < tcp_bhash_size; i++) {
2661                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2662                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2663         }
2664
2665         /* Try to be a bit smarter and adjust defaults depending
2666          * on available memory.
2667          */
2668         if (order > 4) {
2669                 sysctl_local_port_range[0] = 32768;
2670                 sysctl_local_port_range[1] = 61000;
2671                 sysctl_tcp_max_tw_buckets = 180000;
2672                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2673                 sysctl_max_syn_backlog = 1024;
2674         } else if (order < 3) {
2675                 sysctl_local_port_range[0] = 1024 * (3 - order);
2676                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2677                 sysctl_tcp_max_orphans >>= (3 - order);
2678                 sysctl_max_syn_backlog = 128;
2679         }
2680         tcp_port_rover = sysctl_local_port_range[0] - 1;
2681
2682         sysctl_tcp_mem[0] =  768 << order;
2683         sysctl_tcp_mem[1] = 1024 << order;
2684         sysctl_tcp_mem[2] = 1536 << order;
2685
2686         if (order < 3) {
2687                 sysctl_tcp_wmem[2] = 64 * 1024;
2688                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2689                 sysctl_tcp_rmem[1] = 43689;
2690                 sysctl_tcp_rmem[2] = 2 * 43689;
2691         }
2692
2693         printk(KERN_INFO "TCP: Hash tables configured "
2694                "(established %d bind %d)\n",
2695                tcp_ehash_size << 1, tcp_bhash_size);
2696
2697         tcpdiag_init();
2698 }
2699
2700 EXPORT_SYMBOL(__tcp_mem_reclaim);
2701 EXPORT_SYMBOL(sysctl_tcp_rmem);
2702 EXPORT_SYMBOL(sysctl_tcp_wmem);
2703 EXPORT_SYMBOL(tcp_accept);
2704 EXPORT_SYMBOL(tcp_close);
2705 EXPORT_SYMBOL(tcp_close_state);
2706 EXPORT_SYMBOL(tcp_destroy_sock);
2707 EXPORT_SYMBOL(tcp_disconnect);
2708 EXPORT_SYMBOL(tcp_getsockopt);
2709 EXPORT_SYMBOL(tcp_ioctl);
2710 EXPORT_SYMBOL(tcp_openreq_cachep);
2711 EXPORT_SYMBOL(tcp_poll);
2712 EXPORT_SYMBOL(tcp_read_sock);
2713 EXPORT_SYMBOL(tcp_recvmsg);
2714 EXPORT_SYMBOL(tcp_sendmsg);
2715 EXPORT_SYMBOL(tcp_sendpage);
2716 EXPORT_SYMBOL(tcp_setsockopt);
2717 EXPORT_SYMBOL(tcp_shutdown);
2718 EXPORT_SYMBOL(tcp_sockets_allocated);
2719 EXPORT_SYMBOL(tcp_statistics);
2720 EXPORT_SYMBOL(tcp_timewait_cachep);
2721 EXPORT_SYMBOL(tcp_write_space);
2722 EXPORT_SYMBOL_GPL(cleanup_rbuf);