net/ipv4/tcp.c (linux-2.6.git, patch-2_6_7-vs1_9_1_12)
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or (at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
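/*
 * A minimal userspace sketch of how the states listed above can be observed,
 * assuming a glibc that exposes TCP_INFO and struct tcp_info in
 * <netinet/tcp.h>; it is illustrative only and not part of the kernel build.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void print_tcp_state(int fd)
{
        struct tcp_info info;
        socklen_t len = sizeof(info);

        memset(&info, 0, sizeof(info));
        if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
                /* tcpi_state holds one of the state values described above,
                 * e.g. TCP_ESTABLISHED or TCP_CLOSE_WAIT. */
                printf("tcp state: %u\n", (unsigned)info.tcpi_state);
}
#endif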
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259
260 #include <net/icmp.h>
261 #include <net/tcp.h>
262 #include <net/xfrm.h>
263 #include <net/ip.h>
264
265
266 #include <asm/uaccess.h>
267 #include <asm/ioctls.h>
268
269 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
270
271 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
272
273 kmem_cache_t *tcp_openreq_cachep;
274 kmem_cache_t *tcp_bucket_cachep;
275 kmem_cache_t *tcp_timewait_cachep;
276
277 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278
279 int sysctl_tcp_default_win_scale;
280
281 int sysctl_tcp_mem[3];
282 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
283 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
284
285 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
286 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
287
288 /* Pressure flag: try to collapse.
289  * Technical note: it is used by multiple contexts non-atomically.
290  * All of tcp_mem_schedule() is of this nature: accounting
291  * is strict, actions are advisory and have some latency. */
292 int tcp_memory_pressure;
293
294 #define TCP_PAGES(amt) (((amt) + TCP_MEM_QUANTUM - 1) / TCP_MEM_QUANTUM)
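/* The macro above is a round-up division by TCP_MEM_QUANTUM (one accounting
 * quantum, typically one page).  A worked example, assuming a 4096-byte
 * quantum: TCP_PAGES(1) == 1, TCP_PAGES(4096) == 1, TCP_PAGES(4097) == 2,
 * so any partially used quantum is charged in full.
 */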
295
296 int tcp_mem_schedule(struct sock *sk, int size, int kind)
297 {
298         int amt = TCP_PAGES(size);
299
300         sk->sk_forward_alloc += amt * TCP_MEM_QUANTUM;
301         atomic_add(amt, &tcp_memory_allocated);
302
303         /* Under limit. */
304         if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
305                 if (tcp_memory_pressure)
306                         tcp_memory_pressure = 0;
307                 return 1;
308         }
309
310         /* Over hard limit. */
311         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
312                 tcp_enter_memory_pressure();
313                 goto suppress_allocation;
314         }
315
316         /* Under pressure. */
317         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
318                 tcp_enter_memory_pressure();
319
320         if (kind) {
321                 if (atomic_read(&sk->sk_rmem_alloc) < sysctl_tcp_rmem[0])
322                         return 1;
323         } else if (sk->sk_wmem_queued < sysctl_tcp_wmem[0])
324                 return 1;
325
326         if (!tcp_memory_pressure ||
327             sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) *
328                                 TCP_PAGES(sk->sk_wmem_queued +
329                                           atomic_read(&sk->sk_rmem_alloc) +
330                                           sk->sk_forward_alloc))
331                 return 1;
332
333 suppress_allocation:
334
335         if (!kind) {
336                 tcp_moderate_sndbuf(sk);
337
338                 /* Fail only if socket is _under_ its sndbuf.
339                  * In this case we cannot block, so we have to fail.
340                  */
341                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
342                         return 1;
343         }
344
345         /* Alas. Undo changes. */
346         sk->sk_forward_alloc -= amt * TCP_MEM_QUANTUM;
347         atomic_sub(amt, &tcp_memory_allocated);
348         return 0;
349 }
350
351 void __tcp_mem_reclaim(struct sock *sk)
352 {
353         if (sk->sk_forward_alloc >= TCP_MEM_QUANTUM) {
354                 atomic_sub(sk->sk_forward_alloc / TCP_MEM_QUANTUM,
355                            &tcp_memory_allocated);
356                 sk->sk_forward_alloc &= TCP_MEM_QUANTUM - 1;
357                 if (tcp_memory_pressure &&
358                     atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
359                         tcp_memory_pressure = 0;
360         }
361 }
362
363 void tcp_rfree(struct sk_buff *skb)
364 {
365         struct sock *sk = skb->sk;
366
367         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
368         sk->sk_forward_alloc += skb->truesize;
369 }
370
371 /*
372  * LISTEN is a special case for poll..
373  */
374 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
375                                                poll_table *wait)
376 {
377         return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
378 }
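/*
 * A minimal userspace sketch (not part of tcp.c): for a listening socket the
 * only event tcp_listen_poll() reports is readability, which means a
 * completed connection is waiting in the accept queue.
 */
#if 0
#include <poll.h>
#include <sys/socket.h>

static int wait_and_accept(int listen_fd)
{
        struct pollfd pfd = { .fd = listen_fd, .events = POLLIN };

        if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
                return accept(listen_fd, NULL, NULL);   /* ready to accept */
        return -1;
}
#endif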
379
380 /*
381  *      Wait for a TCP event.
382  *
383  *      Note that we don't need to lock the socket, as the upper poll layers
384  *      take care of normal races (between the test and the event) and we don't
385  *      go look at any of the socket buffers directly.
386  */
387 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
388 {
389         unsigned int mask;
390         struct sock *sk = sock->sk;
391         struct tcp_opt *tp = tcp_sk(sk);
392
393         poll_wait(file, sk->sk_sleep, wait);
394         if (sk->sk_state == TCP_LISTEN)
395                 return tcp_listen_poll(sk, wait);
396
397         /* Socket is not locked. We are protected from async events
398            by the poll logic, and correct handling of state changes
399            made by other threads is impossible in any case.
400          */
401
402         mask = 0;
403         if (sk->sk_err)
404                 mask = POLLERR;
405
406         /*
407          * POLLHUP is certainly not done right. But poll() doesn't
408          * have a notion of HUP in just one direction, and for a
409          * socket the read side is more interesting.
410          *
411          * Some poll() documentation says that POLLHUP is incompatible
412          * with the POLLOUT/POLLWRNORM flags, so somebody should check this
413          * all. But careful, it tends to be safer to return too many
414          * bits than too few, and you can easily break real applications
415          * if you don't tell them that something has hung up!
416          *
417          * Check-me.
418          *
419          * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
420          * our fs/select.c). It means that after we received EOF,
421          * poll always returns immediately, making poll() for write()
422          * impossible in state CLOSE_WAIT. One evident solution is to set
423          * POLLHUP if and only if shutdown has been made in both directions.
424          * Actually, it is interesting to look at how Solaris and DUX
425          * solve this dilemma. I would prefer it if POLLHUP were maskable,
426          * because then we could set it on SND_SHUTDOWN. BTW the examples
427          * given in Stevens' books assume exactly this behaviour, which
428          * explains why POLLHUP is incompatible with POLLOUT.    --ANK
429          *
430          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
431          * blocking on fresh not-connected or disconnected socket. --ANK
432          */
433         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
434                 mask |= POLLHUP;
435         if (sk->sk_shutdown & RCV_SHUTDOWN)
436                 mask |= POLLIN | POLLRDNORM;
437
438         /* Connected? */
439         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
440                 /* Potential race condition. If the read of tp below
441                  * escapes above the read of sk->sk_state, we can be
442                  * illegally awakened in SYN_* states. */
443                 if ((tp->rcv_nxt != tp->copied_seq) &&
444                     (tp->urg_seq != tp->copied_seq ||
445                      tp->rcv_nxt != tp->copied_seq + 1 ||
446                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
447                         mask |= POLLIN | POLLRDNORM;
448
449                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
450                         if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
451                                 mask |= POLLOUT | POLLWRNORM;
452                         } else {  /* send SIGIO later */
453                                 set_bit(SOCK_ASYNC_NOSPACE,
454                                         &sk->sk_socket->flags);
455                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
456
457                                 /* Race breaker. If space is freed after
458                                  * wspace test but before the flags are set,
459                                  * IO signal will be lost.
460                                  */
461                                 if (tcp_wspace(sk) >= tcp_min_write_space(sk))
462                                         mask |= POLLOUT | POLLWRNORM;
463                         }
464                 }
465
466                 if (tp->urg_data & TCP_URG_VALID)
467                         mask |= POLLPRI;
468         }
469         return mask;
470 }
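/*
 * A minimal userspace sketch (not part of tcp.c) of the POLLHUP behaviour
 * described above: POLLHUP is reported only once both directions have been
 * shut down (or the socket is fully closed), while a receive-side shutdown
 * alone only makes the socket readable.
 */
#if 0
#include <poll.h>
#include <sys/socket.h>

static short poll_after_full_shutdown(int fd)
{
        struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };

        shutdown(fd, SHUT_RDWR);        /* sets both RCV_ and SEND_SHUTDOWN */
        poll(&pfd, 1, 0);
        /* Expect POLLHUP (plus POLLIN | POLLRDNORM) in pfd.revents. */
        return pfd.revents;
}
#endif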
471
472 /*
473  *      TCP socket write_space callback.
474  */
475 void tcp_write_space(struct sock *sk)
476 {
477         struct socket *sock = sk->sk_socket;
478
479         if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
480                 clear_bit(SOCK_NOSPACE, &sock->flags);
481
482                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
483                         wake_up_interruptible(sk->sk_sleep);
484
485                 if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
486                         sock_wake_async(sock, 2, POLL_OUT);
487         }
488 }
489
490 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
491 {
492         struct tcp_opt *tp = tcp_sk(sk);
493         int answ;
494
495         switch (cmd) {
496         case SIOCINQ:
497                 if (sk->sk_state == TCP_LISTEN)
498                         return -EINVAL;
499
500                 lock_sock(sk);
501                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
502                         answ = 0;
503                 else if (sock_flag(sk, SOCK_URGINLINE) ||
504                          !tp->urg_data ||
505                          before(tp->urg_seq, tp->copied_seq) ||
506                          !before(tp->urg_seq, tp->rcv_nxt)) {
507                         answ = tp->rcv_nxt - tp->copied_seq;
508
509                         /* Subtract 1, if FIN is in queue. */
510                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
511                                 answ -=
512                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
513                 } else
514                         answ = tp->urg_seq - tp->copied_seq;
515                 release_sock(sk);
516                 break;
517         case SIOCATMARK:
518                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
519                 break;
520         case SIOCOUTQ:
521                 if (sk->sk_state == TCP_LISTEN)
522                         return -EINVAL;
523
524                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
525                         answ = 0;
526                 else
527                         answ = tp->write_seq - tp->snd_una;
528                 break;
529         default:
530                 return -ENOIOCTLCMD;
531         };
532
533         return put_user(answ, (int __user *)arg);
534 }
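/*
 * A minimal userspace sketch (not part of tcp.c) of the three ioctls handled
 * above: SIOCINQ returns the amount of unread data (counted only up to the
 * urgent mark while an out-of-band byte is pending, and not counting a queued
 * FIN), SIOCOUTQ the bytes not yet acknowledged (write_seq - snd_una), and
 * SIOCATMARK whether the read pointer is at the urgent mark.
 */
#if 0
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>      /* SIOCINQ, SIOCOUTQ */

static void show_queues(int fd)
{
        int inq = 0, outq = 0, atmark = 0;

        ioctl(fd, SIOCINQ, &inq);       /* same value as FIONREAD */
        ioctl(fd, SIOCOUTQ, &outq);
        ioctl(fd, SIOCATMARK, &atmark);
        printf("inq=%d outq=%d atmark=%d\n", inq, outq, atmark);
}
#endif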
535
536
537 int tcp_listen_start(struct sock *sk)
538 {
539         struct inet_opt *inet = inet_sk(sk);
540         struct tcp_opt *tp = tcp_sk(sk);
541         struct tcp_listen_opt *lopt;
542
543         sk->sk_max_ack_backlog = 0;
544         sk->sk_ack_backlog = 0;
545         tp->accept_queue = tp->accept_queue_tail = NULL;
546         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
547         tcp_delack_init(tp);
548
549         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
550         if (!lopt)
551                 return -ENOMEM;
552
553         memset(lopt, 0, sizeof(struct tcp_listen_opt));
554         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
555                 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
556                         break;
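        /* The loop above picks max_qlen_log, the base-2 log of the SYN
         * backlog limit: the smallest power of two >= sysctl_max_syn_backlog,
         * with a floor of 2^6 == 64.  For example, a sysctl value of 1024
         * gives max_qlen_log = 10, while a value of 128 gives 7.
         */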
557         get_random_bytes(&lopt->hash_rnd, 4);
558
559         write_lock_bh(&tp->syn_wait_lock);
560         tp->listen_opt = lopt;
561         write_unlock_bh(&tp->syn_wait_lock);
562
563         /* There is a race window here: we announce ourselves listening,
564          * but this transition is still not validated by get_port().
565          * It is OK, because this socket enters the hash table only
566          * after validation is complete.
567          */
568         sk->sk_state = TCP_LISTEN;
569         if (!sk->sk_prot->get_port(sk, inet->num)) {
570                 inet->sport = htons(inet->num);
571
572                 sk_dst_reset(sk);
573                 sk->sk_prot->hash(sk);
574
575                 return 0;
576         }
577
578         sk->sk_state = TCP_CLOSE;
579         write_lock_bh(&tp->syn_wait_lock);
580         tp->listen_opt = NULL;
581         write_unlock_bh(&tp->syn_wait_lock);
582         kfree(lopt);
583         return -EADDRINUSE;
584 }
585
586 /*
587  *      This routine closes sockets which have been at least partially
588  *      opened, but not yet accepted.
589  */
590
591 static void tcp_listen_stop (struct sock *sk)
592 {
593         struct tcp_opt *tp = tcp_sk(sk);
594         struct tcp_listen_opt *lopt = tp->listen_opt;
595         struct open_request *acc_req = tp->accept_queue;
596         struct open_request *req;
597         int i;
598
599         tcp_delete_keepalive_timer(sk);
600
601         /* make all the listen_opt local to us */
602         write_lock_bh(&tp->syn_wait_lock);
603         tp->listen_opt = NULL;
604         write_unlock_bh(&tp->syn_wait_lock);
605         tp->accept_queue = tp->accept_queue_tail = NULL;
606
607         if (lopt->qlen) {
608                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
609                         while ((req = lopt->syn_table[i]) != NULL) {
610                                 lopt->syn_table[i] = req->dl_next;
611                                 lopt->qlen--;
612                                 tcp_openreq_free(req);
613
614                 /* Following the specs, it would be better either to send a FIN
615                  * (and enter FIN-WAIT-1; that is a normal close)
616                  * or to send an active reset (abort).
617                  * Certainly, it is pretty dangerous during a synflood, but that
618                  * is a bad justification for our negligence 8)
619                  * To be honest, we are not able to implement either
620                  * of the variants now.                 --ANK
621                  */
622                         }
623                 }
624         }
625         BUG_TRAP(!lopt->qlen);
626
627         kfree(lopt);
628
629         while ((req = acc_req) != NULL) {
630                 struct sock *child = req->sk;
631
632                 acc_req = req->dl_next;
633
634                 local_bh_disable();
635                 bh_lock_sock(child);
636                 BUG_TRAP(!sock_owned_by_user(child));
637                 sock_hold(child);
638
639                 tcp_disconnect(child, O_NONBLOCK);
640
641                 sock_orphan(child);
642
643                 atomic_inc(&tcp_orphan_count);
644
645                 tcp_destroy_sock(child);
646
647                 bh_unlock_sock(child);
648                 local_bh_enable();
649                 sock_put(child);
650
651                 sk_acceptq_removed(sk);
652                 tcp_openreq_fastfree(req);
653         }
654         BUG_TRAP(!sk->sk_ack_backlog);
655 }
656
657 /*
658  *      Wait for a socket to get into the connected state
659  *
660  *      Note: Must be called with the socket locked.
661  */
662 static int wait_for_tcp_connect(struct sock *sk, int flags, long *timeo_p)
663 {
664         struct tcp_opt *tp = tcp_sk(sk);
665         struct task_struct *tsk = current;
666         DEFINE_WAIT(wait);
667
668         while ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
669                 if (sk->sk_err)
670                         return sock_error(sk);
671                 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
672                         return -EPIPE;
673                 if (!*timeo_p)
674                         return -EAGAIN;
675                 if (signal_pending(tsk))
676                         return sock_intr_errno(*timeo_p);
677
678                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
679                 tp->write_pending++;
680
681                 release_sock(sk);
682                 *timeo_p = schedule_timeout(*timeo_p);
683                 lock_sock(sk);
684
685                 finish_wait(sk->sk_sleep, &wait);
686                 tp->write_pending--;
687         }
688         return 0;
689 }
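/*
 * A minimal userspace sketch (not part of tcp.c): with a zero timeout
 * (non-blocking socket, or MSG_DONTWAIT) the wait above fails with -EAGAIN,
 * so a send() issued while a non-blocking connect() is still in progress
 * returns EAGAIN instead of blocking.
 */
#if 0
#include <errno.h>
#include <sys/socket.h>
#include <sys/types.h>

static ssize_t send_during_connect(int fd, const void *buf, size_t len)
{
        ssize_t n = send(fd, buf, len, MSG_DONTWAIT);

        if (n < 0 && errno == EAGAIN)
                return 0;       /* handshake not finished; retry when writable */
        return n;
}
#endif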
690
691 static inline int tcp_memory_free(struct sock *sk)
692 {
693         return sk->sk_wmem_queued < sk->sk_sndbuf;
694 }
695
696 /*
697  *      Wait for more memory for a socket
698  */
699 static int wait_for_tcp_memory(struct sock *sk, long *timeo)
700 {
701         struct tcp_opt *tp = tcp_sk(sk);
702         int err = 0;
703         long vm_wait = 0;
704         long current_timeo = *timeo;
705         DEFINE_WAIT(wait);
706
707         if (tcp_memory_free(sk))
708                 current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
709
710         for (;;) {
711                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
712
713                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
714
715                 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
716                         goto do_error;
717                 if (!*timeo)
718                         goto do_nonblock;
719                 if (signal_pending(current))
720                         goto do_interrupted;
721                 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
722                 if (tcp_memory_free(sk) && !vm_wait)
723                         break;
724
725                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
726                 tp->write_pending++;
727                 release_sock(sk);
728                 if (!tcp_memory_free(sk) || vm_wait)
729                         current_timeo = schedule_timeout(current_timeo);
730                 lock_sock(sk);
731                 tp->write_pending--;
732
733                 if (vm_wait) {
734                         vm_wait -= current_timeo;
735                         current_timeo = *timeo;
736                         if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
737                             (current_timeo -= vm_wait) < 0)
738                                 current_timeo = 0;
739                         vm_wait = 0;
740                 }
741                 *timeo = current_timeo;
742         }
743 out:
744         finish_wait(sk->sk_sleep, &wait);
745         return err;
746
747 do_error:
748         err = -EPIPE;
749         goto out;
750 do_nonblock:
751         err = -EAGAIN;
752         goto out;
753 do_interrupted:
754         err = sock_intr_errno(*timeo);
755         goto out;
756 }
757
758 static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
759                                int off)
760 {
761         if (i) {
762                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
763                 return page == frag->page &&
764                        off == frag->page_offset + frag->size;
765         }
766         return 0;
767 }
768
769 static inline void fill_page_desc(struct sk_buff *skb, int i,
770                                   struct page *page, int off, int size)
771 {
772         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
773         frag->page = page;
774         frag->page_offset = off;
775         frag->size = size;
776         skb_shinfo(skb)->nr_frags = i + 1;
777 }
778
779 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
780 {
781         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
782         tp->pushed_seq = tp->write_seq;
783 }
784
785 static inline int forced_push(struct tcp_opt *tp)
786 {
787         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
788 }
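/* forced_push() above asks whether more than half of the largest window ever
 * advertised by the peer has been queued since the last forced PSH.  For
 * example, with max_window == 65535 a push is forced once write_seq runs
 * more than 32767 bytes ahead of pushed_seq.
 */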
789
790 static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
791                               struct sk_buff *skb)
792 {
793         skb->csum = 0;
794         TCP_SKB_CB(skb)->seq = tp->write_seq;
795         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
796         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
797         TCP_SKB_CB(skb)->sacked = 0;
798         __skb_queue_tail(&sk->sk_write_queue, skb);
799         tcp_charge_skb(sk, skb);
800         if (!tp->send_head)
801                 tp->send_head = skb;
802         else if (tp->nonagle&TCP_NAGLE_PUSH)
803                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
804 }
805
806 static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
807                                 struct sk_buff *skb)
808 {
809         if (flags & MSG_OOB) {
810                 tp->urg_mode = 1;
811                 tp->snd_up = tp->write_seq;
812                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
813         }
814 }
815
816 static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
817                             int mss_now, int nonagle)
818 {
819         if (tp->send_head) {
820                 struct sk_buff *skb = sk->sk_write_queue.prev;
821                 if (!(flags & MSG_MORE) || forced_push(tp))
822                         tcp_mark_push(tp, skb);
823                 tcp_mark_urg(tp, flags, skb);
824                 __tcp_push_pending_frames(sk, tp, mss_now,
825                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
826         }
827 }
828
829 static int tcp_error(struct sock *sk, int flags, int err)
830 {
831         if (err == -EPIPE)
832                 err = sock_error(sk) ? : -EPIPE;
833         if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
834                 send_sig(SIGPIPE, current, 0);
835         return err;
836 }
837
838 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
839                          size_t psize, int flags)
840 {
841         struct tcp_opt *tp = tcp_sk(sk);
842         int mss_now;
843         int err;
844         ssize_t copied;
845         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
846
847         /* Wait for a connection to finish. */
848         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
849                 if ((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
850                         goto out_err;
851
852         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
853
854         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
855         copied = 0;
856
857         err = -EPIPE;
858         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
859                 goto do_error;
860
861         while (psize > 0) {
862                 struct sk_buff *skb = sk->sk_write_queue.prev;
863                 struct page *page = pages[poffset / PAGE_SIZE];
864                 int copy, i;
865                 int offset = poffset % PAGE_SIZE;
866                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
867
868                 if (!tp->send_head || (copy = mss_now - skb->len) <= 0) {
869 new_segment:
870                         if (!tcp_memory_free(sk))
871                                 goto wait_for_sndbuf;
872
873                         skb = tcp_alloc_pskb(sk, 0, tp->mss_cache,
874                                              sk->sk_allocation);
875                         if (!skb)
876                                 goto wait_for_memory;
877
878                         skb_entail(sk, tp, skb);
879                         copy = mss_now;
880                 }
881
882                 if (copy > size)
883                         copy = size;
884
885                 i = skb_shinfo(skb)->nr_frags;
886                 if (can_coalesce(skb, i, page, offset)) {
887                         skb_shinfo(skb)->frags[i - 1].size += copy;
888                 } else if (i < MAX_SKB_FRAGS) {
889                         get_page(page);
890                         fill_page_desc(skb, i, page, offset, copy);
891                 } else {
892                         tcp_mark_push(tp, skb);
893                         goto new_segment;
894                 }
895
896                 skb->len += copy;
897                 skb->data_len += copy;
898                 skb->ip_summed = CHECKSUM_HW;
899                 tp->write_seq += copy;
900                 TCP_SKB_CB(skb)->end_seq += copy;
901
902                 if (!copied)
903                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
904
905                 copied += copy;
906                 poffset += copy;
907                 if (!(psize -= copy))
908                         goto out;
909
910                 if (skb->len != mss_now || (flags & MSG_OOB))
911                         continue;
912
913                 if (forced_push(tp)) {
914                         tcp_mark_push(tp, skb);
915                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
916                 } else if (skb == tp->send_head)
917                         tcp_push_one(sk, mss_now);
918                 continue;
919
920 wait_for_sndbuf:
921                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
922 wait_for_memory:
923                 if (copied)
924                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
925
926                 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
927                         goto do_error;
928
929                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
930         }
931
932 out:
933         if (copied)
934                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
935         return copied;
936
937 do_error:
938         if (copied)
939                 goto out;
940 out_err:
941         return tcp_error(sk, flags, err);
942 }
943
944 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
945                      size_t size, int flags)
946 {
947         ssize_t res;
948         struct sock *sk = sock->sk;
949
950 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
951
952         if (!(sk->sk_route_caps & NETIF_F_SG) ||
953             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
954                 return sock_no_sendpage(sock, page, offset, size, flags);
955
956 #undef TCP_ZC_CSUM_FLAGS
957
958         lock_sock(sk);
959         TCP_CHECK_TIMER(sk);
960         res = do_tcp_sendpages(sk, &page, offset, size, flags);
961         TCP_CHECK_TIMER(sk);
962         release_sock(sk);
963         return res;
964 }
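/*
 * A minimal userspace sketch (not part of tcp.c): sendfile() on a TCP socket
 * typically reaches tcp_sendpage(); when the route's device lacks
 * scatter/gather or checksum offload, the check above falls back to
 * sock_no_sendpage(), which copies through the ordinary sendmsg path.
 */
#if 0
#include <sys/sendfile.h>
#include <sys/types.h>

static ssize_t send_file_over_tcp(int sock_fd, int file_fd, size_t count)
{
        off_t off = 0;

        /* Zero-copy of page-cache pages when NETIF_F_SG and hardware
         * checksumming are available on the outgoing device. */
        return sendfile(sock_fd, file_fd, &off, count);
}
#endif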
965
966 #define TCP_PAGE(sk)    (inet_sk(sk)->sndmsg_page)
967 #define TCP_OFF(sk)     (inet_sk(sk)->sndmsg_off)
968
969 static inline int tcp_copy_to_page(struct sock *sk, char __user *from,
970                                    struct sk_buff *skb, struct page *page,
971                                    int off, int copy)
972 {
973         int err = 0;
974         unsigned int csum;
975
976         if (skb->ip_summed == CHECKSUM_NONE) {
977                 csum = csum_and_copy_from_user(from, page_address(page) + off,
978                                        copy, 0, &err);
979                 if (err) return err;
980                 skb->csum = csum_block_add(skb->csum, csum, skb->len);
981         } else {
982                 if (copy_from_user(page_address(page) + off, from, copy))
983                         return -EFAULT;
984         }
985
986         skb->len += copy;
987         skb->data_len += copy;
988         skb->truesize += copy;
989         sk->sk_wmem_queued += copy;
990         sk->sk_forward_alloc -= copy;
991         return 0;
992 }
993
994 static inline int skb_add_data(struct sk_buff *skb, char __user *from, int copy)
995 {
996         int err = 0;
997         unsigned int csum;
998         int off = skb->len;
999
1000         if (skb->ip_summed == CHECKSUM_NONE) {
1001                 csum = csum_and_copy_from_user(from, skb_put(skb, copy),
1002                                        copy, 0, &err);
1003                 if (!err) {
1004                         skb->csum = csum_block_add(skb->csum, csum, off);
1005                         return 0;
1006                 }
1007         } else {
1008                 if (!copy_from_user(skb_put(skb, copy), from, copy))
1009                         return 0;
1010         }
1011
1012         __skb_trim(skb, off);
1013         return -EFAULT;
1014 }
1015
1016 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1017 {
1018         int tmp = tp->mss_cache_std;
1019
1020         if (sk->sk_route_caps & NETIF_F_SG) {
1021                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1022
1023                 if (tmp >= pgbreak &&
1024                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1025                         tmp = pgbreak;
1026         }
1027         return tmp;
1028 }
1029
1030 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1031                 size_t size)
1032 {
1033         struct iovec *iov;
1034         struct tcp_opt *tp = tcp_sk(sk);
1035         struct sk_buff *skb;
1036         int iovlen, flags;
1037         int mss_now;
1038         int err, copied;
1039         long timeo;
1040
1041         lock_sock(sk);
1042         TCP_CHECK_TIMER(sk);
1043
1044         flags = msg->msg_flags;
1045         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1046
1047         /* Wait for a connection to finish. */
1048         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1049                 if ((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1050                         goto out_err;
1051
1052         /* This should be in poll */
1053         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1054
1055         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1056
1057         /* Ok commence sending. */
1058         iovlen = msg->msg_iovlen;
1059         iov = msg->msg_iov;
1060         copied = 0;
1061
1062         err = -EPIPE;
1063         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1064                 goto do_error;
1065
1066         while (--iovlen >= 0) {
1067                 int seglen = iov->iov_len;
1068                 unsigned char __user *from = iov->iov_base;
1069
1070                 iov++;
1071
1072                 while (seglen > 0) {
1073                         int copy;
1074
1075                         skb = sk->sk_write_queue.prev;
1076
1077                         if (!tp->send_head ||
1078                             (copy = mss_now - skb->len) <= 0) {
1079
1080 new_segment:
1081                                 /* Allocate new segment. If the interface is SG,
1082                                  * allocate skb fitting to single page.
1083                                  */
1084                                 if (!tcp_memory_free(sk))
1085                                         goto wait_for_sndbuf;
1086
1087                                 skb = tcp_alloc_pskb(sk, select_size(sk, tp),
1088                                                      0, sk->sk_allocation);
1089                                 if (!skb)
1090                                         goto wait_for_memory;
1091
1092                                 /*
1093                                  * Check whether we can use HW checksum.
1094                                  */
1095                                 if (sk->sk_route_caps &
1096                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
1097                                      NETIF_F_HW_CSUM))
1098                                         skb->ip_summed = CHECKSUM_HW;
1099
1100                                 skb_entail(sk, tp, skb);
1101                                 copy = mss_now;
1102                         }
1103
1104                         /* Try to append data to the end of skb. */
1105                         if (copy > seglen)
1106                                 copy = seglen;
1107
1108                         /* Where to copy to? */
1109                         if (skb_tailroom(skb) > 0) {
1110                                 /* We have some space in skb head. Superb! */
1111                                 if (copy > skb_tailroom(skb))
1112                                         copy = skb_tailroom(skb);
1113                                 if ((err = skb_add_data(skb, from, copy)) != 0)
1114                                         goto do_fault;
1115                         } else {
1116                                 int merge = 0;
1117                                 int i = skb_shinfo(skb)->nr_frags;
1118                                 struct page *page = TCP_PAGE(sk);
1119                                 int off = TCP_OFF(sk);
1120
1121                                 if (can_coalesce(skb, i, page, off) &&
1122                                     off != PAGE_SIZE) {
1123                                         /* We can extend the last page
1124                                          * fragment. */
1125                                         merge = 1;
1126                                 } else if (i == MAX_SKB_FRAGS ||
1127                                            (!i &&
1128                                            !(sk->sk_route_caps & NETIF_F_SG))) {
1129                                         /* Need to add new fragment and cannot
1130                                          * do this because interface is non-SG,
1131                                          * or because all the page slots are
1132                                          * busy. */
1133                                         tcp_mark_push(tp, skb);
1134                                         goto new_segment;
1135                                 } else if (page) {
1136                                         /* If page is cached, align
1137                                          * offset to L1 cache boundary
1138                                          */
1139                                         off = (off + L1_CACHE_BYTES - 1) &
1140                                               ~(L1_CACHE_BYTES - 1);
1141                                         if (off == PAGE_SIZE) {
1142                                                 put_page(page);
1143                                                 TCP_PAGE(sk) = page = NULL;
1144                                         }
1145                                 }
1146
1147                                 if (!page) {
1148                                         /* Allocate new cache page. */
1149                                         if (!(page = tcp_alloc_page(sk)))
1150                                                 goto wait_for_memory;
1151                                         off = 0;
1152                                 }
1153
1154                                 if (copy > PAGE_SIZE - off)
1155                                         copy = PAGE_SIZE - off;
1156
1157                                 /* Time to copy data. We are close to
1158                                  * the end! */
1159                                 err = tcp_copy_to_page(sk, from, skb, page,
1160                                                        off, copy);
1161                                 if (err) {
1162                                         /* If this page was new, give it to the
1163                                          * socket so it does not get leaked.
1164                                          */
1165                                         if (!TCP_PAGE(sk)) {
1166                                                 TCP_PAGE(sk) = page;
1167                                                 TCP_OFF(sk) = 0;
1168                                         }
1169                                         goto do_error;
1170                                 }
1171
1172                                 /* Update the skb. */
1173                                 if (merge) {
1174                                         skb_shinfo(skb)->frags[i - 1].size +=
1175                                                                         copy;
1176                                 } else {
1177                                         fill_page_desc(skb, i, page, off, copy);
1178                                         if (TCP_PAGE(sk)) {
1179                                                 get_page(page);
1180                                         } else if (off + copy < PAGE_SIZE) {
1181                                                 get_page(page);
1182                                                 TCP_PAGE(sk) = page;
1183                                         }
1184                                 }
1185
1186                                 TCP_OFF(sk) = off + copy;
1187                         }
1188
1189                         if (!copied)
1190                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1191
1192                         tp->write_seq += copy;
1193                         TCP_SKB_CB(skb)->end_seq += copy;
1194
1195                         from += copy;
1196                         copied += copy;
1197                         if ((seglen -= copy) == 0 && iovlen == 0)
1198                                 goto out;
1199
1200                         if (skb->len != mss_now || (flags & MSG_OOB))
1201                                 continue;
1202
1203                         if (forced_push(tp)) {
1204                                 tcp_mark_push(tp, skb);
1205                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1206                         } else if (skb == tp->send_head)
1207                                 tcp_push_one(sk, mss_now);
1208                         continue;
1209
1210 wait_for_sndbuf:
1211                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1212 wait_for_memory:
1213                         if (copied)
1214                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1215
1216                         if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1217                                 goto do_error;
1218
1219                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1220                 }
1221         }
1222
1223 out:
1224         if (copied)
1225                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1226         TCP_CHECK_TIMER(sk);
1227         release_sock(sk);
1228         return copied;
1229
1230 do_fault:
1231         if (!skb->len) {
1232                 if (tp->send_head == skb)
1233                         tp->send_head = NULL;
1234                 __skb_unlink(skb, skb->list);
1235                 tcp_free_skb(sk, skb);
1236         }
1237
1238 do_error:
1239         if (copied)
1240                 goto out;
1241 out_err:
1242         err = tcp_error(sk, flags, err);
1243         TCP_CHECK_TIMER(sk);
1244         release_sock(sk);
1245         return err;
1246 }
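/*
 * A minimal userspace sketch (not part of tcp.c): as the tcp_push() calls
 * above show, MSG_MORE keeps a partially filled segment corked
 * (TCP_NAGLE_CORK), so a header and body written separately can still leave
 * as one full-sized segment.
 */
#if 0
#include <sys/socket.h>
#include <sys/types.h>

static void send_header_and_body(int fd,
                                 const void *hdr, size_t hlen,
                                 const void *body, size_t blen)
{
        send(fd, hdr, hlen, MSG_MORE);  /* hold back, more data follows */
        send(fd, body, blen, 0);        /* final write pushes the segment */
}
#endif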
1247
1248 /*
1249  *      Handle reading urgent data. BSD has very simple semantics for
1250  *      this, no blocking and very strange errors 8)
1251  */
1252
1253 static int tcp_recv_urg(struct sock *sk, long timeo,
1254                         struct msghdr *msg, int len, int flags,
1255                         int *addr_len)
1256 {
1257         struct tcp_opt *tp = tcp_sk(sk);
1258
1259         /* No URG data to read. */
1260         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1261             tp->urg_data == TCP_URG_READ)
1262                 return -EINVAL; /* Yes this is right ! */
1263
1264         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1265                 return -ENOTCONN;
1266
1267         if (tp->urg_data & TCP_URG_VALID) {
1268                 int err = 0;
1269                 char c = tp->urg_data;
1270
1271                 if (!(flags & MSG_PEEK))
1272                         tp->urg_data = TCP_URG_READ;
1273
1274                 /* Read urgent data. */
1275                 msg->msg_flags |= MSG_OOB;
1276
1277                 if (len > 0) {
1278                         if (!(flags & MSG_TRUNC))
1279                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1280                         len = 1;
1281                 } else
1282                         msg->msg_flags |= MSG_TRUNC;
1283
1284                 return err ? -EFAULT : len;
1285         }
1286
1287         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1288                 return 0;
1289
1290         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1291          * the available implementations agree in this case:
1292          * this call should never block, independent of the
1293          * blocking state of the socket.
1294          * Mike <pall@rz.uni-karlsruhe.de>
1295          */
1296         return -EAGAIN;
1297 }
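
/*
 * Illustrative user-space sketch (not part of this file) of the MSG_OOB
 * semantics implemented above: the call never blocks, even on a blocking
 * socket; it yields the single urgent byte, EINVAL once that byte has been
 * consumed (or with SO_OOBINLINE set), and EAGAIN when nothing urgent is
 * pending.  'fd' is a hypothetical connected TCP socket.
 */
#include <sys/socket.h>
#include <errno.h>

static int read_oob_byte(int fd, char *out)
{
	ssize_t n = recv(fd, out, 1, MSG_OOB);

	if (n == 1)
		return 0;			/* got the urgent byte */
	if (n < 0 && errno == EAGAIN)
		return 1;			/* no urgent data right now */
	return -1;				/* EINVAL, ENOTCONN, ... */
}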
1298
1299 /* Clean up the receive buffer for full frames taken by the user,
1300  * then send an ACK if necessary.  COPIED is the number of bytes
1301  * tcp_recvmsg has given to the user so far; it speeds up the
1302  * calculation of whether or not we must ACK for the sake of
1303  * a window update.
1304  */
1305 static void cleanup_rbuf(struct sock *sk, int copied)
1306 {
1307         struct tcp_opt *tp = tcp_sk(sk);
1308         int time_to_ack = 0;
1309
1310 #if TCP_DEBUG
1311         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1312
1313         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1314 #endif
1315
1316         if (tcp_ack_scheduled(tp)) {
1317                    /* Delayed ACKs frequently hit locked sockets during bulk
1318                     * receive. */
1319                 if (tp->ack.blocked ||
1320                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1321                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1322                     /*
1323                      * If this read emptied read buffer, we send ACK, if
1324                      * connection is not bidirectional, user drained
1325                      * receive buffer and there was a small segment
1326                      * in queue.
1327                      */
1328                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1329                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1330                         time_to_ack = 1;
1331         }
1332
1333         /* We send an ACK if we can now advertise a non-zero window
1334          * which has been raised "significantly".
1335          *
1336          * Even if the window was raised to infinity, do not send a window
1337          * update ACK in states where we will not receive more. It is useless.
1338          */
1339         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1340                 __u32 rcv_window_now = tcp_receive_window(tp);
1341
1342                 /* Optimize, __tcp_select_window() is not cheap. */
1343                 if (2*rcv_window_now <= tp->window_clamp) {
1344                         __u32 new_window = __tcp_select_window(sk);
1345
1346                         /* Send ACK now, if this read freed lots of space
1347                          * in our buffer. new_window is the window we could
1348                          * advertise now; do so if it is not less than the current one.
1349                          * "Lots" means "at least twice" here.
1350                          */
1351                         if (new_window && new_window >= 2 * rcv_window_now)
1352                                 time_to_ack = 1;
1353                 }
1354         }
1355         if (time_to_ack)
1356                 tcp_send_ack(sk);
1357 }
1358
1359 static void tcp_prequeue_process(struct sock *sk)
1360 {
1361         struct sk_buff *skb;
1362         struct tcp_opt *tp = tcp_sk(sk);
1363
1364         NET_ADD_STATS_USER(TCPPrequeued, skb_queue_len(&tp->ucopy.prequeue));
1365
1366         /* The RX process wants to run with BHs disabled, though it is
1367          * not strictly necessary. */
1368         local_bh_disable();
1369         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1370                 sk->sk_backlog_rcv(sk, skb);
1371         local_bh_enable();
1372
1373         /* Clear memory counter. */
1374         tp->ucopy.memory = 0;
1375 }
1376
1377 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1378 {
1379         struct sk_buff *skb;
1380         u32 offset;
1381
1382         skb_queue_walk(&sk->sk_receive_queue, skb) {
1383                 offset = seq - TCP_SKB_CB(skb)->seq;
1384                 if (skb->h.th->syn)
1385                         offset--;
1386                 if (offset < skb->len || skb->h.th->fin) {
1387                         *off = offset;
1388                         return skb;
1389                 }
1390         }
1391         return NULL;
1392 }
1393
1394 /*
1395  * This routine provides an alternative to tcp_recvmsg() for routines
1396  * that would like to handle copying from skbuffs directly in 'sendfile'
1397  * fashion.
1398  * Note:
1399  *      - It is assumed that the socket was locked by the caller.
1400  *      - The routine does not block.
1401  *      - At present, there is no support for reading OOB data
1402  *        or for 'peeking' the socket using this routine
1403  *        (although both would be easy to implement).
1404  */
1405 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1406                   sk_read_actor_t recv_actor)
1407 {
1408         struct sk_buff *skb;
1409         struct tcp_opt *tp = tcp_sk(sk);
1410         u32 seq = tp->copied_seq;
1411         u32 offset;
1412         int copied = 0;
1413
1414         if (sk->sk_state == TCP_LISTEN)
1415                 return -ENOTCONN;
1416         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1417                 if (offset < skb->len) {
1418                         size_t used, len;
1419
1420                         len = skb->len - offset;
1421                         /* Stop reading if we hit a patch of urgent data */
1422                         if (tp->urg_data) {
1423                                 u32 urg_offset = tp->urg_seq - seq;
1424                                 if (urg_offset < len)
1425                                         len = urg_offset;
1426                                 if (!len)
1427                                         break;
1428                         }
1429                         used = recv_actor(desc, skb, offset, len);
1430                         if (used <= len) {
1431                                 seq += used;
1432                                 copied += used;
1433                                 offset += used;
1434                         }
1435                         if (offset != skb->len)
1436                                 break;
1437                 }
1438                 if (skb->h.th->fin) {
1439                         sk_eat_skb(sk, skb);
1440                         ++seq;
1441                         break;
1442                 }
1443                 sk_eat_skb(sk, skb);
1444                 if (!desc->count)
1445                         break;
1446         }
1447         tp->copied_seq = seq;
1448
1449         tcp_rcv_space_adjust(sk);
1450
1451         /* Clean up data we have read: this will send any needed ACKs. */
1452         if (copied)
1453                 cleanup_rbuf(sk, copied);
1454         return copied;
1455 }
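
/*
 * Illustrative sketch (not part of this file): a minimal sk_read_actor_t of
 * the kind tcp_read_sock() expects.  The name is hypothetical; a real actor
 * would copy the skb payload somewhere (e.g. with skb_copy_bits()), this one
 * merely accounts for the bytes it consumes via desc->count/desc->written.
 */
static int example_drain_actor(read_descriptor_t *desc, struct sk_buff *skb,
			       unsigned int offset, size_t len)
{
	size_t used = len < desc->count ? len : desc->count;

	desc->written += used;
	desc->count -= used;
	return used;	/* bytes taken from this skb; 0 stops the walk */
}

/*
 * Possible call site, with the socket already locked by the caller as the
 * note above requires:
 *
 *	read_descriptor_t desc = { .count = nbytes };
 *	int done = tcp_read_sock(sk, &desc, example_drain_actor);
 */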
1456
1457 /*
1458  *      This routine copies from a sock struct into the user buffer.
1459  *
1460  *      Technical note: in 2.3 we work on a _locked_ socket, so that
1461  *      tricks with *seq access order and skb->users are not required.
1462  *      The code can probably be improved even further.
1463  */
1464
1465 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1466                 size_t len, int nonblock, int flags, int *addr_len)
1467 {
1468         struct tcp_opt *tp = tcp_sk(sk);
1469         int copied = 0;
1470         u32 peek_seq;
1471         u32 *seq;
1472         unsigned long used;
1473         int err;
1474         int target;             /* Read at least this many bytes */
1475         long timeo;
1476         struct task_struct *user_recv = NULL;
1477
1478         lock_sock(sk);
1479
1480         TCP_CHECK_TIMER(sk);
1481
1482         err = -ENOTCONN;
1483         if (sk->sk_state == TCP_LISTEN)
1484                 goto out;
1485
1486         timeo = sock_rcvtimeo(sk, nonblock);
1487
1488         /* Urgent data needs to be handled specially. */
1489         if (flags & MSG_OOB)
1490                 goto recv_urg;
1491
1492         seq = &tp->copied_seq;
1493         if (flags & MSG_PEEK) {
1494                 peek_seq = tp->copied_seq;
1495                 seq = &peek_seq;
1496         }
1497
1498         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1499
1500         do {
1501                 struct sk_buff *skb;
1502                 u32 offset;
1503
1504                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1505                 if (tp->urg_data && tp->urg_seq == *seq) {
1506                         if (copied)
1507                                 break;
1508                         if (signal_pending(current)) {
1509                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1510                                 break;
1511                         }
1512                 }
1513
1514                 /* Next get a buffer. */
1515
1516                 skb = skb_peek(&sk->sk_receive_queue);
1517                 do {
1518                         if (!skb)
1519                                 break;
1520
1521                         /* Now that we have two receive queues this
1522                          * shouldn't happen.
1523                          */
1524                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1525                                 printk(KERN_INFO "recvmsg bug: copied %X "
1526                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1527                                 break;
1528                         }
1529                         offset = *seq - TCP_SKB_CB(skb)->seq;
1530                         if (skb->h.th->syn)
1531                                 offset--;
1532                         if (offset < skb->len)
1533                                 goto found_ok_skb;
1534                         if (skb->h.th->fin)
1535                                 goto found_fin_ok;
1536                         BUG_TRAP(flags & MSG_PEEK);
1537                         skb = skb->next;
1538                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1539
1540                 /* Well, if we have a backlog, try to process it now. */
1541
1542                 if (copied >= target && !sk->sk_backlog.tail)
1543                         break;
1544
1545                 if (copied) {
1546                         if (sk->sk_err ||
1547                             sk->sk_state == TCP_CLOSE ||
1548                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1549                             !timeo ||
1550                             signal_pending(current) ||
1551                             (flags & MSG_PEEK))
1552                                 break;
1553                 } else {
1554                         if (sock_flag(sk, SOCK_DONE))
1555                                 break;
1556
1557                         if (sk->sk_err) {
1558                                 copied = sock_error(sk);
1559                                 break;
1560                         }
1561
1562                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1563                                 break;
1564
1565                         if (sk->sk_state == TCP_CLOSE) {
1566                                 if (!sock_flag(sk, SOCK_DONE)) {
1567                                         /* This occurs when user tries to read
1568                                          * from a never-connected socket.
1569                                          */
1570                                         copied = -ENOTCONN;
1571                                         break;
1572                                 }
1573                                 break;
1574                         }
1575
1576                         if (!timeo) {
1577                                 copied = -EAGAIN;
1578                                 break;
1579                         }
1580
1581                         if (signal_pending(current)) {
1582                                 copied = sock_intr_errno(timeo);
1583                                 break;
1584                         }
1585                 }
1586
1587                 cleanup_rbuf(sk, copied);
1588
1589                 if (tp->ucopy.task == user_recv) {
1590                         /* Install new reader */
1591                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1592                                 user_recv = current;
1593                                 tp->ucopy.task = user_recv;
1594                                 tp->ucopy.iov = msg->msg_iov;
1595                         }
1596
1597                         tp->ucopy.len = len;
1598
1599                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1600                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1601
1602                         /* Ugly... If the prequeue is not empty, we have to
1603                          * process it before releasing the socket, otherwise
1604                          * ordering will be broken at the second iteration.
1605                          * A more elegant solution is required!!!
1606                          *
1607                          * Look: we have the following (pseudo)queues:
1608                          *
1609                          * 1. packets in flight
1610                          * 2. backlog
1611                          * 3. prequeue
1612                          * 4. receive_queue
1613                          *
1614                          * Each queue can be processed only if the next ones
1615                          * are empty. At this point the receive_queue is empty.
1616                          * But the prequeue _can_ be non-empty after the 2nd
1617                          * iteration, when we jumped to the start of the loop
1618                          * because backlog processing added to the receive_queue.
1619                          * We cannot release_sock(), because backlog contains
1620                          * packets arrived _after_ prequeued ones.
1621                          *
1622                          * In short, the algorithm is clear --- process all
1623                          * the queues in order. We could do it more directly,
1624                          * requeueing packets from the backlog to the prequeue
1625                          * if it is not empty. That is more elegant, but eats
1626                          * cycles, unfortunately.
1627                          */
1628                         if (skb_queue_len(&tp->ucopy.prequeue))
1629                                 goto do_prequeue;
1630
1631                         /* __ Set realtime policy in scheduler __ */
1632                 }
1633
1634                 if (copied >= target) {
1635                         /* Do not sleep, just process backlog. */
1636                         release_sock(sk);
1637                         lock_sock(sk);
1638                 } else
1639                         sk_wait_data(sk, &timeo);
1640
1641                 if (user_recv) {
1642                         int chunk;
1643
1644                         /* __ Restore normal policy in scheduler __ */
1645
1646                         if ((chunk = len - tp->ucopy.len) != 0) {
1647                                 NET_ADD_STATS_USER(TCPDirectCopyFromBacklog, chunk);
1648                                 len -= chunk;
1649                                 copied += chunk;
1650                         }
1651
1652                         if (tp->rcv_nxt == tp->copied_seq &&
1653                             skb_queue_len(&tp->ucopy.prequeue)) {
1654 do_prequeue:
1655                                 tcp_prequeue_process(sk);
1656
1657                                 if ((chunk = len - tp->ucopy.len) != 0) {
1658                                         NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1659                                         len -= chunk;
1660                                         copied += chunk;
1661                                 }
1662                         }
1663                 }
1664                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1665                         if (net_ratelimit())
1666                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1667                                        current->comm, current->pid);
1668                         peek_seq = tp->copied_seq;
1669                 }
1670                 continue;
1671
1672         found_ok_skb:
1673                 /* Ok so how much can we use? */
1674                 used = skb->len - offset;
1675                 if (len < used)
1676                         used = len;
1677
1678                 /* Do we have urgent data here? */
1679                 if (tp->urg_data) {
1680                         u32 urg_offset = tp->urg_seq - *seq;
1681                         if (urg_offset < used) {
1682                                 if (!urg_offset) {
1683                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1684                                                 ++*seq;
1685                                                 offset++;
1686                                                 used--;
1687                                                 if (!used)
1688                                                         goto skip_copy;
1689                                         }
1690                                 } else
1691                                         used = urg_offset;
1692                         }
1693                 }
1694
1695                 if (!(flags & MSG_TRUNC)) {
1696                         err = skb_copy_datagram_iovec(skb, offset,
1697                                                       msg->msg_iov, used);
1698                         if (err) {
1699                                 /* Exception. Bailout! */
1700                                 if (!copied)
1701                                         copied = -EFAULT;
1702                                 break;
1703                         }
1704                 }
1705
1706                 *seq += used;
1707                 copied += used;
1708                 len -= used;
1709
1710                 tcp_rcv_space_adjust(sk);
1711
1712 skip_copy:
1713                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1714                         tp->urg_data = 0;
1715                         tcp_fast_path_check(sk, tp);
1716                 }
1717                 if (used + offset < skb->len)
1718                         continue;
1719
1720                 if (skb->h.th->fin)
1721                         goto found_fin_ok;
1722                 if (!(flags & MSG_PEEK))
1723                         sk_eat_skb(sk, skb);
1724                 continue;
1725
1726         found_fin_ok:
1727                 /* Process the FIN. */
1728                 ++*seq;
1729                 if (!(flags & MSG_PEEK))
1730                         sk_eat_skb(sk, skb);
1731                 break;
1732         } while (len > 0);
1733
1734         if (user_recv) {
1735                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1736                         int chunk;
1737
1738                         tp->ucopy.len = copied > 0 ? len : 0;
1739
1740                         tcp_prequeue_process(sk);
1741
1742                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1743                                 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1744                                 len -= chunk;
1745                                 copied += chunk;
1746                         }
1747                 }
1748
1749                 tp->ucopy.task = NULL;
1750                 tp->ucopy.len = 0;
1751         }
1752
1753         /* According to UNIX98, msg_name/msg_namelen are ignored
1754          * on a connected socket. I was just happy when I found this 8) --ANK
1755          */
1756
1757         /* Clean up data we have read: this will send any needed ACKs. */
1758         cleanup_rbuf(sk, copied);
1759
1760         TCP_CHECK_TIMER(sk);
1761         release_sock(sk);
1762         return copied;
1763
1764 out:
1765         TCP_CHECK_TIMER(sk);
1766         release_sock(sk);
1767         return err;
1768
1769 recv_urg:
1770         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1771         goto out;
1772 }
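
/*
 * Illustrative user-space sketch (not part of this file): with MSG_PEEK the
 * copy above runs against a private peek_seq, so copied_seq is not advanced
 * and the same bytes are returned again by the following plain recv().
 * 'fd' is a hypothetical connected TCP socket.
 */
#include <sys/socket.h>

static void peek_then_read(int fd)
{
	char peek[128], real[128];

	recv(fd, peek, sizeof(peek), MSG_PEEK);	/* data stays queued */
	recv(fd, real, sizeof(real), 0);	/* same bytes consumed here */
}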
1773
1774 /*
1775  *      State processing on a close. This implements the state shift for
1776  *      sending our FIN frame. Note that we only send a FIN for some
1777  *      states. A shutdown() may have already sent the FIN, or we may be
1778  *      closed.
1779  */
1780
1781 static unsigned char new_state[16] = {
1782   /* current state:        new state:      action:      */
1783   /* (Invalid)          */ TCP_CLOSE,
1784   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1785   /* TCP_SYN_SENT       */ TCP_CLOSE,
1786   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1787   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1788   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1789   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1790   /* TCP_CLOSE          */ TCP_CLOSE,
1791   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1792   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1793   /* TCP_LISTEN         */ TCP_CLOSE,
1794   /* TCP_CLOSING        */ TCP_CLOSING,
1795 };
1796
1797 static int tcp_close_state(struct sock *sk)
1798 {
1799         int next = (int)new_state[sk->sk_state];
1800         int ns = next & TCP_STATE_MASK;
1801
1802         tcp_set_state(sk, ns);
1803
1804         return next & TCP_ACTION_FIN;
1805 }
1806
1807 /*
1808  *      Shutdown the sending side of a connection. Much like close except
1809  *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1810  */
1811
1812 void tcp_shutdown(struct sock *sk, int how)
1813 {
1814         /*      We need to grab some memory, and put together a FIN,
1815          *      and then put it into the queue to be sent.
1816          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1817          */
1818         if (!(how & SEND_SHUTDOWN))
1819                 return;
1820
1821         /* If we've already sent a FIN, or it's a closed state, skip this. */
1822         if ((1 << sk->sk_state) &
1823             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1824              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1825                 /* Clear out any half completed packets.  FIN if needed. */
1826                 if (tcp_close_state(sk))
1827                         tcp_send_fin(sk);
1828         }
1829 }
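
/*
 * Illustrative user-space sketch (not part of this file) of the half-close
 * implemented above: shutdown(SHUT_WR) queues our FIN but leaves the receive
 * side open, so the peer's remaining data can still be read until it closes
 * its end (read() returning 0).  'fd' is a hypothetical connected socket.
 */
#include <sys/socket.h>
#include <unistd.h>

static void half_close_and_drain(int fd)
{
	char buf[4096];

	shutdown(fd, SHUT_WR);			/* send FIN, keep receiving */
	while (read(fd, buf, sizeof(buf)) > 0)
		;				/* consume the peer's data */
	close(fd);
}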
1830
1831
1832 /*
1833  *      Return 1 if we still have things to send in our buffers.
1834  */
1835
1836 static inline int closing(struct sock *sk)
1837 {
1838         return (1 << sk->sk_state) &
1839                (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
1840 }
1841
1842 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1843 {
1844         /* First the read buffer. */
1845         __skb_queue_purge(&sk->sk_receive_queue);
1846
1847         /* Next, the error queue. */
1848         __skb_queue_purge(&sk->sk_error_queue);
1849
1850         /* Next, the write queue. */
1851         BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
1852
1853         /* Account for returned memory. */
1854         tcp_mem_reclaim(sk);
1855
1856         BUG_TRAP(!sk->sk_wmem_queued);
1857         BUG_TRAP(!sk->sk_forward_alloc);
1858
1859         /* It is _impossible_ for the backlog to contain anything
1860          * when we get here.  All user references to this socket
1861          * have gone away; only the net layer can touch it.
1862          */
1863 }
1864
1865 /*
1866  * At this point, there should be no process reference to this
1867  * socket, and thus no user references at all.  Therefore we
1868  * can assume the socket waitqueue is inactive and nobody will
1869  * try to jump onto it.
1870  */
1871 void tcp_destroy_sock(struct sock *sk)
1872 {
1873         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1874         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1875
1876         /* It cannot be in hash table! */
1877         BUG_TRAP(sk_unhashed(sk));
1878
1879         /* If inet_sk(sk)->num is non-zero, the socket must be bound. */
1880         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1881
1882 #ifdef TCP_DEBUG
1883         if (sk->sk_zapped) {
1884                 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1885                 sock_hold(sk);
1886         }
1887         sk->sk_zapped = 1;
1888 #endif
1889
1890         sk->sk_prot->destroy(sk);
1891
1892         tcp_kill_sk_queues(sk);
1893
1894         xfrm_sk_free_policy(sk);
1895
1896 #ifdef INET_REFCNT_DEBUG
1897         if (atomic_read(&sk->sk_refcnt) != 1) {
1898                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1899                        sk, atomic_read(&sk->sk_refcnt));
1900         }
1901 #endif
1902
1903         atomic_dec(&tcp_orphan_count);
1904         sock_put(sk);
1905 }
1906
1907 void tcp_close(struct sock *sk, long timeout)
1908 {
1909         struct sk_buff *skb;
1910         int data_was_unread = 0;
1911
1912         lock_sock(sk);
1913         sk->sk_shutdown = SHUTDOWN_MASK;
1914
1915         if (sk->sk_state == TCP_LISTEN) {
1916                 tcp_set_state(sk, TCP_CLOSE);
1917
1918                 /* Special case. */
1919                 tcp_listen_stop(sk);
1920
1921                 goto adjudge_to_death;
1922         }
1923
1924         /*  We need to flush the recv. buffs.  We do this only on the
1925          *  descriptor close, not protocol-sourced closes, because the
1926          *  reader process may not have drained the data yet!
1927          */
1928         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1929                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1930                           skb->h.th->fin;
1931                 data_was_unread += len;
1932                 __kfree_skb(skb);
1933         }
1934
1935         tcp_mem_reclaim(sk);
1936
1937         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1938          * 3.10, we send a RST here because data was lost.  To
1939          * witness the awful effects of the old behavior of always
1940          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1941          * a bulk GET in an FTP client, suspend the process, wait
1942          * for the client to advertise a zero window, then kill -9
1943          * the FTP client, wheee...  Note: timeout is always zero
1944          * in such a case.
1945          */
1946         if (data_was_unread) {
1947                 /* Unread data was tossed, zap the connection. */
1948                 NET_INC_STATS_USER(TCPAbortOnClose);
1949                 tcp_set_state(sk, TCP_CLOSE);
1950                 tcp_send_active_reset(sk, GFP_KERNEL);
1951         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1952                 /* Check zero linger _after_ checking for unread data. */
1953                 sk->sk_prot->disconnect(sk, 0);
1954                 NET_INC_STATS_USER(TCPAbortOnData);
1955         } else if (tcp_close_state(sk)) {
1956                 /* We FIN if the application ate all the data before
1957                  * zapping the connection.
1958                  */
1959
1960                 /* RED-PEN. Formally speaking, we have broken TCP state
1961                  * machine. State transitions:
1962                  *
1963                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1964                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1965                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1966                  *
1967                  * are legal only when the FIN has been sent (i.e. in window),
1968                  * rather than queued out of window. Purists will blame us.
1969                  *
1970                  * E.g. the "RFC state" is ESTABLISHED if the Linux state is
1971                  * FIN-WAIT-1 but the FIN has not actually been sent yet.
1972                  *
1973                  * The visible deviations are that we sometimes enter the
1974                  * time-wait state when it is not really required (harmless),
1975                  * and do not send active resets when the specs require them
1976                  * (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when they look like
1977                  * CLOSING or LAST_ACK to Linux).
1978                  * Probably I missed some more holelets.
1979                  *                                              --ANK
1980                  */
1981                 tcp_send_fin(sk);
1982         }
1983
1984         if (timeout) {
1985                 struct task_struct *tsk = current;
1986                 DEFINE_WAIT(wait);
1987
1988                 do {
1989                         prepare_to_wait(sk->sk_sleep, &wait,
1990                                         TASK_INTERRUPTIBLE);
1991                         if (!closing(sk))
1992                                 break;
1993                         release_sock(sk);
1994                         timeout = schedule_timeout(timeout);
1995                         lock_sock(sk);
1996                 } while (!signal_pending(tsk) && timeout);
1997
1998                 finish_wait(sk->sk_sleep, &wait);
1999         }
2000
2001 adjudge_to_death:
2002         /* It is the last release_sock in its life. It will remove backlog. */
2003         release_sock(sk);
2004
2005
2006         /* Now the socket is owned by the kernel and we acquire the BH lock
2007          * to finish the close. No need to check for user refs.
2008          */
2009         local_bh_disable();
2010         bh_lock_sock(sk);
2011         BUG_TRAP(!sock_owned_by_user(sk));
2012
2013         sock_hold(sk);
2014         sock_orphan(sk);
2015
2016         /*      This is a (useful) BSD violation of the RFC. There is a
2017          *      problem with TCP as specified, in that the other end could
2018          *      keep a socket open forever with no application left at this
2019          *      end.  We use a 3 minute timeout (about the same as BSD) and
2020          *      then kill our end. If they send after that then tough - BUT:
2021          *      the timeout is long enough that we avoid the old
2022          *      "4*rto = almost no time - whoops, reset" mistake.
2023          *
2024          *      Nope, it was not a mistake. It is really the desired
2025          *      behaviour, e.g. on HTTP servers, where such sockets are
2026          *      useless but consume significant resources. Let's handle it
2027          *      with the special linger2 option.                --ANK
2028          */
2029
2030         if (sk->sk_state == TCP_FIN_WAIT2) {
2031                 struct tcp_opt *tp = tcp_sk(sk);
2032                 if (tp->linger2 < 0) {
2033                         tcp_set_state(sk, TCP_CLOSE);
2034                         tcp_send_active_reset(sk, GFP_ATOMIC);
2035                         NET_INC_STATS_BH(TCPAbortOnLinger);
2036                 } else {
2037                         int tmo = tcp_fin_time(tp);
2038
2039                         if (tmo > TCP_TIMEWAIT_LEN) {
2040                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2041                         } else {
2042                                 atomic_inc(&tcp_orphan_count);
2043                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2044                                 goto out;
2045                         }
2046                 }
2047         }
2048         if (sk->sk_state != TCP_CLOSE) {
2049                 tcp_mem_reclaim(sk);
2050                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2051                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
2052                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2053                         if (net_ratelimit())
2054                                 printk(KERN_INFO "TCP: too many orphaned "
2055                                        "sockets\n");
2056                         tcp_set_state(sk, TCP_CLOSE);
2057                         tcp_send_active_reset(sk, GFP_ATOMIC);
2058                         NET_INC_STATS_BH(TCPAbortOnMemory);
2059                 }
2060         }
2061         atomic_inc(&tcp_orphan_count);
2062
2063         if (sk->sk_state == TCP_CLOSE)
2064                 tcp_destroy_sock(sk);
2065         /* Otherwise, socket is reprieved until protocol close. */
2066
2067 out:
2068         bh_unlock_sock(sk);
2069         local_bh_enable();
2070         sock_put(sk);
2071 }
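
/*
 * Illustrative user-space sketch (not part of this file) of the zero-linger
 * branch above: with SO_LINGER enabled and l_linger == 0, close() takes the
 * sk_prot->disconnect() path and aborts the connection instead of running
 * the normal FIN sequence.  'fd' is a hypothetical connected TCP socket.
 */
#include <sys/socket.h>
#include <unistd.h>

static void abortive_close(int fd)
{
	struct linger lg = { .l_onoff = 1, .l_linger = 0 };

	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
	close(fd);
}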
2072
2073 /* These states need RST on ABORT according to RFC793 */
2074
2075 static inline int tcp_need_reset(int state)
2076 {
2077         return (1 << state) &
2078                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2079                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2080 }
2081
2082 int tcp_disconnect(struct sock *sk, int flags)
2083 {
2084         struct inet_opt *inet = inet_sk(sk);
2085         struct tcp_opt *tp = tcp_sk(sk);
2086         int err = 0;
2087         int old_state = sk->sk_state;
2088
2089         if (old_state != TCP_CLOSE)
2090                 tcp_set_state(sk, TCP_CLOSE);
2091
2092         /* ABORT function of RFC793 */
2093         if (old_state == TCP_LISTEN) {
2094                 tcp_listen_stop(sk);
2095         } else if (tcp_need_reset(old_state) ||
2096                    (tp->snd_nxt != tp->write_seq &&
2097                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2098                 /* The last check adjusts for the discrepancy between Linux
2099                  * and RFC states.
2100                  */
2101                 tcp_send_active_reset(sk, gfp_any());
2102                 sk->sk_err = ECONNRESET;
2103         } else if (old_state == TCP_SYN_SENT)
2104                 sk->sk_err = ECONNRESET;
2105
2106         tcp_clear_xmit_timers(sk);
2107         __skb_queue_purge(&sk->sk_receive_queue);
2108         tcp_writequeue_purge(sk);
2109         __skb_queue_purge(&tp->out_of_order_queue);
2110
2111         inet->dport = 0;
2112
2113         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2114                 inet_reset_saddr(sk);
2115
2116         sk->sk_shutdown = 0;
2117         sock_reset_flag(sk, SOCK_DONE);
2118         tp->srtt = 0;
2119         if ((tp->write_seq += tp->max_window + 2) == 0)
2120                 tp->write_seq = 1;
2121         tp->backoff = 0;
2122         tp->snd_cwnd = 2;
2123         tp->probes_out = 0;
2124         tp->packets_out = 0;
2125         tp->snd_ssthresh = 0x7fffffff;
2126         tp->snd_cwnd_cnt = 0;
2127         tcp_set_ca_state(tp, TCP_CA_Open);
2128         tcp_clear_retrans(tp);
2129         tcp_delack_init(tp);
2130         tp->send_head = NULL;
2131         tp->saw_tstamp = 0;
2132         tcp_sack_reset(tp);
2133         __sk_dst_reset(sk);
2134
2135         BUG_TRAP(!inet->num || tp->bind_hash);
2136
2137         sk->sk_error_report(sk);
2138         return err;
2139 }
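
/*
 * Illustrative user-space sketch (not part of this file).  It assumes (this
 * is not shown here) that the inet layer maps a connect() to an AF_UNSPEC
 * address onto sk->sk_prot->disconnect(), i.e. tcp_disconnect() above, which
 * is the usual way to dissolve a connected socket from user space.
 */
#include <sys/socket.h>
#include <string.h>

static int dissolve(int fd)
{
	struct sockaddr sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_UNSPEC;
	return connect(fd, &sa, sizeof(sa));
}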
2140
2141 /*
2142  *      Wait for an incoming connection, avoid race
2143  *      conditions. This must be called with the socket locked.
2144  */
2145 static int wait_for_connect(struct sock *sk, long timeo)
2146 {
2147         struct tcp_opt *tp = tcp_sk(sk);
2148         DEFINE_WAIT(wait);
2149         int err;
2150
2151         /*
2152          * True wake-one mechanism for incoming connections: only
2153          * one process gets woken up, not the 'whole herd'.
2154          * Since we do not 'race & poll' for established sockets
2155          * anymore, the common case will execute the loop only once.
2156          *
2157          * Subtle issue: "add_wait_queue_exclusive()" will be added
2158          * after any current non-exclusive waiters, and we know that
2159          * it will always _stay_ after any new non-exclusive waiters
2160          * because all non-exclusive waiters are added at the
2161          * beginning of the wait-queue. As such, it's ok to "drop"
2162          * our exclusiveness temporarily when we get woken up without
2163          * having to remove and re-insert us on the wait queue.
2164          */
2165         for (;;) {
2166                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
2167                                           TASK_INTERRUPTIBLE);
2168                 release_sock(sk);
2169                 if (!tp->accept_queue)
2170                         timeo = schedule_timeout(timeo);
2171                 lock_sock(sk);
2172                 err = 0;
2173                 if (tp->accept_queue)
2174                         break;
2175                 err = -EINVAL;
2176                 if (sk->sk_state != TCP_LISTEN)
2177                         break;
2178                 err = sock_intr_errno(timeo);
2179                 if (signal_pending(current))
2180                         break;
2181                 err = -EAGAIN;
2182                 if (!timeo)
2183                         break;
2184         }
2185         finish_wait(sk->sk_sleep, &wait);
2186         return err;
2187 }
2188
2189 /*
2190  *      This will accept the next outstanding connection.
2191  */
2192
2193 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2194 {
2195         struct tcp_opt *tp = tcp_sk(sk);
2196         struct open_request *req;
2197         struct sock *newsk;
2198         int error;
2199
2200         lock_sock(sk);
2201
2202         /* We need to make sure that this socket is listening,
2203          * and that it has something pending.
2204          */
2205         error = -EINVAL;
2206         if (sk->sk_state != TCP_LISTEN)
2207                 goto out;
2208
2209         /* Find already established connection */
2210         if (!tp->accept_queue) {
2211                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2212
2213                 /* If this is a non-blocking socket, don't sleep. */
2214                 error = -EAGAIN;
2215                 if (!timeo)
2216                         goto out;
2217
2218                 error = wait_for_connect(sk, timeo);
2219                 if (error)
2220                         goto out;
2221         }
2222
2223         req = tp->accept_queue;
2224         if ((tp->accept_queue = req->dl_next) == NULL)
2225                 tp->accept_queue_tail = NULL;
2226
2227         newsk = req->sk;
2228         sk_acceptq_removed(sk);
2229         tcp_openreq_fastfree(req);
2230         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2231         release_sock(sk);
2232         return newsk;
2233
2234 out:
2235         release_sock(sk);
2236         *err = error;
2237         return NULL;
2238 }
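
/*
 * Illustrative user-space sketch (not part of this file): on a non-blocking
 * listener the code above returns -EAGAIN when the accept queue is empty, so
 * a typical caller waits for readability and retries.  'lfd' is a
 * hypothetical O_NONBLOCK listening socket.
 */
#include <sys/socket.h>
#include <poll.h>
#include <errno.h>

static int accept_one(int lfd)
{
	struct pollfd pfd = { .fd = lfd, .events = POLLIN };
	int cfd;

	while ((cfd = accept(lfd, NULL, NULL)) < 0 && errno == EAGAIN)
		poll(&pfd, 1, -1);	/* wait for a pending connection */
	return cfd;
}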
2239
2240 /*
2241  *      Socket option code for TCP.
2242  */
2243 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2244                    int optlen)
2245 {
2246         struct tcp_opt *tp = tcp_sk(sk);
2247         int val;
2248         int err = 0;
2249
2250         if (level != SOL_TCP)
2251                 return tp->af_specific->setsockopt(sk, level, optname,
2252                                                    optval, optlen);
2253
2254         if (optlen < sizeof(int))
2255                 return -EINVAL;
2256
2257         if (get_user(val, (int __user *)optval))
2258                 return -EFAULT;
2259
2260         lock_sock(sk);
2261
2262         switch (optname) {
2263         case TCP_MAXSEG:
2264                 /* Values greater than the interface MTU won't take effect.
2265                  * However, at the point when this call is made we typically
2266                  * don't yet know which interface is going to be used. */
2267                 if (val < 8 || val > MAX_TCP_WINDOW) {
2268                         err = -EINVAL;
2269                         break;
2270                 }
2271                 tp->user_mss = val;
2272                 break;
2273
2274         case TCP_NODELAY:
2275                 if (val) {
2276                         /* TCP_NODELAY is weaker than TCP_CORK, so that
2277                          * this option on corked socket is remembered, but
2278                          * it is not activated until cork is cleared.
2279                          *
2280                          * However, when TCP_NODELAY is set we make
2281                          * an explicit push, which overrides even TCP_CORK
2282                          * for currently queued segments.
2283                          */
2284                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2285                         tcp_push_pending_frames(sk, tp);
2286                 } else {
2287                         tp->nonagle &= ~TCP_NAGLE_OFF;
2288                 }
2289                 break;
2290
2291         case TCP_CORK:
2292                 /* When set, this tells us to always queue non-full frames.
2293                  * Later the user clears this option and we transmit
2294                  * any pending partial frames in the queue.  This is
2295                  * meant to be used alongside sendfile() to get properly
2296                  * filled frames when the user (for example) must write
2297                  * out headers with a write() call first and then use
2298                  * sendfile to send out the data parts.
2299                  *
2300                  * TCP_CORK can be set together with TCP_NODELAY and it is
2301                  * stronger than TCP_NODELAY.
2302                  */
2303                 if (val) {
2304                         tp->nonagle |= TCP_NAGLE_CORK;
2305                 } else {
2306                         tp->nonagle &= ~TCP_NAGLE_CORK;
2307                         if (tp->nonagle&TCP_NAGLE_OFF)
2308                                 tp->nonagle |= TCP_NAGLE_PUSH;
2309                         tcp_push_pending_frames(sk, tp);
2310                 }
2311                 break;
2312
2313         case TCP_KEEPIDLE:
2314                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2315                         err = -EINVAL;
2316                 else {
2317                         tp->keepalive_time = val * HZ;
2318                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2319                             !((1 << sk->sk_state) &
2320                               (TCPF_CLOSE | TCPF_LISTEN))) {
2321                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2322                                 if (tp->keepalive_time > elapsed)
2323                                         elapsed = tp->keepalive_time - elapsed;
2324                                 else
2325                                         elapsed = 0;
2326                                 tcp_reset_keepalive_timer(sk, elapsed);
2327                         }
2328                 }
2329                 break;
2330         case TCP_KEEPINTVL:
2331                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2332                         err = -EINVAL;
2333                 else
2334                         tp->keepalive_intvl = val * HZ;
2335                 break;
2336         case TCP_KEEPCNT:
2337                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2338                         err = -EINVAL;
2339                 else
2340                         tp->keepalive_probes = val;
2341                 break;
2342         case TCP_SYNCNT:
2343                 if (val < 1 || val > MAX_TCP_SYNCNT)
2344                         err = -EINVAL;
2345                 else
2346                         tp->syn_retries = val;
2347                 break;
2348
2349         case TCP_LINGER2:
2350                 if (val < 0)
2351                         tp->linger2 = -1;
2352                 else if (val > sysctl_tcp_fin_timeout / HZ)
2353                         tp->linger2 = 0;
2354                 else
2355                         tp->linger2 = val * HZ;
2356                 break;
2357
2358         case TCP_DEFER_ACCEPT:
2359                 tp->defer_accept = 0;
2360                 if (val > 0) {
2361                         /* Translate value in seconds to number of
2362                          * retransmits */
2363                         while (tp->defer_accept < 32 &&
2364                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2365                                        tp->defer_accept))
2366                                 tp->defer_accept++;
2367                         tp->defer_accept++;
2368                 }
2369                 break;
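
                /*
                 * Worked example (assuming TCP_TIMEOUT_INIT / HZ == 3, as in
                 * this kernel): TCP_DEFER_ACCEPT = 10 seconds walks the loop
                 * past 3 s and 6 s but not 12 s, so tp->defer_accept ends up
                 * as 3; getsockopt() below reports 3 << (3 - 1) = 12 seconds,
                 * the retransmit-rounded value.
                 */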
2370
2371         case TCP_WINDOW_CLAMP:
2372                 if (!val) {
2373                         if (sk->sk_state != TCP_CLOSE) {
2374                                 err = -EINVAL;
2375                                 break;
2376                         }
2377                         tp->window_clamp = 0;
2378                 } else
2379                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2380                                                 SOCK_MIN_RCVBUF / 2 : val;
2381                 break;
2382
2383         case TCP_QUICKACK:
2384                 if (!val) {
2385                         tp->ack.pingpong = 1;
2386                 } else {
2387                         tp->ack.pingpong = 0;
2388                         if ((1 << sk->sk_state) &
2389                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2390                             tcp_ack_scheduled(tp)) {
2391                                 tp->ack.pending |= TCP_ACK_PUSHED;
2392                                 cleanup_rbuf(sk, 1);
2393                                 if (!(val & 1))
2394                                         tp->ack.pingpong = 1;
2395                         }
2396                 }
2397                 break;
2398
2399         default:
2400                 err = -ENOPROTOOPT;
2401                 break;
2402         };
2403         release_sock(sk);
2404         return err;
2405 }
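
/*
 * Illustrative user-space sketch (not part of this file): the header-plus-
 * sendfile pattern described in the TCP_CORK comment above, with the cork
 * released once the whole response has been queued.  'fd', 'filefd', 'hdr'
 * and 'filelen' are hypothetical.
 */
#include <netinet/tcp.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <unistd.h>

static void send_response(int fd, const char *hdr, size_t hdrlen,
			  int filefd, size_t filelen)
{
	int on = 1, off = 0;
	off_t pos = 0;

	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	write(fd, hdr, hdrlen);			/* queued, not pushed */
	sendfile(fd, filefd, &pos, filelen);	/* fills out full-sized frames */
	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
						/* uncork: push the remainder */
}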
2406
2407 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2408                    int __user *optlen)
2409 {
2410         struct tcp_opt *tp = tcp_sk(sk);
2411         int val, len;
2412
2413         if (level != SOL_TCP)
2414                 return tp->af_specific->getsockopt(sk, level, optname,
2415                                                    optval, optlen);
2416
2417         if (get_user(len, optlen))
2418                 return -EFAULT;
2419
2420         len = min_t(unsigned int, len, sizeof(int));
2421
2422         if (len < 0)
2423                 return -EINVAL;
2424
2425         switch (optname) {
2426         case TCP_MAXSEG:
2427                 val = tp->mss_cache_std;
2428                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2429                         val = tp->user_mss;
2430                 break;
2431         case TCP_NODELAY:
2432                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2433                 break;
2434         case TCP_CORK:
2435                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2436                 break;
2437         case TCP_KEEPIDLE:
2438                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2439                 break;
2440         case TCP_KEEPINTVL:
2441                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2442                 break;
2443         case TCP_KEEPCNT:
2444                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2445                 break;
2446         case TCP_SYNCNT:
2447                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2448                 break;
2449         case TCP_LINGER2:
2450                 val = tp->linger2;
2451                 if (val >= 0)
2452                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2453                 break;
2454         case TCP_DEFER_ACCEPT:
2455                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2456                                                (tp->defer_accept - 1));
2457                 break;
2458         case TCP_WINDOW_CLAMP:
2459                 val = tp->window_clamp;
2460                 break;
2461         case TCP_INFO: {
2462                 struct tcp_info info;
2463
2464                 if (get_user(len, optlen))
2465                         return -EFAULT;
2466
2467                 tcp_get_info(sk, &info);
2468
2469                 len = min_t(unsigned int, len, sizeof(info));
2470                 if (put_user(len, optlen))
2471                         return -EFAULT;
2472                 if (copy_to_user(optval, &info, len))
2473                         return -EFAULT;
2474                 return 0;
2475         }
2476         case TCP_QUICKACK:
2477                 val = !tp->ack.pingpong;
2478                 break;
2479         default:
2480                 return -ENOPROTOOPT;
2481         };
2482
2483         if (put_user(len, optlen))
2484                 return -EFAULT;
2485         if (copy_to_user(optval, &val, len))
2486                 return -EFAULT;
2487         return 0;
2488 }
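
/*
 * Illustrative user-space sketch (not part of this file): reading the
 * TCP_INFO block handled above.  The kernel copies min(len, sizeof(info))
 * bytes and writes the actual length back, so differing tcp_info layouts
 * still interoperate.  'fd' is a hypothetical connected TCP socket.
 */
#include <netinet/tcp.h>
#include <sys/socket.h>

static int query_tcp_info(int fd, struct tcp_info *info)
{
	socklen_t len = sizeof(*info);

	return getsockopt(fd, IPPROTO_TCP, TCP_INFO, info, &len);
}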
2489
2490
2491 extern void __skb_cb_too_small_for_tcp(int, int);
2492 extern void tcpdiag_init(void);
2493
2494 static __initdata unsigned long thash_entries;
2495 static int __init set_thash_entries(char *str)
2496 {
2497         if (!str)
2498                 return 0;
2499         thash_entries = simple_strtoul(str, &str, 0);
2500         return 1;
2501 }
2502 __setup("thash_entries=", set_thash_entries);
2503
2504 void __init tcp_init(void)
2505 {
2506         struct sk_buff *skb = NULL;
2507         unsigned long goal;
2508         int order, i;
2509
2510         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2511                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2512                                            sizeof(skb->cb));
2513
2514         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2515                                                    sizeof(struct open_request),
2516                                                0, SLAB_HWCACHE_ALIGN,
2517                                                NULL, NULL);
2518         if (!tcp_openreq_cachep)
2519                 panic("tcp_init: Cannot alloc open_request cache.");
2520
2521         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2522                                               sizeof(struct tcp_bind_bucket),
2523                                               0, SLAB_HWCACHE_ALIGN,
2524                                               NULL, NULL);
2525         if (!tcp_bucket_cachep)
2526                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2527
2528         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2529                                                 sizeof(struct tcp_tw_bucket),
2530                                                 0, SLAB_HWCACHE_ALIGN,
2531                                                 NULL, NULL);
2532         if (!tcp_timewait_cachep)
2533                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2534
2535         /* Size and allocate the main established and bind bucket
2536          * hash tables.
2537          *
2538          * The methodology is similar to that of the buffer cache.
2539          */
2540         if (num_physpages >= (128 * 1024))
2541                 goal = num_physpages >> (21 - PAGE_SHIFT);
2542         else
2543                 goal = num_physpages >> (23 - PAGE_SHIFT);
2544
2545         if (thash_entries)
2546                 goal = (thash_entries * sizeof(struct tcp_ehash_bucket)) >> PAGE_SHIFT;
2547         for (order = 0; (1UL << order) < goal; order++)
2548                 ;
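        /*
         * Worked example (assuming 4 KB pages, i.e. PAGE_SHIFT == 12): with
         * 1 GB of RAM, num_physpages is 262144 >= 128 * 1024, so
         * goal = 262144 >> (21 - 12) = 512 pages and the loop above settles
         * on order = 9 (1 << 9 == 512).
         */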
2549         do {
2550                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2551                         sizeof(struct tcp_ehash_bucket);
2552                 tcp_ehash_size >>= 1;
2553                 while (tcp_ehash_size & (tcp_ehash_size - 1))
2554                         tcp_ehash_size--;
2555                 tcp_ehash = (struct tcp_ehash_bucket *)
2556                         __get_free_pages(GFP_ATOMIC, order);
2557         } while (!tcp_ehash && --order > 0);
2558
2559         if (!tcp_ehash)
2560                 panic("Failed to allocate TCP established hash table\n");
2561         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2562                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2563                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2564         }
2565
2566         do {
2567                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2568                         sizeof(struct tcp_bind_hashbucket);
2569                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2570                         continue;
2571                 tcp_bhash = (struct tcp_bind_hashbucket *)
2572                         __get_free_pages(GFP_ATOMIC, order);
2573         } while (!tcp_bhash && --order >= 0);
2574
2575         if (!tcp_bhash)
2576                 panic("Failed to allocate TCP bind hash table\n");
2577         for (i = 0; i < tcp_bhash_size; i++) {
2578                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2579                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2580         }
2581
2582         /* Try to be a bit smarter and adjust defaults depending
2583          * on available memory.
2584          */
2585         if (order > 4) {
2586                 sysctl_local_port_range[0] = 32768;
2587                 sysctl_local_port_range[1] = 61000;
2588                 sysctl_tcp_max_tw_buckets = 180000;
2589                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2590                 sysctl_max_syn_backlog = 1024;
2591         } else if (order < 3) {
2592                 sysctl_local_port_range[0] = 1024 * (3 - order);
2593                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2594                 sysctl_tcp_max_orphans >>= (3 - order);
2595                 sysctl_max_syn_backlog = 128;
2596         }
2597         tcp_port_rover = sysctl_local_port_range[0] - 1;
2598
2599         sysctl_tcp_mem[0] =  768 << order;
2600         sysctl_tcp_mem[1] = 1024 << order;
2601         sysctl_tcp_mem[2] = 1536 << order;
2602
2603         if (order < 3) {
2604                 sysctl_tcp_wmem[2] = 64 * 1024;
2605                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2606                 sysctl_tcp_rmem[1] = 43689;
2607                 sysctl_tcp_rmem[2] = 2 * 43689;
2608         }
2609
2610         printk(KERN_INFO "TCP: Hash tables configured "
2611                "(established %d bind %d)\n",
2612                tcp_ehash_size << 1, tcp_bhash_size);
2613
2614         tcpdiag_init();
2615 }
2616
2617 EXPORT_SYMBOL(__tcp_mem_reclaim);
2618 EXPORT_SYMBOL(sysctl_tcp_rmem);
2619 EXPORT_SYMBOL(sysctl_tcp_wmem);
2620 EXPORT_SYMBOL(tcp_accept);
2621 EXPORT_SYMBOL(tcp_close);
2622 EXPORT_SYMBOL(tcp_close_state);
2623 EXPORT_SYMBOL(tcp_destroy_sock);
2624 EXPORT_SYMBOL(tcp_disconnect);
2625 EXPORT_SYMBOL(tcp_getsockopt);
2626 EXPORT_SYMBOL(tcp_ioctl);
2627 EXPORT_SYMBOL(tcp_openreq_cachep);
2628 EXPORT_SYMBOL(tcp_poll);
2629 EXPORT_SYMBOL(tcp_read_sock);
2630 EXPORT_SYMBOL(tcp_recvmsg);
2631 EXPORT_SYMBOL(tcp_sendmsg);
2632 EXPORT_SYMBOL(tcp_sendpage);
2633 EXPORT_SYMBOL(tcp_setsockopt);
2634 EXPORT_SYMBOL(tcp_shutdown);
2635 EXPORT_SYMBOL(tcp_sockets_allocated);
2636 EXPORT_SYMBOL(tcp_statistics);
2637 EXPORT_SYMBOL(tcp_timewait_cachep);
2638 EXPORT_SYMBOL(tcp_write_space);