1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up with retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or (at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
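
/*
 * Illustrative walk through the states above (a user-space sketch with a
 * hypothetical connected descriptor "fd", not code that lives in this file):
 *
 *      shutdown(fd, SHUT_WR);          ESTABLISHED -> FIN_WAIT1, then
 *                                      FIN_WAIT2 once our FIN is acked
 *      read(fd, buf, sizeof(buf));     returns 0 when the peer's FIN arrives;
 *                                      the socket then moves to TIME_WAIT
 *      close(fd);                      releases the descriptor; TIME_WAIT
 *                                      expires into TCP_CLOSE on its own
 *
 * The passive close mirrors this: ESTABLISHED -> CLOSE_WAIT when the peer's
 * FIN arrives, then LAST_ACK after our close(), then TCP_CLOSE.
 */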
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259
260 #include <net/icmp.h>
261 #include <net/tcp.h>
262 #include <net/xfrm.h>
263 #include <net/ip.h>
264
265
266 #include <asm/uaccess.h>
267 #include <asm/ioctls.h>
268
269 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
270
271 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
272
273 kmem_cache_t *tcp_openreq_cachep;
274 kmem_cache_t *tcp_bucket_cachep;
275 kmem_cache_t *tcp_timewait_cachep;
276
277 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278
279 int sysctl_tcp_default_win_scale = 7;
280
281 int sysctl_tcp_mem[3];
282 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
283 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
284
285 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
286 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
287
288 /* Pressure flag: try to collapse.
289  * Technical note: it is used by multiple contexts non-atomically.
290  * All of tcp_mem_schedule() is of this nature: accounting
291  * is strict, actions are advisory and have some latency. */
292 int tcp_memory_pressure;
293
294 #define TCP_PAGES(amt) (((amt) + TCP_MEM_QUANTUM - 1) / TCP_MEM_QUANTUM)
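/* For example, TCP_PAGES(1) and TCP_PAGES(TCP_MEM_QUANTUM) both evaluate
 * to 1, while TCP_PAGES(TCP_MEM_QUANTUM + 1) rounds up to 2, so the
 * accounting below always works in whole quanta.
 */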
295
296 int tcp_mem_schedule(struct sock *sk, int size, int kind)
297 {
298         int amt = TCP_PAGES(size);
299
300         sk->sk_forward_alloc += amt * TCP_MEM_QUANTUM;
301         atomic_add(amt, &tcp_memory_allocated);
302
303         /* Under limit. */
304         if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
305                 if (tcp_memory_pressure)
306                         tcp_memory_pressure = 0;
307                 return 1;
308         }
309
310         /* Over hard limit. */
311         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
312                 tcp_enter_memory_pressure();
313                 goto suppress_allocation;
314         }
315
316         /* Under pressure. */
317         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
318                 tcp_enter_memory_pressure();
319
320         if (kind) {
321                 if (atomic_read(&sk->sk_rmem_alloc) < sysctl_tcp_rmem[0])
322                         return 1;
323         } else if (sk->sk_wmem_queued < sysctl_tcp_wmem[0])
324                 return 1;
325
326         if (!tcp_memory_pressure ||
327             sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) *
328                                 TCP_PAGES(sk->sk_wmem_queued +
329                                           atomic_read(&sk->sk_rmem_alloc) +
330                                           sk->sk_forward_alloc))
331                 return 1;
332
333 suppress_allocation:
334
335         if (!kind) {
336                 tcp_moderate_sndbuf(sk);
337
338                 /* Fail only if socket is _under_ its sndbuf.
339                  * In this case we cannot block, so we have to fail.
340                  */
341                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
342                         return 1;
343         }
344
345         /* Alas. Undo changes. */
346         sk->sk_forward_alloc -= amt * TCP_MEM_QUANTUM;
347         atomic_sub(amt, &tcp_memory_allocated);
348         return 0;
349 }
350
351 void __tcp_mem_reclaim(struct sock *sk)
352 {
353         if (sk->sk_forward_alloc >= TCP_MEM_QUANTUM) {
354                 atomic_sub(sk->sk_forward_alloc / TCP_MEM_QUANTUM,
355                            &tcp_memory_allocated);
356                 sk->sk_forward_alloc &= TCP_MEM_QUANTUM - 1;
357                 if (tcp_memory_pressure &&
358                     atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
359                         tcp_memory_pressure = 0;
360         }
361 }
362
363 void tcp_rfree(struct sk_buff *skb)
364 {
365         struct sock *sk = skb->sk;
366
367         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
368         sk->sk_forward_alloc += skb->truesize;
369 }
370
371 /*
372  * LISTEN is a special case for poll..
373  */
374 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
375                                                poll_table *wait)
376 {
377         return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
378 }
379
380 /*
381  *      Wait for a TCP event.
382  *
383  *      Note that we don't need to lock the socket, as the upper poll layers
384  *      take care of normal races (between the test and the event) and we don't
385  *      go look at any of the socket buffers directly.
386  */
387 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
388 {
389         unsigned int mask;
390         struct sock *sk = sock->sk;
391         struct tcp_opt *tp = tcp_sk(sk);
392
393         poll_wait(file, sk->sk_sleep, wait);
394         if (sk->sk_state == TCP_LISTEN)
395                 return tcp_listen_poll(sk, wait);
396
397         /* Socket is not locked. We are protected from async events
398            by the poll logic, and correct handling of state changes
399            made by other threads is impossible in any case.
400          */
401
402         mask = 0;
403         if (sk->sk_err)
404                 mask = POLLERR;
405
406         /*
407          * POLLHUP is certainly not done right. But poll() doesn't
408          * have a notion of HUP in just one direction, and for a
409          * socket the read side is more interesting.
410          *
411          * Some poll() documentation says that POLLHUP is incompatible
412          * with the POLLOUT/POLLWRNORM flags, so somebody should check this
413          * all. But careful, it tends to be safer to return too many
414          * bits than too few, and you can easily break real applications
415          * if you don't tell them that something has hung up!
416          *
417          * Check-me.
418          *
419          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
420          * our fs/select.c). It means that after we received EOF,
421          * poll() always returns immediately, making it impossible to poll() for
422          * write in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
423          * if and only if shutdown has been made in both directions.
424          * Actually, it is interesting to look how Solaris and DUX
425          * solve this dilemma. I would prefer, if POLLHUP were maskable,
426          * then we could set it on SND_SHUTDOWN. BTW examples given
427          * in Stevens' books assume exactly this behaviour, it explains
428          * why POLLHUP is incompatible with POLLOUT.    --ANK
429          *
430          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
431          * blocking on fresh not-connected or disconnected socket. --ANK
432          */
433         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
434                 mask |= POLLHUP;
435         if (sk->sk_shutdown & RCV_SHUTDOWN)
436                 mask |= POLLIN | POLLRDNORM;
437
438         /* Connected? */
439         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
440                 /* Potential race condition. If the read of tp below
441                  * escapes above sk->sk_state, we can be illegally awakened
442                  * in SYN_* states. */
443                 if ((tp->rcv_nxt != tp->copied_seq) &&
444                     (tp->urg_seq != tp->copied_seq ||
445                      tp->rcv_nxt != tp->copied_seq + 1 ||
446                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
447                         mask |= POLLIN | POLLRDNORM;
448
449                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
450                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
451                                 mask |= POLLOUT | POLLWRNORM;
452                         } else {  /* send SIGIO later */
453                                 set_bit(SOCK_ASYNC_NOSPACE,
454                                         &sk->sk_socket->flags);
455                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
456
457                                 /* Race breaker. If space is freed after
458                                  * wspace test but before the flags are set,
459                                  * IO signal will be lost.
460                                  */
461                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
462                                         mask |= POLLOUT | POLLWRNORM;
463                         }
464                 }
465
466                 if (tp->urg_data & TCP_URG_VALID)
467                         mask |= POLLPRI;
468         }
469         return mask;
470 }
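
/*
 * Usage sketch (user space, with a hypothetical connected descriptor "fd"):
 * the mask built above is what an ordinary poll loop consumes, e.g.
 *
 *      struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };
 *
 *      if (poll(&pfd, 1, -1) > 0) {
 *              if (pfd.revents & POLLPRI)   urgent data is pending
 *              if (pfd.revents & POLLHUP)   both directions are shut down,
 *                                           or the socket is in TCP_CLOSE
 *              if (pfd.revents & (POLLIN | POLLRDNORM))   data (or EOF) readable
 *              if (pfd.revents & (POLLOUT | POLLWRNORM))  write space available
 *      }
 */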
471
472 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
473 {
474         struct tcp_opt *tp = tcp_sk(sk);
475         int answ;
476
477         switch (cmd) {
478         case SIOCINQ:
479                 if (sk->sk_state == TCP_LISTEN)
480                         return -EINVAL;
481
482                 lock_sock(sk);
483                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
484                         answ = 0;
485                 else if (sock_flag(sk, SOCK_URGINLINE) ||
486                          !tp->urg_data ||
487                          before(tp->urg_seq, tp->copied_seq) ||
488                          !before(tp->urg_seq, tp->rcv_nxt)) {
489                         answ = tp->rcv_nxt - tp->copied_seq;
490
491                         /* Subtract 1, if FIN is in queue. */
492                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
493                                 answ -=
494                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
495                 } else
496                         answ = tp->urg_seq - tp->copied_seq;
497                 release_sock(sk);
498                 break;
499         case SIOCATMARK:
500                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
501                 break;
502         case SIOCOUTQ:
503                 if (sk->sk_state == TCP_LISTEN)
504                         return -EINVAL;
505
506                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
507                         answ = 0;
508                 else
509                         answ = tp->write_seq - tp->snd_una;
510                 break;
511         default:
512                 return -ENOIOCTLCMD;
513         };
514
515         return put_user(answ, (int __user *)arg);
516 }
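
/*
 * Usage sketch (user space, hypothetical descriptor "fd"): the three
 * requests handled above map to
 *
 *      int n;
 *
 *      ioctl(fd, SIOCINQ, &n);         bytes queued for reading (aka FIONREAD)
 *      ioctl(fd, SIOCOUTQ, &n);        bytes written but not yet acknowledged
 *      ioctl(fd, SIOCATMARK, &n);      non-zero when the read pointer is at
 *                                      the urgent mark
 */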
517
518
519 int tcp_listen_start(struct sock *sk)
520 {
521         struct inet_opt *inet = inet_sk(sk);
522         struct tcp_opt *tp = tcp_sk(sk);
523         struct tcp_listen_opt *lopt;
524
525         sk->sk_max_ack_backlog = 0;
526         sk->sk_ack_backlog = 0;
527         tp->accept_queue = tp->accept_queue_tail = NULL;
528         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
529         tcp_delack_init(tp);
530
531         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
532         if (!lopt)
533                 return -ENOMEM;
534
535         memset(lopt, 0, sizeof(struct tcp_listen_opt));
536         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
537                 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
538                         break;
539         get_random_bytes(&lopt->hash_rnd, 4);
540
541         write_lock_bh(&tp->syn_wait_lock);
542         tp->listen_opt = lopt;
543         write_unlock_bh(&tp->syn_wait_lock);
544
545         /* There is a race window here: we announce ourselves listening,
546          * but this transition is still not validated by get_port().
547          * It is OK, because this socket enters the hash table only
548          * after validation is complete.
549          */
550         sk->sk_state = TCP_LISTEN;
551         if (!sk->sk_prot->get_port(sk, inet->num)) {
552                 inet->sport = htons(inet->num);
553
554                 sk_dst_reset(sk);
555                 sk->sk_prot->hash(sk);
556
557                 return 0;
558         }
559
560         sk->sk_state = TCP_CLOSE;
561         write_lock_bh(&tp->syn_wait_lock);
562         tp->listen_opt = NULL;
563         write_unlock_bh(&tp->syn_wait_lock);
564         kfree(lopt);
565         return -EADDRINUSE;
566 }
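
/*
 * Sizing example for the SYN queue set up above: with sysctl_max_syn_backlog
 * at, say, 1024 the loop stops at max_qlen_log = 10 (1 << 10 >= 1024), while
 * any value of 64 or less keeps the floor of max_qlen_log = 6.
 */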
567
568 /*
569  *      This routine closes sockets which have been at least partially
570  *      opened, but not yet accepted.
571  */
572
573 static void tcp_listen_stop (struct sock *sk)
574 {
575         struct tcp_opt *tp = tcp_sk(sk);
576         struct tcp_listen_opt *lopt = tp->listen_opt;
577         struct open_request *acc_req = tp->accept_queue;
578         struct open_request *req;
579         int i;
580
581         tcp_delete_keepalive_timer(sk);
582
583         /* make all the listen_opt local to us */
584         write_lock_bh(&tp->syn_wait_lock);
585         tp->listen_opt = NULL;
586         write_unlock_bh(&tp->syn_wait_lock);
587         tp->accept_queue = tp->accept_queue_tail = NULL;
588
589         if (lopt->qlen) {
590                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
591                         while ((req = lopt->syn_table[i]) != NULL) {
592                                 lopt->syn_table[i] = req->dl_next;
593                                 lopt->qlen--;
594                                 tcp_openreq_free(req);
595
596                 /* Following specs, it would be better either to send FIN
597                  * (and enter FIN-WAIT-1, it is normal close)
598                  * or to send active reset (abort).
599                  * Certainly, it is pretty dangerous during a synflood, but it is
600                  * a bad justification for our negligence 8)
601                  * To be honest, we are not able to make either
602                  * of the variants now.                 --ANK
603                  */
604                         }
605                 }
606         }
607         BUG_TRAP(!lopt->qlen);
608
609         kfree(lopt);
610
611         while ((req = acc_req) != NULL) {
612                 struct sock *child = req->sk;
613
614                 acc_req = req->dl_next;
615
616                 local_bh_disable();
617                 bh_lock_sock(child);
618                 BUG_TRAP(!sock_owned_by_user(child));
619                 sock_hold(child);
620
621                 tcp_disconnect(child, O_NONBLOCK);
622
623                 sock_orphan(child);
624
625                 atomic_inc(&tcp_orphan_count);
626
627                 tcp_destroy_sock(child);
628
629                 bh_unlock_sock(child);
630                 local_bh_enable();
631                 sock_put(child);
632
633                 sk_acceptq_removed(sk);
634                 tcp_openreq_fastfree(req);
635         }
636         BUG_TRAP(!sk->sk_ack_backlog);
637 }
638
639 /*
640  *      Wait for a socket to get into the connected state
641  *
642  *      Note: Must be called with the socket locked.
643  */
644 static int wait_for_tcp_connect(struct sock *sk, int flags, long *timeo_p)
645 {
646         struct tcp_opt *tp = tcp_sk(sk);
647         struct task_struct *tsk = current;
648         DEFINE_WAIT(wait);
649
650         while ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
651                 if (sk->sk_err)
652                         return sock_error(sk);
653                 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
654                         return -EPIPE;
655                 if (!*timeo_p)
656                         return -EAGAIN;
657                 if (signal_pending(tsk))
658                         return sock_intr_errno(*timeo_p);
659
660                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
661                 tp->write_pending++;
662
663                 release_sock(sk);
664                 *timeo_p = schedule_timeout(*timeo_p);
665                 lock_sock(sk);
666
667                 finish_wait(sk->sk_sleep, &wait);
668                 tp->write_pending--;
669         }
670         return 0;
671 }
672
673 static inline int tcp_memory_free(struct sock *sk)
674 {
675         return sk->sk_wmem_queued < sk->sk_sndbuf;
676 }
677
678 /*
679  *      Wait for more memory for a socket
680  */
681 static int wait_for_tcp_memory(struct sock *sk, long *timeo)
682 {
683         struct tcp_opt *tp = tcp_sk(sk);
684         int err = 0;
685         long vm_wait = 0;
686         long current_timeo = *timeo;
687         DEFINE_WAIT(wait);
688
689         if (tcp_memory_free(sk))
690                 current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
691
692         for (;;) {
693                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
694
695                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
696
697                 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
698                         goto do_error;
699                 if (!*timeo)
700                         goto do_nonblock;
701                 if (signal_pending(current))
702                         goto do_interrupted;
703                 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
704                 if (tcp_memory_free(sk) && !vm_wait)
705                         break;
706
707                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
708                 tp->write_pending++;
709                 release_sock(sk);
710                 if (!tcp_memory_free(sk) || vm_wait)
711                         current_timeo = schedule_timeout(current_timeo);
712                 lock_sock(sk);
713                 tp->write_pending--;
714
715                 if (vm_wait) {
716                         vm_wait -= current_timeo;
717                         current_timeo = *timeo;
718                         if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
719                             (current_timeo -= vm_wait) < 0)
720                                 current_timeo = 0;
721                         vm_wait = 0;
722                 }
723                 *timeo = current_timeo;
724         }
725 out:
726         finish_wait(sk->sk_sleep, &wait);
727         return err;
728
729 do_error:
730         err = -EPIPE;
731         goto out;
732 do_nonblock:
733         err = -EAGAIN;
734         goto out;
735 do_interrupted:
736         err = sock_intr_errno(*timeo);
737         goto out;
738 }
739
740 static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
741                                int off)
742 {
743         if (i) {
744                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
745                 return page == frag->page &&
746                        off == frag->page_offset + frag->size;
747         }
748         return 0;
749 }
750
751 static inline void fill_page_desc(struct sk_buff *skb, int i,
752                                   struct page *page, int off, int size)
753 {
754         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
755         frag->page = page;
756         frag->page_offset = off;
757         frag->size = size;
758         skb_shinfo(skb)->nr_frags = i + 1;
759 }
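
/*
 * Descriptive note: the two helpers above cooperate when data is appended to
 * a paged skb.  If the last fragment already ends exactly at (page, off),
 * can_coalesce() lets the caller simply grow frag->size by the new bytes;
 * otherwise fill_page_desc() spends one of the MAX_SKB_FRAGS slots on a
 * fresh (page, off, size) descriptor.
 */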
760
761 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
762 {
763         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
764         tp->pushed_seq = tp->write_seq;
765 }
766
767 static inline int forced_push(struct tcp_opt *tp)
768 {
769         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
770 }
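
/*
 * Example: if the peer has ever advertised a 64KB window (tp->max_window),
 * forced_push() becomes true once more than 32KB have been queued since the
 * last PSH was marked, so the receiver sees a PSH at least every half
 * window's worth of data.
 */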
771
772 static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
773                               struct sk_buff *skb)
774 {
775         skb->csum = 0;
776         TCP_SKB_CB(skb)->seq = tp->write_seq;
777         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
778         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
779         TCP_SKB_CB(skb)->sacked = 0;
780         __skb_queue_tail(&sk->sk_write_queue, skb);
781         sk_charge_skb(sk, skb);
782         if (!tp->send_head)
783                 tp->send_head = skb;
784         else if (tp->nonagle & TCP_NAGLE_PUSH)
785                 tp->nonagle &= ~TCP_NAGLE_PUSH;
786 }
787
788 static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
789                                 struct sk_buff *skb)
790 {
791         if (flags & MSG_OOB) {
792                 tp->urg_mode = 1;
793                 tp->snd_up = tp->write_seq;
794                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
795         }
796 }
797
798 static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
799                             int mss_now, int nonagle)
800 {
801         if (tp->send_head) {
802                 struct sk_buff *skb = sk->sk_write_queue.prev;
803                 if (!(flags & MSG_MORE) || forced_push(tp))
804                         tcp_mark_push(tp, skb);
805                 tcp_mark_urg(tp, flags, skb);
806                 __tcp_push_pending_frames(sk, tp, mss_now,
807                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
808         }
809 }
810
811 static int tcp_error(struct sock *sk, int flags, int err)
812 {
813         if (err == -EPIPE)
814                 err = sock_error(sk) ? : -EPIPE;
815         if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
816                 send_sig(SIGPIPE, current, 0);
817         return err;
818 }
819
820 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
821                          size_t psize, int flags)
822 {
823         struct tcp_opt *tp = tcp_sk(sk);
824         int mss_now;
825         int err;
826         ssize_t copied;
827         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
828
829         /* Wait for a connection to finish. */
830         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
831                 if ((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
832                         goto out_err;
833
834         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
835
836         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
837         copied = 0;
838
839         err = -EPIPE;
840         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
841                 goto do_error;
842
843         while (psize > 0) {
844                 struct sk_buff *skb = sk->sk_write_queue.prev;
845                 struct page *page = pages[poffset / PAGE_SIZE];
846                 int copy, i;
847                 int offset = poffset % PAGE_SIZE;
848                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
849
850                 if (!tp->send_head || (copy = mss_now - skb->len) <= 0) {
851 new_segment:
852                         if (!tcp_memory_free(sk))
853                                 goto wait_for_sndbuf;
854
855                         skb = tcp_alloc_pskb(sk, 0, tp->mss_cache,
856                                              sk->sk_allocation);
857                         if (!skb)
858                                 goto wait_for_memory;
859
860                         skb_entail(sk, tp, skb);
861                         copy = mss_now;
862                 }
863
864                 if (copy > size)
865                         copy = size;
866
867                 i = skb_shinfo(skb)->nr_frags;
868                 if (can_coalesce(skb, i, page, offset)) {
869                         skb_shinfo(skb)->frags[i - 1].size += copy;
870                 } else if (i < MAX_SKB_FRAGS) {
871                         get_page(page);
872                         fill_page_desc(skb, i, page, offset, copy);
873                 } else {
874                         tcp_mark_push(tp, skb);
875                         goto new_segment;
876                 }
877
878                 skb->len += copy;
879                 skb->data_len += copy;
880                 skb->ip_summed = CHECKSUM_HW;
881                 tp->write_seq += copy;
882                 TCP_SKB_CB(skb)->end_seq += copy;
883
884                 if (!copied)
885                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
886
887                 copied += copy;
888                 poffset += copy;
889                 if (!(psize -= copy))
890                         goto out;
891
892                 if (skb->len != mss_now || (flags & MSG_OOB))
893                         continue;
894
895                 if (forced_push(tp)) {
896                         tcp_mark_push(tp, skb);
897                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
898                 } else if (skb == tp->send_head)
899                         tcp_push_one(sk, mss_now);
900                 continue;
901
902 wait_for_sndbuf:
903                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
904 wait_for_memory:
905                 if (copied)
906                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
907
908                 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
909                         goto do_error;
910
911                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
912         }
913
914 out:
915         if (copied)
916                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
917         return copied;
918
919 do_error:
920         if (copied)
921                 goto out;
922 out_err:
923         return tcp_error(sk, flags, err);
924 }
925
926 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
927                      size_t size, int flags)
928 {
929         ssize_t res;
930         struct sock *sk = sock->sk;
931
932 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
933
934         if (!(sk->sk_route_caps & NETIF_F_SG) ||
935             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
936                 return sock_no_sendpage(sock, page, offset, size, flags);
937
938 #undef TCP_ZC_CSUM_FLAGS
939
940         lock_sock(sk);
941         TCP_CHECK_TIMER(sk);
942         res = do_tcp_sendpages(sk, &page, offset, size, flags);
943         TCP_CHECK_TIMER(sk);
944         release_sock(sk);
945         return res;
946 }
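
/*
 * Usage sketch: user space typically reaches this through sendfile(2) on a
 * TCP socket (hypothetical descriptors "sock_fd" and "file_fd"):
 *
 *      off_t off = 0;
 *      sendfile(sock_fd, file_fd, &off, count);
 *
 * The zero-copy path is only taken when the route's device supports
 * scatter-gather plus checksum offload; otherwise the sock_no_sendpage()
 * fallback above copies the data.
 */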
947
948 #define TCP_PAGE(sk)    (inet_sk(sk)->sndmsg_page)
949 #define TCP_OFF(sk)     (inet_sk(sk)->sndmsg_off)
950
951 static inline int tcp_copy_to_page(struct sock *sk, char __user *from,
952                                    struct sk_buff *skb, struct page *page,
953                                    int off, int copy)
954 {
955         int err = 0;
956         unsigned int csum;
957
958         if (skb->ip_summed == CHECKSUM_NONE) {
959                 csum = csum_and_copy_from_user(from, page_address(page) + off,
960                                        copy, 0, &err);
961                 if (err) return err;
962                 skb->csum = csum_block_add(skb->csum, csum, skb->len);
963         } else {
964                 if (copy_from_user(page_address(page) + off, from, copy))
965                         return -EFAULT;
966         }
967
968         skb->len += copy;
969         skb->data_len += copy;
970         skb->truesize += copy;
971         sk->sk_wmem_queued += copy;
972         sk->sk_forward_alloc -= copy;
973         return 0;
974 }
975
976 static inline int skb_add_data(struct sk_buff *skb, char __user *from, int copy)
977 {
978         int err = 0;
979         unsigned int csum;
980         int off = skb->len;
981
982         if (skb->ip_summed == CHECKSUM_NONE) {
983                 csum = csum_and_copy_from_user(from, skb_put(skb, copy),
984                                        copy, 0, &err);
985                 if (!err) {
986                         skb->csum = csum_block_add(skb->csum, csum, off);
987                         return 0;
988                 }
989         } else {
990                 if (!copy_from_user(skb_put(skb, copy), from, copy))
991                         return 0;
992         }
993
994         __skb_trim(skb, off);
995         return -EFAULT;
996 }
997
998 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
999 {
1000         int tmp = tp->mss_cache_std;
1001
1002         if (sk->sk_route_caps & NETIF_F_SG) {
1003                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1004
1005                 if (tmp >= pgbreak &&
1006                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1007                         tmp = pgbreak;
1008         }
1009         return tmp;
1010 }
1011
1012 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1013                 size_t size)
1014 {
1015         struct iovec *iov;
1016         struct tcp_opt *tp = tcp_sk(sk);
1017         struct sk_buff *skb;
1018         int iovlen, flags;
1019         int mss_now;
1020         int err, copied;
1021         long timeo;
1022
1023         lock_sock(sk);
1024         TCP_CHECK_TIMER(sk);
1025
1026         flags = msg->msg_flags;
1027         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1028
1029         /* Wait for a connection to finish. */
1030         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1031                 if ((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1032                         goto out_err;
1033
1034         /* This should be in poll */
1035         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1036
1037         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1038
1039         /* Ok commence sending. */
1040         iovlen = msg->msg_iovlen;
1041         iov = msg->msg_iov;
1042         copied = 0;
1043
1044         err = -EPIPE;
1045         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1046                 goto do_error;
1047
1048         while (--iovlen >= 0) {
1049                 int seglen = iov->iov_len;
1050                 unsigned char __user *from = iov->iov_base;
1051
1052                 iov++;
1053
1054                 while (seglen > 0) {
1055                         int copy;
1056
1057                         skb = sk->sk_write_queue.prev;
1058
1059                         if (!tp->send_head ||
1060                             (copy = mss_now - skb->len) <= 0) {
1061
1062 new_segment:
1063                                 /* Allocate new segment. If the interface is SG,
1064                                  * allocate skb fitting to single page.
1065                                  */
1066                                 if (!tcp_memory_free(sk))
1067                                         goto wait_for_sndbuf;
1068
1069                                 skb = tcp_alloc_pskb(sk, select_size(sk, tp),
1070                                                      0, sk->sk_allocation);
1071                                 if (!skb)
1072                                         goto wait_for_memory;
1073
1074                                 /*
1075                                  * Check whether we can use HW checksum.
1076                                  */
1077                                 if (sk->sk_route_caps &
1078                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
1079                                      NETIF_F_HW_CSUM))
1080                                         skb->ip_summed = CHECKSUM_HW;
1081
1082                                 skb_entail(sk, tp, skb);
1083                                 copy = mss_now;
1084                         }
1085
1086                         /* Try to append data to the end of skb. */
1087                         if (copy > seglen)
1088                                 copy = seglen;
1089
1090                         /* Where to copy to? */
1091                         if (skb_tailroom(skb) > 0) {
1092                                 /* We have some space in skb head. Superb! */
1093                                 if (copy > skb_tailroom(skb))
1094                                         copy = skb_tailroom(skb);
1095                                 if ((err = skb_add_data(skb, from, copy)) != 0)
1096                                         goto do_fault;
1097                         } else {
1098                                 int merge = 0;
1099                                 int i = skb_shinfo(skb)->nr_frags;
1100                                 struct page *page = TCP_PAGE(sk);
1101                                 int off = TCP_OFF(sk);
1102
1103                                 if (can_coalesce(skb, i, page, off) &&
1104                                     off != PAGE_SIZE) {
1105                                         /* We can extend the last page
1106                                          * fragment. */
1107                                         merge = 1;
1108                                 } else if (i == MAX_SKB_FRAGS ||
1109                                            (!i &&
1110                                            !(sk->sk_route_caps & NETIF_F_SG))) {
1111                                         /* Need to add new fragment and cannot
1112                                          * do this because interface is non-SG,
1113                                          * or because all the page slots are
1114                                          * busy. */
1115                                         tcp_mark_push(tp, skb);
1116                                         goto new_segment;
1117                                 } else if (page) {
1118                                         /* If page is cached, align
1119                                          * offset to L1 cache boundary
1120                                          */
1121                                         off = (off + L1_CACHE_BYTES - 1) &
1122                                               ~(L1_CACHE_BYTES - 1);
1123                                         if (off == PAGE_SIZE) {
1124                                                 put_page(page);
1125                                                 TCP_PAGE(sk) = page = NULL;
1126                                         }
1127                                 }
1128
1129                                 if (!page) {
1130                                         /* Allocate new cache page. */
1131                                         if (!(page = tcp_alloc_page(sk)))
1132                                                 goto wait_for_memory;
1133                                         off = 0;
1134                                 }
1135
1136                                 if (copy > PAGE_SIZE - off)
1137                                         copy = PAGE_SIZE - off;
1138
1139                                 /* Time to copy data. We are close to
1140                                  * the end! */
1141                                 err = tcp_copy_to_page(sk, from, skb, page,
1142                                                        off, copy);
1143                                 if (err) {
1144                                         /* If this page was new, give it to the
1145                                          * socket so it does not get leaked.
1146                                          */
1147                                         if (!TCP_PAGE(sk)) {
1148                                                 TCP_PAGE(sk) = page;
1149                                                 TCP_OFF(sk) = 0;
1150                                         }
1151                                         goto do_error;
1152                                 }
1153
1154                                 /* Update the skb. */
1155                                 if (merge) {
1156                                         skb_shinfo(skb)->frags[i - 1].size +=
1157                                                                         copy;
1158                                 } else {
1159                                         fill_page_desc(skb, i, page, off, copy);
1160                                         if (TCP_PAGE(sk)) {
1161                                                 get_page(page);
1162                                         } else if (off + copy < PAGE_SIZE) {
1163                                                 get_page(page);
1164                                                 TCP_PAGE(sk) = page;
1165                                         }
1166                                 }
1167
1168                                 TCP_OFF(sk) = off + copy;
1169                         }
1170
1171                         if (!copied)
1172                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1173
1174                         tp->write_seq += copy;
1175                         TCP_SKB_CB(skb)->end_seq += copy;
1176
1177                         from += copy;
1178                         copied += copy;
1179                         if ((seglen -= copy) == 0 && iovlen == 0)
1180                                 goto out;
1181
1182                         if (skb->len != mss_now || (flags & MSG_OOB))
1183                                 continue;
1184
1185                         if (forced_push(tp)) {
1186                                 tcp_mark_push(tp, skb);
1187                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1188                         } else if (skb == tp->send_head)
1189                                 tcp_push_one(sk, mss_now);
1190                         continue;
1191
1192 wait_for_sndbuf:
1193                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1194 wait_for_memory:
1195                         if (copied)
1196                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1197
1198                         if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1199                                 goto do_error;
1200
1201                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1202                 }
1203         }
1204
1205 out:
1206         if (copied)
1207                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1208         TCP_CHECK_TIMER(sk);
1209         release_sock(sk);
1210         return copied;
1211
1212 do_fault:
1213         if (!skb->len) {
1214                 if (tp->send_head == skb)
1215                         tp->send_head = NULL;
1216                 __skb_unlink(skb, skb->list);
1217                 tcp_free_skb(sk, skb);
1218         }
1219
1220 do_error:
1221         if (copied)
1222                 goto out;
1223 out_err:
1224         err = tcp_error(sk, flags, err);
1225         TCP_CHECK_TIMER(sk);
1226         release_sock(sk);
1227         return err;
1228 }
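
/*
 * Usage sketch (user space, hypothetical buffers): MSG_MORE tells the path
 * above to keep the tail segment corked, so a header and body written
 * separately can still leave the host as full-sized segments:
 *
 *      send(fd, hdr, hdr_len, MSG_MORE);
 *      send(fd, body, body_len, 0);
 *
 * In tcp_push() the MSG_MORE flag is treated as TCP_NAGLE_CORK, and the
 * final call without it lets the pending data drain.
 */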
1229
1230 /*
1231  *      Handle reading urgent data. BSD has very simple semantics for
1232  *      this, no blocking and very strange errors 8)
1233  */
1234
1235 static int tcp_recv_urg(struct sock *sk, long timeo,
1236                         struct msghdr *msg, int len, int flags,
1237                         int *addr_len)
1238 {
1239         struct tcp_opt *tp = tcp_sk(sk);
1240
1241         /* No URG data to read. */
1242         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1243             tp->urg_data == TCP_URG_READ)
1244                 return -EINVAL; /* Yes this is right ! */
1245
1246         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1247                 return -ENOTCONN;
1248
1249         if (tp->urg_data & TCP_URG_VALID) {
1250                 int err = 0;
1251                 char c = tp->urg_data;
1252
1253                 if (!(flags & MSG_PEEK))
1254                         tp->urg_data = TCP_URG_READ;
1255
1256                 /* Read urgent data. */
1257                 msg->msg_flags |= MSG_OOB;
1258
1259                 if (len > 0) {
1260                         if (!(flags & MSG_TRUNC))
1261                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1262                         len = 1;
1263                 } else
1264                         msg->msg_flags |= MSG_TRUNC;
1265
1266                 return err ? -EFAULT : len;
1267         }
1268
1269         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1270                 return 0;
1271
1272         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1273          * the available implementations agree in this case:
1274          * this call should never block, independent of the
1275          * blocking state of the socket.
1276          * Mike <pall@rz.uni-karlsruhe.de>
1277          */
1278         return -EAGAIN;
1279 }
1280
1281 /* Clean up the receive buffer for full frames taken by the user,
1282  * then send an ACK if necessary.  COPIED is the number of bytes
1283  * tcp_recvmsg has given to the user so far, it speeds up the
1284  * calculation of whether or not we must ACK for the sake of
1285  * a window update.
1286  */
1287 void cleanup_rbuf(struct sock *sk, int copied)
1288 {
1289         struct tcp_opt *tp = tcp_sk(sk);
1290         int time_to_ack = 0;
1291
1292 #if TCP_DEBUG
1293         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1294
1295         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1296 #endif
1297
1298         if (tcp_ack_scheduled(tp)) {
1299                 /* Delayed ACKs frequently hit locked sockets during bulk
1300                  * receive. */
1301                 if (tp->ack.blocked ||
1302                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1303                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1304                     /*
1305                      * If this read emptied the read buffer, we send an ACK if
1306                      * the connection is not bidirectional, the user drained the
1307                      * receive buffer, and there was a small segment left
1308                      * in the queue.
1309                      */
1310                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1311                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1312                         time_to_ack = 1;
1313         }
1314
1315         /* We send an ACK if we can now advertise a non-zero window
1316          * which has been raised "significantly".
1317          *
1318          * Even if the window is raised up to infinity, do not send a window-open
1319          * ACK in states where we will not receive any more data. It is useless.
1320          */
1321         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1322                 __u32 rcv_window_now = tcp_receive_window(tp);
1323
1324                 /* Optimize, __tcp_select_window() is not cheap. */
1325                 if (2*rcv_window_now <= tp->window_clamp) {
1326                         __u32 new_window = __tcp_select_window(sk);
1327
1328                         /* Send an ACK now if this read freed lots of space
1329                          * in our buffer. We can advertise the new window now,
1330                          * provided it is not smaller than the current one.
1331                          * "Lots" means "at least twice" here.
1332                          */
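                        /* Worked example (illustrative numbers, not from the
                         * source): with window_clamp at 64K and rcv_window_now
                         * at 16K the cheap test above passes, and the ACK is
                         * sent only if __tcp_select_window() can now offer at
                         * least 32K.
                         */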
1333                         if (new_window && new_window >= 2 * rcv_window_now)
1334                                 time_to_ack = 1;
1335                 }
1336         }
1337         if (time_to_ack)
1338                 tcp_send_ack(sk);
1339 }
1340
1341 static void tcp_prequeue_process(struct sock *sk)
1342 {
1343         struct sk_buff *skb;
1344         struct tcp_opt *tp = tcp_sk(sk);
1345
1346         NET_ADD_STATS_USER(TCPPrequeued, skb_queue_len(&tp->ucopy.prequeue));
1347
1348         /* The RX process wants to run with BHs disabled, though it is not
1349          * strictly necessary here. */
1350         local_bh_disable();
1351         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1352                 sk->sk_backlog_rcv(sk, skb);
1353         local_bh_enable();
1354
1355         /* Clear memory counter. */
1356         tp->ucopy.memory = 0;
1357 }
1358
1359 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1360 {
1361         struct sk_buff *skb;
1362         u32 offset;
1363
1364         skb_queue_walk(&sk->sk_receive_queue, skb) {
1365                 offset = seq - TCP_SKB_CB(skb)->seq;
1366                 if (skb->h.th->syn)
1367                         offset--;
1368                 if (offset < skb->len || skb->h.th->fin) {
1369                         *off = offset;
1370                         return skb;
1371                 }
1372         }
1373         return NULL;
1374 }
1375
1376 /*
1377  * This routine provides an alternative to tcp_recvmsg() for routines
1378  * that would like to handle copying from skbuffs directly in 'sendfile'
1379  * fashion.
1380  * Note:
1381  *      - It is assumed that the socket was locked by the caller.
1382  *      - The routine does not block.
1383  *      - At present, there is no support for reading OOB data
1384  *        or for 'peeking' the socket using this routine
1385  *        (although both would be easy to implement).
1386  */
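/*
 * A rough usage sketch (illustrative only; the actor below is hypothetical
 * and not defined in this file):
 *
 *      static int my_actor(read_descriptor_t *desc, struct sk_buff *skb,
 *                          unsigned int offset, size_t len)
 *      {
 *              size_t used = len < desc->count ? len : desc->count;
 *              ... consume 'used' bytes of skb data starting at 'offset' ...
 *              desc->count -= used;
 *              return used;
 *      }
 *
 *      desc.count = bytes_wanted;
 *      copied = tcp_read_sock(sk, &desc, my_actor);
 */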
1387 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1388                   sk_read_actor_t recv_actor)
1389 {
1390         struct sk_buff *skb;
1391         struct tcp_opt *tp = tcp_sk(sk);
1392         u32 seq = tp->copied_seq;
1393         u32 offset;
1394         int copied = 0;
1395
1396         if (sk->sk_state == TCP_LISTEN)
1397                 return -ENOTCONN;
1398         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1399                 if (offset < skb->len) {
1400                         size_t used, len;
1401
1402                         len = skb->len - offset;
1403                         /* Stop reading if we hit a patch of urgent data */
1404                         if (tp->urg_data) {
1405                                 u32 urg_offset = tp->urg_seq - seq;
1406                                 if (urg_offset < len)
1407                                         len = urg_offset;
1408                                 if (!len)
1409                                         break;
1410                         }
1411                         used = recv_actor(desc, skb, offset, len);
1412                         if (used <= len) {
1413                                 seq += used;
1414                                 copied += used;
1415                                 offset += used;
1416                         }
1417                         if (offset != skb->len)
1418                                 break;
1419                 }
1420                 if (skb->h.th->fin) {
1421                         sk_eat_skb(sk, skb);
1422                         ++seq;
1423                         break;
1424                 }
1425                 sk_eat_skb(sk, skb);
1426                 if (!desc->count)
1427                         break;
1428         }
1429         tp->copied_seq = seq;
1430
1431         tcp_rcv_space_adjust(sk);
1432
1433         /* Clean up data we have read: This will do ACK frames. */
1434         if (copied)
1435                 cleanup_rbuf(sk, copied);
1436         return copied;
1437 }
1438
1439 /*
1440  *      This routine copies from a sock struct into the user buffer.
1441  *
1442  *      Technical note: since 2.3 we work on a _locked_ socket, so that
1443  *      tricks with *seq access order and skb->users are not required.
1444  *      The code can probably be improved even further.
1445  */
1446
1447 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1448                 size_t len, int nonblock, int flags, int *addr_len)
1449 {
1450         struct tcp_opt *tp = tcp_sk(sk);
1451         int copied = 0;
1452         u32 peek_seq;
1453         u32 *seq;
1454         unsigned long used;
1455         int err;
1456         int target;             /* Read at least this many bytes */
1457         long timeo;
1458         struct task_struct *user_recv = NULL;
1459
1460         lock_sock(sk);
1461
1462         TCP_CHECK_TIMER(sk);
1463
1464         err = -ENOTCONN;
1465         if (sk->sk_state == TCP_LISTEN)
1466                 goto out;
1467
1468         timeo = sock_rcvtimeo(sk, nonblock);
1469
1470         /* Urgent data needs to be handled specially. */
1471         if (flags & MSG_OOB)
1472                 goto recv_urg;
1473
1474         seq = &tp->copied_seq;
1475         if (flags & MSG_PEEK) {
1476                 peek_seq = tp->copied_seq;
1477                 seq = &peek_seq;
1478         }
1479
1480         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1481
1482         do {
1483                 struct sk_buff *skb;
1484                 u32 offset;
1485
1486                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1487                 if (tp->urg_data && tp->urg_seq == *seq) {
1488                         if (copied)
1489                                 break;
1490                         if (signal_pending(current)) {
1491                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1492                                 break;
1493                         }
1494                 }
1495
1496                 /* Next get a buffer. */
1497
1498                 skb = skb_peek(&sk->sk_receive_queue);
1499                 do {
1500                         if (!skb)
1501                                 break;
1502
1503                         /* Now that we have two receive queues this
1504                          * shouldn't happen.
1505                          */
1506                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1507                                 printk(KERN_INFO "recvmsg bug: copied %X "
1508                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1509                                 break;
1510                         }
1511                         offset = *seq - TCP_SKB_CB(skb)->seq;
1512                         if (skb->h.th->syn)
1513                                 offset--;
1514                         if (offset < skb->len)
1515                                 goto found_ok_skb;
1516                         if (skb->h.th->fin)
1517                                 goto found_fin_ok;
1518                         BUG_TRAP(flags & MSG_PEEK);
1519                         skb = skb->next;
1520                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1521
1522                 /* Well, if we have backlog, try to process it now. */
1523
1524                 if (copied >= target && !sk->sk_backlog.tail)
1525                         break;
1526
1527                 if (copied) {
1528                         if (sk->sk_err ||
1529                             sk->sk_state == TCP_CLOSE ||
1530                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1531                             !timeo ||
1532                             signal_pending(current) ||
1533                             (flags & MSG_PEEK))
1534                                 break;
1535                 } else {
1536                         if (sock_flag(sk, SOCK_DONE))
1537                                 break;
1538
1539                         if (sk->sk_err) {
1540                                 copied = sock_error(sk);
1541                                 break;
1542                         }
1543
1544                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1545                                 break;
1546
1547                         if (sk->sk_state == TCP_CLOSE) {
1548                                 if (!sock_flag(sk, SOCK_DONE)) {
1549                                         /* This occurs when the user tries to read
1550                                          * from a never-connected socket.
1551                                          */
1552                                         copied = -ENOTCONN;
1553                                         break;
1554                                 }
1555                                 break;
1556                         }
1557
1558                         if (!timeo) {
1559                                 copied = -EAGAIN;
1560                                 break;
1561                         }
1562
1563                         if (signal_pending(current)) {
1564                                 copied = sock_intr_errno(timeo);
1565                                 break;
1566                         }
1567                 }
1568
1569                 cleanup_rbuf(sk, copied);
1570
1571                 if (tp->ucopy.task == user_recv) {
1572                         /* Install new reader */
1573                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1574                                 user_recv = current;
1575                                 tp->ucopy.task = user_recv;
1576                                 tp->ucopy.iov = msg->msg_iov;
1577                         }
1578
1579                         tp->ucopy.len = len;
1580
1581                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1582                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1583
1584                         /* Ugly... If the prequeue is not empty, we have to
1585                          * process it before releasing the socket, otherwise
1586                          * ordering will be broken on the second iteration.
1587                          * A more elegant solution is required!!!
1588                          *
1589                          * Look: we have the following (pseudo)queues:
1590                          *
1591                          * 1. packets in flight
1592                          * 2. backlog
1593                          * 3. prequeue
1594                          * 4. receive_queue
1595                          *
1596                          * Each queue can be processed only if the next ones
1597                          * are empty. At this point we have an empty receive_queue.
1598                          * But the prequeue _can_ be non-empty after the 2nd iteration,
1599                          * when we jumped to the start of the loop because backlog
1600                          * processing added something to the receive_queue.
1601                          * We cannot release_sock(), because the backlog contains
1602                          * packets that arrived _after_ the prequeued ones.
1603                          *
1604                          * In short, the algorithm is clear: process all
1605                          * the queues in order. We could do it more directly,
1606                          * requeueing packets from the backlog to the prequeue when
1607                          * it is not empty. That is more elegant, but eats cycles,
1608                          * unfortunately.
1609                          */
1610                         if (skb_queue_len(&tp->ucopy.prequeue))
1611                                 goto do_prequeue;
1612
1613                         /* __ Set realtime policy in scheduler __ */
1614                 }
1615
1616                 if (copied >= target) {
1617                         /* Do not sleep, just process backlog. */
1618                         release_sock(sk);
1619                         lock_sock(sk);
1620                 } else
1621                         sk_wait_data(sk, &timeo);
1622
1623                 if (user_recv) {
1624                         int chunk;
1625
1626                         /* __ Restore normal policy in scheduler __ */
1627
1628                         if ((chunk = len - tp->ucopy.len) != 0) {
1629                                 NET_ADD_STATS_USER(TCPDirectCopyFromBacklog, chunk);
1630                                 len -= chunk;
1631                                 copied += chunk;
1632                         }
1633
1634                         if (tp->rcv_nxt == tp->copied_seq &&
1635                             skb_queue_len(&tp->ucopy.prequeue)) {
1636 do_prequeue:
1637                                 tcp_prequeue_process(sk);
1638
1639                                 if ((chunk = len - tp->ucopy.len) != 0) {
1640                                         NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1641                                         len -= chunk;
1642                                         copied += chunk;
1643                                 }
1644                         }
1645                 }
1646                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1647                         if (net_ratelimit())
1648                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1649                                        current->comm, current->pid);
1650                         peek_seq = tp->copied_seq;
1651                 }
1652                 continue;
1653
1654         found_ok_skb:
1655                 /* Ok so how much can we use? */
1656                 used = skb->len - offset;
1657                 if (len < used)
1658                         used = len;
1659
1660                 /* Do we have urgent data here? */
1661                 if (tp->urg_data) {
1662                         u32 urg_offset = tp->urg_seq - *seq;
1663                         if (urg_offset < used) {
1664                                 if (!urg_offset) {
1665                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1666                                                 ++*seq;
1667                                                 offset++;
1668                                                 used--;
1669                                                 if (!used)
1670                                                         goto skip_copy;
1671                                         }
1672                                 } else
1673                                         used = urg_offset;
1674                         }
1675                 }
1676
1677                 if (!(flags & MSG_TRUNC)) {
1678                         err = skb_copy_datagram_iovec(skb, offset,
1679                                                       msg->msg_iov, used);
1680                         if (err) {
1681                                 /* Exception. Bailout! */
1682                                 if (!copied)
1683                                         copied = -EFAULT;
1684                                 break;
1685                         }
1686                 }
1687
1688                 *seq += used;
1689                 copied += used;
1690                 len -= used;
1691
1692                 tcp_rcv_space_adjust(sk);
1693
1694 skip_copy:
1695                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1696                         tp->urg_data = 0;
1697                         tcp_fast_path_check(sk, tp);
1698                 }
1699                 if (used + offset < skb->len)
1700                         continue;
1701
1702                 if (skb->h.th->fin)
1703                         goto found_fin_ok;
1704                 if (!(flags & MSG_PEEK))
1705                         sk_eat_skb(sk, skb);
1706                 continue;
1707
1708         found_fin_ok:
1709                 /* Process the FIN. */
1710                 ++*seq;
1711                 if (!(flags & MSG_PEEK))
1712                         sk_eat_skb(sk, skb);
1713                 break;
1714         } while (len > 0);
1715
1716         if (user_recv) {
1717                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1718                         int chunk;
1719
1720                         tp->ucopy.len = copied > 0 ? len : 0;
1721
1722                         tcp_prequeue_process(sk);
1723
1724                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1725                                 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1726                                 len -= chunk;
1727                                 copied += chunk;
1728                         }
1729                 }
1730
1731                 tp->ucopy.task = NULL;
1732                 tp->ucopy.len = 0;
1733         }
1734
1735         /* According to UNIX98, msg_name/msg_namelen are ignored
1736          * on a connected socket. I was just happy when I found this 8) --ANK
1737          */
1738
1739         /* Clean up data we have read: This will do ACK frames. */
1740         cleanup_rbuf(sk, copied);
1741
1742         TCP_CHECK_TIMER(sk);
1743         release_sock(sk);
1744         return copied;
1745
1746 out:
1747         TCP_CHECK_TIMER(sk);
1748         release_sock(sk);
1749         return err;
1750
1751 recv_urg:
1752         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1753         goto out;
1754 }
1755
1756 /*
1757  *      State processing on a close. This implements the state shift for
1758  *      sending our FIN frame. Note that we only send a FIN for some
1759  *      states. A shutdown() may have already sent the FIN, or we may be
1760  *      closed.
1761  */
1762
1763 static unsigned char new_state[16] = {
1764   /* current state:        new state:      action:      */
1765   /* (Invalid)          */ TCP_CLOSE,
1766   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1767   /* TCP_SYN_SENT       */ TCP_CLOSE,
1768   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1769   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1770   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1771   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1772   /* TCP_CLOSE          */ TCP_CLOSE,
1773   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1774   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1775   /* TCP_LISTEN         */ TCP_CLOSE,
1776   /* TCP_CLOSING        */ TCP_CLOSING,
1777 };
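/* Reading the table above (an illustrative walk-through): closing an
 * ESTABLISHED socket moves it to FIN_WAIT1 and sets TCP_ACTION_FIN so the
 * caller emits a FIN, while closing a socket still in SYN_SENT simply
 * drops it straight to CLOSE with no FIN at all.
 */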
1778
1779 static int tcp_close_state(struct sock *sk)
1780 {
1781         int next = (int)new_state[sk->sk_state];
1782         int ns = next & TCP_STATE_MASK;
1783
1784         tcp_set_state(sk, ns);
1785
1786         return next & TCP_ACTION_FIN;
1787 }
1788
1789 /*
1790  *      Shutdown the sending side of a connection. Much like close except
1791  *      that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1792  */
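/* Userspace perspective (my reading, illustrative only): shutdown(fd, SHUT_WR)
 * or SHUT_RDWR arrives here with the SEND_SHUTDOWN bit set and may queue a
 * FIN; a receive-only shutdown fails the SEND_SHUTDOWN test below and the
 * function does nothing.
 */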
1793
1794 void tcp_shutdown(struct sock *sk, int how)
1795 {
1796         /*      We need to grab some memory, and put together a FIN,
1797          *      and then put it into the queue to be sent.
1798          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1799          */
1800         if (!(how & SEND_SHUTDOWN))
1801                 return;
1802
1803         /* If we've already sent a FIN, or it's a closed state, skip this. */
1804         if ((1 << sk->sk_state) &
1805             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1806              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1807                 /* Clear out any half completed packets.  FIN if needed. */
1808                 if (tcp_close_state(sk))
1809                         tcp_send_fin(sk);
1810         }
1811 }
1812
1813
1814 /*
1815  *      Return 1 if we still have things to send in our buffers.
1816  */
1817
1818 static inline int closing(struct sock *sk)
1819 {
1820         return (1 << sk->sk_state) &
1821                (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
1822 }
1823
1824 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1825 {
1826         /* First the read buffer. */
1827         __skb_queue_purge(&sk->sk_receive_queue);
1828
1829         /* Next, the error queue. */
1830         __skb_queue_purge(&sk->sk_error_queue);
1831
1832         /* Next, the write queue. */
1833         BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
1834
1835         /* Account for returned memory. */
1836         tcp_mem_reclaim(sk);
1837
1838         BUG_TRAP(!sk->sk_wmem_queued);
1839         BUG_TRAP(!sk->sk_forward_alloc);
1840
1841         /* It is _impossible_ for the backlog to contain anything
1842          * when we get here.  All user references to this socket
1843          * have gone away; only the net layer can touch it now.
1844          */
1845 }
1846
1847 /*
1848  * At this point, there should be no process reference to this
1849  * socket, and thus no user references at all.  Therefore we
1850  * can assume the socket waitqueue is inactive and nobody will
1851  * try to jump onto it.
1852  */
1853 void tcp_destroy_sock(struct sock *sk)
1854 {
1855         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1856         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1857
1858         /* It cannot be in hash table! */
1859         BUG_TRAP(sk_unhashed(sk));
1860
1861         /* If inet_sk(sk)->num is non-zero, it must be bound. */
1862         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1863
1864 #ifdef TCP_DEBUG
1865         if (sk->sk_zapped) {
1866                 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1867                 sock_hold(sk);
1868         }
1869         sk->sk_zapped = 1;
1870 #endif
1871
1872         sk->sk_prot->destroy(sk);
1873
1874         tcp_kill_sk_queues(sk);
1875
1876         xfrm_sk_free_policy(sk);
1877
1878 #ifdef INET_REFCNT_DEBUG
1879         if (atomic_read(&sk->sk_refcnt) != 1) {
1880                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1881                        sk, atomic_read(&sk->sk_refcnt));
1882         }
1883 #endif
1884
1885         atomic_dec(&tcp_orphan_count);
1886         sock_put(sk);
1887 }
1888
1889 void tcp_close(struct sock *sk, long timeout)
1890 {
1891         struct sk_buff *skb;
1892         int data_was_unread = 0;
1893
1894         lock_sock(sk);
1895         sk->sk_shutdown = SHUTDOWN_MASK;
1896
1897         if (sk->sk_state == TCP_LISTEN) {
1898                 tcp_set_state(sk, TCP_CLOSE);
1899
1900                 /* Special case. */
1901                 tcp_listen_stop(sk);
1902
1903                 goto adjudge_to_death;
1904         }
1905
1906         /*  We need to flush the recv. buffs.  We do this only on the
1907          *  descriptor close, not protocol-sourced closes, because the
1908          *  reader process may not have drained the data yet!
1909          */
1910         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1911                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1912                           skb->h.th->fin;
1913                 data_was_unread += len;
1914                 __kfree_skb(skb);
1915         }
1916
1917         tcp_mem_reclaim(sk);
1918
1919         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1920          * 3.10, we send a RST here because data was lost.  To
1921          * witness the awful effects of the old behavior of always
1922          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1923          * a bulk GET in an FTP client, suspend the process, wait
1924          * for the client to advertise a zero window, then kill -9
1925          * the FTP client, wheee...  Note: timeout is always zero
1926          * in such a case.
1927          */
1928         if (data_was_unread) {
1929                 /* Unread data was tossed, zap the connection. */
1930                 NET_INC_STATS_USER(TCPAbortOnClose);
1931                 tcp_set_state(sk, TCP_CLOSE);
1932                 tcp_send_active_reset(sk, GFP_KERNEL);
1933         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1934                 /* Check zero linger _after_ checking for unread data. */
1935                 sk->sk_prot->disconnect(sk, 0);
1936                 NET_INC_STATS_USER(TCPAbortOnData);
1937         } else if (tcp_close_state(sk)) {
1938                 /* We FIN if the application ate all the data before
1939                  * zapping the connection.
1940                  */
1941
1942                 /* RED-PEN. Formally speaking, we have broken TCP state
1943                  * machine. State transitions:
1944                  *
1945                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1946                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1947                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1948                  *
1949                  * are legal only when a FIN has been sent (i.e. in window),
1950                  * rather than queued out of window. Purists will object.
1951                  *
1952                  * E.g. the "RFC state" is ESTABLISHED
1953                  * if the Linux state is FIN-WAIT-1 but the FIN has not been sent yet.
1954                  *
1955                  * The visible deviations are that sometimes we enter the
1956                  * time-wait state when it is not really required (harmless),
1957                  * and do not send active resets when the specs require them
1958                  * (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when they look like
1959                  * CLOSING or LAST_ACK to Linux).
1960                  * I have probably missed a few more such loopholes.
1961                  *                                              --ANK
1962                  */
1963                 tcp_send_fin(sk);
1964         }
1965
1966         if (timeout) {
1967                 struct task_struct *tsk = current;
1968                 DEFINE_WAIT(wait);
1969
1970                 do {
1971                         prepare_to_wait(sk->sk_sleep, &wait,
1972                                         TASK_INTERRUPTIBLE);
1973                         if (!closing(sk))
1974                                 break;
1975                         release_sock(sk);
1976                         timeout = schedule_timeout(timeout);
1977                         lock_sock(sk);
1978                 } while (!signal_pending(tsk) && timeout);
1979
1980                 finish_wait(sk->sk_sleep, &wait);
1981         }
1982
1983 adjudge_to_death:
1984         /* It is the last release_sock in its life. It will remove backlog. */
1985         release_sock(sk);
1986
1987
1988         /* Now the socket is owned by the kernel and we acquire the BH lock
1989          * to finish the close. No need to check for user refs.
1990          */
1991         local_bh_disable();
1992         bh_lock_sock(sk);
1993         BUG_TRAP(!sock_owned_by_user(sk));
1994
1995         sock_hold(sk);
1996         sock_orphan(sk);
1997
1998         /*      This is a (useful) BSD violation of the RFC. There is a
1999          *      problem with TCP as specified, in that the other end could
2000          *      keep a socket open forever with no application left on this end.
2001          *      We use a 3 minute timeout (about the same as BSD) and then kill
2002          *      our end. If they send after that then tough - BUT it is long
2003          *      enough that we won't repeat the old "4*rto = almost no time -
2004          *      whoops, reset" mistake.
2005          *
2006          *      Nope, it was not a mistake. It is really the desired behaviour,
2007          *      e.g. on HTTP servers, where such sockets are useless but
2008          *      consume significant resources. Let's do it with the special
2009          *      linger2 option.                                 --ANK
2010          */
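        /* Tunable from userspace (illustrative note, not in the original
         * comment): a negative TCP_LINGER2 socket option makes the branch
         * below reset the connection immediately instead of lingering in
         * FIN-WAIT-2; see tcp_setsockopt() further down.
         */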
2011
2012         if (sk->sk_state == TCP_FIN_WAIT2) {
2013                 struct tcp_opt *tp = tcp_sk(sk);
2014                 if (tp->linger2 < 0) {
2015                         tcp_set_state(sk, TCP_CLOSE);
2016                         tcp_send_active_reset(sk, GFP_ATOMIC);
2017                         NET_INC_STATS_BH(TCPAbortOnLinger);
2018                 } else {
2019                         int tmo = tcp_fin_time(tp);
2020
2021                         if (tmo > TCP_TIMEWAIT_LEN) {
2022                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2023                         } else {
2024                                 atomic_inc(&tcp_orphan_count);
2025                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2026                                 goto out;
2027                         }
2028                 }
2029         }
2030         if (sk->sk_state != TCP_CLOSE) {
2031                 tcp_mem_reclaim(sk);
2032                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2033                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
2034                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2035                         if (net_ratelimit())
2036                                 printk(KERN_INFO "TCP: too many orphaned "
2037                                        "sockets\n");
2038                         tcp_set_state(sk, TCP_CLOSE);
2039                         tcp_send_active_reset(sk, GFP_ATOMIC);
2040                         NET_INC_STATS_BH(TCPAbortOnMemory);
2041                 }
2042         }
2043         atomic_inc(&tcp_orphan_count);
2044
2045         if (sk->sk_state == TCP_CLOSE)
2046                 tcp_destroy_sock(sk);
2047         /* Otherwise, socket is reprieved until protocol close. */
2048
2049 out:
2050         bh_unlock_sock(sk);
2051         local_bh_enable();
2052         sock_put(sk);
2053 }
2054
2055 /* These states need RST on ABORT according to RFC793 */
2056
2057 static inline int tcp_need_reset(int state)
2058 {
2059         return (1 << state) &
2060                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2061                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2062 }
2063
2064 int tcp_disconnect(struct sock *sk, int flags)
2065 {
2066         struct inet_opt *inet = inet_sk(sk);
2067         struct tcp_opt *tp = tcp_sk(sk);
2068         int err = 0;
2069         int old_state = sk->sk_state;
2070
2071         if (old_state != TCP_CLOSE)
2072                 tcp_set_state(sk, TCP_CLOSE);
2073
2074         /* ABORT function of RFC793 */
2075         if (old_state == TCP_LISTEN) {
2076                 tcp_listen_stop(sk);
2077         } else if (tcp_need_reset(old_state) ||
2078                    (tp->snd_nxt != tp->write_seq &&
2079                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2080                 /* The last check adjusts for the discrepancy between Linux and
2081                  * the RFC states.
2082                  */
2083                 tcp_send_active_reset(sk, gfp_any());
2084                 sk->sk_err = ECONNRESET;
2085         } else if (old_state == TCP_SYN_SENT)
2086                 sk->sk_err = ECONNRESET;
2087
2088         tcp_clear_xmit_timers(sk);
2089         __skb_queue_purge(&sk->sk_receive_queue);
2090         tcp_writequeue_purge(sk);
2091         __skb_queue_purge(&tp->out_of_order_queue);
2092
2093         inet->dport = 0;
2094
2095         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2096                 inet_reset_saddr(sk);
2097
2098         sk->sk_shutdown = 0;
2099         sock_reset_flag(sk, SOCK_DONE);
2100         tp->srtt = 0;
2101         if ((tp->write_seq += tp->max_window + 2) == 0)
2102                 tp->write_seq = 1;
2103         tp->backoff = 0;
2104         tp->snd_cwnd = 2;
2105         tp->probes_out = 0;
2106         tp->packets_out = 0;
2107         tp->snd_ssthresh = 0x7fffffff;
2108         tp->snd_cwnd_cnt = 0;
2109         tcp_set_ca_state(tp, TCP_CA_Open);
2110         tcp_clear_retrans(tp);
2111         tcp_delack_init(tp);
2112         tp->send_head = NULL;
2113         tp->saw_tstamp = 0;
2114         tcp_sack_reset(tp);
2115         __sk_dst_reset(sk);
2116
2117         BUG_TRAP(!inet->num || tp->bind_hash);
2118
2119         sk->sk_error_report(sk);
2120         return err;
2121 }
2122
2123 /*
2124  *      Wait for an incoming connection, avoid race
2125  *      conditions. This must be called with the socket locked.
2126  */
2127 static int wait_for_connect(struct sock *sk, long timeo)
2128 {
2129         struct tcp_opt *tp = tcp_sk(sk);
2130         DEFINE_WAIT(wait);
2131         int err;
2132
2133         /*
2134          * True wake-one mechanism for incoming connections: only
2135          * one process gets woken up, not the 'whole herd'.
2136          * Since we do not 'race & poll' for established sockets
2137          * anymore, the common case will execute the loop only once.
2138          *
2139          * Subtle issue: "add_wait_queue_exclusive()" will be added
2140          * after any current non-exclusive waiters, and we know that
2141          * it will always _stay_ after any new non-exclusive waiters
2142          * because all non-exclusive waiters are added at the
2143          * beginning of the wait-queue. As such, it's ok to "drop"
2144          * our exclusiveness temporarily when we get woken up without
2145          * having to remove and re-insert us on the wait queue.
2146          */
2147         for (;;) {
2148                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
2149                                           TASK_INTERRUPTIBLE);
2150                 release_sock(sk);
2151                 if (!tp->accept_queue)
2152                         timeo = schedule_timeout(timeo);
2153                 lock_sock(sk);
2154                 err = 0;
2155                 if (tp->accept_queue)
2156                         break;
2157                 err = -EINVAL;
2158                 if (sk->sk_state != TCP_LISTEN)
2159                         break;
2160                 err = sock_intr_errno(timeo);
2161                 if (signal_pending(current))
2162                         break;
2163                 err = -EAGAIN;
2164                 if (!timeo)
2165                         break;
2166         }
2167         finish_wait(sk->sk_sleep, &wait);
2168         return err;
2169 }
2170
2171 /*
2172  *      This will accept the next outstanding connection.
2173  */
2174
2175 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2176 {
2177         struct tcp_opt *tp = tcp_sk(sk);
2178         struct open_request *req;
2179         struct sock *newsk;
2180         int error;
2181
2182         lock_sock(sk);
2183
2184         /* We need to make sure that this socket is listening,
2185          * and that it has something pending.
2186          */
2187         error = -EINVAL;
2188         if (sk->sk_state != TCP_LISTEN)
2189                 goto out;
2190
2191         /* Find already established connection */
2192         if (!tp->accept_queue) {
2193                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2194
2195                 /* If this is a non blocking socket don't sleep */
2196                 error = -EAGAIN;
2197                 if (!timeo)
2198                         goto out;
2199
2200                 error = wait_for_connect(sk, timeo);
2201                 if (error)
2202                         goto out;
2203         }
2204
2205         req = tp->accept_queue;
2206         if ((tp->accept_queue = req->dl_next) == NULL)
2207                 tp->accept_queue_tail = NULL;
2208
2209         newsk = req->sk;
2210         sk_acceptq_removed(sk);
2211         tcp_openreq_fastfree(req);
2212         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2213         release_sock(sk);
2214         return newsk;
2215
2216 out:
2217         release_sock(sk);
2218         *err = error;
2219         return NULL;
2220 }
2221
2222 /*
2223  *      Socket option code for TCP.
2224  */
2225 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2226                    int optlen)
2227 {
2228         struct tcp_opt *tp = tcp_sk(sk);
2229         int val;
2230         int err = 0;
2231
2232         if (level != SOL_TCP)
2233                 return tp->af_specific->setsockopt(sk, level, optname,
2234                                                    optval, optlen);
2235
2236         if (optlen < sizeof(int))
2237                 return -EINVAL;
2238
2239         if (get_user(val, (int __user *)optval))
2240                 return -EFAULT;
2241
2242         lock_sock(sk);
2243
2244         switch (optname) {
2245         case TCP_MAXSEG:
2246                 /* Values greater than the interface MTU won't take effect. However,
2247                  * at the point when this call is made we typically don't yet
2248                  * know which interface is going to be used. */
2249                 if (val < 8 || val > MAX_TCP_WINDOW) {
2250                         err = -EINVAL;
2251                         break;
2252                 }
2253                 tp->user_mss = val;
2254                 break;
2255
2256         case TCP_NODELAY:
2257                 if (val) {
2258                         /* TCP_NODELAY is weaker than TCP_CORK, so that
2259                          * this option set on a corked socket is remembered, but
2260                          * not activated until the cork is cleared.
2261                          *
2262                          * However, when TCP_NODELAY is set we make
2263                          * an explicit push, which overrides even TCP_CORK
2264                          * for currently queued segments.
2265                          */
2266                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2267                         tcp_push_pending_frames(sk, tp);
2268                 } else {
2269                         tp->nonagle &= ~TCP_NAGLE_OFF;
2270                 }
2271                 break;
2272
2273         case TCP_CORK:
2274                 /* When set, always queue non-full frames.
2275                  * Later the user clears this option and we transmit
2276                  * any pending partial frames in the queue.  This is
2277                  * meant to be used alongside sendfile() to get properly
2278                  * filled frames when the user (for example) must write
2279                  * out headers with a write() call first and then use
2280                  * sendfile to send out the data parts.
2281                  *
2282                  * TCP_CORK can be set together with TCP_NODELAY and it is
2283                  * stronger than TCP_NODELAY.
2284                  */
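                /* Typical userspace pattern described above (a minimal,
                 * illustrative sketch; fd and filefd are hypothetical):
                 *
                 *      int on = 1, off = 0;
                 *      setsockopt(fd, SOL_TCP, TCP_CORK, &on, sizeof(on));
                 *      write(fd, headers, header_len);       // held back
                 *      sendfile(fd, filefd, NULL, body_len);
                 *      setsockopt(fd, SOL_TCP, TCP_CORK, &off, sizeof(off));
                 *      // clearing the cork flushes any partial frame
                 */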
2285                 if (val) {
2286                         tp->nonagle |= TCP_NAGLE_CORK;
2287                 } else {
2288                         tp->nonagle &= ~TCP_NAGLE_CORK;
2289                         if (tp->nonagle&TCP_NAGLE_OFF)
2290                                 tp->nonagle |= TCP_NAGLE_PUSH;
2291                         tcp_push_pending_frames(sk, tp);
2292                 }
2293                 break;
2294
2295         case TCP_KEEPIDLE:
2296                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2297                         err = -EINVAL;
2298                 else {
2299                         tp->keepalive_time = val * HZ;
2300                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2301                             !((1 << sk->sk_state) &
2302                               (TCPF_CLOSE | TCPF_LISTEN))) {
2303                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2304                                 if (tp->keepalive_time > elapsed)
2305                                         elapsed = tp->keepalive_time - elapsed;
2306                                 else
2307                                         elapsed = 0;
2308                                 tcp_reset_keepalive_timer(sk, elapsed);
2309                         }
2310                 }
2311                 break;
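                /* Worked example (illustrative numbers): with keepalive
                 * enabled, setting TCP_KEEPIDLE to 600 when the last segment
                 * from the peer arrived 100 seconds ago rearms the timer to
                 * fire in 500 seconds; if 600 or more seconds have already
                 * elapsed it is rearmed to fire immediately.
                 */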
2312         case TCP_KEEPINTVL:
2313                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2314                         err = -EINVAL;
2315                 else
2316                         tp->keepalive_intvl = val * HZ;
2317                 break;
2318         case TCP_KEEPCNT:
2319                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2320                         err = -EINVAL;
2321                 else
2322                         tp->keepalive_probes = val;
2323                 break;
2324         case TCP_SYNCNT:
2325                 if (val < 1 || val > MAX_TCP_SYNCNT)
2326                         err = -EINVAL;
2327                 else
2328                         tp->syn_retries = val;
2329                 break;
2330
2331         case TCP_LINGER2:
2332                 if (val < 0)
2333                         tp->linger2 = -1;
2334                 else if (val > sysctl_tcp_fin_timeout / HZ)
2335                         tp->linger2 = 0;
2336                 else
2337                         tp->linger2 = val * HZ;
2338                 break;
2339
2340         case TCP_DEFER_ACCEPT:
2341                 tp->defer_accept = 0;
2342                 if (val > 0) {
2343                         /* Translate value in seconds to number of
2344                          * retransmits */
2345                         while (tp->defer_accept < 32 &&
2346                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2347                                        tp->defer_accept))
2348                                 tp->defer_accept++;
2349                         tp->defer_accept++;
2350                 }
2351                 break;
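                /* Worked example (assuming TCP_TIMEOUT_INIT is 3*HZ): a value
                 * of 10 seconds loops while 10 > 3 and 10 > 6, stops at
                 * 10 <= 12, and the final increment leaves defer_accept == 3.
                 */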
2352
2353         case TCP_WINDOW_CLAMP:
2354                 if (!val) {
2355                         if (sk->sk_state != TCP_CLOSE) {
2356                                 err = -EINVAL;
2357                                 break;
2358                         }
2359                         tp->window_clamp = 0;
2360                 } else
2361                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2362                                                 SOCK_MIN_RCVBUF / 2 : val;
2363                 break;
2364
2365         case TCP_QUICKACK:
2366                 if (!val) {
2367                         tp->ack.pingpong = 1;
2368                 } else {
2369                         tp->ack.pingpong = 0;
2370                         if ((1 << sk->sk_state) &
2371                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2372                             tcp_ack_scheduled(tp)) {
2373                                 tp->ack.pending |= TCP_ACK_PUSHED;
2374                                 cleanup_rbuf(sk, 1);
2375                                 if (!(val & 1))
2376                                         tp->ack.pingpong = 1;
2377                         }
2378                 }
2379                 break;
2380
2381         default:
2382                 err = -ENOPROTOOPT;
2383                 break;
2384         };
2385         release_sock(sk);
2386         return err;
2387 }
2388
2389 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2390                    int __user *optlen)
2391 {
2392         struct tcp_opt *tp = tcp_sk(sk);
2393         int val, len;
2394
2395         if (level != SOL_TCP)
2396                 return tp->af_specific->getsockopt(sk, level, optname,
2397                                                    optval, optlen);
2398
2399         if (get_user(len, optlen))
2400                 return -EFAULT;
2401
2402         len = min_t(unsigned int, len, sizeof(int));
2403
2404         if (len < 0)
2405                 return -EINVAL;
2406
2407         switch (optname) {
2408         case TCP_MAXSEG:
2409                 val = tp->mss_cache_std;
2410                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2411                         val = tp->user_mss;
2412                 break;
2413         case TCP_NODELAY:
2414                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2415                 break;
2416         case TCP_CORK:
2417                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2418                 break;
2419         case TCP_KEEPIDLE:
2420                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2421                 break;
2422         case TCP_KEEPINTVL:
2423                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2424                 break;
2425         case TCP_KEEPCNT:
2426                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2427                 break;
2428         case TCP_SYNCNT:
2429                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2430                 break;
2431         case TCP_LINGER2:
2432                 val = tp->linger2;
2433                 if (val >= 0)
2434                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2435                 break;
2436         case TCP_DEFER_ACCEPT:
2437                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2438                                                (tp->defer_accept - 1));
2439                 break;
2440         case TCP_WINDOW_CLAMP:
2441                 val = tp->window_clamp;
2442                 break;
2443         case TCP_INFO: {
2444                 struct tcp_info info;
2445
2446                 if (get_user(len, optlen))
2447                         return -EFAULT;
2448
2449                 tcp_get_info(sk, &info);
2450
2451                 len = min_t(unsigned int, len, sizeof(info));
2452                 if (put_user(len, optlen))
2453                         return -EFAULT;
2454                 if (copy_to_user(optval, &info, len))
2455                         return -EFAULT;
2456                 return 0;
2457         }
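        /* Userspace side, roughly (illustrative sketch only):
         *
         *      struct tcp_info info;
         *      socklen_t optlen = sizeof(info);
         *      getsockopt(fd, SOL_TCP, TCP_INFO, &info, &optlen);
         *      // optlen comes back clamped to what the kernel filled in
         */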
2458         case TCP_QUICKACK:
2459                 val = !tp->ack.pingpong;
2460                 break;
2461         default:
2462                 return -ENOPROTOOPT;
2463         };
2464
2465         if (put_user(len, optlen))
2466                 return -EFAULT;
2467         if (copy_to_user(optval, &val, len))
2468                 return -EFAULT;
2469         return 0;
2470 }
2471
2472
2473 extern void __skb_cb_too_small_for_tcp(int, int);
2474 extern void tcpdiag_init(void);
2475
2476 static __initdata unsigned long thash_entries;
2477 static int __init set_thash_entries(char *str)
2478 {
2479         if (!str)
2480                 return 0;
2481         thash_entries = simple_strtoul(str, &str, 0);
2482         return 1;
2483 }
2484 __setup("thash_entries=", set_thash_entries);
2485
2486 void __init tcp_init(void)
2487 {
2488         struct sk_buff *skb = NULL;
2489         unsigned long goal;
2490         int order, i;
2491
2492         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2493                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2494                                            sizeof(skb->cb));
2495
2496         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2497                                                    sizeof(struct open_request),
2498                                                0, SLAB_HWCACHE_ALIGN,
2499                                                NULL, NULL);
2500         if (!tcp_openreq_cachep)
2501                 panic("tcp_init: Cannot alloc open_request cache.");
2502
2503         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2504                                               sizeof(struct tcp_bind_bucket),
2505                                               0, SLAB_HWCACHE_ALIGN,
2506                                               NULL, NULL);
2507         if (!tcp_bucket_cachep)
2508                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2509
2510         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2511                                                 sizeof(struct tcp_tw_bucket),
2512                                                 0, SLAB_HWCACHE_ALIGN,
2513                                                 NULL, NULL);
2514         if (!tcp_timewait_cachep)
2515                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2516
2517         /* Size and allocate the main established and bind bucket
2518          * hash tables.
2519          *
2520          * The methodology is similar to that of the buffer cache.
2521          */
2522         if (num_physpages >= (128 * 1024))
2523                 goal = num_physpages >> (21 - PAGE_SHIFT);
2524         else
2525                 goal = num_physpages >> (23 - PAGE_SHIFT);
2526
2527         if (thash_entries)
2528                 goal = (thash_entries * sizeof(struct tcp_ehash_bucket)) >> PAGE_SHIFT;
2529         for (order = 0; (1UL << order) < goal; order++)
2530                 ;
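        /* Example of the sizing above (illustrative, assuming 4 KB pages):
         * with num_physpages == 256K (about 1 GB of RAM), goal becomes
         * 256K >> 9 = 512 pages, so order ends up as 9 and the loop below
         * tries a 2 MB established-hash allocation first, decrementing the
         * order (halving the size) on failure.
         */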
2531         do {
2532                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2533                         sizeof(struct tcp_ehash_bucket);
2534                 tcp_ehash_size >>= 1;
2535                 while (tcp_ehash_size & (tcp_ehash_size - 1))
2536                         tcp_ehash_size--;
2537                 tcp_ehash = (struct tcp_ehash_bucket *)
2538                         __get_free_pages(GFP_ATOMIC, order);
2539         } while (!tcp_ehash && --order > 0);
2540
2541         if (!tcp_ehash)
2542                 panic("Failed to allocate TCP established hash table\n");
2543         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2544                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2545                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2546         }
2547
2548         do {
2549                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2550                         sizeof(struct tcp_bind_hashbucket);
2551                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2552                         continue;
2553                 tcp_bhash = (struct tcp_bind_hashbucket *)
2554                         __get_free_pages(GFP_ATOMIC, order);
2555         } while (!tcp_bhash && --order >= 0);
2556
2557         if (!tcp_bhash)
2558                 panic("Failed to allocate TCP bind hash table\n");
2559         for (i = 0; i < tcp_bhash_size; i++) {
2560                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2561                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2562         }
2563
2564         /* Try to be a bit smarter and adjust defaults depending
2565          * on available memory.
2566          */
2567         if (order > 4) {
2568                 sysctl_local_port_range[0] = 32768;
2569                 sysctl_local_port_range[1] = 61000;
2570                 sysctl_tcp_max_tw_buckets = 180000;
2571                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2572                 sysctl_max_syn_backlog = 1024;
2573         } else if (order < 3) {
2574                 sysctl_local_port_range[0] = 1024 * (3 - order);
2575                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2576                 sysctl_tcp_max_orphans >>= (3 - order);
2577                 sysctl_max_syn_backlog = 128;
2578         }
2579         tcp_port_rover = sysctl_local_port_range[0] - 1;
2580
2581         sysctl_tcp_mem[0] =  768 << order;
2582         sysctl_tcp_mem[1] = 1024 << order;
2583         sysctl_tcp_mem[2] = 1536 << order;
2584
2585         if (order < 3) {
2586                 sysctl_tcp_wmem[2] = 64 * 1024;
2587                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2588                 sysctl_tcp_rmem[1] = 43689;
2589                 sysctl_tcp_rmem[2] = 2 * 43689;
2590         }
2591
2592         printk(KERN_INFO "TCP: Hash tables configured "
2593                "(established %d bind %d)\n",
2594                tcp_ehash_size << 1, tcp_bhash_size);
2595
2596         tcpdiag_init();
2597 }
2598
2599 EXPORT_SYMBOL(__tcp_mem_reclaim);
2600 EXPORT_SYMBOL(sysctl_tcp_rmem);
2601 EXPORT_SYMBOL(sysctl_tcp_wmem);
2602 EXPORT_SYMBOL(tcp_accept);
2603 EXPORT_SYMBOL(tcp_close);
2604 EXPORT_SYMBOL(tcp_close_state);
2605 EXPORT_SYMBOL(tcp_destroy_sock);
2606 EXPORT_SYMBOL(tcp_disconnect);
2607 EXPORT_SYMBOL(tcp_getsockopt);
2608 EXPORT_SYMBOL(tcp_ioctl);
2609 EXPORT_SYMBOL(tcp_openreq_cachep);
2610 EXPORT_SYMBOL(tcp_poll);
2611 EXPORT_SYMBOL(tcp_read_sock);
2612 EXPORT_SYMBOL(tcp_recvmsg);
2613 EXPORT_SYMBOL(tcp_sendmsg);
2614 EXPORT_SYMBOL(tcp_sendpage);
2615 EXPORT_SYMBOL(tcp_setsockopt);
2616 EXPORT_SYMBOL(tcp_shutdown);
2617 EXPORT_SYMBOL(tcp_sockets_allocated);
2618 EXPORT_SYMBOL(tcp_statistics);
2619 EXPORT_SYMBOL(tcp_timewait_cachep);
2620 EXPORT_SYMBOL_GPL(cleanup_rbuf);