upgrade to linux 2.6.10-1.12_FC2
[linux-2.6.git] / net / ipv4 / tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed where wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties remove from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            out side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259
260 #ifdef CONFIG_CKRM
261 #include <linux/ckrm.h>
262 #endif
263
264 #include <net/icmp.h>
265 #include <net/tcp.h>
266 #include <net/xfrm.h>
267 #include <net/ip.h>
268
269
270 #include <asm/uaccess.h>
271 #include <asm/ioctls.h>
272
273 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
274
275 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
276
277 kmem_cache_t *tcp_openreq_cachep;
278 kmem_cache_t *tcp_bucket_cachep;
279 kmem_cache_t *tcp_timewait_cachep;
280
281 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
282
283 int sysctl_tcp_mem[3];
284 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
285 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
286
287 EXPORT_SYMBOL(sysctl_tcp_mem);
288 EXPORT_SYMBOL(sysctl_tcp_rmem);
289 EXPORT_SYMBOL(sysctl_tcp_wmem);
290
291 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
292 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
293
294 EXPORT_SYMBOL(tcp_memory_allocated);
295 EXPORT_SYMBOL(tcp_sockets_allocated);
296
297 /*
298  * Pressure flag: try to collapse.
299  * Technical note: it is used by multiple contexts non atomically.
300  * All the sk_stream_mem_schedule() is of this nature: accounting
301  * is strict, actions are advisory and have some latency.
302  */
303 int tcp_memory_pressure;
304
305 EXPORT_SYMBOL(tcp_memory_pressure);
306
307 void tcp_enter_memory_pressure(void)
308 {
309         if (!tcp_memory_pressure) {
310                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
311                 tcp_memory_pressure = 1;
312         }
313 }
314
315 EXPORT_SYMBOL(tcp_enter_memory_pressure);
316
317 /*
318  * LISTEN is a special case for poll..
319  */
320 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
321                                                poll_table *wait)
322 {
323         return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
324 }
325
326 /*
327  *      Wait for a TCP event.
328  *
329  *      Note that we don't need to lock the socket, as the upper poll layers
330  *      take care of normal races (between the test and the event) and we don't
331  *      go look at any of the socket buffers directly.
332  */
333 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
334 {
335         unsigned int mask;
336         struct sock *sk = sock->sk;
337         struct tcp_opt *tp = tcp_sk(sk);
338
339         poll_wait(file, sk->sk_sleep, wait);
340         if (sk->sk_state == TCP_LISTEN)
341                 return tcp_listen_poll(sk, wait);
342
343         /* Socket is not locked. We are protected from async events
344            by poll logic and correct handling of state changes
345            made by another threads is impossible in any case.
346          */
347
348         mask = 0;
349         if (sk->sk_err)
350                 mask = POLLERR;
351
352         /*
353          * POLLHUP is certainly not done right. But poll() doesn't
354          * have a notion of HUP in just one direction, and for a
355          * socket the read side is more interesting.
356          *
357          * Some poll() documentation says that POLLHUP is incompatible
358          * with the POLLOUT/POLLWR flags, so somebody should check this
359          * all. But careful, it tends to be safer to return too many
360          * bits than too few, and you can easily break real applications
361          * if you don't tell them that something has hung up!
362          *
363          * Check-me.
364          *
365          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
366          * our fs/select.c). It means that after we received EOF,
367          * poll always returns immediately, making impossible poll() on write()
368          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
369          * if and only if shutdown has been made in both directions.
370          * Actually, it is interesting to look how Solaris and DUX
371          * solve this dilemma. I would prefer, if PULLHUP were maskable,
372          * then we could set it on SND_SHUTDOWN. BTW examples given
373          * in Stevens' books assume exactly this behaviour, it explains
374          * why PULLHUP is incompatible with POLLOUT.    --ANK
375          *
376          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
377          * blocking on fresh not-connected or disconnected socket. --ANK
378          */
379         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
380                 mask |= POLLHUP;
381         if (sk->sk_shutdown & RCV_SHUTDOWN)
382                 mask |= POLLIN | POLLRDNORM;
383
384         /* Connected? */
385         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
386                 /* Potential race condition. If read of tp below will
387                  * escape above sk->sk_state, we can be illegally awaken
388                  * in SYN_* states. */
389                 if ((tp->rcv_nxt != tp->copied_seq) &&
390                     (tp->urg_seq != tp->copied_seq ||
391                      tp->rcv_nxt != tp->copied_seq + 1 ||
392                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
393                         mask |= POLLIN | POLLRDNORM;
394
395                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
396                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
397                                 mask |= POLLOUT | POLLWRNORM;
398                         } else {  /* send SIGIO later */
399                                 set_bit(SOCK_ASYNC_NOSPACE,
400                                         &sk->sk_socket->flags);
401                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
402
403                                 /* Race breaker. If space is freed after
404                                  * wspace test but before the flags are set,
405                                  * IO signal will be lost.
406                                  */
407                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
408                                         mask |= POLLOUT | POLLWRNORM;
409                         }
410                 }
411
412                 if (tp->urg_data & TCP_URG_VALID)
413                         mask |= POLLPRI;
414         }
415         return mask;
416 }
417
418 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
419 {
420         struct tcp_opt *tp = tcp_sk(sk);
421         int answ;
422
423         switch (cmd) {
424         case SIOCINQ:
425                 if (sk->sk_state == TCP_LISTEN)
426                         return -EINVAL;
427
428                 lock_sock(sk);
429                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
430                         answ = 0;
431                 else if (sock_flag(sk, SOCK_URGINLINE) ||
432                          !tp->urg_data ||
433                          before(tp->urg_seq, tp->copied_seq) ||
434                          !before(tp->urg_seq, tp->rcv_nxt)) {
435                         answ = tp->rcv_nxt - tp->copied_seq;
436
437                         /* Subtract 1, if FIN is in queue. */
438                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
439                                 answ -=
440                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
441                 } else
442                         answ = tp->urg_seq - tp->copied_seq;
443                 release_sock(sk);
444                 break;
445         case SIOCATMARK:
446                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
447                 break;
448         case SIOCOUTQ:
449                 if (sk->sk_state == TCP_LISTEN)
450                         return -EINVAL;
451
452                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
453                         answ = 0;
454                 else
455                         answ = tp->write_seq - tp->snd_una;
456                 break;
457         default:
458                 return -ENOIOCTLCMD;
459         };
460
461         return put_user(answ, (int __user *)arg);
462 }
463
464
465 int tcp_listen_start(struct sock *sk)
466 {
467 #ifdef CONFIG_ACCEPT_QUEUES
468         int i = 0;
469 #endif
470         struct inet_opt *inet = inet_sk(sk);
471         struct tcp_opt *tp = tcp_sk(sk);
472         struct tcp_listen_opt *lopt;
473
474         sk->sk_max_ack_backlog = 0;
475         sk->sk_ack_backlog = 0;
476 #ifdef CONFIG_ACCEPT_QUEUES
477         tp->accept_queue = NULL;
478 #else
479         tp->accept_queue = tp->accept_queue_tail = NULL;
480 #endif 
481         rwlock_init(&tp->syn_wait_lock);
482         tcp_delack_init(tp);
483
484         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
485         if (!lopt)
486                 return -ENOMEM;
487
488         memset(lopt, 0, sizeof(struct tcp_listen_opt));
489         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
490                 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
491                         break;
492         get_random_bytes(&lopt->hash_rnd, 4);
493
494 #ifdef CONFIG_ACCEPT_QUEUES
495         tp->class_index = 0;
496         for (i=0; i < NUM_ACCEPT_QUEUES; i++) {
497                 tp->acceptq[i].aq_tail = NULL;
498                 tp->acceptq[i].aq_head = NULL;
499                 tp->acceptq[i].aq_wait_time = 0; 
500                 tp->acceptq[i].aq_qcount = 0; 
501                 tp->acceptq[i].aq_count = 0; 
502                 if (i == 0) {
503                         tp->acceptq[i].aq_ratio = 1; 
504                 }
505                 else {
506                         tp->acceptq[i].aq_ratio = 0; 
507                 }
508         }
509 #endif
510
511         write_lock_bh(&tp->syn_wait_lock);
512         tp->listen_opt = lopt;
513         write_unlock_bh(&tp->syn_wait_lock);
514
515         /* There is race window here: we announce ourselves listening,
516          * but this transition is still not validated by get_port().
517          * It is OK, because this socket enters to hash table only
518          * after validation is complete.
519          */
520         sk->sk_state = TCP_LISTEN;
521         if (!sk->sk_prot->get_port(sk, inet->num)) {
522                 inet->sport = htons(inet->num);
523
524                 sk_dst_reset(sk);
525                 sk->sk_prot->hash(sk);
526
527 #ifdef CONFIG_CKRM
528                 ckrm_cb_listen_start(sk);
529 #endif
530
531                 return 0;
532         }
533
534         sk->sk_state = TCP_CLOSE;
535         write_lock_bh(&tp->syn_wait_lock);
536         tp->listen_opt = NULL;
537         write_unlock_bh(&tp->syn_wait_lock);
538         kfree(lopt);
539         return -EADDRINUSE;
540 }
541
542 /*
543  *      This routine closes sockets which have been at least partially
544  *      opened, but not yet accepted.
545  */
546
547 static void tcp_listen_stop (struct sock *sk)
548 {
549         struct tcp_opt *tp = tcp_sk(sk);
550         struct tcp_listen_opt *lopt = tp->listen_opt;
551         struct open_request *acc_req = tp->accept_queue;
552         struct open_request *req;
553         int i;
554
555         tcp_delete_keepalive_timer(sk);
556
557         /* make all the listen_opt local to us */
558         write_lock_bh(&tp->syn_wait_lock);
559         tp->listen_opt = NULL;
560         write_unlock_bh(&tp->syn_wait_lock);
561
562 #ifdef CONFIG_CKRM
563                 ckrm_cb_listen_stop(sk);
564 #endif
565
566 #ifdef CONFIG_ACCEPT_QUEUES
567         for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
568                 tp->acceptq[i].aq_head = tp->acceptq[i].aq_tail = NULL;
569 #else
570         tp->accept_queue_tail = NULL;
571 #endif
572         tp->accept_queue = NULL;
573
574         if (lopt->qlen) {
575                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
576                         while ((req = lopt->syn_table[i]) != NULL) {
577                                 lopt->syn_table[i] = req->dl_next;
578                                 lopt->qlen--;
579                                 tcp_openreq_free(req);
580
581                 /* Following specs, it would be better either to send FIN
582                  * (and enter FIN-WAIT-1, it is normal close)
583                  * or to send active reset (abort).
584                  * Certainly, it is pretty dangerous while synflood, but it is
585                  * bad justification for our negligence 8)
586                  * To be honest, we are not able to make either
587                  * of the variants now.                 --ANK
588                  */
589                         }
590                 }
591         }
592         BUG_TRAP(!lopt->qlen);
593
594         kfree(lopt);
595
596         while ((req = acc_req) != NULL) {
597                 struct sock *child = req->sk;
598
599                 acc_req = req->dl_next;
600
601                 local_bh_disable();
602                 bh_lock_sock(child);
603                 BUG_TRAP(!sock_owned_by_user(child));
604                 sock_hold(child);
605
606                 tcp_disconnect(child, O_NONBLOCK);
607
608                 sock_orphan(child);
609
610                 atomic_inc(&tcp_orphan_count);
611
612                 tcp_destroy_sock(child);
613
614                 bh_unlock_sock(child);
615                 local_bh_enable();
616                 sock_put(child);
617
618 #ifdef CONFIG_ACCEPT_QUEUES
619                 sk_acceptq_removed(sk, req->acceptq_class);
620 #else
621                 sk_acceptq_removed(sk);
622 #endif
623                 tcp_openreq_fastfree(req);
624         }
625         BUG_TRAP(!sk->sk_ack_backlog);
626 }
627
628 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
629 {
630         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
631         tp->pushed_seq = tp->write_seq;
632 }
633
634 static inline int forced_push(struct tcp_opt *tp)
635 {
636         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
637 }
638
639 static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
640                               struct sk_buff *skb)
641 {
642         skb->csum = 0;
643         TCP_SKB_CB(skb)->seq = tp->write_seq;
644         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
645         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
646         TCP_SKB_CB(skb)->sacked = 0;
647         __skb_queue_tail(&sk->sk_write_queue, skb);
648         sk_charge_skb(sk, skb);
649         if (!sk->sk_send_head)
650                 sk->sk_send_head = skb;
651         else if (tp->nonagle&TCP_NAGLE_PUSH)
652                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
653 }
654
655 static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
656                                 struct sk_buff *skb)
657 {
658         if (flags & MSG_OOB) {
659                 tp->urg_mode = 1;
660                 tp->snd_up = tp->write_seq;
661                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
662         }
663 }
664
665 static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
666                             int mss_now, int nonagle)
667 {
668         if (sk->sk_send_head) {
669                 struct sk_buff *skb = sk->sk_write_queue.prev;
670                 if (!(flags & MSG_MORE) || forced_push(tp))
671                         tcp_mark_push(tp, skb);
672                 tcp_mark_urg(tp, flags, skb);
673                 __tcp_push_pending_frames(sk, tp, mss_now,
674                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
675         }
676 }
677
678 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
679                          size_t psize, int flags)
680 {
681         struct tcp_opt *tp = tcp_sk(sk);
682         int mss_now;
683         int err;
684         ssize_t copied;
685         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
686
687         /* Wait for a connection to finish. */
688         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
689                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
690                         goto out_err;
691
692         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
693
694         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
695         copied = 0;
696
697         err = -EPIPE;
698         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
699                 goto do_error;
700
701         while (psize > 0) {
702                 struct sk_buff *skb = sk->sk_write_queue.prev;
703                 struct page *page = pages[poffset / PAGE_SIZE];
704                 int copy, i;
705                 int offset = poffset % PAGE_SIZE;
706                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
707
708                 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
709 new_segment:
710                         if (!sk_stream_memory_free(sk))
711                                 goto wait_for_sndbuf;
712
713                         skb = sk_stream_alloc_pskb(sk, 0, tp->mss_cache,
714                                                    sk->sk_allocation);
715                         if (!skb)
716                                 goto wait_for_memory;
717
718                         skb_entail(sk, tp, skb);
719                         copy = mss_now;
720                 }
721
722                 if (copy > size)
723                         copy = size;
724
725                 i = skb_shinfo(skb)->nr_frags;
726                 if (skb_can_coalesce(skb, i, page, offset)) {
727                         skb_shinfo(skb)->frags[i - 1].size += copy;
728                 } else if (i < MAX_SKB_FRAGS) {
729                         get_page(page);
730                         skb_fill_page_desc(skb, i, page, offset, copy);
731                 } else {
732                         tcp_mark_push(tp, skb);
733                         goto new_segment;
734                 }
735
736                 skb->len += copy;
737                 skb->data_len += copy;
738                 skb->ip_summed = CHECKSUM_HW;
739                 tp->write_seq += copy;
740                 TCP_SKB_CB(skb)->end_seq += copy;
741                 skb_shinfo(skb)->tso_segs = 0;
742
743                 if (!copied)
744                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
745
746                 copied += copy;
747                 poffset += copy;
748                 if (!(psize -= copy))
749                         goto out;
750
751                 if (skb->len != mss_now || (flags & MSG_OOB))
752                         continue;
753
754                 if (forced_push(tp)) {
755                         tcp_mark_push(tp, skb);
756                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
757                 } else if (skb == sk->sk_send_head)
758                         tcp_push_one(sk, mss_now);
759                 continue;
760
761 wait_for_sndbuf:
762                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
763 wait_for_memory:
764                 if (copied)
765                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
766
767                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
768                         goto do_error;
769
770                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
771         }
772
773 out:
774         if (copied)
775                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
776         return copied;
777
778 do_error:
779         if (copied)
780                 goto out;
781 out_err:
782         return sk_stream_error(sk, flags, err);
783 }
784
785 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
786                      size_t size, int flags)
787 {
788         ssize_t res;
789         struct sock *sk = sock->sk;
790
791 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
792
793         if (!(sk->sk_route_caps & NETIF_F_SG) ||
794             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
795                 return sock_no_sendpage(sock, page, offset, size, flags);
796
797 #undef TCP_ZC_CSUM_FLAGS
798
799         lock_sock(sk);
800         TCP_CHECK_TIMER(sk);
801         res = do_tcp_sendpages(sk, &page, offset, size, flags);
802         TCP_CHECK_TIMER(sk);
803         release_sock(sk);
804         return res;
805 }
806
807 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
808 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
809
810 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
811 {
812         int tmp = tp->mss_cache_std;
813
814         if (sk->sk_route_caps & NETIF_F_SG) {
815                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
816
817                 if (tmp >= pgbreak &&
818                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
819                         tmp = pgbreak;
820         }
821         return tmp;
822 }
823
824 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
825                 size_t size)
826 {
827         struct iovec *iov;
828         struct tcp_opt *tp = tcp_sk(sk);
829         struct sk_buff *skb;
830         int iovlen, flags;
831         int mss_now;
832         int err, copied;
833         long timeo;
834
835         lock_sock(sk);
836         TCP_CHECK_TIMER(sk);
837
838         flags = msg->msg_flags;
839         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
840
841         /* Wait for a connection to finish. */
842         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
843                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
844                         goto out_err;
845
846         /* This should be in poll */
847         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
848
849         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
850
851         /* Ok commence sending. */
852         iovlen = msg->msg_iovlen;
853         iov = msg->msg_iov;
854         copied = 0;
855
856         err = -EPIPE;
857         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
858                 goto do_error;
859
860         while (--iovlen >= 0) {
861                 int seglen = iov->iov_len;
862                 unsigned char __user *from = iov->iov_base;
863
864                 iov++;
865
866                 while (seglen > 0) {
867                         int copy;
868
869                         skb = sk->sk_write_queue.prev;
870
871                         if (!sk->sk_send_head ||
872                             (copy = mss_now - skb->len) <= 0) {
873
874 new_segment:
875                                 /* Allocate new segment. If the interface is SG,
876                                  * allocate skb fitting to single page.
877                                  */
878                                 if (!sk_stream_memory_free(sk))
879                                         goto wait_for_sndbuf;
880
881                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
882                                                            0, sk->sk_allocation);
883                                 if (!skb)
884                                         goto wait_for_memory;
885
886                                 /*
887                                  * Check whether we can use HW checksum.
888                                  */
889                                 if (sk->sk_route_caps &
890                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
891                                      NETIF_F_HW_CSUM))
892                                         skb->ip_summed = CHECKSUM_HW;
893
894                                 skb_entail(sk, tp, skb);
895                                 copy = mss_now;
896                         }
897
898                         /* Try to append data to the end of skb. */
899                         if (copy > seglen)
900                                 copy = seglen;
901
902                         /* Where to copy to? */
903                         if (skb_tailroom(skb) > 0) {
904                                 /* We have some space in skb head. Superb! */
905                                 if (copy > skb_tailroom(skb))
906                                         copy = skb_tailroom(skb);
907                                 if ((err = skb_add_data(skb, from, copy)) != 0)
908                                         goto do_fault;
909                         } else {
910                                 int merge = 0;
911                                 int i = skb_shinfo(skb)->nr_frags;
912                                 struct page *page = TCP_PAGE(sk);
913                                 int off = TCP_OFF(sk);
914
915                                 if (skb_can_coalesce(skb, i, page, off) &&
916                                     off != PAGE_SIZE) {
917                                         /* We can extend the last page
918                                          * fragment. */
919                                         merge = 1;
920                                 } else if (i == MAX_SKB_FRAGS ||
921                                            (!i &&
922                                            !(sk->sk_route_caps & NETIF_F_SG))) {
923                                         /* Need to add new fragment and cannot
924                                          * do this because interface is non-SG,
925                                          * or because all the page slots are
926                                          * busy. */
927                                         tcp_mark_push(tp, skb);
928                                         goto new_segment;
929                                 } else if (page) {
930                                         /* If page is cached, align
931                                          * offset to L1 cache boundary
932                                          */
933                                         off = (off + L1_CACHE_BYTES - 1) &
934                                               ~(L1_CACHE_BYTES - 1);
935                                         if (off == PAGE_SIZE) {
936                                                 put_page(page);
937                                                 TCP_PAGE(sk) = page = NULL;
938                                         }
939                                 }
940
941                                 if (!page) {
942                                         /* Allocate new cache page. */
943                                         if (!(page = sk_stream_alloc_page(sk)))
944                                                 goto wait_for_memory;
945                                         off = 0;
946                                 }
947
948                                 if (copy > PAGE_SIZE - off)
949                                         copy = PAGE_SIZE - off;
950
951                                 /* Time to copy data. We are close to
952                                  * the end! */
953                                 err = skb_copy_to_page(sk, from, skb, page,
954                                                        off, copy);
955                                 if (err) {
956                                         /* If this page was new, give it to the
957                                          * socket so it does not get leaked.
958                                          */
959                                         if (!TCP_PAGE(sk)) {
960                                                 TCP_PAGE(sk) = page;
961                                                 TCP_OFF(sk) = 0;
962                                         }
963                                         goto do_error;
964                                 }
965
966                                 /* Update the skb. */
967                                 if (merge) {
968                                         skb_shinfo(skb)->frags[i - 1].size +=
969                                                                         copy;
970                                 } else {
971                                         skb_fill_page_desc(skb, i, page, off, copy);
972                                         if (TCP_PAGE(sk)) {
973                                                 get_page(page);
974                                         } else if (off + copy < PAGE_SIZE) {
975                                                 get_page(page);
976                                                 TCP_PAGE(sk) = page;
977                                         }
978                                 }
979
980                                 TCP_OFF(sk) = off + copy;
981                         }
982
983                         if (!copied)
984                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
985
986                         tp->write_seq += copy;
987                         TCP_SKB_CB(skb)->end_seq += copy;
988                         skb_shinfo(skb)->tso_segs = 0;
989
990                         from += copy;
991                         copied += copy;
992                         if ((seglen -= copy) == 0 && iovlen == 0)
993                                 goto out;
994
995                         if (skb->len != mss_now || (flags & MSG_OOB))
996                                 continue;
997
998                         if (forced_push(tp)) {
999                                 tcp_mark_push(tp, skb);
1000                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1001                         } else if (skb == sk->sk_send_head)
1002                                 tcp_push_one(sk, mss_now);
1003                         continue;
1004
1005 wait_for_sndbuf:
1006                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1007 wait_for_memory:
1008                         if (copied)
1009                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1010
1011                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1012                                 goto do_error;
1013
1014                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1015                 }
1016         }
1017
1018 out:
1019         if (copied)
1020                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1021         TCP_CHECK_TIMER(sk);
1022         release_sock(sk);
1023         return copied;
1024
1025 do_fault:
1026         if (!skb->len) {
1027                 if (sk->sk_send_head == skb)
1028                         sk->sk_send_head = NULL;
1029                 __skb_unlink(skb, skb->list);
1030                 sk_stream_free_skb(sk, skb);
1031         }
1032
1033 do_error:
1034         if (copied)
1035                 goto out;
1036 out_err:
1037         err = sk_stream_error(sk, flags, err);
1038         TCP_CHECK_TIMER(sk);
1039         release_sock(sk);
1040         return err;
1041 }
1042
1043 /*
1044  *      Handle reading urgent data. BSD has very simple semantics for
1045  *      this, no blocking and very strange errors 8)
1046  */
1047
1048 static int tcp_recv_urg(struct sock *sk, long timeo,
1049                         struct msghdr *msg, int len, int flags,
1050                         int *addr_len)
1051 {
1052         struct tcp_opt *tp = tcp_sk(sk);
1053
1054         /* No URG data to read. */
1055         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1056             tp->urg_data == TCP_URG_READ)
1057                 return -EINVAL; /* Yes this is right ! */
1058
1059         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1060                 return -ENOTCONN;
1061
1062         if (tp->urg_data & TCP_URG_VALID) {
1063                 int err = 0;
1064                 char c = tp->urg_data;
1065
1066                 if (!(flags & MSG_PEEK))
1067                         tp->urg_data = TCP_URG_READ;
1068
1069                 /* Read urgent data. */
1070                 msg->msg_flags |= MSG_OOB;
1071
1072                 if (len > 0) {
1073                         if (!(flags & MSG_TRUNC))
1074                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1075                         len = 1;
1076                 } else
1077                         msg->msg_flags |= MSG_TRUNC;
1078
1079                 return err ? -EFAULT : len;
1080         }
1081
1082         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1083                 return 0;
1084
1085         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1086          * the available implementations agree in this case:
1087          * this call should never block, independent of the
1088          * blocking state of the socket.
1089          * Mike <pall@rz.uni-karlsruhe.de>
1090          */
1091         return -EAGAIN;
1092 }
1093
1094 /* Clean up the receive buffer for full frames taken by the user,
1095  * then send an ACK if necessary.  COPIED is the number of bytes
1096  * tcp_recvmsg has given to the user so far, it speeds up the
1097  * calculation of whether or not we must ACK for the sake of
1098  * a window update.
1099  */
1100 void cleanup_rbuf(struct sock *sk, int copied)
1101 {
1102         struct tcp_opt *tp = tcp_sk(sk);
1103         int time_to_ack = 0;
1104
1105 #if TCP_DEBUG
1106         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1107
1108         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1109 #endif
1110
1111         if (tcp_ack_scheduled(tp)) {
1112                    /* Delayed ACKs frequently hit locked sockets during bulk
1113                     * receive. */
1114                 if (tp->ack.blocked ||
1115                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1116                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1117                     /*
1118                      * If this read emptied read buffer, we send ACK, if
1119                      * connection is not bidirectional, user drained
1120                      * receive buffer and there was a small segment
1121                      * in queue.
1122                      */
1123                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1124                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1125                         time_to_ack = 1;
1126         }
1127
1128         /* We send an ACK if we can now advertise a non-zero window
1129          * which has been raised "significantly".
1130          *
1131          * Even if window raised up to infinity, do not send window open ACK
1132          * in states, where we will not receive more. It is useless.
1133          */
1134         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1135                 __u32 rcv_window_now = tcp_receive_window(tp);
1136
1137                 /* Optimize, __tcp_select_window() is not cheap. */
1138                 if (2*rcv_window_now <= tp->window_clamp) {
1139                         __u32 new_window = __tcp_select_window(sk);
1140
1141                         /* Send ACK now, if this read freed lots of space
1142                          * in our buffer. Certainly, new_window is new window.
1143                          * We can advertise it now, if it is not less than current one.
1144                          * "Lots" means "at least twice" here.
1145                          */
1146                         if (new_window && new_window >= 2 * rcv_window_now)
1147                                 time_to_ack = 1;
1148                 }
1149         }
1150         if (time_to_ack)
1151                 tcp_send_ack(sk);
1152 }
1153
1154 static void tcp_prequeue_process(struct sock *sk)
1155 {
1156         struct sk_buff *skb;
1157         struct tcp_opt *tp = tcp_sk(sk);
1158
1159         NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1160
1161         /* RX process wants to run with disabled BHs, though it is not
1162          * necessary */
1163         local_bh_disable();
1164         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1165                 sk->sk_backlog_rcv(sk, skb);
1166         local_bh_enable();
1167
1168         /* Clear memory counter. */
1169         tp->ucopy.memory = 0;
1170 }
1171
1172 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1173 {
1174         struct sk_buff *skb;
1175         u32 offset;
1176
1177         skb_queue_walk(&sk->sk_receive_queue, skb) {
1178                 offset = seq - TCP_SKB_CB(skb)->seq;
1179                 if (skb->h.th->syn)
1180                         offset--;
1181                 if (offset < skb->len || skb->h.th->fin) {
1182                         *off = offset;
1183                         return skb;
1184                 }
1185         }
1186         return NULL;
1187 }
1188
1189 /*
1190  * This routine provides an alternative to tcp_recvmsg() for routines
1191  * that would like to handle copying from skbuffs directly in 'sendfile'
1192  * fashion.
1193  * Note:
1194  *      - It is assumed that the socket was locked by the caller.
1195  *      - The routine does not block.
1196  *      - At present, there is no support for reading OOB data
1197  *        or for 'peeking' the socket using this routine
1198  *        (although both would be easy to implement).
1199  */
1200 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1201                   sk_read_actor_t recv_actor)
1202 {
1203         struct sk_buff *skb;
1204         struct tcp_opt *tp = tcp_sk(sk);
1205         u32 seq = tp->copied_seq;
1206         u32 offset;
1207         int copied = 0;
1208
1209         if (sk->sk_state == TCP_LISTEN)
1210                 return -ENOTCONN;
1211         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1212                 if (offset < skb->len) {
1213                         size_t used, len;
1214
1215                         len = skb->len - offset;
1216                         /* Stop reading if we hit a patch of urgent data */
1217                         if (tp->urg_data) {
1218                                 u32 urg_offset = tp->urg_seq - seq;
1219                                 if (urg_offset < len)
1220                                         len = urg_offset;
1221                                 if (!len)
1222                                         break;
1223                         }
1224                         used = recv_actor(desc, skb, offset, len);
1225                         if (used <= len) {
1226                                 seq += used;
1227                                 copied += used;
1228                                 offset += used;
1229                         }
1230                         if (offset != skb->len)
1231                                 break;
1232                 }
1233                 if (skb->h.th->fin) {
1234                         sk_eat_skb(sk, skb);
1235                         ++seq;
1236                         break;
1237                 }
1238                 sk_eat_skb(sk, skb);
1239                 if (!desc->count)
1240                         break;
1241         }
1242         tp->copied_seq = seq;
1243
1244         tcp_rcv_space_adjust(sk);
1245
1246         /* Clean up data we have read: This will do ACK frames. */
1247         if (copied)
1248                 cleanup_rbuf(sk, copied);
1249         return copied;
1250 }
1251
1252 /*
1253  *      This routine copies from a sock struct into the user buffer.
1254  *
1255  *      Technical note: in 2.3 we work on _locked_ socket, so that
1256  *      tricks with *seq access order and skb->users are not required.
1257  *      Probably, code can be easily improved even more.
1258  */
1259
1260 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1261                 size_t len, int nonblock, int flags, int *addr_len)
1262 {
1263         struct tcp_opt *tp = tcp_sk(sk);
1264         int copied = 0;
1265         u32 peek_seq;
1266         u32 *seq;
1267         unsigned long used;
1268         int err;
1269         int target;             /* Read at least this many bytes */
1270         long timeo;
1271         struct task_struct *user_recv = NULL;
1272
1273         lock_sock(sk);
1274
1275         TCP_CHECK_TIMER(sk);
1276
1277         err = -ENOTCONN;
1278         if (sk->sk_state == TCP_LISTEN)
1279                 goto out;
1280
1281         timeo = sock_rcvtimeo(sk, nonblock);
1282
1283         /* Urgent data needs to be handled specially. */
1284         if (flags & MSG_OOB)
1285                 goto recv_urg;
1286
1287         seq = &tp->copied_seq;
1288         if (flags & MSG_PEEK) {
1289                 peek_seq = tp->copied_seq;
1290                 seq = &peek_seq;
1291         }
1292
1293         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1294
1295         do {
1296                 struct sk_buff *skb;
1297                 u32 offset;
1298
1299                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1300                 if (tp->urg_data && tp->urg_seq == *seq) {
1301                         if (copied)
1302                                 break;
1303                         if (signal_pending(current)) {
1304                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1305                                 break;
1306                         }
1307                 }
1308
1309                 /* Next get a buffer. */
1310
1311                 skb = skb_peek(&sk->sk_receive_queue);
1312                 do {
1313                         if (!skb)
1314                                 break;
1315
1316                         /* Now that we have two receive queues this
1317                          * shouldn't happen.
1318                          */
1319                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1320                                 printk(KERN_INFO "recvmsg bug: copied %X "
1321                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1322                                 break;
1323                         }
1324                         offset = *seq - TCP_SKB_CB(skb)->seq;
1325                         if (skb->h.th->syn)
1326                                 offset--;
1327                         if (offset < skb->len)
1328                                 goto found_ok_skb;
1329                         if (skb->h.th->fin)
1330                                 goto found_fin_ok;
1331                         BUG_TRAP(flags & MSG_PEEK);
1332                         skb = skb->next;
1333                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1334
1335                 /* Well, if we have backlog, try to process it now yet. */
1336
1337                 if (copied >= target && !sk->sk_backlog.tail)
1338                         break;
1339
1340                 if (copied) {
1341                         if (sk->sk_err ||
1342                             sk->sk_state == TCP_CLOSE ||
1343                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1344                             !timeo ||
1345                             signal_pending(current) ||
1346                             (flags & MSG_PEEK))
1347                                 break;
1348                 } else {
1349                         if (sock_flag(sk, SOCK_DONE))
1350                                 break;
1351
1352                         if (sk->sk_err) {
1353                                 copied = sock_error(sk);
1354                                 break;
1355                         }
1356
1357                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1358                                 break;
1359
1360                         if (sk->sk_state == TCP_CLOSE) {
1361                                 if (!sock_flag(sk, SOCK_DONE)) {
1362                                         /* This occurs when user tries to read
1363                                          * from never connected socket.
1364                                          */
1365                                         copied = -ENOTCONN;
1366                                         break;
1367                                 }
1368                                 break;
1369                         }
1370
1371                         if (!timeo) {
1372                                 copied = -EAGAIN;
1373                                 break;
1374                         }
1375
1376                         if (signal_pending(current)) {
1377                                 copied = sock_intr_errno(timeo);
1378                                 break;
1379                         }
1380                 }
1381
1382                 cleanup_rbuf(sk, copied);
1383
1384                 if (tp->ucopy.task == user_recv) {
1385                         /* Install new reader */
1386                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1387                                 user_recv = current;
1388                                 tp->ucopy.task = user_recv;
1389                                 tp->ucopy.iov = msg->msg_iov;
1390                         }
1391
1392                         tp->ucopy.len = len;
1393
1394                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1395                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1396
1397                         /* Ugly... If prequeue is not empty, we have to
1398                          * process it before releasing socket, otherwise
1399                          * order will be broken at second iteration.
1400                          * More elegant solution is required!!!
1401                          *
1402                          * Look: we have the following (pseudo)queues:
1403                          *
1404                          * 1. packets in flight
1405                          * 2. backlog
1406                          * 3. prequeue
1407                          * 4. receive_queue
1408                          *
1409                          * Each queue can be processed only if the next ones
1410                          * are empty. At this point we have empty receive_queue.
1411                          * But prequeue _can_ be not empty after 2nd iteration,
1412                          * when we jumped to start of loop because backlog
1413                          * processing added something to receive_queue.
1414                          * We cannot release_sock(), because backlog contains
1415                          * packets arrived _after_ prequeued ones.
1416                          *
1417                          * Shortly, algorithm is clear --- to process all
1418                          * the queues in order. We could make it more directly,
1419                          * requeueing packets from backlog to prequeue, if
1420                          * is not empty. It is more elegant, but eats cycles,
1421                          * unfortunately.
1422                          */
1423                         if (skb_queue_len(&tp->ucopy.prequeue))
1424                                 goto do_prequeue;
1425
1426                         /* __ Set realtime policy in scheduler __ */
1427                 }
1428
1429                 if (copied >= target) {
1430                         /* Do not sleep, just process backlog. */
1431                         release_sock(sk);
1432                         lock_sock(sk);
1433                 } else
1434                         sk_wait_data(sk, &timeo);
1435
1436                 if (user_recv) {
1437                         int chunk;
1438
1439                         /* __ Restore normal policy in scheduler __ */
1440
1441                         if ((chunk = len - tp->ucopy.len) != 0) {
1442                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1443                                 len -= chunk;
1444                                 copied += chunk;
1445                         }
1446
1447                         if (tp->rcv_nxt == tp->copied_seq &&
1448                             skb_queue_len(&tp->ucopy.prequeue)) {
1449 do_prequeue:
1450                                 tcp_prequeue_process(sk);
1451
1452                                 if ((chunk = len - tp->ucopy.len) != 0) {
1453                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1454                                         len -= chunk;
1455                                         copied += chunk;
1456                                 }
1457                         }
1458                 }
1459                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1460                         if (net_ratelimit())
1461                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1462                                        current->comm, current->pid);
1463                         peek_seq = tp->copied_seq;
1464                 }
1465                 continue;
1466
1467         found_ok_skb:
1468                 /* Ok so how much can we use? */
1469                 used = skb->len - offset;
1470                 if (len < used)
1471                         used = len;
1472
1473                 /* Do we have urgent data here? */
1474                 if (tp->urg_data) {
1475                         u32 urg_offset = tp->urg_seq - *seq;
1476                         if (urg_offset < used) {
1477                                 if (!urg_offset) {
1478                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1479                                                 ++*seq;
1480                                                 offset++;
1481                                                 used--;
1482                                                 if (!used)
1483                                                         goto skip_copy;
1484                                         }
1485                                 } else
1486                                         used = urg_offset;
1487                         }
1488                 }
1489
1490                 if (!(flags & MSG_TRUNC)) {
1491                         err = skb_copy_datagram_iovec(skb, offset,
1492                                                       msg->msg_iov, used);
1493                         if (err) {
1494                                 /* Exception. Bailout! */
1495                                 if (!copied)
1496                                         copied = -EFAULT;
1497                                 break;
1498                         }
1499                 }
1500
1501                 *seq += used;
1502                 copied += used;
1503                 len -= used;
1504
1505                 tcp_rcv_space_adjust(sk);
1506
1507 skip_copy:
1508                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1509                         tp->urg_data = 0;
1510                         tcp_fast_path_check(sk, tp);
1511                 }
1512                 if (used + offset < skb->len)
1513                         continue;
1514
1515                 if (skb->h.th->fin)
1516                         goto found_fin_ok;
1517                 if (!(flags & MSG_PEEK))
1518                         sk_eat_skb(sk, skb);
1519                 continue;
1520
1521         found_fin_ok:
1522                 /* Process the FIN. */
1523                 ++*seq;
1524                 if (!(flags & MSG_PEEK))
1525                         sk_eat_skb(sk, skb);
1526                 break;
1527         } while (len > 0);
1528
1529         if (user_recv) {
1530                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1531                         int chunk;
1532
1533                         tp->ucopy.len = copied > 0 ? len : 0;
1534
1535                         tcp_prequeue_process(sk);
1536
1537                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1538                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1539                                 len -= chunk;
1540                                 copied += chunk;
1541                         }
1542                 }
1543
1544                 tp->ucopy.task = NULL;
1545                 tp->ucopy.len = 0;
1546         }
1547
1548         /* According to UNIX98, msg_name/msg_namelen are ignored
1549          * on connected socket. I was just happy when found this 8) --ANK
1550          */
1551
1552         /* Clean up data we have read: This will do ACK frames. */
1553         cleanup_rbuf(sk, copied);
1554
1555         TCP_CHECK_TIMER(sk);
1556         release_sock(sk);
1557         return copied;
1558
1559 out:
1560         TCP_CHECK_TIMER(sk);
1561         release_sock(sk);
1562         return err;
1563
1564 recv_urg:
1565         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1566         goto out;
1567 }
1568
1569 /*
1570  *      State processing on a close. This implements the state shift for
1571  *      sending our FIN frame. Note that we only send a FIN for some
1572  *      states. A shutdown() may have already sent the FIN, or we may be
1573  *      closed.
1574  */
1575
1576 static unsigned char new_state[16] = {
1577   /* current state:        new state:      action:      */
1578   /* (Invalid)          */ TCP_CLOSE,
1579   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1580   /* TCP_SYN_SENT       */ TCP_CLOSE,
1581   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1582   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1583   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1584   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1585   /* TCP_CLOSE          */ TCP_CLOSE,
1586   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1587   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1588   /* TCP_LISTEN         */ TCP_CLOSE,
1589   /* TCP_CLOSING        */ TCP_CLOSING,
1590 };
1591
1592 static int tcp_close_state(struct sock *sk)
1593 {
1594         int next = (int)new_state[sk->sk_state];
1595         int ns = next & TCP_STATE_MASK;
1596
1597         tcp_set_state(sk, ns);
1598
1599         return next & TCP_ACTION_FIN;
1600 }
1601
1602 /*
1603  *      Shutdown the sending side of a connection. Much like close except
1604  *      that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1605  */
1606
1607 void tcp_shutdown(struct sock *sk, int how)
1608 {
1609         /*      We need to grab some memory, and put together a FIN,
1610          *      and then put it into the queue to be sent.
1611          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1612          */
1613         if (!(how & SEND_SHUTDOWN))
1614                 return;
1615
1616         /* If we've already sent a FIN, or it's a closed state, skip this. */
1617         if ((1 << sk->sk_state) &
1618             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1619              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1620                 /* Clear out any half completed packets.  FIN if needed. */
1621                 if (tcp_close_state(sk))
1622                         tcp_send_fin(sk);
1623         }
1624 }
1625
1626 /*
1627  * At this point, there should be no process reference to this
1628  * socket, and thus no user references at all.  Therefore we
1629  * can assume the socket waitqueue is inactive and nobody will
1630  * try to jump onto it.
1631  */
1632 void tcp_destroy_sock(struct sock *sk)
1633 {
1634         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1635         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1636
1637         /* It cannot be in hash table! */
1638         BUG_TRAP(sk_unhashed(sk));
1639
1640         /* If it has not 0 inet_sk(sk)->num, it must be bound */
1641         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1642
1643         sk->sk_prot->destroy(sk);
1644
1645         sk_stream_kill_queues(sk);
1646
1647         xfrm_sk_free_policy(sk);
1648
1649 #ifdef INET_REFCNT_DEBUG
1650         if (atomic_read(&sk->sk_refcnt) != 1) {
1651                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1652                        sk, atomic_read(&sk->sk_refcnt));
1653         }
1654 #endif
1655
1656         atomic_dec(&tcp_orphan_count);
1657         sock_put(sk);
1658 }
1659
1660 void tcp_close(struct sock *sk, long timeout)
1661 {
1662         struct sk_buff *skb;
1663         int data_was_unread = 0;
1664
1665         lock_sock(sk);
1666         sk->sk_shutdown = SHUTDOWN_MASK;
1667
1668         if (sk->sk_state == TCP_LISTEN) {
1669                 tcp_set_state(sk, TCP_CLOSE);
1670
1671                 /* Special case. */
1672                 tcp_listen_stop(sk);
1673
1674                 goto adjudge_to_death;
1675         }
1676
1677         /*  We need to flush the recv. buffs.  We do this only on the
1678          *  descriptor close, not protocol-sourced closes, because the
1679          *  reader process may not have drained the data yet!
1680          */
1681         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1682                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1683                           skb->h.th->fin;
1684                 data_was_unread += len;
1685                 __kfree_skb(skb);
1686         }
1687
1688         sk_stream_mem_reclaim(sk);
1689
1690         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1691          * 3.10, we send a RST here because data was lost.  To
1692          * witness the awful effects of the old behavior of always
1693          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1694          * a bulk GET in an FTP client, suspend the process, wait
1695          * for the client to advertise a zero window, then kill -9
1696          * the FTP client, wheee...  Note: timeout is always zero
1697          * in such a case.
1698          */
1699         if (data_was_unread) {
1700                 /* Unread data was tossed, zap the connection. */
1701                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1702                 tcp_set_state(sk, TCP_CLOSE);
1703                 tcp_send_active_reset(sk, GFP_KERNEL);
1704         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1705                 /* Check zero linger _after_ checking for unread data. */
1706                 sk->sk_prot->disconnect(sk, 0);
1707                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1708         } else if (tcp_close_state(sk)) {
1709                 /* We FIN if the application ate all the data before
1710                  * zapping the connection.
1711                  */
1712
1713                 /* RED-PEN. Formally speaking, we have broken TCP state
1714                  * machine. State transitions:
1715                  *
1716                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1717                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1718                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1719                  *
1720                  * are legal only when FIN has been sent (i.e. in window),
1721                  * rather than queued out of window. Purists blame.
1722                  *
1723                  * F.e. "RFC state" is ESTABLISHED,
1724                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1725                  *
1726                  * The visible declinations are that sometimes
1727                  * we enter time-wait state, when it is not required really
1728                  * (harmless), do not send active resets, when they are
1729                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1730                  * they look as CLOSING or LAST_ACK for Linux)
1731                  * Probably, I missed some more holelets.
1732                  *                                              --ANK
1733                  */
1734                 tcp_send_fin(sk);
1735         }
1736
1737         sk_stream_wait_close(sk, timeout);
1738
1739 adjudge_to_death:
1740         /* It is the last release_sock in its life. It will remove backlog. */
1741         release_sock(sk);
1742
1743
1744         /* Now socket is owned by kernel and we acquire BH lock
1745            to finish close. No need to check for user refs.
1746          */
1747         local_bh_disable();
1748         bh_lock_sock(sk);
1749         BUG_TRAP(!sock_owned_by_user(sk));
1750
1751         sock_hold(sk);
1752         sock_orphan(sk);
1753
1754         /*      This is a (useful) BSD violating of the RFC. There is a
1755          *      problem with TCP as specified in that the other end could
1756          *      keep a socket open forever with no application left this end.
1757          *      We use a 3 minute timeout (about the same as BSD) then kill
1758          *      our end. If they send after that then tough - BUT: long enough
1759          *      that we won't make the old 4*rto = almost no time - whoops
1760          *      reset mistake.
1761          *
1762          *      Nope, it was not mistake. It is really desired behaviour
1763          *      f.e. on http servers, when such sockets are useless, but
1764          *      consume significant resources. Let's do it with special
1765          *      linger2 option.                                 --ANK
1766          */
1767
1768         if (sk->sk_state == TCP_FIN_WAIT2) {
1769                 struct tcp_opt *tp = tcp_sk(sk);
1770                 if (tp->linger2 < 0) {
1771                         tcp_set_state(sk, TCP_CLOSE);
1772                         tcp_send_active_reset(sk, GFP_ATOMIC);
1773                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1774                 } else {
1775                         int tmo = tcp_fin_time(tp);
1776
1777                         if (tmo > TCP_TIMEWAIT_LEN) {
1778                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1779                         } else {
1780                                 atomic_inc(&tcp_orphan_count);
1781                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1782                                 goto out;
1783                         }
1784                 }
1785         }
1786         if (sk->sk_state != TCP_CLOSE) {
1787                 sk_stream_mem_reclaim(sk);
1788                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1789                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1790                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1791                         if (net_ratelimit())
1792                                 printk(KERN_INFO "TCP: too many of orphaned "
1793                                        "sockets\n");
1794                         tcp_set_state(sk, TCP_CLOSE);
1795                         tcp_send_active_reset(sk, GFP_ATOMIC);
1796                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1797                 }
1798         }
1799         atomic_inc(&tcp_orphan_count);
1800
1801         if (sk->sk_state == TCP_CLOSE)
1802                 tcp_destroy_sock(sk);
1803         /* Otherwise, socket is reprieved until protocol close. */
1804
1805 out:
1806         bh_unlock_sock(sk);
1807         local_bh_enable();
1808         sock_put(sk);
1809 }
1810
1811 /* These states need RST on ABORT according to RFC793 */
1812
1813 static inline int tcp_need_reset(int state)
1814 {
1815         return (1 << state) &
1816                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1817                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1818 }
1819
1820 int tcp_disconnect(struct sock *sk, int flags)
1821 {
1822         struct inet_opt *inet = inet_sk(sk);
1823         struct tcp_opt *tp = tcp_sk(sk);
1824         int err = 0;
1825         int old_state = sk->sk_state;
1826
1827         if (old_state != TCP_CLOSE)
1828                 tcp_set_state(sk, TCP_CLOSE);
1829
1830         /* ABORT function of RFC793 */
1831         if (old_state == TCP_LISTEN) {
1832                 tcp_listen_stop(sk);
1833         } else if (tcp_need_reset(old_state) ||
1834                    (tp->snd_nxt != tp->write_seq &&
1835                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1836                 /* The last check adjusts for discrepance of Linux wrt. RFC
1837                  * states
1838                  */
1839                 tcp_send_active_reset(sk, gfp_any());
1840                 sk->sk_err = ECONNRESET;
1841         } else if (old_state == TCP_SYN_SENT)
1842                 sk->sk_err = ECONNRESET;
1843
1844         tcp_clear_xmit_timers(sk);
1845         __skb_queue_purge(&sk->sk_receive_queue);
1846         sk_stream_writequeue_purge(sk);
1847         __skb_queue_purge(&tp->out_of_order_queue);
1848
1849         inet->dport = 0;
1850
1851         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1852                 inet_reset_saddr(sk);
1853
1854         sk->sk_shutdown = 0;
1855         sock_reset_flag(sk, SOCK_DONE);
1856         tp->srtt = 0;
1857         if ((tp->write_seq += tp->max_window + 2) == 0)
1858                 tp->write_seq = 1;
1859         tp->backoff = 0;
1860         tp->snd_cwnd = 2;
1861         tp->probes_out = 0;
1862         tcp_set_pcount(&tp->packets_out, 0);
1863         tp->snd_ssthresh = 0x7fffffff;
1864         tp->snd_cwnd_cnt = 0;
1865         tcp_set_ca_state(tp, TCP_CA_Open);
1866         tcp_clear_retrans(tp);
1867         tcp_delack_init(tp);
1868         sk->sk_send_head = NULL;
1869         tp->saw_tstamp = 0;
1870         tcp_sack_reset(tp);
1871         __sk_dst_reset(sk);
1872
1873         BUG_TRAP(!inet->num || tp->bind_hash);
1874
1875         sk->sk_error_report(sk);
1876         return err;
1877 }
1878
1879 /*
1880  *      Wait for an incoming connection, avoid race
1881  *      conditions. This must be called with the socket locked.
1882  */
1883 static int wait_for_connect(struct sock *sk, long timeo)
1884 {
1885         struct tcp_opt *tp = tcp_sk(sk);
1886         DEFINE_WAIT(wait);
1887         int err;
1888
1889         /*
1890          * True wake-one mechanism for incoming connections: only
1891          * one process gets woken up, not the 'whole herd'.
1892          * Since we do not 'race & poll' for established sockets
1893          * anymore, the common case will execute the loop only once.
1894          *
1895          * Subtle issue: "add_wait_queue_exclusive()" will be added
1896          * after any current non-exclusive waiters, and we know that
1897          * it will always _stay_ after any new non-exclusive waiters
1898          * because all non-exclusive waiters are added at the
1899          * beginning of the wait-queue. As such, it's ok to "drop"
1900          * our exclusiveness temporarily when we get woken up without
1901          * having to remove and re-insert us on the wait queue.
1902          */
1903         for (;;) {
1904                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1905                                           TASK_INTERRUPTIBLE);
1906                 release_sock(sk);
1907                 if (!tp->accept_queue)
1908                         timeo = schedule_timeout(timeo);
1909                 lock_sock(sk);
1910                 err = 0;
1911                 if (tp->accept_queue)
1912                         break;
1913                 err = -EINVAL;
1914                 if (sk->sk_state != TCP_LISTEN)
1915                         break;
1916                 err = sock_intr_errno(timeo);
1917                 if (signal_pending(current))
1918                         break;
1919                 err = -EAGAIN;
1920                 if (!timeo)
1921                         break;
1922         }
1923         finish_wait(sk->sk_sleep, &wait);
1924         return err;
1925 }
1926
1927 /*
1928  *      This will accept the next outstanding connection.
1929  */
1930
1931 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1932 {
1933         struct tcp_opt *tp = tcp_sk(sk);
1934         struct open_request *req;
1935         struct sock *newsk;
1936         int error;
1937 #ifdef CONFIG_ACCEPT_QUEUES     
1938         int prev_class = 0;
1939         int first;
1940 #endif
1941
1942         lock_sock(sk);
1943
1944         /* We need to make sure that this socket is listening,
1945          * and that it has something pending.
1946          */
1947         error = -EINVAL;
1948         if (sk->sk_state != TCP_LISTEN)
1949                 goto out;
1950
1951         /* Find already established connection */
1952         if (!tp->accept_queue) {
1953                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1954                 /* If this is a non blocking socket don't sleep */
1955                 error = -EAGAIN;
1956                 if (!timeo)
1957                         goto out;
1958
1959                 error = wait_for_connect(sk, timeo);
1960                 if (error)
1961                         goto out;
1962         }
1963
1964 #ifndef CONFIG_ACCEPT_QUEUES
1965         req = tp->accept_queue;
1966         if ((tp->accept_queue = req->dl_next) == NULL)
1967                 tp->accept_queue_tail = NULL;
1968         newsk = req->sk;
1969         sk_acceptq_removed(sk);
1970 #else
1971         first = tp->class_index;
1972         /* We should always have  request queued here. The accept_queue
1973          * is already checked for NULL above.
1974          */
1975         while(!tp->acceptq[first].aq_head) {
1976                 tp->acceptq[first].aq_cnt = 0;
1977                 first = (first+1) & ~NUM_ACCEPT_QUEUES; 
1978         }
1979         req = tp->acceptq[first].aq_head;
1980         tp->acceptq[first].aq_qcount--;
1981         tp->acceptq[first].aq_count++;
1982         tp->acceptq[first].aq_wait_time+=(jiffies - req->acceptq_time_stamp);
1983
1984         for (prev_class= first-1 ; prev_class >=0; prev_class--)
1985                 if (tp->acceptq[prev_class].aq_tail)
1986                         break;
1987         if (prev_class>=0)
1988                 tp->acceptq[prev_class].aq_tail->dl_next = req->dl_next; 
1989         else 
1990                 tp->accept_queue = req->dl_next;
1991
1992         if (req == tp->acceptq[first].aq_tail) 
1993                 tp->acceptq[first].aq_head = tp->acceptq[first].aq_tail = NULL;
1994         else
1995                 tp->acceptq[first].aq_head = req->dl_next;
1996
1997         if((++(tp->acceptq[first].aq_cnt)) >= tp->acceptq[first].aq_ratio){
1998                 tp->acceptq[first].aq_cnt = 0;
1999                 tp->class_index = ++first & (NUM_ACCEPT_QUEUES-1);
2000         }       
2001         newsk = req->sk;
2002         sk_acceptq_removed(sk, req->acceptq_class);
2003 #endif
2004         tcp_openreq_fastfree(req);
2005         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2006         release_sock(sk);
2007         return newsk;
2008
2009 out:
2010         release_sock(sk);
2011         *err = error;
2012         return NULL;
2013 }
2014
2015
2016 /*
2017  *      Socket option code for TCP.
2018  */
2019 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2020                    int optlen)
2021 {
2022         struct tcp_opt *tp = tcp_sk(sk);
2023         int val;
2024         int err = 0;
2025
2026         if (level != SOL_TCP)
2027                 return tp->af_specific->setsockopt(sk, level, optname,
2028                                                    optval, optlen);
2029
2030         if (optlen < sizeof(int))
2031                 return -EINVAL;
2032
2033         if (get_user(val, (int __user *)optval))
2034                 return -EFAULT;
2035
2036         lock_sock(sk);
2037
2038         switch (optname) {
2039         case TCP_MAXSEG:
2040                 /* Values greater than interface MTU won't take effect. However
2041                  * at the point when this call is done we typically don't yet
2042                  * know which interface is going to be used */
2043                 if (val < 8 || val > MAX_TCP_WINDOW) {
2044                         err = -EINVAL;
2045                         break;
2046                 }
2047                 tp->user_mss = val;
2048                 break;
2049
2050         case TCP_NODELAY:
2051                 if (val) {
2052                         /* TCP_NODELAY is weaker than TCP_CORK, so that
2053                          * this option on corked socket is remembered, but
2054                          * it is not activated until cork is cleared.
2055                          *
2056                          * However, when TCP_NODELAY is set we make
2057                          * an explicit push, which overrides even TCP_CORK
2058                          * for currently queued segments.
2059                          */
2060                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2061                         tcp_push_pending_frames(sk, tp);
2062                 } else {
2063                         tp->nonagle &= ~TCP_NAGLE_OFF;
2064                 }
2065                 break;
2066
2067         case TCP_CORK:
2068                 /* When set indicates to always queue non-full frames.
2069                  * Later the user clears this option and we transmit
2070                  * any pending partial frames in the queue.  This is
2071                  * meant to be used alongside sendfile() to get properly
2072                  * filled frames when the user (for example) must write
2073                  * out headers with a write() call first and then use
2074                  * sendfile to send out the data parts.
2075                  *
2076                  * TCP_CORK can be set together with TCP_NODELAY and it is
2077                  * stronger than TCP_NODELAY.
2078                  */
2079                 if (val) {
2080                         tp->nonagle |= TCP_NAGLE_CORK;
2081                 } else {
2082                         tp->nonagle &= ~TCP_NAGLE_CORK;
2083                         if (tp->nonagle&TCP_NAGLE_OFF)
2084                                 tp->nonagle |= TCP_NAGLE_PUSH;
2085                         tcp_push_pending_frames(sk, tp);
2086                 }
2087                 break;
2088
2089         case TCP_KEEPIDLE:
2090                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2091                         err = -EINVAL;
2092                 else {
2093                         tp->keepalive_time = val * HZ;
2094                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2095                             !((1 << sk->sk_state) &
2096                               (TCPF_CLOSE | TCPF_LISTEN))) {
2097                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2098                                 if (tp->keepalive_time > elapsed)
2099                                         elapsed = tp->keepalive_time - elapsed;
2100                                 else
2101                                         elapsed = 0;
2102                                 tcp_reset_keepalive_timer(sk, elapsed);
2103                         }
2104                 }
2105                 break;
2106         case TCP_KEEPINTVL:
2107                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2108                         err = -EINVAL;
2109                 else
2110                         tp->keepalive_intvl = val * HZ;
2111                 break;
2112         case TCP_KEEPCNT:
2113                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2114                         err = -EINVAL;
2115                 else
2116                         tp->keepalive_probes = val;
2117                 break;
2118         case TCP_SYNCNT:
2119                 if (val < 1 || val > MAX_TCP_SYNCNT)
2120                         err = -EINVAL;
2121                 else
2122                         tp->syn_retries = val;
2123                 break;
2124
2125         case TCP_LINGER2:
2126                 if (val < 0)
2127                         tp->linger2 = -1;
2128                 else if (val > sysctl_tcp_fin_timeout / HZ)
2129                         tp->linger2 = 0;
2130                 else
2131                         tp->linger2 = val * HZ;
2132                 break;
2133
2134         case TCP_DEFER_ACCEPT:
2135                 tp->defer_accept = 0;
2136                 if (val > 0) {
2137                         /* Translate value in seconds to number of
2138                          * retransmits */
2139                         while (tp->defer_accept < 32 &&
2140                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2141                                        tp->defer_accept))
2142                                 tp->defer_accept++;
2143                         tp->defer_accept++;
2144                 }
2145                 break;
2146
2147         case TCP_WINDOW_CLAMP:
2148                 if (!val) {
2149                         if (sk->sk_state != TCP_CLOSE) {
2150                                 err = -EINVAL;
2151                                 break;
2152                         }
2153                         tp->window_clamp = 0;
2154                 } else
2155                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2156                                                 SOCK_MIN_RCVBUF / 2 : val;
2157                 break;
2158
2159         case TCP_QUICKACK:
2160                 if (!val) {
2161                         tp->ack.pingpong = 1;
2162                 } else {
2163                         tp->ack.pingpong = 0;
2164                         if ((1 << sk->sk_state) &
2165                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2166                             tcp_ack_scheduled(tp)) {
2167                                 tp->ack.pending |= TCP_ACK_PUSHED;
2168                                 cleanup_rbuf(sk, 1);
2169                                 if (!(val & 1))
2170                                         tp->ack.pingpong = 1;
2171                         }
2172                 }
2173                 break;
2174                 
2175 #ifdef CONFIG_ACCEPT_QUEUES
2176         case TCP_ACCEPTQ_SHARE:
2177 #ifdef CONFIG_CKRM
2178                 // If CKRM is set then the shares are set through rcfs.
2179                 // Get shares will still succeed.
2180                 err = -EOPNOTSUPP;
2181                 break;
2182 #else           
2183                 {
2184                         char share_wt[NUM_ACCEPT_QUEUES];
2185                         int i,j;
2186
2187                         if (sk->sk_state != TCP_LISTEN)
2188                                 return -EOPNOTSUPP;
2189
2190                         if (copy_from_user(share_wt,optval, optlen)) {
2191                                 err = -EFAULT;
2192                                 break;
2193                         }
2194                         j = 0;
2195                         for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
2196                                 if (share_wt[i]) {
2197                                         if (!j)
2198                                                 j = share_wt[i];
2199                                         else if (share_wt[i] < j) {
2200                                                 j = share_wt[i];
2201                                         }
2202                                 }
2203                                 else
2204                                         tp->acceptq[i].aq_ratio = 0;
2205                                         
2206                         }
2207                         if (j == 0) {
2208                                 /* Class 0 is always valid. If nothing is 
2209                                  * specified set class 0 as 1.
2210                                  */
2211                                 share_wt[0] = 1;
2212                                 j = 1;
2213                         }
2214                         for (i=0; i < NUM_ACCEPT_QUEUES; i++)  {
2215                                 tp->acceptq[i].aq_ratio = share_wt[i]/j;
2216                                 tp->acceptq[i].aq_cnt = 0;
2217                         }
2218                 }
2219                 break;
2220 #endif
2221 #endif
2222         default:
2223                 err = -ENOPROTOOPT;
2224                 break;
2225         };
2226         release_sock(sk);
2227         return err;
2228 }
2229
2230 /* Return information about state of tcp endpoint in API format. */
2231 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2232 {
2233         struct tcp_opt *tp = tcp_sk(sk);
2234         u32 now = tcp_time_stamp;
2235
2236         memset(info, 0, sizeof(*info));
2237
2238         info->tcpi_state = sk->sk_state;
2239         info->tcpi_ca_state = tp->ca_state;
2240         info->tcpi_retransmits = tp->retransmits;
2241         info->tcpi_probes = tp->probes_out;
2242         info->tcpi_backoff = tp->backoff;
2243
2244         if (tp->tstamp_ok)
2245                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2246         if (tp->sack_ok)
2247                 info->tcpi_options |= TCPI_OPT_SACK;
2248         if (tp->wscale_ok) {
2249                 info->tcpi_options |= TCPI_OPT_WSCALE;
2250                 info->tcpi_snd_wscale = tp->snd_wscale;
2251                 info->tcpi_rcv_wscale = tp->rcv_wscale;
2252         } 
2253
2254         if (tp->ecn_flags&TCP_ECN_OK)
2255                 info->tcpi_options |= TCPI_OPT_ECN;
2256
2257         info->tcpi_rto = jiffies_to_usecs(tp->rto);
2258         info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2259         info->tcpi_snd_mss = tp->mss_cache_std;
2260         info->tcpi_rcv_mss = tp->ack.rcv_mss;
2261
2262         info->tcpi_unacked = tcp_get_pcount(&tp->packets_out);
2263         info->tcpi_sacked = tcp_get_pcount(&tp->sacked_out);
2264         info->tcpi_lost = tcp_get_pcount(&tp->lost_out);
2265         info->tcpi_retrans = tcp_get_pcount(&tp->retrans_out);
2266         info->tcpi_fackets = tcp_get_pcount(&tp->fackets_out);
2267
2268         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2269         info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2270         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2271
2272         info->tcpi_pmtu = tp->pmtu_cookie;
2273         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2274         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2275         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2276         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2277         info->tcpi_snd_cwnd = tp->snd_cwnd;
2278         info->tcpi_advmss = tp->advmss;
2279         info->tcpi_reordering = tp->reordering;
2280
2281         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2282         info->tcpi_rcv_space = tp->rcvq_space.space;
2283
2284         info->tcpi_total_retrans = tp->total_retrans;
2285 }
2286
2287 EXPORT_SYMBOL_GPL(tcp_get_info);
2288
2289 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2290                    int __user *optlen)
2291 {
2292         struct tcp_opt *tp = tcp_sk(sk);
2293         int val, len;
2294
2295         if (level != SOL_TCP)
2296                 return tp->af_specific->getsockopt(sk, level, optname,
2297                                                    optval, optlen);
2298
2299         if (get_user(len, optlen))
2300                 return -EFAULT;
2301
2302         len = min_t(unsigned int, len, sizeof(int));
2303
2304         if (len < 0)
2305                 return -EINVAL;
2306
2307         switch (optname) {
2308         case TCP_MAXSEG:
2309                 val = tp->mss_cache_std;
2310                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2311                         val = tp->user_mss;
2312                 break;
2313         case TCP_NODELAY:
2314                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2315                 break;
2316         case TCP_CORK:
2317                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2318                 break;
2319         case TCP_KEEPIDLE:
2320                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2321                 break;
2322         case TCP_KEEPINTVL:
2323                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2324                 break;
2325         case TCP_KEEPCNT:
2326                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2327                 break;
2328         case TCP_SYNCNT:
2329                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2330                 break;
2331         case TCP_LINGER2:
2332                 val = tp->linger2;
2333                 if (val >= 0)
2334                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2335                 break;
2336         case TCP_DEFER_ACCEPT:
2337                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2338                                                (tp->defer_accept - 1));
2339                 break;
2340         case TCP_WINDOW_CLAMP:
2341                 val = tp->window_clamp;
2342                 break;
2343         case TCP_INFO: {
2344                 struct tcp_info info;
2345
2346                 if (get_user(len, optlen))
2347                         return -EFAULT;
2348
2349                 tcp_get_info(sk, &info);
2350
2351                 len = min_t(unsigned int, len, sizeof(info));
2352                 if (put_user(len, optlen))
2353                         return -EFAULT;
2354                 if (copy_to_user(optval, &info, len))
2355                         return -EFAULT;
2356                 return 0;
2357         }
2358         case TCP_QUICKACK:
2359                 val = !tp->ack.pingpong;
2360                 break;
2361
2362 #ifdef CONFIG_ACCEPT_QUEUES
2363         case TCP_ACCEPTQ_SHARE: 
2364         {
2365                 struct tcp_acceptq_info tinfo[NUM_ACCEPT_QUEUES];
2366                 int i;
2367
2368                 if (sk->sk_state != TCP_LISTEN)
2369                         return -EOPNOTSUPP;
2370
2371                 if (get_user(len, optlen))
2372                         return -EFAULT;
2373
2374                 memset(tinfo, 0, sizeof(tinfo));
2375
2376                 for(i=0; i < NUM_ACCEPT_QUEUES; i++) {
2377                         tinfo[i].acceptq_wait_time = 
2378                              jiffies_to_msecs(tp->acceptq[i].aq_wait_time);
2379                         tinfo[i].acceptq_qcount = tp->acceptq[i].aq_qcount;
2380                         tinfo[i].acceptq_count = tp->acceptq[i].aq_count;
2381                         tinfo[i].acceptq_shares=tp->acceptq[i].aq_ratio;
2382                 }
2383
2384                 len = min_t(unsigned int, len, sizeof(tinfo));
2385                 if (put_user(len, optlen)) 
2386                         return -EFAULT;
2387                         
2388                 if (copy_to_user(optval, (char *)tinfo, len))
2389                         return -EFAULT;
2390                 
2391                 return 0;
2392         }
2393         break;
2394 #endif
2395         default:
2396                 return -ENOPROTOOPT;
2397         };
2398
2399         if (put_user(len, optlen))
2400                 return -EFAULT;
2401         if (copy_to_user(optval, &val, len))
2402                 return -EFAULT;
2403         return 0;
2404 }
2405
2406
2407 extern void __skb_cb_too_small_for_tcp(int, int);
2408 extern void tcpdiag_init(void);
2409
2410 static __initdata unsigned long thash_entries;
2411 static int __init set_thash_entries(char *str)
2412 {
2413         if (!str)
2414                 return 0;
2415         thash_entries = simple_strtoul(str, &str, 0);
2416         return 1;
2417 }
2418 __setup("thash_entries=", set_thash_entries);
2419
2420 void __init tcp_init(void)
2421 {
2422         struct sk_buff *skb = NULL;
2423         unsigned long goal;
2424         int order, i;
2425
2426         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2427                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2428                                            sizeof(skb->cb));
2429
2430         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2431                                                    sizeof(struct open_request),
2432                                                0, SLAB_HWCACHE_ALIGN,
2433                                                NULL, NULL);
2434         if (!tcp_openreq_cachep)
2435                 panic("tcp_init: Cannot alloc open_request cache.");
2436
2437         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2438                                               sizeof(struct tcp_bind_bucket),
2439                                               0, SLAB_HWCACHE_ALIGN,
2440                                               NULL, NULL);
2441         if (!tcp_bucket_cachep)
2442                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2443
2444         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2445                                                 sizeof(struct tcp_tw_bucket),
2446                                                 0, SLAB_HWCACHE_ALIGN,
2447                                                 NULL, NULL);
2448         if (!tcp_timewait_cachep)
2449                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2450
2451         /* Size and allocate the main established and bind bucket
2452          * hash tables.
2453          *
2454          * The methodology is similar to that of the buffer cache.
2455          */
2456         if (num_physpages >= (128 * 1024))
2457                 goal = num_physpages >> (21 - PAGE_SHIFT);
2458         else
2459                 goal = num_physpages >> (23 - PAGE_SHIFT);
2460
2461         if (thash_entries)
2462                 goal = (thash_entries * sizeof(struct tcp_ehash_bucket)) >> PAGE_SHIFT;
2463         for (order = 0; (1UL << order) < goal; order++)
2464                 ;
2465         do {
2466                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2467                         sizeof(struct tcp_ehash_bucket);
2468                 tcp_ehash_size >>= 1;
2469                 while (tcp_ehash_size & (tcp_ehash_size - 1))
2470                         tcp_ehash_size--;
2471                 tcp_ehash = (struct tcp_ehash_bucket *)
2472                         __get_free_pages(GFP_ATOMIC, order);
2473         } while (!tcp_ehash && --order > 0);
2474
2475         if (!tcp_ehash)
2476                 panic("Failed to allocate TCP established hash table\n");
2477         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2478                 rwlock_init(&tcp_ehash[i].lock);
2479                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2480         }
2481
2482         do {
2483                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2484                         sizeof(struct tcp_bind_hashbucket);
2485                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2486                         continue;
2487                 tcp_bhash = (struct tcp_bind_hashbucket *)
2488                         __get_free_pages(GFP_ATOMIC, order);
2489         } while (!tcp_bhash && --order >= 0);
2490
2491         if (!tcp_bhash)
2492                 panic("Failed to allocate TCP bind hash table\n");
2493         for (i = 0; i < tcp_bhash_size; i++) {
2494                 spin_lock_init(&tcp_bhash[i].lock);
2495                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2496         }
2497
2498         /* Try to be a bit smarter and adjust defaults depending
2499          * on available memory.
2500          */
2501         if (order > 4) {
2502                 sysctl_local_port_range[0] = 32768;
2503                 sysctl_local_port_range[1] = 61000;
2504                 sysctl_tcp_max_tw_buckets = 180000;
2505                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2506                 sysctl_max_syn_backlog = 1024;
2507         } else if (order < 3) {
2508                 sysctl_local_port_range[0] = 1024 * (3 - order);
2509                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2510                 sysctl_tcp_max_orphans >>= (3 - order);
2511                 sysctl_max_syn_backlog = 128;
2512         }
2513         tcp_port_rover = sysctl_local_port_range[0] - 1;
2514
2515         sysctl_tcp_mem[0] =  768 << order;
2516         sysctl_tcp_mem[1] = 1024 << order;
2517         sysctl_tcp_mem[2] = 1536 << order;
2518
2519         if (order < 3) {
2520                 sysctl_tcp_wmem[2] = 64 * 1024;
2521                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2522                 sysctl_tcp_rmem[1] = 43689;
2523                 sysctl_tcp_rmem[2] = 2 * 43689;
2524         }
2525
2526         printk(KERN_INFO "TCP: Hash tables configured "
2527                "(established %d bind %d)\n",
2528                tcp_ehash_size << 1, tcp_bhash_size);
2529 }
2530
2531 EXPORT_SYMBOL(tcp_accept);
2532 EXPORT_SYMBOL(tcp_close);
2533 EXPORT_SYMBOL(tcp_destroy_sock);
2534 EXPORT_SYMBOL(tcp_disconnect);
2535 EXPORT_SYMBOL(tcp_getsockopt);
2536 EXPORT_SYMBOL(tcp_ioctl);
2537 EXPORT_SYMBOL(tcp_openreq_cachep);
2538 EXPORT_SYMBOL(tcp_poll);
2539 EXPORT_SYMBOL(tcp_read_sock);
2540 EXPORT_SYMBOL(tcp_recvmsg);
2541 EXPORT_SYMBOL(tcp_sendmsg);
2542 EXPORT_SYMBOL(tcp_sendpage);
2543 EXPORT_SYMBOL(tcp_setsockopt);
2544 EXPORT_SYMBOL(tcp_shutdown);
2545 EXPORT_SYMBOL(tcp_statistics);
2546 EXPORT_SYMBOL(tcp_timewait_cachep);
2547 EXPORT_SYMBOL_GPL(cleanup_rbuf);