1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up with retrying without
154  *                                      any (even a no-space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or (at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259
260 #ifdef CONFIG_CKRM
261 #include <linux/ckrm.h>
262 #endif
263
264 #include <net/icmp.h>
265 #include <net/tcp.h>
266 #include <net/xfrm.h>
267 #include <net/ip.h>
268
269
270 #include <asm/uaccess.h>
271 #include <asm/ioctls.h>
272
273 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
274
275 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
276
277 kmem_cache_t *tcp_openreq_cachep;
278 kmem_cache_t *tcp_bucket_cachep;
279 kmem_cache_t *tcp_timewait_cachep;
280
281 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
282
283 int sysctl_tcp_default_win_scale = 7;
284 int sysctl_tcp_mem[3];
285 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
286 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
287
288 EXPORT_SYMBOL(sysctl_tcp_mem);
289 EXPORT_SYMBOL(sysctl_tcp_rmem);
290 EXPORT_SYMBOL(sysctl_tcp_wmem);
291
292 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
293 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
294
295 EXPORT_SYMBOL(tcp_memory_allocated);
296 EXPORT_SYMBOL(tcp_sockets_allocated);
297
298 /*
299  * Pressure flag: try to collapse.
300  * Technical note: it is used by multiple contexts non atomically.
301  * All the sk_stream_mem_schedule() is of this nature: accounting
302  * is strict, actions are advisory and have some latency.
303  */
304 int tcp_memory_pressure;
305
306 EXPORT_SYMBOL(tcp_memory_pressure);
307
308 void tcp_enter_memory_pressure(void)
309 {
310         if (!tcp_memory_pressure) {
311                 NET_INC_STATS(TCPMemoryPressures);
312                 tcp_memory_pressure = 1;
313         }
314 }
315
316 EXPORT_SYMBOL(tcp_enter_memory_pressure);
317
318 /*
319  * LISTEN is a special case for poll..
320  */
321 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
322                                                poll_table *wait)
323 {
324         return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
325 }
326
327 /*
328  *      Wait for a TCP event.
329  *
330  *      Note that we don't need to lock the socket, as the upper poll layers
331  *      take care of normal races (between the test and the event) and we don't
332  *      go look at any of the socket buffers directly.
333  */
334 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
335 {
336         unsigned int mask;
337         struct sock *sk = sock->sk;
338         struct tcp_opt *tp = tcp_sk(sk);
339
340         poll_wait(file, sk->sk_sleep, wait);
341         if (sk->sk_state == TCP_LISTEN)
342                 return tcp_listen_poll(sk, wait);
343
344         /* Socket is not locked. We are protected from async events
345            by poll logic and correct handling of state changes
346            made by other threads is impossible in any case.
347          */
348
349         mask = 0;
350         if (sk->sk_err)
351                 mask = POLLERR;
352
353         /*
354          * POLLHUP is certainly not done right. But poll() doesn't
355          * have a notion of HUP in just one direction, and for a
356          * socket the read side is more interesting.
357          *
358          * Some poll() documentation says that POLLHUP is incompatible
359          * with the POLLOUT/POLLWR flags, so somebody should check this
360          * all. But careful, it tends to be safer to return too many
361          * bits than too few, and you can easily break real applications
362          * if you don't tell them that something has hung up!
363          *
364          * Check-me.
365          *
366          * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
367          * our fs/select.c). It means that after we received EOF,
368          * poll always returns immediately, making poll() on write()
369          * impossible in state CLOSE_WAIT. One solution is evident --- to set
370          * POLLHUP if and only if shutdown has been made in both directions.
371          * Actually, it is interesting to look at how Solaris and DUX
372          * solve this dilemma. I would prefer, if POLLHUP were maskable,
373          * that we could set it on SND_SHUTDOWN. BTW the examples given
374          * in Stevens' books assume exactly this behaviour; it explains
375          * why POLLHUP is incompatible with POLLOUT.    --ANK
376          *
377          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
378          * blocking on fresh not-connected or disconnected socket. --ANK
379          */
380         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
381                 mask |= POLLHUP;
382         if (sk->sk_shutdown & RCV_SHUTDOWN)
383                 mask |= POLLIN | POLLRDNORM;
384
385         /* Connected? */
386         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
387                 /* Potential race condition. If the read of tp below
388                  * escapes above the sk->sk_state check, we can be illegally
389                  * awakened in SYN_* states. */
390                 if ((tp->rcv_nxt != tp->copied_seq) &&
391                     (tp->urg_seq != tp->copied_seq ||
392                      tp->rcv_nxt != tp->copied_seq + 1 ||
393                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
394                         mask |= POLLIN | POLLRDNORM;
395
396                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
397                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
398                                 mask |= POLLOUT | POLLWRNORM;
399                         } else {  /* send SIGIO later */
400                                 set_bit(SOCK_ASYNC_NOSPACE,
401                                         &sk->sk_socket->flags);
402                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
403
404                                 /* Race breaker. If space is freed after
405                                  * wspace test but before the flags are set,
406                                  * IO signal will be lost.
407                                  */
408                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
409                                         mask |= POLLOUT | POLLWRNORM;
410                         }
411                 }
412
413                 if (tp->urg_data & TCP_URG_VALID)
414                         mask |= POLLPRI;
415         }
416         return mask;
417 }
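
/*
 * Editor's note (illustrative, not part of the original file): tcp_poll() is
 * what backs poll()/select() on TCP sockets, so the mask computed above is
 * exactly what user space sees.  A minimal, hypothetical caller (sock_fd,
 * buf and oob are assumed locals; needs <poll.h>):
 *
 *	struct pollfd pfd = { .fd = sock_fd,
 *			      .events = POLLIN | POLLPRI | POLLOUT };
 *
 *	if (poll(&pfd, 1, 1000) > 0) {
 *		if (pfd.revents & POLLIN)
 *			read(sock_fd, buf, sizeof(buf));
 *		if (pfd.revents & POLLPRI)
 *			recv(sock_fd, &oob, 1, MSG_OOB);
 *	}
 *
 * POLLHUP is only reported once both directions have been shut down, as the
 * long comment above explains.
 */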
418
419 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
420 {
421         struct tcp_opt *tp = tcp_sk(sk);
422         int answ;
423
424         switch (cmd) {
425         case SIOCINQ:
426                 if (sk->sk_state == TCP_LISTEN)
427                         return -EINVAL;
428
429                 lock_sock(sk);
430                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
431                         answ = 0;
432                 else if (sock_flag(sk, SOCK_URGINLINE) ||
433                          !tp->urg_data ||
434                          before(tp->urg_seq, tp->copied_seq) ||
435                          !before(tp->urg_seq, tp->rcv_nxt)) {
436                         answ = tp->rcv_nxt - tp->copied_seq;
437
438                         /* Subtract 1, if FIN is in queue. */
439                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
440                                 answ -=
441                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
442                 } else
443                         answ = tp->urg_seq - tp->copied_seq;
444                 release_sock(sk);
445                 break;
446         case SIOCATMARK:
447                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
448                 break;
449         case SIOCOUTQ:
450                 if (sk->sk_state == TCP_LISTEN)
451                         return -EINVAL;
452
453                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
454                         answ = 0;
455                 else
456                         answ = tp->write_seq - tp->snd_una;
457                 break;
458         default:
459                 return -ENOIOCTLCMD;
460         };
461
462         return put_user(answ, (int __user *)arg);
463 }
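
/*
 * Editor's note (illustrative, not part of the original file): these ioctls
 * are the kernel side of queue-length queries from user space.  A
 * hypothetical caller (sock_fd assumed; SIOCINQ/SIOCOUTQ come from
 * <linux/sockios.h>, and SIOCINQ has the same value as FIONREAD):
 *
 *	int unread = 0, unsent = 0;
 *
 *	ioctl(sock_fd, SIOCINQ, &unread);	bytes received but not yet read
 *	ioctl(sock_fd, SIOCOUTQ, &unsent);	bytes written but not yet ACKed
 *
 * SIOCATMARK answers "is the read pointer at the urgent-data mark?", which
 * is what sockatmark()-style helpers are built on.
 */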
464
465
466 int tcp_listen_start(struct sock *sk)
467 {
468 #ifdef CONFIG_ACCEPT_QUEUES
469         int i = 0;
470 #endif
471         struct inet_opt *inet = inet_sk(sk);
472         struct tcp_opt *tp = tcp_sk(sk);
473         struct tcp_listen_opt *lopt;
474
475         sk->sk_max_ack_backlog = 0;
476         sk->sk_ack_backlog = 0;
477 #ifdef CONFIG_ACCEPT_QUEUES
478         tp->accept_queue = NULL;
479 #else
480         tp->accept_queue = tp->accept_queue_tail = NULL;
481 #endif 
482         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
483         tcp_delack_init(tp);
484
485         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
486         if (!lopt)
487                 return -ENOMEM;
488
489         memset(lopt, 0, sizeof(struct tcp_listen_opt));
490         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
491                 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
492                         break;
493         get_random_bytes(&lopt->hash_rnd, 4);
494
495 #ifdef CONFIG_ACCEPT_QUEUES
496         tp->class_index = 0;
497         for (i=0; i < NUM_ACCEPT_QUEUES; i++) {
498                 tp->acceptq[i].aq_tail = NULL;
499                 tp->acceptq[i].aq_head = NULL;
500                 tp->acceptq[i].aq_wait_time = 0; 
501                 tp->acceptq[i].aq_qcount = 0; 
502                 tp->acceptq[i].aq_count = 0; 
503                 if (i == 0) {
504                         tp->acceptq[i].aq_ratio = 1; 
505                 }
506                 else {
507                         tp->acceptq[i].aq_ratio = 0; 
508                 }
509         }
510 #endif
511
512         write_lock_bh(&tp->syn_wait_lock);
513         tp->listen_opt = lopt;
514         write_unlock_bh(&tp->syn_wait_lock);
515
516         /* There is a race window here: we announce ourselves listening,
517          * but this transition is still not validated by get_port().
518          * It is OK, because this socket enters the hash table only
519          * after validation is complete.
520          */
521         sk->sk_state = TCP_LISTEN;
522         if (!sk->sk_prot->get_port(sk, inet->num)) {
523                 inet->sport = htons(inet->num);
524
525                 sk_dst_reset(sk);
526                 sk->sk_prot->hash(sk);
527
528 #ifdef CONFIG_CKRM
529                 ckrm_cb_listen_start(sk);
530 #endif
531
532                 return 0;
533         }
534
535         sk->sk_state = TCP_CLOSE;
536         write_lock_bh(&tp->syn_wait_lock);
537         tp->listen_opt = NULL;
538         write_unlock_bh(&tp->syn_wait_lock);
539         kfree(lopt);
540         return -EADDRINUSE;
541 }
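
/*
 * Editor's note (illustrative, not part of the original file): in this tree
 * tcp_listen_start() is reached from the listen(2) system call via
 * inet_listen().  A minimal user-space sequence that ends up here (all names
 * hypothetical):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in a = { .sin_family = AF_INET,
 *				 .sin_port   = htons(8080),
 *				 .sin_addr   = { .s_addr = htonl(INADDR_ANY) } };
 *
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));
 *	listen(fd, 128);
 *
 * The backlog argument is recorded by the caller as sk_max_ack_backlog; the
 * SYN-queue hash sized in the loop above depends only on
 * sysctl_max_syn_backlog.
 */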
542
543 /*
544  *      This routine closes sockets which have been at least partially
545  *      opened, but not yet accepted.
546  */
547
548 static void tcp_listen_stop (struct sock *sk)
549 {
550         struct tcp_opt *tp = tcp_sk(sk);
551         struct tcp_listen_opt *lopt = tp->listen_opt;
552         struct open_request *acc_req = tp->accept_queue;
553         struct open_request *req;
554         int i;
555
556         tcp_delete_keepalive_timer(sk);
557
558         /* make all the listen_opt local to us */
559         write_lock_bh(&tp->syn_wait_lock);
560         tp->listen_opt = NULL;
561         write_unlock_bh(&tp->syn_wait_lock);
562
563 #ifdef CONFIG_CKRM
564                 ckrm_cb_listen_stop(sk);
565 #endif
566
567 #ifdef CONFIG_ACCEPT_QUEUES
568         for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
569                 tp->acceptq[i].aq_head = tp->acceptq[i].aq_tail = NULL;
570 #else
571         tp->accept_queue_tail = NULL;
572 #endif
573         tp->accept_queue = NULL;
574
575         if (lopt->qlen) {
576                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
577                         while ((req = lopt->syn_table[i]) != NULL) {
578                                 lopt->syn_table[i] = req->dl_next;
579                                 lopt->qlen--;
580                                 tcp_openreq_free(req);
581
582                 /* Following specs, it would be better either to send FIN
583                  * (and enter FIN-WAIT-1, it is normal close)
584                  * or to send active reset (abort).
585                  * Certainly, it is pretty dangerous while synflood, but it is
586                  * bad justification for our negligence 8)
587                  * To be honest, we are not able to make either
588                  * of the variants now.                 --ANK
589                  */
590                         }
591                 }
592         }
593         BUG_TRAP(!lopt->qlen);
594
595         kfree(lopt);
596
597         while ((req = acc_req) != NULL) {
598                 struct sock *child = req->sk;
599
600                 acc_req = req->dl_next;
601
602                 local_bh_disable();
603                 bh_lock_sock(child);
604                 BUG_TRAP(!sock_owned_by_user(child));
605                 sock_hold(child);
606
607                 tcp_disconnect(child, O_NONBLOCK);
608
609                 sock_orphan(child);
610
611                 atomic_inc(&tcp_orphan_count);
612
613                 tcp_destroy_sock(child);
614
615                 bh_unlock_sock(child);
616                 local_bh_enable();
617                 sock_put(child);
618
619 #ifdef CONFIG_ACCEPT_QUEUES
620                 sk_acceptq_removed(sk, req->acceptq_class);
621 #else
622                 sk_acceptq_removed(sk);
623 #endif
624                 tcp_openreq_fastfree(req);
625         }
626         BUG_TRAP(!sk->sk_ack_backlog);
627 }
628
629 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
630 {
631         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
632         tp->pushed_seq = tp->write_seq;
633 }
634
635 static inline int forced_push(struct tcp_opt *tp)
636 {
637         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
638 }
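
/*
 * Editor's note (not part of the original file): forced_push() asks "has
 * more than half of the largest window we have ever seen accumulated since
 * the last segment marked PSH?".  For example, with max_window at 64KB, once
 * write_seq runs more than 32KB past pushed_seq the next queued segment gets
 * the PSH flag and pending frames are pushed, so the peer is never left
 * sitting on a large amount of un-pushed data.
 */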
639
640 static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
641                               struct sk_buff *skb)
642 {
643         skb->csum = 0;
644         TCP_SKB_CB(skb)->seq = tp->write_seq;
645         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
646         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
647         TCP_SKB_CB(skb)->sacked = 0;
648         __skb_queue_tail(&sk->sk_write_queue, skb);
649         sk_charge_skb(sk, skb);
650         if (!sk->sk_send_head)
651                 sk->sk_send_head = skb;
652         else if (tp->nonagle&TCP_NAGLE_PUSH)
653                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
654 }
655
656 static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
657                                 struct sk_buff *skb)
658 {
659         if (flags & MSG_OOB) {
660                 tp->urg_mode = 1;
661                 tp->snd_up = tp->write_seq;
662                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
663         }
664 }
665
666 static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
667                             int mss_now, int nonagle)
668 {
669         if (sk->sk_send_head) {
670                 struct sk_buff *skb = sk->sk_write_queue.prev;
671                 if (!(flags & MSG_MORE) || forced_push(tp))
672                         tcp_mark_push(tp, skb);
673                 tcp_mark_urg(tp, flags, skb);
674                 __tcp_push_pending_frames(sk, tp, mss_now,
675                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
676         }
677 }
678
679 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
680                          size_t psize, int flags)
681 {
682         struct tcp_opt *tp = tcp_sk(sk);
683         int mss_now;
684         int err;
685         ssize_t copied;
686         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
687
688         /* Wait for a connection to finish. */
689         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
690                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
691                         goto out_err;
692
693         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
694
695         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
696         copied = 0;
697
698         err = -EPIPE;
699         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
700                 goto do_error;
701
702         while (psize > 0) {
703                 struct sk_buff *skb = sk->sk_write_queue.prev;
704                 struct page *page = pages[poffset / PAGE_SIZE];
705                 int copy, i;
706                 int offset = poffset % PAGE_SIZE;
707                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
708
709                 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
710 new_segment:
711                         if (!sk_stream_memory_free(sk))
712                                 goto wait_for_sndbuf;
713
714                         skb = sk_stream_alloc_pskb(sk, 0, tp->mss_cache,
715                                                    sk->sk_allocation);
716                         if (!skb)
717                                 goto wait_for_memory;
718
719                         skb_entail(sk, tp, skb);
720                         copy = mss_now;
721                 }
722
723                 if (copy > size)
724                         copy = size;
725
726                 i = skb_shinfo(skb)->nr_frags;
727                 if (skb_can_coalesce(skb, i, page, offset)) {
728                         skb_shinfo(skb)->frags[i - 1].size += copy;
729                 } else if (i < MAX_SKB_FRAGS) {
730                         get_page(page);
731                         skb_fill_page_desc(skb, i, page, offset, copy);
732                 } else {
733                         tcp_mark_push(tp, skb);
734                         goto new_segment;
735                 }
736
737                 skb->len += copy;
738                 skb->data_len += copy;
739                 skb->ip_summed = CHECKSUM_HW;
740                 tp->write_seq += copy;
741                 TCP_SKB_CB(skb)->end_seq += copy;
742
743                 if (!copied)
744                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
745
746                 copied += copy;
747                 poffset += copy;
748                 if (!(psize -= copy))
749                         goto out;
750
751                 if (skb->len != mss_now || (flags & MSG_OOB))
752                         continue;
753
754                 if (forced_push(tp)) {
755                         tcp_mark_push(tp, skb);
756                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
757                 } else if (skb == sk->sk_send_head)
758                         tcp_push_one(sk, mss_now);
759                 continue;
760
761 wait_for_sndbuf:
762                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
763 wait_for_memory:
764                 if (copied)
765                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
766
767                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
768                         goto do_error;
769
770                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
771         }
772
773 out:
774         if (copied)
775                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
776         return copied;
777
778 do_error:
779         if (copied)
780                 goto out;
781 out_err:
782         return sk_stream_error(sk, flags, err);
783 }
784
785 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
786                      size_t size, int flags)
787 {
788         ssize_t res;
789         struct sock *sk = sock->sk;
790
791 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
792
793         if (!(sk->sk_route_caps & NETIF_F_SG) ||
794             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
795                 return sock_no_sendpage(sock, page, offset, size, flags);
796
797 #undef TCP_ZC_CSUM_FLAGS
798
799         lock_sock(sk);
800         TCP_CHECK_TIMER(sk);
801         res = do_tcp_sendpages(sk, &page, offset, size, flags);
802         TCP_CHECK_TIMER(sk);
803         release_sock(sk);
804         return res;
805 }
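
/*
 * Editor's note (illustrative, not part of the original file): tcp_sendpage()
 * is the zero-copy path used by sendfile(2) when the output is a TCP socket,
 * provided the route's device supports scatter-gather and checksum offload;
 * otherwise it falls back to sock_no_sendpage(), which copies through the
 * ordinary sendmsg path.  A hypothetical user-space caller (sock_fd, file_fd
 * and count assumed):
 *
 *	off_t off = 0;
 *	ssize_t n = sendfile(sock_fd, file_fd, &off, count);
 */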
806
807 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
808 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
809
810 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
811 {
812         int tmp = tp->mss_cache_std;
813
814         if (sk->sk_route_caps & NETIF_F_SG) {
815                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
816
817                 if (tmp >= pgbreak &&
818                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
819                         tmp = pgbreak;
820         }
821         return tmp;
822 }
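
/*
 * Editor's note (not part of the original file): select_size() chooses how
 * much linear ("head") room to allocate for a fresh segment.  With a typical
 * Ethernet MSS of roughly 1448 bytes and 4KB pages, mss_cache_std is well
 * below SKB_MAX_HEAD(MAX_TCP_HEADER), so the whole segment fits in the skb
 * head.  Only when the MSS exceeds one page's worth of head room (e.g. jumbo
 * frames) is the allocation clamped to pgbreak, with the remainder going
 * into page fragments.
 */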
823
824 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
825                 size_t size)
826 {
827         struct iovec *iov;
828         struct tcp_opt *tp = tcp_sk(sk);
829         struct sk_buff *skb;
830         int iovlen, flags;
831         int mss_now;
832         int err, copied;
833         long timeo;
834
835         lock_sock(sk);
836         TCP_CHECK_TIMER(sk);
837
838         flags = msg->msg_flags;
839         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
840
841         /* Wait for a connection to finish. */
842         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
843                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
844                         goto out_err;
845
846         /* This should be in poll */
847         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
848
849         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
850
851         /* Ok commence sending. */
852         iovlen = msg->msg_iovlen;
853         iov = msg->msg_iov;
854         copied = 0;
855
856         err = -EPIPE;
857         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
858                 goto do_error;
859
860         while (--iovlen >= 0) {
861                 int seglen = iov->iov_len;
862                 unsigned char __user *from = iov->iov_base;
863
864                 iov++;
865
866                 while (seglen > 0) {
867                         int copy;
868
869                         skb = sk->sk_write_queue.prev;
870
871                         if (!sk->sk_send_head ||
872                             (copy = mss_now - skb->len) <= 0) {
873
874 new_segment:
875                                 /* Allocate new segment. If the interface is SG,
876                                  * allocate skb fitting to single page.
877                                  */
878                                 if (!sk_stream_memory_free(sk))
879                                         goto wait_for_sndbuf;
880
881                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
882                                                            0, sk->sk_allocation);
883                                 if (!skb)
884                                         goto wait_for_memory;
885
886                                 /*
887                                  * Check whether we can use HW checksum.
888                                  */
889                                 if (sk->sk_route_caps &
890                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
891                                      NETIF_F_HW_CSUM))
892                                         skb->ip_summed = CHECKSUM_HW;
893
894                                 skb_entail(sk, tp, skb);
895                                 copy = mss_now;
896                         }
897
898                         /* Try to append data to the end of skb. */
899                         if (copy > seglen)
900                                 copy = seglen;
901
902                         /* Where to copy to? */
903                         if (skb_tailroom(skb) > 0) {
904                                 /* We have some space in skb head. Superb! */
905                                 if (copy > skb_tailroom(skb))
906                                         copy = skb_tailroom(skb);
907                                 if ((err = skb_add_data(skb, from, copy)) != 0)
908                                         goto do_fault;
909                         } else {
910                                 int merge = 0;
911                                 int i = skb_shinfo(skb)->nr_frags;
912                                 struct page *page = TCP_PAGE(sk);
913                                 int off = TCP_OFF(sk);
914
915                                 if (skb_can_coalesce(skb, i, page, off) &&
916                                     off != PAGE_SIZE) {
917                                         /* We can extend the last page
918                                          * fragment. */
919                                         merge = 1;
920                                 } else if (i == MAX_SKB_FRAGS ||
921                                            (!i &&
922                                            !(sk->sk_route_caps & NETIF_F_SG))) {
923                                         /* Need to add new fragment and cannot
924                                          * do this because interface is non-SG,
925                                          * or because all the page slots are
926                                          * busy. */
927                                         tcp_mark_push(tp, skb);
928                                         goto new_segment;
929                                 } else if (page) {
930                                         /* If page is cached, align
931                                          * offset to L1 cache boundary
932                                          */
933                                         off = (off + L1_CACHE_BYTES - 1) &
934                                               ~(L1_CACHE_BYTES - 1);
935                                         if (off == PAGE_SIZE) {
936                                                 put_page(page);
937                                                 TCP_PAGE(sk) = page = NULL;
938                                         }
939                                 }
940
941                                 if (!page) {
942                                         /* Allocate new cache page. */
943                                         if (!(page = sk_stream_alloc_page(sk)))
944                                                 goto wait_for_memory;
945                                         off = 0;
946                                 }
947
948                                 if (copy > PAGE_SIZE - off)
949                                         copy = PAGE_SIZE - off;
950
951                                 /* Time to copy data. We are close to
952                                  * the end! */
953                                 err = skb_copy_to_page(sk, from, skb, page,
954                                                        off, copy);
955                                 if (err) {
956                                         /* If this page was new, give it to the
957                                          * socket so it does not get leaked.
958                                          */
959                                         if (!TCP_PAGE(sk)) {
960                                                 TCP_PAGE(sk) = page;
961                                                 TCP_OFF(sk) = 0;
962                                         }
963                                         goto do_error;
964                                 }
965
966                                 /* Update the skb. */
967                                 if (merge) {
968                                         skb_shinfo(skb)->frags[i - 1].size +=
969                                                                         copy;
970                                 } else {
971                                         skb_fill_page_desc(skb, i, page, off, copy);
972                                         if (TCP_PAGE(sk)) {
973                                                 get_page(page);
974                                         } else if (off + copy < PAGE_SIZE) {
975                                                 get_page(page);
976                                                 TCP_PAGE(sk) = page;
977                                         }
978                                 }
979
980                                 TCP_OFF(sk) = off + copy;
981                         }
982
983                         if (!copied)
984                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
985
986                         tp->write_seq += copy;
987                         TCP_SKB_CB(skb)->end_seq += copy;
988
989                         from += copy;
990                         copied += copy;
991                         if ((seglen -= copy) == 0 && iovlen == 0)
992                                 goto out;
993
994                         if (skb->len != mss_now || (flags & MSG_OOB))
995                                 continue;
996
997                         if (forced_push(tp)) {
998                                 tcp_mark_push(tp, skb);
999                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1000                         } else if (skb == sk->sk_send_head)
1001                                 tcp_push_one(sk, mss_now);
1002                         continue;
1003
1004 wait_for_sndbuf:
1005                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1006 wait_for_memory:
1007                         if (copied)
1008                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1009
1010                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1011                                 goto do_error;
1012
1013                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1014                 }
1015         }
1016
1017 out:
1018         if (copied)
1019                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1020         TCP_CHECK_TIMER(sk);
1021         release_sock(sk);
1022         return copied;
1023
1024 do_fault:
1025         if (!skb->len) {
1026                 if (sk->sk_send_head == skb)
1027                         sk->sk_send_head = NULL;
1028                 __skb_unlink(skb, skb->list);
1029                 sk_stream_free_skb(sk, skb);
1030         }
1031
1032 do_error:
1033         if (copied)
1034                 goto out;
1035 out_err:
1036         err = sk_stream_error(sk, flags, err);
1037         TCP_CHECK_TIMER(sk);
1038         release_sock(sk);
1039         return err;
1040 }
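
/*
 * Editor's note (illustrative, not part of the original file): tcp_sendmsg()
 * is where write(2), send(2) and sendmsg(2) on a TCP socket end up.  A
 * minimal, hypothetical caller (sock_fd assumed connected):
 *
 *	const char msg[] = "hello";
 *	ssize_t n = send(sock_fd, msg, sizeof(msg) - 1, 0);
 *
 * MSG_OOB makes the data urgent (see tcp_mark_urg()); MSG_DONTWAIT turns the
 * sk_stream_wait_*() calls above into an immediate -EAGAIN instead of a
 * sleep.
 */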
1041
1042 /*
1043  *      Handle reading urgent data. BSD has very simple semantics for
1044  *      this, no blocking and very strange errors 8)
1045  */
1046
1047 static int tcp_recv_urg(struct sock *sk, long timeo,
1048                         struct msghdr *msg, int len, int flags,
1049                         int *addr_len)
1050 {
1051         struct tcp_opt *tp = tcp_sk(sk);
1052
1053         /* No URG data to read. */
1054         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1055             tp->urg_data == TCP_URG_READ)
1056                 return -EINVAL; /* Yes this is right ! */
1057
1058         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1059                 return -ENOTCONN;
1060
1061         if (tp->urg_data & TCP_URG_VALID) {
1062                 int err = 0;
1063                 char c = tp->urg_data;
1064
1065                 if (!(flags & MSG_PEEK))
1066                         tp->urg_data = TCP_URG_READ;
1067
1068                 /* Read urgent data. */
1069                 msg->msg_flags |= MSG_OOB;
1070
1071                 if (len > 0) {
1072                         if (!(flags & MSG_TRUNC))
1073                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1074                         len = 1;
1075                 } else
1076                         msg->msg_flags |= MSG_TRUNC;
1077
1078                 return err ? -EFAULT : len;
1079         }
1080
1081         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1082                 return 0;
1083
1084         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1085          * the available implementations agree in this case:
1086          * this call should never block, independent of the
1087          * blocking state of the socket.
1088          * Mike <pall@rz.uni-karlsruhe.de>
1089          */
1090         return -EAGAIN;
1091 }
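
/*
 * Editor's note (illustrative, not part of the original file): the single
 * byte handled above is what a user-space recv() with MSG_OOB sees when
 * SO_OOBINLINE is off.  A hypothetical caller (sock_fd assumed,
 * process_urgent_byte() is a made-up handler):
 *
 *	char oob;
 *
 *	if (recv(sock_fd, &oob, 1, MSG_OOB) == 1)
 *		process_urgent_byte(oob);
 *
 * As the comment above notes, the call never blocks: with no urgent data
 * available it fails immediately (EINVAL or EAGAIN).
 */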
1092
1093 /* Clean up the receive buffer for full frames taken by the user,
1094  * then send an ACK if necessary.  COPIED is the number of bytes
1095  * tcp_recvmsg has given to the user so far, it speeds up the
1096  * calculation of whether or not we must ACK for the sake of
1097  * a window update.
1098  */
1099 static void cleanup_rbuf(struct sock *sk, int copied)
1100 {
1101         struct tcp_opt *tp = tcp_sk(sk);
1102         int time_to_ack = 0;
1103
1104 #if TCP_DEBUG
1105         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1106
1107         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1108 #endif
1109
1110         if (tcp_ack_scheduled(tp)) {
1111                    /* Delayed ACKs frequently hit locked sockets during bulk
1112                     * receive. */
1113                 if (tp->ack.blocked ||
1114                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1115                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1116                     /*
1117                      * If this read emptied read buffer, we send ACK, if
1118                      * connection is not bidirectional, user drained
1119                      * receive buffer and there was a small segment
1120                      * in queue.
1121                      */
1122                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1123                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1124                         time_to_ack = 1;
1125         }
1126
1127         /* We send an ACK if we can now advertise a non-zero window
1128          * which has been raised "significantly".
1129          *
1130          * Even if window raised up to infinity, do not send window open ACK
1131          * in states, where we will not receive more. It is useless.
1132          */
1133         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1134                 __u32 rcv_window_now = tcp_receive_window(tp);
1135
1136                 /* Optimize, __tcp_select_window() is not cheap. */
1137                 if (2*rcv_window_now <= tp->window_clamp) {
1138                         __u32 new_window = __tcp_select_window(sk);
1139
1140                         /* Send ACK now, if this read freed lots of space
1141                          * in our buffer. new_window is the window we would
1142                          * advertise now; we only do so if it is at least
1143                          * twice the current one ("lots" means "at least twice").
1144                          */
1145                         if (new_window && new_window >= 2 * rcv_window_now)
1146                                 time_to_ack = 1;
1147                 }
1148         }
1149         if (time_to_ack)
1150                 tcp_send_ack(sk);
1151 }
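
/*
 * Editor's note (not part of the original file): the window test above means
 * a window-update ACK is sent only when reading freed a lot of space.  For
 * example, if window_clamp is 64KB and the currently advertised window has
 * shrunk to 16KB, a read that lets __tcp_select_window() offer 32KB or more
 * triggers an immediate ACK; smaller improvements are left to the normal
 * delayed-ACK machinery.
 */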
1152
1153 static void tcp_prequeue_process(struct sock *sk)
1154 {
1155         struct sk_buff *skb;
1156         struct tcp_opt *tp = tcp_sk(sk);
1157
1158         NET_ADD_STATS_USER(TCPPrequeued, skb_queue_len(&tp->ucopy.prequeue));
1159
1160         /* RX process wants to run with disabled BHs, though it is not
1161          * necessary */
1162         local_bh_disable();
1163         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1164                 sk->sk_backlog_rcv(sk, skb);
1165         local_bh_enable();
1166
1167         /* Clear memory counter. */
1168         tp->ucopy.memory = 0;
1169 }
1170
1171 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1172 {
1173         struct sk_buff *skb;
1174         u32 offset;
1175
1176         skb_queue_walk(&sk->sk_receive_queue, skb) {
1177                 offset = seq - TCP_SKB_CB(skb)->seq;
1178                 if (skb->h.th->syn)
1179                         offset--;
1180                 if (offset < skb->len || skb->h.th->fin) {
1181                         *off = offset;
1182                         return skb;
1183                 }
1184         }
1185         return NULL;
1186 }
1187
1188 /*
1189  * This routine provides an alternative to tcp_recvmsg() for routines
1190  * that would like to handle copying from skbuffs directly in 'sendfile'
1191  * fashion.
1192  * Note:
1193  *      - It is assumed that the socket was locked by the caller.
1194  *      - The routine does not block.
1195  *      - At present, there is no support for reading OOB data
1196  *        or for 'peeking' the socket using this routine
1197  *        (although both would be easy to implement).
1198  */
1199 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1200                   sk_read_actor_t recv_actor)
1201 {
1202         struct sk_buff *skb;
1203         struct tcp_opt *tp = tcp_sk(sk);
1204         u32 seq = tp->copied_seq;
1205         u32 offset;
1206         int copied = 0;
1207
1208         if (sk->sk_state == TCP_LISTEN)
1209                 return -ENOTCONN;
1210         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1211                 if (offset < skb->len) {
1212                         size_t used, len;
1213
1214                         len = skb->len - offset;
1215                         /* Stop reading if we hit a patch of urgent data */
1216                         if (tp->urg_data) {
1217                                 u32 urg_offset = tp->urg_seq - seq;
1218                                 if (urg_offset < len)
1219                                         len = urg_offset;
1220                                 if (!len)
1221                                         break;
1222                         }
1223                         used = recv_actor(desc, skb, offset, len);
1224                         if (used <= len) {
1225                                 seq += used;
1226                                 copied += used;
1227                                 offset += used;
1228                         }
1229                         if (offset != skb->len)
1230                                 break;
1231                 }
1232                 if (skb->h.th->fin) {
1233                         sk_eat_skb(sk, skb);
1234                         ++seq;
1235                         break;
1236                 }
1237                 sk_eat_skb(sk, skb);
1238                 if (!desc->count)
1239                         break;
1240         }
1241         tp->copied_seq = seq;
1242
1243         tcp_rcv_space_adjust(sk);
1244
1245         /* Clean up data we have read: This will do ACK frames. */
1246         if (copied)
1247                 cleanup_rbuf(sk, copied);
1248         return copied;
1249 }
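
/*
 * Editor's note (illustrative sketch, not part of the original file):
 * recv_actor is the callback that consumes in-sequence data straight from
 * the skbs (SUNRPC's transport code is one in-tree user).  Under the 2.6-era
 * sk_read_actor_t signature, a minimal hypothetical actor that just counts
 * bytes could look like:
 *
 *	static int count_actor(read_descriptor_t *desc, struct sk_buff *skb,
 *			       unsigned int offset, size_t len)
 *	{
 *		size_t used = min_t(size_t, len, desc->count);
 *
 *		desc->count -= used;
 *		desc->written += used;
 *		return used;
 *	}
 *
 * Returning less than len stops the walk in the loop above, and once
 * desc->count reaches zero the loop exits after the current skb is eaten.
 */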
1250
1251 /*
1252  *      This routine copies from a sock struct into the user buffer.
1253  *
 *      Technical note: in 2.3 we work on a _locked_ socket, so that
 *      tricks with *seq access order and skb->users are not required.
 *      The code can probably be improved even further.
1257  */
1258
1259 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1260                 size_t len, int nonblock, int flags, int *addr_len)
1261 {
1262         struct tcp_opt *tp = tcp_sk(sk);
1263         int copied = 0;
1264         u32 peek_seq;
1265         u32 *seq;
1266         unsigned long used;
1267         int err;
1268         int target;             /* Read at least this many bytes */
1269         long timeo;
1270         struct task_struct *user_recv = NULL;
1271
1272         lock_sock(sk);
1273
1274         TCP_CHECK_TIMER(sk);
1275
1276         err = -ENOTCONN;
1277         if (sk->sk_state == TCP_LISTEN)
1278                 goto out;
1279
1280         timeo = sock_rcvtimeo(sk, nonblock);
1281
1282         /* Urgent data needs to be handled specially. */
1283         if (flags & MSG_OOB)
1284                 goto recv_urg;
1285
1286         seq = &tp->copied_seq;
1287         if (flags & MSG_PEEK) {
1288                 peek_seq = tp->copied_seq;
1289                 seq = &peek_seq;
1290         }
1291
1292         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1293
1294         do {
1295                 struct sk_buff *skb;
1296                 u32 offset;
1297
1298                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1299                 if (tp->urg_data && tp->urg_seq == *seq) {
1300                         if (copied)
1301                                 break;
1302                         if (signal_pending(current)) {
1303                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1304                                 break;
1305                         }
1306                 }
1307
1308                 /* Next get a buffer. */
1309
1310                 skb = skb_peek(&sk->sk_receive_queue);
1311                 do {
1312                         if (!skb)
1313                                 break;
1314
1315                         /* Now that we have two receive queues this
1316                          * shouldn't happen.
1317                          */
1318                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1319                                 printk(KERN_INFO "recvmsg bug: copied %X "
1320                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1321                                 break;
1322                         }
1323                         offset = *seq - TCP_SKB_CB(skb)->seq;
1324                         if (skb->h.th->syn)
1325                                 offset--;
1326                         if (offset < skb->len)
1327                                 goto found_ok_skb;
1328                         if (skb->h.th->fin)
1329                                 goto found_fin_ok;
1330                         BUG_TRAP(flags & MSG_PEEK);
1331                         skb = skb->next;
1332                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1333
                /* Well, if we have backlog, try to process it now. */
1335
1336                 if (copied >= target && !sk->sk_backlog.tail)
1337                         break;
1338
1339                 if (copied) {
1340                         if (sk->sk_err ||
1341                             sk->sk_state == TCP_CLOSE ||
1342                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1343                             !timeo ||
1344                             signal_pending(current) ||
1345                             (flags & MSG_PEEK))
1346                                 break;
1347                 } else {
1348                         if (sock_flag(sk, SOCK_DONE))
1349                                 break;
1350
1351                         if (sk->sk_err) {
1352                                 copied = sock_error(sk);
1353                                 break;
1354                         }
1355
1356                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1357                                 break;
1358
1359                         if (sk->sk_state == TCP_CLOSE) {
1360                                 if (!sock_flag(sk, SOCK_DONE)) {
                                        /* This occurs when the user tries to
                                         * read from a never-connected socket.
                                         */
1364                                         copied = -ENOTCONN;
1365                                         break;
1366                                 }
1367                                 break;
1368                         }
1369
1370                         if (!timeo) {
1371                                 copied = -EAGAIN;
1372                                 break;
1373                         }
1374
1375                         if (signal_pending(current)) {
1376                                 copied = sock_intr_errno(timeo);
1377                                 break;
1378                         }
1379                 }
1380
1381                 cleanup_rbuf(sk, copied);
1382
1383                 if (tp->ucopy.task == user_recv) {
1384                         /* Install new reader */
1385                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1386                                 user_recv = current;
1387                                 tp->ucopy.task = user_recv;
1388                                 tp->ucopy.iov = msg->msg_iov;
1389                         }
1390
1391                         tp->ucopy.len = len;
1392
1393                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1394                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1395
                        /* Ugly... If the prequeue is not empty, we have to
                         * process it before releasing the socket; otherwise
                         * ordering would be broken on the second iteration.
                         * A more elegant solution is required!
                         *
                         * Look: we have the following (pseudo)queues:
                         *
                         * 1. packets in flight
                         * 2. backlog
                         * 3. prequeue
                         * 4. receive_queue
                         *
                         * Each queue can be processed only if the next ones
                         * are empty. At this point we have an empty
                         * receive_queue, but the prequeue _can_ be non-empty
                         * after the 2nd iteration, when we jumped back to the
                         * start of the loop because backlog processing added
                         * something to the receive_queue. We cannot
                         * release_sock(), because the backlog contains packets
                         * that arrived _after_ the prequeued ones.
                         *
                         * In short, the algorithm is to process all the queues
                         * in order. We could do it more directly, requeueing
                         * packets from the backlog to the prequeue when the
                         * latter is not empty. That would be more elegant, but
                         * eats cycles, unfortunately.
                         */
1422                         if (skb_queue_len(&tp->ucopy.prequeue))
1423                                 goto do_prequeue;
1424
1425                         /* __ Set realtime policy in scheduler __ */
1426                 }
1427
1428                 if (copied >= target) {
1429                         /* Do not sleep, just process backlog. */
1430                         release_sock(sk);
1431                         lock_sock(sk);
1432                 } else
1433                         sk_wait_data(sk, &timeo);
1434
1435                 if (user_recv) {
1436                         int chunk;
1437
1438                         /* __ Restore normal policy in scheduler __ */
1439
1440                         if ((chunk = len - tp->ucopy.len) != 0) {
1441                                 NET_ADD_STATS_USER(TCPDirectCopyFromBacklog, chunk);
1442                                 len -= chunk;
1443                                 copied += chunk;
1444                         }
1445
1446                         if (tp->rcv_nxt == tp->copied_seq &&
1447                             skb_queue_len(&tp->ucopy.prequeue)) {
1448 do_prequeue:
1449                                 tcp_prequeue_process(sk);
1450
1451                                 if ((chunk = len - tp->ucopy.len) != 0) {
1452                                         NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1453                                         len -= chunk;
1454                                         copied += chunk;
1455                                 }
1456                         }
1457                 }
1458                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1459                         if (net_ratelimit())
1460                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1461                                        current->comm, current->pid);
1462                         peek_seq = tp->copied_seq;
1463                 }
1464                 continue;
1465
1466         found_ok_skb:
1467                 /* Ok so how much can we use? */
1468                 used = skb->len - offset;
1469                 if (len < used)
1470                         used = len;
1471
1472                 /* Do we have urgent data here? */
1473                 if (tp->urg_data) {
1474                         u32 urg_offset = tp->urg_seq - *seq;
1475                         if (urg_offset < used) {
1476                                 if (!urg_offset) {
1477                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1478                                                 ++*seq;
1479                                                 offset++;
1480                                                 used--;
1481                                                 if (!used)
1482                                                         goto skip_copy;
1483                                         }
1484                                 } else
1485                                         used = urg_offset;
1486                         }
1487                 }
1488
1489                 if (!(flags & MSG_TRUNC)) {
1490                         err = skb_copy_datagram_iovec(skb, offset,
1491                                                       msg->msg_iov, used);
1492                         if (err) {
1493                                 /* Exception. Bailout! */
1494                                 if (!copied)
1495                                         copied = -EFAULT;
1496                                 break;
1497                         }
1498                 }
1499
1500                 *seq += used;
1501                 copied += used;
1502                 len -= used;
1503
1504                 tcp_rcv_space_adjust(sk);
1505
1506 skip_copy:
1507                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1508                         tp->urg_data = 0;
1509                         tcp_fast_path_check(sk, tp);
1510                 }
1511                 if (used + offset < skb->len)
1512                         continue;
1513
1514                 if (skb->h.th->fin)
1515                         goto found_fin_ok;
1516                 if (!(flags & MSG_PEEK))
1517                         sk_eat_skb(sk, skb);
1518                 continue;
1519
1520         found_fin_ok:
1521                 /* Process the FIN. */
1522                 ++*seq;
1523                 if (!(flags & MSG_PEEK))
1524                         sk_eat_skb(sk, skb);
1525                 break;
1526         } while (len > 0);
1527
1528         if (user_recv) {
1529                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1530                         int chunk;
1531
1532                         tp->ucopy.len = copied > 0 ? len : 0;
1533
1534                         tcp_prequeue_process(sk);
1535
1536                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1537                                 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1538                                 len -= chunk;
1539                                 copied += chunk;
1540                         }
1541                 }
1542
1543                 tp->ucopy.task = NULL;
1544                 tp->ucopy.len = 0;
1545         }
1546
        /* According to UNIX98, msg_name/msg_namelen are ignored
         * on a connected socket. I was just happy when I found this 8) --ANK
         */
1550
1551         /* Clean up data we have read: This will do ACK frames. */
1552         cleanup_rbuf(sk, copied);
1553
1554         TCP_CHECK_TIMER(sk);
1555         release_sock(sk);
1556         return copied;
1557
1558 out:
1559         TCP_CHECK_TIMER(sk);
1560         release_sock(sk);
1561         return err;
1562
1563 recv_urg:
1564         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1565         goto out;
1566 }
1567
1568 /*
1569  *      State processing on a close. This implements the state shift for
1570  *      sending our FIN frame. Note that we only send a FIN for some
1571  *      states. A shutdown() may have already sent the FIN, or we may be
1572  *      closed.
1573  */
1574
1575 static unsigned char new_state[16] = {
1576   /* current state:        new state:      action:      */
1577   /* (Invalid)          */ TCP_CLOSE,
1578   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1579   /* TCP_SYN_SENT       */ TCP_CLOSE,
1580   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1581   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1582   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1583   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1584   /* TCP_CLOSE          */ TCP_CLOSE,
1585   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1586   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1587   /* TCP_LISTEN         */ TCP_CLOSE,
1588   /* TCP_CLOSING        */ TCP_CLOSING,
1589 };
1590
1591 static int tcp_close_state(struct sock *sk)
1592 {
1593         int next = (int)new_state[sk->sk_state];
1594         int ns = next & TCP_STATE_MASK;
1595
1596         tcp_set_state(sk, ns);
1597
1598         return next & TCP_ACTION_FIN;
1599 }
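
/*
 * For example, a close() on a TCP_ESTABLISHED socket looks up
 * new_state[TCP_ESTABLISHED] == TCP_FIN_WAIT1 | TCP_ACTION_FIN, so
 * tcp_close_state() moves the socket to FIN_WAIT1 and returns non-zero,
 * telling the caller to emit a FIN.  From TCP_CLOSE_WAIT the table yields
 * TCP_LAST_ACK | TCP_ACTION_FIN, while from TCP_FIN_WAIT2 or TCP_TIME_WAIT
 * the action bit is clear and no further FIN is sent.
 */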
1600
/*
 *      Shut down the sending side of a connection. Much like close except
 *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
 */
1605
1606 void tcp_shutdown(struct sock *sk, int how)
1607 {
1608         /*      We need to grab some memory, and put together a FIN,
1609          *      and then put it into the queue to be sent.
1610          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1611          */
1612         if (!(how & SEND_SHUTDOWN))
1613                 return;
1614
1615         /* If we've already sent a FIN, or it's a closed state, skip this. */
1616         if ((1 << sk->sk_state) &
1617             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1618              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1619                 /* Clear out any half completed packets.  FIN if needed. */
1620                 if (tcp_close_state(sk))
1621                         tcp_send_fin(sk);
1622         }
1623 }
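
/*
 * From user space this path is typically reached via shutdown(2), e.g.:
 *
 *      write(fd, request, request_len);
 *      shutdown(fd, SHUT_WR);          [maps to SEND_SHUTDOWN; FIN queued]
 *      while ((n = read(fd, buf, sizeof(buf))) > 0)
 *              consume(buf, n);        [the read side keeps working]
 *
 * Illustrative sketch only: consume() and the buffers are placeholders.
 */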
1624
1625 /*
1626  * At this point, there should be no process reference to this
1627  * socket, and thus no user references at all.  Therefore we
1628  * can assume the socket waitqueue is inactive and nobody will
1629  * try to jump onto it.
1630  */
1631 void tcp_destroy_sock(struct sock *sk)
1632 {
1633         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1634         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1635
1636         /* It cannot be in hash table! */
1637         BUG_TRAP(sk_unhashed(sk));
1638
        /* If inet_sk(sk)->num is non-zero, it must be bound. */
1640         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1641
1642 #ifdef TCP_DEBUG
1643         if (sk->sk_zapped) {
1644                 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1645                 sock_hold(sk);
1646         }
1647         sk->sk_zapped = 1;
1648 #endif
1649
1650         sk->sk_prot->destroy(sk);
1651
1652         sk_stream_kill_queues(sk);
1653
1654         xfrm_sk_free_policy(sk);
1655
1656 #ifdef INET_REFCNT_DEBUG
1657         if (atomic_read(&sk->sk_refcnt) != 1) {
1658                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1659                        sk, atomic_read(&sk->sk_refcnt));
1660         }
1661 #endif
1662
1663         atomic_dec(&tcp_orphan_count);
1664         sock_put(sk);
1665 }
1666
1667 void tcp_close(struct sock *sk, long timeout)
1668 {
1669         struct sk_buff *skb;
1670         int data_was_unread = 0;
1671
1672         lock_sock(sk);
1673         sk->sk_shutdown = SHUTDOWN_MASK;
1674
1675         if (sk->sk_state == TCP_LISTEN) {
1676                 tcp_set_state(sk, TCP_CLOSE);
1677
1678                 /* Special case. */
1679                 tcp_listen_stop(sk);
1680
1681                 goto adjudge_to_death;
1682         }
1683
1684         /*  We need to flush the recv. buffs.  We do this only on the
1685          *  descriptor close, not protocol-sourced closes, because the
1686          *  reader process may not have drained the data yet!
1687          */
1688         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1689                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1690                           skb->h.th->fin;
1691                 data_was_unread += len;
1692                 __kfree_skb(skb);
1693         }
1694
1695         sk_stream_mem_reclaim(sk);
1696
1697         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1698          * 3.10, we send a RST here because data was lost.  To
1699          * witness the awful effects of the old behavior of always
1700          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1701          * a bulk GET in an FTP client, suspend the process, wait
1702          * for the client to advertise a zero window, then kill -9
1703          * the FTP client, wheee...  Note: timeout is always zero
1704          * in such a case.
1705          */
1706         if (data_was_unread) {
1707                 /* Unread data was tossed, zap the connection. */
1708                 NET_INC_STATS_USER(TCPAbortOnClose);
1709                 tcp_set_state(sk, TCP_CLOSE);
1710                 tcp_send_active_reset(sk, GFP_KERNEL);
1711         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1712                 /* Check zero linger _after_ checking for unread data. */
1713                 sk->sk_prot->disconnect(sk, 0);
1714                 NET_INC_STATS_USER(TCPAbortOnData);
1715         } else if (tcp_close_state(sk)) {
1716                 /* We FIN if the application ate all the data before
1717                  * zapping the connection.
1718                  */
1719
1720                 /* RED-PEN. Formally speaking, we have broken TCP state
1721                  * machine. State transitions:
1722                  *
1723                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1724                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1725                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1726                  *
                 * are legal only when the FIN has actually been sent (i.e. is
                 * in window), rather than queued out of window. Purists may
                 * blame us.
                 *
                 * E.g. the "RFC state" is still ESTABLISHED if the Linux state
                 * is FIN-WAIT-1 but the FIN has not been sent yet.
                 *
                 * The visible deviations are that we sometimes enter the
                 * time-wait state when it is not really required (harmless),
                 * and that we do not send active resets when the specs require
                 * them (i.e. in TCP_ESTABLISHED and TCP_CLOSE_WAIT, which look
                 * like CLOSING or LAST_ACK to Linux).
                 * I have probably missed a few more small holes.
                 *                                              --ANK
1740                  */
1741                 tcp_send_fin(sk);
1742         }
1743
1744         sk_stream_wait_close(sk, timeout);
1745
1746 adjudge_to_death:
1747         /* It is the last release_sock in its life. It will remove backlog. */
1748         release_sock(sk);
1749
1750
        /* Now socket is owned by kernel and we acquire BH lock
         * to finish close. No need to check for user refs.
         */
1754         local_bh_disable();
1755         bh_lock_sock(sk);
1756         BUG_TRAP(!sock_owned_by_user(sk));
1757
1758         sock_hold(sk);
1759         sock_orphan(sk);
1760
        /*      This is a (useful) BSD violation of the RFC. There is a
         *      problem with TCP as specified in that the other end could
         *      keep a socket open forever with no application left at this
         *      end. We use a 3 minute timeout (about the same as BSD) and
         *      then kill our end. If they send after that then tough - BUT:
         *      it is long enough that we won't repeat the old "4*rto =
         *      almost no time - whoops reset" mistake.
         *
         *      Nope, it was not a mistake. It is really the desired
         *      behaviour, e.g. on HTTP servers, where such sockets are
         *      useless but consume significant resources. Let's do it with
         *      the special linger2 option.                     --ANK
         */
1774
1775         if (sk->sk_state == TCP_FIN_WAIT2) {
1776                 struct tcp_opt *tp = tcp_sk(sk);
1777                 if (tp->linger2 < 0) {
1778                         tcp_set_state(sk, TCP_CLOSE);
1779                         tcp_send_active_reset(sk, GFP_ATOMIC);
1780                         NET_INC_STATS_BH(TCPAbortOnLinger);
1781                 } else {
1782                         int tmo = tcp_fin_time(tp);
1783
1784                         if (tmo > TCP_TIMEWAIT_LEN) {
1785                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1786                         } else {
1787                                 atomic_inc(&tcp_orphan_count);
1788                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1789                                 goto out;
1790                         }
1791                 }
1792         }
1793         if (sk->sk_state != TCP_CLOSE) {
1794                 sk_stream_mem_reclaim(sk);
1795                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1796                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1797                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1798                         if (net_ratelimit())
                                printk(KERN_INFO "TCP: too many orphaned "
                                       "sockets\n");
1801                         tcp_set_state(sk, TCP_CLOSE);
1802                         tcp_send_active_reset(sk, GFP_ATOMIC);
1803                         NET_INC_STATS_BH(TCPAbortOnMemory);
1804                 }
1805         }
1806         atomic_inc(&tcp_orphan_count);
1807
1808         if (sk->sk_state == TCP_CLOSE)
1809                 tcp_destroy_sock(sk);
1810         /* Otherwise, socket is reprieved until protocol close. */
1811
1812 out:
1813         bh_unlock_sock(sk);
1814         local_bh_enable();
1815         sock_put(sk);
1816 }
1817
1818 /* These states need RST on ABORT according to RFC793 */
1819
1820 static inline int tcp_need_reset(int state)
1821 {
1822         return (1 << state) &
1823                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1824                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1825 }
1826
1827 int tcp_disconnect(struct sock *sk, int flags)
1828 {
1829         struct inet_opt *inet = inet_sk(sk);
1830         struct tcp_opt *tp = tcp_sk(sk);
1831         int err = 0;
1832         int old_state = sk->sk_state;
1833
1834         if (old_state != TCP_CLOSE)
1835                 tcp_set_state(sk, TCP_CLOSE);
1836
1837         /* ABORT function of RFC793 */
1838         if (old_state == TCP_LISTEN) {
1839                 tcp_listen_stop(sk);
1840         } else if (tcp_need_reset(old_state) ||
1841                    (tp->snd_nxt != tp->write_seq &&
1842                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
                /* The last check adjusts for the discrepancy between Linux
                 * and RFC states.
                 */
1846                 tcp_send_active_reset(sk, gfp_any());
1847                 sk->sk_err = ECONNRESET;
1848         } else if (old_state == TCP_SYN_SENT)
1849                 sk->sk_err = ECONNRESET;
1850
1851         tcp_clear_xmit_timers(sk);
1852         __skb_queue_purge(&sk->sk_receive_queue);
1853         sk_stream_writequeue_purge(sk);
1854         __skb_queue_purge(&tp->out_of_order_queue);
1855
1856         inet->dport = 0;
1857
1858         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1859                 inet_reset_saddr(sk);
1860
1861         sk->sk_shutdown = 0;
1862         sock_reset_flag(sk, SOCK_DONE);
1863         tp->srtt = 0;
1864         if ((tp->write_seq += tp->max_window + 2) == 0)
1865                 tp->write_seq = 1;
1866         tp->backoff = 0;
1867         tp->snd_cwnd = 2;
1868         tp->probes_out = 0;
1869         tp->packets_out = 0;
1870         tp->snd_ssthresh = 0x7fffffff;
1871         tp->snd_cwnd_cnt = 0;
1872         tcp_set_ca_state(tp, TCP_CA_Open);
1873         tcp_clear_retrans(tp);
1874         tcp_delack_init(tp);
1875         sk->sk_send_head = NULL;
1876         tp->saw_tstamp = 0;
1877         tcp_sack_reset(tp);
1878         __sk_dst_reset(sk);
1879
1880         BUG_TRAP(!inet->num || tp->bind_hash);
1881
1882         sk->sk_error_report(sk);
1883         return err;
1884 }
1885
1886 /*
1887  *      Wait for an incoming connection, avoid race
1888  *      conditions. This must be called with the socket locked.
1889  */
1890 static int wait_for_connect(struct sock *sk, long timeo)
1891 {
1892         struct tcp_opt *tp = tcp_sk(sk);
1893         DEFINE_WAIT(wait);
1894         int err;
1895
1896         /*
1897          * True wake-one mechanism for incoming connections: only
1898          * one process gets woken up, not the 'whole herd'.
1899          * Since we do not 'race & poll' for established sockets
1900          * anymore, the common case will execute the loop only once.
1901          *
1902          * Subtle issue: "add_wait_queue_exclusive()" will be added
1903          * after any current non-exclusive waiters, and we know that
1904          * it will always _stay_ after any new non-exclusive waiters
1905          * because all non-exclusive waiters are added at the
1906          * beginning of the wait-queue. As such, it's ok to "drop"
1907          * our exclusiveness temporarily when we get woken up without
1908          * having to remove and re-insert us on the wait queue.
1909          */
1910         for (;;) {
1911                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1912                                           TASK_INTERRUPTIBLE);
1913                 release_sock(sk);
1914                 if (!tp->accept_queue)
1915                         timeo = schedule_timeout(timeo);
1916                 lock_sock(sk);
1917                 err = 0;
1918                 if (tp->accept_queue)
1919                         break;
1920                 err = -EINVAL;
1921                 if (sk->sk_state != TCP_LISTEN)
1922                         break;
1923                 err = sock_intr_errno(timeo);
1924                 if (signal_pending(current))
1925                         break;
1926                 err = -EAGAIN;
1927                 if (!timeo)
1928                         break;
1929         }
1930         finish_wait(sk->sk_sleep, &wait);
1931         return err;
1932 }
1933
1934 /*
1935  *      This will accept the next outstanding connection.
1936  */
1937
1938 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1939 {
1940         struct tcp_opt *tp = tcp_sk(sk);
1941         struct open_request *req;
1942         struct sock *newsk;
1943         int error;
1944 #ifdef CONFIG_ACCEPT_QUEUES     
1945         int prev_class = 0;
1946         int first;
1947 #endif
1948
1949         lock_sock(sk);
1950
1951         /* We need to make sure that this socket is listening,
1952          * and that it has something pending.
1953          */
1954         error = -EINVAL;
1955         if (sk->sk_state != TCP_LISTEN)
1956                 goto out;
1957
1958         /* Find already established connection */
1959         if (!tp->accept_queue) {
1960                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
                /* If this is a non-blocking socket, don't sleep. */
1962                 error = -EAGAIN;
1963                 if (!timeo)
1964                         goto out;
1965
1966                 error = wait_for_connect(sk, timeo);
1967                 if (error)
1968                         goto out;
1969         }
1970
1971 #ifndef CONFIG_ACCEPT_QUEUES
1972         req = tp->accept_queue;
1973         if ((tp->accept_queue = req->dl_next) == NULL)
1974                 tp->accept_queue_tail = NULL;
1975         newsk = req->sk;
1976         sk_acceptq_removed(sk);
1977 #else
1978         first = tp->class_index;
        /* We should always have a request queued here; the accept_queue
         * was already checked for NULL above.
         */
        while (!tp->acceptq[first].aq_head) {
                tp->acceptq[first].aq_cnt = 0;
                first = (first + 1) & (NUM_ACCEPT_QUEUES - 1);
        }
        req = tp->acceptq[first].aq_head;
        tp->acceptq[first].aq_qcount--;
        tp->acceptq[first].aq_count++;
        tp->acceptq[first].aq_wait_time += (jiffies - req->acceptq_time_stamp);

        for (prev_class = first - 1; prev_class >= 0; prev_class--)
                if (tp->acceptq[prev_class].aq_tail)
                        break;
        if (prev_class >= 0)
                tp->acceptq[prev_class].aq_tail->dl_next = req->dl_next;
        else
                tp->accept_queue = req->dl_next;

        if (req == tp->acceptq[first].aq_tail)
                tp->acceptq[first].aq_head = tp->acceptq[first].aq_tail = NULL;
        else
                tp->acceptq[first].aq_head = req->dl_next;

        if ((++tp->acceptq[first].aq_cnt) >= tp->acceptq[first].aq_ratio) {
                tp->acceptq[first].aq_cnt = 0;
                tp->class_index = ++first & (NUM_ACCEPT_QUEUES - 1);
        }
2008         newsk = req->sk;
2009         sk_acceptq_removed(sk, req->acceptq_class);
2010 #endif
2011         tcp_openreq_fastfree(req);
2012         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2013         release_sock(sk);
2014         return newsk;
2015
2016 out:
2017         release_sock(sk);
2018         *err = error;
2019         return NULL;
2020 }
2021
2022
2023 /*
2024  *      Socket option code for TCP.
2025  */
2026 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2027                    int optlen)
2028 {
2029         struct tcp_opt *tp = tcp_sk(sk);
2030         int val;
2031         int err = 0;
2032
2033         if (level != SOL_TCP)
2034                 return tp->af_specific->setsockopt(sk, level, optname,
2035                                                    optval, optlen);
2036
2037         if (optlen < sizeof(int))
2038                 return -EINVAL;
2039
2040         if (get_user(val, (int __user *)optval))
2041                 return -EFAULT;
2042
2043         lock_sock(sk);
2044
2045         switch (optname) {
2046         case TCP_MAXSEG:
                /* Values greater than the interface MTU won't take effect.
                 * However, at the point when this call is made we typically
                 * don't yet know which interface is going to be used.
                 */
2050                 if (val < 8 || val > MAX_TCP_WINDOW) {
2051                         err = -EINVAL;
2052                         break;
2053                 }
2054                 tp->user_mss = val;
2055                 break;
2056
2057         case TCP_NODELAY:
2058                 if (val) {
2059                         /* TCP_NODELAY is weaker than TCP_CORK, so that
2060                          * this option on corked socket is remembered, but
2061                          * it is not activated until cork is cleared.
2062                          *
2063                          * However, when TCP_NODELAY is set we make
2064                          * an explicit push, which overrides even TCP_CORK
2065                          * for currently queued segments.
2066                          */
2067                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2068                         tcp_push_pending_frames(sk, tp);
2069                 } else {
2070                         tp->nonagle &= ~TCP_NAGLE_OFF;
2071                 }
2072                 break;
2073
2074         case TCP_CORK:
2075                 /* When set indicates to always queue non-full frames.
2076                  * Later the user clears this option and we transmit
2077                  * any pending partial frames in the queue.  This is
2078                  * meant to be used alongside sendfile() to get properly
2079                  * filled frames when the user (for example) must write
2080                  * out headers with a write() call first and then use
2081                  * sendfile to send out the data parts.
2082                  *
2083                  * TCP_CORK can be set together with TCP_NODELAY and it is
2084                  * stronger than TCP_NODELAY.
2085                  */
2086                 if (val) {
2087                         tp->nonagle |= TCP_NAGLE_CORK;
2088                 } else {
2089                         tp->nonagle &= ~TCP_NAGLE_CORK;
2090                         if (tp->nonagle&TCP_NAGLE_OFF)
2091                                 tp->nonagle |= TCP_NAGLE_PUSH;
2092                         tcp_push_pending_frames(sk, tp);
2093                 }
2094                 break;
2095
2096         case TCP_KEEPIDLE:
2097                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2098                         err = -EINVAL;
2099                 else {
2100                         tp->keepalive_time = val * HZ;
2101                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2102                             !((1 << sk->sk_state) &
2103                               (TCPF_CLOSE | TCPF_LISTEN))) {
2104                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2105                                 if (tp->keepalive_time > elapsed)
2106                                         elapsed = tp->keepalive_time - elapsed;
2107                                 else
2108                                         elapsed = 0;
2109                                 tcp_reset_keepalive_timer(sk, elapsed);
2110                         }
2111                 }
2112                 break;
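        /* For example, with SO_KEEPALIVE already enabled and nothing received
         * for 40 seconds, setting TCP_KEEPIDLE to 60 leaves elapsed == 40 * HZ,
         * so the keepalive timer is re-armed for the remaining 20 seconds
         * rather than a full idle period.
         */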
2113         case TCP_KEEPINTVL:
2114                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2115                         err = -EINVAL;
2116                 else
2117                         tp->keepalive_intvl = val * HZ;
2118                 break;
2119         case TCP_KEEPCNT:
2120                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2121                         err = -EINVAL;
2122                 else
2123                         tp->keepalive_probes = val;
2124                 break;
2125         case TCP_SYNCNT:
2126                 if (val < 1 || val > MAX_TCP_SYNCNT)
2127                         err = -EINVAL;
2128                 else
2129                         tp->syn_retries = val;
2130                 break;
2131
2132         case TCP_LINGER2:
2133                 if (val < 0)
2134                         tp->linger2 = -1;
2135                 else if (val > sysctl_tcp_fin_timeout / HZ)
2136                         tp->linger2 = 0;
2137                 else
2138                         tp->linger2 = val * HZ;
2139                 break;
2140
2141         case TCP_DEFER_ACCEPT:
2142                 tp->defer_accept = 0;
2143                 if (val > 0) {
2144                         /* Translate value in seconds to number of
2145                          * retransmits */
2146                         while (tp->defer_accept < 32 &&
2147                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2148                                        tp->defer_accept))
2149                                 tp->defer_accept++;
2150                         tp->defer_accept++;
2151                 }
2152                 break;
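        /* For example, assuming the usual TCP_TIMEOUT_INIT of 3*HZ: a request
         * of val = 10 seconds loops while 10 > (3 << defer_accept), i.e. past
         * 3 and 6 but not 12, leaving defer_accept at 2; the final increment
         * stores 3, which getsockopt() reports back as (3 << (3 - 1)) = 12
         * seconds, the retransmit span actually covered.
         */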
2153
2154         case TCP_WINDOW_CLAMP:
2155                 if (!val) {
2156                         if (sk->sk_state != TCP_CLOSE) {
2157                                 err = -EINVAL;
2158                                 break;
2159                         }
2160                         tp->window_clamp = 0;
2161                 } else
2162                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2163                                                 SOCK_MIN_RCVBUF / 2 : val;
2164                 break;
2165
2166         case TCP_QUICKACK:
2167                 if (!val) {
2168                         tp->ack.pingpong = 1;
2169                 } else {
2170                         tp->ack.pingpong = 0;
2171                         if ((1 << sk->sk_state) &
2172                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2173                             tcp_ack_scheduled(tp)) {
2174                                 tp->ack.pending |= TCP_ACK_PUSHED;
2175                                 cleanup_rbuf(sk, 1);
2176                                 if (!(val & 1))
2177                                         tp->ack.pingpong = 1;
2178                         }
2179                 }
2180                 break;
2181                 
2182 #ifdef CONFIG_ACCEPT_QUEUES
2183         case TCP_ACCEPTQ_SHARE:
2184 #ifdef CONFIG_CKRM
                /* If CKRM is set then the shares are set through rcfs.
                 * Getting the shares will still succeed.
                 */
2187                 err = -EOPNOTSUPP;
2188                 break;
2189 #else           
2190                 {
                        char share_wt[NUM_ACCEPT_QUEUES] = { 0 };
2192                         int i,j;
2193
                        if (sk->sk_state != TCP_LISTEN) {
                                /* Socket lock is held; don't return directly. */
                                err = -EOPNOTSUPP;
                                break;
                        }

                        /* Never copy more than the on-stack array holds. */
                        if (copy_from_user(share_wt, optval,
                                           min_t(unsigned int, optlen,
                                                 sizeof(share_wt)))) {
                                err = -EFAULT;
                                break;
                        }
2201                         j = 0;
2202                         for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
2203                                 if (share_wt[i]) {
2204                                         if (!j)
2205                                                 j = share_wt[i];
2206                                         else if (share_wt[i] < j) {
2207                                                 j = share_wt[i];
2208                                         }
2209                                 }
2210                                 else
2211                                         tp->acceptq[i].aq_ratio = 0;
2212                                         
2213                         }
2214                         if (j == 0) {
                                /* Class 0 is always valid. If nothing is
                                 * specified, set class 0's share to 1.
                                 */
2218                                 share_wt[0] = 1;
2219                                 j = 1;
2220                         }
2221                         for (i=0; i < NUM_ACCEPT_QUEUES; i++)  {
2222                                 tp->acceptq[i].aq_ratio = share_wt[i]/j;
2223                                 tp->acceptq[i].aq_cnt = 0;
2224                         }
2225                 }
2226                 break;
2227 #endif
2228 #endif
2229         default:
2230                 err = -ENOPROTOOPT;
2231                 break;
2232         };
2233         release_sock(sk);
2234         return err;
2235 }
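
/*
 * User-space sketch (kept under #if 0, never built here) of the TCP_CORK
 * pattern described above: cork the socket, write the headers, hand the
 * payload to sendfile(), then uncork so any final partial frame is pushed.
 * The function, buffers and descriptors are invented for illustration; the
 * setsockopt()/sendfile() calls are the ordinary user-space API.
 */
#if 0
#include <sys/socket.h>
#include <sys/sendfile.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <unistd.h>

static int send_response(int sock, const char *hdr, size_t hdr_len,
                          int file_fd, size_t file_len)
{
        int on = 1, off = 0;

        /* Queue everything, including partial frames, until we uncork. */
        setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));

        if (write(sock, hdr, hdr_len) < 0)
                return -1;
        if (sendfile(sock, file_fd, NULL, file_len) < 0)
                return -1;

        /* Uncork: any pending partial frame is transmitted now. */
        setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
        return 0;
}
#endif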
2236
2237 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2238                    int __user *optlen)
2239 {
2240         struct tcp_opt *tp = tcp_sk(sk);
2241         int val, len;
2242
2243         if (level != SOL_TCP)
2244                 return tp->af_specific->getsockopt(sk, level, optname,
2245                                                    optval, optlen);
2246
2247         if (get_user(len, optlen))
2248                 return -EFAULT;
2249
2250         len = min_t(unsigned int, len, sizeof(int));
2251
2252         if (len < 0)
2253                 return -EINVAL;
2254
2255         switch (optname) {
2256         case TCP_MAXSEG:
2257                 val = tp->mss_cache_std;
2258                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2259                         val = tp->user_mss;
2260                 break;
2261         case TCP_NODELAY:
2262                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2263                 break;
2264         case TCP_CORK:
2265                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2266                 break;
2267         case TCP_KEEPIDLE:
2268                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2269                 break;
2270         case TCP_KEEPINTVL:
2271                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2272                 break;
2273         case TCP_KEEPCNT:
2274                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2275                 break;
2276         case TCP_SYNCNT:
2277                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2278                 break;
2279         case TCP_LINGER2:
2280                 val = tp->linger2;
2281                 if (val >= 0)
2282                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2283                 break;
2284         case TCP_DEFER_ACCEPT:
2285                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2286                                                (tp->defer_accept - 1));
2287                 break;
2288         case TCP_WINDOW_CLAMP:
2289                 val = tp->window_clamp;
2290                 break;
2291         case TCP_INFO: {
2292                 struct tcp_info info;
2293
2294                 if (get_user(len, optlen))
2295                         return -EFAULT;
2296
2297                 tcp_get_info(sk, &info);
2298
2299                 len = min_t(unsigned int, len, sizeof(info));
2300                 if (put_user(len, optlen))
2301                         return -EFAULT;
2302                 if (copy_to_user(optval, &info, len))
2303                         return -EFAULT;
2304                 return 0;
2305         }
2306         case TCP_QUICKACK:
2307                 val = !tp->ack.pingpong;
2308                 break;
2309
2310 #ifdef CONFIG_ACCEPT_QUEUES
2311         case TCP_ACCEPTQ_SHARE: 
2312         {
2313                 struct tcp_acceptq_info tinfo[NUM_ACCEPT_QUEUES];
2314                 int i;
2315
2316                 if (sk->sk_state != TCP_LISTEN)
2317                         return -EOPNOTSUPP;
2318
2319                 if (get_user(len, optlen))
2320                         return -EFAULT;
2321
2322                 memset(tinfo, 0, sizeof(tinfo));
2323
                for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
                        tinfo[i].acceptq_wait_time =
                                jiffies_to_msecs(tp->acceptq[i].aq_wait_time);
                        tinfo[i].acceptq_qcount = tp->acceptq[i].aq_qcount;
                        tinfo[i].acceptq_count = tp->acceptq[i].aq_count;
                        tinfo[i].acceptq_shares = tp->acceptq[i].aq_ratio;
                }
2331
2332                 len = min_t(unsigned int, len, sizeof(tinfo));
2333                 if (put_user(len, optlen)) 
2334                         return -EFAULT;
2335                         
2336                 if (copy_to_user(optval, (char *)tinfo, len))
2337                         return -EFAULT;
2338                 
2339                 return 0;
2340         }
2341         break;
2342 #endif
2343         default:
2344                 return -ENOPROTOOPT;
2345         };
2346
2347         if (put_user(len, optlen))
2348                 return -EFAULT;
2349         if (copy_to_user(optval, &val, len))
2350                 return -EFAULT;
2351         return 0;
2352 }
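
/*
 * User-space sketch (kept under #if 0, never built here) of reading one of
 * the options above.  It assumes the user-space <netinet/tcp.h> exports
 * TCP_INFO and a struct tcp_info with a tcpi_state field; the function name
 * is invented for illustration.
 */
#if 0
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static void dump_tcp_state(int sock)
{
        struct tcp_info info;
        socklen_t len = sizeof(info);

        /* The kernel copies at most 'len' bytes and writes back how much
         * it actually filled in, as tcp_getsockopt() does for TCP_INFO.
         */
        if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
                printf("tcp state %u (%u bytes of tcp_info)\n",
                       info.tcpi_state, (unsigned int)len);
}
#endif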
2353
2354
2355 extern void __skb_cb_too_small_for_tcp(int, int);
2356 extern void tcpdiag_init(void);
2357
2358 static __initdata unsigned long thash_entries;
2359 static int __init set_thash_entries(char *str)
2360 {
2361         if (!str)
2362                 return 0;
2363         thash_entries = simple_strtoul(str, &str, 0);
2364         return 1;
2365 }
2366 __setup("thash_entries=", set_thash_entries);
2367
2368 void __init tcp_init(void)
2369 {
2370         struct sk_buff *skb = NULL;
2371         unsigned long goal;
2372         int order, i;
2373
2374         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2375                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2376                                            sizeof(skb->cb));
2377
2378         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2379                                                    sizeof(struct open_request),
2380                                                0, SLAB_HWCACHE_ALIGN,
2381                                                NULL, NULL);
2382         if (!tcp_openreq_cachep)
2383                 panic("tcp_init: Cannot alloc open_request cache.");
2384
2385         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2386                                               sizeof(struct tcp_bind_bucket),
2387                                               0, SLAB_HWCACHE_ALIGN,
2388                                               NULL, NULL);
2389         if (!tcp_bucket_cachep)
2390                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2391
2392         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2393                                                 sizeof(struct tcp_tw_bucket),
2394                                                 0, SLAB_HWCACHE_ALIGN,
2395                                                 NULL, NULL);
2396         if (!tcp_timewait_cachep)
2397                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2398
2399         /* Size and allocate the main established and bind bucket
2400          * hash tables.
2401          *
2402          * The methodology is similar to that of the buffer cache.
2403          */
2404         if (num_physpages >= (128 * 1024))
2405                 goal = num_physpages >> (21 - PAGE_SHIFT);
2406         else
2407                 goal = num_physpages >> (23 - PAGE_SHIFT);
2408
2409         if (thash_entries)
2410                 goal = (thash_entries * sizeof(struct tcp_ehash_bucket)) >> PAGE_SHIFT;
2411         for (order = 0; (1UL << order) < goal; order++)
2412                 ;
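        /* For example, on a machine with 512 MB of RAM and 4 KB pages
         * (PAGE_SHIFT == 12), num_physpages is 131072, so goal becomes
         * 131072 >> 9 == 256 pages (1 MB) for the established hash, and
         * the loop above settles on order == 8, the first power of two
         * covering that goal.
         */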
2413         do {
2414                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2415                         sizeof(struct tcp_ehash_bucket);
2416                 tcp_ehash_size >>= 1;
2417                 while (tcp_ehash_size & (tcp_ehash_size - 1))
2418                         tcp_ehash_size--;
2419                 tcp_ehash = (struct tcp_ehash_bucket *)
2420                         __get_free_pages(GFP_ATOMIC, order);
2421         } while (!tcp_ehash && --order > 0);
2422
2423         if (!tcp_ehash)
2424                 panic("Failed to allocate TCP established hash table\n");
2425         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2426                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2427                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2428         }
2429
2430         do {
2431                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2432                         sizeof(struct tcp_bind_hashbucket);
2433                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2434                         continue;
2435                 tcp_bhash = (struct tcp_bind_hashbucket *)
2436                         __get_free_pages(GFP_ATOMIC, order);
2437         } while (!tcp_bhash && --order >= 0);
2438
2439         if (!tcp_bhash)
2440                 panic("Failed to allocate TCP bind hash table\n");
2441         for (i = 0; i < tcp_bhash_size; i++) {
2442                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2443                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2444         }
2445
2446         /* Try to be a bit smarter and adjust defaults depending
2447          * on available memory.
2448          */
2449         if (order > 4) {
2450                 sysctl_local_port_range[0] = 32768;
2451                 sysctl_local_port_range[1] = 61000;
2452                 sysctl_tcp_max_tw_buckets = 180000;
2453                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2454                 sysctl_max_syn_backlog = 1024;
2455         } else if (order < 3) {
2456                 sysctl_local_port_range[0] = 1024 * (3 - order);
2457                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2458                 sysctl_tcp_max_orphans >>= (3 - order);
2459                 sysctl_max_syn_backlog = 128;
2460         }
2461         tcp_port_rover = sysctl_local_port_range[0] - 1;
2462
2463         sysctl_tcp_mem[0] =  768 << order;
2464         sysctl_tcp_mem[1] = 1024 << order;
2465         sysctl_tcp_mem[2] = 1536 << order;
2466
2467         if (order < 3) {
2468                 sysctl_tcp_wmem[2] = 64 * 1024;
2469                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2470                 sysctl_tcp_rmem[1] = 43689;
2471                 sysctl_tcp_rmem[2] = 2 * 43689;
2472         }
2473
2474         printk(KERN_INFO "TCP: Hash tables configured "
2475                "(established %d bind %d)\n",
2476                tcp_ehash_size << 1, tcp_bhash_size);
2477
2478         tcpdiag_init();
2479 }
2480
2481 EXPORT_SYMBOL(tcp_accept);
2482 EXPORT_SYMBOL(tcp_close);
2483 EXPORT_SYMBOL(tcp_close_state);
2484 EXPORT_SYMBOL(tcp_destroy_sock);
2485 EXPORT_SYMBOL(tcp_disconnect);
2486 EXPORT_SYMBOL(tcp_getsockopt);
2487 EXPORT_SYMBOL(tcp_ioctl);
2488 EXPORT_SYMBOL(tcp_openreq_cachep);
2489 EXPORT_SYMBOL(tcp_poll);
2490 EXPORT_SYMBOL(tcp_read_sock);
2491 EXPORT_SYMBOL(tcp_recvmsg);
2492 EXPORT_SYMBOL(tcp_sendmsg);
2493 EXPORT_SYMBOL(tcp_sendpage);
2494 EXPORT_SYMBOL(tcp_setsockopt);
2495 EXPORT_SYMBOL(tcp_shutdown);
2496 EXPORT_SYMBOL(tcp_statistics);
2497 EXPORT_SYMBOL(tcp_timewait_cachep);