Merge to Fedora Core 2 kernel-2.6.8-1.521
net/ipv4/tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up with retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or (at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
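
/*
 * For quick reference, the usual RFC 793 transition sequences through the
 * states described above are roughly as follows (TCP_CLOSE corresponds to
 * the RFC's CLOSED state):
 *
 *      active open:        CLOSE -> SYN_SENT -> ESTABLISHED
 *      passive open:       LISTEN -> SYN_RECV -> ESTABLISHED
 *      active close:       ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE
 *      passive close:      ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE
 *      simultaneous close: FIN_WAIT1 -> CLOSING -> TIME_WAIT -> CLOSE
 */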
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259
260 #ifdef CONFIG_CKRM
261 #include <linux/ckrm.h>
262 #endif
263
264 #include <net/icmp.h>
265 #include <net/tcp.h>
266 #include <net/xfrm.h>
267 #include <net/ip.h>
268
269
270 #include <asm/uaccess.h>
271 #include <asm/ioctls.h>
272
273 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
274
275 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
276
277 kmem_cache_t *tcp_openreq_cachep;
278 kmem_cache_t *tcp_bucket_cachep;
279 kmem_cache_t *tcp_timewait_cachep;
280
281 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
282
283 int sysctl_tcp_default_win_scale = 7;
284
285 int sysctl_tcp_mem[3];
286 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
287 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
288
289 EXPORT_SYMBOL(sysctl_tcp_mem);
290 EXPORT_SYMBOL(sysctl_tcp_rmem);
291 EXPORT_SYMBOL(sysctl_tcp_wmem);
292
293 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
294 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
295
296 EXPORT_SYMBOL(tcp_memory_allocated);
297 EXPORT_SYMBOL(tcp_sockets_allocated);
298
299 /*
300  * Pressure flag: try to collapse.
301  * Technical note: it is used by multiple contexts non atomically.
302  * All the sk_stream_mem_schedule() is of this nature: accounting
303  * is strict, actions are advisory and have some latency.
304  */
305 int tcp_memory_pressure;
306
307 EXPORT_SYMBOL(tcp_memory_pressure);
308
309 void tcp_enter_memory_pressure(void)
310 {
311         if (!tcp_memory_pressure) {
312                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
313                 tcp_memory_pressure = 1;
314         }
315 }
316
317 EXPORT_SYMBOL(tcp_enter_memory_pressure);
318
319 /*
320  * LISTEN is a special case for poll..
321  */
322 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
323                                                poll_table *wait)
324 {
325         return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
326 }
327
328 /*
329  *      Wait for a TCP event.
330  *
331  *      Note that we don't need to lock the socket, as the upper poll layers
332  *      take care of normal races (between the test and the event) and we don't
333  *      go look at any of the socket buffers directly.
334  */
335 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
336 {
337         unsigned int mask;
338         struct sock *sk = sock->sk;
339         struct tcp_opt *tp = tcp_sk(sk);
340
341         poll_wait(file, sk->sk_sleep, wait);
342         if (sk->sk_state == TCP_LISTEN)
343                 return tcp_listen_poll(sk, wait);
344
345         /* Socket is not locked. We are protected from async events
346            by the poll logic, and correct handling of state changes
347            made by other threads is impossible in any case.
348          */
349
350         mask = 0;
351         if (sk->sk_err)
352                 mask = POLLERR;
353
354         /*
355          * POLLHUP is certainly not done right. But poll() doesn't
356          * have a notion of HUP in just one direction, and for a
357          * socket the read side is more interesting.
358          *
359          * Some poll() documentation says that POLLHUP is incompatible
360          * with the POLLOUT/POLLWR flags, so somebody should check this
361          * all. But careful, it tends to be safer to return too many
362          * bits than too few, and you can easily break real applications
363          * if you don't tell them that something has hung up!
364          *
365          * Check-me.
366          *
367          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
368          * our fs/select.c). It means that after we received EOF,
369          * poll always returns immediately, making it impossible to poll() for
370          * write() in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
371          * if and only if shutdown has been made in both directions.
372          * Actually, it is interesting to look at how Solaris and DUX
373          * solve this dilemma. I would prefer, if POLLHUP were maskable,
374          * then we could set it on SND_SHUTDOWN. BTW the examples given
375          * in Stevens' books assume exactly this behaviour, which explains
376          * why POLLHUP is incompatible with POLLOUT.    --ANK
377          *
378          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
379          * blocking on fresh not-connected or disconnected socket. --ANK
380          */
381         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
382                 mask |= POLLHUP;
383         if (sk->sk_shutdown & RCV_SHUTDOWN)
384                 mask |= POLLIN | POLLRDNORM;
385
386         /* Connected? */
387         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
388                 /* Potential race condition. If the read of tp below
389                  * escapes above the read of sk->sk_state, we can be illegally
390                  * awakened in SYN_* states. */
391                 if ((tp->rcv_nxt != tp->copied_seq) &&
392                     (tp->urg_seq != tp->copied_seq ||
393                      tp->rcv_nxt != tp->copied_seq + 1 ||
394                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
395                         mask |= POLLIN | POLLRDNORM;
396
397                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
398                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
399                                 mask |= POLLOUT | POLLWRNORM;
400                         } else {  /* send SIGIO later */
401                                 set_bit(SOCK_ASYNC_NOSPACE,
402                                         &sk->sk_socket->flags);
403                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
404
405                                 /* Race breaker. If space is freed after
406                                  * wspace test but before the flags are set,
407                                  * IO signal will be lost.
408                                  */
409                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
410                                         mask |= POLLOUT | POLLWRNORM;
411                         }
412                 }
413
414                 if (tp->urg_data & TCP_URG_VALID)
415                         mask |= POLLPRI;
416         }
417         return mask;
418 }
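
/*
 * A minimal user-space sketch (an illustration, not part of this file) of how
 * the mask computed above surfaces through poll(2): POLLIN/POLLRDNORM for
 * readable data or a peer shutdown, POLLOUT/POLLWRNORM for available send
 * space, POLLPRI for valid urgent data, and POLLHUP once both directions are
 * shut down or the socket is closed.
 *
 *      struct pollfd pfd = { .fd = tcp_fd, .events = POLLIN | POLLOUT | POLLPRI };
 *
 *      if (poll(&pfd, 1, timeout_ms) > 0) {
 *              if (pfd.revents & POLLPRI)
 *                      recv(tcp_fd, &oob_byte, 1, MSG_OOB);
 *              if (pfd.revents & (POLLIN | POLLRDNORM))
 *                      nread = read(tcp_fd, buf, sizeof(buf));
 *              if (pfd.revents & POLLHUP)
 *                      close(tcp_fd);
 *      }
 */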
419
420 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
421 {
422         struct tcp_opt *tp = tcp_sk(sk);
423         int answ;
424
425         switch (cmd) {
426         case SIOCINQ:
427                 if (sk->sk_state == TCP_LISTEN)
428                         return -EINVAL;
429
430                 lock_sock(sk);
431                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
432                         answ = 0;
433                 else if (sock_flag(sk, SOCK_URGINLINE) ||
434                          !tp->urg_data ||
435                          before(tp->urg_seq, tp->copied_seq) ||
436                          !before(tp->urg_seq, tp->rcv_nxt)) {
437                         answ = tp->rcv_nxt - tp->copied_seq;
438
439                         /* Subtract 1, if FIN is in queue. */
440                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
441                                 answ -=
442                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
443                 } else
444                         answ = tp->urg_seq - tp->copied_seq;
445                 release_sock(sk);
446                 break;
447         case SIOCATMARK:
448                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
449                 break;
450         case SIOCOUTQ:
451                 if (sk->sk_state == TCP_LISTEN)
452                         return -EINVAL;
453
454                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
455                         answ = 0;
456                 else
457                         answ = tp->write_seq - tp->snd_una;
458                 break;
459         default:
460                 return -ENOIOCTLCMD;
461         };
462
463         return put_user(answ, (int __user *)arg);
464 }
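
/*
 * A user-space sketch (an illustration, not part of this file) of the ioctls
 * handled above: SIOCINQ (an alias of FIONREAD on TCP) reports unread bytes
 * in the receive queue, SIOCOUTQ reports bytes queued but not yet
 * acknowledged, and SIOCATMARK reports whether the read pointer sits at the
 * urgent mark.
 *
 *      int inq, outq, atmark;
 *
 *      ioctl(tcp_fd, SIOCINQ,    &inq);
 *      ioctl(tcp_fd, SIOCOUTQ,   &outq);
 *      ioctl(tcp_fd, SIOCATMARK, &atmark);
 */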
465
466
467 int tcp_listen_start(struct sock *sk)
468 {
469 #ifdef CONFIG_ACCEPT_QUEUES
470         int i = 0;
471 #endif
472         struct inet_opt *inet = inet_sk(sk);
473         struct tcp_opt *tp = tcp_sk(sk);
474         struct tcp_listen_opt *lopt;
475
476         sk->sk_max_ack_backlog = 0;
477         sk->sk_ack_backlog = 0;
478 #ifdef CONFIG_ACCEPT_QUEUES
479         tp->accept_queue = NULL;
480 #else
481         tp->accept_queue = tp->accept_queue_tail = NULL;
482 #endif 
483         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
484         tcp_delack_init(tp);
485
486         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
487         if (!lopt)
488                 return -ENOMEM;
489
490         memset(lopt, 0, sizeof(struct tcp_listen_opt));
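        /* Round the SYN backlog limit up to a power of two (minimum 64);
         * max_qlen_log holds its base-2 logarithm. */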
491         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
492                 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
493                         break;
494         get_random_bytes(&lopt->hash_rnd, 4);
495
496 #ifdef CONFIG_ACCEPT_QUEUES
497         tp->class_index = 0;
498         for (i=0; i < NUM_ACCEPT_QUEUES; i++) {
499                 tp->acceptq[i].aq_tail = NULL;
500                 tp->acceptq[i].aq_head = NULL;
501                 tp->acceptq[i].aq_wait_time = 0; 
502                 tp->acceptq[i].aq_qcount = 0; 
503                 tp->acceptq[i].aq_count = 0; 
504                 if (i == 0) {
505                         tp->acceptq[i].aq_ratio = 1; 
506                 }
507                 else {
508                         tp->acceptq[i].aq_ratio = 0; 
509                 }
510         }
511 #endif
512
513         write_lock_bh(&tp->syn_wait_lock);
514         tp->listen_opt = lopt;
515         write_unlock_bh(&tp->syn_wait_lock);
516
517         /* There is a race window here: we announce ourselves listening,
518          * but this transition is still not validated by get_port().
519          * It is OK, because this socket enters the hash table only
520          * after validation is complete.
521          */
522         sk->sk_state = TCP_LISTEN;
523         if (!sk->sk_prot->get_port(sk, inet->num)) {
524                 inet->sport = htons(inet->num);
525
526                 sk_dst_reset(sk);
527                 sk->sk_prot->hash(sk);
528
529 #ifdef CONFIG_CKRM
530                 ckrm_cb_listen_start(sk);
531 #endif
532
533                 return 0;
534         }
535
536         sk->sk_state = TCP_CLOSE;
537         write_lock_bh(&tp->syn_wait_lock);
538         tp->listen_opt = NULL;
539         write_unlock_bh(&tp->syn_wait_lock);
540         kfree(lopt);
541         return -EADDRINUSE;
542 }
543
544 /*
545  *      This routine closes sockets which have been at least partially
546  *      opened, but not yet accepted.
547  */
548
549 static void tcp_listen_stop (struct sock *sk)
550 {
551         struct tcp_opt *tp = tcp_sk(sk);
552         struct tcp_listen_opt *lopt = tp->listen_opt;
553         struct open_request *acc_req = tp->accept_queue;
554         struct open_request *req;
555         int i;
556
557         tcp_delete_keepalive_timer(sk);
558
559         /* make all the listen_opt local to us */
560         write_lock_bh(&tp->syn_wait_lock);
561         tp->listen_opt = NULL;
562         write_unlock_bh(&tp->syn_wait_lock);
563
564 #ifdef CONFIG_CKRM
565                 ckrm_cb_listen_stop(sk);
566 #endif
567
568 #ifdef CONFIG_ACCEPT_QUEUES
569         for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
570                 tp->acceptq[i].aq_head = tp->acceptq[i].aq_tail = NULL;
571 #else
572         tp->accept_queue_tail = NULL;
573 #endif
574         tp->accept_queue = NULL;
575
576         if (lopt->qlen) {
577                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
578                         while ((req = lopt->syn_table[i]) != NULL) {
579                                 lopt->syn_table[i] = req->dl_next;
580                                 lopt->qlen--;
581                                 tcp_openreq_free(req);
582
583                 /* Following specs, it would be better either to send FIN
584                  * (and enter FIN-WAIT-1, which is the normal close)
585                  * or to send an active reset (abort).
586                  * Certainly, it is pretty dangerous during a synflood, but that is
587                  * a bad justification for our negligence 8)
588                  * To be honest, we are not able to implement either
589                  * of the variants now.                 --ANK
590                  */
591                         }
592                 }
593         }
594         BUG_TRAP(!lopt->qlen);
595
596         kfree(lopt);
597
598         while ((req = acc_req) != NULL) {
599                 struct sock *child = req->sk;
600
601                 acc_req = req->dl_next;
602
603                 local_bh_disable();
604                 bh_lock_sock(child);
605                 BUG_TRAP(!sock_owned_by_user(child));
606                 sock_hold(child);
607
608                 tcp_disconnect(child, O_NONBLOCK);
609
610                 sock_orphan(child);
611
612                 atomic_inc(&tcp_orphan_count);
613
614                 tcp_destroy_sock(child);
615
616                 bh_unlock_sock(child);
617                 local_bh_enable();
618                 sock_put(child);
619
620 #ifdef CONFIG_ACCEPT_QUEUES
621                 sk_acceptq_removed(sk, req->acceptq_class);
622 #else
623                 sk_acceptq_removed(sk);
624 #endif
625                 tcp_openreq_fastfree(req);
626         }
627         BUG_TRAP(!sk->sk_ack_backlog);
628 }
629
630 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
631 {
632         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
633         tp->pushed_seq = tp->write_seq;
634 }
635
636 static inline int forced_push(struct tcp_opt *tp)
637 {
638         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
639 }
640
641 static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
642                               struct sk_buff *skb)
643 {
644         skb->csum = 0;
645         TCP_SKB_CB(skb)->seq = tp->write_seq;
646         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
647         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
648         TCP_SKB_CB(skb)->sacked = 0;
649         __skb_queue_tail(&sk->sk_write_queue, skb);
650         sk_charge_skb(sk, skb);
651         if (!sk->sk_send_head)
652                 sk->sk_send_head = skb;
653         else if (tp->nonagle&TCP_NAGLE_PUSH)
654                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
655 }
656
657 static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
658                                 struct sk_buff *skb)
659 {
660         if (flags & MSG_OOB) {
661                 tp->urg_mode = 1;
662                 tp->snd_up = tp->write_seq;
663                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
664         }
665 }
666
667 static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
668                             int mss_now, int nonagle)
669 {
670         if (sk->sk_send_head) {
671                 struct sk_buff *skb = sk->sk_write_queue.prev;
672                 if (!(flags & MSG_MORE) || forced_push(tp))
673                         tcp_mark_push(tp, skb);
674                 tcp_mark_urg(tp, flags, skb);
675                 __tcp_push_pending_frames(sk, tp, mss_now,
676                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
677         }
678 }
679
680 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
681                          size_t psize, int flags)
682 {
683         struct tcp_opt *tp = tcp_sk(sk);
684         int mss_now;
685         int err;
686         ssize_t copied;
687         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
688
689         /* Wait for a connection to finish. */
690         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
691                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
692                         goto out_err;
693
694         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
695
696         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
697         copied = 0;
698
699         err = -EPIPE;
700         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
701                 goto do_error;
702
703         while (psize > 0) {
704                 struct sk_buff *skb = sk->sk_write_queue.prev;
705                 struct page *page = pages[poffset / PAGE_SIZE];
706                 int copy, i;
707                 int offset = poffset % PAGE_SIZE;
708                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
709
710                 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
711 new_segment:
712                         if (!sk_stream_memory_free(sk))
713                                 goto wait_for_sndbuf;
714
715                         skb = sk_stream_alloc_pskb(sk, 0, tp->mss_cache,
716                                                    sk->sk_allocation);
717                         if (!skb)
718                                 goto wait_for_memory;
719
720                         skb_entail(sk, tp, skb);
721                         copy = mss_now;
722                 }
723
724                 if (copy > size)
725                         copy = size;
726
727                 i = skb_shinfo(skb)->nr_frags;
728                 if (skb_can_coalesce(skb, i, page, offset)) {
729                         skb_shinfo(skb)->frags[i - 1].size += copy;
730                 } else if (i < MAX_SKB_FRAGS) {
731                         get_page(page);
732                         skb_fill_page_desc(skb, i, page, offset, copy);
733                 } else {
734                         tcp_mark_push(tp, skb);
735                         goto new_segment;
736                 }
737
738                 skb->len += copy;
739                 skb->data_len += copy;
740                 skb->ip_summed = CHECKSUM_HW;
741                 tp->write_seq += copy;
742                 TCP_SKB_CB(skb)->end_seq += copy;
743
744                 if (!copied)
745                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
746
747                 copied += copy;
748                 poffset += copy;
749                 if (!(psize -= copy))
750                         goto out;
751
752                 if (skb->len != mss_now || (flags & MSG_OOB))
753                         continue;
754
755                 if (forced_push(tp)) {
756                         tcp_mark_push(tp, skb);
757                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
758                 } else if (skb == sk->sk_send_head)
759                         tcp_push_one(sk, mss_now);
760                 continue;
761
762 wait_for_sndbuf:
763                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
764 wait_for_memory:
765                 if (copied)
766                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
767
768                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
769                         goto do_error;
770
771                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
772         }
773
774 out:
775         if (copied)
776                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
777         return copied;
778
779 do_error:
780         if (copied)
781                 goto out;
782 out_err:
783         return sk_stream_error(sk, flags, err);
784 }
785
786 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
787                      size_t size, int flags)
788 {
789         ssize_t res;
790         struct sock *sk = sock->sk;
791
792 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
793
794         if (!(sk->sk_route_caps & NETIF_F_SG) ||
795             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
796                 return sock_no_sendpage(sock, page, offset, size, flags);
797
798 #undef TCP_ZC_CSUM_FLAGS
799
800         lock_sock(sk);
801         TCP_CHECK_TIMER(sk);
802         res = do_tcp_sendpages(sk, &page, offset, size, flags);
803         TCP_CHECK_TIMER(sk);
804         release_sock(sk);
805         return res;
806 }
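
/*
 * A user-space sketch (an illustration): sendfile(2) on a TCP socket ends up
 * in tcp_sendpage() above; when the route's device lacks scatter-gather or
 * checksum offload, the sock_no_sendpage() fallback copies the data instead.
 *
 *      off_t off = 0;
 *      ssize_t sent = sendfile(tcp_fd, file_fd, &off, file_size);
 */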
807
808 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
809 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
810
811 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
812 {
813         int tmp = tp->mss_cache_std;
814
815         if (sk->sk_route_caps & NETIF_F_SG) {
816                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
817
818                 if (tmp >= pgbreak &&
819                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
820                         tmp = pgbreak;
821         }
822         return tmp;
823 }
824
825 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
826                 size_t size)
827 {
828         struct iovec *iov;
829         struct tcp_opt *tp = tcp_sk(sk);
830         struct sk_buff *skb;
831         int iovlen, flags;
832         int mss_now;
833         int err, copied;
834         long timeo;
835
836         lock_sock(sk);
837         TCP_CHECK_TIMER(sk);
838
839         flags = msg->msg_flags;
840         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
841
842         /* Wait for a connection to finish. */
843         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
844                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
845                         goto out_err;
846
847         /* This should be in poll */
848         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
849
850         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
851
852         /* Ok commence sending. */
853         iovlen = msg->msg_iovlen;
854         iov = msg->msg_iov;
855         copied = 0;
856
857         err = -EPIPE;
858         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
859                 goto do_error;
860
861         while (--iovlen >= 0) {
862                 int seglen = iov->iov_len;
863                 unsigned char __user *from = iov->iov_base;
864
865                 iov++;
866
867                 while (seglen > 0) {
868                         int copy;
869
870                         skb = sk->sk_write_queue.prev;
871
872                         if (!sk->sk_send_head ||
873                             (copy = mss_now - skb->len) <= 0) {
874
875 new_segment:
876                                 /* Allocate new segment. If the interface is SG,
877                                  * allocate skb fitting to single page.
878                                  */
879                                 if (!sk_stream_memory_free(sk))
880                                         goto wait_for_sndbuf;
881
882                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
883                                                            0, sk->sk_allocation);
884                                 if (!skb)
885                                         goto wait_for_memory;
886
887                                 /*
888                                  * Check whether we can use HW checksum.
889                                  */
890                                 if (sk->sk_route_caps &
891                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
892                                      NETIF_F_HW_CSUM))
893                                         skb->ip_summed = CHECKSUM_HW;
894
895                                 skb_entail(sk, tp, skb);
896                                 copy = mss_now;
897                         }
898
899                         /* Try to append data to the end of skb. */
900                         if (copy > seglen)
901                                 copy = seglen;
902
903                         /* Where to copy to? */
904                         if (skb_tailroom(skb) > 0) {
905                                 /* We have some space in skb head. Superb! */
906                                 if (copy > skb_tailroom(skb))
907                                         copy = skb_tailroom(skb);
908                                 if ((err = skb_add_data(skb, from, copy)) != 0)
909                                         goto do_fault;
910                         } else {
911                                 int merge = 0;
912                                 int i = skb_shinfo(skb)->nr_frags;
913                                 struct page *page = TCP_PAGE(sk);
914                                 int off = TCP_OFF(sk);
915
916                                 if (skb_can_coalesce(skb, i, page, off) &&
917                                     off != PAGE_SIZE) {
918                                         /* We can extend the last page
919                                          * fragment. */
920                                         merge = 1;
921                                 } else if (i == MAX_SKB_FRAGS ||
922                                            (!i &&
923                                            !(sk->sk_route_caps & NETIF_F_SG))) {
924                                         /* Need to add new fragment and cannot
925                                          * do this because interface is non-SG,
926                                          * or because all the page slots are
927                                          * busy. */
928                                         tcp_mark_push(tp, skb);
929                                         goto new_segment;
930                                 } else if (page) {
931                                         /* If page is cached, align
932                                          * offset to L1 cache boundary
933                                          */
934                                         off = (off + L1_CACHE_BYTES - 1) &
935                                               ~(L1_CACHE_BYTES - 1);
936                                         if (off == PAGE_SIZE) {
937                                                 put_page(page);
938                                                 TCP_PAGE(sk) = page = NULL;
939                                         }
940                                 }
941
942                                 if (!page) {
943                                         /* Allocate new cache page. */
944                                         if (!(page = sk_stream_alloc_page(sk)))
945                                                 goto wait_for_memory;
946                                         off = 0;
947                                 }
948
949                                 if (copy > PAGE_SIZE - off)
950                                         copy = PAGE_SIZE - off;
951
952                                 /* Time to copy data. We are close to
953                                  * the end! */
954                                 err = skb_copy_to_page(sk, from, skb, page,
955                                                        off, copy);
956                                 if (err) {
957                                         /* If this page was new, give it to the
958                                          * socket so it does not get leaked.
959                                          */
960                                         if (!TCP_PAGE(sk)) {
961                                                 TCP_PAGE(sk) = page;
962                                                 TCP_OFF(sk) = 0;
963                                         }
964                                         goto do_error;
965                                 }
966
967                                 /* Update the skb. */
968                                 if (merge) {
969                                         skb_shinfo(skb)->frags[i - 1].size +=
970                                                                         copy;
971                                 } else {
972                                         skb_fill_page_desc(skb, i, page, off, copy);
973                                         if (TCP_PAGE(sk)) {
974                                                 get_page(page);
975                                         } else if (off + copy < PAGE_SIZE) {
976                                                 get_page(page);
977                                                 TCP_PAGE(sk) = page;
978                                         }
979                                 }
980
981                                 TCP_OFF(sk) = off + copy;
982                         }
983
984                         if (!copied)
985                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
986
987                         tp->write_seq += copy;
988                         TCP_SKB_CB(skb)->end_seq += copy;
989
990                         from += copy;
991                         copied += copy;
992                         if ((seglen -= copy) == 0 && iovlen == 0)
993                                 goto out;
994
995                         if (skb->len != mss_now || (flags & MSG_OOB))
996                                 continue;
997
998                         if (forced_push(tp)) {
999                                 tcp_mark_push(tp, skb);
1000                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1001                         } else if (skb == sk->sk_send_head)
1002                                 tcp_push_one(sk, mss_now);
1003                         continue;
1004
1005 wait_for_sndbuf:
1006                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1007 wait_for_memory:
1008                         if (copied)
1009                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1010
1011                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1012                                 goto do_error;
1013
1014                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1015                 }
1016         }
1017
1018 out:
1019         if (copied)
1020                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1021         TCP_CHECK_TIMER(sk);
1022         release_sock(sk);
1023         return copied;
1024
1025 do_fault:
1026         if (!skb->len) {
1027                 if (sk->sk_send_head == skb)
1028                         sk->sk_send_head = NULL;
1029                 __skb_unlink(skb, skb->list);
1030                 sk_stream_free_skb(sk, skb);
1031         }
1032
1033 do_error:
1034         if (copied)
1035                 goto out;
1036 out_err:
1037         err = sk_stream_error(sk, flags, err);
1038         TCP_CHECK_TIMER(sk);
1039         release_sock(sk);
1040         return err;
1041 }
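
/*
 * A user-space sketch (an illustration) of MSG_MORE as handled by tcp_push()
 * above: while MSG_MORE is set the pending frames are corked (TCP_NAGLE_CORK),
 * and the final send without it pushes the accumulated data out.
 *
 *      send(tcp_fd, header, header_len, MSG_MORE);
 *      send(tcp_fd, body, body_len, 0);
 */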
1042
1043 /*
1044  *      Handle reading urgent data. BSD has very simple semantics for
1045  *      this, no blocking and very strange errors 8)
1046  */
1047
1048 static int tcp_recv_urg(struct sock *sk, long timeo,
1049                         struct msghdr *msg, int len, int flags,
1050                         int *addr_len)
1051 {
1052         struct tcp_opt *tp = tcp_sk(sk);
1053
1054         /* No URG data to read. */
1055         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1056             tp->urg_data == TCP_URG_READ)
1057                 return -EINVAL; /* Yes this is right ! */
1058
1059         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1060                 return -ENOTCONN;
1061
1062         if (tp->urg_data & TCP_URG_VALID) {
1063                 int err = 0;
1064                 char c = tp->urg_data;
1065
1066                 if (!(flags & MSG_PEEK))
1067                         tp->urg_data = TCP_URG_READ;
1068
1069                 /* Read urgent data. */
1070                 msg->msg_flags |= MSG_OOB;
1071
1072                 if (len > 0) {
1073                         if (!(flags & MSG_TRUNC))
1074                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1075                         len = 1;
1076                 } else
1077                         msg->msg_flags |= MSG_TRUNC;
1078
1079                 return err ? -EFAULT : len;
1080         }
1081
1082         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1083                 return 0;
1084
1085         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1086          * the available implementations agree in this case:
1087          * this call should never block, independent of the
1088          * blocking state of the socket.
1089          * Mike <pall@rz.uni-karlsruhe.de>
1090          */
1091         return -EAGAIN;
1092 }
1093
1094 /* Clean up the receive buffer for full frames taken by the user,
1095  * then send an ACK if necessary.  COPIED is the number of bytes
1096  * tcp_recvmsg has given to the user so far, it speeds up the
1097  * calculation of whether or not we must ACK for the sake of
1098  * a window update.
1099  */
1100 void cleanup_rbuf(struct sock *sk, int copied)
1101 {
1102         struct tcp_opt *tp = tcp_sk(sk);
1103         int time_to_ack = 0;
1104
1105 #if TCP_DEBUG
1106         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1107
1108         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1109 #endif
1110
1111         if (tcp_ack_scheduled(tp)) {
1112                    /* Delayed ACKs frequently hit locked sockets during bulk
1113                     * receive. */
1114                 if (tp->ack.blocked ||
1115                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1116                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1117                     /*
1118                      * If this read emptied the read buffer, we send an ACK when
1119                      * the connection is not bidirectional, the user has drained
1120                      * the receive buffer, and there was a small segment
1121                      * in the queue.
1122                      */
1123                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1124                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1125                         time_to_ack = 1;
1126         }
1127
1128         /* We send an ACK if we can now advertise a non-zero window
1129          * which has been raised "significantly".
1130          *
1131          * Even if window raised up to infinity, do not send window open ACK
1132          * in states, where we will not receive more. It is useless.
1133          */
1134         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1135                 __u32 rcv_window_now = tcp_receive_window(tp);
1136
1137                 /* Optimize, __tcp_select_window() is not cheap. */
1138                 if (2*rcv_window_now <= tp->window_clamp) {
1139                         __u32 new_window = __tcp_select_window(sk);
1140
1141                         /* Send ACK now, if this read freed lots of space
1142                          * in our buffer. Certainly, new_window is the new window.
1143                          * We can advertise it now if it is not less than the current one.
1144                          * "Lots" means "at least twice" here.
1145                          */
1146                         if (new_window && new_window >= 2 * rcv_window_now)
1147                                 time_to_ack = 1;
1148                 }
1149         }
1150         if (time_to_ack)
1151                 tcp_send_ack(sk);
1152 }
1153
1154 static void tcp_prequeue_process(struct sock *sk)
1155 {
1156         struct sk_buff *skb;
1157         struct tcp_opt *tp = tcp_sk(sk);
1158
1159         NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1160
1161         /* RX process wants to run with disabled BHs, though it is not
1162          * necessary */
1163         local_bh_disable();
1164         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1165                 sk->sk_backlog_rcv(sk, skb);
1166         local_bh_enable();
1167
1168         /* Clear memory counter. */
1169         tp->ucopy.memory = 0;
1170 }
1171
1172 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1173 {
1174         struct sk_buff *skb;
1175         u32 offset;
1176
1177         skb_queue_walk(&sk->sk_receive_queue, skb) {
1178                 offset = seq - TCP_SKB_CB(skb)->seq;
1179                 if (skb->h.th->syn)
1180                         offset--;
1181                 if (offset < skb->len || skb->h.th->fin) {
1182                         *off = offset;
1183                         return skb;
1184                 }
1185         }
1186         return NULL;
1187 }
1188
1189 /*
1190  * This routine provides an alternative to tcp_recvmsg() for routines
1191  * that would like to handle copying from skbuffs directly in 'sendfile'
1192  * fashion.
1193  * Note:
1194  *      - It is assumed that the socket was locked by the caller.
1195  *      - The routine does not block.
1196  *      - At present, there is no support for reading OOB data
1197  *        or for 'peeking' the socket using this routine
1198  *        (although both would be easy to implement).
1199  */
1200 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1201                   sk_read_actor_t recv_actor)
1202 {
1203         struct sk_buff *skb;
1204         struct tcp_opt *tp = tcp_sk(sk);
1205         u32 seq = tp->copied_seq;
1206         u32 offset;
1207         int copied = 0;
1208
1209         if (sk->sk_state == TCP_LISTEN)
1210                 return -ENOTCONN;
1211         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1212                 if (offset < skb->len) {
1213                         size_t used, len;
1214
1215                         len = skb->len - offset;
1216                         /* Stop reading if we hit a patch of urgent data */
1217                         if (tp->urg_data) {
1218                                 u32 urg_offset = tp->urg_seq - seq;
1219                                 if (urg_offset < len)
1220                                         len = urg_offset;
1221                                 if (!len)
1222                                         break;
1223                         }
1224                         used = recv_actor(desc, skb, offset, len);
1225                         if (used <= len) {
1226                                 seq += used;
1227                                 copied += used;
1228                                 offset += used;
1229                         }
1230                         if (offset != skb->len)
1231                                 break;
1232                 }
1233                 if (skb->h.th->fin) {
1234                         sk_eat_skb(sk, skb);
1235                         ++seq;
1236                         break;
1237                 }
1238                 sk_eat_skb(sk, skb);
1239                 if (!desc->count)
1240                         break;
1241         }
1242         tp->copied_seq = seq;
1243
1244         tcp_rcv_space_adjust(sk);
1245
1246         /* Clean up data we have read: This will do ACK frames. */
1247         if (copied)
1248                 cleanup_rbuf(sk, copied);
1249         return copied;
1250 }
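
/*
 * A minimal sketch (an illustration, not part of this file) of a recv_actor
 * for tcp_read_sock(): it accepts up to desc->count bytes from each skb and
 * makes the walk stop either by exhausting desc->count or by accepting fewer
 * bytes than it was offered.  Parameter types follow the sk_read_actor_t call
 * made above and are assumed for this kernel version.
 *
 *      static int count_actor(read_descriptor_t *desc, struct sk_buff *skb,
 *                             unsigned int offset, size_t len)
 *      {
 *              size_t used = min_t(size_t, len, desc->count);
 *
 *              desc->count -= used;
 *              return used;
 *      }
 *
 *      With the socket locked by the caller:
 *
 *              read_descriptor_t desc = { .count = budget };
 *              tcp_read_sock(sk, &desc, count_actor);
 */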
1251
1252 /*
1253  *      This routine copies from a sock struct into the user buffer.
1254  *
1255  *      Technical note: in 2.3 we work on a _locked_ socket, so that
1256  *      tricks with *seq access order and skb->users are not required.
1257  *      Probably, the code can be improved even further.
1258  */
1259
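     /*
      * Pull data from the receive queue (and, when this task is installed as
      * the prequeue reader, from prequeued/backlogged segments) into the user
      * iovec.  Handles MSG_PEEK, MSG_TRUNC and urgent data, and blocks up to
      * the socket receive timeout until at least the SO_RCVLOWAT target
      * (capped by len) has been copied.
      */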
1260 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1261                 size_t len, int nonblock, int flags, int *addr_len)
1262 {
1263         struct tcp_opt *tp = tcp_sk(sk);
1264         int copied = 0;
1265         u32 peek_seq;
1266         u32 *seq;
1267         unsigned long used;
1268         int err;
1269         int target;             /* Read at least this many bytes */
1270         long timeo;
1271         struct task_struct *user_recv = NULL;
1272
1273         lock_sock(sk);
1274
1275         TCP_CHECK_TIMER(sk);
1276
1277         err = -ENOTCONN;
1278         if (sk->sk_state == TCP_LISTEN)
1279                 goto out;
1280
1281         timeo = sock_rcvtimeo(sk, nonblock);
1282
1283         /* Urgent data needs to be handled specially. */
1284         if (flags & MSG_OOB)
1285                 goto recv_urg;
1286
1287         seq = &tp->copied_seq;
1288         if (flags & MSG_PEEK) {
1289                 peek_seq = tp->copied_seq;
1290                 seq = &peek_seq;
1291         }
1292
1293         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1294
1295         do {
1296                 struct sk_buff *skb;
1297                 u32 offset;
1298
1299                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1300                 if (tp->urg_data && tp->urg_seq == *seq) {
1301                         if (copied)
1302                                 break;
1303                         if (signal_pending(current)) {
1304                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1305                                 break;
1306                         }
1307                 }
1308
1309                 /* Next get a buffer. */
1310
1311                 skb = skb_peek(&sk->sk_receive_queue);
1312                 do {
1313                         if (!skb)
1314                                 break;
1315
1316                         /* Now that we have two receive queues this
1317                          * shouldn't happen.
1318                          */
1319                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1320                                 printk(KERN_INFO "recvmsg bug: copied %X "
1321                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1322                                 break;
1323                         }
1324                         offset = *seq - TCP_SKB_CB(skb)->seq;
1325                         if (skb->h.th->syn)
1326                                 offset--;
1327                         if (offset < skb->len)
1328                                 goto found_ok_skb;
1329                         if (skb->h.th->fin)
1330                                 goto found_fin_ok;
1331                         BUG_TRAP(flags & MSG_PEEK);
1332                         skb = skb->next;
1333                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1334
1335                 /* Well, if we have backlog, try to process it now. */
1336
1337                 if (copied >= target && !sk->sk_backlog.tail)
1338                         break;
1339
1340                 if (copied) {
1341                         if (sk->sk_err ||
1342                             sk->sk_state == TCP_CLOSE ||
1343                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1344                             !timeo ||
1345                             signal_pending(current) ||
1346                             (flags & MSG_PEEK))
1347                                 break;
1348                 } else {
1349                         if (sock_flag(sk, SOCK_DONE))
1350                                 break;
1351
1352                         if (sk->sk_err) {
1353                                 copied = sock_error(sk);
1354                                 break;
1355                         }
1356
1357                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1358                                 break;
1359
1360                         if (sk->sk_state == TCP_CLOSE) {
1361                                 if (!sock_flag(sk, SOCK_DONE)) {
1362                                         /* This occurs when the user tries to
1363                                          * read from a never-connected socket.
1364                                          */
1365                                         copied = -ENOTCONN;
1366                                         break;
1367                                 }
1368                                 break;
1369                         }
1370
1371                         if (!timeo) {
1372                                 copied = -EAGAIN;
1373                                 break;
1374                         }
1375
1376                         if (signal_pending(current)) {
1377                                 copied = sock_intr_errno(timeo);
1378                                 break;
1379                         }
1380                 }
1381
1382                 cleanup_rbuf(sk, copied);
1383
1384                 if (tp->ucopy.task == user_recv) {
1385                         /* Install new reader */
1386                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1387                                 user_recv = current;
1388                                 tp->ucopy.task = user_recv;
1389                                 tp->ucopy.iov = msg->msg_iov;
1390                         }
1391
1392                         tp->ucopy.len = len;
1393
1394                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1395                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1396
1397                         /* Ugly... If the prequeue is not empty, we have to
1398                          * process it before releasing the socket, otherwise
1399                          * ordering will be broken at the second iteration.
1400                          * A more elegant solution is required!!!
1401                          *
1402                          * Look: we have the following (pseudo)queues:
1403                          *
1404                          * 1. packets in flight
1405                          * 2. backlog
1406                          * 3. prequeue
1407                          * 4. receive_queue
1408                          *
1409                          * Each queue can be processed only if the next ones
1410                          * are empty. At this point the receive_queue is
1411                          * empty, but the prequeue _can_ be non-empty after
1412                          * the 2nd iteration, when we jumped to the start of
1413                          * the loop because backlog processing added something
1414                          * to the receive_queue. We cannot release_sock(),
1415                          * because the backlog contains packets that arrived
1416                          * _after_ the prequeued ones.
1417                          *
1418                          * In short, the algorithm is to process all the
1419                          * queues in order. We could do it more directly by
1420                          * requeueing packets from the backlog to the prequeue
1421                          * when it is not empty; more elegant, but it eats cycles.
1422                          */
1423                         if (skb_queue_len(&tp->ucopy.prequeue))
1424                                 goto do_prequeue;
1425
1426                         /* __ Set realtime policy in scheduler __ */
1427                 }
1428
1429                 if (copied >= target) {
1430                         /* Do not sleep, just process backlog. */
1431                         release_sock(sk);
1432                         lock_sock(sk);
1433                 } else
1434                         sk_wait_data(sk, &timeo);
1435
1436                 if (user_recv) {
1437                         int chunk;
1438
1439                         /* __ Restore normal policy in scheduler __ */
1440
1441                         if ((chunk = len - tp->ucopy.len) != 0) {
1442                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1443                                 len -= chunk;
1444                                 copied += chunk;
1445                         }
1446
1447                         if (tp->rcv_nxt == tp->copied_seq &&
1448                             skb_queue_len(&tp->ucopy.prequeue)) {
1449 do_prequeue:
1450                                 tcp_prequeue_process(sk);
1451
1452                                 if ((chunk = len - tp->ucopy.len) != 0) {
1453                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1454                                         len -= chunk;
1455                                         copied += chunk;
1456                                 }
1457                         }
1458                 }
1459                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1460                         if (net_ratelimit())
1461                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1462                                        current->comm, current->pid);
1463                         peek_seq = tp->copied_seq;
1464                 }
1465                 continue;
1466
1467         found_ok_skb:
1468                 /* Ok so how much can we use? */
1469                 used = skb->len - offset;
1470                 if (len < used)
1471                         used = len;
1472
1473                 /* Do we have urgent data here? */
1474                 if (tp->urg_data) {
1475                         u32 urg_offset = tp->urg_seq - *seq;
1476                         if (urg_offset < used) {
1477                                 if (!urg_offset) {
1478                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1479                                                 ++*seq;
1480                                                 offset++;
1481                                                 used--;
1482                                                 if (!used)
1483                                                         goto skip_copy;
1484                                         }
1485                                 } else
1486                                         used = urg_offset;
1487                         }
1488                 }
1489
1490                 if (!(flags & MSG_TRUNC)) {
1491                         err = skb_copy_datagram_iovec(skb, offset,
1492                                                       msg->msg_iov, used);
1493                         if (err) {
1494                                 /* Exception. Bailout! */
1495                                 if (!copied)
1496                                         copied = -EFAULT;
1497                                 break;
1498                         }
1499                 }
1500
1501                 *seq += used;
1502                 copied += used;
1503                 len -= used;
1504
1505                 tcp_rcv_space_adjust(sk);
1506
1507 skip_copy:
1508                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1509                         tp->urg_data = 0;
1510                         tcp_fast_path_check(sk, tp);
1511                 }
1512                 if (used + offset < skb->len)
1513                         continue;
1514
1515                 if (skb->h.th->fin)
1516                         goto found_fin_ok;
1517                 if (!(flags & MSG_PEEK))
1518                         sk_eat_skb(sk, skb);
1519                 continue;
1520
1521         found_fin_ok:
1522                 /* Process the FIN. */
1523                 ++*seq;
1524                 if (!(flags & MSG_PEEK))
1525                         sk_eat_skb(sk, skb);
1526                 break;
1527         } while (len > 0);
1528
1529         if (user_recv) {
1530                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1531                         int chunk;
1532
1533                         tp->ucopy.len = copied > 0 ? len : 0;
1534
1535                         tcp_prequeue_process(sk);
1536
1537                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1538                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1539                                 len -= chunk;
1540                                 copied += chunk;
1541                         }
1542                 }
1543
1544                 tp->ucopy.task = NULL;
1545                 tp->ucopy.len = 0;
1546         }
1547
1548         /* According to UNIX98, msg_name/msg_namelen are ignored
1549          * on a connected socket. I was just happy when I found this 8) --ANK
1550          */
1551
1552         /* Clean up data we have read: This will do ACK frames. */
1553         cleanup_rbuf(sk, copied);
1554
1555         TCP_CHECK_TIMER(sk);
1556         release_sock(sk);
1557         return copied;
1558
1559 out:
1560         TCP_CHECK_TIMER(sk);
1561         release_sock(sk);
1562         return err;
1563
1564 recv_urg:
1565         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1566         goto out;
1567 }
1568
1569 /*
1570  *      State processing on a close. This implements the state shift for
1571  *      sending our FIN frame. Note that we only send a FIN for some
1572  *      states. A shutdown() may have already sent the FIN, or we may be
1573  *      closed.
1574  */
1575
1576 static unsigned char new_state[16] = {
1577   /* current state:        new state:      action:      */
1578   /* (Invalid)          */ TCP_CLOSE,
1579   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1580   /* TCP_SYN_SENT       */ TCP_CLOSE,
1581   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1582   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1583   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1584   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1585   /* TCP_CLOSE          */ TCP_CLOSE,
1586   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1587   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1588   /* TCP_LISTEN         */ TCP_CLOSE,
1589   /* TCP_CLOSING        */ TCP_CLOSING,
1590 };
1591
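     /*
      * Move the socket to the close-time successor of its current state, as
      * given by new_state[] above, and tell the caller (via the TCP_ACTION_FIN
      * bit) whether a FIN still has to be sent.
      */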
1592 static int tcp_close_state(struct sock *sk)
1593 {
1594         int next = (int)new_state[sk->sk_state];
1595         int ns = next & TCP_STATE_MASK;
1596
1597         tcp_set_state(sk, ns);
1598
1599         return next & TCP_ACTION_FIN;
1600 }
1601
1602 /*
1603  *      Shutdown the sending side of a connection. Much like close except
1604  *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1605  */
1606
1607 void tcp_shutdown(struct sock *sk, int how)
1608 {
1609         /*      We need to grab some memory, and put together a FIN,
1610          *      and then put it into the queue to be sent.
1611          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1612          */
1613         if (!(how & SEND_SHUTDOWN))
1614                 return;
1615
1616         /* If we've already sent a FIN, or it's a closed state, skip this. */
1617         if ((1 << sk->sk_state) &
1618             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1619              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1620                 /* Clear out any half completed packets.  FIN if needed. */
1621                 if (tcp_close_state(sk))
1622                         tcp_send_fin(sk);
1623         }
1624 }
1625
1626 /*
1627  * At this point, there should be no process reference to this
1628  * socket, and thus no user references at all.  Therefore we
1629  * can assume the socket waitqueue is inactive and nobody will
1630  * try to jump onto it.
1631  */
1632 void tcp_destroy_sock(struct sock *sk)
1633 {
1634         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1635         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1636
1637         /* It cannot be in hash table! */
1638         BUG_TRAP(sk_unhashed(sk));
1639
1640         /* If it has not 0 inet_sk(sk)->num, it must be bound */
1641         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1642
1643 #ifdef TCP_DEBUG
1644         if (sk->sk_zapped) {
1645                 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1646                 sock_hold(sk);
1647         }
1648         sk->sk_zapped = 1;
1649 #endif
1650
1651         sk->sk_prot->destroy(sk);
1652
1653         sk_stream_kill_queues(sk);
1654
1655         xfrm_sk_free_policy(sk);
1656
1657 #ifdef INET_REFCNT_DEBUG
1658         if (atomic_read(&sk->sk_refcnt) != 1) {
1659                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1660                        sk, atomic_read(&sk->sk_refcnt));
1661         }
1662 #endif
1663
1664         atomic_dec(&tcp_orphan_count);
1665         sock_put(sk);
1666 }
1667
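     /*
      * Close a socket from process context.  Listeners are simply stopped;
      * otherwise any unread receive data is discarded (sending a RST if there
      * was any), the FIN state machine is run, we wait up to 'timeout' for the
      * send queue to drain, and the orphaned socket is then either handed to
      * the FIN_WAIT2/time-wait timers or destroyed immediately once it reaches
      * TCP_CLOSE.
      */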
1668 void tcp_close(struct sock *sk, long timeout)
1669 {
1670         struct sk_buff *skb;
1671         int data_was_unread = 0;
1672
1673         lock_sock(sk);
1674         sk->sk_shutdown = SHUTDOWN_MASK;
1675
1676         if (sk->sk_state == TCP_LISTEN) {
1677                 tcp_set_state(sk, TCP_CLOSE);
1678
1679                 /* Special case. */
1680                 tcp_listen_stop(sk);
1681
1682                 goto adjudge_to_death;
1683         }
1684
1685         /*  We need to flush the recv. buffs.  We do this only on the
1686          *  descriptor close, not protocol-sourced closes, because the
1687          *  reader process may not have drained the data yet!
1688          */
1689         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1690                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1691                           skb->h.th->fin;
1692                 data_was_unread += len;
1693                 __kfree_skb(skb);
1694         }
1695
1696         sk_stream_mem_reclaim(sk);
1697
1698         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1699          * 3.10, we send a RST here because data was lost.  To
1700          * witness the awful effects of the old behavior of always
1701          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1702          * a bulk GET in an FTP client, suspend the process, wait
1703          * for the client to advertise a zero window, then kill -9
1704          * the FTP client, wheee...  Note: timeout is always zero
1705          * in such a case.
1706          */
1707         if (data_was_unread) {
1708                 /* Unread data was tossed, zap the connection. */
1709                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1710                 tcp_set_state(sk, TCP_CLOSE);
1711                 tcp_send_active_reset(sk, GFP_KERNEL);
1712         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1713                 /* Check zero linger _after_ checking for unread data. */
1714                 sk->sk_prot->disconnect(sk, 0);
1715                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1716         } else if (tcp_close_state(sk)) {
1717                 /* We FIN if the application ate all the data before
1718                  * zapping the connection.
1719                  */
1720
1721                 /* RED-PEN. Formally speaking, we have broken TCP state
1722                  * machine. State transitions:
1723                  *
1724                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1725                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1726                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1727                  *
1728                  * are legal only when FIN has been sent (i.e. in window),
1729                  * rather than queued out of window. Purists blame.
1730                  *
1731                  * F.e. "RFC state" is ESTABLISHED,
1732                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1733                  *
1734                  * The visible deviations are that sometimes we enter the
1735                  * time-wait state when it is not really required (harmless),
1736                  * and do not send active resets when they are required by
1737                  * the specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when they
1738                  * look like CLOSING or LAST_ACK to Linux).
1739                  * Probably, I missed some more holelets.
1740                  *                                              --ANK
1741                  */
1742                 tcp_send_fin(sk);
1743         }
1744
1745         sk_stream_wait_close(sk, timeout);
1746
1747 adjudge_to_death:
1748         /* It is the last release_sock in its life. It will remove backlog. */
1749         release_sock(sk);
1750
1751
1752         /* Now the socket is owned by the kernel and we acquire the BH lock
1753          * to finish the close. No need to check for user refs.
1754          */
1755         local_bh_disable();
1756         bh_lock_sock(sk);
1757         BUG_TRAP(!sock_owned_by_user(sk));
1758
1759         sock_hold(sk);
1760         sock_orphan(sk);
1761
1762         /*      This is a (useful) BSD violation of the RFC. There is a
1763          *      problem with TCP as specified in that the other end could
1764          *      keep a socket open forever with no application left at this end.
1765          *      We use a 3 minute timeout (about the same as BSD) then kill
1766          *      our end. If they send after that then tough - BUT: long enough
1767          *      that we won't make the old 4*rto = almost no time - whoops
1768          *      reset mistake.
1769          *
1770          *      Nope, it was not mistake. It is really desired behaviour
1771          *      f.e. on http servers, when such sockets are useless, but
1772          *      consume significant resources. Let's do it with special
1773          *      linger2 option.                                 --ANK
1774          */
1775
1776         if (sk->sk_state == TCP_FIN_WAIT2) {
1777                 struct tcp_opt *tp = tcp_sk(sk);
1778                 if (tp->linger2 < 0) {
1779                         tcp_set_state(sk, TCP_CLOSE);
1780                         tcp_send_active_reset(sk, GFP_ATOMIC);
1781                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1782                 } else {
1783                         int tmo = tcp_fin_time(tp);
1784
1785                         if (tmo > TCP_TIMEWAIT_LEN) {
1786                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1787                         } else {
1788                                 atomic_inc(&tcp_orphan_count);
1789                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1790                                 goto out;
1791                         }
1792                 }
1793         }
1794         if (sk->sk_state != TCP_CLOSE) {
1795                 sk_stream_mem_reclaim(sk);
1796                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1797                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1798                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1799                         if (net_ratelimit())
1800                                 printk(KERN_INFO "TCP: too many orphaned "
1801                                        "sockets\n");
1802                         tcp_set_state(sk, TCP_CLOSE);
1803                         tcp_send_active_reset(sk, GFP_ATOMIC);
1804                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1805                 }
1806         }
1807         atomic_inc(&tcp_orphan_count);
1808
1809         if (sk->sk_state == TCP_CLOSE)
1810                 tcp_destroy_sock(sk);
1811         /* Otherwise, socket is reprieved until protocol close. */
1812
1813 out:
1814         bh_unlock_sock(sk);
1815         local_bh_enable();
1816         sock_put(sk);
1817 }
1818
1819 /* These states need RST on ABORT according to RFC793 */
1820
1821 static inline int tcp_need_reset(int state)
1822 {
1823         return (1 << state) &
1824                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1825                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1826 }
1827
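     /*
      * Forcibly take an open socket back to CLOSE: the ABORT function of
      * RFC 793.  Sends a RST where the old state requires one, purges the
      * queues and timers, and resets the connection metadata so the socket
      * can be reused.
      */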
1828 int tcp_disconnect(struct sock *sk, int flags)
1829 {
1830         struct inet_opt *inet = inet_sk(sk);
1831         struct tcp_opt *tp = tcp_sk(sk);
1832         int err = 0;
1833         int old_state = sk->sk_state;
1834
1835         if (old_state != TCP_CLOSE)
1836                 tcp_set_state(sk, TCP_CLOSE);
1837
1838         /* ABORT function of RFC793 */
1839         if (old_state == TCP_LISTEN) {
1840                 tcp_listen_stop(sk);
1841         } else if (tcp_need_reset(old_state) ||
1842                    (tp->snd_nxt != tp->write_seq &&
1843                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1844                 /* The last check adjusts for the discrepancy of Linux wrt.
1845                  * RFC states.
1846                  */
1847                 tcp_send_active_reset(sk, gfp_any());
1848                 sk->sk_err = ECONNRESET;
1849         } else if (old_state == TCP_SYN_SENT)
1850                 sk->sk_err = ECONNRESET;
1851
1852         tcp_clear_xmit_timers(sk);
1853         __skb_queue_purge(&sk->sk_receive_queue);
1854         sk_stream_writequeue_purge(sk);
1855         __skb_queue_purge(&tp->out_of_order_queue);
1856
1857         inet->dport = 0;
1858
1859         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1860                 inet_reset_saddr(sk);
1861
1862         sk->sk_shutdown = 0;
1863         sock_reset_flag(sk, SOCK_DONE);
1864         tp->srtt = 0;
1865         if ((tp->write_seq += tp->max_window + 2) == 0)
1866                 tp->write_seq = 1;
1867         tp->backoff = 0;
1868         tp->snd_cwnd = 2;
1869         tp->probes_out = 0;
1870         tp->packets_out = 0;
1871         tp->snd_ssthresh = 0x7fffffff;
1872         tp->snd_cwnd_cnt = 0;
1873         tcp_set_ca_state(tp, TCP_CA_Open);
1874         tcp_clear_retrans(tp);
1875         tcp_delack_init(tp);
1876         sk->sk_send_head = NULL;
1877         tp->saw_tstamp = 0;
1878         tcp_sack_reset(tp);
1879         __sk_dst_reset(sk);
1880
1881         BUG_TRAP(!inet->num || tp->bind_hash);
1882
1883         sk->sk_error_report(sk);
1884         return err;
1885 }
1886
1887 /*
1888  *      Wait for an incoming connection, avoid race
1889  *      conditions. This must be called with the socket locked.
1890  */
1891 static int wait_for_connect(struct sock *sk, long timeo)
1892 {
1893         struct tcp_opt *tp = tcp_sk(sk);
1894         DEFINE_WAIT(wait);
1895         int err;
1896
1897         /*
1898          * True wake-one mechanism for incoming connections: only
1899          * one process gets woken up, not the 'whole herd'.
1900          * Since we do not 'race & poll' for established sockets
1901          * anymore, the common case will execute the loop only once.
1902          *
1903          * Subtle issue: "add_wait_queue_exclusive()" will be added
1904          * after any current non-exclusive waiters, and we know that
1905          * it will always _stay_ after any new non-exclusive waiters
1906          * because all non-exclusive waiters are added at the
1907          * beginning of the wait-queue. As such, it's ok to "drop"
1908          * our exclusiveness temporarily when we get woken up without
1909          * having to remove and re-insert us on the wait queue.
1910          */
1911         for (;;) {
1912                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1913                                           TASK_INTERRUPTIBLE);
1914                 release_sock(sk);
1915                 if (!tp->accept_queue)
1916                         timeo = schedule_timeout(timeo);
1917                 lock_sock(sk);
1918                 err = 0;
1919                 if (tp->accept_queue)
1920                         break;
1921                 err = -EINVAL;
1922                 if (sk->sk_state != TCP_LISTEN)
1923                         break;
1924                 err = sock_intr_errno(timeo);
1925                 if (signal_pending(current))
1926                         break;
1927                 err = -EAGAIN;
1928                 if (!timeo)
1929                         break;
1930         }
1931         finish_wait(sk->sk_sleep, &wait);
1932         return err;
1933 }
1934
1935 /*
1936  *      This will accept the next outstanding connection.
1937  */
1938
1939 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1940 {
1941         struct tcp_opt *tp = tcp_sk(sk);
1942         struct open_request *req;
1943         struct sock *newsk;
1944         int error;
1945 #ifdef CONFIG_ACCEPT_QUEUES     
1946         int prev_class = 0;
1947         int first;
1948 #endif
1949
1950         lock_sock(sk);
1951
1952         /* We need to make sure that this socket is listening,
1953          * and that it has something pending.
1954          */
1955         error = -EINVAL;
1956         if (sk->sk_state != TCP_LISTEN)
1957                 goto out;
1958
1959         /* Find already established connection */
1960         if (!tp->accept_queue) {
1961                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1962                 /* If this is a non-blocking socket, don't sleep */
1963                 error = -EAGAIN;
1964                 if (!timeo)
1965                         goto out;
1966
1967                 error = wait_for_connect(sk, timeo);
1968                 if (error)
1969                         goto out;
1970         }
1971
1972 #ifndef CONFIG_ACCEPT_QUEUES
1973         req = tp->accept_queue;
1974         if ((tp->accept_queue = req->dl_next) == NULL)
1975                 tp->accept_queue_tail = NULL;
1976         newsk = req->sk;
1977         sk_acceptq_removed(sk);
1978 #else
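             /* Weighted round-robin over the per-class accept queues: start at
              * class_index, skip classes with nothing queued, take the head
              * request of the chosen class (also unlinking it from the global
              * accept_queue list), and advance class_index once this class has
              * handed out aq_ratio connections in a row.
              */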
1979         first = tp->class_index;
1980         /* We should always have a request queued here. The accept_queue
1981          * is already checked for NULL above.
1982          */
1983         while (!tp->acceptq[first].aq_head) {
1984                 tp->acceptq[first].aq_cnt = 0;
1985                 first = (first + 1) & (NUM_ACCEPT_QUEUES - 1);
1986         }
1987         req = tp->acceptq[first].aq_head;
1988         tp->acceptq[first].aq_qcount--;
1989         tp->acceptq[first].aq_count++;
1990         tp->acceptq[first].aq_wait_time += jiffies - req->acceptq_time_stamp;
1991
1992         for (prev_class = first - 1; prev_class >= 0; prev_class--)
1993                 if (tp->acceptq[prev_class].aq_tail)
1994                         break;
1995         if (prev_class >= 0)
1996                 tp->acceptq[prev_class].aq_tail->dl_next = req->dl_next;
1997         else
1998                 tp->accept_queue = req->dl_next;
1999
2000         if (req == tp->acceptq[first].aq_tail)
2001                 tp->acceptq[first].aq_head = tp->acceptq[first].aq_tail = NULL;
2002         else
2003                 tp->acceptq[first].aq_head = req->dl_next;
2004
2005         if (++tp->acceptq[first].aq_cnt >= tp->acceptq[first].aq_ratio) {
2006                 tp->acceptq[first].aq_cnt = 0;
2007                 tp->class_index = ++first & (NUM_ACCEPT_QUEUES - 1);
2008         }
2009         newsk = req->sk;
2010         sk_acceptq_removed(sk, req->acceptq_class);
2011 #endif
2012         tcp_openreq_fastfree(req);
2013         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2014         release_sock(sk);
2015         return newsk;
2016
2017 out:
2018         release_sock(sk);
2019         *err = error;
2020         return NULL;
2021 }
2022
2023
2024 /*
2025  *      Socket option code for TCP.
2026  */
2027 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2028                    int optlen)
2029 {
2030         struct tcp_opt *tp = tcp_sk(sk);
2031         int val;
2032         int err = 0;
2033
2034         if (level != SOL_TCP)
2035                 return tp->af_specific->setsockopt(sk, level, optname,
2036                                                    optval, optlen);
2037
2038         if (optlen < sizeof(int))
2039                 return -EINVAL;
2040
2041         if (get_user(val, (int __user *)optval))
2042                 return -EFAULT;
2043
2044         lock_sock(sk);
2045
2046         switch (optname) {
2047         case TCP_MAXSEG:
2048                 /* Values greater than interface MTU won't take effect. However
2049                  * at the point when this call is done we typically don't yet
2050                  * know which interface is going to be used. */
2051                 if (val < 8 || val > MAX_TCP_WINDOW) {
2052                         err = -EINVAL;
2053                         break;
2054                 }
2055                 tp->user_mss = val;
2056                 break;
2057
2058         case TCP_NODELAY:
2059                 if (val) {
2060                         /* TCP_NODELAY is weaker than TCP_CORK, so that
2061                          * this option on corked socket is remembered, but
2062                          * it is not activated until cork is cleared.
2063                          *
2064                          * However, when TCP_NODELAY is set we make
2065                          * an explicit push, which overrides even TCP_CORK
2066                          * for currently queued segments.
2067                          */
2068                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2069                         tcp_push_pending_frames(sk, tp);
2070                 } else {
2071                         tp->nonagle &= ~TCP_NAGLE_OFF;
2072                 }
2073                 break;
2074
2075         case TCP_CORK:
2076                 /* When set indicates to always queue non-full frames.
2077                  * Later the user clears this option and we transmit
2078                  * any pending partial frames in the queue.  This is
2079                  * meant to be used alongside sendfile() to get properly
2080                  * filled frames when the user (for example) must write
2081                  * out headers with a write() call first and then use
2082                  * sendfile to send out the data parts.
2083                  *
2084                  * TCP_CORK can be set together with TCP_NODELAY and it is
2085                  * stronger than TCP_NODELAY.
2086                  */
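                     /* Illustrative user-space pattern this supports (the
                      * names below are placeholders, not from this file):
                      *
                      *      int on = 1, off = 0;
                      *      setsockopt(fd, SOL_TCP, TCP_CORK, &on, sizeof(on));
                      *      write(fd, hdr, hdr_len);
                      *      sendfile(fd, file_fd, NULL, body_len);
                      *      setsockopt(fd, SOL_TCP, TCP_CORK, &off, sizeof(off));
                      */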
2087                 if (val) {
2088                         tp->nonagle |= TCP_NAGLE_CORK;
2089                 } else {
2090                         tp->nonagle &= ~TCP_NAGLE_CORK;
2091                         if (tp->nonagle&TCP_NAGLE_OFF)
2092                                 tp->nonagle |= TCP_NAGLE_PUSH;
2093                         tcp_push_pending_frames(sk, tp);
2094                 }
2095                 break;
2096
2097         case TCP_KEEPIDLE:
2098                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2099                         err = -EINVAL;
2100                 else {
2101                         tp->keepalive_time = val * HZ;
2102                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2103                             !((1 << sk->sk_state) &
2104                               (TCPF_CLOSE | TCPF_LISTEN))) {
2105                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2106                                 if (tp->keepalive_time > elapsed)
2107                                         elapsed = tp->keepalive_time - elapsed;
2108                                 else
2109                                         elapsed = 0;
2110                                 tcp_reset_keepalive_timer(sk, elapsed);
2111                         }
2112                 }
2113                 break;
2114         case TCP_KEEPINTVL:
2115                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2116                         err = -EINVAL;
2117                 else
2118                         tp->keepalive_intvl = val * HZ;
2119                 break;
2120         case TCP_KEEPCNT:
2121                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2122                         err = -EINVAL;
2123                 else
2124                         tp->keepalive_probes = val;
2125                 break;
2126         case TCP_SYNCNT:
2127                 if (val < 1 || val > MAX_TCP_SYNCNT)
2128                         err = -EINVAL;
2129                 else
2130                         tp->syn_retries = val;
2131                 break;
2132
2133         case TCP_LINGER2:
2134                 if (val < 0)
2135                         tp->linger2 = -1;
2136                 else if (val > sysctl_tcp_fin_timeout / HZ)
2137                         tp->linger2 = 0;
2138                 else
2139                         tp->linger2 = val * HZ;
2140                 break;
2141
2142         case TCP_DEFER_ACCEPT:
2143                 tp->defer_accept = 0;
2144                 if (val > 0) {
2145                         /* Translate value in seconds to number of
2146                          * retransmits */
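                             /* For example, assuming the usual TCP_TIMEOUT_INIT
                              * of 3*HZ, val = 10 seconds yields defer_accept = 3
                              * (the loop steps past the 3s and 6s retransmit
                              * points and the final increment adds one more),
                              * which getsockopt() reports back as 3 << 2 = 12s.
                              */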
2147                         while (tp->defer_accept < 32 &&
2148                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2149                                        tp->defer_accept))
2150                                 tp->defer_accept++;
2151                         tp->defer_accept++;
2152                 }
2153                 break;
2154
2155         case TCP_WINDOW_CLAMP:
2156                 if (!val) {
2157                         if (sk->sk_state != TCP_CLOSE) {
2158                                 err = -EINVAL;
2159                                 break;
2160                         }
2161                         tp->window_clamp = 0;
2162                 } else
2163                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2164                                                 SOCK_MIN_RCVBUF / 2 : val;
2165                 break;
2166
2167         case TCP_QUICKACK:
2168                 if (!val) {
2169                         tp->ack.pingpong = 1;
2170                 } else {
2171                         tp->ack.pingpong = 0;
2172                         if ((1 << sk->sk_state) &
2173                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2174                             tcp_ack_scheduled(tp)) {
2175                                 tp->ack.pending |= TCP_ACK_PUSHED;
2176                                 cleanup_rbuf(sk, 1);
2177                                 if (!(val & 1))
2178                                         tp->ack.pingpong = 1;
2179                         }
2180                 }
2181                 break;
2182                 
2183 #ifdef CONFIG_ACCEPT_QUEUES
2184         case TCP_ACCEPTQ_SHARE:
2185 #ifdef CONFIG_CKRM
2186                 /* If CKRM is set then the shares are set through rcfs;
2187                  * getting the shares will still succeed. */
2188                 err = -EOPNOTSUPP;
2189                 break;
2190 #else           
2191                 {
2192                         char share_wt[NUM_ACCEPT_QUEUES];
2193                         int i,j;
2194
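                             /* share_wt[] holds one relative weight per accept
                              * class; each class's aq_ratio becomes its weight
                              * divided by the smallest non-zero weight supplied.
                              */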
2195                         if (sk->sk_state != TCP_LISTEN) {
2196                                 err = -EOPNOTSUPP;
                                     break;
                             }
2197
                             /* Never copy in more than the per-class array holds. */
                             if (optlen > NUM_ACCEPT_QUEUES)
                                     optlen = NUM_ACCEPT_QUEUES;
2198                         if (copy_from_user(share_wt, optval, optlen)) {
2199                                 err = -EFAULT;
2200                                 break;
2201                         }
2202                         j = 0;
2203                         for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
2204                                 if (share_wt[i]) {
2205                                         if (!j)
2206                                                 j = share_wt[i];
2207                                         else if (share_wt[i] < j) {
2208                                                 j = share_wt[i];
2209                                         }
2210                                 }
2211                                 else
2212                                         tp->acceptq[i].aq_ratio = 0;
2213                                         
2214                         }
2215                         if (j == 0) {
2216                                 /* Class 0 is always valid. If no share is
2217                                  * specified, give class 0 a share of 1.
2218                                  */
2219                                 share_wt[0] = 1;
2220                                 j = 1;
2221                         }
2222                         for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
2223                                 tp->acceptq[i].aq_ratio = share_wt[i] / j;
2224                                 tp->acceptq[i].aq_cnt = 0;
2225                         }
2226                 }
2227                 break;
2228 #endif
2229 #endif
2230         default:
2231                 err = -ENOPROTOOPT;
2232                 break;
2233         };
2234         release_sock(sk);
2235         return err;
2236 }
2237
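     /*
      * Read back TCP-level socket options.  Unlike tcp_setsockopt() this runs
      * without taking the socket lock; the fixed-size integer options fall
      * through to the common copy-out at the bottom, while TCP_INFO (and the
      * accept-queue statistics) copy out their own structures and return
      * directly.
      */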
2238 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2239                    int __user *optlen)
2240 {
2241         struct tcp_opt *tp = tcp_sk(sk);
2242         int val, len;
2243
2244         if (level != SOL_TCP)
2245                 return tp->af_specific->getsockopt(sk, level, optname,
2246                                                    optval, optlen);
2247
2248         if (get_user(len, optlen))
2249                 return -EFAULT;
2250
2251         len = min_t(unsigned int, len, sizeof(int));
2252
2253         if (len < 0)
2254                 return -EINVAL;
2255
2256         switch (optname) {
2257         case TCP_MAXSEG:
2258                 val = tp->mss_cache_std;
2259                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2260                         val = tp->user_mss;
2261                 break;
2262         case TCP_NODELAY:
2263                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2264                 break;
2265         case TCP_CORK:
2266                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2267                 break;
2268         case TCP_KEEPIDLE:
2269                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2270                 break;
2271         case TCP_KEEPINTVL:
2272                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2273                 break;
2274         case TCP_KEEPCNT:
2275                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2276                 break;
2277         case TCP_SYNCNT:
2278                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2279                 break;
2280         case TCP_LINGER2:
2281                 val = tp->linger2;
2282                 if (val >= 0)
2283                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2284                 break;
2285         case TCP_DEFER_ACCEPT:
2286                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2287                                                (tp->defer_accept - 1));
2288                 break;
2289         case TCP_WINDOW_CLAMP:
2290                 val = tp->window_clamp;
2291                 break;
2292         case TCP_INFO: {
2293                 struct tcp_info info;
2294
2295                 if (get_user(len, optlen))
2296                         return -EFAULT;
2297
2298                 tcp_get_info(sk, &info);
2299
2300                 len = min_t(unsigned int, len, sizeof(info));
2301                 if (put_user(len, optlen))
2302                         return -EFAULT;
2303                 if (copy_to_user(optval, &info, len))
2304                         return -EFAULT;
2305                 return 0;
2306         }
2307         case TCP_QUICKACK:
2308                 val = !tp->ack.pingpong;
2309                 break;
2310
2311 #ifdef CONFIG_ACCEPT_QUEUES
2312         case TCP_ACCEPTQ_SHARE: 
2313         {
2314                 struct tcp_acceptq_info tinfo[NUM_ACCEPT_QUEUES];
2315                 int i;
2316
2317                 if (sk->sk_state != TCP_LISTEN)
2318                         return -EOPNOTSUPP;
2319
2320                 if (get_user(len, optlen))
2321                         return -EFAULT;
2322
2323                 memset(tinfo, 0, sizeof(tinfo));
2324
2325                 for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
2326                         tinfo[i].acceptq_wait_time =
2327                              jiffies_to_msecs(tp->acceptq[i].aq_wait_time);
2328                         tinfo[i].acceptq_qcount = tp->acceptq[i].aq_qcount;
2329                         tinfo[i].acceptq_count = tp->acceptq[i].aq_count;
2330                         tinfo[i].acceptq_shares = tp->acceptq[i].aq_ratio;
2331                 }
2332
2333                 len = min_t(unsigned int, len, sizeof(tinfo));
2334                 if (put_user(len, optlen))
2335                         return -EFAULT;
2336
2337                 if (copy_to_user(optval, (char *)tinfo, len))
2338                         return -EFAULT;
2339
2340                 return 0;
2341         }
2342         break;
2343 #endif
2344         default:
2345                 return -ENOPROTOOPT;
2346         };
2347
2348         if (put_user(len, optlen))
2349                 return -EFAULT;
2350         if (copy_to_user(optval, &val, len))
2351                 return -EFAULT;
2352         return 0;
2353 }
2354
2355
2356 extern void __skb_cb_too_small_for_tcp(int, int);
2357 extern void tcpdiag_init(void);
2358
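     /* "thash_entries=N" on the kernel command line overrides the automatic
      * sizing of the TCP established hash table in tcp_init() below.
      */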
2359 static __initdata unsigned long thash_entries;
2360 static int __init set_thash_entries(char *str)
2361 {
2362         if (!str)
2363                 return 0;
2364         thash_entries = simple_strtoul(str, &str, 0);
2365         return 1;
2366 }
2367 __setup("thash_entries=", set_thash_entries);
2368
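     /*
      * Boot-time TCP initialisation: create the open_request, bind-bucket and
      * time-wait slab caches, size and allocate the established and bind hash
      * tables (from available memory, or from thash_entries=), and scale the
      * local port range, orphan, time-wait and memory sysctl defaults to the
      * resulting allocation order.
      */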
2369 void __init tcp_init(void)
2370 {
2371         struct sk_buff *skb = NULL;
2372         unsigned long goal;
2373         int order, i;
2374
2375         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2376                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2377                                            sizeof(skb->cb));
2378
2379         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2380                                                    sizeof(struct open_request),
2381                                                0, SLAB_HWCACHE_ALIGN,
2382                                                NULL, NULL);
2383         if (!tcp_openreq_cachep)
2384                 panic("tcp_init: Cannot alloc open_request cache.");
2385
2386         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2387                                               sizeof(struct tcp_bind_bucket),
2388                                               0, SLAB_HWCACHE_ALIGN,
2389                                               NULL, NULL);
2390         if (!tcp_bucket_cachep)
2391                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2392
2393         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2394                                                 sizeof(struct tcp_tw_bucket),
2395                                                 0, SLAB_HWCACHE_ALIGN,
2396                                                 NULL, NULL);
2397         if (!tcp_timewait_cachep)
2398                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2399
2400         /* Size and allocate the main established and bind bucket
2401          * hash tables.
2402          *
2403          * The methodology is similar to that of the buffer cache.
2404          */
2405         if (num_physpages >= (128 * 1024))
2406                 goal = num_physpages >> (21 - PAGE_SHIFT);
2407         else
2408                 goal = num_physpages >> (23 - PAGE_SHIFT);
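             /* That is, roughly one page of hash table per 2 MB of memory on
              * machines with at least 128K pages (512 MB with 4 KB pages), and
              * one page per 8 MB otherwise.
              */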
2409
2410         if (thash_entries)
2411                 goal = (thash_entries * sizeof(struct tcp_ehash_bucket)) >> PAGE_SHIFT;
2412         for (order = 0; (1UL << order) < goal; order++)
2413                 ;
2414         do {
2415                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2416                         sizeof(struct tcp_ehash_bucket);
2417                 tcp_ehash_size >>= 1;
2418                 while (tcp_ehash_size & (tcp_ehash_size - 1))
2419                         tcp_ehash_size--;
2420                 tcp_ehash = (struct tcp_ehash_bucket *)
2421                         __get_free_pages(GFP_ATOMIC, order);
2422         } while (!tcp_ehash && --order > 0);
2423
2424         if (!tcp_ehash)
2425                 panic("Failed to allocate TCP established hash table\n");
2426         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2427                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2428                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2429         }
2430
2431         do {
2432                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2433                         sizeof(struct tcp_bind_hashbucket);
2434                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2435                         continue;
2436                 tcp_bhash = (struct tcp_bind_hashbucket *)
2437                         __get_free_pages(GFP_ATOMIC, order);
2438         } while (!tcp_bhash && --order >= 0);
2439
2440         if (!tcp_bhash)
2441                 panic("Failed to allocate TCP bind hash table\n");
2442         for (i = 0; i < tcp_bhash_size; i++) {
2443                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2444                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2445         }
2446
2447         /* Try to be a bit smarter and adjust defaults depending
2448          * on available memory.
2449          */
2450         if (order > 4) {
2451                 sysctl_local_port_range[0] = 32768;
2452                 sysctl_local_port_range[1] = 61000;
2453                 sysctl_tcp_max_tw_buckets = 180000;
2454                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2455                 sysctl_max_syn_backlog = 1024;
2456         } else if (order < 3) {
2457                 sysctl_local_port_range[0] = 1024 * (3 - order);
2458                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2459                 sysctl_tcp_max_orphans >>= (3 - order);
2460                 sysctl_max_syn_backlog = 128;
2461         }
2462         tcp_port_rover = sysctl_local_port_range[0] - 1;
2463
2464         sysctl_tcp_mem[0] =  768 << order;
2465         sysctl_tcp_mem[1] = 1024 << order;
2466         sysctl_tcp_mem[2] = 1536 << order;
2467
2468         if (order < 3) {
2469                 sysctl_tcp_wmem[2] = 64 * 1024;
2470                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2471                 sysctl_tcp_rmem[1] = 43689;
2472                 sysctl_tcp_rmem[2] = 2 * 43689;
2473         }
2474
2475         printk(KERN_INFO "TCP: Hash tables configured "
2476                "(established %d bind %d)\n",
2477                tcp_ehash_size << 1, tcp_bhash_size);
2478
2479         tcpdiag_init();
2480 }
2481
2482 EXPORT_SYMBOL(tcp_accept);
2483 EXPORT_SYMBOL(tcp_close);
2484 EXPORT_SYMBOL(tcp_close_state);
2485 EXPORT_SYMBOL(tcp_destroy_sock);
2486 EXPORT_SYMBOL(tcp_disconnect);
2487 EXPORT_SYMBOL(tcp_getsockopt);
2488 EXPORT_SYMBOL(tcp_ioctl);
2489 EXPORT_SYMBOL(tcp_openreq_cachep);
2490 EXPORT_SYMBOL(tcp_poll);
2491 EXPORT_SYMBOL(tcp_read_sock);
2492 EXPORT_SYMBOL(tcp_recvmsg);
2493 EXPORT_SYMBOL(tcp_sendmsg);
2494 EXPORT_SYMBOL(tcp_sendpage);
2495 EXPORT_SYMBOL(tcp_setsockopt);
2496 EXPORT_SYMBOL(tcp_shutdown);
2497 EXPORT_SYMBOL(tcp_statistics);
2498 EXPORT_SYMBOL(tcp_timewait_cachep);
2499 EXPORT_SYMBOL_GPL(cleanup_rbuf);