net/ipv4/tcp.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed where wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It
  33  *                                      wakes people on errors. poll
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_send_reset() fixed to work for
  37  *                                      everything not just packets for
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames.
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst
  46  *                                      receive otherwise odd bits of prattle
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug.
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries.
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks,
  69  *                                      so the kernel can layer network
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics.
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle poll() after URG properly in
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in
 110  *                                      tcp_readable(), poll() after URG
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in polling before an
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on
 137  *                                      the RFC's for other useful protocol
 138  *                                      references see: Comer, KA9Q NOS, and
 139  *                                      for a reference on the difference
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC
 147  *                                      and using multiple timers for sanity.
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       poll()->select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if state is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but it's a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
 183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in
 190  *                                      tcp_do_retransmit()
 191  *              Eric Schenk     :       Changed receiver side silly window
 192  *                                      avoidance algorithm to BSD style
 193  *                                      algorithm. This doubles throughput
 194  *                                      against machines running Solaris,
 195  *                                      and seems to result in general
 196  *                                      improvement.
 197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198  *      Willy Konynenberg       :       Transparent proxying support.
 199  *      Mike McLagan            :       Routing by source
 200  *              Keith Owens     :       Do proper merging with partial SKB's in
 201  *                                      tcp_do_sendmsg to avoid burstiness.
 202  *              Eric Schenk     :       Fix fast close down bug with
 203  *                                      shutdown() followed by close().
 204  *              Andi Kleen      :       Make poll agree with SIGIO
 205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
 206  *                                      lingertime == 0 (RFC 793 ABORT Call)
 207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
 208  *                                      csum_and_copy_from_user() if possible.
 209  *
 210  *              This program is free software; you can redistribute it and/or
 211  *              modify it under the terms of the GNU General Public License
 212  *              as published by the Free Software Foundation; either version
 213  *              2 of the License, or(at your option) any later version.
 214  *
 215  * Description of States:
 216  *
 217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 218  *
 219  *      TCP_SYN_RECV            received a connection request, sent ack,
 220  *                              waiting for final ack in three-way handshake.
 221  *
 222  *      TCP_ESTABLISHED         connection established
 223  *
 224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 225  *                              transmission of remaining buffered data
 226  *
 227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 228  *                              to shutdown
 229  *
 230  *      TCP_CLOSING             both sides have shutdown but we still have
 231  *                              data we have to finish sending
 232  *
 233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 234  *                              closed, can only be entered from FIN_WAIT2
 235  *                              or CLOSING.  Required because the other end
 236  *                              may not have gotten our last ACK causing it
 237  *                              to retransmit the data packet (which we ignore)
 238  *
 239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 240  *                              us to finish writing our data and to shutdown
 241  *                              (we have to close() to move on to LAST_ACK)
 242  *
 243  *      TCP_LAST_ACK            out side has shutdown after remote has
 244  *                              shutdown.  There may still be data in our
 245  *                              buffer that we have to finish sending
 246  *
 247  *      TCP_CLOSE               socket is finished
 248  */
 249
 250 #include <linux/config.h>
 251 #include <linux/module.h>
 252 #include <linux/types.h>
 253 #include <linux/fcntl.h>
 254 #include <linux/poll.h>
 255 #include <linux/init.h>
 256 #include <linux/smp_lock.h>
 257 #include <linux/fs.h>
 258 #include <linux/random.h>
 259 #include <linux/ckrm.h>
 260
 261 #include <net/icmp.h>
 262 #include <net/tcp.h>
 263 #include <net/xfrm.h>
 264 #include <net/ip.h>
 265
 266
 267 #include <asm/uaccess.h>
 268 #include <asm/ioctls.h>
 269
 270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 271
 272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
 273
 274 kmem_cache_t *tcp_openreq_cachep;
 275 kmem_cache_t *tcp_bucket_cachep;
 276 kmem_cache_t *tcp_timewait_cachep;
 277
 278 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 279
 280 int sysctl_tcp_default_win_scale;
 281
 282 int sysctl_tcp_mem[3];
 283 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
 284 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
 285
 286 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
 287 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
 288
 289 /* Pressure flag: try to collapse.
 290  * Technical note: it is used by multiple contexts non atomically.
 291  * All the tcp_mem_schedule() is of this nature: accounting
 292  * is strict, actions are advisory and have some latency. */
 293 int tcp_memory_pressure;
 294
 295 #define TCP_PAGES(amt) (((amt) + TCP_MEM_QUANTUM - 1) / TCP_MEM_QUANTUM)
 296
 297 int tcp_mem_schedule(struct sock *sk, int size, int kind)
 298 {
 299         int amt = TCP_PAGES(size);
 300
 301         sk->sk_forward_alloc += amt * TCP_MEM_QUANTUM;
 302         atomic_add(amt, &tcp_memory_allocated);
 303
 304         /* Under limit. */
 305         if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
 306                 if (tcp_memory_pressure)
 307                         tcp_memory_pressure = 0;
 308                 return 1;
 309         }
 310
 311         /* Over hard limit. */
 312         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
 313                 tcp_enter_memory_pressure();
 314                 goto suppress_allocation;
 315         }
 316
 317         /* Under pressure. */
 318         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
 319                 tcp_enter_memory_pressure();
 320
 321         if (kind) {
 322                 if (atomic_read(&sk->sk_rmem_alloc) < sysctl_tcp_rmem[0])
 323                         return 1;
 324         } else if (sk->sk_wmem_queued < sysctl_tcp_wmem[0])
 325                 return 1;
 326
 327         if (!tcp_memory_pressure ||
 328             sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) *
 329                                 TCP_PAGES(sk->sk_wmem_queued +
 330                                           atomic_read(&sk->sk_rmem_alloc) +
 331                                           sk->sk_forward_alloc))
 332                 return 1;
 333
 334 suppress_allocation:
 335
 336         if (!kind) {
 337                 tcp_moderate_sndbuf(sk);
 338
 339                 /* Fail only if socket is _under_ its sndbuf.
 340                  * In this case we cannot block, so that we have to fail.
 341                  */
 342                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
 343                         return 1;
 344         }
 345
 346         /* Alas. Undo changes. */
 347         sk->sk_forward_alloc -= amt * TCP_MEM_QUANTUM;
 348         atomic_sub(amt, &tcp_memory_allocated);
 349         return 0;
 350 }
 351
 352 void __tcp_mem_reclaim(struct sock *sk)
 353 {
 354         if (sk->sk_forward_alloc >= TCP_MEM_QUANTUM) {
 355                 atomic_sub(sk->sk_forward_alloc / TCP_MEM_QUANTUM,
 356                            &tcp_memory_allocated);
 357                 sk->sk_forward_alloc &= TCP_MEM_QUANTUM - 1;
 358                 if (tcp_memory_pressure &&
 359                     atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
 360                         tcp_memory_pressure = 0;
 361         }
 362 }
 363
 364 void tcp_rfree(struct sk_buff *skb)
 365 {
 366         struct sock *sk = skb->sk;
 367
 368         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
 369         sk->sk_forward_alloc += skb->truesize;
 370 }
 371
 372 /*
 373  * LISTEN is a special case for poll..
 374  */
 375 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
 376                                                poll_table *wait)
 377 {
 378         return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
 379 }
 380
 381 /*
 382  *      Wait for a TCP event.
 383  *
 384  *      Note that we don't need to lock the socket, as the upper poll layers
 385  *      take care of normal races (between the test and the event) and we don't
 386  *      go look at any of the socket buffers directly.
 387  */
 388 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 389 {
 390         unsigned int mask;
 391         struct sock *sk = sock->sk;
 392         struct tcp_opt *tp = tcp_sk(sk);
 393
 394         poll_wait(file, sk->sk_sleep, wait);
 395         if (sk->sk_state == TCP_LISTEN)
 396                 return tcp_listen_poll(sk, wait);
 397
 398         /* Socket is not locked. We are protected from async events
 399            by poll logic and correct handling of state changes
 400            made by another threads is impossible in any case.
 401          */
 402
 403         mask = 0;
 404         if (sk->sk_err)
 405                 mask = POLLERR;
 406
 407         /*
 408          * POLLHUP is certainly not done right. But poll() doesn't
 409          * have a notion of HUP in just one direction, and for a
 410          * socket the read side is more interesting.
 411          *
 412          * Some poll() documentation says that POLLHUP is incompatible
 413          * with the POLLOUT/POLLWR flags, so somebody should check this
 414          * all. But careful, it tends to be safer to return too many
 415          * bits than too few, and you can easily break real applications
 416          * if you don't tell them that something has hung up!
 417          *
 418          * Check-me.
 419          *
 420          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
 421          * our fs/select.c). It means that after we received EOF,
 422          * poll always returns immediately, making impossible poll() on write()
 423          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
 424          * if and only if shutdown has been made in both directions.
 425          * Actually, it is interesting to look how Solaris and DUX
 426          * solve this dilemma. I would prefer, if PULLHUP were maskable,
 427          * then we could set it on SND_SHUTDOWN. BTW examples given
 428          * in Stevens' books assume exactly this behaviour, it explains
 429          * why PULLHUP is incompatible with POLLOUT.    --ANK
 430          *
 431          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 432          * blocking on fresh not-connected or disconnected socket. --ANK
 433          */
 434         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
 435                 mask |= POLLHUP;
 436         if (sk->sk_shutdown & RCV_SHUTDOWN)
 437                 mask |= POLLIN | POLLRDNORM;
 438
 439         /* Connected? */
 440         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 441                 /* Potential race condition. If read of tp below will
 442                  * escape above sk->sk_state, we can be illegally awaken
 443                  * in SYN_* states. */
 444                 if ((tp->rcv_nxt != tp->copied_seq) &&
 445                     (tp->urg_seq != tp->copied_seq ||
 446                      tp->rcv_nxt != tp->copied_seq + 1 ||
 447                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
 448                         mask |= POLLIN | POLLRDNORM;
 449
 450                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
 451                         if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
 452                                 mask |= POLLOUT | POLLWRNORM;
 453                         } else {  /* send SIGIO later */
 454                                 set_bit(SOCK_ASYNC_NOSPACE,
 455                                         &sk->sk_socket->flags);
 456                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 457
 458                                 /* Race breaker. If space is freed after
 459                                  * wspace test but before the flags are set,
 460                                  * IO signal will be lost.
 461                                  */
 462                                 if (tcp_wspace(sk) >= tcp_min_write_space(sk))
 463                                         mask |= POLLOUT | POLLWRNORM;
 464                         }
 465                 }
 466
 467                 if (tp->urg_data & TCP_URG_VALID)
 468                         mask |= POLLPRI;
 469         }
 470         return mask;
 471 }
 472
 473 /*
 474  *      TCP socket write_space callback.
 475  */
 476 void tcp_write_space(struct sock *sk)
 477 {
 478         struct socket *sock = sk->sk_socket;
 479
 480         if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
 481                 clear_bit(SOCK_NOSPACE, &sock->flags);
 482
 483                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
 484                         wake_up_interruptible(sk->sk_sleep);
 485
 486                 if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 487                         sock_wake_async(sock, 2, POLL_OUT);
 488         }
 489 }
 490
 491 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 492 {
 493         struct tcp_opt *tp = tcp_sk(sk);
 494         int answ;
 495
 496         switch (cmd) {
 497         case SIOCINQ:
 498                 if (sk->sk_state == TCP_LISTEN)
 499                         return -EINVAL;
 500
 501                 lock_sock(sk);
 502                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 503                         answ = 0;
 504                 else if (sock_flag(sk, SOCK_URGINLINE) ||
 505                          !tp->urg_data ||
 506                          before(tp->urg_seq, tp->copied_seq) ||
 507                          !before(tp->urg_seq, tp->rcv_nxt)) {
 508                         answ = tp->rcv_nxt - tp->copied_seq;
 509
 510                         /* Subtract 1, if FIN is in queue. */
 511                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
 512                                 answ -=
 513                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
 514                 } else
 515                         answ = tp->urg_seq - tp->copied_seq;
 516                 release_sock(sk);
 517                 break;
 518         case SIOCATMARK:
 519                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 520                 break;
 521         case SIOCOUTQ:
 522                 if (sk->sk_state == TCP_LISTEN)
 523                         return -EINVAL;
 524
 525                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 526                         answ = 0;
 527                 else
 528                         answ = tp->write_seq - tp->snd_una;
 529                 break;
 530         default:
 531                 return -ENOIOCTLCMD;
 532         };
 533
 534         return put_user(answ, (int __user *)arg);
 535 }
 536
 537
 538 int tcp_listen_start(struct sock *sk)
 539 {
 540 #ifdef CONFIG_ACCEPT_QUEUES
 541         int i = 0;
 542 #endif
 543         struct inet_opt *inet = inet_sk(sk);
 544         struct tcp_opt *tp = tcp_sk(sk);
 545         struct tcp_listen_opt *lopt;
 546
 547         sk->sk_max_ack_backlog = 0;
 548         sk->sk_ack_backlog = 0;
 549         tp->accept_queue = NULL;
 550 #ifdef CONFIG_ACCEPT_QUEUES
 551         tp->class_index = 0;
 552         for (i=0; i < NUM_ACCEPT_QUEUES; i++) {
 553                 tp->acceptq[i].aq_tail = NULL;
 554                 tp->acceptq[i].aq_head = NULL;
 555                 tp->acceptq[i].aq_wait_time = 0;
 556                 tp->acceptq[i].aq_qcount = 0;
 557                 tp->acceptq[i].aq_count = 0;
 558                 if (i == 0) {
 559                         tp->acceptq[i].aq_valid = 1;
 560                         tp->acceptq[i].aq_ratio = 1;
 561                 }
 562                 else {
 563                         tp->acceptq[i].aq_valid = 0;
 564                         tp->acceptq[i].aq_ratio = 0;
 565                 }
 566         }
 567 #endif
 568         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
 569         tcp_delack_init(tp);
 570
 571         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
 572         if (!lopt)
 573                 return -ENOMEM;
 574
 575         memset(lopt, 0, sizeof(struct tcp_listen_opt));
 576         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
 577                 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
 578                         break;
 579         get_random_bytes(&lopt->hash_rnd, 4);
 580
 581         write_lock_bh(&tp->syn_wait_lock);
 582         tp->listen_opt = lopt;
 583         write_unlock_bh(&tp->syn_wait_lock);
 584
 585         /* There is race window here: we announce ourselves listening,
 586          * but this transition is still not validated by get_port().
 587          * It is OK, because this socket enters to hash table only
 588          * after validation is complete.
 589          */
 590         sk->sk_state = TCP_LISTEN;
 591         if (!sk->sk_prot->get_port(sk, inet->num)) {
 592                 inet->sport = htons(inet->num);
 593
 594                 sk_dst_reset(sk);
 595                 sk->sk_prot->hash(sk);
 596
 597 #ifdef CONFIG_CKRM
 598                 ckrm_cb_listen_start(sk);
 599 #endif
 600
 601                 return 0;
 602         }
 603
 604         sk->sk_state = TCP_CLOSE;
 605         write_lock_bh(&tp->syn_wait_lock);
 606         tp->listen_opt = NULL;
 607         write_unlock_bh(&tp->syn_wait_lock);
 608         kfree(lopt);
 609         return -EADDRINUSE;
 610 }
 611
 612 /*
 613  *      This routine closes sockets which have been at least partially
 614  *      opened, but not yet accepted.
 615  */
 616
 617 static void tcp_listen_stop (struct sock *sk)
 618 {
 619         struct tcp_opt *tp = tcp_sk(sk);
 620         struct tcp_listen_opt *lopt = tp->listen_opt;
 621         struct open_request *acc_req = tp->accept_queue;
 622         struct open_request *req;
 623         int i;
 624
 625         tcp_delete_keepalive_timer(sk);
 626
 627         /* make all the listen_opt local to us */
 628         write_lock_bh(&tp->syn_wait_lock);
 629         tp->listen_opt = NULL;
 630         write_unlock_bh(&tp->syn_wait_lock);
 631
 632 #ifdef CONFIG_CKRM
 633                 ckrm_cb_listen_stop(sk);
 634 #endif
 635
 636 #ifdef CONFIG_ACCEPT_QUEUES
 637         for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
 638                 tp->acceptq[i].aq_head = tp->acceptq[i].aq_tail = NULL;
 639 #else
 640         tp->accept_queue_tail = NULL;
 641 #endif
 642         tp->accept_queue = NULL;
 643
 644         if (lopt->qlen) {
 645                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
 646                         while ((req = lopt->syn_table[i]) != NULL) {
 647                                 lopt->syn_table[i] = req->dl_next;
 648                                 lopt->qlen--;
 649                                 tcp_openreq_free(req);
 650
 651                 /* Following specs, it would be better either to send FIN
 652                  * (and enter FIN-WAIT-1, it is normal close)
 653                  * or to send active reset (abort).
 654                  * Certainly, it is pretty dangerous while synflood, but it is
 655                  * bad justification for our negligence 8)
 656                  * To be honest, we are not able to make either
 657                  * of the variants now.                 --ANK
 658                  */
 659                         }
 660                 }
 661         }
 662         BUG_TRAP(!lopt->qlen);
 663
 664         kfree(lopt);
 665
 666         while ((req = acc_req) != NULL) {
 667                 struct sock *child = req->sk;
 668
 669                 acc_req = req->dl_next;
 670
 671                 local_bh_disable();
 672                 bh_lock_sock(child);
 673                 BUG_TRAP(!sock_owned_by_user(child));
 674                 sock_hold(child);
 675
 676                 tcp_disconnect(child, O_NONBLOCK);
 677
 678                 sock_orphan(child);
 679
 680                 atomic_inc(&tcp_orphan_count);
 681
 682                 tcp_destroy_sock(child);
 683
 684                 bh_unlock_sock(child);
 685                 local_bh_enable();
 686                 sock_put(child);
 687
 688 #ifdef CONFIG_ACCEPT_QUEUES
 689                 tcp_acceptq_removed(sk, req->acceptq_class);
 690 #else
 691                 tcp_acceptq_removed(sk);
 692 #endif
 693                 tcp_openreq_fastfree(req);
 694         }
 695         BUG_TRAP(!sk->sk_ack_backlog);
 696 }
 697
 698 /*
 699  *      Wait for a socket to get into the connected state
 700  *
 701  *      Note: Must be called with the socket locked.
 702  */
 703 static int wait_for_tcp_connect(struct sock *sk, int flags, long *timeo_p)
 704 {
 705         struct tcp_opt *tp = tcp_sk(sk);
 706         struct task_struct *tsk = current;
 707         DEFINE_WAIT(wait);
 708
 709         while ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
 710                 if (sk->sk_err)
 711                         return sock_error(sk);
 712                 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
 713                         return -EPIPE;
 714                 if (!*timeo_p)
 715                         return -EAGAIN;
 716                 if (signal_pending(tsk))
 717                         return sock_intr_errno(*timeo_p);
 718
 719                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 720                 tp->write_pending++;
 721
 722                 release_sock(sk);
 723                 *timeo_p = schedule_timeout(*timeo_p);
 724                 lock_sock(sk);
 725
 726                 finish_wait(sk->sk_sleep, &wait);
 727                 tp->write_pending--;
 728         }
 729         return 0;
 730 }
 731
 732 static inline int tcp_memory_free(struct sock *sk)
 733 {
 734         return sk->sk_wmem_queued < sk->sk_sndbuf;
 735 }
 736
 737 /*
 738  *      Wait for more memory for a socket
 739  */
 740 static int wait_for_tcp_memory(struct sock *sk, long *timeo)
 741 {
 742         struct tcp_opt *tp = tcp_sk(sk);
 743         int err = 0;
 744         long vm_wait = 0;
 745         long current_timeo = *timeo;
 746         DEFINE_WAIT(wait);
 747
 748         if (tcp_memory_free(sk))
 749                 current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
 750
 751         for (;;) {
 752                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 753
 754                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 755
 756                 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 757                         goto do_error;
 758                 if (!*timeo)
 759                         goto do_nonblock;
 760                 if (signal_pending(current))
 761                         goto do_interrupted;
 762                 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 763                 if (tcp_memory_free(sk) && !vm_wait)
 764                         break;
 765
 766                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 767                 tp->write_pending++;
 768                 release_sock(sk);
 769                 if (!tcp_memory_free(sk) || vm_wait)
 770                         current_timeo = schedule_timeout(current_timeo);
 771                 lock_sock(sk);
 772                 tp->write_pending--;
 773
 774                 if (vm_wait) {
 775                         vm_wait -= current_timeo;
 776                         current_timeo = *timeo;
 777                         if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
 778                             (current_timeo -= vm_wait) < 0)
 779                                 current_timeo = 0;
 780                         vm_wait = 0;
 781                 }
 782                 *timeo = current_timeo;
 783         }
 784 out:
 785         finish_wait(sk->sk_sleep, &wait);
 786         return err;
 787
 788 do_error:
 789         err = -EPIPE;
 790         goto out;
 791 do_nonblock:
 792         err = -EAGAIN;
 793         goto out;
 794 do_interrupted:
 795         err = sock_intr_errno(*timeo);
 796         goto out;
 797 }
 798
 799 static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
 800                                int off)
 801 {
 802         if (i) {
 803                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
 804                 return page == frag->page &&
 805                        off == frag->page_offset + frag->size;
 806         }
 807         return 0;
 808 }
 809
 810 static inline void fill_page_desc(struct sk_buff *skb, int i,
 811                                   struct page *page, int off, int size)
 812 {
 813         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 814         frag->page = page;
 815         frag->page_offset = off;
 816         frag->size = size;
 817         skb_shinfo(skb)->nr_frags = i + 1;
 818 }
 819
 820 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
 821 {
 822         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 823         tp->pushed_seq = tp->write_seq;
 824 }
 825
 826 static inline int forced_push(struct tcp_opt *tp)
 827 {
 828         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
 829 }
 830
 831 static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
 832                               struct sk_buff *skb)
 833 {
 834         skb->csum = 0;
 835         TCP_SKB_CB(skb)->seq = tp->write_seq;
 836         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
 837         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
 838         TCP_SKB_CB(skb)->sacked = 0;
 839         __skb_queue_tail(&sk->sk_write_queue, skb);
 840         tcp_charge_skb(sk, skb);
 841         if (!tp->send_head)
 842                 tp->send_head = skb;
 843         else if (tp->nonagle&TCP_NAGLE_PUSH)
 844                 tp->nonagle &= ~TCP_NAGLE_PUSH;
 845 }
 846
 847 static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
 848                                 struct sk_buff *skb)
 849 {
 850         if (flags & MSG_OOB) {
 851                 tp->urg_mode = 1;
 852                 tp->snd_up = tp->write_seq;
 853                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
 854         }
 855 }
 856
 857 static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
 858                             int mss_now, int nonagle)
 859 {
 860         if (tp->send_head) {
 861                 struct sk_buff *skb = sk->sk_write_queue.prev;
 862                 if (!(flags & MSG_MORE) || forced_push(tp))
 863                         tcp_mark_push(tp, skb);
 864                 tcp_mark_urg(tp, flags, skb);
 865                 __tcp_push_pending_frames(sk, tp, mss_now,
 866                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
 867         }
 868 }
 869
 870 static int tcp_error(struct sock *sk, int flags, int err)
 871 {
 872         if (err == -EPIPE)
 873                 err = sock_error(sk) ? : -EPIPE;
 874         if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
 875                 send_sig(SIGPIPE, current, 0);
 876         return err;
 877 }
 878
 879 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
 880                          size_t psize, int flags)
 881 {
 882         struct tcp_opt *tp = tcp_sk(sk);
 883         int mss_now;
 884         int err;
 885         ssize_t copied;
 886         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 887
 888         /* Wait for a connection to finish. */
 889         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 890                 if ((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
 891                         goto out_err;
 892
 893         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 894
 895         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 896         copied = 0;
 897
 898         err = -EPIPE;
 899         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 900                 goto do_error;
 901
 902         while (psize > 0) {
 903                 struct sk_buff *skb = sk->sk_write_queue.prev;
 904                 struct page *page = pages[poffset / PAGE_SIZE];
 905                 int copy, i;
 906                 int offset = poffset % PAGE_SIZE;
 907                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
 908
 909                 if (!tp->send_head || (copy = mss_now - skb->len) <= 0) {
 910 new_segment:
 911                         if (!tcp_memory_free(sk))
 912                                 goto wait_for_sndbuf;
 913
 914                         skb = tcp_alloc_pskb(sk, 0, tp->mss_cache,
 915                                              sk->sk_allocation);
 916                         if (!skb)
 917                                 goto wait_for_memory;
 918
 919                         skb_entail(sk, tp, skb);
 920                         copy = mss_now;
 921                 }
 922
 923                 if (copy > size)
 924                         copy = size;
 925
 926                 i = skb_shinfo(skb)->nr_frags;
 927                 if (can_coalesce(skb, i, page, offset)) {
 928                         skb_shinfo(skb)->frags[i - 1].size += copy;
 929                 } else if (i < MAX_SKB_FRAGS) {
 930                         get_page(page);
 931                         fill_page_desc(skb, i, page, offset, copy);
 932                 } else {
 933                         tcp_mark_push(tp, skb);
 934                         goto new_segment;
 935                 }
 936
 937                 skb->len += copy;
 938                 skb->data_len += copy;
 939                 skb->ip_summed = CHECKSUM_HW;
 940                 tp->write_seq += copy;
 941                 TCP_SKB_CB(skb)->end_seq += copy;
 942
 943                 if (!copied)
 944                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
 945
 946                 copied += copy;
 947                 poffset += copy;
 948                 if (!(psize -= copy))
 949                         goto out;
 950
 951                 if (skb->len != mss_now || (flags & MSG_OOB))
 952                         continue;
 953
 954                 if (forced_push(tp)) {
 955                         tcp_mark_push(tp, skb);
 956                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
 957                 } else if (skb == tp->send_head)
 958                         tcp_push_one(sk, mss_now);
 959                 continue;
 960
 961 wait_for_sndbuf:
 962                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 963 wait_for_memory:
 964                 if (copied)
 965                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 966
 967                 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
 968                         goto do_error;
 969
 970                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 971         }
 972
 973 out:
 974         if (copied)
 975                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
 976         return copied;
 977
 978 do_error:
 979         if (copied)
 980                 goto out;
 981 out_err:
 982         return tcp_error(sk, flags, err);
 983 }
 984
 985 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 986                      size_t size, int flags)
 987 {
 988         ssize_t res;
 989         struct sock *sk = sock->sk;
 990
 991 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
 992
 993         if (!(sk->sk_route_caps & NETIF_F_SG) ||
 994             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
 995                 return sock_no_sendpage(sock, page, offset, size, flags);
 996
 997 #undef TCP_ZC_CSUM_FLAGS
 998
 999         lock_sock(sk);
1000         TCP_CHECK_TIMER(sk);
1001         res = do_tcp_sendpages(sk, &page, offset, size, flags);
1002         TCP_CHECK_TIMER(sk);
1003         release_sock(sk);
1004         return res;
1005 }
1006
1007 #define TCP_PAGE(sk)    (inet_sk(sk)->sndmsg_page)
1008 #define TCP_OFF(sk)     (inet_sk(sk)->sndmsg_off)
1009
1010 static inline int tcp_copy_to_page(struct sock *sk, char __user *from,
1011                                    struct sk_buff *skb, struct page *page,
1012                                    int off, int copy)
1013 {
1014         int err = 0;
1015         unsigned int csum;
1016
1017         if (skb->ip_summed == CHECKSUM_NONE) {
1018                 csum = csum_and_copy_from_user(from, page_address(page) + off,
1019                                        copy, 0, &err);
1020                 if (err) return err;
1021                 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1022         } else {
1023                 if (copy_from_user(page_address(page) + off, from, copy))
1024                         return -EFAULT;
1025         }
1026
1027         skb->len += copy;
1028         skb->data_len += copy;
1029         skb->truesize += copy;
1030         sk->sk_wmem_queued += copy;
1031         sk->sk_forward_alloc -= copy;
1032         return 0;
1033 }
1034
1035 static inline int skb_add_data(struct sk_buff *skb, char __user *from, int copy)
1036 {
1037         int err = 0;
1038         unsigned int csum;
1039         int off = skb->len;
1040
1041         if (skb->ip_summed == CHECKSUM_NONE) {
1042                 csum = csum_and_copy_from_user(from, skb_put(skb, copy),
1043                                        copy, 0, &err);
1044                 if (!err) {
1045                         skb->csum = csum_block_add(skb->csum, csum, off);
1046                         return 0;
1047                 }
1048         } else {
1049                 if (!copy_from_user(skb_put(skb, copy), from, copy))
1050                         return 0;
1051         }
1052
1053         __skb_trim(skb, off);
1054         return -EFAULT;
1055 }
1056
1057 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1058 {
1059         int tmp = tp->mss_cache_std;
1060
1061         if (sk->sk_route_caps & NETIF_F_SG) {
1062                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1063
1064                 if (tmp >= pgbreak &&
1065                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1066                         tmp = pgbreak;
1067         }
1068         return tmp;
1069 }
1070
1071 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1072                 size_t size)
1073 {
1074         struct iovec *iov;
1075         struct tcp_opt *tp = tcp_sk(sk);
1076         struct sk_buff *skb;
1077         int iovlen, flags;
1078         int mss_now;
1079         int err, copied;
1080         long timeo;
1081
1082         lock_sock(sk);
1083         TCP_CHECK_TIMER(sk);
1084
1085         flags = msg->msg_flags;
1086         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1087
1088         /* Wait for a connection to finish. */
1089         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1090                 if ((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1091                         goto out_err;
1092
1093         /* This should be in poll */
1094         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1095
1096         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1097
1098         /* Ok commence sending. */
1099         iovlen = msg->msg_iovlen;
1100         iov = msg->msg_iov;
1101         copied = 0;
1102
1103         err = -EPIPE;
1104         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1105                 goto do_error;
1106
1107         while (--iovlen >= 0) {
1108                 int seglen = iov->iov_len;
1109                 unsigned char __user *from = iov->iov_base;
1110
1111                 iov++;
1112
1113                 while (seglen > 0) {
1114                         int copy;
1115
1116                         skb = sk->sk_write_queue.prev;
1117
1118                         if (!tp->send_head ||
1119                             (copy = mss_now - skb->len) <= 0) {
1120
1121 new_segment:
1122                                 /* Allocate new segment. If the interface is SG,
1123                                  * allocate skb fitting to single page.
1124                                  */
1125                                 if (!tcp_memory_free(sk))
1126                                         goto wait_for_sndbuf;
1127
1128                                 skb = tcp_alloc_pskb(sk, select_size(sk, tp),
1129                                                      0, sk->sk_allocation);
1130                                 if (!skb)
1131                                         goto wait_for_memory;
1132
1133                                 /*
1134                                  * Check whether we can use HW checksum.
1135                                  */
1136                                 if (sk->sk_route_caps &
1137                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
1138                                      NETIF_F_HW_CSUM))
1139                                         skb->ip_summed = CHECKSUM_HW;
1140
1141                                 skb_entail(sk, tp, skb);
1142                                 copy = mss_now;
1143                         }
1144
1145                         /* Try to append data to the end of skb. */
1146                         if (copy > seglen)
1147                                 copy = seglen;
1148
1149                         /* Where to copy to? */
1150                         if (skb_tailroom(skb) > 0) {
1151                                 /* We have some space in skb head. Superb! */
1152                                 if (copy > skb_tailroom(skb))
1153                                         copy = skb_tailroom(skb);
1154                                 if ((err = skb_add_data(skb, from, copy)) != 0)
1155                                         goto do_fault;
1156                         } else {
1157                                 int merge = 0;
1158                                 int i = skb_shinfo(skb)->nr_frags;
1159                                 struct page *page = TCP_PAGE(sk);
1160                                 int off = TCP_OFF(sk);
1161
1162                                 if (can_coalesce(skb, i, page, off) &&
1163                                     off != PAGE_SIZE) {
1164                                         /* We can extend the last page
1165                                          * fragment. */
1166                                         merge = 1;
1167                                 } else if (i == MAX_SKB_FRAGS ||
1168                                            (!i &&
1169                                            !(sk->sk_route_caps & NETIF_F_SG))) {
1170                                         /* Need to add new fragment and cannot
1171                                          * do this because interface is non-SG,
1172                                          * or because all the page slots are
1173                                          * busy. */
1174                                         tcp_mark_push(tp, skb);
1175                                         goto new_segment;
1176                                 } else if (page) {
1177                                         /* If page is cached, align
1178                                          * offset to L1 cache boundary
1179                                          */
1180                                         off = (off + L1_CACHE_BYTES - 1) &
1181                                               ~(L1_CACHE_BYTES - 1);
1182                                         if (off == PAGE_SIZE) {
1183                                                 put_page(page);
1184                                                 TCP_PAGE(sk) = page = NULL;
1185                                         }
1186                                 }
1187
1188                                 if (!page) {
1189                                         /* Allocate new cache page. */
1190                                         if (!(page = tcp_alloc_page(sk)))
1191                                                 goto wait_for_memory;
1192                                         off = 0;
1193                                 }
1194
1195                                 if (copy > PAGE_SIZE - off)
1196                                         copy = PAGE_SIZE - off;
1197
1198                                 /* Time to copy data. We are close to
1199                                  * the end! */
1200                                 err = tcp_copy_to_page(sk, from, skb, page,
1201                                                        off, copy);
1202                                 if (err) {
1203                                         /* If this page was new, give it to the
1204                                          * socket so it does not get leaked.
1205                                          */
1206                                         if (!TCP_PAGE(sk)) {
1207                                                 TCP_PAGE(sk) = page;
1208                                                 TCP_OFF(sk) = 0;
1209                                         }
1210                                         goto do_error;
1211                                 }
1212
1213                                 /* Update the skb. */
1214                                 if (merge) {
1215                                         skb_shinfo(skb)->frags[i - 1].size +=
1216                                                                         copy;
1217                                 } else {
1218                                         fill_page_desc(skb, i, page, off, copy);
1219                                         if (TCP_PAGE(sk)) {
1220                                                 get_page(page);
1221                                         } else if (off + copy < PAGE_SIZE) {
1222                                                 get_page(page);
1223                                                 TCP_PAGE(sk) = page;
1224                                         }
1225                                 }
1226
1227                                 TCP_OFF(sk) = off + copy;
1228                         }
1229
1230                         if (!copied)
1231                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1232
1233                         tp->write_seq += copy;
1234                         TCP_SKB_CB(skb)->end_seq += copy;
1235
1236                         from += copy;
1237                         copied += copy;
1238                         if ((seglen -= copy) == 0 && iovlen == 0)
1239                                 goto out;
1240
1241                         if (skb->len != mss_now || (flags & MSG_OOB))
1242                                 continue;
1243
1244                         if (forced_push(tp)) {
1245                                 tcp_mark_push(tp, skb);
1246                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1247                         } else if (skb == tp->send_head)
1248                                 tcp_push_one(sk, mss_now);
1249                         continue;
1250
1251 wait_for_sndbuf:
1252                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1253 wait_for_memory:
1254                         if (copied)
1255                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1256
1257                         if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1258                                 goto do_error;
1259
1260                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1261                 }
1262         }
1263
1264 out:
1265         if (copied)
1266                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1267         TCP_CHECK_TIMER(sk);
1268         release_sock(sk);
1269         return copied;
1270
1271 do_fault:
1272         if (!skb->len) {
1273                 if (tp->send_head == skb)
1274                         tp->send_head = NULL;
1275                 __skb_unlink(skb, skb->list);
1276                 tcp_free_skb(sk, skb);
1277         }
1278
1279 do_error:
1280         if (copied)
1281                 goto out;
1282 out_err:
1283         err = tcp_error(sk, flags, err);
1284         TCP_CHECK_TIMER(sk);
1285         release_sock(sk);
1286         return err;
1287 }
1288
1289 /*
1290  *      Handle reading urgent data. BSD has very simple semantics for
1291  *      this, no blocking and very strange errors 8)
1292  */
1293
1294 static int tcp_recv_urg(struct sock *sk, long timeo,
1295                         struct msghdr *msg, int len, int flags,
1296                         int *addr_len)
1297 {
1298         struct tcp_opt *tp = tcp_sk(sk);
1299
1300         /* No URG data to read. */
1301         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1302             tp->urg_data == TCP_URG_READ)
1303                 return -EINVAL; /* Yes this is right ! */
1304
1305         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1306                 return -ENOTCONN;
1307
1308         if (tp->urg_data & TCP_URG_VALID) {
1309                 int err = 0;
1310                 char c = tp->urg_data;
1311
1312                 if (!(flags & MSG_PEEK))
1313                         tp->urg_data = TCP_URG_READ;
1314
1315                 /* Read urgent data. */
1316                 msg->msg_flags |= MSG_OOB;
1317
1318                 if (len > 0) {
1319                         if (!(flags & MSG_TRUNC))
1320                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1321                         len = 1;
1322                 } else
1323                         msg->msg_flags |= MSG_TRUNC;
1324
1325                 return err ? -EFAULT : len;
1326         }
1327
1328         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1329                 return 0;
1330
1331         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1332          * the available implementations agree in this case:
1333          * this call should never block, independent of the
1334          * blocking state of the socket.
1335          * Mike <pall@rz.uni-karlsruhe.de>
1336          */
1337         return -EAGAIN;
1338 }
1339
1340 /*
1341  *      Release a skb if it is no longer needed. This routine
1342  *      must be called with interrupts disabled or with the
1343  *      socket locked so that the sk_buff queue operation is ok.
1344  */
1345
1346 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
1347 {
1348         __skb_unlink(skb, &sk->sk_receive_queue);
1349         __kfree_skb(skb);
1350 }
1351
1352 /* Clean up the receive buffer for full frames taken by the user,
1353  * then send an ACK if necessary.  COPIED is the number of bytes
1354  * tcp_recvmsg has given to the user so far, it speeds up the
1355  * calculation of whether or not we must ACK for the sake of
1356  * a window update.
1357  */
1358 void cleanup_rbuf(struct sock *sk, int copied)
1359 {
1360         struct tcp_opt *tp = tcp_sk(sk);
1361         int time_to_ack = 0;
1362
1363 #if TCP_DEBUG
1364         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1365
1366         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1367 #endif
1368
1369         if (tcp_ack_scheduled(tp)) {
1370                    /* Delayed ACKs frequently hit locked sockets during bulk
1371                     * receive. */
1372                 if (tp->ack.blocked ||
1373                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1374                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1375                     /*
1376                      * If this read emptied read buffer, we send ACK, if
1377                      * connection is not bidirectional, user drained
1378                      * receive buffer and there was a small segment
1379                      * in queue.
1380                      */
1381                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1382                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1383                         time_to_ack = 1;
1384         }
1385
1386         /* We send an ACK if we can now advertise a non-zero window
1387          * which has been raised "significantly".
1388          *
1389          * Even if window raised up to infinity, do not send window open ACK
1390          * in states, where we will not receive more. It is useless.
1391          */
1392         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1393                 __u32 rcv_window_now = tcp_receive_window(tp);
1394
1395                 /* Optimize, __tcp_select_window() is not cheap. */
1396                 if (2*rcv_window_now <= tp->window_clamp) {
1397                         __u32 new_window = __tcp_select_window(sk);
1398
1399                         /* Send ACK now, if this read freed lots of space
1400                          * in our buffer. Certainly, new_window is new window.
1401                          * We can advertise it now, if it is not less than current one.
1402                          * "Lots" means "at least twice" here.
1403                          */
1404                         if (new_window && new_window >= 2 * rcv_window_now)
1405                                 time_to_ack = 1;
1406                 }
1407         }
1408         if (time_to_ack)
1409                 tcp_send_ack(sk);
1410 }
1411
1412 /* Now socket state including sk->sk_err is changed only under lock,
1413  * hence we may omit checks after joining wait queue.
1414  * We check receive queue before schedule() only as optimization;
1415  * it is very likely that release_sock() added new data.
1416  */
1417
1418 static long tcp_data_wait(struct sock *sk, long timeo)
1419 {
1420         DEFINE_WAIT(wait);
1421
1422         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1423
1424         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1425         release_sock(sk);
1426
1427         if (skb_queue_empty(&sk->sk_receive_queue))
1428                 timeo = schedule_timeout(timeo);
1429
1430         lock_sock(sk);
1431         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1432
1433         finish_wait(sk->sk_sleep, &wait);
1434         return timeo;
1435 }
1436
1437 static void tcp_prequeue_process(struct sock *sk)
1438 {
1439         struct sk_buff *skb;
1440         struct tcp_opt *tp = tcp_sk(sk);
1441
1442         NET_ADD_STATS_USER(TCPPrequeued, skb_queue_len(&tp->ucopy.prequeue));
1443
1444         /* RX process wants to run with disabled BHs, though it is not
1445          * necessary */
1446         local_bh_disable();
1447         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1448                 sk->sk_backlog_rcv(sk, skb);
1449         local_bh_enable();
1450
1451         /* Clear memory counter. */
1452         tp->ucopy.memory = 0;
1453 }
1454
1455 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1456 {
1457         struct sk_buff *skb;
1458         u32 offset;
1459
1460         skb_queue_walk(&sk->sk_receive_queue, skb) {
1461                 offset = seq - TCP_SKB_CB(skb)->seq;
1462                 if (skb->h.th->syn)
1463                         offset--;
1464                 if (offset < skb->len || skb->h.th->fin) {
1465                         *off = offset;
1466                         return skb;
1467                 }
1468         }
1469         return NULL;
1470 }
1471
1472 /*
1473  * This routine provides an alternative to tcp_recvmsg() for routines
1474  * that would like to handle copying from skbuffs directly in 'sendfile'
1475  * fashion.
1476  * Note:
1477  *      - It is assumed that the socket was locked by the caller.
1478  *      - The routine does not block.
1479  *      - At present, there is no support for reading OOB data
1480  *        or for 'peeking' the socket using this routine
1481  *        (although both would be easy to implement).
1482  */
1483 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1484                   sk_read_actor_t recv_actor)
1485 {
1486         struct sk_buff *skb;
1487         struct tcp_opt *tp = tcp_sk(sk);
1488         u32 seq = tp->copied_seq;
1489         u32 offset;
1490         int copied = 0;
1491
1492         if (sk->sk_state == TCP_LISTEN)
1493                 return -ENOTCONN;
1494         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1495                 if (offset < skb->len) {
1496                         size_t used, len;
1497
1498                         len = skb->len - offset;
1499                         /* Stop reading if we hit a patch of urgent data */
1500                         if (tp->urg_data) {
1501                                 u32 urg_offset = tp->urg_seq - seq;
1502                                 if (urg_offset < len)
1503                                         len = urg_offset;
1504                                 if (!len)
1505                                         break;
1506                         }
1507                         used = recv_actor(desc, skb, offset, len);
1508                         if (used <= len) {
1509                                 seq += used;
1510                                 copied += used;
1511                                 offset += used;
1512                         }
1513                         if (offset != skb->len)
1514                                 break;
1515                 }
1516                 if (skb->h.th->fin) {
1517                         tcp_eat_skb(sk, skb);
1518                         ++seq;
1519                         break;
1520                 }
1521                 tcp_eat_skb(sk, skb);
1522                 if (!desc->count)
1523                         break;
1524         }
1525         tp->copied_seq = seq;
1526
1527         tcp_rcv_space_adjust(sk);
1528
1529         /* Clean up data we have read: This will do ACK frames. */
1530         if (copied)
1531                 cleanup_rbuf(sk, copied);
1532         return copied;
1533 }
1534
1535 /*
1536  *      This routine copies from a sock struct into the user buffer.
1537  *
1538  *      Technical note: in 2.3 we work on _locked_ socket, so that
1539  *      tricks with *seq access order and skb->users are not required.
1540  *      Probably, code can be easily improved even more.
1541  */
1542
1543 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1544                 size_t len, int nonblock, int flags, int *addr_len)
1545 {
1546         struct tcp_opt *tp = tcp_sk(sk);
1547         int copied = 0;
1548         u32 peek_seq;
1549         u32 *seq;
1550         unsigned long used;
1551         int err;
1552         int target;             /* Read at least this many bytes */
1553         long timeo;
1554         struct task_struct *user_recv = NULL;
1555
1556         lock_sock(sk);
1557
1558         TCP_CHECK_TIMER(sk);
1559
1560         err = -ENOTCONN;
1561         if (sk->sk_state == TCP_LISTEN)
1562                 goto out;
1563
1564         timeo = sock_rcvtimeo(sk, nonblock);
1565
1566         /* Urgent data needs to be handled specially. */
1567         if (flags & MSG_OOB)
1568                 goto recv_urg;
1569
1570         seq = &tp->copied_seq;
1571         if (flags & MSG_PEEK) {
1572                 peek_seq = tp->copied_seq;
1573                 seq = &peek_seq;
1574         }
1575
1576         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1577
1578         do {
1579                 struct sk_buff *skb;
1580                 u32 offset;
1581
1582                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1583                 if (tp->urg_data && tp->urg_seq == *seq) {
1584                         if (copied)
1585                                 break;
1586                         if (signal_pending(current)) {
1587                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1588                                 break;
1589                         }
1590                 }
1591
1592                 /* Next get a buffer. */
1593
1594                 skb = skb_peek(&sk->sk_receive_queue);
1595                 do {
1596                         if (!skb)
1597                                 break;
1598
1599                         /* Now that we have two receive queues this
1600                          * shouldn't happen.
1601                          */
1602                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1603                                 printk(KERN_INFO "recvmsg bug: copied %X "
1604                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1605                                 break;
1606                         }
1607                         offset = *seq - TCP_SKB_CB(skb)->seq;
1608                         if (skb->h.th->syn)
1609                                 offset--;
1610                         if (offset < skb->len)
1611                                 goto found_ok_skb;
1612                         if (skb->h.th->fin)
1613                                 goto found_fin_ok;
1614                         BUG_TRAP(flags & MSG_PEEK);
1615                         skb = skb->next;
1616                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1617
1618                 /* Well, if we have backlog, try to process it now yet. */
1619
1620                 if (copied >= target && !sk->sk_backlog.tail)
1621                         break;
1622
1623                 if (copied) {
1624                         if (sk->sk_err ||
1625                             sk->sk_state == TCP_CLOSE ||
1626                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1627                             !timeo ||
1628                             signal_pending(current) ||
1629                             (flags & MSG_PEEK))
1630                                 break;
1631                 } else {
1632                         if (sock_flag(sk, SOCK_DONE))
1633                                 break;
1634
1635                         if (sk->sk_err) {
1636                                 copied = sock_error(sk);
1637                                 break;
1638                         }
1639
1640                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1641                                 break;
1642
1643                         if (sk->sk_state == TCP_CLOSE) {
1644                                 if (!sock_flag(sk, SOCK_DONE)) {
1645                                         /* This occurs when user tries to read
1646                                          * from never connected socket.
1647                                          */
1648                                         copied = -ENOTCONN;
1649                                         break;
1650                                 }
1651                                 break;
1652                         }
1653
1654                         if (!timeo) {
1655                                 copied = -EAGAIN;
1656                                 break;
1657                         }
1658
1659                         if (signal_pending(current)) {
1660                                 copied = sock_intr_errno(timeo);
1661                                 break;
1662                         }
1663                 }
1664
1665                 cleanup_rbuf(sk, copied);
1666
1667                 if (tp->ucopy.task == user_recv) {
1668                         /* Install new reader */
1669                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1670                                 user_recv = current;
1671                                 tp->ucopy.task = user_recv;
1672                                 tp->ucopy.iov = msg->msg_iov;
1673                         }
1674
1675                         tp->ucopy.len = len;
1676
1677                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1678                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1679
1680                         /* Ugly... If prequeue is not empty, we have to
1681                          * process it before releasing socket, otherwise
1682                          * order will be broken at second iteration.
1683                          * More elegant solution is required!!!
1684                          *
1685                          * Look: we have the following (pseudo)queues:
1686                          *
1687                          * 1. packets in flight
1688                          * 2. backlog
1689                          * 3. prequeue
1690                          * 4. receive_queue
1691                          *
1692                          * Each queue can be processed only if the next ones
1693                          * are empty. At this point we have empty receive_queue.
1694                          * But prequeue _can_ be not empty after 2nd iteration,
1695                          * when we jumped to start of loop because backlog
1696                          * processing added something to receive_queue.
1697                          * We cannot release_sock(), because backlog contains
1698                          * packets arrived _after_ prequeued ones.
1699                          *
1700                          * Shortly, algorithm is clear --- to process all
1701                          * the queues in order. We could make it more directly,
1702                          * requeueing packets from backlog to prequeue, if
1703                          * is not empty. It is more elegant, but eats cycles,
1704                          * unfortunately.
1705                          */
1706                         if (skb_queue_len(&tp->ucopy.prequeue))
1707                                 goto do_prequeue;
1708
1709                         /* __ Set realtime policy in scheduler __ */
1710                 }
1711
1712                 if (copied >= target) {
1713                         /* Do not sleep, just process backlog. */
1714                         release_sock(sk);
1715                         lock_sock(sk);
1716                 } else {
1717                         timeo = tcp_data_wait(sk, timeo);
1718                 }
1719
1720                 if (user_recv) {
1721                         int chunk;
1722
1723                         /* __ Restore normal policy in scheduler __ */
1724
1725                         if ((chunk = len - tp->ucopy.len) != 0) {
1726                                 NET_ADD_STATS_USER(TCPDirectCopyFromBacklog, chunk);
1727                                 len -= chunk;
1728                                 copied += chunk;
1729                         }
1730
1731                         if (tp->rcv_nxt == tp->copied_seq &&
1732                             skb_queue_len(&tp->ucopy.prequeue)) {
1733 do_prequeue:
1734                                 tcp_prequeue_process(sk);
1735
1736                                 if ((chunk = len - tp->ucopy.len) != 0) {
1737                                         NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1738                                         len -= chunk;
1739                                         copied += chunk;
1740                                 }
1741                         }
1742                 }
1743                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1744                         if (net_ratelimit())
1745                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1746                                        current->comm, current->pid);
1747                         peek_seq = tp->copied_seq;
1748                 }
1749                 continue;
1750
1751         found_ok_skb:
1752                 /* Ok so how much can we use? */
1753                 used = skb->len - offset;
1754                 if (len < used)
1755                         used = len;
1756
1757                 /* Do we have urgent data here? */
1758                 if (tp->urg_data) {
1759                         u32 urg_offset = tp->urg_seq - *seq;
1760                         if (urg_offset < used) {
1761                                 if (!urg_offset) {
1762                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1763                                                 ++*seq;
1764                                                 offset++;
1765                                                 used--;
1766                                                 if (!used)
1767                                                         goto skip_copy;
1768                                         }
1769                                 } else
1770                                         used = urg_offset;
1771                         }
1772                 }
1773
1774                 if (!(flags & MSG_TRUNC)) {
1775                         err = skb_copy_datagram_iovec(skb, offset,
1776                                                       msg->msg_iov, used);
1777                         if (err) {
1778                                 /* Exception. Bailout! */
1779                                 if (!copied)
1780                                         copied = -EFAULT;
1781                                 break;
1782                         }
1783                 }
1784
1785                 *seq += used;
1786                 copied += used;
1787                 len -= used;
1788
1789                 tcp_rcv_space_adjust(sk);
1790
1791 skip_copy:
1792                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1793                         tp->urg_data = 0;
1794                         tcp_fast_path_check(sk, tp);
1795                 }
1796                 if (used + offset < skb->len)
1797                         continue;
1798
1799                 if (skb->h.th->fin)
1800                         goto found_fin_ok;
1801                 if (!(flags & MSG_PEEK))
1802                         tcp_eat_skb(sk, skb);
1803                 continue;
1804
1805         found_fin_ok:
1806                 /* Process the FIN. */
1807                 ++*seq;
1808                 if (!(flags & MSG_PEEK))
1809                         tcp_eat_skb(sk, skb);
1810                 break;
1811         } while (len > 0);
1812
1813         if (user_recv) {
1814                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1815                         int chunk;
1816
1817                         tp->ucopy.len = copied > 0 ? len : 0;
1818
1819                         tcp_prequeue_process(sk);
1820
1821                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1822                                 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1823                                 len -= chunk;
1824                                 copied += chunk;
1825                         }
1826                 }
1827
1828                 tp->ucopy.task = NULL;
1829                 tp->ucopy.len = 0;
1830         }
1831
1832         /* According to UNIX98, msg_name/msg_namelen are ignored
1833          * on connected socket. I was just happy when found this 8) --ANK
1834          */
1835
1836         /* Clean up data we have read: This will do ACK frames. */
1837         cleanup_rbuf(sk, copied);
1838
1839         TCP_CHECK_TIMER(sk);
1840         release_sock(sk);
1841         return copied;
1842
1843 out:
1844         TCP_CHECK_TIMER(sk);
1845         release_sock(sk);
1846         return err;
1847
1848 recv_urg:
1849         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1850         goto out;
1851 }
1852
1853 /*
1854  *      State processing on a close. This implements the state shift for
1855  *      sending our FIN frame. Note that we only send a FIN for some
1856  *      states. A shutdown() may have already sent the FIN, or we may be
1857  *      closed.
1858  */
1859
1860 static unsigned char new_state[16] = {
1861   /* current state:        new state:      action:      */
1862   /* (Invalid)          */ TCP_CLOSE,
1863   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1864   /* TCP_SYN_SENT       */ TCP_CLOSE,
1865   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1866   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1867   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1868   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1869   /* TCP_CLOSE          */ TCP_CLOSE,
1870   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1871   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1872   /* TCP_LISTEN         */ TCP_CLOSE,
1873   /* TCP_CLOSING        */ TCP_CLOSING,
1874 };
1875
1876 static int tcp_close_state(struct sock *sk)
1877 {
1878         int next = (int)new_state[sk->sk_state];
1879         int ns = next & TCP_STATE_MASK;
1880
1881         tcp_set_state(sk, ns);
1882
1883         return next & TCP_ACTION_FIN;
1884 }
1885
1886 /*
1887  *      Shutdown the sending side of a connection. Much like close except
1888  *      that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1889  */
1890
1891 void tcp_shutdown(struct sock *sk, int how)
1892 {
1893         /*      We need to grab some memory, and put together a FIN,
1894          *      and then put it into the queue to be sent.
1895          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1896          */
1897         if (!(how & SEND_SHUTDOWN))
1898                 return;
1899
1900         /* If we've already sent a FIN, or it's a closed state, skip this. */
1901         if ((1 << sk->sk_state) &
1902             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1903              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1904                 /* Clear out any half completed packets.  FIN if needed. */
1905                 if (tcp_close_state(sk))
1906                         tcp_send_fin(sk);
1907         }
1908 }
1909
1910
1911 /*
1912  *      Return 1 if we still have things to send in our buffers.
1913  */
1914
1915 static inline int closing(struct sock *sk)
1916 {
1917         return (1 << sk->sk_state) &
1918                (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
1919 }
1920
1921 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1922 {
1923         /* First the read buffer. */
1924         __skb_queue_purge(&sk->sk_receive_queue);
1925
1926         /* Next, the error queue. */
1927         __skb_queue_purge(&sk->sk_error_queue);
1928
1929         /* Next, the write queue. */
1930         BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
1931
1932         /* Account for returned memory. */
1933         tcp_mem_reclaim(sk);
1934
1935         BUG_TRAP(!sk->sk_wmem_queued);
1936         BUG_TRAP(!sk->sk_forward_alloc);
1937
1938         /* It is _impossible_ for the backlog to contain anything
1939          * when we get here.  All user references to this socket
1940          * have gone away, only the net layer knows can touch it.
1941          */
1942 }
1943
1944 /*
1945  * At this point, there should be no process reference to this
1946  * socket, and thus no user references at all.  Therefore we
1947  * can assume the socket waitqueue is inactive and nobody will
1948  * try to jump onto it.
1949  */
1950 void tcp_destroy_sock(struct sock *sk)
1951 {
1952         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1953         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1954
1955         /* It cannot be in hash table! */
1956         BUG_TRAP(sk_unhashed(sk));
1957
1958         /* If it has not 0 inet_sk(sk)->num, it must be bound */
1959         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1960
1961 #ifdef TCP_DEBUG
1962         if (sk->sk_zapped) {
1963                 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1964                 sock_hold(sk);
1965         }
1966         sk->sk_zapped = 1;
1967 #endif
1968
1969         sk->sk_prot->destroy(sk);
1970
1971         tcp_kill_sk_queues(sk);
1972
1973         xfrm_sk_free_policy(sk);
1974
1975 #ifdef INET_REFCNT_DEBUG
1976         if (atomic_read(&sk->sk_refcnt) != 1) {
1977                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1978                        sk, atomic_read(&sk->sk_refcnt));
1979         }
1980 #endif
1981
1982         atomic_dec(&tcp_orphan_count);
1983         sock_put(sk);
1984 }
1985
1986 void tcp_close(struct sock *sk, long timeout)
1987 {
1988         struct sk_buff *skb;
1989         int data_was_unread = 0;
1990
1991         lock_sock(sk);
1992         sk->sk_shutdown = SHUTDOWN_MASK;
1993
1994         if (sk->sk_state == TCP_LISTEN) {
1995                 tcp_set_state(sk, TCP_CLOSE);
1996
1997                 /* Special case. */
1998                 tcp_listen_stop(sk);
1999
2000                 goto adjudge_to_death;
2001         }
2002
2003         /*  We need to flush the recv. buffs.  We do this only on the
2004          *  descriptor close, not protocol-sourced closes, because the
2005          *  reader process may not have drained the data yet!
2006          */
2007         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2008                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
2009                           skb->h.th->fin;
2010                 data_was_unread += len;
2011                 __kfree_skb(skb);
2012         }
2013
2014         tcp_mem_reclaim(sk);
2015
2016         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
2017          * 3.10, we send a RST here because data was lost.  To
2018          * witness the awful effects of the old behavior of always
2019          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
2020          * a bulk GET in an FTP client, suspend the process, wait
2021          * for the client to advertise a zero window, then kill -9
2022          * the FTP client, wheee...  Note: timeout is always zero
2023          * in such a case.
2024          */
2025         if (data_was_unread) {
2026                 /* Unread data was tossed, zap the connection. */
2027                 NET_INC_STATS_USER(TCPAbortOnClose);
2028                 tcp_set_state(sk, TCP_CLOSE);
2029                 tcp_send_active_reset(sk, GFP_KERNEL);
2030         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2031                 /* Check zero linger _after_ checking for unread data. */
2032                 sk->sk_prot->disconnect(sk, 0);
2033                 NET_INC_STATS_USER(TCPAbortOnData);
2034         } else if (tcp_close_state(sk)) {
2035                 /* We FIN if the application ate all the data before
2036                  * zapping the connection.
2037                  */
2038
2039                 /* RED-PEN. Formally speaking, we have broken TCP state
2040                  * machine. State transitions:
2041                  *
2042                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
2043                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
2044                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
2045                  *
2046                  * are legal only when FIN has been sent (i.e. in window),
2047                  * rather than queued out of window. Purists blame.
2048                  *
2049                  * F.e. "RFC state" is ESTABLISHED,
2050                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
2051                  *
2052                  * The visible declinations are that sometimes
2053                  * we enter time-wait state, when it is not required really
2054                  * (harmless), do not send active resets, when they are
2055                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
2056                  * they look as CLOSING or LAST_ACK for Linux)
2057                  * Probably, I missed some more holelets.
2058                  *                                              --ANK
2059                  */
2060                 tcp_send_fin(sk);
2061         }
2062
2063         if (timeout) {
2064                 struct task_struct *tsk = current;
2065                 DEFINE_WAIT(wait);
2066
2067                 do {
2068                         prepare_to_wait(sk->sk_sleep, &wait,
2069                                         TASK_INTERRUPTIBLE);
2070                         if (!closing(sk))
2071                                 break;
2072                         release_sock(sk);
2073                         timeout = schedule_timeout(timeout);
2074                         lock_sock(sk);
2075                 } while (!signal_pending(tsk) && timeout);
2076
2077                 finish_wait(sk->sk_sleep, &wait);
2078         }
2079
2080 adjudge_to_death:
2081         /* It is the last release_sock in its life. It will remove backlog. */
2082         release_sock(sk);
2083
2084
2085         /* Now socket is owned by kernel and we acquire BH lock
2086            to finish close. No need to check for user refs.
2087          */
2088         local_bh_disable();
2089         bh_lock_sock(sk);
2090         BUG_TRAP(!sock_owned_by_user(sk));
2091
2092         sock_hold(sk);
2093         sock_orphan(sk);
2094
2095         /*      This is a (useful) BSD violating of the RFC. There is a
2096          *      problem with TCP as specified in that the other end could
2097          *      keep a socket open forever with no application left this end.
2098          *      We use a 3 minute timeout (about the same as BSD) then kill
2099          *      our end. If they send after that then tough - BUT: long enough
2100          *      that we won't make the old 4*rto = almost no time - whoops
2101          *      reset mistake.
2102          *
2103          *      Nope, it was not mistake. It is really desired behaviour
2104          *      f.e. on http servers, when such sockets are useless, but
2105          *      consume significant resources. Let's do it with special
2106          *      linger2 option.                                 --ANK
2107          */
2108
2109         if (sk->sk_state == TCP_FIN_WAIT2) {
2110                 struct tcp_opt *tp = tcp_sk(sk);
2111                 if (tp->linger2 < 0) {
2112                         tcp_set_state(sk, TCP_CLOSE);
2113                         tcp_send_active_reset(sk, GFP_ATOMIC);
2114                         NET_INC_STATS_BH(TCPAbortOnLinger);
2115                 } else {
2116                         int tmo = tcp_fin_time(tp);
2117
2118                         if (tmo > TCP_TIMEWAIT_LEN) {
2119                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2120                         } else {
2121                                 atomic_inc(&tcp_orphan_count);
2122                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2123                                 goto out;
2124                         }
2125                 }
2126         }
2127         if (sk->sk_state != TCP_CLOSE) {
2128                 tcp_mem_reclaim(sk);
2129                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2130                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
2131                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2132                         if (net_ratelimit())
2133                                 printk(KERN_INFO "TCP: too many of orphaned "
2134                                        "sockets\n");
2135                         tcp_set_state(sk, TCP_CLOSE);
2136                         tcp_send_active_reset(sk, GFP_ATOMIC);
2137                         NET_INC_STATS_BH(TCPAbortOnMemory);
2138                 }
2139         }
2140         atomic_inc(&tcp_orphan_count);
2141
2142         if (sk->sk_state == TCP_CLOSE)
2143                 tcp_destroy_sock(sk);
2144         /* Otherwise, socket is reprieved until protocol close. */
2145
2146 out:
2147         bh_unlock_sock(sk);
2148         local_bh_enable();
2149         sock_put(sk);
2150 }
2151
2152 /* These states need RST on ABORT according to RFC793 */
2153
2154 static inline int tcp_need_reset(int state)
2155 {
2156         return (1 << state) &
2157                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2158                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2159 }
2160
2161 int tcp_disconnect(struct sock *sk, int flags)
2162 {
2163         struct inet_opt *inet = inet_sk(sk);
2164         struct tcp_opt *tp = tcp_sk(sk);
2165         int err = 0;
2166         int old_state = sk->sk_state;
2167
2168         if (old_state != TCP_CLOSE)
2169                 tcp_set_state(sk, TCP_CLOSE);
2170
2171         /* ABORT function of RFC793 */
2172         if (old_state == TCP_LISTEN) {
2173                 tcp_listen_stop(sk);
2174         } else if (tcp_need_reset(old_state) ||
2175                    (tp->snd_nxt != tp->write_seq &&
2176                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2177                 /* The last check adjusts for discrepance of Linux wrt. RFC
2178                  * states
2179                  */
2180                 tcp_send_active_reset(sk, gfp_any());
2181                 sk->sk_err = ECONNRESET;
2182         } else if (old_state == TCP_SYN_SENT)
2183                 sk->sk_err = ECONNRESET;
2184
2185         tcp_clear_xmit_timers(sk);
2186         __skb_queue_purge(&sk->sk_receive_queue);
2187         tcp_writequeue_purge(sk);
2188         __skb_queue_purge(&tp->out_of_order_queue);
2189
2190         inet->dport = 0;
2191
2192         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2193                 inet_reset_saddr(sk);
2194
2195         sk->sk_shutdown = 0;
2196         sock_reset_flag(sk, SOCK_DONE);
2197         tp->srtt = 0;
2198         if ((tp->write_seq += tp->max_window + 2) == 0)
2199                 tp->write_seq = 1;
2200         tp->backoff = 0;
2201         tp->snd_cwnd = 2;
2202         tp->probes_out = 0;
2203         tp->packets_out = 0;
2204         tp->snd_ssthresh = 0x7fffffff;
2205         tp->snd_cwnd_cnt = 0;
2206         tcp_set_ca_state(tp, TCP_CA_Open);
2207         tcp_clear_retrans(tp);
2208         tcp_delack_init(tp);
2209         tp->send_head = NULL;
2210         tp->saw_tstamp = 0;
2211         tcp_sack_reset(tp);
2212         __sk_dst_reset(sk);
2213
2214         BUG_TRAP(!inet->num || tp->bind_hash);
2215
2216         sk->sk_error_report(sk);
2217         return err;
2218 }
2219
2220 /*
2221  *      Wait for an incoming connection, avoid race
2222  *      conditions. This must be called with the socket locked.
2223  */
2224 static int wait_for_connect(struct sock *sk, long timeo)
2225 {
2226         struct tcp_opt *tp = tcp_sk(sk);
2227         DEFINE_WAIT(wait);
2228         int err;
2229
2230         /*
2231          * True wake-one mechanism for incoming connections: only
2232          * one process gets woken up, not the 'whole herd'.
2233          * Since we do not 'race & poll' for established sockets
2234          * anymore, the common case will execute the loop only once.
2235          *
2236          * Subtle issue: "add_wait_queue_exclusive()" will be added
2237          * after any current non-exclusive waiters, and we know that
2238          * it will always _stay_ after any new non-exclusive waiters
2239          * because all non-exclusive waiters are added at the
2240          * beginning of the wait-queue. As such, it's ok to "drop"
2241          * our exclusiveness temporarily when we get woken up without
2242          * having to remove and re-insert us on the wait queue.
2243          */
2244         for (;;) {
2245                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
2246                                           TASK_INTERRUPTIBLE);
2247                 release_sock(sk);
2248                 if (!tp->accept_queue)
2249                         timeo = schedule_timeout(timeo);
2250                 lock_sock(sk);
2251                 err = 0;
2252                 if (tp->accept_queue)
2253                         break;
2254                 err = -EINVAL;
2255                 if (sk->sk_state != TCP_LISTEN)
2256                         break;
2257                 err = sock_intr_errno(timeo);
2258                 if (signal_pending(current))
2259                         break;
2260                 err = -EAGAIN;
2261                 if (!timeo)
2262                         break;
2263         }
2264         finish_wait(sk->sk_sleep, &wait);
2265         return err;
2266 }
2267
2268 /*
2269  *      This will accept the next outstanding connection.
2270  */
2271
2272 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2273 {
2274         struct tcp_opt *tp = tcp_sk(sk);
2275         struct open_request *req;
2276         struct sock *newsk;
2277         int error;
2278 #ifdef CONFIG_ACCEPT_QUEUES
2279         int prev_class = 0;
2280         int first;
2281 #endif
2282
2283         lock_sock(sk);
2284
2285         /* We need to make sure that this socket is listening,
2286          * and that it has something pending.
2287          */
2288         error = -EINVAL;
2289         if (sk->sk_state != TCP_LISTEN)
2290                 goto out;
2291
2292         /* Find already established connection */
2293         if (!tp->accept_queue) {
2294                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2295                 /* If this is a non blocking socket don't sleep */
2296                 error = -EAGAIN;
2297                 if (!timeo)
2298                         goto out;
2299
2300                 error = wait_for_connect(sk, timeo);
2301                 if (error)
2302                         goto out;
2303         }
2304
2305 #ifndef CONFIG_ACCEPT_QUEUES
2306         req = tp->accept_queue;
2307         if ((tp->accept_queue = req->dl_next) == NULL)
2308                 tp->accept_queue_tail = NULL;
2309
2310         tcp_acceptq_removed(sk);
2311 #else
2312         first = tp->class_index;
2313         /* We should always have  request queued here. The accept_queue
2314          * is already checked for NULL above.
2315          */
2316         while(!tp->acceptq[first].aq_head) {
2317                 tp->acceptq[first].aq_cnt = 0;
2318                 first = (first+1) & ~NUM_ACCEPT_QUEUES;
2319         }
2320         req = tp->acceptq[first].aq_head;
2321         tp->acceptq[first].aq_qcount--;
2322         tp->acceptq[first].aq_count++;
2323         tp->acceptq[first].aq_wait_time+=(jiffies - req->acceptq_time_stamp);
2324
2325         for (prev_class= first-1 ; prev_class >=0; prev_class--)
2326                 if (tp->acceptq[prev_class].aq_tail)
2327                         break;
2328         if (prev_class>=0)
2329                 tp->acceptq[prev_class].aq_tail->dl_next = req->dl_next;
2330         else
2331                 tp->accept_queue = req->dl_next;
2332
2333         if (req == tp->acceptq[first].aq_tail)
2334                 tp->acceptq[first].aq_head = tp->acceptq[first].aq_tail = NULL;
2335         else
2336                 tp->acceptq[first].aq_head = req->dl_next;
2337
2338         if((++(tp->acceptq[first].aq_cnt)) >= tp->acceptq[first].aq_ratio){
2339                 tp->acceptq[first].aq_cnt = 0;
2340                 tp->class_index = ++first & ~NUM_ACCEPT_QUEUES;
2341         }
2342         tcp_acceptq_removed(sk, req->acceptq_class);
2343 #endif
2344         newsk = req->sk;
2345         tcp_openreq_fastfree(req);
2346         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2347         release_sock(sk);
2348         return newsk;
2349
2350 out:
2351         release_sock(sk);
2352         *err = error;
2353         return NULL;
2354 }
2355
2356 /*
2357  *      Socket option code for TCP.
2358  */
2359 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2360                    int optlen)
2361 {
2362         struct tcp_opt *tp = tcp_sk(sk);
2363         int val;
2364         int err = 0;
2365
2366         if (level != SOL_TCP)
2367                 return tp->af_specific->setsockopt(sk, level, optname,
2368                                                    optval, optlen);
2369
2370         if (optlen < sizeof(int))
2371                 return -EINVAL;
2372
2373         if (get_user(val, (int __user *)optval))
2374                 return -EFAULT;
2375
2376         lock_sock(sk);
2377
2378         switch (optname) {
2379         case TCP_MAXSEG:
2380                 /* Values greater than interface MTU won't take effect. However
2381                  * at the point when this call is done we typically don't yet
2382                  * know which interface is going to be used */
2383                 if (val < 8 || val > MAX_TCP_WINDOW) {
2384                         err = -EINVAL;
2385                         break;
2386                 }
2387                 tp->user_mss = val;
2388                 break;
2389
2390         case TCP_NODELAY:
2391                 if (val) {
2392                         /* TCP_NODELAY is weaker than TCP_CORK, so that
2393                          * this option on corked socket is remembered, but
2394                          * it is not activated until cork is cleared.
2395                          *
2396                          * However, when TCP_NODELAY is set we make
2397                          * an explicit push, which overrides even TCP_CORK
2398                          * for currently queued segments.
2399                          */
2400                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2401                         tcp_push_pending_frames(sk, tp);
2402                 } else {
2403                         tp->nonagle &= ~TCP_NAGLE_OFF;
2404                 }
2405                 break;
2406
2407         case TCP_CORK:
2408                 /* When set indicates to always queue non-full frames.
2409                  * Later the user clears this option and we transmit
2410                  * any pending partial frames in the queue.  This is
2411                  * meant to be used alongside sendfile() to get properly
2412                  * filled frames when the user (for example) must write
2413                  * out headers with a write() call first and then use
2414                  * sendfile to send out the data parts.
2415                  *
2416                  * TCP_CORK can be set together with TCP_NODELAY and it is
2417                  * stronger than TCP_NODELAY.
2418                  */
2419                 if (val) {
2420                         tp->nonagle |= TCP_NAGLE_CORK;
2421                 } else {
2422                         tp->nonagle &= ~TCP_NAGLE_CORK;
2423                         if (tp->nonagle&TCP_NAGLE_OFF)
2424                                 tp->nonagle |= TCP_NAGLE_PUSH;
2425                         tcp_push_pending_frames(sk, tp);
2426                 }
2427                 break;
2428
2429         case TCP_KEEPIDLE:
2430                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2431                         err = -EINVAL;
2432                 else {
2433                         tp->keepalive_time = val * HZ;
2434                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2435                             !((1 << sk->sk_state) &
2436                               (TCPF_CLOSE | TCPF_LISTEN))) {
2437                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2438                                 if (tp->keepalive_time > elapsed)
2439                                         elapsed = tp->keepalive_time - elapsed;
2440                                 else
2441                                         elapsed = 0;
2442                                 tcp_reset_keepalive_timer(sk, elapsed);
2443                         }
2444                 }
2445                 break;
2446         case TCP_KEEPINTVL:
2447                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2448                         err = -EINVAL;
2449                 else
2450                         tp->keepalive_intvl = val * HZ;
2451                 break;
2452         case TCP_KEEPCNT:
2453                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2454                         err = -EINVAL;
2455                 else
2456                         tp->keepalive_probes = val;
2457                 break;
2458         case TCP_SYNCNT:
2459                 if (val < 1 || val > MAX_TCP_SYNCNT)
2460                         err = -EINVAL;
2461                 else
2462                         tp->syn_retries = val;
2463                 break;
2464
2465         case TCP_LINGER2:
2466                 if (val < 0)
2467                         tp->linger2 = -1;
2468                 else if (val > sysctl_tcp_fin_timeout / HZ)
2469                         tp->linger2 = 0;
2470                 else
2471                         tp->linger2 = val * HZ;
2472                 break;
2473
2474         case TCP_DEFER_ACCEPT:
2475                 tp->defer_accept = 0;
2476                 if (val > 0) {
2477                         /* Translate value in seconds to number of
2478                          * retransmits */
2479                         while (tp->defer_accept < 32 &&
2480                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2481                                        tp->defer_accept))
2482                                 tp->defer_accept++;
2483                         tp->defer_accept++;
2484                 }
2485                 break;
2486
2487         case TCP_WINDOW_CLAMP:
2488                 if (!val) {
2489                         if (sk->sk_state != TCP_CLOSE) {
2490                                 err = -EINVAL;
2491                                 break;
2492                         }
2493                         tp->window_clamp = 0;
2494                 } else
2495                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2496                                                 SOCK_MIN_RCVBUF / 2 : val;
2497                 break;
2498
2499         case TCP_QUICKACK:
2500                 if (!val) {
2501                         tp->ack.pingpong = 1;
2502                 } else {
2503                         tp->ack.pingpong = 0;
2504                         if ((1 << sk->sk_state) &
2505                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2506                             tcp_ack_scheduled(tp)) {
2507                                 tp->ack.pending |= TCP_ACK_PUSHED;
2508                                 cleanup_rbuf(sk, 1);
2509                                 if (!(val & 1))
2510                                         tp->ack.pingpong = 1;
2511                         }
2512                 }
2513                 break;
2514
2515 #ifdef CONFIG_ACCEPT_QUEUES
2516         case TCP_ACCEPTQ_SHARE:
2517                 {
2518                         char share_wt[NUM_ACCEPT_QUEUES];
2519                         int i,j;
2520
2521                         if (sk->sk_state != TCP_LISTEN)
2522                                 return -EOPNOTSUPP;
2523
2524                         if (copy_from_user(share_wt,optval, optlen)) {
2525                                 err = -EFAULT;
2526                                 break;
2527                         }
2528                         j = 0;
2529                         for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
2530                                 if (share_wt[i]) {
2531                                         if (!j)
2532                                                 j = share_wt[i];
2533                                         else if (share_wt[i] < j) {
2534                                                 j = share_wt[i];
2535                                         }
2536                                         tp->acceptq[i].aq_valid = 1;
2537                                 }
2538                                 else
2539                                         tp->acceptq[i].aq_valid = 0;
2540
2541                         }
2542                         if (j == 0) {
2543                                 /* Class 0 is always valid. If nothing is
2544                                  * specified set class 0 as 1.
2545                                  */
2546                                 share_wt[0] = 1;
2547                                 tp->acceptq[0].aq_valid = 1;
2548                                 j = 1;
2549                         }
2550                         for (i=0; i < NUM_ACCEPT_QUEUES; i++)  {
2551                                 tp->acceptq[i].aq_ratio = share_wt[i]/j;
2552                                 tp->acceptq[i].aq_cnt = 0;
2553                         }
2554                 }
2555                 break;
2556 #endif
2557
2558         default:
2559                 err = -ENOPROTOOPT;
2560                 break;
2561         };
2562         release_sock(sk);
2563         return err;
2564 }
2565
2566 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2567                    int __user *optlen)
2568 {
2569         struct tcp_opt *tp = tcp_sk(sk);
2570         int val, len;
2571
2572         if (level != SOL_TCP)
2573                 return tp->af_specific->getsockopt(sk, level, optname,
2574                                                    optval, optlen);
2575
2576         if (get_user(len, optlen))
2577                 return -EFAULT;
2578
2579         len = min_t(unsigned int, len, sizeof(int));
2580
2581         if (len < 0)
2582                 return -EINVAL;
2583
2584         switch (optname) {
2585         case TCP_MAXSEG:
2586                 val = tp->mss_cache_std;
2587                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2588                         val = tp->user_mss;
2589                 break;
2590         case TCP_NODELAY:
2591                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2592                 break;
2593         case TCP_CORK:
2594                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2595                 break;
2596         case TCP_KEEPIDLE:
2597                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2598                 break;
2599         case TCP_KEEPINTVL:
2600                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2601                 break;
2602         case TCP_KEEPCNT:
2603                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2604                 break;
2605         case TCP_SYNCNT:
2606                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2607                 break;
2608         case TCP_LINGER2:
2609                 val = tp->linger2;
2610                 if (val >= 0)
2611                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2612                 break;
2613         case TCP_DEFER_ACCEPT:
2614                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2615                                                (tp->defer_accept - 1));
2616                 break;
2617         case TCP_WINDOW_CLAMP:
2618                 val = tp->window_clamp;
2619                 break;
2620         case TCP_INFO: {
2621                 struct tcp_info info;
2622                 u32 now = tcp_time_stamp;
2623
2624                 if (get_user(len, optlen))
2625                         return -EFAULT;
2626                 info.tcpi_state = sk->sk_state;
2627                 info.tcpi_ca_state = tp->ca_state;
2628                 info.tcpi_retransmits = tp->retransmits;
2629                 info.tcpi_probes = tp->probes_out;
2630                 info.tcpi_backoff = tp->backoff;
2631                 info.tcpi_options = 0;
2632                 if (tp->tstamp_ok)
2633                         info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2634                 if (tp->sack_ok)
2635                         info.tcpi_options |= TCPI_OPT_SACK;
2636                 if (tp->wscale_ok) {
2637                         info.tcpi_options |= TCPI_OPT_WSCALE;
2638                         info.tcpi_snd_wscale = tp->snd_wscale;
2639                         info.tcpi_rcv_wscale = tp->rcv_wscale;
2640                 } else {
2641                         info.tcpi_snd_wscale = 0;
2642                         info.tcpi_rcv_wscale = 0;
2643                 }
2644                 if (tp->ecn_flags & TCP_ECN_OK)
2645                         info.tcpi_options |= TCPI_OPT_ECN;
2646
2647                 info.tcpi_rto = (1000000 * tp->rto) / HZ;
2648                 info.tcpi_ato = (1000000 * tp->ack.ato) / HZ;
2649                 info.tcpi_snd_mss = tp->mss_cache_std;
2650                 info.tcpi_rcv_mss = tp->ack.rcv_mss;
2651
2652                 info.tcpi_unacked = tp->packets_out;
2653                 info.tcpi_sacked = tp->sacked_out;
2654                 info.tcpi_lost = tp->lost_out;
2655                 info.tcpi_retrans = tp->retrans_out;
2656                 info.tcpi_fackets = tp->fackets_out;
2657
2658                 info.tcpi_last_data_sent = ((now - tp->lsndtime) * 1000) / HZ;
2659                 info.tcpi_last_ack_sent = 0;
2660                 info.tcpi_last_data_recv = ((now -
2661                                              tp->ack.lrcvtime) * 1000) / HZ;
2662                 info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp) * 1000) / HZ;
2663
2664                 info.tcpi_pmtu = tp->pmtu_cookie;
2665                 info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2666                 info.tcpi_rtt = ((1000000 * tp->srtt) / HZ) >> 3;
2667                 info.tcpi_rttvar = ((1000000 * tp->mdev) / HZ) >> 2;
2668                 info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2669                 info.tcpi_snd_cwnd = tp->snd_cwnd;
2670                 info.tcpi_advmss = tp->advmss;
2671                 info.tcpi_reordering = tp->reordering;
2672
2673                 len = min_t(unsigned int, len, sizeof(info));
2674                 if (put_user(len, optlen))
2675                         return -EFAULT;
2676                 if (copy_to_user(optval, &info, len))
2677                         return -EFAULT;
2678                 return 0;
2679         }
2680         case TCP_QUICKACK:
2681                 val = !tp->ack.pingpong;
2682                 break;
2683
2684 #ifdef CONFIG_ACCEPT_QUEUES
2685         case TCP_ACCEPTQ_SHARE: {
2686                 struct tcp_acceptq_info tinfo[NUM_ACCEPT_QUEUES];
2687                 int i;
2688
2689                 if (sk->sk_state != TCP_LISTEN)
2690                         return -EOPNOTSUPP;
2691
2692                 if (get_user(len, optlen))
2693                         return -EFAULT;
2694
2695                 memset(tinfo, 0, sizeof(tinfo));
2696
2697                 for(i=0; i < NUM_ACCEPT_QUEUES; i++) {
2698                         tinfo[i].acceptq_wait_time =
2699                                 tp->acceptq[i].aq_wait_time/(HZ/USER_HZ);
2700                         tinfo[i].acceptq_qcount = tp->acceptq[i].aq_qcount;
2701                         tinfo[i].acceptq_count = tp->acceptq[i].aq_count;
2702                         if (tp->acceptq[i].aq_valid)
2703                                 tinfo[i].acceptq_shares=tp->acceptq[i].aq_ratio;
2704                         else
2705                                 tinfo[i].acceptq_shares = 0;
2706                 }
2707
2708                 len = min_t(unsigned int, len, sizeof(tinfo));
2709                 if (put_user(len, optlen))
2710                         return -EFAULT;
2711
2712                 if (copy_to_user(optval, (char *)tinfo, len))
2713                         return -EFAULT;
2714
2715                 return 0;
2716         }
2717 #endif
2718         default:
2719                 return -ENOPROTOOPT;
2720         };
2721
2722         if (put_user(len, optlen))
2723                 return -EFAULT;
2724         if (copy_to_user(optval, &val, len))
2725                 return -EFAULT;
2726         return 0;
2727 }
2728
2729
2730 extern void __skb_cb_too_small_for_tcp(int, int);
2731 extern void tcpdiag_init(void);
2732
2733 static __initdata unsigned long thash_entries;
2734 static int __init set_thash_entries(char *str)
2735 {
2736         if (!str)
2737                 return 0;
2738         thash_entries = simple_strtoul(str, &str, 0);
2739         return 1;
2740 }
2741 __setup("thash_entries=", set_thash_entries);
2742
2743 void __init tcp_init(void)
2744 {
2745         struct sk_buff *skb = NULL;
2746         unsigned long goal;
2747         int order, i;
2748
2749         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2750                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2751                                            sizeof(skb->cb));
2752
2753         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2754                                                    sizeof(struct open_request),
2755                                                0, SLAB_HWCACHE_ALIGN,
2756                                                NULL, NULL);
2757         if (!tcp_openreq_cachep)
2758                 panic("tcp_init: Cannot alloc open_request cache.");
2759
2760         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2761                                               sizeof(struct tcp_bind_bucket),
2762                                               0, SLAB_HWCACHE_ALIGN,
2763                                               NULL, NULL);
2764         if (!tcp_bucket_cachep)
2765                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2766
2767         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2768                                                 sizeof(struct tcp_tw_bucket),
2769                                                 0, SLAB_HWCACHE_ALIGN,
2770                                                 NULL, NULL);
2771         if (!tcp_timewait_cachep)
2772                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2773
2774         /* Size and allocate the main established and bind bucket
2775          * hash tables.
2776          *
2777          * The methodology is similar to that of the buffer cache.
2778          */
2779         if (num_physpages >= (128 * 1024))
2780                 goal = num_physpages >> (21 - PAGE_SHIFT);
2781         else
2782                 goal = num_physpages >> (23 - PAGE_SHIFT);
2783
2784         if (thash_entries)
2785                 goal = (thash_entries * sizeof(struct tcp_ehash_bucket)) >> PAGE_SHIFT;
2786         for (order = 0; (1UL << order) < goal; order++)
2787                 ;
2788         do {
2789                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2790                         sizeof(struct tcp_ehash_bucket);
2791                 tcp_ehash_size >>= 1;
2792                 while (tcp_ehash_size & (tcp_ehash_size - 1))
2793                         tcp_ehash_size--;
2794                 tcp_ehash = (struct tcp_ehash_bucket *)
2795                         __get_free_pages(GFP_ATOMIC, order);
2796         } while (!tcp_ehash && --order > 0);
2797
2798         if (!tcp_ehash)
2799                 panic("Failed to allocate TCP established hash table\n");
2800         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2801                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2802                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2803         }
2804
2805         do {
2806                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2807                         sizeof(struct tcp_bind_hashbucket);
2808                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2809                         continue;
2810                 tcp_bhash = (struct tcp_bind_hashbucket *)
2811                         __get_free_pages(GFP_ATOMIC, order);
2812         } while (!tcp_bhash && --order >= 0);
2813
2814         if (!tcp_bhash)
2815                 panic("Failed to allocate TCP bind hash table\n");
2816         for (i = 0; i < tcp_bhash_size; i++) {
2817                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2818                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2819         }
2820
2821         /* Try to be a bit smarter and adjust defaults depending
2822          * on available memory.
2823          */
2824         if (order > 4) {
2825                 sysctl_local_port_range[0] = 32768;
2826                 sysctl_local_port_range[1] = 61000;
2827                 sysctl_tcp_max_tw_buckets = 180000;
2828                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2829                 sysctl_max_syn_backlog = 1024;
2830         } else if (order < 3) {
2831                 sysctl_local_port_range[0] = 1024 * (3 - order);
2832                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2833                 sysctl_tcp_max_orphans >>= (3 - order);
2834                 sysctl_max_syn_backlog = 128;
2835         }
2836         tcp_port_rover = sysctl_local_port_range[0] - 1;
2837
2838         sysctl_tcp_mem[0] =  768 << order;
2839         sysctl_tcp_mem[1] = 1024 << order;
2840         sysctl_tcp_mem[2] = 1536 << order;
2841
2842         if (order < 3) {
2843                 sysctl_tcp_wmem[2] = 64 * 1024;
2844                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2845                 sysctl_tcp_rmem[1] = 43689;
2846                 sysctl_tcp_rmem[2] = 2 * 43689;
2847         }
2848
2849         printk(KERN_INFO "TCP: Hash tables configured "
2850                "(established %d bind %d)\n",
2851                tcp_ehash_size << 1, tcp_bhash_size);
2852
2853         tcpdiag_init();
2854 }
2855
2856 EXPORT_SYMBOL(__tcp_mem_reclaim);
2857 EXPORT_SYMBOL(sysctl_tcp_rmem);
2858 EXPORT_SYMBOL(sysctl_tcp_wmem);
2859 EXPORT_SYMBOL(tcp_accept);
2860 EXPORT_SYMBOL(tcp_close);
2861 EXPORT_SYMBOL(tcp_close_state);
2862 EXPORT_SYMBOL(tcp_destroy_sock);
2863 EXPORT_SYMBOL(tcp_disconnect);
2864 EXPORT_SYMBOL(tcp_getsockopt);
2865 EXPORT_SYMBOL(tcp_ioctl);
2866 EXPORT_SYMBOL(tcp_openreq_cachep);
2867 EXPORT_SYMBOL(tcp_poll);
2868 EXPORT_SYMBOL(tcp_read_sock);
2869 EXPORT_SYMBOL(tcp_recvmsg);
2870 EXPORT_SYMBOL(tcp_sendmsg);
2871 EXPORT_SYMBOL(tcp_sendpage);
2872 EXPORT_SYMBOL(tcp_setsockopt);
2873 EXPORT_SYMBOL(tcp_shutdown);
2874 EXPORT_SYMBOL(tcp_sockets_allocated);
2875 EXPORT_SYMBOL(tcp_statistics);
2876 EXPORT_SYMBOL(tcp_timewait_cachep);
2877 EXPORT_SYMBOL(tcp_write_space);
2878 EXPORT_SYMBOL_GPL(cleanup_rbuf);