net/ipv4/tcp.c (linux-2.6.git, ckrm-E13)
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
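
/*
 * A quick illustration of the state-test idiom used throughout this file:
 * rather than comparing sk->sk_state against several values in turn, the
 * code masks (1 << sk->sk_state) against a set of TCPF_* bits.  The sketch
 * below is a stand-alone user-space rendering of that idiom; the enum
 * values merely mirror the kernel's TCP_* numbering and are assumptions
 * here, not the kernel's own definitions.
 */
#include <stdio.h>

enum { ST_ESTABLISHED = 1, ST_SYN_SENT, ST_SYN_RECV, ST_CLOSE_WAIT = 8 };
#define F_ESTABLISHED	(1 << ST_ESTABLISHED)
#define F_CLOSE_WAIT	(1 << ST_CLOSE_WAIT)

static int can_send_data(int state)
{
	/* One AND against a pre-built mask replaces a chain of comparisons. */
	return ((1 << state) & (F_ESTABLISHED | F_CLOSE_WAIT)) != 0;
}

int main(void)
{
	printf("ESTABLISHED: %d, SYN_SENT: %d\n",
	       can_send_data(ST_ESTABLISHED), can_send_data(ST_SYN_SENT));
	return 0;
}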
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/ckrm.h>
260
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265
266
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274 kmem_cache_t *tcp_openreq_cachep;
275 kmem_cache_t *tcp_bucket_cachep;
276 kmem_cache_t *tcp_timewait_cachep;
277
278 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
279
280 int sysctl_tcp_mem[3];
281 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
282 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
283
284 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
285 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
286
287 /* Pressure flag: try to collapse.
288  * Technical note: it is used by multiple contexts non-atomically.
289  * All of tcp_mem_schedule() is of this nature: accounting
290  * is strict, actions are advisory and have some latency. */
291 int tcp_memory_pressure;
292
293 #define TCP_PAGES(amt) (((amt) + TCP_MEM_QUANTUM - 1) / TCP_MEM_QUANTUM)
294
295 int tcp_mem_schedule(struct sock *sk, int size, int kind)
296 {
297         int amt = TCP_PAGES(size);
298
299         sk->sk_forward_alloc += amt * TCP_MEM_QUANTUM;
300         atomic_add(amt, &tcp_memory_allocated);
301
302         /* Under limit. */
303         if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
304                 if (tcp_memory_pressure)
305                         tcp_memory_pressure = 0;
306                 return 1;
307         }
308
309         /* Over hard limit. */
310         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
311                 tcp_enter_memory_pressure();
312                 goto suppress_allocation;
313         }
314
315         /* Under pressure. */
316         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
317                 tcp_enter_memory_pressure();
318
319         if (kind) {
320                 if (atomic_read(&sk->sk_rmem_alloc) < sysctl_tcp_rmem[0])
321                         return 1;
322         } else if (sk->sk_wmem_queued < sysctl_tcp_wmem[0])
323                 return 1;
324
325         if (!tcp_memory_pressure ||
326             sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) *
327                                 TCP_PAGES(sk->sk_wmem_queued +
328                                           atomic_read(&sk->sk_rmem_alloc) +
329                                           sk->sk_forward_alloc))
330                 return 1;
331
332 suppress_allocation:
333
334         if (!kind) {
335                 tcp_moderate_sndbuf(sk);
336
337                 /* Fail only if socket is _under_ its sndbuf.
338                  * In this case we cannot block, so we have to fail.
339                  */
340                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
341                         return 1;
342         }
343
344         /* Alas. Undo changes. */
345         sk->sk_forward_alloc -= amt * TCP_MEM_QUANTUM;
346         atomic_sub(amt, &tcp_memory_allocated);
347         return 0;
348 }
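
/*
 * The three sysctl_tcp_mem[] entries above act as low / pressure / hard
 * thresholds, counted in TCP_MEM_QUANTUM units, and TCP_PAGES() rounds a
 * byte request up to whole quanta before charging sk_forward_alloc.  A
 * minimal user-space sketch of that rounding, assuming the quantum equals
 * a 4096-byte page (true on many architectures, but an assumption here):
 */
#include <stdio.h>

#define QUANTUM		4096			/* assumed TCP_MEM_QUANTUM */
#define PAGES(amt)	(((amt) + QUANTUM - 1) / QUANTUM)

int main(void)
{
	/* Even a 1-byte request charges one full quantum. */
	printf("1 byte     -> %d quantum(s)\n", PAGES(1));
	printf("1500 bytes -> %d quantum(s)\n", PAGES(1500));
	printf("8193 bytes -> %d quantum(s)\n", PAGES(8193));
	return 0;
}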
349
350 void __tcp_mem_reclaim(struct sock *sk)
351 {
352         if (sk->sk_forward_alloc >= TCP_MEM_QUANTUM) {
353                 atomic_sub(sk->sk_forward_alloc / TCP_MEM_QUANTUM,
354                            &tcp_memory_allocated);
355                 sk->sk_forward_alloc &= TCP_MEM_QUANTUM - 1;
356                 if (tcp_memory_pressure &&
357                     atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
358                         tcp_memory_pressure = 0;
359         }
360 }
361
362 void tcp_rfree(struct sk_buff *skb)
363 {
364         struct sock *sk = skb->sk;
365
366         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
367         sk->sk_forward_alloc += skb->truesize;
368 }
369
370 /*
371  * LISTEN is a special case for poll..
372  */
373 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
374                                                poll_table *wait)
375 {
376         return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
377 }
378
379 /*
380  *      Wait for a TCP event.
381  *
382  *      Note that we don't need to lock the socket, as the upper poll layers
383  *      take care of normal races (between the test and the event) and we don't
384  *      go look at any of the socket buffers directly.
385  */
386 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
387 {
388         unsigned int mask;
389         struct sock *sk = sock->sk;
390         struct tcp_opt *tp = tcp_sk(sk);
391
392         poll_wait(file, sk->sk_sleep, wait);
393         if (sk->sk_state == TCP_LISTEN)
394                 return tcp_listen_poll(sk, wait);
395
396         /* Socket is not locked. We are protected from async events
397            by poll logic, and correct handling of state changes
398            made by other threads is impossible in any case.
399          */
400
401         mask = 0;
402         if (sk->sk_err)
403                 mask = POLLERR;
404
405         /*
406          * POLLHUP is certainly not done right. But poll() doesn't
407          * have a notion of HUP in just one direction, and for a
408          * socket the read side is more interesting.
409          *
410          * Some poll() documentation says that POLLHUP is incompatible
411          * with the POLLOUT/POLLWR flags, so somebody should check this
412          * all. But careful, it tends to be safer to return too many
413          * bits than too few, and you can easily break real applications
414          * if you don't tell them that something has hung up!
415          *
416          * Check-me.
417          *
418          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
419          * our fs/select.c). It means that after we received EOF,
420          * poll always returns immediately, making poll() for write() impossible
421          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
422          * if and only if shutdown has been made in both directions.
423          * Actually, it is interesting to look at how Solaris and DUX
424          * solve this dilemma. I would prefer that POLLHUP were maskable;
425          * then we could set it on SND_SHUTDOWN. BTW the examples given
426          * in Stevens' books assume exactly this behaviour, which explains
427          * why POLLHUP is incompatible with POLLOUT.    --ANK
428          *
429          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
430          * blocking on fresh not-connected or disconnected socket. --ANK
431          */
432         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
433                 mask |= POLLHUP;
434         if (sk->sk_shutdown & RCV_SHUTDOWN)
435                 mask |= POLLIN | POLLRDNORM;
436
437         /* Connected? */
438         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
439                 /* Potential race condition. If the read of tp below
440                  * escapes above the read of sk->sk_state, we can be
441                  * illegally awakened in SYN_* states. */
442                 if ((tp->rcv_nxt != tp->copied_seq) &&
443                     (tp->urg_seq != tp->copied_seq ||
444                      tp->rcv_nxt != tp->copied_seq + 1 ||
445                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
446                         mask |= POLLIN | POLLRDNORM;
447
448                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
449                         if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
450                                 mask |= POLLOUT | POLLWRNORM;
451                         } else {  /* send SIGIO later */
452                                 set_bit(SOCK_ASYNC_NOSPACE,
453                                         &sk->sk_socket->flags);
454                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
455
456                                 /* Race breaker. If space is freed after
457                                  * wspace test but before the flags are set,
458                                  * IO signal will be lost.
459                                  */
460                                 if (tcp_wspace(sk) >= tcp_min_write_space(sk))
461                                         mask |= POLLOUT | POLLWRNORM;
462                         }
463                 }
464
465                 if (tp->urg_data & TCP_URG_VALID)
466                         mask |= POLLPRI;
467         }
468         return mask;
469 }
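
/*
 * tcp_poll() is what a user-space poll()/select() on a TCP socket ends up
 * calling: readable data (or a peer shutdown) surfaces as POLLIN|POLLRDNORM,
 * free write space as POLLOUT|POLLWRNORM, and valid urgent data as POLLPRI.
 * A minimal user-space sketch of the calling side; `fd' is assumed to be a
 * connected TCP socket set up elsewhere.
 */
#include <poll.h>
#include <stdio.h>

static void wait_for_tcp_events(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };

	if (poll(&pfd, 1, 5000) > 0) {
		if (pfd.revents & POLLPRI)
			printf("urgent data pending\n");
		if (pfd.revents & POLLIN)
			printf("data to read, or peer shut down its side\n");
		if (pfd.revents & POLLOUT)
			printf("enough write space to send without blocking\n");
		if (pfd.revents & POLLHUP)
			printf("both directions shut down, or socket closed\n");
	}
}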
470
471 /*
472  *      TCP socket write_space callback.
473  */
474 void tcp_write_space(struct sock *sk)
475 {
476         struct socket *sock = sk->sk_socket;
477
478         if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
479                 clear_bit(SOCK_NOSPACE, &sock->flags);
480
481                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
482                         wake_up_interruptible(sk->sk_sleep);
483
484                 if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
485                         sock_wake_async(sock, 2, POLL_OUT);
486         }
487 }
488
489 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
490 {
491         struct tcp_opt *tp = tcp_sk(sk);
492         int answ;
493
494         switch (cmd) {
495         case SIOCINQ:
496                 if (sk->sk_state == TCP_LISTEN)
497                         return -EINVAL;
498
499                 lock_sock(sk);
500                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
501                         answ = 0;
502                 else if (sock_flag(sk, SOCK_URGINLINE) ||
503                          !tp->urg_data ||
504                          before(tp->urg_seq, tp->copied_seq) ||
505                          !before(tp->urg_seq, tp->rcv_nxt)) {
506                         answ = tp->rcv_nxt - tp->copied_seq;
507
508                         /* Subtract 1, if FIN is in queue. */
509                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
510                                 answ -=
511                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
512                 } else
513                         answ = tp->urg_seq - tp->copied_seq;
514                 release_sock(sk);
515                 break;
516         case SIOCATMARK:
517                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
518                 break;
519         case SIOCOUTQ:
520                 if (sk->sk_state == TCP_LISTEN)
521                         return -EINVAL;
522
523                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
524                         answ = 0;
525                 else
526                         answ = tp->write_seq - tp->snd_una;
527                 break;
528         default:
529                 return -ENOIOCTLCMD;
530         }
531
532         return put_user(answ, (int *)arg);
533 }
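
/*
 * The SIOCINQ and SIOCOUTQ cases above compute rcv_nxt - copied_seq (bytes
 * waiting to be read) and write_seq - snd_una (bytes not yet acknowledged).
 * A minimal user-space sketch of querying them; `fd' is assumed to be a
 * connected TCP socket.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>	/* SIOCINQ, SIOCOUTQ */

static void report_tcp_queues(int fd)
{
	int inq = 0, outq = 0;

	if (ioctl(fd, SIOCINQ, &inq) == 0)
		printf("unread bytes in receive queue: %d\n", inq);
	if (ioctl(fd, SIOCOUTQ, &outq) == 0)
		printf("unacked bytes in send queue:   %d\n", outq);
}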
534
535
536 int tcp_listen_start(struct sock *sk)
537 {
538 #ifdef CONFIG_ACCEPT_QUEUES
539         int i = 0;
540 #endif
541         struct inet_opt *inet = inet_sk(sk);
542         struct tcp_opt *tp = tcp_sk(sk);
543         struct tcp_listen_opt *lopt;
544
545         sk->sk_max_ack_backlog = 0;
546         sk->sk_ack_backlog = 0;
547         tp->accept_queue = NULL;
548 #ifdef CONFIG_ACCEPT_QUEUES
549         tp->class_index = 0;
550         for (i=0; i < NUM_ACCEPT_QUEUES; i++) {
551                 tp->acceptq[i].aq_tail = NULL;
552                 tp->acceptq[i].aq_head = NULL;
553                 tp->acceptq[i].aq_wait_time = 0; 
554                 tp->acceptq[i].aq_qcount = 0; 
555                 tp->acceptq[i].aq_count = 0; 
556                 if (i == 0) {
557                         tp->acceptq[i].aq_valid = 1; 
558                         tp->acceptq[i].aq_ratio = 1; 
559                 }
560                 else {
561                         tp->acceptq[i].aq_valid = 0; 
562                         tp->acceptq[i].aq_ratio = 0; 
563                 }
564         }
565 #endif
566         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
567         tcp_delack_init(tp);
568
569         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
570         if (!lopt)
571                 return -ENOMEM;
572
573         memset(lopt, 0, sizeof(struct tcp_listen_opt));
574         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
575                 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
576                         break;
577         get_random_bytes(&lopt->hash_rnd, 4);
578
579         write_lock_bh(&tp->syn_wait_lock);
580         tp->listen_opt = lopt;
581         write_unlock_bh(&tp->syn_wait_lock);
582
583         /* There is a race window here: we announce ourselves listening,
584          * but this transition is still not validated by get_port().
585          * It is OK, because this socket enters the hash table only
586          * after validation is complete.
587          */
588         sk->sk_state = TCP_LISTEN;
589         if (!sk->sk_prot->get_port(sk, inet->num)) {
590                 inet->sport = htons(inet->num);
591
592                 sk_dst_reset(sk);
593                 sk->sk_prot->hash(sk);
594
595 #ifdef CONFIG_CKRM
596                 ckrm_cb_listen_start(sk);
597 #endif
598
599                 return 0;
600         }
601
602         sk->sk_state = TCP_CLOSE;
603         write_lock_bh(&tp->syn_wait_lock);
604         tp->listen_opt = NULL;
605         write_unlock_bh(&tp->syn_wait_lock);
606         kfree(lopt);
607         return -EADDRINUSE;
608 }
609
610 /*
611  *      This routine closes sockets which have been at least partially
612  *      opened, but not yet accepted.
613  */
614
615 static void tcp_listen_stop (struct sock *sk)
616 {
617         struct tcp_opt *tp = tcp_sk(sk);
618         struct tcp_listen_opt *lopt = tp->listen_opt;
619         struct open_request *acc_req = tp->accept_queue;
620         struct open_request *req;
621         int i;
622
623         tcp_delete_keepalive_timer(sk);
624
625         /* make all the listen_opt local to us */
626         write_lock_bh(&tp->syn_wait_lock);
627         tp->listen_opt = NULL;
628         write_unlock_bh(&tp->syn_wait_lock);
629
630 #ifdef CONFIG_CKRM
631         ckrm_cb_listen_stop(sk);
632 #endif
633
634 #ifdef CONFIG_ACCEPT_QUEUES
635         for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
636                 tp->acceptq[i].aq_head = tp->acceptq[i].aq_tail = NULL;
637 #else
638         tp->accept_queue_tail = NULL;
639 #endif
640         tp->accept_queue = NULL;
641
642         if (lopt->qlen) {
643                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
644                         while ((req = lopt->syn_table[i]) != NULL) {
645                                 lopt->syn_table[i] = req->dl_next;
646                                 lopt->qlen--;
647                                 tcp_openreq_free(req);
648
649                 /* Following specs, it would be better either to send FIN
650                  * (and enter FIN-WAIT-1, it is normal close)
651                  * or to send active reset (abort).
652                  * Certainly, it is pretty dangerous during a synflood, but that
653                  * is a bad justification for our negligence 8)
654                  * To be honest, we are not able to make either
655                  * of the variants now.                 --ANK
656                  */
657                         }
658                 }
659         }
660         BUG_TRAP(!lopt->qlen);
661
662         kfree(lopt);
663
664         while ((req = acc_req) != NULL) {
665                 struct sock *child = req->sk;
666
667                 acc_req = req->dl_next;
668
669                 local_bh_disable();
670                 bh_lock_sock(child);
671                 BUG_TRAP(!sock_owned_by_user(child));
672                 sock_hold(child);
673
674                 tcp_disconnect(child, O_NONBLOCK);
675
676                 sock_orphan(child);
677
678                 atomic_inc(&tcp_orphan_count);
679
680                 tcp_destroy_sock(child);
681
682                 bh_unlock_sock(child);
683                 local_bh_enable();
684                 sock_put(child);
685
686 #ifdef CONFIG_ACCEPT_QUEUES
687                 tcp_acceptq_removed(sk, req->acceptq_class);
688 #else
689                 tcp_acceptq_removed(sk);
690 #endif
691                 tcp_openreq_fastfree(req);
692         }
693         BUG_TRAP(!sk->sk_ack_backlog);
694 }
695
696 /*
697  *      Wait for a socket to get into the connected state
698  *
699  *      Note: Must be called with the socket locked.
700  */
701 static int wait_for_tcp_connect(struct sock *sk, int flags, long *timeo_p)
702 {
703         struct tcp_opt *tp = tcp_sk(sk);
704         struct task_struct *tsk = current;
705         DEFINE_WAIT(wait);
706
707         while ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
708                 if (sk->sk_err)
709                         return sock_error(sk);
710                 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
711                         return -EPIPE;
712                 if (!*timeo_p)
713                         return -EAGAIN;
714                 if (signal_pending(tsk))
715                         return sock_intr_errno(*timeo_p);
716
717                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
718                 tp->write_pending++;
719
720                 release_sock(sk);
721                 *timeo_p = schedule_timeout(*timeo_p);
722                 lock_sock(sk);
723
724                 finish_wait(sk->sk_sleep, &wait);
725                 tp->write_pending--;
726         }
727         return 0;
728 }
729
730 static inline int tcp_memory_free(struct sock *sk)
731 {
732         return sk->sk_wmem_queued < sk->sk_sndbuf;
733 }
734
735 /*
736  *      Wait for more memory for a socket
737  */
738 static int wait_for_tcp_memory(struct sock *sk, long *timeo)
739 {
740         struct tcp_opt *tp = tcp_sk(sk);
741         int err = 0;
742         long vm_wait = 0;
743         long current_timeo = *timeo;
744         DEFINE_WAIT(wait);
745
746         if (tcp_memory_free(sk))
747                 current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
748
749         for (;;) {
750                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
751
752                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
753
754                 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
755                         goto do_error;
756                 if (!*timeo)
757                         goto do_nonblock;
758                 if (signal_pending(current))
759                         goto do_interrupted;
760                 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
761                 if (tcp_memory_free(sk) && !vm_wait)
762                         break;
763
764                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
765                 tp->write_pending++;
766                 release_sock(sk);
767                 if (!tcp_memory_free(sk) || vm_wait)
768                         current_timeo = schedule_timeout(current_timeo);
769                 lock_sock(sk);
770                 tp->write_pending--;
771
772                 if (vm_wait) {
773                         vm_wait -= current_timeo;
774                         current_timeo = *timeo;
775                         if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
776                             (current_timeo -= vm_wait) < 0)
777                                 current_timeo = 0;
778                         vm_wait = 0;
779                 }
780                 *timeo = current_timeo;
781         }
782 out:
783         finish_wait(sk->sk_sleep, &wait);
784         return err;
785
786 do_error:
787         err = -EPIPE;
788         goto out;
789 do_nonblock:
790         err = -EAGAIN;
791         goto out;
792 do_interrupted:
793         err = sock_intr_errno(*timeo);
794         goto out;
795 }
796
797 static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
798                                int off)
799 {
800         if (i) {
801                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
802                 return page == frag->page &&
803                        off == frag->page_offset + frag->size;
804         }
805         return 0;
806 }
807
808 static inline void fill_page_desc(struct sk_buff *skb, int i,
809                                   struct page *page, int off, int size)
810 {
811         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
812         frag->page = page;
813         frag->page_offset = off;
814         frag->size = size;
815         skb_shinfo(skb)->nr_frags = i + 1;
816 }
817
818 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
819 {
820         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
821         tp->pushed_seq = tp->write_seq;
822 }
823
824 static inline int forced_push(struct tcp_opt *tp)
825 {
826         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
827 }
828
829 static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
830                               struct sk_buff *skb)
831 {
832         skb->csum = 0;
833         TCP_SKB_CB(skb)->seq = tp->write_seq;
834         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
835         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
836         TCP_SKB_CB(skb)->sacked = 0;
837         __skb_queue_tail(&sk->sk_write_queue, skb);
838         tcp_charge_skb(sk, skb);
839         if (!tp->send_head)
840                 tp->send_head = skb;
841         else if (tp->nonagle & TCP_NAGLE_PUSH)
842                 tp->nonagle &= ~TCP_NAGLE_PUSH;
843 }
844
845 static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
846                                 struct sk_buff *skb)
847 {
848         if (flags & MSG_OOB) {
849                 tp->urg_mode = 1;
850                 tp->snd_up = tp->write_seq;
851                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
852         }
853 }
854
855 static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
856                             int mss_now, int nonagle)
857 {
858         if (tp->send_head) {
859                 struct sk_buff *skb = sk->sk_write_queue.prev;
860                 if (!(flags & MSG_MORE) || forced_push(tp))
861                         tcp_mark_push(tp, skb);
862                 tcp_mark_urg(tp, flags, skb);
863                 __tcp_push_pending_frames(sk, tp, mss_now,
864                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
865         }
866 }
867
868 static int tcp_error(struct sock *sk, int flags, int err)
869 {
870         if (err == -EPIPE)
871                 err = sock_error(sk) ? : -EPIPE;
872         if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
873                 send_sig(SIGPIPE, current, 0);
874         return err;
875 }
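
/*
 * tcp_error() raises SIGPIPE for a broken connection unless the caller
 * passed MSG_NOSIGNAL, and SIGPIPE terminates a process by default.  A
 * minimal user-space sketch of the usual way to opt out and handle EPIPE
 * in-line; `fd' is assumed to be a connected TCP socket.
 */
#include <errno.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t send_no_sigpipe(int fd, const void *buf, size_t len)
{
	ssize_t n = send(fd, buf, len, MSG_NOSIGNAL);

	if (n < 0 && errno == EPIPE)
		fprintf(stderr, "peer closed the connection\n");
	return n;
}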
876
877 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
878                          size_t psize, int flags)
879 {
880         struct tcp_opt *tp = tcp_sk(sk);
881         int mss_now;
882         int err;
883         ssize_t copied;
884         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
885
886         /* Wait for a connection to finish. */
887         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
888                 if ((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
889                         goto out_err;
890
891         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
892
893         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
894         copied = 0;
895
896         err = -EPIPE;
897         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
898                 goto do_error;
899
900         while (psize > 0) {
901                 struct sk_buff *skb = sk->sk_write_queue.prev;
902                 struct page *page = pages[poffset / PAGE_SIZE];
903                 int copy, i;
904                 int offset = poffset % PAGE_SIZE;
905                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
906
907                 if (!tp->send_head || (copy = mss_now - skb->len) <= 0) {
908 new_segment:
909                         if (!tcp_memory_free(sk))
910                                 goto wait_for_sndbuf;
911
912                         skb = tcp_alloc_pskb(sk, 0, tp->mss_cache,
913                                              sk->sk_allocation);
914                         if (!skb)
915                                 goto wait_for_memory;
916
917                         skb_entail(sk, tp, skb);
918                         copy = mss_now;
919                 }
920
921                 if (copy > size)
922                         copy = size;
923
924                 i = skb_shinfo(skb)->nr_frags;
925                 if (can_coalesce(skb, i, page, offset)) {
926                         skb_shinfo(skb)->frags[i - 1].size += copy;
927                 } else if (i < MAX_SKB_FRAGS) {
928                         get_page(page);
929                         fill_page_desc(skb, i, page, offset, copy);
930                 } else {
931                         tcp_mark_push(tp, skb);
932                         goto new_segment;
933                 }
934
935                 skb->len += copy;
936                 skb->data_len += copy;
937                 skb->ip_summed = CHECKSUM_HW;
938                 tp->write_seq += copy;
939                 TCP_SKB_CB(skb)->end_seq += copy;
940
941                 if (!copied)
942                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
943
944                 copied += copy;
945                 poffset += copy;
946                 if (!(psize -= copy))
947                         goto out;
948
949                 if (skb->len != mss_now || (flags & MSG_OOB))
950                         continue;
951
952                 if (forced_push(tp)) {
953                         tcp_mark_push(tp, skb);
954                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
955                 } else if (skb == tp->send_head)
956                         tcp_push_one(sk, mss_now);
957                 continue;
958
959 wait_for_sndbuf:
960                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
961 wait_for_memory:
962                 if (copied)
963                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
964
965                 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
966                         goto do_error;
967
968                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
969         }
970
971 out:
972         if (copied)
973                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
974         return copied;
975
976 do_error:
977         if (copied)
978                 goto out;
979 out_err:
980         return tcp_error(sk, flags, err);
981 }
982
983 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
984                      size_t size, int flags)
985 {
986         ssize_t res;
987         struct sock *sk = sock->sk;
988
989 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
990
991         if (!(sk->sk_route_caps & NETIF_F_SG) ||
992             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
993                 return sock_no_sendpage(sock, page, offset, size, flags);
994
995 #undef TCP_ZC_CSUM_FLAGS
996
997         lock_sock(sk);
998         TCP_CHECK_TIMER(sk);
999         res = do_tcp_sendpages(sk, &page, offset, size, flags);
1000         TCP_CHECK_TIMER(sk);
1001         release_sock(sk);
1002         return res;
1003 }
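
/*
 * tcp_sendpage() is the kernel half of the zero-copy send path and is
 * typically reached from user space through sendfile(2); when the route
 * lacks scatter-gather or hardware checksumming it falls back to
 * sock_no_sendpage() as shown above.  A minimal user-space sketch of the
 * calling side; `sock_fd' is assumed to be a connected TCP socket and
 * `file_fd' an open regular file.
 */
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <sys/types.h>

static int send_whole_file(int sock_fd, int file_fd)
{
	struct stat st;
	off_t off = 0;

	if (fstat(file_fd, &st) < 0)
		return -1;
	while (off < st.st_size) {
		/* sendfile() advances `off' by the number of bytes queued. */
		ssize_t n = sendfile(sock_fd, file_fd, &off, st.st_size - off);

		if (n <= 0)
			return -1;
	}
	return 0;
}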
1004
1005 #define TCP_PAGE(sk)    (inet_sk(sk)->sndmsg_page)
1006 #define TCP_OFF(sk)     (inet_sk(sk)->sndmsg_off)
1007
1008 static inline int tcp_copy_to_page(struct sock *sk, char *from,
1009                                    struct sk_buff *skb, struct page *page,
1010                                    int off, int copy)
1011 {
1012         int err = 0;
1013         unsigned int csum;
1014
1015         if (skb->ip_summed == CHECKSUM_NONE) {
1016                 csum = csum_and_copy_from_user(from, page_address(page) + off,
1017                                        copy, 0, &err);
1018                 if (err) return err;
1019                 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1020         } else {
1021                 if (copy_from_user(page_address(page) + off, from, copy))
1022                         return -EFAULT;
1023         }
1024
1025         skb->len += copy;
1026         skb->data_len += copy;
1027         skb->truesize += copy;
1028         sk->sk_wmem_queued += copy;
1029         sk->sk_forward_alloc -= copy;
1030         return 0;
1031 }
1032
1033 static inline int skb_add_data(struct sk_buff *skb, char *from, int copy)
1034 {
1035         int err = 0;
1036         unsigned int csum;
1037         int off = skb->len;
1038
1039         if (skb->ip_summed == CHECKSUM_NONE) {
1040                 csum = csum_and_copy_from_user(from, skb_put(skb, copy),
1041                                        copy, 0, &err);
1042                 if (!err) {
1043                         skb->csum = csum_block_add(skb->csum, csum, off);
1044                         return 0;
1045                 }
1046         } else {
1047                 if (!copy_from_user(skb_put(skb, copy), from, copy))
1048                         return 0;
1049         }
1050
1051         __skb_trim(skb, off);
1052         return -EFAULT;
1053 }
1054
1055 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1056 {
1057         int tmp = tp->mss_cache_std;
1058
1059         if (sk->sk_route_caps & NETIF_F_SG) {
1060                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1061
1062                 if (tmp >= pgbreak &&
1063                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1064                         tmp = pgbreak;
1065         }
1066         return tmp;
1067 }
1068
1069 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1070                 size_t size)
1071 {
1072         struct iovec *iov;
1073         struct tcp_opt *tp = tcp_sk(sk);
1074         struct sk_buff *skb;
1075         int iovlen, flags;
1076         int mss_now;
1077         int err, copied;
1078         long timeo;
1079
1080         lock_sock(sk);
1081         TCP_CHECK_TIMER(sk);
1082
1083         flags = msg->msg_flags;
1084         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1085
1086         /* Wait for a connection to finish. */
1087         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1088                 if ((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1089                         goto out_err;
1090
1091         /* This should be in poll */
1092         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1093
1094         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1095
1096         /* Ok commence sending. */
1097         iovlen = msg->msg_iovlen;
1098         iov = msg->msg_iov;
1099         copied = 0;
1100
1101         err = -EPIPE;
1102         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1103                 goto do_error;
1104
1105         while (--iovlen >= 0) {
1106                 int seglen = iov->iov_len;
1107                 unsigned char *from = iov->iov_base;
1108
1109                 iov++;
1110
1111                 while (seglen > 0) {
1112                         int copy;
1113
1114                         skb = sk->sk_write_queue.prev;
1115
1116                         if (!tp->send_head ||
1117                             (copy = mss_now - skb->len) <= 0) {
1118
1119 new_segment:
1120                                 /* Allocate new segment. If the interface is SG,
1121                                  * allocate an skb that fits in a single page.
1122                                  */
1123                                 if (!tcp_memory_free(sk))
1124                                         goto wait_for_sndbuf;
1125
1126                                 skb = tcp_alloc_pskb(sk, select_size(sk, tp),
1127                                                      0, sk->sk_allocation);
1128                                 if (!skb)
1129                                         goto wait_for_memory;
1130
1131                                 /*
1132                                  * Check whether we can use HW checksum.
1133                                  */
1134                                 if (sk->sk_route_caps &
1135                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
1136                                      NETIF_F_HW_CSUM))
1137                                         skb->ip_summed = CHECKSUM_HW;
1138
1139                                 skb_entail(sk, tp, skb);
1140                                 copy = mss_now;
1141                         }
1142
1143                         /* Try to append data to the end of skb. */
1144                         if (copy > seglen)
1145                                 copy = seglen;
1146
1147                         /* Where to copy to? */
1148                         if (skb_tailroom(skb) > 0) {
1149                                 /* We have some space in skb head. Superb! */
1150                                 if (copy > skb_tailroom(skb))
1151                                         copy = skb_tailroom(skb);
1152                                 if ((err = skb_add_data(skb, from, copy)) != 0)
1153                                         goto do_fault;
1154                         } else {
1155                                 int merge = 0;
1156                                 int i = skb_shinfo(skb)->nr_frags;
1157                                 struct page *page = TCP_PAGE(sk);
1158                                 int off = TCP_OFF(sk);
1159
1160                                 if (can_coalesce(skb, i, page, off) &&
1161                                     off != PAGE_SIZE) {
1162                                         /* We can extend the last page
1163                                          * fragment. */
1164                                         merge = 1;
1165                                 } else if (i == MAX_SKB_FRAGS ||
1166                                            (!i &&
1167                                            !(sk->sk_route_caps & NETIF_F_SG))) {
1168                                         /* Need to add new fragment and cannot
1169                                          * do this because interface is non-SG,
1170                                          * or because all the page slots are
1171                                          * busy. */
1172                                         tcp_mark_push(tp, skb);
1173                                         goto new_segment;
1174                                 } else if (page) {
1175                                         /* If page is cached, align
1176                                          * offset to L1 cache boundary
1177                                          */
1178                                         off = (off + L1_CACHE_BYTES - 1) &
1179                                               ~(L1_CACHE_BYTES - 1);
1180                                         if (off == PAGE_SIZE) {
1181                                                 put_page(page);
1182                                                 TCP_PAGE(sk) = page = NULL;
1183                                         }
1184                                 }
1185
1186                                 if (!page) {
1187                                         /* Allocate new cache page. */
1188                                         if (!(page = tcp_alloc_page(sk)))
1189                                                 goto wait_for_memory;
1190                                         off = 0;
1191                                 }
1192
1193                                 if (copy > PAGE_SIZE - off)
1194                                         copy = PAGE_SIZE - off;
1195
1196                                 /* Time to copy data. We are close to
1197                                  * the end! */
1198                                 err = tcp_copy_to_page(sk, from, skb, page,
1199                                                        off, copy);
1200                                 if (err) {
1201                                         /* If this page was new, give it to the
1202                                          * socket so it does not get leaked.
1203                                          */
1204                                         if (!TCP_PAGE(sk)) {
1205                                                 TCP_PAGE(sk) = page;
1206                                                 TCP_OFF(sk) = 0;
1207                                         }
1208                                         goto do_error;
1209                                 }
1210
1211                                 /* Update the skb. */
1212                                 if (merge) {
1213                                         skb_shinfo(skb)->frags[i - 1].size +=
1214                                                                         copy;
1215                                 } else {
1216                                         fill_page_desc(skb, i, page, off, copy);
1217                                         if (TCP_PAGE(sk)) {
1218                                                 get_page(page);
1219                                         } else if (off + copy < PAGE_SIZE) {
1220                                                 get_page(page);
1221                                                 TCP_PAGE(sk) = page;
1222                                         }
1223                                 }
1224
1225                                 TCP_OFF(sk) = off + copy;
1226                         }
1227
1228                         if (!copied)
1229                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1230
1231                         tp->write_seq += copy;
1232                         TCP_SKB_CB(skb)->end_seq += copy;
1233
1234                         from += copy;
1235                         copied += copy;
1236                         if ((seglen -= copy) == 0 && iovlen == 0)
1237                                 goto out;
1238
1239                         if (skb->len != mss_now || (flags & MSG_OOB))
1240                                 continue;
1241
1242                         if (forced_push(tp)) {
1243                                 tcp_mark_push(tp, skb);
1244                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1245                         } else if (skb == tp->send_head)
1246                                 tcp_push_one(sk, mss_now);
1247                         continue;
1248
1249 wait_for_sndbuf:
1250                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1251 wait_for_memory:
1252                         if (copied)
1253                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1254
1255                         if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1256                                 goto do_error;
1257
1258                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1259                 }
1260         }
1261
1262 out:
1263         if (copied)
1264                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1265         TCP_CHECK_TIMER(sk);
1266         release_sock(sk);
1267         return copied;
1268
1269 do_fault:
1270         if (!skb->len) {
1271                 if (tp->send_head == skb)
1272                         tp->send_head = NULL;
1273                 __skb_unlink(skb, skb->list);
1274                 tcp_free_skb(sk, skb);
1275         }
1276
1277 do_error:
1278         if (copied)
1279                 goto out;
1280 out_err:
1281         err = tcp_error(sk, flags, err);
1282         TCP_CHECK_TIMER(sk);
1283         release_sock(sk);
1284         return err;
1285 }
1286
1287 /*
1288  *      Handle reading urgent data. BSD has very simple semantics for
1289  *      this, no blocking and very strange errors 8)
1290  */
1291
1292 static int tcp_recv_urg(struct sock *sk, long timeo,
1293                         struct msghdr *msg, int len, int flags,
1294                         int *addr_len)
1295 {
1296         struct tcp_opt *tp = tcp_sk(sk);
1297
1298         /* No URG data to read. */
1299         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1300             tp->urg_data == TCP_URG_READ)
1301                 return -EINVAL; /* Yes, this is right! */
1302
1303         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1304                 return -ENOTCONN;
1305
1306         if (tp->urg_data & TCP_URG_VALID) {
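                /* The low byte of urg_data holds the urgent byte itself;
                 * the higher bits carry the TCP_URG_* state flags.
                 */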
1307                 int err = 0;
1308                 char c = tp->urg_data;
1309
1310                 if (!(flags & MSG_PEEK))
1311                         tp->urg_data = TCP_URG_READ;
1312
1313                 /* Read urgent data. */
1314                 msg->msg_flags |= MSG_OOB;
1315
1316                 if (len > 0) {
1317                         if (!(flags & MSG_TRUNC))
1318                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1319                         len = 1;
1320                 } else
1321                         msg->msg_flags |= MSG_TRUNC;
1322
1323                 return err ? -EFAULT : len;
1324         }
1325
1326         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1327                 return 0;
1328
1329         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1330          * the available implementations agree in this case:
1331          * this call should never block, independent of the
1332          * blocking state of the socket.
1333          * Mike <pall@rz.uni-karlsruhe.de>
1334          */
1335         return -EAGAIN;
1336 }
1337
1338 /*
1339  *      Release an skb once it is no longer needed. This routine
1340  *      must be called with interrupts disabled or with the
1341  *      socket locked so that the sk_buff queue operation is safe.
1342  */
1343
1344 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
1345 {
1346         __skb_unlink(skb, &sk->sk_receive_queue);
1347         __kfree_skb(skb);
1348 }
1349
1350 /* Clean up the receive buffer for full frames taken by the user,
1351  * then send an ACK if necessary.  COPIED is the number of bytes
1352  * tcp_recvmsg has given to the user so far; it speeds up the
1353  * calculation of whether or not we must ACK for the sake of
1354  * a window update.
1355  */
1356 static void cleanup_rbuf(struct sock *sk, int copied)
1357 {
1358         struct tcp_opt *tp = tcp_sk(sk);
1359         int time_to_ack = 0;
1360
1361 #if TCP_DEBUG
1362         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1363
1364         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1365 #endif
1366
1367         if (tcp_ack_scheduled(tp)) {
1368                 /* Delayed ACKs frequently hit locked sockets during bulk
1369                  * receive. */
1370                 if (tp->ack.blocked ||
1371                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1372                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1373                     /*
1374                      * If this read emptied read buffer, we send ACK, if
1375                      * connection is not bidirectional, user drained
1376                      * receive buffer and there was a small segment
1377                      * in queue.
1378                      */
1379                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1380                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1381                         time_to_ack = 1;
1382         }
1383
1384         /* We send an ACK if we can now advertise a non-zero window
1385          * which has been raised "significantly".
1386          *
1387          * Even if the window is raised to infinity, do not send a window-open
1388          * ACK in states where we will not receive any more data. It is useless.
1389          */
1390         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1391                 __u32 rcv_window_now = tcp_receive_window(tp);
1392
1393                 /* Optimize, __tcp_select_window() is not cheap. */
1394                 if (2*rcv_window_now <= tp->window_clamp) {
1395                         __u32 new_window = __tcp_select_window(sk);
1396
1397                         /* Send an ACK now if this read freed lots of space
1398                          * in our buffer. new_window is the window we could
1399                          * now advertise; do so only if it is not less than the
1400                          * current one. "Lots" means "at least twice" here.
1401                          */
1402                         if (new_window && new_window >= 2 * rcv_window_now)
1403                                 time_to_ack = 1;
1404                 }
1405         }
1406         if (time_to_ack)
1407                 tcp_send_ack(sk);
1408 }
1409
1410 /* Now the socket state, including sk->sk_err, is changed only under the
1411  * socket lock, hence we may omit checks after joining the wait queue.
1412  * We check the receive queue before schedule() only as an optimization;
1413  * it is very likely that release_sock() added new data.
1414  */
1415
1416 static long tcp_data_wait(struct sock *sk, long timeo)
1417 {
1418         DEFINE_WAIT(wait);
1419
1420         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1421
1422         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1423         release_sock(sk);
1424
1425         if (skb_queue_empty(&sk->sk_receive_queue))
1426                 timeo = schedule_timeout(timeo);
1427
1428         lock_sock(sk);
1429         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1430
1431         finish_wait(sk->sk_sleep, &wait);
1432         return timeo;
1433 }
1434
1435 static void tcp_prequeue_process(struct sock *sk)
1436 {
1437         struct sk_buff *skb;
1438         struct tcp_opt *tp = tcp_sk(sk);
1439
1440         NET_ADD_STATS_USER(TCPPrequeued, skb_queue_len(&tp->ucopy.prequeue));
1441
1442         /* The RX process wants to run with BHs disabled, though it is
1443          * not strictly necessary. */
1444         local_bh_disable();
1445         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1446                 sk->sk_backlog_rcv(sk, skb);
1447         local_bh_enable();
1448
1449         /* Clear memory counter. */
1450         tp->ucopy.memory = 0;
1451 }
1452
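     /* Find the skb in the receive queue that covers sequence number 'seq'
      * and return the offset of 'seq' within it.  A SYN occupies a sequence
      * number but carries no data, hence the offset adjustment; an skb we
      * have read past entirely is still returned if it carries a FIN.
      */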
1453 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1454 {
1455         struct sk_buff *skb;
1456         u32 offset;
1457
1458         skb_queue_walk(&sk->sk_receive_queue, skb) {
1459                 offset = seq - TCP_SKB_CB(skb)->seq;
1460                 if (skb->h.th->syn)
1461                         offset--;
1462                 if (offset < skb->len || skb->h.th->fin) {
1463                         *off = offset;
1464                         return skb;
1465                 }
1466         }
1467         return NULL;
1468 }
1469
1470 /*
1471  * This routine provides an alternative to tcp_recvmsg() for routines
1472  * that would like to handle copying from skbuffs directly in 'sendfile'
1473  * fashion.
1474  * Note:
1475  *      - It is assumed that the socket was locked by the caller.
1476  *      - The routine does not block.
1477  *      - At present, there is no support for reading OOB data
1478  *        or for 'peeking' the socket using this routine
1479  *        (although both would be easy to implement).
1480  */
1481 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1482                   sk_read_actor_t recv_actor)
1483 {
1484         struct sk_buff *skb;
1485         struct tcp_opt *tp = tcp_sk(sk);
1486         u32 seq = tp->copied_seq;
1487         u32 offset;
1488         int copied = 0;
1489
1490         if (sk->sk_state == TCP_LISTEN)
1491                 return -ENOTCONN;
1492         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1493                 if (offset < skb->len) {
1494                         size_t used, len;
1495
1496                         len = skb->len - offset;
1497                         /* Stop reading if we hit a patch of urgent data */
1498                         if (tp->urg_data) {
1499                                 u32 urg_offset = tp->urg_seq - seq;
1500                                 if (urg_offset < len)
1501                                         len = urg_offset;
1502                                 if (!len)
1503                                         break;
1504                         }
1505                         used = recv_actor(desc, skb, offset, len);
1506                         if (used <= len) {
1507                                 seq += used;
1508                                 copied += used;
1509                                 offset += used;
1510                         }
1511                         if (offset != skb->len)
1512                                 break;
1513                 }
1514                 if (skb->h.th->fin) {
1515                         tcp_eat_skb(sk, skb);
1516                         ++seq;
1517                         break;
1518                 }
1519                 tcp_eat_skb(sk, skb);
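                /* desc->count is the actor's remaining-byte budget; once it
                 * has dropped to zero, stop reading.
                 */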
1520                 if (!desc->count)
1521                         break;
1522         }
1523         tp->copied_seq = seq;
1524         /* Clean up data we have read: This will do ACK frames. */
1525         if (copied)
1526                 cleanup_rbuf(sk, copied);
1527         return copied;
1528 }
1529
1530 /*
1531  *      This routine copies from a sock struct into the user buffer.
1532  *
1533  *      Technical note: in 2.3 we work on _locked_ socket, so that
1534  *      tricks with *seq access order and skb->users are not required.
1535  *      Probably, code can be easily improved even more.
1536  */
1537
1538 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1539                 size_t len, int nonblock, int flags, int *addr_len)
1540 {
1541         struct tcp_opt *tp = tcp_sk(sk);
1542         int copied = 0;
1543         u32 peek_seq;
1544         u32 *seq;
1545         unsigned long used;
1546         int err;
1547         int target;             /* Read at least this many bytes */
1548         long timeo;
1549         struct task_struct *user_recv = NULL;
1550
1551         lock_sock(sk);
1552
1553         TCP_CHECK_TIMER(sk);
1554
1555         err = -ENOTCONN;
1556         if (sk->sk_state == TCP_LISTEN)
1557                 goto out;
1558
1559         timeo = sock_rcvtimeo(sk, nonblock);
1560
1561         /* Urgent data needs to be handled specially. */
1562         if (flags & MSG_OOB)
1563                 goto recv_urg;
1564
1565         seq = &tp->copied_seq;
1566         if (flags & MSG_PEEK) {
1567                 peek_seq = tp->copied_seq;
1568                 seq = &peek_seq;
1569         }
1570
1571         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1572
1573         do {
1574                 struct sk_buff *skb;
1575                 u32 offset;
1576
1577                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1578                 if (tp->urg_data && tp->urg_seq == *seq) {
1579                         if (copied)
1580                                 break;
1581                         if (signal_pending(current)) {
1582                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1583                                 break;
1584                         }
1585                 }
1586
1587                 /* Next get a buffer. */
1588
1589                 skb = skb_peek(&sk->sk_receive_queue);
1590                 do {
1591                         if (!skb)
1592                                 break;
1593
1594                         /* Now that we have two receive queues this
1595                          * shouldn't happen.
1596                          */
1597                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1598                                 printk(KERN_INFO "recvmsg bug: copied %X "
1599                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1600                                 break;
1601                         }
1602                         offset = *seq - TCP_SKB_CB(skb)->seq;
1603                         if (skb->h.th->syn)
1604                                 offset--;
1605                         if (offset < skb->len)
1606                                 goto found_ok_skb;
1607                         if (skb->h.th->fin)
1608                                 goto found_fin_ok;
1609                         BUG_TRAP(flags & MSG_PEEK);
1610                         skb = skb->next;
1611                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1612
1613                 /* Well, if we have backlog, try to process it now. */
1614
1615                 if (copied >= target && !sk->sk_backlog.tail)
1616                         break;
1617
1618                 if (copied) {
1619                         if (sk->sk_err ||
1620                             sk->sk_state == TCP_CLOSE ||
1621                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1622                             !timeo ||
1623                             signal_pending(current) ||
1624                             (flags & MSG_PEEK))
1625                                 break;
1626                 } else {
1627                         if (sock_flag(sk, SOCK_DONE))
1628                                 break;
1629
1630                         if (sk->sk_err) {
1631                                 copied = sock_error(sk);
1632                                 break;
1633                         }
1634
1635                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1636                                 break;
1637
1638                         if (sk->sk_state == TCP_CLOSE) {
1639                                 if (!sock_flag(sk, SOCK_DONE)) {
1640                                         /* This occurs when the user tries to
1641                                          * read from a never-connected socket.
1642                                          */
1643                                         copied = -ENOTCONN;
1644                                         break;
1645                                 }
1646                                 break;
1647                         }
1648
1649                         if (!timeo) {
1650                                 copied = -EAGAIN;
1651                                 break;
1652                         }
1653
1654                         if (signal_pending(current)) {
1655                                 copied = sock_intr_errno(timeo);
1656                                 break;
1657                         }
1658                 }
1659
1660                 cleanup_rbuf(sk, copied);
1661
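                /* Register ourselves as the direct-copy reader: with
                 * tp->ucopy.task and the iovec set, the receive path can
                 * copy payload straight to user space while we have the
                 * socket released below.
                 */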
1662                 if (tp->ucopy.task == user_recv) {
1663                         /* Install new reader */
1664                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1665                                 user_recv = current;
1666                                 tp->ucopy.task = user_recv;
1667                                 tp->ucopy.iov = msg->msg_iov;
1668                         }
1669
1670                         tp->ucopy.len = len;
1671
1672                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1673                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1674
1675                         /* Ugly... If the prequeue is not empty, we have to
1676                          * process it before releasing the socket, otherwise
1677                          * ordering will be broken on the second iteration.
1678                          * A more elegant solution is required!!!
1679                          *
1680                          * Look: we have the following (pseudo)queues:
1681                          *
1682                          * 1. packets in flight
1683                          * 2. backlog
1684                          * 3. prequeue
1685                          * 4. receive_queue
1686                          *
1687                          * Each queue can be processed only if the next ones
1688                          * are empty. At this point the receive_queue is empty.
1689                          * But the prequeue _can_ be non-empty after the 2nd
1690                          * iteration, when we jumped to the start of the loop
1691                          * because backlog processing added something to the
1692                          * receive_queue. We cannot release_sock(), because the
1693                          * backlog contains packets that arrived _after_ the
1694                          * prequeued ones.
1695                          *
1696                          * In short, the algorithm is clear: process all the
1697                          * queues in order. We could make this more direct by
1698                          * requeueing packets from the backlog to the prequeue
1699                          * when it is non-empty; that is more elegant but eats
1700                          * cycles, unfortunately.
1700                          */
1701                         if (skb_queue_len(&tp->ucopy.prequeue))
1702                                 goto do_prequeue;
1703
1704                         /* __ Set realtime policy in scheduler __ */
1705                 }
1706
1707                 if (copied >= target) {
1708                         /* Do not sleep, just process backlog. */
1709                         release_sock(sk);
1710                         lock_sock(sk);
1711                 } else {
1712                         timeo = tcp_data_wait(sk, timeo);
1713                 }
1714
1715                 if (user_recv) {
1716                         int chunk;
1717
1718                         /* __ Restore normal policy in scheduler __ */
1719
1720                         if ((chunk = len - tp->ucopy.len) != 0) {
1721                                 NET_ADD_STATS_USER(TCPDirectCopyFromBacklog, chunk);
1722                                 len -= chunk;
1723                                 copied += chunk;
1724                         }
1725
1726                         if (tp->rcv_nxt == tp->copied_seq &&
1727                             skb_queue_len(&tp->ucopy.prequeue)) {
1728 do_prequeue:
1729                                 tcp_prequeue_process(sk);
1730
1731                                 if ((chunk = len - tp->ucopy.len) != 0) {
1732                                         NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1733                                         len -= chunk;
1734                                         copied += chunk;
1735                                 }
1736                         }
1737                 }
1738                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1739                         if (net_ratelimit())
1740                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1741                                        current->comm, current->pid);
1742                         peek_seq = tp->copied_seq;
1743                 }
1744                 continue;
1745
1746         found_ok_skb:
1747                 /* Ok so how much can we use? */
1748                 used = skb->len - offset;
1749                 if (len < used)
1750                         used = len;
1751
1752                 /* Do we have urgent data here? */
1753                 if (tp->urg_data) {
1754                         u32 urg_offset = tp->urg_seq - *seq;
1755                         if (urg_offset < used) {
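                                        /* The urgent byte falls inside this
                                         * chunk: skip over it unless
                                         * SOCK_URGINLINE, or stop the copy just
                                         * short of it so it is handled on the
                                         * next pass.
                                         */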
1756                                 if (!urg_offset) {
1757                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1758                                                 ++*seq;
1759                                                 offset++;
1760                                                 used--;
1761                                                 if (!used)
1762                                                         goto skip_copy;
1763                                         }
1764                                 } else
1765                                         used = urg_offset;
1766                         }
1767                 }
1768
1769                 if (!(flags & MSG_TRUNC)) {
1770                         err = skb_copy_datagram_iovec(skb, offset,
1771                                                       msg->msg_iov, used);
1772                         if (err) {
1773                                 /* Exception. Bailout! */
1774                                 if (!copied)
1775                                         copied = -EFAULT;
1776                                 break;
1777                         }
1778                 }
1779
1780                 *seq += used;
1781                 copied += used;
1782                 len -= used;
1783
1784 skip_copy:
1785                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1786                         tp->urg_data = 0;
1787                         tcp_fast_path_check(sk, tp);
1788                 }
1789                 if (used + offset < skb->len)
1790                         continue;
1791
1792                 if (skb->h.th->fin)
1793                         goto found_fin_ok;
1794                 if (!(flags & MSG_PEEK))
1795                         tcp_eat_skb(sk, skb);
1796                 continue;
1797
1798         found_fin_ok:
1799                 /* Process the FIN. */
1800                 ++*seq;
1801                 if (!(flags & MSG_PEEK))
1802                         tcp_eat_skb(sk, skb);
1803                 break;
1804         } while (len > 0);
1805
1806         if (user_recv) {
1807                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1808                         int chunk;
1809
1810                         tp->ucopy.len = copied > 0 ? len : 0;
1811
1812                         tcp_prequeue_process(sk);
1813
1814                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1815                                 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1816                                 len -= chunk;
1817                                 copied += chunk;
1818                         }
1819                 }
1820
1821                 tp->ucopy.task = NULL;
1822                 tp->ucopy.len = 0;
1823         }
1824
1825         /* According to UNIX98, msg_name/msg_namelen are ignored
1826          * on connected socket. I was just happy when found this 8) --ANK
1827          */
1828
1829         /* Clean up data we have read: This will do ACK frames. */
1830         cleanup_rbuf(sk, copied);
1831
1832         TCP_CHECK_TIMER(sk);
1833         release_sock(sk);
1834         return copied;
1835
1836 out:
1837         TCP_CHECK_TIMER(sk);
1838         release_sock(sk);
1839         return err;
1840
1841 recv_urg:
1842         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1843         goto out;
1844 }
1845
1846 /*
1847  *      State processing on a close. This implements the state shift for
1848  *      sending our FIN frame. Note that we only send a FIN for some
1849  *      states. A shutdown() may have already sent the FIN, or we may be
1850  *      closed.
1851  */
1852
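     /* Each entry holds the next state in its low bits (TCP_STATE_MASK);
      * the TCP_ACTION_FIN bit tells tcp_close_state() that a FIN must be
      * sent as part of the transition.
      */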
1853 static unsigned char new_state[16] = {
1854   /* current state:        new state:      action:      */
1855   /* (Invalid)          */ TCP_CLOSE,
1856   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1857   /* TCP_SYN_SENT       */ TCP_CLOSE,
1858   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1859   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1860   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1861   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1862   /* TCP_CLOSE          */ TCP_CLOSE,
1863   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1864   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1865   /* TCP_LISTEN         */ TCP_CLOSE,
1866   /* TCP_CLOSING        */ TCP_CLOSING,
1867 };
1868
1869 static int tcp_close_state(struct sock *sk)
1870 {
1871         int next = (int)new_state[sk->sk_state];
1872         int ns = next & TCP_STATE_MASK;
1873
1874         tcp_set_state(sk, ns);
1875
1876         return next & TCP_ACTION_FIN;
1877 }
1878
1879 /*
1880  *      Shutdown the sending side of a connection. Much like close except
1881  *      that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1882  */
1883
1884 void tcp_shutdown(struct sock *sk, int how)
1885 {
1886         /*      We need to grab some memory, and put together a FIN,
1887          *      and then put it into the queue to be sent.
1888          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1889          */
1890         if (!(how & SEND_SHUTDOWN))
1891                 return;
1892
1893         /* If we've already sent a FIN, or it's a closed state, skip this. */
1894         if ((1 << sk->sk_state) &
1895             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1896              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1897                 /* Clear out any half completed packets.  FIN if needed. */
1898                 if (tcp_close_state(sk))
1899                         tcp_send_fin(sk);
1900         }
1901 }
1902
1903
1904 /*
1905  *      Return 1 if we still have things to send in our buffers.
1906  */
1907
1908 static inline int closing(struct sock *sk)
1909 {
1910         return (1 << sk->sk_state) &
1911                (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
1912 }
1913
1914 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1915 {
1916         /* First the read buffer. */
1917         __skb_queue_purge(&sk->sk_receive_queue);
1918
1919         /* Next, the error queue. */
1920         __skb_queue_purge(&sk->sk_error_queue);
1921
1922         /* Next, the write queue. */
1923         BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
1924
1925         /* Account for returned memory. */
1926         tcp_mem_reclaim(sk);
1927
1928         BUG_TRAP(!sk->sk_wmem_queued);
1929         BUG_TRAP(!sk->sk_forward_alloc);
1930
1931         /* It is _impossible_ for the backlog to contain anything
1932          * when we get here.  All user references to this socket
1933          * have gone away; only the net layer can touch it.
1934          */
1935 }
1936
1937 /*
1938  * At this point, there should be no process reference to this
1939  * socket, and thus no user references at all.  Therefore we
1940  * can assume the socket waitqueue is inactive and nobody will
1941  * try to jump onto it.
1942  */
1943 void tcp_destroy_sock(struct sock *sk)
1944 {
1945         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1946         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1947
1948         /* It cannot be in hash table! */
1949         BUG_TRAP(sk_unhashed(sk));
1950
1951         /* If it has a non-zero inet_sk(sk)->num, it must be bound. */
1952         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1953
1954 #ifdef TCP_DEBUG
1955         if (sk->sk_zapped) {
1956                 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1957                 sock_hold(sk);
1958         }
1959         sk->sk_zapped = 1;
1960 #endif
1961
1962         sk->sk_prot->destroy(sk);
1963
1964         tcp_kill_sk_queues(sk);
1965
1966         xfrm_sk_free_policy(sk);
1967
1968 #ifdef INET_REFCNT_DEBUG
1969         if (atomic_read(&sk->sk_refcnt) != 1) {
1970                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1971                        sk, atomic_read(&sk->sk_refcnt));
1972         }
1973 #endif
1974
1975         atomic_dec(&tcp_orphan_count);
1976         sock_put(sk);
1977 }
1978
1979 void tcp_close(struct sock *sk, long timeout)
1980 {
1981         struct sk_buff *skb;
1982         int data_was_unread = 0;
1983
1984         lock_sock(sk);
1985         sk->sk_shutdown = SHUTDOWN_MASK;
1986
1987         if (sk->sk_state == TCP_LISTEN) {
1988                 tcp_set_state(sk, TCP_CLOSE);
1989
1990                 /* Special case. */
1991                 tcp_listen_stop(sk);
1992
1993                 goto adjudge_to_death;
1994         }
1995
1996         /*  We need to flush the recv. buffs.  We do this only on the
1997          *  descriptor close, not protocol-sourced closes, because the
1998          *  reader process may not have drained the data yet!
1999          */
2000         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2001                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
2002                           skb->h.th->fin;
2003                 data_was_unread += len;
2004                 __kfree_skb(skb);
2005         }
2006
2007         tcp_mem_reclaim(sk);
2008
2009         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
2010          * 3.10, we send a RST here because data was lost.  To
2011          * witness the awful effects of the old behavior of always
2012          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
2013          * a bulk GET in an FTP client, suspend the process, wait
2014          * for the client to advertise a zero window, then kill -9
2015          * the FTP client, wheee...  Note: timeout is always zero
2016          * in such a case.
2017          */
2018         if (data_was_unread) {
2019                 /* Unread data was tossed, zap the connection. */
2020                 NET_INC_STATS_USER(TCPAbortOnClose);
2021                 tcp_set_state(sk, TCP_CLOSE);
2022                 tcp_send_active_reset(sk, GFP_KERNEL);
2023         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2024                 /* Check zero linger _after_ checking for unread data. */
2025                 sk->sk_prot->disconnect(sk, 0);
2026                 NET_INC_STATS_USER(TCPAbortOnData);
2027         } else if (tcp_close_state(sk)) {
2028                 /* We FIN if the application ate all the data before
2029                  * zapping the connection.
2030                  */
2031
2032                 /* RED-PEN. Formally speaking, we have broken TCP state
2033                  * machine. State transitions:
2034                  *
2035                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
2036                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
2037                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
2038                  *
2039                  * are legal only when FIN has been sent (i.e. in window),
2040                  * rather than queued out of window. Purists blame.
2041                  *
2042                  * F.e. "RFC state" is ESTABLISHED,
2043                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
2044                  *
2045                  * The visible deviations are that we sometimes enter the
2046                  * time-wait state when it is not really required (harmless),
2047                  * and do not send active resets when the specs require them
2048                  * (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when they look like
2049                  * CLOSING or LAST_ACK to Linux).
2050                  * Probably, I missed some more holelets.
2051                  *                                              --ANK
2052                  */
2053                 tcp_send_fin(sk);
2054         }
2055
2056         if (timeout) {
2057                 struct task_struct *tsk = current;
2058                 DEFINE_WAIT(wait);
2059
2060                 do {
2061                         prepare_to_wait(sk->sk_sleep, &wait,
2062                                         TASK_INTERRUPTIBLE);
2063                         if (!closing(sk))
2064                                 break;
2065                         release_sock(sk);
2066                         timeout = schedule_timeout(timeout);
2067                         lock_sock(sk);
2068                 } while (!signal_pending(tsk) && timeout);
2069
2070                 finish_wait(sk->sk_sleep, &wait);
2071         }
2072
2073 adjudge_to_death:
2074         /* It is the last release_sock in its life. It will remove backlog. */
2075         release_sock(sk);
2076
2077
2078         /* Now socket is owned by kernel and we acquire BH lock
2079            to finish close. No need to check for user refs.
2080          */
2081         local_bh_disable();
2082         bh_lock_sock(sk);
2083         BUG_TRAP(!sock_owned_by_user(sk));
2084
2085         sock_hold(sk);
2086         sock_orphan(sk);
2087
2088         /*      This is a (useful) BSD violation of the RFC. There is a
2089          *      problem with TCP as specified, in that the other end could
2090          *      keep a socket open forever with no application left at this
2091          *      end. We use a 3 minute timeout (about the same as BSD), then
2092          *      kill our end. If they send after that then tough - BUT it is
2093          *      long enough that we won't repeat the old "4*rto = almost no
2094          *      time - whoops, reset" mistake.
2095          *
2096          *      Nope, it was not a mistake. It is really the desired behaviour
2097          *      f.e. on http servers, when such sockets are useless, but
2098          *      consume significant resources. Let's do it with special
2099          *      linger2 option.                                 --ANK
2100          */
2101
2102         if (sk->sk_state == TCP_FIN_WAIT2) {
2103                 struct tcp_opt *tp = tcp_sk(sk);
2104                 if (tp->linger2 < 0) {
2105                         tcp_set_state(sk, TCP_CLOSE);
2106                         tcp_send_active_reset(sk, GFP_ATOMIC);
2107                         NET_INC_STATS_BH(TCPAbortOnLinger);
2108                 } else {
2109                         int tmo = tcp_fin_time(tp);
2110
2111                         if (tmo > TCP_TIMEWAIT_LEN) {
2112                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2113                         } else {
2114                                 atomic_inc(&tcp_orphan_count);
2115                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2116                                 goto out;
2117                         }
2118                 }
2119         }
2120         if (sk->sk_state != TCP_CLOSE) {
2121                 tcp_mem_reclaim(sk);
2122                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2123                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
2124                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2125                         if (net_ratelimit())
2126                                 printk(KERN_INFO "TCP: too many orphaned "
2127                                        "sockets\n");
2128                         tcp_set_state(sk, TCP_CLOSE);
2129                         tcp_send_active_reset(sk, GFP_ATOMIC);
2130                         NET_INC_STATS_BH(TCPAbortOnMemory);
2131                 }
2132         }
2133         atomic_inc(&tcp_orphan_count);
2134
2135         if (sk->sk_state == TCP_CLOSE)
2136                 tcp_destroy_sock(sk);
2137         /* Otherwise, socket is reprieved until protocol close. */
2138
2139 out:
2140         bh_unlock_sock(sk);
2141         local_bh_enable();
2142         sock_put(sk);
2143 }
2144
2145 /* These states need RST on ABORT according to RFC793 */
2146
2147 static inline int tcp_need_reset(int state)
2148 {
2149         return (1 << state) &
2150                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2151                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2152 }
2153
2154 int tcp_disconnect(struct sock *sk, int flags)
2155 {
2156         struct inet_opt *inet = inet_sk(sk);
2157         struct tcp_opt *tp = tcp_sk(sk);
2158         int err = 0;
2159         int old_state = sk->sk_state;
2160
2161         if (old_state != TCP_CLOSE)
2162                 tcp_set_state(sk, TCP_CLOSE);
2163
2164         /* ABORT function of RFC793 */
2165         if (old_state == TCP_LISTEN) {
2166                 tcp_listen_stop(sk);
2167         } else if (tcp_need_reset(old_state) ||
2168                    (tp->snd_nxt != tp->write_seq &&
2169                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2170                 /* The last check adjusts for the discrepancy between Linux
2171                  * and RFC states.
2172                  */
2173                 tcp_send_active_reset(sk, gfp_any());
2174                 sk->sk_err = ECONNRESET;
2175         } else if (old_state == TCP_SYN_SENT)
2176                 sk->sk_err = ECONNRESET;
2177
2178         tcp_clear_xmit_timers(sk);
2179         __skb_queue_purge(&sk->sk_receive_queue);
2180         tcp_writequeue_purge(sk);
2181         __skb_queue_purge(&tp->out_of_order_queue);
2182
2183         inet->dport = 0;
2184
2185         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2186                 inet_reset_saddr(sk);
2187
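        /* Reset per-connection state (RTT estimate, congestion window,
         * retransmission and SACK bookkeeping) so the socket can be
         * reused for a fresh connection.
         */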
2188         sk->sk_shutdown = 0;
2189         sock_reset_flag(sk, SOCK_DONE);
2190         tp->srtt = 0;
2191         if ((tp->write_seq += tp->max_window + 2) == 0)
2192                 tp->write_seq = 1;
2193         tp->backoff = 0;
2194         tp->snd_cwnd = 2;
2195         tp->probes_out = 0;
2196         tp->packets_out = 0;
2197         tp->snd_ssthresh = 0x7fffffff;
2198         tp->snd_cwnd_cnt = 0;
2199         tcp_set_ca_state(tp, TCP_CA_Open);
2200         tcp_clear_retrans(tp);
2201         tcp_delack_init(tp);
2202         tp->send_head = NULL;
2203         tp->saw_tstamp = 0;
2204         tcp_sack_reset(tp);
2205         __sk_dst_reset(sk);
2206
2207         BUG_TRAP(!inet->num || tp->bind_hash);
2208
2209         sk->sk_error_report(sk);
2210         return err;
2211 }
2212
2213 /*
2214  *      Wait for an incoming connection, avoid race
2215  *      conditions. This must be called with the socket locked.
2216  */
2217 static int wait_for_connect(struct sock *sk, long timeo)
2218 {
2219         struct tcp_opt *tp = tcp_sk(sk);
2220         DEFINE_WAIT(wait);
2221         int err;
2222
2223         /*
2224          * True wake-one mechanism for incoming connections: only
2225          * one process gets woken up, not the 'whole herd'.
2226          * Since we do not 'race & poll' for established sockets
2227          * anymore, the common case will execute the loop only once.
2228          *
2229          * Subtle issue: "add_wait_queue_exclusive()" will be added
2230          * after any current non-exclusive waiters, and we know that
2231          * it will always _stay_ after any new non-exclusive waiters
2232          * because all non-exclusive waiters are added at the
2233          * beginning of the wait-queue. As such, it's ok to "drop"
2234          * our exclusiveness temporarily when we get woken up without
2235          * having to remove and re-insert us on the wait queue.
2236          */
2237         for (;;) {
2238                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
2239                                           TASK_INTERRUPTIBLE);
2240                 release_sock(sk);
2241                 if (!tp->accept_queue)
2242                         timeo = schedule_timeout(timeo);
2243                 lock_sock(sk);
2244                 err = 0;
2245                 if (tp->accept_queue)
2246                         break;
2247                 err = -EINVAL;
2248                 if (sk->sk_state != TCP_LISTEN)
2249                         break;
2250                 err = sock_intr_errno(timeo);
2251                 if (signal_pending(current))
2252                         break;
2253                 err = -EAGAIN;
2254                 if (!timeo)
2255                         break;
2256         }
2257         finish_wait(sk->sk_sleep, &wait);
2258         return err;
2259 }
2260
2261 /*
2262  *      This will accept the next outstanding connection.
2263  */
2264
2265 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2266 {
2267         struct tcp_opt *tp = tcp_sk(sk);
2268         struct open_request *req;
2269         struct sock *newsk;
2270         int error;
2271 #ifdef CONFIG_ACCEPT_QUEUES     
2272         int prev_class = 0;
2273         int first;
2274 #endif
2275
2276         lock_sock(sk);
2277
2278         /* We need to make sure that this socket is listening,
2279          * and that it has something pending.
2280          */
2281         error = -EINVAL;
2282         if (sk->sk_state != TCP_LISTEN)
2283                 goto out;
2284
2285         /* Find already established connection */
2286         if (!tp->accept_queue) {
2287                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2288                 /* If this is a non blocking socket don't sleep */
2289                 error = -EAGAIN;
2290                 if (!timeo)
2291                         goto out;
2292
2293                 error = wait_for_connect(sk, timeo);
2294                 if (error)
2295                         goto out;
2296         }
2297
2298 #ifndef CONFIG_ACCEPT_QUEUES
2299         req = tp->accept_queue;
2300         if ((tp->accept_queue = req->dl_next) == NULL)
2301                 tp->accept_queue_tail = NULL;
2302
2303         tcp_acceptq_removed(sk);
2304 #else
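        /* Weighted round-robin over the accept classes: starting from
         * class_index, skip classes with nothing queued, take a request
         * from the first non-empty one, and advance class_index once
         * aq_ratio requests have been accepted from that class.
         */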
2305         first = tp->class_index;
2306         /* We should always have a request queued here. The accept_queue
2307          * is already checked for NULL above.
2308          */
2309         while (!tp->acceptq[first].aq_head) {
2310                 tp->acceptq[first].aq_cnt = 0;
2311                 first = (first + 1) & ~NUM_ACCEPT_QUEUES;
2312         }
2313         req = tp->acceptq[first].aq_head;
2314         tp->acceptq[first].aq_qcount--;
2315         tp->acceptq[first].aq_count++;
2316         tp->acceptq[first].aq_wait_time += (jiffies - req->acceptq_time_stamp);
2317
2318         for (prev_class = first - 1; prev_class >= 0; prev_class--)
2319                 if (tp->acceptq[prev_class].aq_tail)
2320                         break;
2321         if (prev_class >= 0)
2322                 tp->acceptq[prev_class].aq_tail->dl_next = req->dl_next;
2323         else
2324                 tp->accept_queue = req->dl_next;
2325
2326         if (req == tp->acceptq[first].aq_tail)
2327                 tp->acceptq[first].aq_head = tp->acceptq[first].aq_tail = NULL;
2328         else
2329                 tp->acceptq[first].aq_head = req->dl_next;
2330
2331         if (++(tp->acceptq[first].aq_cnt) >= tp->acceptq[first].aq_ratio) {
2332                 tp->acceptq[first].aq_cnt = 0;
2333                 tp->class_index = ++first & ~NUM_ACCEPT_QUEUES;
2334         }
2335         tcp_acceptq_removed(sk, req->acceptq_class);
2336 #endif
2337         newsk = req->sk;
2338         tcp_openreq_fastfree(req);
2339         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2340         release_sock(sk);
2341         return newsk;
2342
2343 out:
2344         release_sock(sk);
2345         *err = error;
2346         return NULL;
2347 }
2348
2349 /*
2350  *      Socket option code for TCP.
2351  */
2352 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2353                    int optlen)
2354 {
2355         struct tcp_opt *tp = tcp_sk(sk);
2356         int val;
2357         int err = 0;
2358
2359         if (level != SOL_TCP)
2360                 return tp->af_specific->setsockopt(sk, level, optname,
2361                                                    optval, optlen);
2362
2363         if (optlen < sizeof(int))
2364                 return -EINVAL;
2365
2366         if (get_user(val, (int *)optval))
2367                 return -EFAULT;
2368
2369         lock_sock(sk);
2370
2371         switch (optname) {
2372         case TCP_MAXSEG:
2373                 /* Values greater than the interface MTU won't take effect.
2374                  * However, at the point when this call is made we typically
2375                  * don't yet know which interface is going to be used. */
2376                 if (val < 8 || val > MAX_TCP_WINDOW) {
2377                         err = -EINVAL;
2378                         break;
2379                 }
2380                 tp->user_mss = val;
2381                 break;
2382
2383         case TCP_NODELAY:
2384                 if (val) {
2385                         /* TCP_NODELAY is weaker than TCP_CORK, so that
2386                          * this option on corked socket is remembered, but
2387                          * it is not activated until cork is cleared.
2388                          *
2389                          * However, when TCP_NODELAY is set we make
2390                          * an explicit push, which overrides even TCP_CORK
2391                          * for currently queued segments.
2392                          */
2393                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2394                         tcp_push_pending_frames(sk, tp);
2395                 } else {
2396                         tp->nonagle &= ~TCP_NAGLE_OFF;
2397                 }
2398                 break;
2399
2400         case TCP_CORK:
2401                 /* When set, indicates that we should always queue non-full frames.
2402                  * Later the user clears this option and we transmit
2403                  * any pending partial frames in the queue.  This is
2404                  * meant to be used alongside sendfile() to get properly
2405                  * filled frames when the user (for example) must write
2406                  * out headers with a write() call first and then use
2407                  * sendfile to send out the data parts.
2408                  *
2409                  * TCP_CORK can be set together with TCP_NODELAY and it is
2410                  * stronger than TCP_NODELAY.
2411                  */
2412                 if (val) {
2413                         tp->nonagle |= TCP_NAGLE_CORK;
2414                 } else {
2415                         tp->nonagle &= ~TCP_NAGLE_CORK;
2416                         if (tp->nonagle&TCP_NAGLE_OFF)
2417                                 tp->nonagle |= TCP_NAGLE_PUSH;
2418                         tcp_push_pending_frames(sk, tp);
2419                 }
2420                 break;
2421
2422         case TCP_KEEPIDLE:
2423                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2424                         err = -EINVAL;
2425                 else {
2426                         tp->keepalive_time = val * HZ;
2427                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2428                             !((1 << sk->sk_state) &
2429                               (TCPF_CLOSE | TCPF_LISTEN))) {
2430                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2431                                 if (tp->keepalive_time > elapsed)
2432                                         elapsed = tp->keepalive_time - elapsed;
2433                                 else
2434                                         elapsed = 0;
2435                                 tcp_reset_keepalive_timer(sk, elapsed);
2436                         }
2437                 }
2438                 break;
2439         case TCP_KEEPINTVL:
2440                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2441                         err = -EINVAL;
2442                 else
2443                         tp->keepalive_intvl = val * HZ;
2444                 break;
2445         case TCP_KEEPCNT:
2446                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2447                         err = -EINVAL;
2448                 else
2449                         tp->keepalive_probes = val;
2450                 break;
2451         case TCP_SYNCNT:
2452                 if (val < 1 || val > MAX_TCP_SYNCNT)
2453                         err = -EINVAL;
2454                 else
2455                         tp->syn_retries = val;
2456                 break;
2457
2458         case TCP_LINGER2:
2459                 if (val < 0)
2460                         tp->linger2 = -1;
2461                 else if (val > sysctl_tcp_fin_timeout / HZ)
2462                         tp->linger2 = 0;
2463                 else
2464                         tp->linger2 = val * HZ;
2465                 break;
2466
2467         case TCP_DEFER_ACCEPT:
2468                 tp->defer_accept = 0;
2469                 if (val > 0) {
2470                         /* Translate value in seconds to number of
2471                          * retransmits */
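                        /* Find how many exponentially backed-off retransmit
                         * intervals (base TCP_TIMEOUT_INIT) are needed to
                         * cover 'val' seconds; the stored value is that count
                         * plus one (zero means the option is disabled).
                         */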
2472                         while (tp->defer_accept < 32 &&
2473                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2474                                        tp->defer_accept))
2475                                 tp->defer_accept++;
2476                         tp->defer_accept++;
2477                 }
2478                 break;
2479
2480         case TCP_WINDOW_CLAMP:
2481                 if (!val) {
2482                         if (sk->sk_state != TCP_CLOSE) {
2483                                 err = -EINVAL;
2484                                 break;
2485                         }
2486                         tp->window_clamp = 0;
2487                 } else
2488                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2489                                                 SOCK_MIN_RCVBUF / 2 : val;
2490                 break;
2491
2492         case TCP_QUICKACK:
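                /* A non-zero value disables delayed-ACK (pingpong) mode and,
                 * if an ACK is already scheduled on an established connection,
                 * pushes it out immediately via cleanup_rbuf(); an even value
                 * then turns pingpong mode back on.
                 */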
2493                 if (!val) {
2494                         tp->ack.pingpong = 1;
2495                 } else {
2496                         tp->ack.pingpong = 0;
2497                         if ((1 << sk->sk_state) &
2498                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2499                             tcp_ack_scheduled(tp)) {
2500                                 tp->ack.pending |= TCP_ACK_PUSHED;
2501                                 cleanup_rbuf(sk, 1);
2502                                 if (!(val & 1))
2503                                         tp->ack.pingpong = 1;
2504                         }
2505                 }
2506                 break;
2507                 
2508 #ifdef CONFIG_ACCEPT_QUEUES
2509         case TCP_ACCEPTQ_SHARE:
2510                 {
2511                         char share_wt[NUM_ACCEPT_QUEUES];
2512                         int i, j;
2513
2514                         if (sk->sk_state != TCP_LISTEN) {
2515                                 err = -EOPNOTSUPP;
2516                                 break;
2517                         }
2518
2519                         if (copy_from_user(share_wt, optval,
2520                                            min_t(unsigned int, optlen,
2521                                                  sizeof(share_wt)))) {
2522                                 err = -EFAULT;
2523                                 break;
2524                         }
2525                         j = 0;
2526                         for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
2527                                 if (share_wt[i]) {
2528                                         /* Track the smallest non-zero share. */
2529                                         if (!j || share_wt[i] < j)
2530                                                 j = share_wt[i];
2531                                         tp->acceptq[i].aq_valid = 1;
2532                                 } else
2533                                         tp->acceptq[i].aq_valid = 0;
2534                         }
2535                         if (j == 0) {
2536                                 /* Class 0 is always valid. If nothing is
2537                                  * specified, set class 0's share to 1.
2538                                  */
2539                                 share_wt[0] = 1;
2540                                 tp->acceptq[0].aq_valid = 1;
2541                                 j = 1;
2542                         }
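                        /* j is now the smallest non-zero share; aq_ratio
                         * becomes each class's share relative to it, which
                         * drives the weighted round-robin in tcp_accept().
                         */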
2543                         for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
2544                                 tp->acceptq[i].aq_ratio = share_wt[i] / j;
2545                                 tp->acceptq[i].aq_cnt = 0;
2546                         }
2547                 }
2548                 break;
2549 #endif
2550
2551         default:
2552                 err = -ENOPROTOOPT;
2553                 break;
2554         };
2555         release_sock(sk);
2556         return err;
2557 }
2558
2559 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2560                    int *optlen)
2561 {
2562         struct tcp_opt *tp = tcp_sk(sk);
2563         int val, len;
2564
2565         if (level != SOL_TCP)
2566                 return tp->af_specific->getsockopt(sk, level, optname,
2567                                                    optval, optlen);
2568
2569         if (get_user(len, optlen))
2570                 return -EFAULT;
2571
2572         len = min_t(unsigned int, len, sizeof(int));
2573
2574         if (len < 0)
2575                 return -EINVAL;
2576
2577         switch (optname) {
2578         case TCP_MAXSEG:
2579                 val = tp->mss_cache_std;
2580                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2581                         val = tp->user_mss;
2582                 break;
2583         case TCP_NODELAY:
2584                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2585                 break;
2586         case TCP_CORK:
2587                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2588                 break;
2589         case TCP_KEEPIDLE:
2590                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2591                 break;
2592         case TCP_KEEPINTVL:
2593                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2594                 break;
2595         case TCP_KEEPCNT:
2596                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2597                 break;
2598         case TCP_SYNCNT:
2599                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2600                 break;
2601         case TCP_LINGER2:
2602                 val = tp->linger2;
2603                 if (val >= 0)
2604                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2605                 break;
2606         case TCP_DEFER_ACCEPT:
2607                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2608                                                (tp->defer_accept - 1));
2609                 break;
2610         case TCP_WINDOW_CLAMP:
2611                 val = tp->window_clamp;
2612                 break;
2613         case TCP_INFO: {
2614                 struct tcp_info info;
2615                 u32 now = tcp_time_stamp;
2616
2617                 if (get_user(len, optlen))
2618                         return -EFAULT;
2619                 info.tcpi_state = sk->sk_state;
2620                 info.tcpi_ca_state = tp->ca_state;
2621                 info.tcpi_retransmits = tp->retransmits;
2622                 info.tcpi_probes = tp->probes_out;
2623                 info.tcpi_backoff = tp->backoff;
2624                 info.tcpi_options = 0;
2625                 if (tp->tstamp_ok)
2626                         info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2627                 if (tp->sack_ok)
2628                         info.tcpi_options |= TCPI_OPT_SACK;
2629                 if (tp->wscale_ok) {
2630                         info.tcpi_options |= TCPI_OPT_WSCALE;
2631                         info.tcpi_snd_wscale = tp->snd_wscale;
2632                         info.tcpi_rcv_wscale = tp->rcv_wscale;
2633                 } else {
2634                         info.tcpi_snd_wscale = 0;
2635                         info.tcpi_rcv_wscale = 0;
2636                 }
2637                 if (tp->ecn_flags & TCP_ECN_OK)
2638                         info.tcpi_options |= TCPI_OPT_ECN;
2639
2640                 info.tcpi_rto = (1000000 * tp->rto) / HZ;
2641                 info.tcpi_ato = (1000000 * tp->ack.ato) / HZ;
2642                 info.tcpi_snd_mss = tp->mss_cache_std;
2643                 info.tcpi_rcv_mss = tp->ack.rcv_mss;
2644
2645                 info.tcpi_unacked = tp->packets_out;
2646                 info.tcpi_sacked = tp->sacked_out;
2647                 info.tcpi_lost = tp->lost_out;
2648                 info.tcpi_retrans = tp->retrans_out;
2649                 info.tcpi_fackets = tp->fackets_out;
2650
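                     /* Event timestamps, reported in milliseconds since the
                      * event last occurred.  The time of the last ACK we sent
                      * is not tracked, so it is reported as 0.
                      */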
2651                 info.tcpi_last_data_sent = ((now - tp->lsndtime) * 1000) / HZ;
2652                 info.tcpi_last_ack_sent = 0;
2653                 info.tcpi_last_data_recv = ((now -
2654                                              tp->ack.lrcvtime) * 1000) / HZ;
2655                 info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp) * 1000) / HZ;
2656
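                     /* srtt is kept left-shifted by 3 and mdev by 2; convert to
                      * microseconds and strip the fixed-point scaling.
                      */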
2657                 info.tcpi_pmtu = tp->pmtu_cookie;
2658                 info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2659                 info.tcpi_rtt = ((1000000 * tp->srtt) / HZ) >> 3;
2660                 info.tcpi_rttvar = ((1000000 * tp->mdev) / HZ) >> 2;
2661                 info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2662                 info.tcpi_snd_cwnd = tp->snd_cwnd;
2663                 info.tcpi_advmss = tp->advmss;
2664                 info.tcpi_reordering = tp->reordering;
2665
2666                 len = min_t(unsigned int, len, sizeof(info));
2667                 if (put_user(len, optlen))
2668                         return -EFAULT;
2669                 if (copy_to_user(optval, &info, len))
2670                         return -EFAULT;
2671                 return 0;
2672         }
2673         case TCP_QUICKACK:
2674                 val = !tp->ack.pingpong;
2675                 break;
2676
2677 #ifdef CONFIG_ACCEPT_QUEUES
2678         case TCP_ACCEPTQ_SHARE: {
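                     /* Per-queue accept statistics (CONFIG_ACCEPT_QUEUES only):
                      * wait times are scaled from jiffies to USER_HZ ticks, and
                      * shares are reported as 0 for unconfigured queues.
                      */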
2679                 struct tcp_acceptq_info tinfo[NUM_ACCEPT_QUEUES];
2680                 int i;
2681
2682                 if (sk->sk_state != TCP_LISTEN)
2683                         return -EOPNOTSUPP;
2684
2685                 if (get_user(len, optlen))
2686                         return -EFAULT;
2687
2688                 memset(tinfo, 0, sizeof(tinfo));
2689
2690                 for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
2691                         tinfo[i].acceptq_wait_time =
2692                                 tp->acceptq[i].aq_wait_time / (HZ / USER_HZ);
2693                         tinfo[i].acceptq_qcount = tp->acceptq[i].aq_qcount;
2694                         tinfo[i].acceptq_count = tp->acceptq[i].aq_count;
2695                         if (tp->acceptq[i].aq_valid)
2696                                 tinfo[i].acceptq_shares = tp->acceptq[i].aq_ratio;
2697                         else
2698                                 tinfo[i].acceptq_shares = 0;
2699                 }
2700
2701                 len = min_t(unsigned int, len, sizeof(tinfo));
2702                 if (put_user(len, optlen))
2703                         return -EFAULT;
2704
2705                 if (copy_to_user(optval, (char *)tinfo, len))
2706                         return -EFAULT;
2707
2708                 return 0;
2709         }
2710 #endif
2711         default:
2712                 return -ENOPROTOOPT;
2713         }
2714
2715         if (put_user(len, optlen))
2716                 return -EFAULT;
2717         if (copy_to_user(optval, &val, len))
2718                 return -EFAULT;
2719         return 0;
2720 }
2721
2722
2723 extern void __skb_cb_too_small_for_tcp(int, int);
2724 extern void tcpdiag_init(void);
2725
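     /* Optional "thash_entries=" boot parameter: overrides the automatic
      * sizing of the TCP established hash table in tcp_init().
      */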
2726 static __initdata unsigned long thash_entries;
2727 static int __init set_thash_entries(char *str)
2728 {
2729         if (!str)
2730                 return 0;
2731         thash_entries = simple_strtoul(str, &str, 0);
2732         return 1;
2733 }
2734 __setup("thash_entries=", set_thash_entries);
2735
2736 void __init tcp_init(void)
2737 {
2738         struct sk_buff *skb = NULL;
2739         unsigned long goal;
2740         int order, i;
2741
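             /* Build-time sanity check: __skb_cb_too_small_for_tcp() is
              * deliberately left undefined, so if tcp_skb_cb ever outgrows
              * skb->cb[] the call below survives dead-code elimination and
              * the link fails.
              */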
2742         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2743                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2744                                            sizeof(skb->cb));
2745
2746         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2747                                                sizeof(struct open_request),
2748                                                0, SLAB_HWCACHE_ALIGN,
2749                                                NULL, NULL);
2750         if (!tcp_openreq_cachep)
2751                 panic("tcp_init: Cannot alloc open_request cache.");
2752
2753         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2754                                               sizeof(struct tcp_bind_bucket),
2755                                               0, SLAB_HWCACHE_ALIGN,
2756                                               NULL, NULL);
2757         if (!tcp_bucket_cachep)
2758                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2759
2760         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2761                                                 sizeof(struct tcp_tw_bucket),
2762                                                 0, SLAB_HWCACHE_ALIGN,
2763                                                 NULL, NULL);
2764         if (!tcp_timewait_cachep)
2765                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2766
2767         /* Size and allocate the main established and bind bucket
2768          * hash tables.
2769          *
2770          * The methodology is similar to that of the buffer cache.
2771          */
2772         if (num_physpages >= (128 * 1024))
2773                 goal = num_physpages >> (21 - PAGE_SHIFT);
2774         else
2775                 goal = num_physpages >> (23 - PAGE_SHIFT);
2776
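             /* The goal is roughly one page of established hash per 2MB of
              * physical memory on large machines and per 8MB on small ones;
              * "thash_entries=" on the command line overrides this heuristic.
              */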
2777         if (thash_entries)
2778                 goal = (thash_entries * sizeof(struct tcp_ehash_bucket)) >> PAGE_SHIFT;
2779         for (order = 0; (1UL << order) < goal; order++)
2780                 ;
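             /* Round the table down to a power-of-two number of buckets.  Only
              * half of the entries hash established sockets; the other half is
              * used for TIME_WAIT sockets, hence the "<< 1" when initializing
              * and reporting the size below.
              */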
2781         do {
2782                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2783                         sizeof(struct tcp_ehash_bucket);
2784                 tcp_ehash_size >>= 1;
2785                 while (tcp_ehash_size & (tcp_ehash_size - 1))
2786                         tcp_ehash_size--;
2787                 tcp_ehash = (struct tcp_ehash_bucket *)
2788                         __get_free_pages(GFP_ATOMIC, order);
2789         } while (!tcp_ehash && --order > 0);
2790
2791         if (!tcp_ehash)
2792                 panic("Failed to allocate TCP established hash table\n");
2793         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2794                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2795                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2796         }
2797
2798         do {
2799                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2800                         sizeof(struct tcp_bind_hashbucket);
2801                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2802                         continue;
2803                 tcp_bhash = (struct tcp_bind_hashbucket *)
2804                         __get_free_pages(GFP_ATOMIC, order);
2805         } while (!tcp_bhash && --order >= 0);
2806
2807         if (!tcp_bhash)
2808                 panic("Failed to allocate TCP bind hash table\n");
2809         for (i = 0; i < tcp_bhash_size; i++) {
2810                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2811                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2812         }
2813
2814         /* Try to be a bit smarter and adjust defaults depending
2815          * on available memory.
2816          */
2817         if (order > 4) {
2818                 sysctl_local_port_range[0] = 32768;
2819                 sysctl_local_port_range[1] = 61000;
2820                 sysctl_tcp_max_tw_buckets = 180000;
2821                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2822                 sysctl_max_syn_backlog = 1024;
2823         } else if (order < 3) {
2824                 sysctl_local_port_range[0] = 1024 * (3 - order);
2825                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2826                 sysctl_tcp_max_orphans >>= (3 - order);
2827                 sysctl_max_syn_backlog = 128;
2828         }
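             /* Seed the ephemeral port rover just below the local port range so
              * the port search starts at the bottom of the range.
              */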
2829         tcp_port_rover = sysctl_local_port_range[0] - 1;
2830
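             /* Global TCP memory accounting thresholds (low, pressure, high),
              * in pages; keep adjacent thresholds within 512 pages of each
              * other.
              */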
2831         sysctl_tcp_mem[0] =  768 << order;
2832         sysctl_tcp_mem[1] = 1024 << order;
2833         sysctl_tcp_mem[2] = 1536 << order;
2834         if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 512)
2835                 sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 512;
2836         if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 512)
2837                 sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 512;
2838
2839         if (order < 3) {
2840                 sysctl_tcp_wmem[2] = 64 * 1024;
2841                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2842                 sysctl_tcp_rmem[1] = 43689;
2843                 sysctl_tcp_rmem[2] = 2 * 43689;
2844         }
2845
2846         printk(KERN_INFO "TCP: Hash tables configured "
2847                "(established %d bind %d)\n",
2848                tcp_ehash_size << 1, tcp_bhash_size);
2849
2850         tcpdiag_init();
2851 }
2852
2853 EXPORT_SYMBOL(__tcp_mem_reclaim);
2854 EXPORT_SYMBOL(sysctl_tcp_rmem);
2855 EXPORT_SYMBOL(sysctl_tcp_wmem);
2856 EXPORT_SYMBOL(tcp_accept);
2857 EXPORT_SYMBOL(tcp_close);
2858 EXPORT_SYMBOL(tcp_close_state);
2859 EXPORT_SYMBOL(tcp_destroy_sock);
2860 EXPORT_SYMBOL(tcp_disconnect);
2861 EXPORT_SYMBOL(tcp_getsockopt);
2862 EXPORT_SYMBOL(tcp_ioctl);
2863 EXPORT_SYMBOL(tcp_openreq_cachep);
2864 EXPORT_SYMBOL(tcp_poll);
2865 EXPORT_SYMBOL(tcp_read_sock);
2866 EXPORT_SYMBOL(tcp_recvmsg);
2867 EXPORT_SYMBOL(tcp_sendmsg);
2868 EXPORT_SYMBOL(tcp_sendpage);
2869 EXPORT_SYMBOL(tcp_setsockopt);
2870 EXPORT_SYMBOL(tcp_shutdown);
2871 EXPORT_SYMBOL(tcp_sockets_allocated);
2872 EXPORT_SYMBOL(tcp_statistics);
2873 EXPORT_SYMBOL(tcp_timewait_cachep);
2874 EXPORT_SYMBOL(tcp_write_space);