net/ipv4/tcp.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed where wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It
  33  *                                      wakes people on errors. poll
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_send_reset() fixed to work for
  37  *                                      everything not just packets for
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames.
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst
  46  *                                      receive otherwise odd bits of prattle
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug.
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries.
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks,
  69  *                                      so the kernel can layer network
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics.
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle poll() after URG properly in
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in
 110  *                                      tcp_readable(), poll() after URG
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in polling before an
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on
 137  *                                      the RFC's for other useful protocol
 138  *                                      references see: Comer, KA9Q NOS, and
 139  *                                      for a reference on the difference
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC
 147  *                                      and using multiple timers for sanity.
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       poll()->select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if state is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but it's a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
 183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in
 190  *                                      tcp_do_retransmit()
 191  *              Eric Schenk     :       Changed receiver side silly window
 192  *                                      avoidance algorithm to BSD style
 193  *                                      algorithm. This doubles throughput
 194  *                                      against machines running Solaris,
 195  *                                      and seems to result in general
 196  *                                      improvement.
 197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198  *      Willy Konynenberg       :       Transparent proxying support.
 199  *      Mike McLagan            :       Routing by source
 200  *              Keith Owens     :       Do proper merging with partial SKB's in
 201  *                                      tcp_do_sendmsg to avoid burstiness.
 202  *              Eric Schenk     :       Fix fast close down bug with
 203  *                                      shutdown() followed by close().
 204  *              Andi Kleen      :       Make poll agree with SIGIO
 205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
 206  *                                      lingertime == 0 (RFC 793 ABORT Call)
 207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
 208  *                                      csum_and_copy_from_user() if possible.
 209  *
 210  *              This program is free software; you can redistribute it and/or
 211  *              modify it under the terms of the GNU General Public License
 212  *              as published by the Free Software Foundation; either version
 213  *              2 of the License, or(at your option) any later version.
 214  *
 215  * Description of States:
 216  *
 217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 218  *
 219  *      TCP_SYN_RECV            received a connection request, sent ack,
 220  *                              waiting for final ack in three-way handshake.
 221  *
 222  *      TCP_ESTABLISHED         connection established
 223  *
 224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 225  *                              transmission of remaining buffered data
 226  *
 227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 228  *                              to shutdown
 229  *
 230  *      TCP_CLOSING             both sides have shutdown but we still have
 231  *                              data we have to finish sending
 232  *
 233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 234  *                              closed, can only be entered from FIN_WAIT2
 235  *                              or CLOSING.  Required because the other end
 236  *                              may not have gotten our last ACK causing it
 237  *                              to retransmit the data packet (which we ignore)
 238  *
 239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 240  *                              us to finish writing our data and to shutdown
 241  *                              (we have to close() to move on to LAST_ACK)
 242  *
 243  *      TCP_LAST_ACK            out side has shutdown after remote has
 244  *                              shutdown.  There may still be data in our
 245  *                              buffer that we have to finish sending
 246  *
 247  *      TCP_CLOSE               socket is finished
 248  */
 249
 250 #include <linux/config.h>
 251 #include <linux/module.h>
 252 #include <linux/types.h>
 253 #include <linux/fcntl.h>
 254 #include <linux/poll.h>
 255 #include <linux/init.h>
 256 #include <linux/smp_lock.h>
 257 #include <linux/fs.h>
 258 #include <linux/random.h>
 259 #include <linux/ckrm.h>
 260
 261 #include <net/icmp.h>
 262 #include <net/tcp.h>
 263 #include <net/xfrm.h>
 264 #include <net/ip.h>
 265
 266
 267 #include <asm/uaccess.h>
 268 #include <asm/ioctls.h>
 269
 270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 271
 272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
 273
 274 kmem_cache_t *tcp_openreq_cachep;
 275 kmem_cache_t *tcp_bucket_cachep;
 276 kmem_cache_t *tcp_timewait_cachep;
 277
 278 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 279
 280 int sysctl_tcp_default_win_scale = 7;
 281
 282 int sysctl_tcp_mem[3];
 283 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
 284 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
 285
 286 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
 287 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
 288
 289 /* Pressure flag: try to collapse.
 290  * Technical note: it is used by multiple contexts non atomically.
 291  * All the tcp_mem_schedule() is of this nature: accounting
 292  * is strict, actions are advisory and have some latency. */
 293 int tcp_memory_pressure;
 294
 295 #define TCP_PAGES(amt) (((amt) + TCP_MEM_QUANTUM - 1) / TCP_MEM_QUANTUM)
 296
 297 int tcp_mem_schedule(struct sock *sk, int size, int kind)
 298 {
 299         int amt = TCP_PAGES(size);
 300
 301         sk->sk_forward_alloc += amt * TCP_MEM_QUANTUM;
 302         atomic_add(amt, &tcp_memory_allocated);
 303
 304         /* Under limit. */
 305         if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
 306                 if (tcp_memory_pressure)
 307                         tcp_memory_pressure = 0;
 308                 return 1;
 309         }
 310
 311         /* Over hard limit. */
 312         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
 313                 tcp_enter_memory_pressure();
 314                 goto suppress_allocation;
 315         }
 316
 317         /* Under pressure. */
 318         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
 319                 tcp_enter_memory_pressure();
 320
 321         if (kind) {
 322                 if (atomic_read(&sk->sk_rmem_alloc) < sysctl_tcp_rmem[0])
 323                         return 1;
 324         } else if (sk->sk_wmem_queued < sysctl_tcp_wmem[0])
 325                 return 1;
 326
 327         if (!tcp_memory_pressure ||
 328             sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) *
 329                                 TCP_PAGES(sk->sk_wmem_queued +
 330                                           atomic_read(&sk->sk_rmem_alloc) +
 331                                           sk->sk_forward_alloc))
 332                 return 1;
 333
 334 suppress_allocation:
 335
 336         if (!kind) {
 337                 tcp_moderate_sndbuf(sk);
 338
 339                 /* Fail only if socket is _under_ its sndbuf.
 340                  * In this case we cannot block, so that we have to fail.
 341                  */
 342                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
 343                         return 1;
 344         }
 345
 346         /* Alas. Undo changes. */
 347         sk->sk_forward_alloc -= amt * TCP_MEM_QUANTUM;
 348         atomic_sub(amt, &tcp_memory_allocated);
 349         return 0;
 350 }
 351
 352 void __tcp_mem_reclaim(struct sock *sk)
 353 {
 354         if (sk->sk_forward_alloc >= TCP_MEM_QUANTUM) {
 355                 atomic_sub(sk->sk_forward_alloc / TCP_MEM_QUANTUM,
 356                            &tcp_memory_allocated);
 357                 sk->sk_forward_alloc &= TCP_MEM_QUANTUM - 1;
 358                 if (tcp_memory_pressure &&
 359                     atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
 360                         tcp_memory_pressure = 0;
 361         }
 362 }
 363
 364 void tcp_rfree(struct sk_buff *skb)
 365 {
 366         struct sock *sk = skb->sk;
 367
 368         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
 369         sk->sk_forward_alloc += skb->truesize;
 370 }
 371
 372 /*
 373  * LISTEN is a special case for poll..
 374  */
 375 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
 376                                                poll_table *wait)
 377 {
 378         return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
 379 }
 380
 381 /*
 382  *      Wait for a TCP event.
 383  *
 384  *      Note that we don't need to lock the socket, as the upper poll layers
 385  *      take care of normal races (between the test and the event) and we don't
 386  *      go look at any of the socket buffers directly.
 387  */
 388 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 389 {
 390         unsigned int mask;
 391         struct sock *sk = sock->sk;
 392         struct tcp_opt *tp = tcp_sk(sk);
 393
 394         poll_wait(file, sk->sk_sleep, wait);
 395         if (sk->sk_state == TCP_LISTEN)
 396                 return tcp_listen_poll(sk, wait);
 397
 398         /* Socket is not locked. We are protected from async events
 399            by poll logic and correct handling of state changes
 400            made by another threads is impossible in any case.
 401          */
 402
 403         mask = 0;
 404         if (sk->sk_err)
 405                 mask = POLLERR;
 406
 407         /*
 408          * POLLHUP is certainly not done right. But poll() doesn't
 409          * have a notion of HUP in just one direction, and for a
 410          * socket the read side is more interesting.
 411          *
 412          * Some poll() documentation says that POLLHUP is incompatible
 413          * with the POLLOUT/POLLWR flags, so somebody should check this
 414          * all. But careful, it tends to be safer to return too many
 415          * bits than too few, and you can easily break real applications
 416          * if you don't tell them that something has hung up!
 417          *
 418          * Check-me.
 419          *
 420          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
 421          * our fs/select.c). It means that after we received EOF,
 422          * poll always returns immediately, making impossible poll() on write()
 423          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
 424          * if and only if shutdown has been made in both directions.
 425          * Actually, it is interesting to look how Solaris and DUX
 426          * solve this dilemma. I would prefer, if PULLHUP were maskable,
 427          * then we could set it on SND_SHUTDOWN. BTW examples given
 428          * in Stevens' books assume exactly this behaviour, it explains
 429          * why PULLHUP is incompatible with POLLOUT.    --ANK
 430          *
 431          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 432          * blocking on fresh not-connected or disconnected socket. --ANK
 433          */
 434         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
 435                 mask |= POLLHUP;
 436         if (sk->sk_shutdown & RCV_SHUTDOWN)
 437                 mask |= POLLIN | POLLRDNORM;
 438
 439         /* Connected? */
 440         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 441                 /* Potential race condition. If read of tp below will
 442                  * escape above sk->sk_state, we can be illegally awaken
 443                  * in SYN_* states. */
 444                 if ((tp->rcv_nxt != tp->copied_seq) &&
 445                     (tp->urg_seq != tp->copied_seq ||
 446                      tp->rcv_nxt != tp->copied_seq + 1 ||
 447                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
 448                         mask |= POLLIN | POLLRDNORM;
 449
 450                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
 451                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
 452                                 mask |= POLLOUT | POLLWRNORM;
 453                         } else {  /* send SIGIO later */
 454                                 set_bit(SOCK_ASYNC_NOSPACE,
 455                                         &sk->sk_socket->flags);
 456                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 457
 458                                 /* Race breaker. If space is freed after
 459                                  * wspace test but before the flags are set,
 460                                  * IO signal will be lost.
 461                                  */
 462                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
 463                                         mask |= POLLOUT | POLLWRNORM;
 464                         }
 465                 }
 466
 467                 if (tp->urg_data & TCP_URG_VALID)
 468                         mask |= POLLPRI;
 469         }
 470         return mask;
 471 }
 472
 473 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 474 {
 475         struct tcp_opt *tp = tcp_sk(sk);
 476         int answ;
 477
 478         switch (cmd) {
 479         case SIOCINQ:
 480                 if (sk->sk_state == TCP_LISTEN)
 481                         return -EINVAL;
 482
 483                 lock_sock(sk);
 484                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 485                         answ = 0;
 486                 else if (sock_flag(sk, SOCK_URGINLINE) ||
 487                          !tp->urg_data ||
 488                          before(tp->urg_seq, tp->copied_seq) ||
 489                          !before(tp->urg_seq, tp->rcv_nxt)) {
 490                         answ = tp->rcv_nxt - tp->copied_seq;
 491
 492                         /* Subtract 1, if FIN is in queue. */
 493                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
 494                                 answ -=
 495                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
 496                 } else
 497                         answ = tp->urg_seq - tp->copied_seq;
 498                 release_sock(sk);
 499                 break;
 500         case SIOCATMARK:
 501                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 502                 break;
 503         case SIOCOUTQ:
 504                 if (sk->sk_state == TCP_LISTEN)
 505                         return -EINVAL;
 506
 507                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 508                         answ = 0;
 509                 else
 510                         answ = tp->write_seq - tp->snd_una;
 511                 break;
 512         default:
 513                 return -ENOIOCTLCMD;
 514         };
 515
 516         return put_user(answ, (int __user *)arg);
 517 }
 518
 519
 520 int tcp_listen_start(struct sock *sk)
 521 {
 522 #ifdef CONFIG_ACCEPT_QUEUES
 523         int i = 0;
 524 #endif
 525         struct inet_opt *inet = inet_sk(sk);
 526         struct tcp_opt *tp = tcp_sk(sk);
 527         struct tcp_listen_opt *lopt;
 528
 529         sk->sk_max_ack_backlog = 0;
 530         sk->sk_ack_backlog = 0;
 531         tp->accept_queue = NULL;
 532 #ifdef CONFIG_ACCEPT_QUEUES
 533         tp->class_index = 0;
 534         for (i=0; i < NUM_ACCEPT_QUEUES; i++) {
 535                 tp->acceptq[i].aq_tail = NULL;
 536                 tp->acceptq[i].aq_head = NULL;
 537                 tp->acceptq[i].aq_wait_time = 0;
 538                 tp->acceptq[i].aq_qcount = 0;
 539                 tp->acceptq[i].aq_count = 0;
 540                 if (i == 0) {
 541                         tp->acceptq[i].aq_valid = 1;
 542                         tp->acceptq[i].aq_ratio = 1;
 543                 }
 544                 else {
 545                         tp->acceptq[i].aq_valid = 0;
 546                         tp->acceptq[i].aq_ratio = 0;
 547                 }
 548         }
 549 #endif
 550         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
 551         tcp_delack_init(tp);
 552
 553         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
 554         if (!lopt)
 555                 return -ENOMEM;
 556
 557         memset(lopt, 0, sizeof(struct tcp_listen_opt));
 558         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
 559                 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
 560                         break;
 561         get_random_bytes(&lopt->hash_rnd, 4);
 562
 563         write_lock_bh(&tp->syn_wait_lock);
 564         tp->listen_opt = lopt;
 565         write_unlock_bh(&tp->syn_wait_lock);
 566
 567         /* There is race window here: we announce ourselves listening,
 568          * but this transition is still not validated by get_port().
 569          * It is OK, because this socket enters to hash table only
 570          * after validation is complete.
 571          */
 572         sk->sk_state = TCP_LISTEN;
 573         if (!sk->sk_prot->get_port(sk, inet->num)) {
 574                 inet->sport = htons(inet->num);
 575
 576                 sk_dst_reset(sk);
 577                 sk->sk_prot->hash(sk);
 578
 579 #ifdef CONFIG_CKRM
 580                 ckrm_cb_listen_start(sk);
 581 #endif
 582
 583                 return 0;
 584         }
 585
 586         sk->sk_state = TCP_CLOSE;
 587         write_lock_bh(&tp->syn_wait_lock);
 588         tp->listen_opt = NULL;
 589         write_unlock_bh(&tp->syn_wait_lock);
 590         kfree(lopt);
 591         return -EADDRINUSE;
 592 }
 593
 594 /*
 595  *      This routine closes sockets which have been at least partially
 596  *      opened, but not yet accepted.
 597  */
 598
 599 static void tcp_listen_stop (struct sock *sk)
 600 {
 601         struct tcp_opt *tp = tcp_sk(sk);
 602         struct tcp_listen_opt *lopt = tp->listen_opt;
 603         struct open_request *acc_req = tp->accept_queue;
 604         struct open_request *req;
 605         int i;
 606
 607         tcp_delete_keepalive_timer(sk);
 608
 609         /* make all the listen_opt local to us */
 610         write_lock_bh(&tp->syn_wait_lock);
 611         tp->listen_opt = NULL;
 612         write_unlock_bh(&tp->syn_wait_lock);
 613
 614 #ifdef CONFIG_CKRM
 615                 ckrm_cb_listen_stop(sk);
 616 #endif
 617
 618 #ifdef CONFIG_ACCEPT_QUEUES
 619         for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
 620                 tp->acceptq[i].aq_head = tp->acceptq[i].aq_tail = NULL;
 621 #else
 622         tp->accept_queue_tail = NULL;
 623 #endif
 624         tp->accept_queue = NULL;
 625
 626         if (lopt->qlen) {
 627                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
 628                         while ((req = lopt->syn_table[i]) != NULL) {
 629                                 lopt->syn_table[i] = req->dl_next;
 630                                 lopt->qlen--;
 631                                 tcp_openreq_free(req);
 632
 633                 /* Following specs, it would be better either to send FIN
 634                  * (and enter FIN-WAIT-1, it is normal close)
 635                  * or to send active reset (abort).
 636                  * Certainly, it is pretty dangerous while synflood, but it is
 637                  * bad justification for our negligence 8)
 638                  * To be honest, we are not able to make either
 639                  * of the variants now.                 --ANK
 640                  */
 641                         }
 642                 }
 643         }
 644         BUG_TRAP(!lopt->qlen);
 645
 646         kfree(lopt);
 647
 648         while ((req = acc_req) != NULL) {
 649                 struct sock *child = req->sk;
 650
 651                 acc_req = req->dl_next;
 652
 653                 local_bh_disable();
 654                 bh_lock_sock(child);
 655                 BUG_TRAP(!sock_owned_by_user(child));
 656                 sock_hold(child);
 657
 658                 tcp_disconnect(child, O_NONBLOCK);
 659
 660                 sock_orphan(child);
 661
 662                 atomic_inc(&tcp_orphan_count);
 663
 664                 tcp_destroy_sock(child);
 665
 666                 bh_unlock_sock(child);
 667                 local_bh_enable();
 668                 sock_put(child);
 669
 670 #ifdef CONFIG_ACCEPT_QUEUES
 671                 tcp_acceptq_removed(sk, req->acceptq_class);
 672 #else
 673                 sk_acceptq_removed(sk);
 674 #endif
 675                 tcp_openreq_fastfree(req);
 676         }
 677         BUG_TRAP(!sk->sk_ack_backlog);
 678 }
 679
 680 /*
 681  *      Wait for a socket to get into the connected state
 682  *
 683  *      Note: Must be called with the socket locked.
 684  */
 685 static int wait_for_tcp_connect(struct sock *sk, int flags, long *timeo_p)
 686 {
 687         struct tcp_opt *tp = tcp_sk(sk);
 688         struct task_struct *tsk = current;
 689         DEFINE_WAIT(wait);
 690
 691         while ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
 692                 if (sk->sk_err)
 693                         return sock_error(sk);
 694                 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
 695                         return -EPIPE;
 696                 if (!*timeo_p)
 697                         return -EAGAIN;
 698                 if (signal_pending(tsk))
 699                         return sock_intr_errno(*timeo_p);
 700
 701                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 702                 tp->write_pending++;
 703
 704                 release_sock(sk);
 705                 *timeo_p = schedule_timeout(*timeo_p);
 706                 lock_sock(sk);
 707
 708                 finish_wait(sk->sk_sleep, &wait);
 709                 tp->write_pending--;
 710         }
 711         return 0;
 712 }
 713
 714 static inline int tcp_memory_free(struct sock *sk)
 715 {
 716         return sk->sk_wmem_queued < sk->sk_sndbuf;
 717 }
 718
 719 /*
 720  *      Wait for more memory for a socket
 721  */
 722 static int wait_for_tcp_memory(struct sock *sk, long *timeo)
 723 {
 724         struct tcp_opt *tp = tcp_sk(sk);
 725         int err = 0;
 726         long vm_wait = 0;
 727         long current_timeo = *timeo;
 728         DEFINE_WAIT(wait);
 729
 730         if (tcp_memory_free(sk))
 731                 current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
 732
 733         for (;;) {
 734                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 735
 736                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 737
 738                 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 739                         goto do_error;
 740                 if (!*timeo)
 741                         goto do_nonblock;
 742                 if (signal_pending(current))
 743                         goto do_interrupted;
 744                 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 745                 if (tcp_memory_free(sk) && !vm_wait)
 746                         break;
 747
 748                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 749                 tp->write_pending++;
 750                 release_sock(sk);
 751                 if (!tcp_memory_free(sk) || vm_wait)
 752                         current_timeo = schedule_timeout(current_timeo);
 753                 lock_sock(sk);
 754                 tp->write_pending--;
 755
 756                 if (vm_wait) {
 757                         vm_wait -= current_timeo;
 758                         current_timeo = *timeo;
 759                         if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
 760                             (current_timeo -= vm_wait) < 0)
 761                                 current_timeo = 0;
 762                         vm_wait = 0;
 763                 }
 764                 *timeo = current_timeo;
 765         }
 766 out:
 767         finish_wait(sk->sk_sleep, &wait);
 768         return err;
 769
 770 do_error:
 771         err = -EPIPE;
 772         goto out;
 773 do_nonblock:
 774         err = -EAGAIN;
 775         goto out;
 776 do_interrupted:
 777         err = sock_intr_errno(*timeo);
 778         goto out;
 779 }
 780
 781 static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
 782                                int off)
 783 {
 784         if (i) {
 785                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
 786                 return page == frag->page &&
 787                        off == frag->page_offset + frag->size;
 788         }
 789         return 0;
 790 }
 791
 792 static inline void fill_page_desc(struct sk_buff *skb, int i,
 793                                   struct page *page, int off, int size)
 794 {
 795         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 796         frag->page = page;
 797         frag->page_offset = off;
 798         frag->size = size;
 799         skb_shinfo(skb)->nr_frags = i + 1;
 800 }
 801
 802 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
 803 {
 804         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 805         tp->pushed_seq = tp->write_seq;
 806 }
 807
 808 static inline int forced_push(struct tcp_opt *tp)
 809 {
 810         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
 811 }
 812
 813 static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
 814                               struct sk_buff *skb)
 815 {
 816         skb->csum = 0;
 817         TCP_SKB_CB(skb)->seq = tp->write_seq;
 818         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
 819         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
 820         TCP_SKB_CB(skb)->sacked = 0;
 821         __skb_queue_tail(&sk->sk_write_queue, skb);
 822         sk_charge_skb(sk, skb);
 823         if (!tp->send_head)
 824                 tp->send_head = skb;
 825         else if (tp->nonagle&TCP_NAGLE_PUSH)
 826                 tp->nonagle &= ~TCP_NAGLE_PUSH;
 827 }
 828
 829 static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
 830                                 struct sk_buff *skb)
 831 {
 832         if (flags & MSG_OOB) {
 833                 tp->urg_mode = 1;
 834                 tp->snd_up = tp->write_seq;
 835                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
 836         }
 837 }
 838
 839 static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
 840                             int mss_now, int nonagle)
 841 {
 842         if (tp->send_head) {
 843                 struct sk_buff *skb = sk->sk_write_queue.prev;
 844                 if (!(flags & MSG_MORE) || forced_push(tp))
 845                         tcp_mark_push(tp, skb);
 846                 tcp_mark_urg(tp, flags, skb);
 847                 __tcp_push_pending_frames(sk, tp, mss_now,
 848                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
 849         }
 850 }
 851
 852 static int tcp_error(struct sock *sk, int flags, int err)
 853 {
 854         if (err == -EPIPE)
 855                 err = sock_error(sk) ? : -EPIPE;
 856         if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
 857                 send_sig(SIGPIPE, current, 0);
 858         return err;
 859 }
 860
 861 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
 862                          size_t psize, int flags)
 863 {
 864         struct tcp_opt *tp = tcp_sk(sk);
 865         int mss_now;
 866         int err;
 867         ssize_t copied;
 868         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 869
 870         /* Wait for a connection to finish. */
 871         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 872                 if ((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
 873                         goto out_err;
 874
 875         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 876
 877         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 878         copied = 0;
 879
 880         err = -EPIPE;
 881         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 882                 goto do_error;
 883
 884         while (psize > 0) {
 885                 struct sk_buff *skb = sk->sk_write_queue.prev;
 886                 struct page *page = pages[poffset / PAGE_SIZE];
 887                 int copy, i;
 888                 int offset = poffset % PAGE_SIZE;
 889                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
 890
 891                 if (!tp->send_head || (copy = mss_now - skb->len) <= 0) {
 892 new_segment:
 893                         if (!tcp_memory_free(sk))
 894                                 goto wait_for_sndbuf;
 895
 896                         skb = tcp_alloc_pskb(sk, 0, tp->mss_cache,
 897                                              sk->sk_allocation);
 898                         if (!skb)
 899                                 goto wait_for_memory;
 900
 901                         skb_entail(sk, tp, skb);
 902                         copy = mss_now;
 903                 }
 904
 905                 if (copy > size)
 906                         copy = size;
 907
 908                 i = skb_shinfo(skb)->nr_frags;
 909                 if (can_coalesce(skb, i, page, offset)) {
 910                         skb_shinfo(skb)->frags[i - 1].size += copy;
 911                 } else if (i < MAX_SKB_FRAGS) {
 912                         get_page(page);
 913                         fill_page_desc(skb, i, page, offset, copy);
 914                 } else {
 915                         tcp_mark_push(tp, skb);
 916                         goto new_segment;
 917                 }
 918
 919                 skb->len += copy;
 920                 skb->data_len += copy;
 921                 skb->ip_summed = CHECKSUM_HW;
 922                 tp->write_seq += copy;
 923                 TCP_SKB_CB(skb)->end_seq += copy;
 924
 925                 if (!copied)
 926                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
 927
 928                 copied += copy;
 929                 poffset += copy;
 930                 if (!(psize -= copy))
 931                         goto out;
 932
 933                 if (skb->len != mss_now || (flags & MSG_OOB))
 934                         continue;
 935
 936                 if (forced_push(tp)) {
 937                         tcp_mark_push(tp, skb);
 938                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
 939                 } else if (skb == tp->send_head)
 940                         tcp_push_one(sk, mss_now);
 941                 continue;
 942
 943 wait_for_sndbuf:
 944                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 945 wait_for_memory:
 946                 if (copied)
 947                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 948
 949                 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
 950                         goto do_error;
 951
 952                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 953         }
 954
 955 out:
 956         if (copied)
 957                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
 958         return copied;
 959
 960 do_error:
 961         if (copied)
 962                 goto out;
 963 out_err:
 964         return tcp_error(sk, flags, err);
 965 }
 966
 967 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 968                      size_t size, int flags)
 969 {
 970         ssize_t res;
 971         struct sock *sk = sock->sk;
 972
 973 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
 974
 975         if (!(sk->sk_route_caps & NETIF_F_SG) ||
 976             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
 977                 return sock_no_sendpage(sock, page, offset, size, flags);
 978
 979 #undef TCP_ZC_CSUM_FLAGS
 980
 981         lock_sock(sk);
 982         TCP_CHECK_TIMER(sk);
 983         res = do_tcp_sendpages(sk, &page, offset, size, flags);
 984         TCP_CHECK_TIMER(sk);
 985         release_sock(sk);
 986         return res;
 987 }
 988
 989 #define TCP_PAGE(sk)    (inet_sk(sk)->sndmsg_page)
 990 #define TCP_OFF(sk)     (inet_sk(sk)->sndmsg_off)
 991
 992 static inline int tcp_copy_to_page(struct sock *sk, char __user *from,
 993                                    struct sk_buff *skb, struct page *page,
 994                                    int off, int copy)
 995 {
 996         int err = 0;
 997         unsigned int csum;
 998
 999         if (skb->ip_summed == CHECKSUM_NONE) {
1000                 csum = csum_and_copy_from_user(from, page_address(page) + off,
1001                                        copy, 0, &err);
1002                 if (err) return err;
1003                 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1004         } else {
1005                 if (copy_from_user(page_address(page) + off, from, copy))
1006                         return -EFAULT;
1007         }
1008
1009         skb->len += copy;
1010         skb->data_len += copy;
1011         skb->truesize += copy;
1012         sk->sk_wmem_queued += copy;
1013         sk->sk_forward_alloc -= copy;
1014         return 0;
1015 }
1016
1017 static inline int skb_add_data(struct sk_buff *skb, char __user *from, int copy)
1018 {
1019         int err = 0;
1020         unsigned int csum;
1021         int off = skb->len;
1022
1023         if (skb->ip_summed == CHECKSUM_NONE) {
1024                 csum = csum_and_copy_from_user(from, skb_put(skb, copy),
1025                                        copy, 0, &err);
1026                 if (!err) {
1027                         skb->csum = csum_block_add(skb->csum, csum, off);
1028                         return 0;
1029                 }
1030         } else {
1031                 if (!copy_from_user(skb_put(skb, copy), from, copy))
1032                         return 0;
1033         }
1034
1035         __skb_trim(skb, off);
1036         return -EFAULT;
1037 }
1038
1039 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1040 {
1041         int tmp = tp->mss_cache_std;
1042
1043         if (sk->sk_route_caps & NETIF_F_SG) {
1044                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1045
1046                 if (tmp >= pgbreak &&
1047                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1048                         tmp = pgbreak;
1049         }
1050         return tmp;
1051 }
1052
1053 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1054                 size_t size)
1055 {
1056         struct iovec *iov;
1057         struct tcp_opt *tp = tcp_sk(sk);
1058         struct sk_buff *skb;
1059         int iovlen, flags;
1060         int mss_now;
1061         int err, copied;
1062         long timeo;
1063
1064         lock_sock(sk);
1065         TCP_CHECK_TIMER(sk);
1066
1067         flags = msg->msg_flags;
1068         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1069
1070         /* Wait for a connection to finish. */
1071         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1072                 if ((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1073                         goto out_err;
1074
1075         /* This should be in poll */
1076         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1077
1078         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1079
1080         /* Ok commence sending. */
1081         iovlen = msg->msg_iovlen;
1082         iov = msg->msg_iov;
1083         copied = 0;
1084
1085         err = -EPIPE;
1086         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1087                 goto do_error;
1088
1089         while (--iovlen >= 0) {
1090                 int seglen = iov->iov_len;
1091                 unsigned char __user *from = iov->iov_base;
1092
1093                 iov++;
1094
1095                 while (seglen > 0) {
1096                         int copy;
1097
1098                         skb = sk->sk_write_queue.prev;
1099
1100                         if (!tp->send_head ||
1101                             (copy = mss_now - skb->len) <= 0) {
1102
1103 new_segment:
1104                                 /* Allocate new segment. If the interface is SG,
1105                                  * allocate skb fitting to single page.
1106                                  */
1107                                 if (!tcp_memory_free(sk))
1108                                         goto wait_for_sndbuf;
1109
1110                                 skb = tcp_alloc_pskb(sk, select_size(sk, tp),
1111                                                      0, sk->sk_allocation);
1112                                 if (!skb)
1113                                         goto wait_for_memory;
1114
1115                                 /*
1116                                  * Check whether we can use HW checksum.
1117                                  */
1118                                 if (sk->sk_route_caps &
1119                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
1120                                      NETIF_F_HW_CSUM))
1121                                         skb->ip_summed = CHECKSUM_HW;
1122
1123                                 skb_entail(sk, tp, skb);
1124                                 copy = mss_now;
1125                         }
1126
1127                         /* Try to append data to the end of skb. */
1128                         if (copy > seglen)
1129                                 copy = seglen;
1130
1131                         /* Where to copy to? */
1132                         if (skb_tailroom(skb) > 0) {
1133                                 /* We have some space in skb head. Superb! */
1134                                 if (copy > skb_tailroom(skb))
1135                                         copy = skb_tailroom(skb);
1136                                 if ((err = skb_add_data(skb, from, copy)) != 0)
1137                                         goto do_fault;
1138                         } else {
1139                                 int merge = 0;
1140                                 int i = skb_shinfo(skb)->nr_frags;
1141                                 struct page *page = TCP_PAGE(sk);
1142                                 int off = TCP_OFF(sk);
1143
1144                                 if (can_coalesce(skb, i, page, off) &&
1145                                     off != PAGE_SIZE) {
1146                                         /* We can extend the last page
1147                                          * fragment. */
1148                                         merge = 1;
1149                                 } else if (i == MAX_SKB_FRAGS ||
1150                                            (!i &&
1151                                            !(sk->sk_route_caps & NETIF_F_SG))) {
1152                                         /* Need to add new fragment and cannot
1153                                          * do this because interface is non-SG,
1154                                          * or because all the page slots are
1155                                          * busy. */
1156                                         tcp_mark_push(tp, skb);
1157                                         goto new_segment;
1158                                 } else if (page) {
1159                                         /* If page is cached, align
1160                                          * offset to L1 cache boundary
1161                                          */
1162                                         off = (off + L1_CACHE_BYTES - 1) &
1163                                               ~(L1_CACHE_BYTES - 1);
1164                                         if (off == PAGE_SIZE) {
1165                                                 put_page(page);
1166                                                 TCP_PAGE(sk) = page = NULL;
1167                                         }
1168                                 }
1169
1170                                 if (!page) {
1171                                         /* Allocate new cache page. */
1172                                         if (!(page = tcp_alloc_page(sk)))
1173                                                 goto wait_for_memory;
1174                                         off = 0;
1175                                 }
1176
1177                                 if (copy > PAGE_SIZE - off)
1178                                         copy = PAGE_SIZE - off;
1179
1180                                 /* Time to copy data. We are close to
1181                                  * the end! */
1182                                 err = tcp_copy_to_page(sk, from, skb, page,
1183                                                        off, copy);
1184                                 if (err) {
1185                                         /* If this page was new, give it to the
1186                                          * socket so it does not get leaked.
1187                                          */
1188                                         if (!TCP_PAGE(sk)) {
1189                                                 TCP_PAGE(sk) = page;
1190                                                 TCP_OFF(sk) = 0;
1191                                         }
1192                                         goto do_error;
1193                                 }
1194
1195                                 /* Update the skb. */
1196                                 if (merge) {
1197                                         skb_shinfo(skb)->frags[i - 1].size +=
1198                                                                         copy;
1199                                 } else {
1200                                         fill_page_desc(skb, i, page, off, copy);
1201                                         if (TCP_PAGE(sk)) {
1202                                                 get_page(page);
1203                                         } else if (off + copy < PAGE_SIZE) {
1204                                                 get_page(page);
1205                                                 TCP_PAGE(sk) = page;
1206                                         }
1207                                 }
1208
1209                                 TCP_OFF(sk) = off + copy;
1210                         }
1211
1212                         if (!copied)
1213                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1214
1215                         tp->write_seq += copy;
1216                         TCP_SKB_CB(skb)->end_seq += copy;
1217
1218                         from += copy;
1219                         copied += copy;
1220                         if ((seglen -= copy) == 0 && iovlen == 0)
1221                                 goto out;
1222
1223                         if (skb->len != mss_now || (flags & MSG_OOB))
1224                                 continue;
1225
1226                         if (forced_push(tp)) {
1227                                 tcp_mark_push(tp, skb);
1228                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1229                         } else if (skb == tp->send_head)
1230                                 tcp_push_one(sk, mss_now);
1231                         continue;
1232
1233 wait_for_sndbuf:
1234                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1235 wait_for_memory:
1236                         if (copied)
1237                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1238
1239                         if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1240                                 goto do_error;
1241
1242                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1243                 }
1244         }
1245
1246 out:
1247         if (copied)
1248                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1249         TCP_CHECK_TIMER(sk);
1250         release_sock(sk);
1251         return copied;
1252
1253 do_fault:
1254         if (!skb->len) {
1255                 if (tp->send_head == skb)
1256                         tp->send_head = NULL;
1257                 __skb_unlink(skb, skb->list);
1258                 tcp_free_skb(sk, skb);
1259         }
1260
1261 do_error:
1262         if (copied)
1263                 goto out;
1264 out_err:
1265         err = tcp_error(sk, flags, err);
1266         TCP_CHECK_TIMER(sk);
1267         release_sock(sk);
1268         return err;
1269 }
1270
1271 /*
1272  *      Handle reading urgent data. BSD has very simple semantics for
1273  *      this, no blocking and very strange errors 8)
1274  */
1275
1276 static int tcp_recv_urg(struct sock *sk, long timeo,
1277                         struct msghdr *msg, int len, int flags,
1278                         int *addr_len)
1279 {
1280         struct tcp_opt *tp = tcp_sk(sk);
1281
1282         /* No URG data to read. */
1283         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1284             tp->urg_data == TCP_URG_READ)
1285                 return -EINVAL; /* Yes this is right ! */
1286
1287         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1288                 return -ENOTCONN;
1289
1290         if (tp->urg_data & TCP_URG_VALID) {
1291                 int err = 0;
1292                 char c = tp->urg_data;
1293
1294                 if (!(flags & MSG_PEEK))
1295                         tp->urg_data = TCP_URG_READ;
1296
1297                 /* Read urgent data. */
1298                 msg->msg_flags |= MSG_OOB;
1299
1300                 if (len > 0) {
1301                         if (!(flags & MSG_TRUNC))
1302                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1303                         len = 1;
1304                 } else
1305                         msg->msg_flags |= MSG_TRUNC;
1306
1307                 return err ? -EFAULT : len;
1308         }
1309
1310         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1311                 return 0;
1312
1313         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1314          * the available implementations agree in this case:
1315          * this call should never block, independent of the
1316          * blocking state of the socket.
1317          * Mike <pall@rz.uni-karlsruhe.de>
1318          */
1319         return -EAGAIN;
1320 }
1321
1322 /* Clean up the receive buffer for full frames taken by the user,
1323  * then send an ACK if necessary.  COPIED is the number of bytes
1324  * tcp_recvmsg has given to the user so far, it speeds up the
1325  * calculation of whether or not we must ACK for the sake of
1326  * a window update.
1327  */
1328 void cleanup_rbuf(struct sock *sk, int copied)
1329 {
1330         struct tcp_opt *tp = tcp_sk(sk);
1331         int time_to_ack = 0;
1332
1333 #if TCP_DEBUG
1334         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1335
1336         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1337 #endif
1338
1339         if (tcp_ack_scheduled(tp)) {
1340                    /* Delayed ACKs frequently hit locked sockets during bulk
1341                     * receive. */
1342                 if (tp->ack.blocked ||
1343                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1344                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1345                     /*
1346                      * If this read emptied read buffer, we send ACK, if
1347                      * connection is not bidirectional, user drained
1348                      * receive buffer and there was a small segment
1349                      * in queue.
1350                      */
1351                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1352                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1353                         time_to_ack = 1;
1354         }
1355
1356         /* We send an ACK if we can now advertise a non-zero window
1357          * which has been raised "significantly".
1358          *
1359          * Even if window raised up to infinity, do not send window open ACK
1360          * in states, where we will not receive more. It is useless.
1361          */
1362         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1363                 __u32 rcv_window_now = tcp_receive_window(tp);
1364
1365                 /* Optimize, __tcp_select_window() is not cheap. */
1366                 if (2*rcv_window_now <= tp->window_clamp) {
1367                         __u32 new_window = __tcp_select_window(sk);
1368
1369                         /* Send ACK now, if this read freed lots of space
1370                          * in our buffer. Certainly, new_window is new window.
1371                          * We can advertise it now, if it is not less than current one.
1372                          * "Lots" means "at least twice" here.
1373                          */
1374                         if (new_window && new_window >= 2 * rcv_window_now)
1375                                 time_to_ack = 1;
1376                 }
1377         }
1378         if (time_to_ack)
1379                 tcp_send_ack(sk);
1380 }
1381
1382 static void tcp_prequeue_process(struct sock *sk)
1383 {
1384         struct sk_buff *skb;
1385         struct tcp_opt *tp = tcp_sk(sk);
1386
1387         NET_ADD_STATS_USER(TCPPrequeued, skb_queue_len(&tp->ucopy.prequeue));
1388
1389         /* RX process wants to run with disabled BHs, though it is not
1390          * necessary */
1391         local_bh_disable();
1392         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1393                 sk->sk_backlog_rcv(sk, skb);
1394         local_bh_enable();
1395
1396         /* Clear memory counter. */
1397         tp->ucopy.memory = 0;
1398 }
1399
1400 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1401 {
1402         struct sk_buff *skb;
1403         u32 offset;
1404
1405         skb_queue_walk(&sk->sk_receive_queue, skb) {
1406                 offset = seq - TCP_SKB_CB(skb)->seq;
1407                 if (skb->h.th->syn)
1408                         offset--;
1409                 if (offset < skb->len || skb->h.th->fin) {
1410                         *off = offset;
1411                         return skb;
1412                 }
1413         }
1414         return NULL;
1415 }
1416
1417 /*
1418  * This routine provides an alternative to tcp_recvmsg() for routines
1419  * that would like to handle copying from skbuffs directly in 'sendfile'
1420  * fashion.
1421  * Note:
1422  *      - It is assumed that the socket was locked by the caller.
1423  *      - The routine does not block.
1424  *      - At present, there is no support for reading OOB data
1425  *        or for 'peeking' the socket using this routine
1426  *        (although both would be easy to implement).
1427  */
1428 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1429                   sk_read_actor_t recv_actor)
1430 {
1431         struct sk_buff *skb;
1432         struct tcp_opt *tp = tcp_sk(sk);
1433         u32 seq = tp->copied_seq;
1434         u32 offset;
1435         int copied = 0;
1436
1437         if (sk->sk_state == TCP_LISTEN)
1438                 return -ENOTCONN;
1439         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1440                 if (offset < skb->len) {
1441                         size_t used, len;
1442
1443                         len = skb->len - offset;
1444                         /* Stop reading if we hit a patch of urgent data */
1445                         if (tp->urg_data) {
1446                                 u32 urg_offset = tp->urg_seq - seq;
1447                                 if (urg_offset < len)
1448                                         len = urg_offset;
1449                                 if (!len)
1450                                         break;
1451                         }
1452                         used = recv_actor(desc, skb, offset, len);
1453                         if (used <= len) {
1454                                 seq += used;
1455                                 copied += used;
1456                                 offset += used;
1457                         }
1458                         if (offset != skb->len)
1459                                 break;
1460                 }
1461                 if (skb->h.th->fin) {
1462                         sk_eat_skb(sk, skb);
1463                         ++seq;
1464                         break;
1465                 }
1466                 sk_eat_skb(sk, skb);
1467                 if (!desc->count)
1468                         break;
1469         }
1470         tp->copied_seq = seq;
1471
1472         tcp_rcv_space_adjust(sk);
1473
1474         /* Clean up data we have read: This will do ACK frames. */
1475         if (copied)
1476                 cleanup_rbuf(sk, copied);
1477         return copied;
1478 }
1479
1480 /*
1481  *      This routine copies from a sock struct into the user buffer.
1482  *
1483  *      Technical note: in 2.3 we work on _locked_ socket, so that
1484  *      tricks with *seq access order and skb->users are not required.
1485  *      Probably, code can be easily improved even more.
1486  */
1487
1488 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1489                 size_t len, int nonblock, int flags, int *addr_len)
1490 {
1491         struct tcp_opt *tp = tcp_sk(sk);
1492         int copied = 0;
1493         u32 peek_seq;
1494         u32 *seq;
1495         unsigned long used;
1496         int err;
1497         int target;             /* Read at least this many bytes */
1498         long timeo;
1499         struct task_struct *user_recv = NULL;
1500
1501         lock_sock(sk);
1502
1503         TCP_CHECK_TIMER(sk);
1504
1505         err = -ENOTCONN;
1506         if (sk->sk_state == TCP_LISTEN)
1507                 goto out;
1508
1509         timeo = sock_rcvtimeo(sk, nonblock);
1510
1511         /* Urgent data needs to be handled specially. */
1512         if (flags & MSG_OOB)
1513                 goto recv_urg;
1514
1515         seq = &tp->copied_seq;
1516         if (flags & MSG_PEEK) {
1517                 peek_seq = tp->copied_seq;
1518                 seq = &peek_seq;
1519         }
1520
1521         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1522
1523         do {
1524                 struct sk_buff *skb;
1525                 u32 offset;
1526
1527                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1528                 if (tp->urg_data && tp->urg_seq == *seq) {
1529                         if (copied)
1530                                 break;
1531                         if (signal_pending(current)) {
1532                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1533                                 break;
1534                         }
1535                 }
1536
1537                 /* Next get a buffer. */
1538
1539                 skb = skb_peek(&sk->sk_receive_queue);
1540                 do {
1541                         if (!skb)
1542                                 break;
1543
1544                         /* Now that we have two receive queues this
1545                          * shouldn't happen.
1546                          */
1547                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1548                                 printk(KERN_INFO "recvmsg bug: copied %X "
1549                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1550                                 break;
1551                         }
1552                         offset = *seq - TCP_SKB_CB(skb)->seq;
1553                         if (skb->h.th->syn)
1554                                 offset--;
1555                         if (offset < skb->len)
1556                                 goto found_ok_skb;
1557                         if (skb->h.th->fin)
1558                                 goto found_fin_ok;
1559                         BUG_TRAP(flags & MSG_PEEK);
1560                         skb = skb->next;
1561                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1562
1563                 /* Well, if we have backlog, try to process it now yet. */
1564
1565                 if (copied >= target && !sk->sk_backlog.tail)
1566                         break;
1567
1568                 if (copied) {
1569                         if (sk->sk_err ||
1570                             sk->sk_state == TCP_CLOSE ||
1571                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1572                             !timeo ||
1573                             signal_pending(current) ||
1574                             (flags & MSG_PEEK))
1575                                 break;
1576                 } else {
1577                         if (sock_flag(sk, SOCK_DONE))
1578                                 break;
1579
1580                         if (sk->sk_err) {
1581                                 copied = sock_error(sk);
1582                                 break;
1583                         }
1584
1585                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1586                                 break;
1587
1588                         if (sk->sk_state == TCP_CLOSE) {
1589                                 if (!sock_flag(sk, SOCK_DONE)) {
1590                                         /* This occurs when user tries to read
1591                                          * from never connected socket.
1592                                          */
1593                                         copied = -ENOTCONN;
1594                                         break;
1595                                 }
1596                                 break;
1597                         }
1598
1599                         if (!timeo) {
1600                                 copied = -EAGAIN;
1601                                 break;
1602                         }
1603
1604                         if (signal_pending(current)) {
1605                                 copied = sock_intr_errno(timeo);
1606                                 break;
1607                         }
1608                 }
1609
1610                 cleanup_rbuf(sk, copied);
1611
1612                 if (tp->ucopy.task == user_recv) {
1613                         /* Install new reader */
1614                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1615                                 user_recv = current;
1616                                 tp->ucopy.task = user_recv;
1617                                 tp->ucopy.iov = msg->msg_iov;
1618                         }
1619
1620                         tp->ucopy.len = len;
1621
1622                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1623                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1624
1625                         /* Ugly... If prequeue is not empty, we have to
1626                          * process it before releasing socket, otherwise
1627                          * order will be broken at second iteration.
1628                          * More elegant solution is required!!!
1629                          *
1630                          * Look: we have the following (pseudo)queues:
1631                          *
1632                          * 1. packets in flight
1633                          * 2. backlog
1634                          * 3. prequeue
1635                          * 4. receive_queue
1636                          *
1637                          * Each queue can be processed only if the next ones
1638                          * are empty. At this point we have empty receive_queue.
1639                          * But prequeue _can_ be not empty after 2nd iteration,
1640                          * when we jumped to start of loop because backlog
1641                          * processing added something to receive_queue.
1642                          * We cannot release_sock(), because backlog contains
1643                          * packets arrived _after_ prequeued ones.
1644                          *
1645                          * Shortly, algorithm is clear --- to process all
1646                          * the queues in order. We could make it more directly,
1647                          * requeueing packets from backlog to prequeue, if
1648                          * is not empty. It is more elegant, but eats cycles,
1649                          * unfortunately.
1650                          */
1651                         if (skb_queue_len(&tp->ucopy.prequeue))
1652                                 goto do_prequeue;
1653
1654                         /* __ Set realtime policy in scheduler __ */
1655                 }
1656
1657                 if (copied >= target) {
1658                         /* Do not sleep, just process backlog. */
1659                         release_sock(sk);
1660                         lock_sock(sk);
1661                 } else
1662                         sk_wait_data(sk, &timeo);
1663
1664                 if (user_recv) {
1665                         int chunk;
1666
1667                         /* __ Restore normal policy in scheduler __ */
1668
1669                         if ((chunk = len - tp->ucopy.len) != 0) {
1670                                 NET_ADD_STATS_USER(TCPDirectCopyFromBacklog, chunk);
1671                                 len -= chunk;
1672                                 copied += chunk;
1673                         }
1674
1675                         if (tp->rcv_nxt == tp->copied_seq &&
1676                             skb_queue_len(&tp->ucopy.prequeue)) {
1677 do_prequeue:
1678                                 tcp_prequeue_process(sk);
1679
1680                                 if ((chunk = len - tp->ucopy.len) != 0) {
1681                                         NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1682                                         len -= chunk;
1683                                         copied += chunk;
1684                                 }
1685                         }
1686                 }
1687                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1688                         if (net_ratelimit())
1689                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1690                                        current->comm, current->pid);
1691                         peek_seq = tp->copied_seq;
1692                 }
1693                 continue;
1694
1695         found_ok_skb:
1696                 /* Ok so how much can we use? */
1697                 used = skb->len - offset;
1698                 if (len < used)
1699                         used = len;
1700
1701                 /* Do we have urgent data here? */
1702                 if (tp->urg_data) {
1703                         u32 urg_offset = tp->urg_seq - *seq;
1704                         if (urg_offset < used) {
1705                                 if (!urg_offset) {
1706                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1707                                                 ++*seq;
1708                                                 offset++;
1709                                                 used--;
1710                                                 if (!used)
1711                                                         goto skip_copy;
1712                                         }
1713                                 } else
1714                                         used = urg_offset;
1715                         }
1716                 }
1717
1718                 if (!(flags & MSG_TRUNC)) {
1719                         err = skb_copy_datagram_iovec(skb, offset,
1720                                                       msg->msg_iov, used);
1721                         if (err) {
1722                                 /* Exception. Bailout! */
1723                                 if (!copied)
1724                                         copied = -EFAULT;
1725                                 break;
1726                         }
1727                 }
1728
1729                 *seq += used;
1730                 copied += used;
1731                 len -= used;
1732
1733                 tcp_rcv_space_adjust(sk);
1734
1735 skip_copy:
1736                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1737                         tp->urg_data = 0;
1738                         tcp_fast_path_check(sk, tp);
1739                 }
1740                 if (used + offset < skb->len)
1741                         continue;
1742
1743                 if (skb->h.th->fin)
1744                         goto found_fin_ok;
1745                 if (!(flags & MSG_PEEK))
1746                         sk_eat_skb(sk, skb);
1747                 continue;
1748
1749         found_fin_ok:
1750                 /* Process the FIN. */
1751                 ++*seq;
1752                 if (!(flags & MSG_PEEK))
1753                         sk_eat_skb(sk, skb);
1754                 break;
1755         } while (len > 0);
1756
1757         if (user_recv) {
1758                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1759                         int chunk;
1760
1761                         tp->ucopy.len = copied > 0 ? len : 0;
1762
1763                         tcp_prequeue_process(sk);
1764
1765                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1766                                 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1767                                 len -= chunk;
1768                                 copied += chunk;
1769                         }
1770                 }
1771
1772                 tp->ucopy.task = NULL;
1773                 tp->ucopy.len = 0;
1774         }
1775
1776         /* According to UNIX98, msg_name/msg_namelen are ignored
1777          * on connected socket. I was just happy when found this 8) --ANK
1778          */
1779
1780         /* Clean up data we have read: This will do ACK frames. */
1781         cleanup_rbuf(sk, copied);
1782
1783         TCP_CHECK_TIMER(sk);
1784         release_sock(sk);
1785         return copied;
1786
1787 out:
1788         TCP_CHECK_TIMER(sk);
1789         release_sock(sk);
1790         return err;
1791
1792 recv_urg:
1793         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1794         goto out;
1795 }
1796
1797 /*
1798  *      State processing on a close. This implements the state shift for
1799  *      sending our FIN frame. Note that we only send a FIN for some
1800  *      states. A shutdown() may have already sent the FIN, or we may be
1801  *      closed.
1802  */
1803
1804 static unsigned char new_state[16] = {
1805   /* current state:        new state:      action:      */
1806   /* (Invalid)          */ TCP_CLOSE,
1807   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1808   /* TCP_SYN_SENT       */ TCP_CLOSE,
1809   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1810   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1811   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1812   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1813   /* TCP_CLOSE          */ TCP_CLOSE,
1814   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1815   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1816   /* TCP_LISTEN         */ TCP_CLOSE,
1817   /* TCP_CLOSING        */ TCP_CLOSING,
1818 };
1819
1820 static int tcp_close_state(struct sock *sk)
1821 {
1822         int next = (int)new_state[sk->sk_state];
1823         int ns = next & TCP_STATE_MASK;
1824
1825         tcp_set_state(sk, ns);
1826
1827         return next & TCP_ACTION_FIN;
1828 }
1829
1830 /*
1831  *      Shutdown the sending side of a connection. Much like close except
1832  *      that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1833  */
1834
1835 void tcp_shutdown(struct sock *sk, int how)
1836 {
1837         /*      We need to grab some memory, and put together a FIN,
1838          *      and then put it into the queue to be sent.
1839          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1840          */
1841         if (!(how & SEND_SHUTDOWN))
1842                 return;
1843
1844         /* If we've already sent a FIN, or it's a closed state, skip this. */
1845         if ((1 << sk->sk_state) &
1846             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1847              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1848                 /* Clear out any half completed packets.  FIN if needed. */
1849                 if (tcp_close_state(sk))
1850                         tcp_send_fin(sk);
1851         }
1852 }
1853
1854
1855 /*
1856  *      Return 1 if we still have things to send in our buffers.
1857  */
1858
1859 static inline int closing(struct sock *sk)
1860 {
1861         return (1 << sk->sk_state) &
1862                (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
1863 }
1864
1865 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1866 {
1867         /* First the read buffer. */
1868         __skb_queue_purge(&sk->sk_receive_queue);
1869
1870         /* Next, the error queue. */
1871         __skb_queue_purge(&sk->sk_error_queue);
1872
1873         /* Next, the write queue. */
1874         BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
1875
1876         /* Account for returned memory. */
1877         tcp_mem_reclaim(sk);
1878
1879         BUG_TRAP(!sk->sk_wmem_queued);
1880         BUG_TRAP(!sk->sk_forward_alloc);
1881
1882         /* It is _impossible_ for the backlog to contain anything
1883          * when we get here.  All user references to this socket
1884          * have gone away, only the net layer knows can touch it.
1885          */
1886 }
1887
1888 /*
1889  * At this point, there should be no process reference to this
1890  * socket, and thus no user references at all.  Therefore we
1891  * can assume the socket waitqueue is inactive and nobody will
1892  * try to jump onto it.
1893  */
1894 void tcp_destroy_sock(struct sock *sk)
1895 {
1896         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1897         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1898
1899         /* It cannot be in hash table! */
1900         BUG_TRAP(sk_unhashed(sk));
1901
1902         /* If it has not 0 inet_sk(sk)->num, it must be bound */
1903         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1904
1905 #ifdef TCP_DEBUG
1906         if (sk->sk_zapped) {
1907                 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1908                 sock_hold(sk);
1909         }
1910         sk->sk_zapped = 1;
1911 #endif
1912
1913         sk->sk_prot->destroy(sk);
1914
1915         tcp_kill_sk_queues(sk);
1916
1917         xfrm_sk_free_policy(sk);
1918
1919 #ifdef INET_REFCNT_DEBUG
1920         if (atomic_read(&sk->sk_refcnt) != 1) {
1921                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1922                        sk, atomic_read(&sk->sk_refcnt));
1923         }
1924 #endif
1925
1926         atomic_dec(&tcp_orphan_count);
1927         sock_put(sk);
1928 }
1929
1930 void tcp_close(struct sock *sk, long timeout)
1931 {
1932         struct sk_buff *skb;
1933         int data_was_unread = 0;
1934
1935         lock_sock(sk);
1936         sk->sk_shutdown = SHUTDOWN_MASK;
1937
1938         if (sk->sk_state == TCP_LISTEN) {
1939                 tcp_set_state(sk, TCP_CLOSE);
1940
1941                 /* Special case. */
1942                 tcp_listen_stop(sk);
1943
1944                 goto adjudge_to_death;
1945         }
1946
1947         /*  We need to flush the recv. buffs.  We do this only on the
1948          *  descriptor close, not protocol-sourced closes, because the
1949          *  reader process may not have drained the data yet!
1950          */
1951         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1952                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1953                           skb->h.th->fin;
1954                 data_was_unread += len;
1955                 __kfree_skb(skb);
1956         }
1957
1958         tcp_mem_reclaim(sk);
1959
1960         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1961          * 3.10, we send a RST here because data was lost.  To
1962          * witness the awful effects of the old behavior of always
1963          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1964          * a bulk GET in an FTP client, suspend the process, wait
1965          * for the client to advertise a zero window, then kill -9
1966          * the FTP client, wheee...  Note: timeout is always zero
1967          * in such a case.
1968          */
1969         if (data_was_unread) {
1970                 /* Unread data was tossed, zap the connection. */
1971                 NET_INC_STATS_USER(TCPAbortOnClose);
1972                 tcp_set_state(sk, TCP_CLOSE);
1973                 tcp_send_active_reset(sk, GFP_KERNEL);
1974         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1975                 /* Check zero linger _after_ checking for unread data. */
1976                 sk->sk_prot->disconnect(sk, 0);
1977                 NET_INC_STATS_USER(TCPAbortOnData);
1978         } else if (tcp_close_state(sk)) {
1979                 /* We FIN if the application ate all the data before
1980                  * zapping the connection.
1981                  */
1982
1983                 /* RED-PEN. Formally speaking, we have broken TCP state
1984                  * machine. State transitions:
1985                  *
1986                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1987                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1988                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1989                  *
1990                  * are legal only when FIN has been sent (i.e. in window),
1991                  * rather than queued out of window. Purists blame.
1992                  *
1993                  * F.e. "RFC state" is ESTABLISHED,
1994                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1995                  *
1996                  * The visible declinations are that sometimes
1997                  * we enter time-wait state, when it is not required really
1998                  * (harmless), do not send active resets, when they are
1999                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
2000                  * they look as CLOSING or LAST_ACK for Linux)
2001                  * Probably, I missed some more holelets.
2002                  *                                              --ANK
2003                  */
2004                 tcp_send_fin(sk);
2005         }
2006
2007         if (timeout) {
2008                 struct task_struct *tsk = current;
2009                 DEFINE_WAIT(wait);
2010
2011                 do {
2012                         prepare_to_wait(sk->sk_sleep, &wait,
2013                                         TASK_INTERRUPTIBLE);
2014                         if (!closing(sk))
2015                                 break;
2016                         release_sock(sk);
2017                         timeout = schedule_timeout(timeout);
2018                         lock_sock(sk);
2019                 } while (!signal_pending(tsk) && timeout);
2020
2021                 finish_wait(sk->sk_sleep, &wait);
2022         }
2023
2024 adjudge_to_death:
2025         /* It is the last release_sock in its life. It will remove backlog. */
2026         release_sock(sk);
2027
2028
2029         /* Now socket is owned by kernel and we acquire BH lock
2030            to finish close. No need to check for user refs.
2031          */
2032         local_bh_disable();
2033         bh_lock_sock(sk);
2034         BUG_TRAP(!sock_owned_by_user(sk));
2035
2036         sock_hold(sk);
2037         sock_orphan(sk);
2038
2039         /*      This is a (useful) BSD violating of the RFC. There is a
2040          *      problem with TCP as specified in that the other end could
2041          *      keep a socket open forever with no application left this end.
2042          *      We use a 3 minute timeout (about the same as BSD) then kill
2043          *      our end. If they send after that then tough - BUT: long enough
2044          *      that we won't make the old 4*rto = almost no time - whoops
2045          *      reset mistake.
2046          *
2047          *      Nope, it was not mistake. It is really desired behaviour
2048          *      f.e. on http servers, when such sockets are useless, but
2049          *      consume significant resources. Let's do it with special
2050          *      linger2 option.                                 --ANK
2051          */
2052
2053         if (sk->sk_state == TCP_FIN_WAIT2) {
2054                 struct tcp_opt *tp = tcp_sk(sk);
2055                 if (tp->linger2 < 0) {
2056                         tcp_set_state(sk, TCP_CLOSE);
2057                         tcp_send_active_reset(sk, GFP_ATOMIC);
2058                         NET_INC_STATS_BH(TCPAbortOnLinger);
2059                 } else {
2060                         int tmo = tcp_fin_time(tp);
2061
2062                         if (tmo > TCP_TIMEWAIT_LEN) {
2063                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2064                         } else {
2065                                 atomic_inc(&tcp_orphan_count);
2066                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2067                                 goto out;
2068                         }
2069                 }
2070         }
2071         if (sk->sk_state != TCP_CLOSE) {
2072                 tcp_mem_reclaim(sk);
2073                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2074                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
2075                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2076                         if (net_ratelimit())
2077                                 printk(KERN_INFO "TCP: too many of orphaned "
2078                                        "sockets\n");
2079                         tcp_set_state(sk, TCP_CLOSE);
2080                         tcp_send_active_reset(sk, GFP_ATOMIC);
2081                         NET_INC_STATS_BH(TCPAbortOnMemory);
2082                 }
2083         }
2084         atomic_inc(&tcp_orphan_count);
2085
2086         if (sk->sk_state == TCP_CLOSE)
2087                 tcp_destroy_sock(sk);
2088         /* Otherwise, socket is reprieved until protocol close. */
2089
2090 out:
2091         bh_unlock_sock(sk);
2092         local_bh_enable();
2093         sock_put(sk);
2094 }
2095
2096 /* These states need RST on ABORT according to RFC793 */
2097
2098 static inline int tcp_need_reset(int state)
2099 {
2100         return (1 << state) &
2101                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2102                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2103 }
2104
2105 int tcp_disconnect(struct sock *sk, int flags)
2106 {
2107         struct inet_opt *inet = inet_sk(sk);
2108         struct tcp_opt *tp = tcp_sk(sk);
2109         int err = 0;
2110         int old_state = sk->sk_state;
2111
2112         if (old_state != TCP_CLOSE)
2113                 tcp_set_state(sk, TCP_CLOSE);
2114
2115         /* ABORT function of RFC793 */
2116         if (old_state == TCP_LISTEN) {
2117                 tcp_listen_stop(sk);
2118         } else if (tcp_need_reset(old_state) ||
2119                    (tp->snd_nxt != tp->write_seq &&
2120                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2121                 /* The last check adjusts for discrepance of Linux wrt. RFC
2122                  * states
2123                  */
2124                 tcp_send_active_reset(sk, gfp_any());
2125                 sk->sk_err = ECONNRESET;
2126         } else if (old_state == TCP_SYN_SENT)
2127                 sk->sk_err = ECONNRESET;
2128
2129         tcp_clear_xmit_timers(sk);
2130         __skb_queue_purge(&sk->sk_receive_queue);
2131         tcp_writequeue_purge(sk);
2132         __skb_queue_purge(&tp->out_of_order_queue);
2133
2134         inet->dport = 0;
2135
2136         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2137                 inet_reset_saddr(sk);
2138
2139         sk->sk_shutdown = 0;
2140         sock_reset_flag(sk, SOCK_DONE);
2141         tp->srtt = 0;
2142         if ((tp->write_seq += tp->max_window + 2) == 0)
2143                 tp->write_seq = 1;
2144         tp->backoff = 0;
2145         tp->snd_cwnd = 2;
2146         tp->probes_out = 0;
2147         tp->packets_out = 0;
2148         tp->snd_ssthresh = 0x7fffffff;
2149         tp->snd_cwnd_cnt = 0;
2150         tcp_set_ca_state(tp, TCP_CA_Open);
2151         tcp_clear_retrans(tp);
2152         tcp_delack_init(tp);
2153         tp->send_head = NULL;
2154         tp->saw_tstamp = 0;
2155         tcp_sack_reset(tp);
2156         __sk_dst_reset(sk);
2157
2158         BUG_TRAP(!inet->num || tp->bind_hash);
2159
2160         sk->sk_error_report(sk);
2161         return err;
2162 }
2163
2164 /*
2165  *      Wait for an incoming connection, avoid race
2166  *      conditions. This must be called with the socket locked.
2167  */
2168 static int wait_for_connect(struct sock *sk, long timeo)
2169 {
2170         struct tcp_opt *tp = tcp_sk(sk);
2171         DEFINE_WAIT(wait);
2172         int err;
2173
2174         /*
2175          * True wake-one mechanism for incoming connections: only
2176          * one process gets woken up, not the 'whole herd'.
2177          * Since we do not 'race & poll' for established sockets
2178          * anymore, the common case will execute the loop only once.
2179          *
2180          * Subtle issue: "add_wait_queue_exclusive()" will be added
2181          * after any current non-exclusive waiters, and we know that
2182          * it will always _stay_ after any new non-exclusive waiters
2183          * because all non-exclusive waiters are added at the
2184          * beginning of the wait-queue. As such, it's ok to "drop"
2185          * our exclusiveness temporarily when we get woken up without
2186          * having to remove and re-insert us on the wait queue.
2187          */
2188         for (;;) {
2189                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
2190                                           TASK_INTERRUPTIBLE);
2191                 release_sock(sk);
2192                 if (!tp->accept_queue)
2193                         timeo = schedule_timeout(timeo);
2194                 lock_sock(sk);
2195                 err = 0;
2196                 if (tp->accept_queue)
2197                         break;
2198                 err = -EINVAL;
2199                 if (sk->sk_state != TCP_LISTEN)
2200                         break;
2201                 err = sock_intr_errno(timeo);
2202                 if (signal_pending(current))
2203                         break;
2204                 err = -EAGAIN;
2205                 if (!timeo)
2206                         break;
2207         }
2208         finish_wait(sk->sk_sleep, &wait);
2209         return err;
2210 }
2211
2212 /*
2213  *      This will accept the next outstanding connection.
2214  */
2215
2216 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2217 {
2218         struct tcp_opt *tp = tcp_sk(sk);
2219         struct open_request *req;
2220         struct sock *newsk;
2221         int error;
2222 #ifdef CONFIG_ACCEPT_QUEUES
2223         int prev_class = 0;
2224         int first;
2225 #endif
2226
2227         lock_sock(sk);
2228
2229         /* We need to make sure that this socket is listening,
2230          * and that it has something pending.
2231          */
2232         error = -EINVAL;
2233         if (sk->sk_state != TCP_LISTEN)
2234                 goto out;
2235
2236         /* Find already established connection */
2237         if (!tp->accept_queue) {
2238                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2239                 /* If this is a non blocking socket don't sleep */
2240                 error = -EAGAIN;
2241                 if (!timeo)
2242                         goto out;
2243
2244                 error = wait_for_connect(sk, timeo);
2245                 if (error)
2246                         goto out;
2247         }
2248
2249 #ifndef CONFIG_ACCEPT_QUEUES
2250         req = tp->accept_queue;
2251         if ((tp->accept_queue = req->dl_next) == NULL)
2252                 tp->accept_queue_tail = NULL;
2253
2254         tcp_acceptq_removed(sk);
2255 #else
2256         first = tp->class_index;
2257         /* We should always have  request queued here. The accept_queue
2258          * is already checked for NULL above.
2259          */
2260         while(!tp->acceptq[first].aq_head) {
2261                 tp->acceptq[first].aq_cnt = 0;
2262                 first = (first+1) & ~NUM_ACCEPT_QUEUES;
2263         }
2264         req = tp->acceptq[first].aq_head;
2265         tp->acceptq[first].aq_qcount--;
2266         tp->acceptq[first].aq_count++;
2267         tp->acceptq[first].aq_wait_time+=(jiffies - req->acceptq_time_stamp);
2268
2269         for (prev_class= first-1 ; prev_class >=0; prev_class--)
2270                 if (tp->acceptq[prev_class].aq_tail)
2271                         break;
2272         if (prev_class>=0)
2273                 tp->acceptq[prev_class].aq_tail->dl_next = req->dl_next;
2274         else
2275                 tp->accept_queue = req->dl_next;
2276
2277         if (req == tp->acceptq[first].aq_tail)
2278                 tp->acceptq[first].aq_head = tp->acceptq[first].aq_tail = NULL;
2279         else
2280                 tp->acceptq[first].aq_head = req->dl_next;
2281
2282         if((++(tp->acceptq[first].aq_cnt)) >= tp->acceptq[first].aq_ratio){
2283                 tp->acceptq[first].aq_cnt = 0;
2284                 tp->class_index = ++first & ~NUM_ACCEPT_QUEUES;
2285         }
2286         tcp_acceptq_removed(sk, req->acceptq_class);
2287 #endif
2288         newsk = req->sk;
2289
2290         /* MEF REVISIT: The following sk_acceptq_removed(sk); wasn't
2291            in CKRM E13, but a latter patch should fix this
2292            properly.
2293          */
2294         sk_acceptq_removed(sk);
2295         tcp_openreq_fastfree(req);
2296         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2297         release_sock(sk);
2298         return newsk;
2299
2300 out:
2301         release_sock(sk);
2302         *err = error;
2303         return NULL;
2304 }
2305
2306 /*
2307  *      Socket option code for TCP.
2308  */
2309 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2310                    int optlen)
2311 {
2312         struct tcp_opt *tp = tcp_sk(sk);
2313         int val;
2314         int err = 0;
2315
2316         if (level != SOL_TCP)
2317                 return tp->af_specific->setsockopt(sk, level, optname,
2318                                                    optval, optlen);
2319
2320         if (optlen < sizeof(int))
2321                 return -EINVAL;
2322
2323         if (get_user(val, (int __user *)optval))
2324                 return -EFAULT;
2325
2326         lock_sock(sk);
2327
2328         switch (optname) {
2329         case TCP_MAXSEG:
2330                 /* Values greater than interface MTU won't take effect. However
2331                  * at the point when this call is done we typically don't yet
2332                  * know which interface is going to be used */
2333                 if (val < 8 || val > MAX_TCP_WINDOW) {
2334                         err = -EINVAL;
2335                         break;
2336                 }
2337                 tp->user_mss = val;
2338                 break;
2339
2340         case TCP_NODELAY:
2341                 if (val) {
2342                         /* TCP_NODELAY is weaker than TCP_CORK, so that
2343                          * this option on corked socket is remembered, but
2344                          * it is not activated until cork is cleared.
2345                          *
2346                          * However, when TCP_NODELAY is set we make
2347                          * an explicit push, which overrides even TCP_CORK
2348                          * for currently queued segments.
2349                          */
2350                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2351                         tcp_push_pending_frames(sk, tp);
2352                 } else {
2353                         tp->nonagle &= ~TCP_NAGLE_OFF;
2354                 }
2355                 break;
2356
2357         case TCP_CORK:
2358                 /* When set indicates to always queue non-full frames.
2359                  * Later the user clears this option and we transmit
2360                  * any pending partial frames in the queue.  This is
2361                  * meant to be used alongside sendfile() to get properly
2362                  * filled frames when the user (for example) must write
2363                  * out headers with a write() call first and then use
2364                  * sendfile to send out the data parts.
2365                  *
2366                  * TCP_CORK can be set together with TCP_NODELAY and it is
2367                  * stronger than TCP_NODELAY.
2368                  */
2369                 if (val) {
2370                         tp->nonagle |= TCP_NAGLE_CORK;
2371                 } else {
2372                         tp->nonagle &= ~TCP_NAGLE_CORK;
2373                         if (tp->nonagle&TCP_NAGLE_OFF)
2374                                 tp->nonagle |= TCP_NAGLE_PUSH;
2375                         tcp_push_pending_frames(sk, tp);
2376                 }
2377                 break;
2378
2379         case TCP_KEEPIDLE:
2380                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2381                         err = -EINVAL;
2382                 else {
2383                         tp->keepalive_time = val * HZ;
2384                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2385                             !((1 << sk->sk_state) &
2386                               (TCPF_CLOSE | TCPF_LISTEN))) {
2387                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2388                                 if (tp->keepalive_time > elapsed)
2389                                         elapsed = tp->keepalive_time - elapsed;
2390                                 else
2391                                         elapsed = 0;
2392                                 tcp_reset_keepalive_timer(sk, elapsed);
2393                         }
2394                 }
2395                 break;
2396         case TCP_KEEPINTVL:
2397                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2398                         err = -EINVAL;
2399                 else
2400                         tp->keepalive_intvl = val * HZ;
2401                 break;
2402         case TCP_KEEPCNT:
2403                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2404                         err = -EINVAL;
2405                 else
2406                         tp->keepalive_probes = val;
2407                 break;
2408         case TCP_SYNCNT:
2409                 if (val < 1 || val > MAX_TCP_SYNCNT)
2410                         err = -EINVAL;
2411                 else
2412                         tp->syn_retries = val;
2413                 break;
2414
2415         case TCP_LINGER2:
2416                 if (val < 0)
2417                         tp->linger2 = -1;
2418                 else if (val > sysctl_tcp_fin_timeout / HZ)
2419                         tp->linger2 = 0;
2420                 else
2421                         tp->linger2 = val * HZ;
2422                 break;
2423
2424         case TCP_DEFER_ACCEPT:
2425                 tp->defer_accept = 0;
2426                 if (val > 0) {
2427                         /* Translate value in seconds to number of
2428                          * retransmits */
2429                         while (tp->defer_accept < 32 &&
2430                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2431                                        tp->defer_accept))
2432                                 tp->defer_accept++;
2433                         tp->defer_accept++;
2434                 }
2435                 break;
2436
2437         case TCP_WINDOW_CLAMP:
2438                 if (!val) {
2439                         if (sk->sk_state != TCP_CLOSE) {
2440                                 err = -EINVAL;
2441                                 break;
2442                         }
2443                         tp->window_clamp = 0;
2444                 } else
2445                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2446                                                 SOCK_MIN_RCVBUF / 2 : val;
2447                 break;
2448
2449         case TCP_QUICKACK:
2450                 if (!val) {
2451                         tp->ack.pingpong = 1;
2452                 } else {
2453                         tp->ack.pingpong = 0;
2454                         if ((1 << sk->sk_state) &
2455                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2456                             tcp_ack_scheduled(tp)) {
2457                                 tp->ack.pending |= TCP_ACK_PUSHED;
2458                                 cleanup_rbuf(sk, 1);
2459                                 if (!(val & 1))
2460                                         tp->ack.pingpong = 1;
2461                         }
2462                 }
2463                 break;
2464
2465 #ifdef CONFIG_ACCEPT_QUEUES
2466         case TCP_ACCEPTQ_SHARE:
2467                 {
2468                         char share_wt[NUM_ACCEPT_QUEUES];
2469                         int i,j;
2470
2471                         if (sk->sk_state != TCP_LISTEN)
2472                                 return -EOPNOTSUPP;
2473
2474                         if (copy_from_user(share_wt,optval, optlen)) {
2475                                 err = -EFAULT;
2476                                 break;
2477                         }
2478                         j = 0;
2479                         for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
2480                                 if (share_wt[i]) {
2481                                         if (!j)
2482                                                 j = share_wt[i];
2483                                         else if (share_wt[i] < j) {
2484                                                 j = share_wt[i];
2485                                         }
2486                                         tp->acceptq[i].aq_valid = 1;
2487                                 }
2488                                 else
2489                                         tp->acceptq[i].aq_valid = 0;
2490
2491                         }
2492                         if (j == 0) {
2493                                 /* Class 0 is always valid. If nothing is
2494                                  * specified set class 0 as 1.
2495                                  */
2496                                 share_wt[0] = 1;
2497                                 tp->acceptq[0].aq_valid = 1;
2498                                 j = 1;
2499                         }
2500                         for (i=0; i < NUM_ACCEPT_QUEUES; i++)  {
2501                                 tp->acceptq[i].aq_ratio = share_wt[i]/j;
2502                                 tp->acceptq[i].aq_cnt = 0;
2503                         }
2504                 }
2505                 break;
2506 #endif
2507
2508         default:
2509                 err = -ENOPROTOOPT;
2510                 break;
2511         };
2512         release_sock(sk);
2513         return err;
2514 }
2515
2516 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2517                    int __user *optlen)
2518 {
2519         struct tcp_opt *tp = tcp_sk(sk);
2520         int val, len;
2521
2522         if (level != SOL_TCP)
2523                 return tp->af_specific->getsockopt(sk, level, optname,
2524                                                    optval, optlen);
2525
2526         if (get_user(len, optlen))
2527                 return -EFAULT;
2528
2529         len = min_t(unsigned int, len, sizeof(int));
2530
2531         if (len < 0)
2532                 return -EINVAL;
2533
2534         switch (optname) {
2535         case TCP_MAXSEG:
2536                 val = tp->mss_cache_std;
2537                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2538                         val = tp->user_mss;
2539                 break;
2540         case TCP_NODELAY:
2541                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2542                 break;
2543         case TCP_CORK:
2544                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2545                 break;
2546         case TCP_KEEPIDLE:
2547                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2548                 break;
2549         case TCP_KEEPINTVL:
2550                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2551                 break;
2552         case TCP_KEEPCNT:
2553                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2554                 break;
2555         case TCP_SYNCNT:
2556                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2557                 break;
2558         case TCP_LINGER2:
2559                 val = tp->linger2;
2560                 if (val >= 0)
2561                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2562                 break;
2563         case TCP_DEFER_ACCEPT:
2564                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2565                                                (tp->defer_accept - 1));
2566                 break;
2567         case TCP_WINDOW_CLAMP:
2568                 val = tp->window_clamp;
2569                 break;
2570         case TCP_INFO: {
2571                 struct tcp_info info;
2572
2573                 if (get_user(len, optlen))
2574                         return -EFAULT;
2575
2576                 tcp_get_info(sk, &info);
2577
2578                 len = min_t(unsigned int, len, sizeof(info));
2579                 if (put_user(len, optlen))
2580                         return -EFAULT;
2581                 if (copy_to_user(optval, &info, len))
2582                         return -EFAULT;
2583                 return 0;
2584         }
2585         case TCP_QUICKACK:
2586                 val = !tp->ack.pingpong;
2587                 break;
2588
2589 #ifdef CONFIG_ACCEPT_QUEUES
2590         case TCP_ACCEPTQ_SHARE: {
2591                 struct tcp_acceptq_info tinfo[NUM_ACCEPT_QUEUES];
2592                 int i;
2593
2594                 if (sk->sk_state != TCP_LISTEN)
2595                         return -EOPNOTSUPP;
2596
2597                 if (get_user(len, optlen))
2598                         return -EFAULT;
2599
2600                 memset(tinfo, 0, sizeof(tinfo));
2601
2602                 for(i=0; i < NUM_ACCEPT_QUEUES; i++) {
2603                         tinfo[i].acceptq_wait_time =
2604                                 tp->acceptq[i].aq_wait_time/(HZ/USER_HZ);
2605                         tinfo[i].acceptq_qcount = tp->acceptq[i].aq_qcount;
2606                         tinfo[i].acceptq_count = tp->acceptq[i].aq_count;
2607                         if (tp->acceptq[i].aq_valid)
2608                                 tinfo[i].acceptq_shares=tp->acceptq[i].aq_ratio;
2609                         else
2610                                 tinfo[i].acceptq_shares = 0;
2611                 }
2612
2613                 len = min_t(unsigned int, len, sizeof(tinfo));
2614                 if (put_user(len, optlen))
2615                         return -EFAULT;
2616
2617                 if (copy_to_user(optval, (char *)tinfo, len))
2618                         return -EFAULT;
2619
2620                 return 0;
2621         }
2622 #endif
2623         default:
2624                 return -ENOPROTOOPT;
2625         };
2626
2627         if (put_user(len, optlen))
2628                 return -EFAULT;
2629         if (copy_to_user(optval, &val, len))
2630                 return -EFAULT;
2631         return 0;
2632 }
2633
2634
2635 extern void __skb_cb_too_small_for_tcp(int, int);
2636 extern void tcpdiag_init(void);
2637
2638 static __initdata unsigned long thash_entries;
2639 static int __init set_thash_entries(char *str)
2640 {
2641         if (!str)
2642                 return 0;
2643         thash_entries = simple_strtoul(str, &str, 0);
2644         return 1;
2645 }
2646 __setup("thash_entries=", set_thash_entries);
2647
2648 void __init tcp_init(void)
2649 {
2650         struct sk_buff *skb = NULL;
2651         unsigned long goal;
2652         int order, i;
2653
2654         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2655                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2656                                            sizeof(skb->cb));
2657
2658         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2659                                                    sizeof(struct open_request),
2660                                                0, SLAB_HWCACHE_ALIGN,
2661                                                NULL, NULL);
2662         if (!tcp_openreq_cachep)
2663                 panic("tcp_init: Cannot alloc open_request cache.");
2664
2665         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2666                                               sizeof(struct tcp_bind_bucket),
2667                                               0, SLAB_HWCACHE_ALIGN,
2668                                               NULL, NULL);
2669         if (!tcp_bucket_cachep)
2670                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2671
2672         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2673                                                 sizeof(struct tcp_tw_bucket),
2674                                                 0, SLAB_HWCACHE_ALIGN,
2675                                                 NULL, NULL);
2676         if (!tcp_timewait_cachep)
2677                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2678
2679         /* Size and allocate the main established and bind bucket
2680          * hash tables.
2681          *
2682          * The methodology is similar to that of the buffer cache.
2683          */
2684         if (num_physpages >= (128 * 1024))
2685                 goal = num_physpages >> (21 - PAGE_SHIFT);
2686         else
2687                 goal = num_physpages >> (23 - PAGE_SHIFT);
2688
2689         if (thash_entries)
2690                 goal = (thash_entries * sizeof(struct tcp_ehash_bucket)) >> PAGE_SHIFT;
2691         for (order = 0; (1UL << order) < goal; order++)
2692                 ;
2693         do {
2694                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2695                         sizeof(struct tcp_ehash_bucket);
2696                 tcp_ehash_size >>= 1;
2697                 while (tcp_ehash_size & (tcp_ehash_size - 1))
2698                         tcp_ehash_size--;
2699                 tcp_ehash = (struct tcp_ehash_bucket *)
2700                         __get_free_pages(GFP_ATOMIC, order);
2701         } while (!tcp_ehash && --order > 0);
2702
2703         if (!tcp_ehash)
2704                 panic("Failed to allocate TCP established hash table\n");
2705         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2706                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2707                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2708         }
2709
2710         do {
2711                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2712                         sizeof(struct tcp_bind_hashbucket);
2713                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2714                         continue;
2715                 tcp_bhash = (struct tcp_bind_hashbucket *)
2716                         __get_free_pages(GFP_ATOMIC, order);
2717         } while (!tcp_bhash && --order >= 0);
2718
2719         if (!tcp_bhash)
2720                 panic("Failed to allocate TCP bind hash table\n");
2721         for (i = 0; i < tcp_bhash_size; i++) {
2722                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2723                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2724         }
2725
2726         /* Try to be a bit smarter and adjust defaults depending
2727          * on available memory.
2728          */
2729         if (order > 4) {
2730                 sysctl_local_port_range[0] = 32768;
2731                 sysctl_local_port_range[1] = 61000;
2732                 sysctl_tcp_max_tw_buckets = 180000;
2733                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2734                 sysctl_max_syn_backlog = 1024;
2735         } else if (order < 3) {
2736                 sysctl_local_port_range[0] = 1024 * (3 - order);
2737                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2738                 sysctl_tcp_max_orphans >>= (3 - order);
2739                 sysctl_max_syn_backlog = 128;
2740         }
2741         tcp_port_rover = sysctl_local_port_range[0] - 1;
2742
2743         sysctl_tcp_mem[0] =  768 << order;
2744         sysctl_tcp_mem[1] = 1024 << order;
2745         sysctl_tcp_mem[2] = 1536 << order;
2746
2747         if (order < 3) {
2748                 sysctl_tcp_wmem[2] = 64 * 1024;
2749                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2750                 sysctl_tcp_rmem[1] = 43689;
2751                 sysctl_tcp_rmem[2] = 2 * 43689;
2752         }
2753
2754         printk(KERN_INFO "TCP: Hash tables configured "
2755                "(established %d bind %d)\n",
2756                tcp_ehash_size << 1, tcp_bhash_size);
2757
2758         tcpdiag_init();
2759 }
2760
2761 EXPORT_SYMBOL(__tcp_mem_reclaim);
2762 EXPORT_SYMBOL(sysctl_tcp_rmem);
2763 EXPORT_SYMBOL(sysctl_tcp_wmem);
2764 EXPORT_SYMBOL(tcp_accept);
2765 EXPORT_SYMBOL(tcp_close);
2766 EXPORT_SYMBOL(tcp_close_state);
2767 EXPORT_SYMBOL(tcp_destroy_sock);
2768 EXPORT_SYMBOL(tcp_disconnect);
2769 EXPORT_SYMBOL(tcp_getsockopt);
2770 EXPORT_SYMBOL(tcp_ioctl);
2771 EXPORT_SYMBOL(tcp_openreq_cachep);
2772 EXPORT_SYMBOL(tcp_poll);
2773 EXPORT_SYMBOL(tcp_read_sock);
2774 EXPORT_SYMBOL(tcp_recvmsg);
2775 EXPORT_SYMBOL(tcp_sendmsg);
2776 EXPORT_SYMBOL(tcp_sendpage);
2777 EXPORT_SYMBOL(tcp_setsockopt);
2778 EXPORT_SYMBOL(tcp_shutdown);
2779 EXPORT_SYMBOL(tcp_sockets_allocated);
2780 EXPORT_SYMBOL(tcp_statistics);
2781 EXPORT_SYMBOL(tcp_timewait_cachep);
2782 EXPORT_SYMBOL_GPL(cleanup_rbuf);