[linux-2.6.git] net/ipv4/tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFCs. For other useful protocol
138  *                                      references see Comer and KA9Q NOS; and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works, see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or (at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
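/*
 * Example (for orientation only, not exhaustive): an active close from
 * ESTABLISHED normally walks FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT, while
 * the passive side walks CLOSE_WAIT -> LAST_ACK -> TCP_CLOSE.  If both
 * ends close simultaneously, FIN_WAIT1 passes through CLOSING on its way
 * to TIME_WAIT instead.
 */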
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259
260 #ifdef CONFIG_CKRM
261 #include <linux/ckrm.h>
262 #endif
263
264 #include <net/icmp.h>
265 #include <net/tcp.h>
266 #include <net/xfrm.h>
267 #include <net/ip.h>
268
269
270 #include <asm/uaccess.h>
271 #include <asm/ioctls.h>
272
273 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
274
275 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
276
277 kmem_cache_t *tcp_openreq_cachep;
278 kmem_cache_t *tcp_bucket_cachep;
279 kmem_cache_t *tcp_timewait_cachep;
280
281 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
282
283 int sysctl_tcp_default_win_scale = 7;
284 int sysctl_tcp_mem[3];
285 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
286 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
287
288 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
289 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
290
291 /* Pressure flag: try to collapse.
292  * Technical note: it is used by multiple contexts non-atomically.
293  * All of tcp_mem_schedule() is of this nature: accounting
294  * is strict, actions are advisory and have some latency. */
295 int tcp_memory_pressure;
296
297 #define TCP_PAGES(amt) (((amt) + TCP_MEM_QUANTUM - 1) / TCP_MEM_QUANTUM)
298
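/* Charge "size" bytes of buffer space to a socket, in whole TCP_MEM_QUANTUM
 * units, against the global tcp_memory_allocated pool.  "kind" selects the
 * per-socket limit to check: non-zero for receive buffers (sysctl_tcp_rmem),
 * zero for send buffers (sysctl_tcp_wmem).  Returns 1 if the charge is
 * accepted, 0 if it is refused (in which case the charge is undone before
 * returning).
 */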
299 int tcp_mem_schedule(struct sock *sk, int size, int kind)
300 {
301         int amt = TCP_PAGES(size);
302
303         sk->sk_forward_alloc += amt * TCP_MEM_QUANTUM;
304         atomic_add(amt, &tcp_memory_allocated);
305
306         /* Under limit. */
307         if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
308                 if (tcp_memory_pressure)
309                         tcp_memory_pressure = 0;
310                 return 1;
311         }
312
313         /* Over hard limit. */
314         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
315                 tcp_enter_memory_pressure();
316                 goto suppress_allocation;
317         }
318
319         /* Under pressure. */
320         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
321                 tcp_enter_memory_pressure();
322
323         if (kind) {
324                 if (atomic_read(&sk->sk_rmem_alloc) < sysctl_tcp_rmem[0])
325                         return 1;
326         } else if (sk->sk_wmem_queued < sysctl_tcp_wmem[0])
327                 return 1;
328
329         if (!tcp_memory_pressure ||
330             sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) *
331                                 TCP_PAGES(sk->sk_wmem_queued +
332                                           atomic_read(&sk->sk_rmem_alloc) +
333                                           sk->sk_forward_alloc))
334                 return 1;
335
336 suppress_allocation:
337
338         if (!kind) {
339                 tcp_moderate_sndbuf(sk);
340
341                 /* Fail only if socket is _under_ its sndbuf.
342                  * In this case we cannot block, so that we have to fail.
343                  */
344                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
345                         return 1;
346         }
347
348         /* Alas. Undo changes. */
349         sk->sk_forward_alloc -= amt * TCP_MEM_QUANTUM;
350         atomic_sub(amt, &tcp_memory_allocated);
351         return 0;
352 }
353
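/* Return any whole quanta sitting in sk->sk_forward_alloc to the global
 * tcp_memory_allocated pool, keeping only the sub-quantum remainder, and
 * clear the pressure flag if this brings us back under sysctl_tcp_mem[0].
 */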
354 void __tcp_mem_reclaim(struct sock *sk)
355 {
356         if (sk->sk_forward_alloc >= TCP_MEM_QUANTUM) {
357                 atomic_sub(sk->sk_forward_alloc / TCP_MEM_QUANTUM,
358                            &tcp_memory_allocated);
359                 sk->sk_forward_alloc &= TCP_MEM_QUANTUM - 1;
360                 if (tcp_memory_pressure &&
361                     atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
362                         tcp_memory_pressure = 0;
363         }
364 }
365
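/* skb destructor for receive buffers: uncharge the skb from the socket's
 * receive allocation and return its space to sk->sk_forward_alloc.
 */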
366 void tcp_rfree(struct sk_buff *skb)
367 {
368         struct sock *sk = skb->sk;
369
370         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
371         sk->sk_forward_alloc += skb->truesize;
372 }
373
374 /*
375  * LISTEN is a special case for poll..
376  */
377 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
378                                                poll_table *wait)
379 {
380         return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
381 }
382
383 /*
384  *      Wait for a TCP event.
385  *
386  *      Note that we don't need to lock the socket, as the upper poll layers
387  *      take care of normal races (between the test and the event) and we don't
388  *      go look at any of the socket buffers directly.
389  */
390 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
391 {
392         unsigned int mask;
393         struct sock *sk = sock->sk;
394         struct tcp_opt *tp = tcp_sk(sk);
395
396         poll_wait(file, sk->sk_sleep, wait);
397         if (sk->sk_state == TCP_LISTEN)
398                 return tcp_listen_poll(sk, wait);
399
400         /* Socket is not locked. We are protected from async events
401            by poll logic and correct handling of state changes
402            made by another threads is impossible in any case.
403          */
404
405         mask = 0;
406         if (sk->sk_err)
407                 mask = POLLERR;
408
409         /*
410          * POLLHUP is certainly not done right. But poll() doesn't
411          * have a notion of HUP in just one direction, and for a
412          * socket the read side is more interesting.
413          *
414          * Some poll() documentation says that POLLHUP is incompatible
415          * with the POLLOUT/POLLWR flags, so somebody should check this
416          * all. But careful, it tends to be safer to return too many
417          * bits than too few, and you can easily break real applications
418          * if you don't tell them that something has hung up!
419          *
420          * Check-me.
421          *
422          * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
423          * our fs/select.c). It means that after we received EOF,
424          * poll always returns immediately, making poll() on write() in
425          * state CLOSE_WAIT impossible. One solution is evident --- to set POLLHUP
426          * if and only if shutdown has been made in both directions.
427          * Actually, it is interesting to look at how Solaris and DUX
428          * solve this dilemma. I would prefer it if POLLHUP were maskable;
429          * then we could set it on SND_SHUTDOWN. BTW the examples given
430          * in Stevens' books assume exactly this behaviour, which explains
431          * why POLLHUP is incompatible with POLLOUT.    --ANK
432          *
433          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
434          * blocking on fresh not-connected or disconnected socket. --ANK
435          */
436         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
437                 mask |= POLLHUP;
438         if (sk->sk_shutdown & RCV_SHUTDOWN)
439                 mask |= POLLIN | POLLRDNORM;
440
441         /* Connected? */
442         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
443                 /* Potential race condition. If the read of tp below
444                  * escapes above sk->sk_state, we can be illegally awakened
445                  * in SYN_* states. */
446                 if ((tp->rcv_nxt != tp->copied_seq) &&
447                     (tp->urg_seq != tp->copied_seq ||
448                      tp->rcv_nxt != tp->copied_seq + 1 ||
449                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
450                         mask |= POLLIN | POLLRDNORM;
451
452                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
453                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
454                                 mask |= POLLOUT | POLLWRNORM;
455                         } else {  /* send SIGIO later */
456                                 set_bit(SOCK_ASYNC_NOSPACE,
457                                         &sk->sk_socket->flags);
458                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
459
460                                 /* Race breaker. If space is freed after
461                                  * wspace test but before the flags are set,
462                                  * IO signal will be lost.
463                                  */
464                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
465                                         mask |= POLLOUT | POLLWRNORM;
466                         }
467                 }
468
469                 if (tp->urg_data & TCP_URG_VALID)
470                         mask |= POLLPRI;
471         }
472         return mask;
473 }
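/* For reference, a userspace caller sees the mask computed above through
 * poll(2)/select(2); a minimal (hypothetical) polling loop might look
 * roughly like:
 *
 *      struct pollfd pfd = { .fd = sock_fd,
 *                            .events = POLLIN | POLLOUT | POLLPRI };
 *      if (poll(&pfd, 1, timeout_ms) > 0) {
 *              if (pfd.revents & POLLPRI)
 *                      ... urgent data: read with MSG_OOB ...
 *              if (pfd.revents & POLLIN)
 *                      ... read() will not block ...
 *              if (pfd.revents & (POLLHUP | POLLERR))
 *                      ... peer closed both ways, or socket error ...
 *      }
 *
 * sock_fd and timeout_ms are placeholder names for this sketch.
 */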
474
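/* ioctl() handler: SIOCINQ reports how many bytes can be read right now
 * (up to the urgent mark when out-of-band data is pending, and excluding a
 * queued FIN), SIOCATMARK whether the next read is at the urgent mark, and
 * SIOCOUTQ how many bytes in the write queue are still unacknowledged.
 */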
475 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
476 {
477         struct tcp_opt *tp = tcp_sk(sk);
478         int answ;
479
480         switch (cmd) {
481         case SIOCINQ:
482                 if (sk->sk_state == TCP_LISTEN)
483                         return -EINVAL;
484
485                 lock_sock(sk);
486                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
487                         answ = 0;
488                 else if (sock_flag(sk, SOCK_URGINLINE) ||
489                          !tp->urg_data ||
490                          before(tp->urg_seq, tp->copied_seq) ||
491                          !before(tp->urg_seq, tp->rcv_nxt)) {
492                         answ = tp->rcv_nxt - tp->copied_seq;
493
494                         /* Subtract 1, if FIN is in queue. */
495                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
496                                 answ -=
497                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
498                 } else
499                         answ = tp->urg_seq - tp->copied_seq;
500                 release_sock(sk);
501                 break;
502         case SIOCATMARK:
503                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
504                 break;
505         case SIOCOUTQ:
506                 if (sk->sk_state == TCP_LISTEN)
507                         return -EINVAL;
508
509                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
510                         answ = 0;
511                 else
512                         answ = tp->write_seq - tp->snd_una;
513                 break;
514         default:
515                 return -ENOIOCTLCMD;
516         };
517
518         return put_user(answ, (int __user *)arg);
519 }
520
521
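/* Move a socket into the LISTEN state: allocate and size the SYN queue
 * (struct tcp_listen_opt, sized from sysctl_max_syn_backlog), seed its hash,
 * flip the socket to TCP_LISTEN and (re)validate the local port.  Returns 0
 * on success, -ENOMEM or -EADDRINUSE on failure.
 */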
522 int tcp_listen_start(struct sock *sk)
523 {
524 #ifdef CONFIG_ACCEPT_QUEUES
525         int i = 0;
526 #endif
527         struct inet_opt *inet = inet_sk(sk);
528         struct tcp_opt *tp = tcp_sk(sk);
529         struct tcp_listen_opt *lopt;
530
531         sk->sk_max_ack_backlog = 0;
532         sk->sk_ack_backlog = 0;
533 #ifdef CONFIG_ACCEPT_QUEUES
534         tp->accept_queue = NULL;
535 #else
536         tp->accept_queue = tp->accept_queue_tail = NULL;
537 #endif 
538         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
539         tcp_delack_init(tp);
540
541         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
542         if (!lopt)
543                 return -ENOMEM;
544
545         memset(lopt, 0, sizeof(struct tcp_listen_opt));
546         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
547                 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
548                         break;
549         get_random_bytes(&lopt->hash_rnd, 4);
550
551 #ifdef CONFIG_ACCEPT_QUEUES
552         tp->class_index = 0;
553         for (i=0; i < NUM_ACCEPT_QUEUES; i++) {
554                 tp->acceptq[i].aq_tail = NULL;
555                 tp->acceptq[i].aq_head = NULL;
556                 tp->acceptq[i].aq_wait_time = 0; 
557                 tp->acceptq[i].aq_qcount = 0; 
558                 tp->acceptq[i].aq_count = 0; 
559                 if (i == 0) {
560                         tp->acceptq[i].aq_ratio = 1;
561                 } else {
562                         tp->acceptq[i].aq_ratio = 0;
563                 }
565         }
566 #endif
567
568         write_lock_bh(&tp->syn_wait_lock);
569         tp->listen_opt = lopt;
570         write_unlock_bh(&tp->syn_wait_lock);
571
572         /* There is a race window here: we announce ourselves as listening,
573          * but this transition is still not validated by get_port().
574          * It is OK, because this socket enters the hash table only
575          * after validation is complete.
576          */
577         sk->sk_state = TCP_LISTEN;
578         if (!sk->sk_prot->get_port(sk, inet->num)) {
579                 inet->sport = htons(inet->num);
580
581                 sk_dst_reset(sk);
582                 sk->sk_prot->hash(sk);
583
584 #ifdef CONFIG_CKRM
585                 ckrm_cb_listen_start(sk);
586 #endif
587
588                 return 0;
589         }
590
591         sk->sk_state = TCP_CLOSE;
592         write_lock_bh(&tp->syn_wait_lock);
593         tp->listen_opt = NULL;
594         write_unlock_bh(&tp->syn_wait_lock);
595         kfree(lopt);
596         return -EADDRINUSE;
597 }
598
599 /*
600  *      This routine closes sockets which have been at least partially
601  *      opened, but not yet accepted.
602  */
603
604 static void tcp_listen_stop (struct sock *sk)
605 {
606         struct tcp_opt *tp = tcp_sk(sk);
607         struct tcp_listen_opt *lopt = tp->listen_opt;
608         struct open_request *acc_req = tp->accept_queue;
609         struct open_request *req;
610         int i;
611
612         tcp_delete_keepalive_timer(sk);
613
614         /* make all the listen_opt local to us */
615         write_lock_bh(&tp->syn_wait_lock);
616         tp->listen_opt = NULL;
617         write_unlock_bh(&tp->syn_wait_lock);
618
619 #ifdef CONFIG_CKRM
620         ckrm_cb_listen_stop(sk);
621 #endif
622
623 #ifdef CONFIG_ACCEPT_QUEUES
624         for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
625                 tp->acceptq[i].aq_head = tp->acceptq[i].aq_tail = NULL;
626 #else
627         tp->accept_queue_tail = NULL;
628 #endif
629         tp->accept_queue = NULL;
630
631         if (lopt->qlen) {
632                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
633                         while ((req = lopt->syn_table[i]) != NULL) {
634                                 lopt->syn_table[i] = req->dl_next;
635                                 lopt->qlen--;
636                                 tcp_openreq_free(req);
637
638                 /* Following the specs, it would be better either to send a FIN
639                  * (and enter FIN-WAIT-1; it is a normal close)
640                  * or to send an active reset (abort).
641                  * Certainly, this is pretty dangerous during a synflood, but that
642                  * is a bad justification for our negligence 8)
643                  * To be honest, we are not able to implement either
644                  * of the variants now.                 --ANK
645                  */
646                         }
647                 }
648         }
649         BUG_TRAP(!lopt->qlen);
650
651         kfree(lopt);
652
653         while ((req = acc_req) != NULL) {
654                 struct sock *child = req->sk;
655
656                 acc_req = req->dl_next;
657
658                 local_bh_disable();
659                 bh_lock_sock(child);
660                 BUG_TRAP(!sock_owned_by_user(child));
661                 sock_hold(child);
662
663                 tcp_disconnect(child, O_NONBLOCK);
664
665                 sock_orphan(child);
666
667                 atomic_inc(&tcp_orphan_count);
668
669                 tcp_destroy_sock(child);
670
671                 bh_unlock_sock(child);
672                 local_bh_enable();
673                 sock_put(child);
674
675 #ifdef CONFIG_ACCEPT_QUEUES
676                 sk_acceptq_removed(sk, req->acceptq_class);
677 #else
678                 sk_acceptq_removed(sk);
679 #endif
680                 tcp_openreq_fastfree(req);
681         }
682         BUG_TRAP(!sk->sk_ack_backlog);
683 }
684
685 /*
686  *      Wait for a socket to get into the connected state
687  *
688  *      Note: Must be called with the socket locked.
689  */
690 static int wait_for_tcp_connect(struct sock *sk, int flags, long *timeo_p)
691 {
692         struct tcp_opt *tp = tcp_sk(sk);
693         struct task_struct *tsk = current;
694         DEFINE_WAIT(wait);
695
696         while ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
697                 if (sk->sk_err)
698                         return sock_error(sk);
699                 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
700                         return -EPIPE;
701                 if (!*timeo_p)
702                         return -EAGAIN;
703                 if (signal_pending(tsk))
704                         return sock_intr_errno(*timeo_p);
705
706                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
707                 tp->write_pending++;
708
709                 release_sock(sk);
710                 *timeo_p = schedule_timeout(*timeo_p);
711                 lock_sock(sk);
712
713                 finish_wait(sk->sk_sleep, &wait);
714                 tp->write_pending--;
715         }
716         return 0;
717 }
718
719 static inline int tcp_memory_free(struct sock *sk)
720 {
721         return sk->sk_wmem_queued < sk->sk_sndbuf;
722 }
723
724 /*
725  *      Wait for more memory for a socket
726  */
727 static int wait_for_tcp_memory(struct sock *sk, long *timeo)
728 {
729         struct tcp_opt *tp = tcp_sk(sk);
730         int err = 0;
731         long vm_wait = 0;
732         long current_timeo = *timeo;
733         DEFINE_WAIT(wait);
734
735         if (tcp_memory_free(sk))
736                 current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
737
738         for (;;) {
739                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
740
741                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
742
743                 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
744                         goto do_error;
745                 if (!*timeo)
746                         goto do_nonblock;
747                 if (signal_pending(current))
748                         goto do_interrupted;
749                 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
750                 if (tcp_memory_free(sk) && !vm_wait)
751                         break;
752
753                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
754                 tp->write_pending++;
755                 release_sock(sk);
756                 if (!tcp_memory_free(sk) || vm_wait)
757                         current_timeo = schedule_timeout(current_timeo);
758                 lock_sock(sk);
759                 tp->write_pending--;
760
761                 if (vm_wait) {
762                         vm_wait -= current_timeo;
763                         current_timeo = *timeo;
764                         if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
765                             (current_timeo -= vm_wait) < 0)
766                                 current_timeo = 0;
767                         vm_wait = 0;
768                 }
769                 *timeo = current_timeo;
770         }
771 out:
772         finish_wait(sk->sk_sleep, &wait);
773         return err;
774
775 do_error:
776         err = -EPIPE;
777         goto out;
778 do_nonblock:
779         err = -EAGAIN;
780         goto out;
781 do_interrupted:
782         err = sock_intr_errno(*timeo);
783         goto out;
784 }
785
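/* True if the new data at (page, off) directly follows the last fragment
 * already attached to the skb, so it can be merged into that fragment
 * instead of consuming a new frag slot.
 */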
786 static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
787                                int off)
788 {
789         if (i) {
790                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
791                 return page == frag->page &&
792                        off == frag->page_offset + frag->size;
793         }
794         return 0;
795 }
796
797 static inline void fill_page_desc(struct sk_buff *skb, int i,
798                                   struct page *page, int off, int size)
799 {
800         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
801         frag->page = page;
802         frag->page_offset = off;
803         frag->size = size;
804         skb_shinfo(skb)->nr_frags = i + 1;
805 }
806
807 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
808 {
809         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
810         tp->pushed_seq = tp->write_seq;
811 }
812
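/* Force a push once more than half of the largest window the peer has ever
 * advertised has been written since the last pushed byte.
 */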
813 static inline int forced_push(struct tcp_opt *tp)
814 {
815         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
816 }
817
818 static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
819                               struct sk_buff *skb)
820 {
821         skb->csum = 0;
822         TCP_SKB_CB(skb)->seq = tp->write_seq;
823         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
824         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
825         TCP_SKB_CB(skb)->sacked = 0;
826         __skb_queue_tail(&sk->sk_write_queue, skb);
827         sk_charge_skb(sk, skb);
828         if (!tp->send_head)
829                 tp->send_head = skb;
830         else if (tp->nonagle&TCP_NAGLE_PUSH)
831                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
832 }
833
834 static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
835                                 struct sk_buff *skb)
836 {
837         if (flags & MSG_OOB) {
838                 tp->urg_mode = 1;
839                 tp->snd_up = tp->write_seq;
840                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
841         }
842 }
843
844 static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
845                             int mss_now, int nonagle)
846 {
847         if (tp->send_head) {
848                 struct sk_buff *skb = sk->sk_write_queue.prev;
849                 if (!(flags & MSG_MORE) || forced_push(tp))
850                         tcp_mark_push(tp, skb);
851                 tcp_mark_urg(tp, flags, skb);
852                 __tcp_push_pending_frames(sk, tp, mss_now,
853                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
854         }
855 }
856
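/* Convert an internal error into what the caller of sendmsg()/sendpage()
 * should see: for -EPIPE, prefer a pending socket error and raise SIGPIPE
 * unless MSG_NOSIGNAL was given.
 */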
857 static int tcp_error(struct sock *sk, int flags, int err)
858 {
859         if (err == -EPIPE)
860                 err = sock_error(sk) ? : -EPIPE;
861         if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
862                 send_sig(SIGPIPE, current, 0);
863         return err;
864 }
865
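/* Worker for tcp_sendpage(): attach the caller's pages to skbs on the write
 * queue as page fragments (no data copy), coalescing with the last fragment
 * where possible, and push segments out as they fill up to mss_now.
 */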
866 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
867                          size_t psize, int flags)
868 {
869         struct tcp_opt *tp = tcp_sk(sk);
870         int mss_now;
871         int err;
872         ssize_t copied;
873         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
874
875         /* Wait for a connection to finish. */
876         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
877                 if ((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
878                         goto out_err;
879
880         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
881
882         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
883         copied = 0;
884
885         err = -EPIPE;
886         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
887                 goto do_error;
888
889         while (psize > 0) {
890                 struct sk_buff *skb = sk->sk_write_queue.prev;
891                 struct page *page = pages[poffset / PAGE_SIZE];
892                 int copy, i;
893                 int offset = poffset % PAGE_SIZE;
894                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
895
896                 if (!tp->send_head || (copy = mss_now - skb->len) <= 0) {
897 new_segment:
898                         if (!tcp_memory_free(sk))
899                                 goto wait_for_sndbuf;
900
901                         skb = tcp_alloc_pskb(sk, 0, tp->mss_cache,
902                                              sk->sk_allocation);
903                         if (!skb)
904                                 goto wait_for_memory;
905
906                         skb_entail(sk, tp, skb);
907                         copy = mss_now;
908                 }
909
910                 if (copy > size)
911                         copy = size;
912
913                 i = skb_shinfo(skb)->nr_frags;
914                 if (can_coalesce(skb, i, page, offset)) {
915                         skb_shinfo(skb)->frags[i - 1].size += copy;
916                 } else if (i < MAX_SKB_FRAGS) {
917                         get_page(page);
918                         fill_page_desc(skb, i, page, offset, copy);
919                 } else {
920                         tcp_mark_push(tp, skb);
921                         goto new_segment;
922                 }
923
924                 skb->len += copy;
925                 skb->data_len += copy;
926                 skb->ip_summed = CHECKSUM_HW;
927                 tp->write_seq += copy;
928                 TCP_SKB_CB(skb)->end_seq += copy;
929
930                 if (!copied)
931                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
932
933                 copied += copy;
934                 poffset += copy;
935                 if (!(psize -= copy))
936                         goto out;
937
938                 if (skb->len != mss_now || (flags & MSG_OOB))
939                         continue;
940
941                 if (forced_push(tp)) {
942                         tcp_mark_push(tp, skb);
943                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
944                 } else if (skb == tp->send_head)
945                         tcp_push_one(sk, mss_now);
946                 continue;
947
948 wait_for_sndbuf:
949                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
950 wait_for_memory:
951                 if (copied)
952                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
953
954                 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
955                         goto do_error;
956
957                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
958         }
959
960 out:
961         if (copied)
962                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
963         return copied;
964
965 do_error:
966         if (copied)
967                 goto out;
968 out_err:
969         return tcp_error(sk, flags, err);
970 }
971
972 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
973                      size_t size, int flags)
974 {
975         ssize_t res;
976         struct sock *sk = sock->sk;
977
978 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
979
980         if (!(sk->sk_route_caps & NETIF_F_SG) ||
981             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
982                 return sock_no_sendpage(sock, page, offset, size, flags);
983
984 #undef TCP_ZC_CSUM_FLAGS
985
986         lock_sock(sk);
987         TCP_CHECK_TIMER(sk);
988         res = do_tcp_sendpages(sk, &page, offset, size, flags);
989         TCP_CHECK_TIMER(sk);
990         release_sock(sk);
991         return res;
992 }
993
994 #define TCP_PAGE(sk)    (inet_sk(sk)->sndmsg_page)
995 #define TCP_OFF(sk)     (inet_sk(sk)->sndmsg_off)
996
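/* TCP_PAGE/TCP_OFF cache a partially filled page per socket so that small
 * sendmsg() writes can keep appending to the same page.  tcp_copy_to_page()
 * copies "copy" bytes of user data into that page at "off", checksumming on
 * the fly when the skb is not using hardware checksums (CHECKSUM_NONE), and
 * charges the socket's write allowance.
 */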
997 static inline int tcp_copy_to_page(struct sock *sk, char __user *from,
998                                    struct sk_buff *skb, struct page *page,
999                                    int off, int copy)
1000 {
1001         int err = 0;
1002         unsigned int csum;
1003
1004         if (skb->ip_summed == CHECKSUM_NONE) {
1005                 csum = csum_and_copy_from_user(from, page_address(page) + off,
1006                                        copy, 0, &err);
1007                 if (err) return err;
1008                 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1009         } else {
1010                 if (copy_from_user(page_address(page) + off, from, copy))
1011                         return -EFAULT;
1012         }
1013
1014         skb->len += copy;
1015         skb->data_len += copy;
1016         skb->truesize += copy;
1017         sk->sk_wmem_queued += copy;
1018         sk->sk_forward_alloc -= copy;
1019         return 0;
1020 }
1021
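/* Append "copy" bytes of user data to the linear part of the skb, folding
 * them into skb->csum when software checksumming is in use; on a fault the
 * skb is trimmed back to its previous length.
 */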
1022 static inline int skb_add_data(struct sk_buff *skb, char __user *from, int copy)
1023 {
1024         int err = 0;
1025         unsigned int csum;
1026         int off = skb->len;
1027
1028         if (skb->ip_summed == CHECKSUM_NONE) {
1029                 csum = csum_and_copy_from_user(from, skb_put(skb, copy),
1030                                        copy, 0, &err);
1031                 if (!err) {
1032                         skb->csum = csum_block_add(skb->csum, csum, off);
1033                         return 0;
1034                 }
1035         } else {
1036                 if (!copy_from_user(skb_put(skb, copy), from, copy))
1037                         return 0;
1038         }
1039
1040         __skb_trim(skb, off);
1041         return -EFAULT;
1042 }
1043
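/* Linear (non-paged) allocation size for a new segment: normally the cached
 * MSS, but on scatter-gather devices an MSS that would only just overflow
 * the skb head is clamped to SKB_MAX_HEAD(MAX_TCP_HEADER) so that the
 * overflow goes into page fragments instead.
 */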
1044 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1045 {
1046         int tmp = tp->mss_cache_std;
1047
1048         if (sk->sk_route_caps & NETIF_F_SG) {
1049                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1050
1051                 if (tmp >= pgbreak &&
1052                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1053                         tmp = pgbreak;
1054         }
1055         return tmp;
1056 }
1057
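/* Copy data from the user's iovec into skbs on the write queue, filling the
 * skb's linear area first and then the per-socket cached page, and push
 * segments as they reach mss_now.  Returns the number of bytes queued or an
 * error.
 */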
1058 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1059                 size_t size)
1060 {
1061         struct iovec *iov;
1062         struct tcp_opt *tp = tcp_sk(sk);
1063         struct sk_buff *skb;
1064         int iovlen, flags;
1065         int mss_now;
1066         int err, copied;
1067         long timeo;
1068
1069         lock_sock(sk);
1070         TCP_CHECK_TIMER(sk);
1071
1072         flags = msg->msg_flags;
1073         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1074
1075         /* Wait for a connection to finish. */
1076         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1077                 if ((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1078                         goto out_err;
1079
1080         /* This should be in poll */
1081         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1082
1083         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1084
1085         /* Ok commence sending. */
1086         iovlen = msg->msg_iovlen;
1087         iov = msg->msg_iov;
1088         copied = 0;
1089
1090         err = -EPIPE;
1091         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1092                 goto do_error;
1093
1094         while (--iovlen >= 0) {
1095                 int seglen = iov->iov_len;
1096                 unsigned char __user *from = iov->iov_base;
1097
1098                 iov++;
1099
1100                 while (seglen > 0) {
1101                         int copy;
1102
1103                         skb = sk->sk_write_queue.prev;
1104
1105                         if (!tp->send_head ||
1106                             (copy = mss_now - skb->len) <= 0) {
1107
1108 new_segment:
1109                                 /* Allocate new segment. If the interface is SG,
1110                                  * allocate skb fitting to single page.
1111                                  */
1112                                 if (!tcp_memory_free(sk))
1113                                         goto wait_for_sndbuf;
1114
1115                                 skb = tcp_alloc_pskb(sk, select_size(sk, tp),
1116                                                      0, sk->sk_allocation);
1117                                 if (!skb)
1118                                         goto wait_for_memory;
1119
1120                                 /*
1121                                  * Check whether we can use HW checksum.
1122                                  */
1123                                 if (sk->sk_route_caps &
1124                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
1125                                      NETIF_F_HW_CSUM))
1126                                         skb->ip_summed = CHECKSUM_HW;
1127
1128                                 skb_entail(sk, tp, skb);
1129                                 copy = mss_now;
1130                         }
1131
1132                         /* Try to append data to the end of skb. */
1133                         if (copy > seglen)
1134                                 copy = seglen;
1135
1136                         /* Where to copy to? */
1137                         if (skb_tailroom(skb) > 0) {
1138                                 /* We have some space in skb head. Superb! */
1139                                 if (copy > skb_tailroom(skb))
1140                                         copy = skb_tailroom(skb);
1141                                 if ((err = skb_add_data(skb, from, copy)) != 0)
1142                                         goto do_fault;
1143                         } else {
1144                                 int merge = 0;
1145                                 int i = skb_shinfo(skb)->nr_frags;
1146                                 struct page *page = TCP_PAGE(sk);
1147                                 int off = TCP_OFF(sk);
1148
1149                                 if (can_coalesce(skb, i, page, off) &&
1150                                     off != PAGE_SIZE) {
1151                                         /* We can extend the last page
1152                                          * fragment. */
1153                                         merge = 1;
1154                                 } else if (i == MAX_SKB_FRAGS ||
1155                                            (!i &&
1156                                            !(sk->sk_route_caps & NETIF_F_SG))) {
1157                                         /* Need to add new fragment and cannot
1158                                          * do this because interface is non-SG,
1159                                          * or because all the page slots are
1160                                          * busy. */
1161                                         tcp_mark_push(tp, skb);
1162                                         goto new_segment;
1163                                 } else if (page) {
1164                                         /* If page is cached, align
1165                                          * offset to L1 cache boundary
1166                                          */
1167                                         off = (off + L1_CACHE_BYTES - 1) &
1168                                               ~(L1_CACHE_BYTES - 1);
1169                                         if (off == PAGE_SIZE) {
1170                                                 put_page(page);
1171                                                 TCP_PAGE(sk) = page = NULL;
1172                                         }
1173                                 }
1174
1175                                 if (!page) {
1176                                         /* Allocate new cache page. */
1177                                         if (!(page = tcp_alloc_page(sk)))
1178                                                 goto wait_for_memory;
1179                                         off = 0;
1180                                 }
1181
1182                                 if (copy > PAGE_SIZE - off)
1183                                         copy = PAGE_SIZE - off;
1184
1185                                 /* Time to copy data. We are close to
1186                                  * the end! */
1187                                 err = tcp_copy_to_page(sk, from, skb, page,
1188                                                        off, copy);
1189                                 if (err) {
1190                                         /* If this page was new, give it to the
1191                                          * socket so it does not get leaked.
1192                                          */
1193                                         if (!TCP_PAGE(sk)) {
1194                                                 TCP_PAGE(sk) = page;
1195                                                 TCP_OFF(sk) = 0;
1196                                         }
1197                                         goto do_error;
1198                                 }
1199
1200                                 /* Update the skb. */
1201                                 if (merge) {
1202                                         skb_shinfo(skb)->frags[i - 1].size +=
1203                                                                         copy;
1204                                 } else {
1205                                         fill_page_desc(skb, i, page, off, copy);
1206                                         if (TCP_PAGE(sk)) {
1207                                                 get_page(page);
1208                                         } else if (off + copy < PAGE_SIZE) {
1209                                                 get_page(page);
1210                                                 TCP_PAGE(sk) = page;
1211                                         }
1212                                 }
1213
1214                                 TCP_OFF(sk) = off + copy;
1215                         }
1216
1217                         if (!copied)
1218                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1219
1220                         tp->write_seq += copy;
1221                         TCP_SKB_CB(skb)->end_seq += copy;
1222
1223                         from += copy;
1224                         copied += copy;
1225                         if ((seglen -= copy) == 0 && iovlen == 0)
1226                                 goto out;
1227
1228                         if (skb->len != mss_now || (flags & MSG_OOB))
1229                                 continue;
1230
1231                         if (forced_push(tp)) {
1232                                 tcp_mark_push(tp, skb);
1233                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1234                         } else if (skb == tp->send_head)
1235                                 tcp_push_one(sk, mss_now);
1236                         continue;
1237
1238 wait_for_sndbuf:
1239                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1240 wait_for_memory:
1241                         if (copied)
1242                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1243
1244                         if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1245                                 goto do_error;
1246
1247                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1248                 }
1249         }
1250
1251 out:
1252         if (copied)
1253                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1254         TCP_CHECK_TIMER(sk);
1255         release_sock(sk);
1256         return copied;
1257
1258 do_fault:
1259         if (!skb->len) {
1260                 if (tp->send_head == skb)
1261                         tp->send_head = NULL;
1262                 __skb_unlink(skb, skb->list);
1263                 tcp_free_skb(sk, skb);
1264         }
1265
1266 do_error:
1267         if (copied)
1268                 goto out;
1269 out_err:
1270         err = tcp_error(sk, flags, err);
1271         TCP_CHECK_TIMER(sk);
1272         release_sock(sk);
1273         return err;
1274 }
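
/*
 * Example (illustrative only, not part of this file): the send path above
 * honours MSG_MORE by holding back a partial frame until more data arrives
 * or the flag is dropped.  A minimal userspace sketch of that usage,
 * assuming "fd" is an already-connected TCP socket:
 *
 *	#include <sys/socket.h>
 *	#include <string.h>
 *
 *	static int send_two_parts(int fd)
 *	{
 *		const char hdr[] = "HEADER";
 *		const char body[] = "BODY";
 *
 *		// Hint that more data follows, so the stack may delay the push.
 *		if (send(fd, hdr, strlen(hdr), MSG_MORE) < 0)
 *			return -1;
 *		// The final part without MSG_MORE lets the pending frame go out.
 *		if (send(fd, body, strlen(body), 0) < 0)
 *			return -1;
 *		return 0;
 *	}
 */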
1275
1276 /*
1277  *      Handle reading urgent data. BSD has very simple semantics for
1278  *      this, no blocking and very strange errors 8)
1279  */
1280
1281 static int tcp_recv_urg(struct sock *sk, long timeo,
1282                         struct msghdr *msg, int len, int flags,
1283                         int *addr_len)
1284 {
1285         struct tcp_opt *tp = tcp_sk(sk);
1286
1287         /* No URG data to read. */
1288         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1289             tp->urg_data == TCP_URG_READ)
1290                 return -EINVAL; /* Yes this is right ! */
1291
1292         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1293                 return -ENOTCONN;
1294
1295         if (tp->urg_data & TCP_URG_VALID) {
1296                 int err = 0;
1297                 char c = tp->urg_data;
1298
1299                 if (!(flags & MSG_PEEK))
1300                         tp->urg_data = TCP_URG_READ;
1301
1302                 /* Read urgent data. */
1303                 msg->msg_flags |= MSG_OOB;
1304
1305                 if (len > 0) {
1306                         if (!(flags & MSG_TRUNC))
1307                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1308                         len = 1;
1309                 } else
1310                         msg->msg_flags |= MSG_TRUNC;
1311
1312                 return err ? -EFAULT : len;
1313         }
1314
1315         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1316                 return 0;
1317
1318         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1319          * the available implementations agree in this case:
1320          * this call should never block, independent of the
1321          * blocking state of the socket.
1322          * Mike <pall@rz.uni-karlsruhe.de>
1323          */
1324         return -EAGAIN;
1325 }
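
/*
 * Example (illustrative only): from userspace the semantics implemented by
 * tcp_recv_urg() look roughly like the sketch below -- recv() with MSG_OOB
 * never blocks, returns the single urgent byte when one is pending, and
 * otherwise fails with EINVAL or EAGAIN as described above.  "fd" is
 * assumed to be a connected TCP socket without SO_OOBINLINE set:
 *
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *	#include <stdio.h>
 *
 *	static void read_oob_byte(int fd)
 *	{
 *		char c;
 *		ssize_t n = recv(fd, &c, 1, MSG_OOB);
 *
 *		if (n == 1)
 *			printf("urgent byte: 0x%02x\n", (unsigned char)c);
 *		else if (n < 0 && (errno == EAGAIN || errno == EINVAL))
 *			printf("no urgent data available\n");	// never blocks
 *		else if (n == 0)
 *			printf("connection closed\n");
 *		else
 *			perror("recv(MSG_OOB)");
 *	}
 */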
1326
1327 /* Clean up the receive buffer for full frames taken by the user,
1328  * then send an ACK if necessary.  COPIED is the number of bytes
1329  * tcp_recvmsg has given to the user so far, it speeds up the
1330  * calculation of whether or not we must ACK for the sake of
1331  * a window update.
1332  */
1333 void cleanup_rbuf(struct sock *sk, int copied)
1334 {
1335         struct tcp_opt *tp = tcp_sk(sk);
1336         int time_to_ack = 0;
1337
1338 #if TCP_DEBUG
1339         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1340
1341         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1342 #endif
1343
1344         if (tcp_ack_scheduled(tp)) {
1345                 /* Delayed ACKs frequently hit locked sockets during bulk
1346                  * receive. */
1347                 if (tp->ack.blocked ||
1348                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1349                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1350                     /*
1351                      * If this read emptied the receive buffer, we send an
1352                      * ACK when the connection is not bidirectional, the
1353                      * user drained the receive buffer and there was a
1354                      * small segment in the queue.
1355                      */
1356                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1357                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1358                         time_to_ack = 1;
1359         }
1360
1361         /* We send an ACK if we can now advertise a non-zero window
1362          * which has been raised "significantly".
1363          *
1364          * Even if the window was raised to infinity, do not send a window-
1365          * opening ACK in states where we will not receive more. It is useless.
1366          */
1367         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1368                 __u32 rcv_window_now = tcp_receive_window(tp);
1369
1370                 /* Optimize, __tcp_select_window() is not cheap. */
1371                 if (2*rcv_window_now <= tp->window_clamp) {
1372                         __u32 new_window = __tcp_select_window(sk);
1373
1374                         /* Send an ACK now if this read freed lots of space
1375                          * in our buffer. new_window is the window we could
1376                          * advertise now; do so only if it is not less than
1377                          * the current one. "Lots" means "at least twice" here.
1378                          */
1379                         if (new_window && new_window >= 2 * rcv_window_now)
1380                                 time_to_ack = 1;
1381                 }
1382         }
1383         if (time_to_ack)
1384                 tcp_send_ack(sk);
1385 }
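
/*
 * A condensed, standalone restatement of the window-update test above
 * (purely illustrative; the real decision also depends on the delayed-ACK
 * state checked earlier in cleanup_rbuf()):
 *
 *	#include <stdint.h>
 *
 *	// Returns nonzero when a window-update ACK is worth sending: only if
 *	// the window we could advertise now is at least twice what the peer
 *	// currently sees, and __tcp_select_window() is only consulted once
 *	// the current window has shrunk to half the clamp or less.
 *	static int worth_window_update_ack(uint32_t rcv_window_now,
 *					   uint32_t window_clamp,
 *					   uint32_t new_window)
 *	{
 *		if (2 * rcv_window_now > window_clamp)
 *			return 0;
 *		return new_window && new_window >= 2 * rcv_window_now;
 *	}
 */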
1386
1387 static void tcp_prequeue_process(struct sock *sk)
1388 {
1389         struct sk_buff *skb;
1390         struct tcp_opt *tp = tcp_sk(sk);
1391
1392         NET_ADD_STATS_USER(TCPPrequeued, skb_queue_len(&tp->ucopy.prequeue));
1393
1394         /* RX process wants to run with disabled BHs, though it is not
1395          * necessary */
1396         local_bh_disable();
1397         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1398                 sk->sk_backlog_rcv(sk, skb);
1399         local_bh_enable();
1400
1401         /* Clear memory counter. */
1402         tp->ucopy.memory = 0;
1403 }
1404
1405 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1406 {
1407         struct sk_buff *skb;
1408         u32 offset;
1409
1410         skb_queue_walk(&sk->sk_receive_queue, skb) {
1411                 offset = seq - TCP_SKB_CB(skb)->seq;
1412                 if (skb->h.th->syn)
1413                         offset--;
1414                 if (offset < skb->len || skb->h.th->fin) {
1415                         *off = offset;
1416                         return skb;
1417                 }
1418         }
1419         return NULL;
1420 }
1421
1422 /*
1423  * This routine provides an alternative to tcp_recvmsg() for routines
1424  * that would like to handle copying from skbuffs directly in 'sendfile'
1425  * fashion.
1426  * Note:
1427  *      - It is assumed that the socket was locked by the caller.
1428  *      - The routine does not block.
1429  *      - At present, there is no support for reading OOB data
1430  *        or for 'peeking' the socket using this routine
1431  *        (although both would be easy to implement).
1432  */
1433 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1434                   sk_read_actor_t recv_actor)
1435 {
1436         struct sk_buff *skb;
1437         struct tcp_opt *tp = tcp_sk(sk);
1438         u32 seq = tp->copied_seq;
1439         u32 offset;
1440         int copied = 0;
1441
1442         if (sk->sk_state == TCP_LISTEN)
1443                 return -ENOTCONN;
1444         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1445                 if (offset < skb->len) {
1446                         size_t used, len;
1447
1448                         len = skb->len - offset;
1449                         /* Stop reading if we hit a patch of urgent data */
1450                         if (tp->urg_data) {
1451                                 u32 urg_offset = tp->urg_seq - seq;
1452                                 if (urg_offset < len)
1453                                         len = urg_offset;
1454                                 if (!len)
1455                                         break;
1456                         }
1457                         used = recv_actor(desc, skb, offset, len);
1458                         if (used <= len) {
1459                                 seq += used;
1460                                 copied += used;
1461                                 offset += used;
1462                         }
1463                         if (offset != skb->len)
1464                                 break;
1465                 }
1466                 if (skb->h.th->fin) {
1467                         sk_eat_skb(sk, skb);
1468                         ++seq;
1469                         break;
1470                 }
1471                 sk_eat_skb(sk, skb);
1472                 if (!desc->count)
1473                         break;
1474         }
1475         tp->copied_seq = seq;
1476
1477         tcp_rcv_space_adjust(sk);
1478
1479         /* Clean up data we have read: This will do ACK frames. */
1480         if (copied)
1481                 cleanup_rbuf(sk, copied);
1482         return copied;
1483 }
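
/*
 * Example (illustrative sketch only, not used anywhere in this file): a
 * trivial recv_actor for tcp_read_sock() that merely counts the bytes
 * offered to it.  It assumes the usual sk_read_actor_t signature and only
 * touches desc->count, which the loop above uses as its stop condition:
 *
 *	static int count_bytes_actor(read_descriptor_t *desc,
 *				     struct sk_buff *skb,
 *				     unsigned int offset, size_t len)
 *	{
 *		size_t want = min(len, desc->count);
 *
 *		// "Consume" the data without copying it anywhere.
 *		desc->count -= want;
 *		return want;	// tells tcp_read_sock() how much we used
 *	}
 *
 * With desc->count preset to the number of bytes the caller is willing to
 * accept, tcp_read_sock(sk, &desc, count_bytes_actor) walks the receive
 * queue, advances copied_seq and ACKs the freed space via cleanup_rbuf().
 */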
1484
1485 /*
1486  *      This routine copies from a sock struct into the user buffer.
1487  *
1488  *      Technical note: in 2.3 we work on _locked_ socket, so that
1489  *      tricks with *seq access order and skb->users are not required.
1490  *      The code can probably be improved even further.
1491  */
1492
1493 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1494                 size_t len, int nonblock, int flags, int *addr_len)
1495 {
1496         struct tcp_opt *tp = tcp_sk(sk);
1497         int copied = 0;
1498         u32 peek_seq;
1499         u32 *seq;
1500         unsigned long used;
1501         int err;
1502         int target;             /* Read at least this many bytes */
1503         long timeo;
1504         struct task_struct *user_recv = NULL;
1505
1506         lock_sock(sk);
1507
1508         TCP_CHECK_TIMER(sk);
1509
1510         err = -ENOTCONN;
1511         if (sk->sk_state == TCP_LISTEN)
1512                 goto out;
1513
1514         timeo = sock_rcvtimeo(sk, nonblock);
1515
1516         /* Urgent data needs to be handled specially. */
1517         if (flags & MSG_OOB)
1518                 goto recv_urg;
1519
1520         seq = &tp->copied_seq;
1521         if (flags & MSG_PEEK) {
1522                 peek_seq = tp->copied_seq;
1523                 seq = &peek_seq;
1524         }
1525
1526         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1527
1528         do {
1529                 struct sk_buff *skb;
1530                 u32 offset;
1531
1532                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1533                 if (tp->urg_data && tp->urg_seq == *seq) {
1534                         if (copied)
1535                                 break;
1536                         if (signal_pending(current)) {
1537                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1538                                 break;
1539                         }
1540                 }
1541
1542                 /* Next get a buffer. */
1543
1544                 skb = skb_peek(&sk->sk_receive_queue);
1545                 do {
1546                         if (!skb)
1547                                 break;
1548
1549                         /* Now that we have two receive queues this
1550                          * shouldn't happen.
1551                          */
1552                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1553                                 printk(KERN_INFO "recvmsg bug: copied %X "
1554                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1555                                 break;
1556                         }
1557                         offset = *seq - TCP_SKB_CB(skb)->seq;
1558                         if (skb->h.th->syn)
1559                                 offset--;
1560                         if (offset < skb->len)
1561                                 goto found_ok_skb;
1562                         if (skb->h.th->fin)
1563                                 goto found_fin_ok;
1564                         BUG_TRAP(flags & MSG_PEEK);
1565                         skb = skb->next;
1566                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1567
1568                 /* Well, if we have backlog, try to process it now. */
1569
1570                 if (copied >= target && !sk->sk_backlog.tail)
1571                         break;
1572
1573                 if (copied) {
1574                         if (sk->sk_err ||
1575                             sk->sk_state == TCP_CLOSE ||
1576                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1577                             !timeo ||
1578                             signal_pending(current) ||
1579                             (flags & MSG_PEEK))
1580                                 break;
1581                 } else {
1582                         if (sock_flag(sk, SOCK_DONE))
1583                                 break;
1584
1585                         if (sk->sk_err) {
1586                                 copied = sock_error(sk);
1587                                 break;
1588                         }
1589
1590                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1591                                 break;
1592
1593                         if (sk->sk_state == TCP_CLOSE) {
1594                                 if (!sock_flag(sk, SOCK_DONE)) {
1595                                         /* This occurs when user tries to read
1596                                          * from a never-connected socket.
1597                                          */
1598                                         copied = -ENOTCONN;
1599                                         break;
1600                                 }
1601                                 break;
1602                         }
1603
1604                         if (!timeo) {
1605                                 copied = -EAGAIN;
1606                                 break;
1607                         }
1608
1609                         if (signal_pending(current)) {
1610                                 copied = sock_intr_errno(timeo);
1611                                 break;
1612                         }
1613                 }
1614
1615                 cleanup_rbuf(sk, copied);
1616
1617                 if (tp->ucopy.task == user_recv) {
1618                         /* Install new reader */
1619                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1620                                 user_recv = current;
1621                                 tp->ucopy.task = user_recv;
1622                                 tp->ucopy.iov = msg->msg_iov;
1623                         }
1624
1625                         tp->ucopy.len = len;
1626
1627                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1628                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1629
1630                          * Ugly... If the prequeue is not empty, we have to
1631                          * process it before releasing the socket, otherwise
1632                          * ordering will be broken on the second iteration.
1633                          * A more elegant solution is required!!!
1634                          *
1635                          * Look: we have the following (pseudo)queues:
1636                          *
1637                          * 1. packets in flight
1638                          * 2. backlog
1639                          * 3. prequeue
1640                          * 4. receive_queue
1641                          *
1642                          * Each queue can be processed only if the next ones
1643                          * are empty. At this point we have empty receive_queue.
1644                          * But prequeue _can_ be not empty after 2nd iteration,
1645                          * when we jumped to start of loop because backlog
1646                          * processing added something to receive_queue.
1647                          * We cannot release_sock(), because backlog contains
1648                          * packets arrived _after_ prequeued ones.
1649                          *
1650                          * In short, the algorithm is clear --- process all
1651                          * the queues in order. We could do it more directly,
1652                          * requeueing packets from the backlog to the prequeue
1653                          * if it is not empty. That is more elegant, but eats
1654                          * cycles, unfortunately.
1655                          */
1656                         if (skb_queue_len(&tp->ucopy.prequeue))
1657                                 goto do_prequeue;
1658
1659                         /* __ Set realtime policy in scheduler __ */
1660                 }
1661
1662                 if (copied >= target) {
1663                         /* Do not sleep, just process backlog. */
1664                         release_sock(sk);
1665                         lock_sock(sk);
1666                 } else
1667                         sk_wait_data(sk, &timeo);
1668
1669                 if (user_recv) {
1670                         int chunk;
1671
1672                         /* __ Restore normal policy in scheduler __ */
1673
1674                         if ((chunk = len - tp->ucopy.len) != 0) {
1675                                 NET_ADD_STATS_USER(TCPDirectCopyFromBacklog, chunk);
1676                                 len -= chunk;
1677                                 copied += chunk;
1678                         }
1679
1680                         if (tp->rcv_nxt == tp->copied_seq &&
1681                             skb_queue_len(&tp->ucopy.prequeue)) {
1682 do_prequeue:
1683                                 tcp_prequeue_process(sk);
1684
1685                                 if ((chunk = len - tp->ucopy.len) != 0) {
1686                                         NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1687                                         len -= chunk;
1688                                         copied += chunk;
1689                                 }
1690                         }
1691                 }
1692                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1693                         if (net_ratelimit())
1694                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1695                                        current->comm, current->pid);
1696                         peek_seq = tp->copied_seq;
1697                 }
1698                 continue;
1699
1700         found_ok_skb:
1701                 /* Ok so how much can we use? */
1702                 used = skb->len - offset;
1703                 if (len < used)
1704                         used = len;
1705
1706                 /* Do we have urgent data here? */
1707                 if (tp->urg_data) {
1708                         u32 urg_offset = tp->urg_seq - *seq;
1709                         if (urg_offset < used) {
1710                                 if (!urg_offset) {
1711                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1712                                                 ++*seq;
1713                                                 offset++;
1714                                                 used--;
1715                                                 if (!used)
1716                                                         goto skip_copy;
1717                                         }
1718                                 } else
1719                                         used = urg_offset;
1720                         }
1721                 }
1722
1723                 if (!(flags & MSG_TRUNC)) {
1724                         err = skb_copy_datagram_iovec(skb, offset,
1725                                                       msg->msg_iov, used);
1726                         if (err) {
1727                                 /* Exception. Bailout! */
1728                                 if (!copied)
1729                                         copied = -EFAULT;
1730                                 break;
1731                         }
1732                 }
1733
1734                 *seq += used;
1735                 copied += used;
1736                 len -= used;
1737
1738                 tcp_rcv_space_adjust(sk);
1739
1740 skip_copy:
1741                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1742                         tp->urg_data = 0;
1743                         tcp_fast_path_check(sk, tp);
1744                 }
1745                 if (used + offset < skb->len)
1746                         continue;
1747
1748                 if (skb->h.th->fin)
1749                         goto found_fin_ok;
1750                 if (!(flags & MSG_PEEK))
1751                         sk_eat_skb(sk, skb);
1752                 continue;
1753
1754         found_fin_ok:
1755                 /* Process the FIN. */
1756                 ++*seq;
1757                 if (!(flags & MSG_PEEK))
1758                         sk_eat_skb(sk, skb);
1759                 break;
1760         } while (len > 0);
1761
1762         if (user_recv) {
1763                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1764                         int chunk;
1765
1766                         tp->ucopy.len = copied > 0 ? len : 0;
1767
1768                         tcp_prequeue_process(sk);
1769
1770                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1771                                 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1772                                 len -= chunk;
1773                                 copied += chunk;
1774                         }
1775                 }
1776
1777                 tp->ucopy.task = NULL;
1778                 tp->ucopy.len = 0;
1779         }
1780
1781         /* According to UNIX98, msg_name/msg_namelen are ignored
1782          * on a connected socket. I was just happy when I found this 8) --ANK
1783          */
1784
1785         /* Clean up data we have read: This will do ACK frames. */
1786         cleanup_rbuf(sk, copied);
1787
1788         TCP_CHECK_TIMER(sk);
1789         release_sock(sk);
1790         return copied;
1791
1792 out:
1793         TCP_CHECK_TIMER(sk);
1794         release_sock(sk);
1795         return err;
1796
1797 recv_urg:
1798         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1799         goto out;
1800 }
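
/*
 * Example (illustrative only): the MSG_PEEK and MSG_WAITALL paths above map
 * onto the familiar userspace pattern below -- a peek leaves copied_seq
 * untouched so the same bytes are returned again by the following read,
 * while MSG_WAITALL raises the "target" so the call blocks until the whole
 * requested length is filled (or an error/EOF occurs).  "fd" is assumed to
 * be a connected, blocking TCP socket:
 *
 *	#include <sys/socket.h>
 *
 *	static ssize_t peek_then_read(int fd, char *buf, size_t len)
 *	{
 *		ssize_t peeked = recv(fd, buf, len, MSG_PEEK);
 *
 *		if (peeked <= 0)
 *			return peeked;
 *		// The data is still queued; this consumes it for real and
 *		// does not return until 'peeked' bytes have been copied.
 *		return recv(fd, buf, peeked, MSG_WAITALL);
 *	}
 */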
1801
1802 /*
1803  *      State processing on a close. This implements the state shift for
1804  *      sending our FIN frame. Note that we only send a FIN for some
1805  *      states. A shutdown() may have already sent the FIN, or we may be
1806  *      closed.
1807  */
1808
1809 static unsigned char new_state[16] = {
1810   /* current state:        new state:      action:      */
1811   /* (Invalid)          */ TCP_CLOSE,
1812   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1813   /* TCP_SYN_SENT       */ TCP_CLOSE,
1814   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1815   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1816   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1817   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1818   /* TCP_CLOSE          */ TCP_CLOSE,
1819   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1820   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1821   /* TCP_LISTEN         */ TCP_CLOSE,
1822   /* TCP_CLOSING        */ TCP_CLOSING,
1823 };
1824
1825 static int tcp_close_state(struct sock *sk)
1826 {
1827         int next = (int)new_state[sk->sk_state];
1828         int ns = next & TCP_STATE_MASK;
1829
1830         tcp_set_state(sk, ns);
1831
1832         return next & TCP_ACTION_FIN;
1833 }
1834
1835 /*
1836  *      Shutdown the sending side of a connection. Much like close except
1837  *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1838  */
1839
1840 void tcp_shutdown(struct sock *sk, int how)
1841 {
1842         /*      We need to grab some memory, and put together a FIN,
1843          *      and then put it into the queue to be sent.
1844          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1845          */
1846         if (!(how & SEND_SHUTDOWN))
1847                 return;
1848
1849         /* If we've already sent a FIN, or it's a closed state, skip this. */
1850         if ((1 << sk->sk_state) &
1851             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1852              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1853                 /* Clear out any half completed packets.  FIN if needed. */
1854                 if (tcp_close_state(sk))
1855                         tcp_send_fin(sk);
1856         }
1857 }
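
/*
 * Example (illustrative only): tcp_shutdown() is roughly what a userspace
 * shutdown(fd, SHUT_WR) boils down to -- it queues a FIN but leaves the
 * receive side open, which is the classic way to signal end-of-request
 * while still collecting the peer's reply:
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static ssize_t send_request_half_close(int fd, const char *req,
 *					       size_t len, char *reply,
 *					       size_t replylen)
 *	{
 *		if (send(fd, req, len, 0) < 0)
 *			return -1;
 *		if (shutdown(fd, SHUT_WR) < 0)	// we will send no more: FIN
 *			return -1;
 *		return read(fd, reply, replylen);  // but we can still read
 *	}
 */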
1858
1859
1860 /*
1861  *      Return 1 if we still have things to send in our buffers.
1862  */
1863
1864 static inline int closing(struct sock *sk)
1865 {
1866         return (1 << sk->sk_state) &
1867                (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
1868 }
1869
1870 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1871 {
1872         /* First the read buffer. */
1873         __skb_queue_purge(&sk->sk_receive_queue);
1874
1875         /* Next, the error queue. */
1876         __skb_queue_purge(&sk->sk_error_queue);
1877
1878         /* Next, the write queue. */
1879         BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
1880
1881         /* Account for returned memory. */
1882         tcp_mem_reclaim(sk);
1883
1884         BUG_TRAP(!sk->sk_wmem_queued);
1885         BUG_TRAP(!sk->sk_forward_alloc);
1886
1887         /* It is _impossible_ for the backlog to contain anything
1888          * when we get here.  All user references to this socket
1889          * have gone away; only the net layer can touch it.
1890          */
1891 }
1892
1893 /*
1894  * At this point, there should be no process reference to this
1895  * socket, and thus no user references at all.  Therefore we
1896  * can assume the socket waitqueue is inactive and nobody will
1897  * try to jump onto it.
1898  */
1899 void tcp_destroy_sock(struct sock *sk)
1900 {
1901         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1902         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1903
1904         /* It cannot be in hash table! */
1905         BUG_TRAP(sk_unhashed(sk));
1906
1907         /* If inet_sk(sk)->num is non-zero, it must be bound. */
1908         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1909
1910 #ifdef TCP_DEBUG
1911         if (sk->sk_zapped) {
1912                 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1913                 sock_hold(sk);
1914         }
1915         sk->sk_zapped = 1;
1916 #endif
1917
1918         sk->sk_prot->destroy(sk);
1919
1920         tcp_kill_sk_queues(sk);
1921
1922         xfrm_sk_free_policy(sk);
1923
1924 #ifdef INET_REFCNT_DEBUG
1925         if (atomic_read(&sk->sk_refcnt) != 1) {
1926                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1927                        sk, atomic_read(&sk->sk_refcnt));
1928         }
1929 #endif
1930
1931         atomic_dec(&tcp_orphan_count);
1932         sock_put(sk);
1933 }
1934
1935 void tcp_close(struct sock *sk, long timeout)
1936 {
1937         struct sk_buff *skb;
1938         int data_was_unread = 0;
1939
1940         lock_sock(sk);
1941         sk->sk_shutdown = SHUTDOWN_MASK;
1942
1943         if (sk->sk_state == TCP_LISTEN) {
1944                 tcp_set_state(sk, TCP_CLOSE);
1945
1946                 /* Special case. */
1947                 tcp_listen_stop(sk);
1948
1949                 goto adjudge_to_death;
1950         }
1951
1952         /*  We need to flush the recv. buffs.  We do this only on the
1953          *  descriptor close, not protocol-sourced closes, because the
1954          *  reader process may not have drained the data yet!
1955          */
1956         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1957                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1958                           skb->h.th->fin;
1959                 data_was_unread += len;
1960                 __kfree_skb(skb);
1961         }
1962
1963         tcp_mem_reclaim(sk);
1964
1965         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1966          * 3.10, we send a RST here because data was lost.  To
1967          * witness the awful effects of the old behavior of always
1968          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1969          * a bulk GET in an FTP client, suspend the process, wait
1970          * for the client to advertise a zero window, then kill -9
1971          * the FTP client, wheee...  Note: timeout is always zero
1972          * in such a case.
1973          */
1974         if (data_was_unread) {
1975                 /* Unread data was tossed, zap the connection. */
1976                 NET_INC_STATS_USER(TCPAbortOnClose);
1977                 tcp_set_state(sk, TCP_CLOSE);
1978                 tcp_send_active_reset(sk, GFP_KERNEL);
1979         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1980                 /* Check zero linger _after_ checking for unread data. */
1981                 sk->sk_prot->disconnect(sk, 0);
1982                 NET_INC_STATS_USER(TCPAbortOnData);
1983         } else if (tcp_close_state(sk)) {
1984                 /* We FIN if the application ate all the data before
1985                  * zapping the connection.
1986                  */
1987
1988                 /* RED-PEN. Formally speaking, we have broken TCP state
1989                  * machine. State transitions:
1990                  *
1991                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1992                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1993                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1994                  *
1995                  * are legal only when FIN has been sent (i.e. in window),
1996                  * rather than queued out of window. Purists blame.
1997                  *
1998                  * F.e. "RFC state" is ESTABLISHED,
1999                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
2000                  *
2001                  * The visible deviations are that sometimes
2002                  * we enter the time-wait state when it is not really
2003                  * required (harmless), and do not send active resets when
2004                  * the specs require them (TCP_ESTABLISHED, TCP_CLOSE_WAIT,
2005                  * when they look like CLOSING or LAST_ACK to Linux).
2006                  * Probably, I missed some more corner cases.
2007                  *                                              --ANK
2008                  */
2009                 tcp_send_fin(sk);
2010         }
2011
2012         if (timeout) {
2013                 struct task_struct *tsk = current;
2014                 DEFINE_WAIT(wait);
2015
2016                 do {
2017                         prepare_to_wait(sk->sk_sleep, &wait,
2018                                         TASK_INTERRUPTIBLE);
2019                         if (!closing(sk))
2020                                 break;
2021                         release_sock(sk);
2022                         timeout = schedule_timeout(timeout);
2023                         lock_sock(sk);
2024                 } while (!signal_pending(tsk) && timeout);
2025
2026                 finish_wait(sk->sk_sleep, &wait);
2027         }
2028
2029 adjudge_to_death:
2030         /* It is the last release_sock in its life. It will remove backlog. */
2031         release_sock(sk);
2032
2033
2034         /* Now socket is owned by kernel and we acquire BH lock
2035            to finish close. No need to check for user refs.
2036          */
2037         local_bh_disable();
2038         bh_lock_sock(sk);
2039         BUG_TRAP(!sock_owned_by_user(sk));
2040
2041         sock_hold(sk);
2042         sock_orphan(sk);
2043
2044         /*      This is a (useful) BSD violation of the RFC. There is a
2045          *      problem with TCP as specified, in that the other end could
2046          *      keep a socket open forever with no application left at this end.
2047          *      We use a 3 minute timeout (about the same as BSD) and then kill
2048          *      our end. If they send after that then tough - BUT: long enough
2049          *      that we won't make the old "4*rto = almost no time - whoops,
2050          *      reset" mistake.
2051          *
2052          *      Nope, it was not mistake. It is really desired behaviour
2053          *      f.e. on http servers, when such sockets are useless, but
2054          *      consume significant resources. Let's do it with special
2055          *      linger2 option.                                 --ANK
2056          */
2057
2058         if (sk->sk_state == TCP_FIN_WAIT2) {
2059                 struct tcp_opt *tp = tcp_sk(sk);
2060                 if (tp->linger2 < 0) {
2061                         tcp_set_state(sk, TCP_CLOSE);
2062                         tcp_send_active_reset(sk, GFP_ATOMIC);
2063                         NET_INC_STATS_BH(TCPAbortOnLinger);
2064                 } else {
2065                         int tmo = tcp_fin_time(tp);
2066
2067                         if (tmo > TCP_TIMEWAIT_LEN) {
2068                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2069                         } else {
2070                                 atomic_inc(&tcp_orphan_count);
2071                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2072                                 goto out;
2073                         }
2074                 }
2075         }
2076         if (sk->sk_state != TCP_CLOSE) {
2077                 tcp_mem_reclaim(sk);
2078                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2079                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
2080                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2081                         if (net_ratelimit())
2082                                 printk(KERN_INFO "TCP: too many orphaned "
2083                                        "sockets\n");
2084                         tcp_set_state(sk, TCP_CLOSE);
2085                         tcp_send_active_reset(sk, GFP_ATOMIC);
2086                         NET_INC_STATS_BH(TCPAbortOnMemory);
2087                 }
2088         }
2089         atomic_inc(&tcp_orphan_count);
2090
2091         if (sk->sk_state == TCP_CLOSE)
2092                 tcp_destroy_sock(sk);
2093         /* Otherwise, socket is reprieved until protocol close. */
2094
2095 out:
2096         bh_unlock_sock(sk);
2097         local_bh_enable();
2098         sock_put(sk);
2099 }
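
/*
 * Example (illustrative only): two userspace knobs that land in the abort
 * paths of tcp_close() above.  Closing with unread receive data, or with
 * SO_LINGER enabled and a zero timeout, makes the close send a RST instead
 * of going through the normal FIN handshake:
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static int abortive_close(int fd)
 *	{
 *		struct linger lg = {
 *			.l_onoff  = 1,	// linger enabled ...
 *			.l_linger = 0,	// ... with a zero timeout => RST on close
 *		};
 *
 *		if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg)) < 0)
 *			return -1;
 *		return close(fd);
 *	}
 */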
2100
2101 /* These states need RST on ABORT according to RFC793 */
2102
2103 static inline int tcp_need_reset(int state)
2104 {
2105         return (1 << state) &
2106                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2107                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2108 }
2109
2110 int tcp_disconnect(struct sock *sk, int flags)
2111 {
2112         struct inet_opt *inet = inet_sk(sk);
2113         struct tcp_opt *tp = tcp_sk(sk);
2114         int err = 0;
2115         int old_state = sk->sk_state;
2116
2117         if (old_state != TCP_CLOSE)
2118                 tcp_set_state(sk, TCP_CLOSE);
2119
2120         /* ABORT function of RFC793 */
2121         if (old_state == TCP_LISTEN) {
2122                 tcp_listen_stop(sk);
2123         } else if (tcp_need_reset(old_state) ||
2124                    (tp->snd_nxt != tp->write_seq &&
2125                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2126                 /* The last check adjusts for the discrepancy of Linux wrt. the
2127                  * RFC states.
2128                  */
2129                 tcp_send_active_reset(sk, gfp_any());
2130                 sk->sk_err = ECONNRESET;
2131         } else if (old_state == TCP_SYN_SENT)
2132                 sk->sk_err = ECONNRESET;
2133
2134         tcp_clear_xmit_timers(sk);
2135         __skb_queue_purge(&sk->sk_receive_queue);
2136         tcp_writequeue_purge(sk);
2137         __skb_queue_purge(&tp->out_of_order_queue);
2138
2139         inet->dport = 0;
2140
2141         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2142                 inet_reset_saddr(sk);
2143
2144         sk->sk_shutdown = 0;
2145         sock_reset_flag(sk, SOCK_DONE);
2146         tp->srtt = 0;
2147         if ((tp->write_seq += tp->max_window + 2) == 0)
2148                 tp->write_seq = 1;
2149         tp->backoff = 0;
2150         tp->snd_cwnd = 2;
2151         tp->probes_out = 0;
2152         tp->packets_out = 0;
2153         tp->snd_ssthresh = 0x7fffffff;
2154         tp->snd_cwnd_cnt = 0;
2155         tcp_set_ca_state(tp, TCP_CA_Open);
2156         tcp_clear_retrans(tp);
2157         tcp_delack_init(tp);
2158         tp->send_head = NULL;
2159         tp->saw_tstamp = 0;
2160         tcp_sack_reset(tp);
2161         __sk_dst_reset(sk);
2162
2163         BUG_TRAP(!inet->num || tp->bind_hash);
2164
2165         sk->sk_error_report(sk);
2166         return err;
2167 }
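
/*
 * Example (illustrative only): from userspace, tcp_disconnect() is normally
 * reached by calling connect() with an AF_UNSPEC address on an existing
 * socket, which drops the association and returns the socket to CLOSE so
 * it can be reused:
 *
 *	#include <sys/socket.h>
 *	#include <string.h>
 *
 *	static int tcp_disassociate(int fd)
 *	{
 *		struct sockaddr sa;
 *
 *		memset(&sa, 0, sizeof(sa));
 *		sa.sa_family = AF_UNSPEC;	// "unconnect" request
 *		return connect(fd, &sa, sizeof(sa));
 *	}
 */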
2168
2169 /*
2170  *      Wait for an incoming connection, avoid race
2171  *      conditions. This must be called with the socket locked.
2172  */
2173 static int wait_for_connect(struct sock *sk, long timeo)
2174 {
2175         struct tcp_opt *tp = tcp_sk(sk);
2176         DEFINE_WAIT(wait);
2177         int err;
2178
2179         /*
2180          * True wake-one mechanism for incoming connections: only
2181          * one process gets woken up, not the 'whole herd'.
2182          * Since we do not 'race & poll' for established sockets
2183          * anymore, the common case will execute the loop only once.
2184          *
2185          * Subtle issue: "add_wait_queue_exclusive()" will be added
2186          * after any current non-exclusive waiters, and we know that
2187          * it will always _stay_ after any new non-exclusive waiters
2188          * because all non-exclusive waiters are added at the
2189          * beginning of the wait-queue. As such, it's ok to "drop"
2190          * our exclusiveness temporarily when we get woken up without
2191          * having to remove and re-insert us on the wait queue.
2192          */
2193         for (;;) {
2194                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
2195                                           TASK_INTERRUPTIBLE);
2196                 release_sock(sk);
2197                 if (!tp->accept_queue)
2198                         timeo = schedule_timeout(timeo);
2199                 lock_sock(sk);
2200                 err = 0;
2201                 if (tp->accept_queue)
2202                         break;
2203                 err = -EINVAL;
2204                 if (sk->sk_state != TCP_LISTEN)
2205                         break;
2206                 err = sock_intr_errno(timeo);
2207                 if (signal_pending(current))
2208                         break;
2209                 err = -EAGAIN;
2210                 if (!timeo)
2211                         break;
2212         }
2213         finish_wait(sk->sk_sleep, &wait);
2214         return err;
2215 }
2216
2217 /*
2218  *      This will accept the next outstanding connection.
2219  */
2220
2221 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2222 {
2223         struct tcp_opt *tp = tcp_sk(sk);
2224         struct open_request *req;
2225         struct sock *newsk;
2226         int error;
2227 #ifdef CONFIG_ACCEPT_QUEUES     
2228         int prev_class = 0;
2229         int first;
2230 #endif
2231
2232         lock_sock(sk);
2233
2234         /* We need to make sure that this socket is listening,
2235          * and that it has something pending.
2236          */
2237         error = -EINVAL;
2238         if (sk->sk_state != TCP_LISTEN)
2239                 goto out;
2240
2241         /* Find already established connection */
2242         if (!tp->accept_queue) {
2243                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2244                 /* If this is a non blocking socket don't sleep */
2245                 error = -EAGAIN;
2246                 if (!timeo)
2247                         goto out;
2248
2249                 error = wait_for_connect(sk, timeo);
2250                 if (error)
2251                         goto out;
2252         }
2253
2254 #ifndef CONFIG_ACCEPT_QUEUES
2255         req = tp->accept_queue;
2256         if ((tp->accept_queue = req->dl_next) == NULL)
2257                 tp->accept_queue_tail = NULL;
2258         newsk = req->sk;
2259         sk_acceptq_removed(sk);
2260 #else
2261         first = tp->class_index;
2262         /* We should always have a request queued here. The accept_queue
2263          * is already checked for NULL above.
2264          */
2265         while(!tp->acceptq[first].aq_head) {
2266                 tp->acceptq[first].aq_cnt = 0;
2267                 first = (first+1) & ~NUM_ACCEPT_QUEUES; 
2268         }
2269         req = tp->acceptq[first].aq_head;
2270         tp->acceptq[first].aq_qcount--;
2271         tp->acceptq[first].aq_count++;
2272         tp->acceptq[first].aq_wait_time+=(jiffies - req->acceptq_time_stamp);
2273
2274         for (prev_class= first-1 ; prev_class >=0; prev_class--)
2275                 if (tp->acceptq[prev_class].aq_tail)
2276                         break;
2277         if (prev_class>=0)
2278                 tp->acceptq[prev_class].aq_tail->dl_next = req->dl_next; 
2279         else 
2280                 tp->accept_queue = req->dl_next;
2281
2282         if (req == tp->acceptq[first].aq_tail) 
2283                 tp->acceptq[first].aq_head = tp->acceptq[first].aq_tail = NULL;
2284         else
2285                 tp->acceptq[first].aq_head = req->dl_next;
2286
2287         if((++(tp->acceptq[first].aq_cnt)) >= tp->acceptq[first].aq_ratio){
2288                 tp->acceptq[first].aq_cnt = 0;
2289                 tp->class_index = ++first & (NUM_ACCEPT_QUEUES-1);
2290         }       
2291         newsk = req->sk;
2292         sk_acceptq_removed(sk, req->acceptq_class);
2293 #endif
2294         tcp_openreq_fastfree(req);
2295         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2296         release_sock(sk);
2297         return newsk;
2298
2299 out:
2300         release_sock(sk);
2301         *err = error;
2302         return NULL;
2303 }
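
/*
 * Example (illustrative only): the -EAGAIN return above is what a
 * non-blocking listener sees from accept() when no connection has been
 * queued yet.  A typical userspace caller simply retries once the listening
 * socket becomes readable:
 *
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *	#include <poll.h>
 *
 *	static int accept_when_ready(int lfd)
 *	{
 *		for (;;) {
 *			int fd = accept(lfd, NULL, NULL);
 *			if (fd >= 0 || (errno != EAGAIN && errno != EWOULDBLOCK))
 *				return fd;	// got a socket, or a real error
 *
 *			struct pollfd pfd = { .fd = lfd, .events = POLLIN };
 *			if (poll(&pfd, 1, -1) < 0 && errno != EINTR)
 *				return -1;
 *		}
 *	}
 */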
2304
2305
2306 /*
2307  *      Socket option code for TCP.
2308  */
2309 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2310                    int optlen)
2311 {
2312         struct tcp_opt *tp = tcp_sk(sk);
2313         int val;
2314         int err = 0;
2315
2316         if (level != SOL_TCP)
2317                 return tp->af_specific->setsockopt(sk, level, optname,
2318                                                    optval, optlen);
2319
2320         if (optlen < sizeof(int))
2321                 return -EINVAL;
2322
2323         if (get_user(val, (int __user *)optval))
2324                 return -EFAULT;
2325
2326         lock_sock(sk);
2327
2328         switch (optname) {
2329         case TCP_MAXSEG:
2330                 /* Values greater than the interface MTU won't take effect.
2331                  * However, at the point when this call is made we typically
2332                  * don't yet know which interface is going to be used. */
2333                 if (val < 8 || val > MAX_TCP_WINDOW) {
2334                         err = -EINVAL;
2335                         break;
2336                 }
2337                 tp->user_mss = val;
2338                 break;
2339
2340         case TCP_NODELAY:
2341                 if (val) {
2342                         /* TCP_NODELAY is weaker than TCP_CORK, so that
2343                          * this option on corked socket is remembered, but
2344                          * it is not activated until cork is cleared.
2345                          *
2346                          * However, when TCP_NODELAY is set we make
2347                          * an explicit push, which overrides even TCP_CORK
2348                          * for currently queued segments.
2349                          */
2350                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2351                         tcp_push_pending_frames(sk, tp);
2352                 } else {
2353                         tp->nonagle &= ~TCP_NAGLE_OFF;
2354                 }
2355                 break;
2356
2357         case TCP_CORK:
2358                 /* When set indicates to always queue non-full frames.
2359                  * Later the user clears this option and we transmit
2360                  * any pending partial frames in the queue.  This is
2361                  * meant to be used alongside sendfile() to get properly
2362                  * filled frames when the user (for example) must write
2363                  * out headers with a write() call first and then use
2364                  * sendfile to send out the data parts.
2365                  *
2366                  * TCP_CORK can be set together with TCP_NODELAY and it is
2367                  * stronger than TCP_NODELAY.
2368                  */
2369                 if (val) {
2370                         tp->nonagle |= TCP_NAGLE_CORK;
2371                 } else {
2372                         tp->nonagle &= ~TCP_NAGLE_CORK;
2373                         if (tp->nonagle&TCP_NAGLE_OFF)
2374                                 tp->nonagle |= TCP_NAGLE_PUSH;
2375                         tcp_push_pending_frames(sk, tp);
2376                 }
2377                 break;
2378
2379         case TCP_KEEPIDLE:
2380                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2381                         err = -EINVAL;
2382                 else {
2383                         tp->keepalive_time = val * HZ;
2384                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2385                             !((1 << sk->sk_state) &
2386                               (TCPF_CLOSE | TCPF_LISTEN))) {
2387                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2388                                 if (tp->keepalive_time > elapsed)
2389                                         elapsed = tp->keepalive_time - elapsed;
2390                                 else
2391                                         elapsed = 0;
2392                                 tcp_reset_keepalive_timer(sk, elapsed);
2393                         }
2394                 }
2395                 break;
2396         case TCP_KEEPINTVL:
2397                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2398                         err = -EINVAL;
2399                 else
2400                         tp->keepalive_intvl = val * HZ;
2401                 break;
2402         case TCP_KEEPCNT:
2403                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2404                         err = -EINVAL;
2405                 else
2406                         tp->keepalive_probes = val;
2407                 break;
2408         case TCP_SYNCNT:
2409                 if (val < 1 || val > MAX_TCP_SYNCNT)
2410                         err = -EINVAL;
2411                 else
2412                         tp->syn_retries = val;
2413                 break;
2414
2415         case TCP_LINGER2:
2416                 if (val < 0)
2417                         tp->linger2 = -1;
2418                 else if (val > sysctl_tcp_fin_timeout / HZ)
2419                         tp->linger2 = 0;
2420                 else
2421                         tp->linger2 = val * HZ;
2422                 break;
2423
2424         case TCP_DEFER_ACCEPT:
2425                 tp->defer_accept = 0;
2426                 if (val > 0) {
2427                         /* Translate value in seconds to number of
2428                          * retransmits */
2429                         while (tp->defer_accept < 32 &&
2430                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2431                                        tp->defer_accept))
2432                                 tp->defer_accept++;
2433                         tp->defer_accept++;
2434                 }
2435                 break;
2436
2437         case TCP_WINDOW_CLAMP:
2438                 if (!val) {
2439                         if (sk->sk_state != TCP_CLOSE) {
2440                                 err = -EINVAL;
2441                                 break;
2442                         }
2443                         tp->window_clamp = 0;
2444                 } else
2445                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2446                                                 SOCK_MIN_RCVBUF / 2 : val;
2447                 break;
2448
2449         case TCP_QUICKACK:
2450                 if (!val) {
2451                         tp->ack.pingpong = 1;
2452                 } else {
2453                         tp->ack.pingpong = 0;
2454                         if ((1 << sk->sk_state) &
2455                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2456                             tcp_ack_scheduled(tp)) {
2457                                 tp->ack.pending |= TCP_ACK_PUSHED;
2458                                 cleanup_rbuf(sk, 1);
2459                                 if (!(val & 1))
2460                                         tp->ack.pingpong = 1;
2461                         }
2462                 }
2463                 break;
2464                 
2465 #ifdef CONFIG_ACCEPT_QUEUES
2466         case TCP_ACCEPTQ_SHARE:
2467 #ifdef CONFIG_CKRM
2468                 /* If CKRM is set then the shares are set through rcfs.
2469                  * Getting the shares will still succeed. */
2470                 err = -EOPNOTSUPP;
2471                 break;
2472 #else           
2473                 {
2474                         char share_wt[NUM_ACCEPT_QUEUES];
2475                         int i,j;
2476
2477                         if (sk->sk_state != TCP_LISTEN)
2478                                 return -EOPNOTSUPP;
2479
2480                         if (copy_from_user(share_wt,optval, optlen)) {
2481                                 err = -EFAULT;
2482                                 break;
2483                         }
2484                         j = 0;
2485                         for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
2486                                 if (share_wt[i]) {
2487                                         if (!j)
2488                                                 j = share_wt[i];
2489                                         else if (share_wt[i] < j) {
2490                                                 j = share_wt[i];
2491                                         }
2492                                 }
2493                                 else
2494                                         tp->acceptq[i].aq_ratio = 0;
2495                                         
2496                         }
2497                         if (j == 0) {
2498                                 /* Class 0 is always valid. If nothing is 
2499                                  * specified set class 0 as 1.
2500                                  */
2501                                 share_wt[0] = 1;
2502                                 j = 1;
2503                         }
2504                         for (i=0; i < NUM_ACCEPT_QUEUES; i++)  {
2505                                 tp->acceptq[i].aq_ratio = share_wt[i]/j;
2506                                 tp->acceptq[i].aq_cnt = 0;
2507                         }
2508                 }
2509                 break;
2510 #endif
2511 #endif
2512         default:
2513                 err = -ENOPROTOOPT;
2514                 break;
2515         };
2516         release_sock(sk);
2517         return err;
2518 }
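
/*
 * Example (illustrative only): the TCP_CORK/TCP_NODELAY semantics described
 * above are typically used like this from userspace -- cork, write the
 * header and body so they share full-sized frames, then uncork to flush the
 * final partial frame:
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	static int send_corked(int fd, const void *hdr, size_t hlen,
 *			       const void *body, size_t blen)
 *	{
 *		int on = 1, off = 0;
 *
 *		setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *		if (send(fd, hdr, hlen, 0) < 0 || send(fd, body, blen, 0) < 0)
 *			return -1;
 *		// Clearing TCP_CORK pushes whatever partial frame is queued.
 *		return setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
 *	}
 */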
2519
2520 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2521                    int __user *optlen)
2522 {
2523         struct tcp_opt *tp = tcp_sk(sk);
2524         int val, len;
2525
2526         if (level != SOL_TCP)
2527                 return tp->af_specific->getsockopt(sk, level, optname,
2528                                                    optval, optlen);
2529
2530         if (get_user(len, optlen))
2531                 return -EFAULT;
2532
2533         len = min_t(unsigned int, len, sizeof(int));
2534
2535         if (len < 0)
2536                 return -EINVAL;
2537
2538         switch (optname) {
2539         case TCP_MAXSEG:
2540                 val = tp->mss_cache_std;
2541                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2542                         val = tp->user_mss;
2543                 break;
2544         case TCP_NODELAY:
2545                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2546                 break;
2547         case TCP_CORK:
2548                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2549                 break;
2550         case TCP_KEEPIDLE:
2551                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2552                 break;
2553         case TCP_KEEPINTVL:
2554                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2555                 break;
2556         case TCP_KEEPCNT:
2557                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2558                 break;
2559         case TCP_SYNCNT:
2560                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2561                 break;
2562         case TCP_LINGER2:
2563                 val = tp->linger2;
2564                 if (val >= 0)
2565                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2566                 break;
2567         case TCP_DEFER_ACCEPT:
2568                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2569                                                (tp->defer_accept - 1));
2570                 break;
2571         case TCP_WINDOW_CLAMP:
2572                 val = tp->window_clamp;
2573                 break;
2574         case TCP_INFO: {
2575                 struct tcp_info info;
2576
2577                 if (get_user(len, optlen))
2578                         return -EFAULT;
2579
2580                 tcp_get_info(sk, &info);
2581
2582                 len = min_t(unsigned int, len, sizeof(info));
2583                 if (put_user(len, optlen))
2584                         return -EFAULT;
2585                 if (copy_to_user(optval, &info, len))
2586                         return -EFAULT;
2587                 return 0;
2588         }
2589         case TCP_QUICKACK:
2590                 val = !tp->ack.pingpong;
2591                 break;
2592
2593 #ifdef CONFIG_ACCEPT_QUEUES
2594         case TCP_ACCEPTQ_SHARE: 
2595         {
2596                 struct tcp_acceptq_info tinfo[NUM_ACCEPT_QUEUES];
2597                 int i;
2598
2599                 if (sk->sk_state != TCP_LISTEN)
2600                         return -EOPNOTSUPP;
2601
2602                 if (get_user(len, optlen))
2603                         return -EFAULT;
2604
2605                 memset(tinfo, 0, sizeof(tinfo));
2606
2607                 for(i=0; i < NUM_ACCEPT_QUEUES; i++) {
2608                         tinfo[i].acceptq_wait_time = 
2609                              jiffies_to_msecs(tp->acceptq[i].aq_wait_time);
2610                         tinfo[i].acceptq_qcount = tp->acceptq[i].aq_qcount;
2611                         tinfo[i].acceptq_count = tp->acceptq[i].aq_count;
2612                         tinfo[i].acceptq_shares=tp->acceptq[i].aq_ratio;
2613                 }
2614
2615                 len = min_t(unsigned int, len, sizeof(tinfo));
2616                 if (put_user(len, optlen)) 
2617                         return -EFAULT;
2618                         
2619                 if (copy_to_user(optval, (char *)tinfo, len))
2620                         return -EFAULT;
2621                 
2622                 return 0;
2623         }
2624         break;
2625 #endif
2626         default:
2627                 return -ENOPROTOOPT;
2628         };
2629
2630         if (put_user(len, optlen))
2631                 return -EFAULT;
2632         if (copy_to_user(optval, &val, len))
2633                 return -EFAULT;
2634         return 0;
2635 }
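
/*
 * Example (illustrative only): the TCP_INFO branch above is consumed from
 * userspace like this; the kernel copies back at most the length the caller
 * supplies, so differing struct tcp_info layouts stay compatible:
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *	#include <stdio.h>
 *
 *	static void dump_tcp_info(int fd)
 *	{
 *		struct tcp_info ti;
 *		socklen_t len = sizeof(ti);
 *
 *		if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *			printf("rtt %uus snd_cwnd %u retrans %u\n",
 *			       ti.tcpi_rtt, ti.tcpi_snd_cwnd, ti.tcpi_retrans);
 *	}
 */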
2636
2637
2638 extern void __skb_cb_too_small_for_tcp(int, int);
2639 extern void tcpdiag_init(void);
2640
2641 static __initdata unsigned long thash_entries;
2642 static int __init set_thash_entries(char *str)
2643 {
2644         if (!str)
2645                 return 0;
2646         thash_entries = simple_strtoul(str, &str, 0);
2647         return 1;
2648 }
2649 __setup("thash_entries=", set_thash_entries);
2650
2651 void __init tcp_init(void)
2652 {
2653         struct sk_buff *skb = NULL;
2654         unsigned long goal;
2655         int order, i;
2656
2657         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2658                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2659                                            sizeof(skb->cb));
2660
2661         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2662                                                    sizeof(struct open_request),
2663                                                0, SLAB_HWCACHE_ALIGN,
2664                                                NULL, NULL);
2665         if (!tcp_openreq_cachep)
2666                 panic("tcp_init: Cannot alloc open_request cache.");
2667
2668         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2669                                               sizeof(struct tcp_bind_bucket),
2670                                               0, SLAB_HWCACHE_ALIGN,
2671                                               NULL, NULL);
2672         if (!tcp_bucket_cachep)
2673                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2674
2675         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2676                                                 sizeof(struct tcp_tw_bucket),
2677                                                 0, SLAB_HWCACHE_ALIGN,
2678                                                 NULL, NULL);
2679         if (!tcp_timewait_cachep)
2680                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2681
2682         /* Size and allocate the main established and bind bucket
2683          * hash tables.
2684          *
2685          * The methodology is similar to that of the buffer cache.
2686          */
2687         if (num_physpages >= (128 * 1024))
2688                 goal = num_physpages >> (21 - PAGE_SHIFT);
2689         else
2690                 goal = num_physpages >> (23 - PAGE_SHIFT);
2691
2692         if (thash_entries)
2693                 goal = (thash_entries * sizeof(struct tcp_ehash_bucket)) >> PAGE_SHIFT;
2694         for (order = 0; (1UL << order) < goal; order++)
2695                 ;
2696         do {
2697                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2698                         sizeof(struct tcp_ehash_bucket);
2699                 tcp_ehash_size >>= 1;
2700                 while (tcp_ehash_size & (tcp_ehash_size - 1))
2701                         tcp_ehash_size--;
2702                 tcp_ehash = (struct tcp_ehash_bucket *)
2703                         __get_free_pages(GFP_ATOMIC, order);
2704         } while (!tcp_ehash && --order > 0);
2705
2706         if (!tcp_ehash)
2707                 panic("Failed to allocate TCP established hash table\n");
2708         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2709                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2710                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2711         }
2712
2713         do {
2714                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2715                         sizeof(struct tcp_bind_hashbucket);
2716                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2717                         continue;
2718                 tcp_bhash = (struct tcp_bind_hashbucket *)
2719                         __get_free_pages(GFP_ATOMIC, order);
2720         } while (!tcp_bhash && --order >= 0);
2721
2722         if (!tcp_bhash)
2723                 panic("Failed to allocate TCP bind hash table\n");
2724         for (i = 0; i < tcp_bhash_size; i++) {
2725                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2726                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2727         }
2728
2729         /* Try to be a bit smarter and adjust defaults depending
2730          * on available memory.
2731          */
2732         if (order > 4) {
2733                 sysctl_local_port_range[0] = 32768;
2734                 sysctl_local_port_range[1] = 61000;
2735                 sysctl_tcp_max_tw_buckets = 180000;
2736                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2737                 sysctl_max_syn_backlog = 1024;
2738         } else if (order < 3) {
2739                 sysctl_local_port_range[0] = 1024 * (3 - order);
2740                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2741                 sysctl_tcp_max_orphans >>= (3 - order);
2742                 sysctl_max_syn_backlog = 128;
2743         }
2744         tcp_port_rover = sysctl_local_port_range[0] - 1;
2745
2746         sysctl_tcp_mem[0] =  768 << order;
2747         sysctl_tcp_mem[1] = 1024 << order;
2748         sysctl_tcp_mem[2] = 1536 << order;
2749
2750         if (order < 3) {
2751                 sysctl_tcp_wmem[2] = 64 * 1024;
2752                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2753                 sysctl_tcp_rmem[1] = 43689;
2754                 sysctl_tcp_rmem[2] = 2 * 43689;
2755         }
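        /*
         * Worked example (illustrative): with order == 5 the branches above
         * give a local port range of 32768-61000, 180000 time-wait buckets,
         * sysctl_tcp_max_orphans = 4096 << 1 = 8192, a SYN backlog of 1024
         * and sysctl_tcp_mem = { 24576, 32768, 49152 } pages.
         */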
2756
2757         printk(KERN_INFO "TCP: Hash tables configured "
2758                "(established %d bind %d)\n",
2759                tcp_ehash_size << 1, tcp_bhash_size);
2760
2761         tcpdiag_init();
2762 }
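/*
 * Worked example of the hash sizing above (illustrative, assuming 4 KiB
 * pages and a 16-byte struct tcp_ehash_bucket): with 512 MiB of memory,
 * num_physpages = 131072 >= 128 * 1024, so goal = 131072 >> 9 = 256 pages
 * and order = 8.  The established hash then receives 1 MiB, i.e. 65536
 * buckets, which is halved and rounded down to a power of two, giving
 * tcp_ehash_size = 32768; the table is initialised as tcp_ehash_size << 1
 * chains because the upper half holds TIME_WAIT sockets.
 */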
2763
2764 EXPORT_SYMBOL(__tcp_mem_reclaim);
2765 EXPORT_SYMBOL(sysctl_tcp_rmem);
2766 EXPORT_SYMBOL(sysctl_tcp_wmem);
2767 EXPORT_SYMBOL(tcp_accept);
2768 EXPORT_SYMBOL(tcp_close);
2769 EXPORT_SYMBOL(tcp_close_state);
2770 EXPORT_SYMBOL(tcp_destroy_sock);
2771 EXPORT_SYMBOL(tcp_disconnect);
2772 EXPORT_SYMBOL(tcp_getsockopt);
2773 EXPORT_SYMBOL(tcp_ioctl);
2774 EXPORT_SYMBOL(tcp_openreq_cachep);
2775 EXPORT_SYMBOL(tcp_poll);
2776 EXPORT_SYMBOL(tcp_read_sock);
2777 EXPORT_SYMBOL(tcp_recvmsg);
2778 EXPORT_SYMBOL(tcp_sendmsg);
2779 EXPORT_SYMBOL(tcp_sendpage);
2780 EXPORT_SYMBOL(tcp_setsockopt);
2781 EXPORT_SYMBOL(tcp_shutdown);
2782 EXPORT_SYMBOL(tcp_sockets_allocated);
2783 EXPORT_SYMBOL(tcp_statistics);
2784 EXPORT_SYMBOL(tcp_timewait_cachep);
2785 EXPORT_SYMBOL_GPL(cleanup_rbuf);