net/ipv4/tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or (at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
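/*
 * For orientation, the two common close sequences through the states above
 * are (per RFC 793):
 *
 *   active close:  ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE
 *   passive close: ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE
 *
 * A simultaneous close passes through CLOSING instead of FIN_WAIT2.
 */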
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260 #include <linux/cache.h>
261 #include <linux/in.h>
262
263 #include <net/icmp.h>
264 #include <net/tcp.h>
265 #include <net/xfrm.h>
266 #include <net/ip.h>
267
268
269 #include <asm/uaccess.h>
270 #include <asm/ioctls.h>
271
272 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
273
274 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
275
276 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
277
278 EXPORT_SYMBOL_GPL(tcp_orphan_count);
279
280 int sysctl_tcp_mem[3] __read_mostly;
281 int sysctl_tcp_wmem[3] __read_mostly;
282 int sysctl_tcp_rmem[3] __read_mostly;
283
284 EXPORT_SYMBOL(sysctl_tcp_mem);
285 EXPORT_SYMBOL(sysctl_tcp_rmem);
286 EXPORT_SYMBOL(sysctl_tcp_wmem);
287
288 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
289 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
290
291 EXPORT_SYMBOL(tcp_memory_allocated);
292 EXPORT_SYMBOL(tcp_sockets_allocated);
293
294 /*
295  * Pressure flag: try to collapse.
296  * Technical note: it is used by multiple contexts non-atomically.
297  * All of sk_stream_mem_schedule() is of this nature: accounting
298  * is strict, actions are advisory and have some latency.
299  */
300 int tcp_memory_pressure;
301
302 EXPORT_SYMBOL(tcp_memory_pressure);
303
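/* Called from the stream memory accounting path (sk_stream_mem_schedule())
 * when total TCP memory exceeds the pressure threshold; tcp_memory_pressure
 * is cleared again elsewhere once usage falls back below the low watermark
 * in sysctl_tcp_mem[0].
 */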
304 void tcp_enter_memory_pressure(void)
305 {
306         if (!tcp_memory_pressure) {
307                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
308                 tcp_memory_pressure = 1;
309         }
310 }
311
312 EXPORT_SYMBOL(tcp_enter_memory_pressure);
313
314 /*
315  *      Wait for a TCP event.
316  *
317  *      Note that we don't need to lock the socket, as the upper poll layers
318  *      take care of normal races (between the test and the event) and we don't
319  *      go look at any of the socket buffers directly.
320  */
321 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
322 {
323         unsigned int mask;
324         struct sock *sk = sock->sk;
325         struct tcp_sock *tp = tcp_sk(sk);
326
327         poll_wait(file, sk->sk_sleep, wait);
328         if (sk->sk_state == TCP_LISTEN)
329                 return inet_csk_listen_poll(sk);
330
331         /* Socket is not locked. We are protected from async events
332            by poll logic and correct handling of state changes
333            made by other threads is impossible in any case.
334          */
335
336         mask = 0;
337         if (sk->sk_err)
338                 mask = POLLERR;
339
340         /*
341          * POLLHUP is certainly not done right. But poll() doesn't
342          * have a notion of HUP in just one direction, and for a
343          * socket the read side is more interesting.
344          *
345          * Some poll() documentation says that POLLHUP is incompatible
346          * with the POLLOUT/POLLWR flags, so somebody should check this
347          * all. But careful, it tends to be safer to return too many
348          * bits than too few, and you can easily break real applications
349          * if you don't tell them that something has hung up!
350          *
351          * Check-me.
352          *
353          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
354          * our fs/select.c). It means that after we received EOF,
355  * poll always returns immediately, making it impossible to poll() for write()
356          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
357          * if and only if shutdown has been made in both directions.
358          * Actually, it is interesting to look how Solaris and DUX
359  * solve this dilemma. I would prefer, if POLLHUP were maskable,
360          * then we could set it on SND_SHUTDOWN. BTW examples given
361          * in Stevens' books assume exactly this behaviour, it explains
362  * why POLLHUP is incompatible with POLLOUT.    --ANK
363          *
364          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
365          * blocking on fresh not-connected or disconnected socket. --ANK
366          */
367         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
368                 mask |= POLLHUP;
369         if (sk->sk_shutdown & RCV_SHUTDOWN)
370                 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
371
372         /* Connected? */
373         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
374                 /* Potential race condition. If the read of tp below is
375                  * reordered above the read of sk->sk_state, we can be
376                  * illegally awakened in SYN_* states. */
377                 if ((tp->rcv_nxt != tp->copied_seq) &&
378                     (tp->urg_seq != tp->copied_seq ||
379                      tp->rcv_nxt != tp->copied_seq + 1 ||
380                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
381                         mask |= POLLIN | POLLRDNORM;
382
383                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
384                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
385                                 mask |= POLLOUT | POLLWRNORM;
386                         } else {  /* send SIGIO later */
387                                 set_bit(SOCK_ASYNC_NOSPACE,
388                                         &sk->sk_socket->flags);
389                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
390
391                                 /* Race breaker. If space is freed after
392                                  * wspace test but before the flags are set,
393                                  * IO signal will be lost.
394                                  */
395                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
396                                         mask |= POLLOUT | POLLWRNORM;
397                         }
398                 }
399
400                 if (tp->urg_data & TCP_URG_VALID)
401                         mask |= POLLPRI;
402         }
403         return mask;
404 }
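/*
 * Illustrative outcomes of tcp_poll() above: a listening socket reports
 * (POLLIN | POLLRDNORM) once a connection is ready to accept; an established
 * socket reports POLLIN/POLLRDNORM for unread data, POLLOUT/POLLWRNORM while
 * there is send space, POLLPRI for pending out-of-band data, POLLRDHUP after
 * the peer shut down its send side, and POLLHUP only once both directions
 * are shut down (or the socket is in TCP_CLOSE).
 */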
405
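/*
 * ioctl() support: SIOCINQ reports the number of unread bytes queued for
 * receive (less one if a FIN is queued, and bounded by the urgent mark when
 * out-of-line urgent data is pending), SIOCATMARK reports whether the next
 * byte to be read is the urgent mark, and SIOCOUTQ reports the number of
 * bytes written but not yet acknowledged (write_seq - snd_una).
 */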
406 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
407 {
408         struct tcp_sock *tp = tcp_sk(sk);
409         int answ;
410
411         switch (cmd) {
412         case SIOCINQ:
413                 if (sk->sk_state == TCP_LISTEN)
414                         return -EINVAL;
415
416                 lock_sock(sk);
417                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
418                         answ = 0;
419                 else if (sock_flag(sk, SOCK_URGINLINE) ||
420                          !tp->urg_data ||
421                          before(tp->urg_seq, tp->copied_seq) ||
422                          !before(tp->urg_seq, tp->rcv_nxt)) {
423                         answ = tp->rcv_nxt - tp->copied_seq;
424
425                         /* Subtract 1, if FIN is in queue. */
426                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
427                                 answ -=
428                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
429                 } else
430                         answ = tp->urg_seq - tp->copied_seq;
431                 release_sock(sk);
432                 break;
433         case SIOCATMARK:
434                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
435                 break;
436         case SIOCOUTQ:
437                 if (sk->sk_state == TCP_LISTEN)
438                         return -EINVAL;
439
440                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
441                         answ = 0;
442                 else
443                         answ = tp->write_seq - tp->snd_una;
444                 break;
445         default:
446                 return -ENOIOCTLCMD;
447         }
448
449         return put_user(answ, (int __user *)arg);
450 }
451
452 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
453 {
454         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
455         tp->pushed_seq = tp->write_seq;
456 }
457
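/* A push is forced once more than half of the largest window the peer has
 * ever advertised (tp->max_window) has been queued beyond the last PSH-marked
 * sequence, so accumulated data keeps flowing even if the application never
 * explicitly flushes.
 */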
458 static inline int forced_push(struct tcp_sock *tp)
459 {
460         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
461 }
462
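/* Queue a freshly allocated skb at the tail of the write queue: initialise
 * its TCP control block at the current write_seq, charge its memory to the
 * socket, make it the send head if nothing else is pending, and consume any
 * one-shot TCP_NAGLE_PUSH override.
 */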
463 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
464                               struct sk_buff *skb)
465 {
466         skb->csum = 0;
467         TCP_SKB_CB(skb)->seq = tp->write_seq;
468         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
469         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
470         TCP_SKB_CB(skb)->sacked = 0;
471         skb_header_release(skb);
472         __skb_queue_tail(&sk->sk_write_queue, skb);
473         sk_charge_skb(sk, skb);
474         if (!sk->sk_send_head)
475                 sk->sk_send_head = skb;
476         if (tp->nonagle & TCP_NAGLE_PUSH)
477                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
478 }
479
480 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
481                                 struct sk_buff *skb)
482 {
483         if (flags & MSG_OOB) {
484                 tp->urg_mode = 1;
485                 tp->snd_up = tp->write_seq;
486                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
487         }
488 }
489
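/* If anything is queued but unsent, mark the last skb PSH unless MSG_MORE is
 * set (or a push is forced), record the urgent pointer for MSG_OOB, and hand
 * the pending frames to the transmit path; while MSG_MORE is set, corking
 * behaviour is substituted for the usual Nagle setting.
 */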
490 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
491                             int mss_now, int nonagle)
492 {
493         if (sk->sk_send_head) {
494                 struct sk_buff *skb = sk->sk_write_queue.prev;
495                 if (!(flags & MSG_MORE) || forced_push(tp))
496                         tcp_mark_push(tp, skb);
497                 tcp_mark_urg(tp, flags, skb);
498                 __tcp_push_pending_frames(sk, tp, mss_now,
499                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
500         }
501 }
502
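/* Zero-copy engine behind tcp_sendpage(): instead of copying the caller's
 * data, take references on its pages and attach them as skb page fragments,
 * coalescing with the previous fragment when possible, and leave the
 * checksumming to the hardware (CHECKSUM_HW).
 */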
503 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
504                          size_t psize, int flags)
505 {
506         struct tcp_sock *tp = tcp_sk(sk);
507         int mss_now, size_goal;
508         int err;
509         ssize_t copied;
510         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
511
512         /* Wait for a connection to finish. */
513         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
514                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
515                         goto out_err;
516
517         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
518
519         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
520         size_goal = tp->xmit_size_goal;
521         copied = 0;
522
523         err = -EPIPE;
524         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
525                 goto do_error;
526
527         while (psize > 0) {
528                 struct sk_buff *skb = sk->sk_write_queue.prev;
529                 struct page *page = pages[poffset / PAGE_SIZE];
530                 int copy, i, can_coalesce;
531                 int offset = poffset % PAGE_SIZE;
532                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
533
534                 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
535 new_segment:
536                         if (!sk_stream_memory_free(sk))
537                                 goto wait_for_sndbuf;
538
539                         skb = sk_stream_alloc_pskb(sk, 0, 0,
540                                                    sk->sk_allocation);
541                         if (!skb)
542                                 goto wait_for_memory;
543
544                         skb_entail(sk, tp, skb);
545                         copy = size_goal;
546                 }
547
548                 if (copy > size)
549                         copy = size;
550
551                 i = skb_shinfo(skb)->nr_frags;
552                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
553                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
554                         tcp_mark_push(tp, skb);
555                         goto new_segment;
556                 }
557                 if (!sk_stream_wmem_schedule(sk, copy))
558                         goto wait_for_memory;
559                 
560                 if (can_coalesce) {
561                         skb_shinfo(skb)->frags[i - 1].size += copy;
562                 } else {
563                         get_page(page);
564                         skb_fill_page_desc(skb, i, page, offset, copy);
565                 }
566
567                 skb->len += copy;
568                 skb->data_len += copy;
569                 skb->truesize += copy;
570                 sk->sk_wmem_queued += copy;
571                 sk->sk_forward_alloc -= copy;
572                 skb->ip_summed = CHECKSUM_HW;
573                 tp->write_seq += copy;
574                 TCP_SKB_CB(skb)->end_seq += copy;
575                 skb_shinfo(skb)->tso_segs = 0;
576
577                 if (!copied)
578                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
579
580                 copied += copy;
581                 poffset += copy;
582                 if (!(psize -= copy))
583                         goto out;
584
585                 if (skb->len < mss_now || (flags & MSG_OOB))
586                         continue;
587
588                 if (forced_push(tp)) {
589                         tcp_mark_push(tp, skb);
590                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
591                 } else if (skb == sk->sk_send_head)
592                         tcp_push_one(sk, mss_now);
593                 continue;
594
595 wait_for_sndbuf:
596                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
597 wait_for_memory:
598                 if (copied)
599                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
600
601                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
602                         goto do_error;
603
604                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
605                 size_goal = tp->xmit_size_goal;
606         }
607
608 out:
609         if (copied)
610                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
611         return copied;
612
613 do_error:
614         if (copied)
615                 goto out;
616 out_err:
617         return sk_stream_error(sk, flags, err);
618 }
619
620 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
621                      size_t size, int flags)
622 {
623         ssize_t res;
624         struct sock *sk = sock->sk;
625
626 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
627
628         if (!(sk->sk_route_caps & NETIF_F_SG) ||
629             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
630                 return sock_no_sendpage(sock, page, offset, size, flags);
631
632 #undef TCP_ZC_CSUM_FLAGS
633
634         lock_sock(sk);
635         TCP_CHECK_TIMER(sk);
636         res = do_tcp_sendpages(sk, &page, offset, size, flags);
637         TCP_CHECK_TIMER(sk);
638         release_sock(sk);
639         return res;
640 }
641
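/* Per-socket cache of a partially filled page (and the offset into it) used
 * by tcp_sendmsg() below to pack successive small writes into the same page
 * fragment instead of allocating a new page each time.
 */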
642 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
643 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
644
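/* Decide how much linear (head) space to request for a new skb: TSO-capable
 * SG devices get none (all payload goes into page fragments), non-SG devices
 * get a full MSS, and SG-without-TSO devices get at most what still fits in
 * the skb head so the remainder can spill into page fragments.
 */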
645 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
646 {
647         int tmp = tp->mss_cache;
648
649         if (sk->sk_route_caps & NETIF_F_SG) {
650                 if (sk->sk_route_caps & NETIF_F_TSO)
651                         tmp = 0;
652                 else {
653                         int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
654
655                         if (tmp >= pgbreak &&
656                             tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
657                                 tmp = pgbreak;
658                 }
659         }
660
661         return tmp;
662 }
663
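/* Copy data from the user's iovec into the write queue: fill the tail room of
 * the last skb when there is any, otherwise copy onto page fragments (reusing
 * the cached TCP_PAGE when it can be extended), growing segments towards
 * tp->xmit_size_goal and pushing them out as they fill or when forced_push()
 * fires.
 */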
664 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
665                 size_t size)
666 {
667         struct iovec *iov;
668         struct tcp_sock *tp = tcp_sk(sk);
669         struct sk_buff *skb;
670         int iovlen, flags;
671         int mss_now, size_goal;
672         int err, copied;
673         long timeo;
674
675         lock_sock(sk);
676         TCP_CHECK_TIMER(sk);
677
678         flags = msg->msg_flags;
679         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
680
681         /* Wait for a connection to finish. */
682         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
683                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
684                         goto out_err;
685
686         /* This should be in poll */
687         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
688
689         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
690         size_goal = tp->xmit_size_goal;
691
692         /* Ok commence sending. */
693         iovlen = msg->msg_iovlen;
694         iov = msg->msg_iov;
695         copied = 0;
696
697         err = -EPIPE;
698         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
699                 goto do_error;
700
701         while (--iovlen >= 0) {
702                 int seglen = iov->iov_len;
703                 unsigned char __user *from = iov->iov_base;
704
705                 iov++;
706
707                 while (seglen > 0) {
708                         int copy;
709
710                         skb = sk->sk_write_queue.prev;
711
712                         if (!sk->sk_send_head ||
713                             (copy = size_goal - skb->len) <= 0) {
714
715 new_segment:
716                                 /* Allocate new segment. If the interface is SG,
717                                  * allocate an skb that fits into a single page.
718                                  */
719                                 if (!sk_stream_memory_free(sk))
720                                         goto wait_for_sndbuf;
721
722                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
723                                                            0, sk->sk_allocation);
724                                 if (!skb)
725                                         goto wait_for_memory;
726
727                                 /*
728                                  * Check whether we can use HW checksum.
729                                  */
730                                 if (sk->sk_route_caps &
731                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
732                                      NETIF_F_HW_CSUM))
733                                         skb->ip_summed = CHECKSUM_HW;
734
735                                 skb_entail(sk, tp, skb);
736                                 copy = size_goal;
737                         }
738
739                         /* Try to append data to the end of skb. */
740                         if (copy > seglen)
741                                 copy = seglen;
742
743                         /* Where to copy to? */
744                         if (skb_tailroom(skb) > 0) {
745                                 /* We have some space in skb head. Superb! */
746                                 if (copy > skb_tailroom(skb))
747                                         copy = skb_tailroom(skb);
748                                 if ((err = skb_add_data(skb, from, copy)) != 0)
749                                         goto do_fault;
750                         } else {
751                                 int merge = 0;
752                                 int i = skb_shinfo(skb)->nr_frags;
753                                 struct page *page = TCP_PAGE(sk);
754                                 int off = TCP_OFF(sk);
755
756                                 if (skb_can_coalesce(skb, i, page, off) &&
757                                     off != PAGE_SIZE) {
758                                         /* We can extend the last page
759                                          * fragment. */
760                                         merge = 1;
761                                 } else if (i == MAX_SKB_FRAGS ||
762                                            (!i &&
763                                            !(sk->sk_route_caps & NETIF_F_SG))) {
764                                         /* Need to add new fragment and cannot
765                                          * do this because interface is non-SG,
766                                          * or because all the page slots are
767                                          * busy. */
768                                         tcp_mark_push(tp, skb);
769                                         goto new_segment;
770                                 } else if (page) {
771                                         if (off == PAGE_SIZE) {
772                                                 put_page(page);
773                                                 TCP_PAGE(sk) = page = NULL;
774                                                 off = 0;
775                                         }
776                                 } else
777                                         off = 0;
778
779                                 if (copy > PAGE_SIZE - off)
780                                         copy = PAGE_SIZE - off;
781
782                                 if (!sk_stream_wmem_schedule(sk, copy))
783                                         goto wait_for_memory;
784
785                                 if (!page) {
786                                         /* Allocate new cache page. */
787                                         if (!(page = sk_stream_alloc_page(sk)))
788                                                 goto wait_for_memory;
789                                 }
790
791                                 /* Time to copy data. We are close to
792                                  * the end! */
793                                 err = skb_copy_to_page(sk, from, skb, page,
794                                                        off, copy);
795                                 if (err) {
796                                         /* If this page was new, give it to the
797                                          * socket so it does not get leaked.
798                                          */
799                                         if (!TCP_PAGE(sk)) {
800                                                 TCP_PAGE(sk) = page;
801                                                 TCP_OFF(sk) = 0;
802                                         }
803                                         goto do_error;
804                                 }
805
806                                 /* Update the skb. */
807                                 if (merge) {
808                                         skb_shinfo(skb)->frags[i - 1].size +=
809                                                                         copy;
810                                 } else {
811                                         skb_fill_page_desc(skb, i, page, off, copy);
812                                         if (TCP_PAGE(sk)) {
813                                                 get_page(page);
814                                         } else if (off + copy < PAGE_SIZE) {
815                                                 get_page(page);
816                                                 TCP_PAGE(sk) = page;
817                                         }
818                                 }
819
820                                 TCP_OFF(sk) = off + copy;
821                         }
822
823                         if (!copied)
824                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
825
826                         tp->write_seq += copy;
827                         TCP_SKB_CB(skb)->end_seq += copy;
828                         skb_shinfo(skb)->tso_segs = 0;
829
830                         from += copy;
831                         copied += copy;
832                         if ((seglen -= copy) == 0 && iovlen == 0)
833                                 goto out;
834
835                         if (skb->len < mss_now || (flags & MSG_OOB))
836                                 continue;
837
838                         if (forced_push(tp)) {
839                                 tcp_mark_push(tp, skb);
840                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
841                         } else if (skb == sk->sk_send_head)
842                                 tcp_push_one(sk, mss_now);
843                         continue;
844
845 wait_for_sndbuf:
846                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
847 wait_for_memory:
848                         if (copied)
849                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
850
851                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
852                                 goto do_error;
853
854                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
855                         size_goal = tp->xmit_size_goal;
856                 }
857         }
858
859 out:
860         if (copied)
861                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
862         TCP_CHECK_TIMER(sk);
863         release_sock(sk);
864         return copied;
865
866 do_fault:
867         if (!skb->len) {
868                 if (sk->sk_send_head == skb)
869                         sk->sk_send_head = NULL;
870                 __skb_unlink(skb, &sk->sk_write_queue);
871                 sk_stream_free_skb(sk, skb);
872         }
873
874 do_error:
875         if (copied)
876                 goto out;
877 out_err:
878         err = sk_stream_error(sk, flags, err);
879         TCP_CHECK_TIMER(sk);
880         release_sock(sk);
881         return err;
882 }
883
884 /*
885  *      Handle reading urgent data. BSD has very simple semantics for
886  *      this, no blocking and very strange errors 8)
887  */
888
889 static int tcp_recv_urg(struct sock *sk, long timeo,
890                         struct msghdr *msg, int len, int flags,
891                         int *addr_len)
892 {
893         struct tcp_sock *tp = tcp_sk(sk);
894
895         /* No URG data to read. */
896         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
897             tp->urg_data == TCP_URG_READ)
898                 return -EINVAL; /* Yes this is right ! */
899
900         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
901                 return -ENOTCONN;
902
903         if (tp->urg_data & TCP_URG_VALID) {
904                 int err = 0;
905                 char c = tp->urg_data;
906
907                 if (!(flags & MSG_PEEK))
908                         tp->urg_data = TCP_URG_READ;
909
910                 /* Read urgent data. */
911                 msg->msg_flags |= MSG_OOB;
912
913                 if (len > 0) {
914                         if (!(flags & MSG_TRUNC))
915                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
916                         len = 1;
917                 } else
918                         msg->msg_flags |= MSG_TRUNC;
919
920                 return err ? -EFAULT : len;
921         }
922
923         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
924                 return 0;
925
926         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
927          * the available implementations agree in this case:
928          * this call should never block, independent of the
929          * blocking state of the socket.
930          * Mike <pall@rz.uni-karlsruhe.de>
931          */
932         return -EAGAIN;
933 }
934
935 /* Clean up the receive buffer for full frames taken by the user,
936  * then send an ACK if necessary.  COPIED is the number of bytes
937  * tcp_recvmsg has given to the user so far, it speeds up the
938  * calculation of whether or not we must ACK for the sake of
939  * a window update.
940  */
941 void cleanup_rbuf(struct sock *sk, int copied)
942 {
943         struct tcp_sock *tp = tcp_sk(sk);
944         int time_to_ack = 0;
945
946 #if TCP_DEBUG
947         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
948
949         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
950 #endif
951
952         if (inet_csk_ack_scheduled(sk)) {
953                 const struct inet_connection_sock *icsk = inet_csk(sk);
954                    /* Delayed ACKs frequently hit locked sockets during bulk
955                     * receive. */
956                 if (icsk->icsk_ack.blocked ||
957                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
958                     tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
959                     /*
960                      * If this read emptied the read buffer, we send an ACK when
961                      * the connection is not bidirectional, the user has drained
962                      * the receive buffer, and there was a small segment
963                      * in the queue.
964                      */
965                     (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
966                      !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
967                         time_to_ack = 1;
968         }
969
970         /* We send an ACK if we can now advertise a non-zero window
971          * which has been raised "significantly".
972          *
973  * Even if the window was raised up to infinity, do not send a window-open ACK
974  * in states where we will not receive any more data. It is useless.
975          */
976         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
977                 __u32 rcv_window_now = tcp_receive_window(tp);
978
979                 /* Optimize, __tcp_select_window() is not cheap. */
980                 if (2*rcv_window_now <= tp->window_clamp) {
981                         __u32 new_window = __tcp_select_window(sk);
982
983                         /* Send ACK now, if this read freed lots of space
984                          * in our buffer. We can advertise the new window now,
985                          * provided it is not less than the current one.
986                          * "Lots" means "at least twice" here.
987                          */
988                         if (new_window && new_window >= 2 * rcv_window_now)
989                                 time_to_ack = 1;
990                 }
991         }
992         if (time_to_ack)
993                 tcp_send_ack(sk);
994 }
995
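/* Drain the skbs that the softirq path parked on tp->ucopy.prequeue while the
 * socket was owned by the user, feeding each one to the normal receive path
 * via sk->sk_backlog_rcv(), then reset the prequeue memory counter.
 */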
996 static void tcp_prequeue_process(struct sock *sk)
997 {
998         struct sk_buff *skb;
999         struct tcp_sock *tp = tcp_sk(sk);
1000
1001         NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1002
1003         /* RX process wants to run with BHs disabled, though it is not
1004          * strictly necessary */
1005         local_bh_disable();
1006         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1007                 sk->sk_backlog_rcv(sk, skb);
1008         local_bh_enable();
1009
1010         /* Clear memory counter. */
1011         tp->ucopy.memory = 0;
1012 }
1013
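/* Find the skb in the receive queue that contains sequence number 'seq' and
 * return the byte offset of 'seq' within it; a SYN occupies a sequence number
 * but carries no data, so the offset is adjusted down by one in that case.
 */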
1014 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1015 {
1016         struct sk_buff *skb;
1017         u32 offset;
1018
1019         skb_queue_walk(&sk->sk_receive_queue, skb) {
1020                 offset = seq - TCP_SKB_CB(skb)->seq;
1021                 if (skb->h.th->syn)
1022                         offset--;
1023                 if (offset < skb->len || skb->h.th->fin) {
1024                         *off = offset;
1025                         return skb;
1026                 }
1027         }
1028         return NULL;
1029 }
1030
1031 /*
1032  * This routine provides an alternative to tcp_recvmsg() for routines
1033  * that would like to handle copying from skbuffs directly in 'sendfile'
1034  * fashion.
1035  * Note:
1036  *      - It is assumed that the socket was locked by the caller.
1037  *      - The routine does not block.
1038  *      - At present, there is no support for reading OOB data
1039  *        or for 'peeking' the socket using this routine
1040  *        (although both would be easy to implement).
1041  */
1042 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1043                   sk_read_actor_t recv_actor)
1044 {
1045         struct sk_buff *skb;
1046         struct tcp_sock *tp = tcp_sk(sk);
1047         u32 seq = tp->copied_seq;
1048         u32 offset;
1049         int copied = 0;
1050
1051         if (sk->sk_state == TCP_LISTEN)
1052                 return -ENOTCONN;
1053         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1054                 if (offset < skb->len) {
1055                         size_t used, len;
1056
1057                         len = skb->len - offset;
1058                         /* Stop reading if we hit a patch of urgent data */
1059                         if (tp->urg_data) {
1060                                 u32 urg_offset = tp->urg_seq - seq;
1061                                 if (urg_offset < len)
1062                                         len = urg_offset;
1063                                 if (!len)
1064                                         break;
1065                         }
1066                         used = recv_actor(desc, skb, offset, len);
1067                         if (used <= len) {
1068                                 seq += used;
1069                                 copied += used;
1070                                 offset += used;
1071                         }
1072                         if (offset != skb->len)
1073                                 break;
1074                 }
1075                 if (skb->h.th->fin) {
1076                         sk_eat_skb(sk, skb);
1077                         ++seq;
1078                         break;
1079                 }
1080                 sk_eat_skb(sk, skb);
1081                 if (!desc->count)
1082                         break;
1083         }
1084         tp->copied_seq = seq;
1085
1086         tcp_rcv_space_adjust(sk);
1087
1088         /* Clean up data we have read: This will do ACK frames. */
1089         if (copied)
1090                 cleanup_rbuf(sk, copied);
1091         return copied;
1092 }
1093
1094 /*
1095  *      This routine copies from a sock struct into the user buffer.
1096  *
1097  *      Technical note: in 2.3 we work on _locked_ socket, so that
1098  *      tricks with *seq access order and skb->users are not required.
1099  *      Probably, code can be easily improved even more.
1100  */
1101
1102 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1103                 size_t len, int nonblock, int flags, int *addr_len)
1104 {
1105         struct tcp_sock *tp = tcp_sk(sk);
1106         int copied = 0;
1107         u32 peek_seq;
1108         u32 *seq;
1109         unsigned long used;
1110         int err;
1111         int target;             /* Read at least this many bytes */
1112         long timeo;
1113         struct task_struct *user_recv = NULL;
1114
1115         lock_sock(sk);
1116
1117         TCP_CHECK_TIMER(sk);
1118
1119         err = -ENOTCONN;
1120         if (sk->sk_state == TCP_LISTEN)
1121                 goto out;
1122
1123         timeo = sock_rcvtimeo(sk, nonblock);
1124
1125         /* Urgent data needs to be handled specially. */
1126         if (flags & MSG_OOB)
1127                 goto recv_urg;
1128
1129         seq = &tp->copied_seq;
1130         if (flags & MSG_PEEK) {
1131                 peek_seq = tp->copied_seq;
1132                 seq = &peek_seq;
1133         }
1134
1135         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1136
1137         do {
1138                 struct sk_buff *skb;
1139                 u32 offset;
1140
1141                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1142                 if (tp->urg_data && tp->urg_seq == *seq) {
1143                         if (copied)
1144                                 break;
1145                         if (signal_pending(current)) {
1146                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1147                                 break;
1148                         }
1149                 }
1150
1151                 /* Next get a buffer. */
1152
1153                 skb = skb_peek(&sk->sk_receive_queue);
1154                 do {
1155                         if (!skb)
1156                                 break;
1157
1158                         /* Now that we have two receive queues this
1159                          * shouldn't happen.
1160                          */
1161                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1162                                 printk(KERN_INFO "recvmsg bug: copied %X "
1163                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1164                                 break;
1165                         }
1166                         offset = *seq - TCP_SKB_CB(skb)->seq;
1167                         if (skb->h.th->syn)
1168                                 offset--;
1169                         if (offset < skb->len)
1170                                 goto found_ok_skb;
1171                         if (skb->h.th->fin)
1172                                 goto found_fin_ok;
1173                         BUG_TRAP(flags & MSG_PEEK);
1174                         skb = skb->next;
1175                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1176
1177                 /* Well, if we have backlog, try to process it now. */
1178
1179                 if (copied >= target && !sk->sk_backlog.tail)
1180                         break;
1181
1182                 if (copied) {
1183                         if (sk->sk_err ||
1184                             sk->sk_state == TCP_CLOSE ||
1185                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1186                             !timeo ||
1187                             signal_pending(current) ||
1188                             (flags & MSG_PEEK))
1189                                 break;
1190                 } else {
1191                         if (sock_flag(sk, SOCK_DONE))
1192                                 break;
1193
1194                         if (sk->sk_err) {
1195                                 copied = sock_error(sk);
1196                                 break;
1197                         }
1198
1199                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1200                                 break;
1201
1202                         if (sk->sk_state == TCP_CLOSE) {
1203                                 if (!sock_flag(sk, SOCK_DONE)) {
1204                                         /* This occurs when user tries to read
1205                                          * from never connected socket.
1206                                          */
1207                                         copied = -ENOTCONN;
1208                                         break;
1209                                 }
1210                                 break;
1211                         }
1212
1213                         if (!timeo) {
1214                                 copied = -EAGAIN;
1215                                 break;
1216                         }
1217
1218                         if (signal_pending(current)) {
1219                                 copied = sock_intr_errno(timeo);
1220                                 break;
1221                         }
1222                 }
1223
1224                 cleanup_rbuf(sk, copied);
1225
1226                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1227                         /* Install new reader */
1228                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1229                                 user_recv = current;
1230                                 tp->ucopy.task = user_recv;
1231                                 tp->ucopy.iov = msg->msg_iov;
1232                         }
1233
1234                         tp->ucopy.len = len;
1235
1236                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1237                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1238
1239                         /* Ugly... If the prequeue is not empty, we have to
1240                          * process it before releasing the socket, otherwise
1241                          * ordering will be broken on the second iteration.
1242                          * A more elegant solution is required!!!
1243                          *
1244                          * Look: we have the following (pseudo)queues:
1245                          *
1246                          * 1. packets in flight
1247                          * 2. backlog
1248                          * 3. prequeue
1249                          * 4. receive_queue
1250                          *
1251                          * Each queue can be processed only if the next ones
1252                          * are empty. At this point we have an empty receive_queue.
1253                          * But the prequeue _can_ be non-empty after the 2nd iteration,
1254                          * when we jumped to the start of the loop because backlog
1255                          * processing added something to the receive_queue.
1256                          * We cannot release_sock(), because the backlog contains
1257                          * packets that arrived _after_ the prequeued ones.
1258                          *
1259                          * In short, the algorithm is clear: process all
1260                          * the queues in order. We could do it more directly,
1261                          * requeueing packets from the backlog to the prequeue
1262                          * when the latter is not empty. That is more elegant,
1263                          * but unfortunately eats cycles.
1264                          */
1265                         if (!skb_queue_empty(&tp->ucopy.prequeue))
1266                                 goto do_prequeue;
1267
1268                         /* __ Set realtime policy in scheduler __ */
1269                 }
1270
1271                 if (copied >= target) {
1272                         /* Do not sleep, just process backlog. */
1273                         release_sock(sk);
1274                         lock_sock(sk);
1275                 } else
1276                         sk_wait_data(sk, &timeo);
1277
1278                 if (user_recv) {
1279                         int chunk;
1280
1281                         /* __ Restore normal policy in scheduler __ */
1282
1283                         if ((chunk = len - tp->ucopy.len) != 0) {
1284                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1285                                 len -= chunk;
1286                                 copied += chunk;
1287                         }
1288
1289                         if (tp->rcv_nxt == tp->copied_seq &&
1290                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1291 do_prequeue:
1292                                 tcp_prequeue_process(sk);
1293
1294                                 if ((chunk = len - tp->ucopy.len) != 0) {
1295                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1296                                         len -= chunk;
1297                                         copied += chunk;
1298                                 }
1299                         }
1300                 }
1301                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1302                         if (net_ratelimit())
1303                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1304                                        current->comm, current->pid);
1305                         peek_seq = tp->copied_seq;
1306                 }
1307                 continue;
1308
1309         found_ok_skb:
1310                 /* Ok so how much can we use? */
1311                 used = skb->len - offset;
1312                 if (len < used)
1313                         used = len;
1314
1315                 /* Do we have urgent data here? */
1316                 if (tp->urg_data) {
1317                         u32 urg_offset = tp->urg_seq - *seq;
1318                         if (urg_offset < used) {
1319                                 if (!urg_offset) {
1320                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1321                                                 ++*seq;
1322                                                 offset++;
1323                                                 used--;
1324                                                 if (!used)
1325                                                         goto skip_copy;
1326                                         }
1327                                 } else
1328                                         used = urg_offset;
1329                         }
1330                 }
1331
1332                 if (!(flags & MSG_TRUNC)) {
1333                         err = skb_copy_datagram_iovec(skb, offset,
1334                                                       msg->msg_iov, used);
1335                         if (err) {
1336                                 /* Exception. Bailout! */
1337                                 if (!copied)
1338                                         copied = -EFAULT;
1339                                 break;
1340                         }
1341                 }
1342
1343                 *seq += used;
1344                 copied += used;
1345                 len -= used;
1346
1347                 tcp_rcv_space_adjust(sk);
1348
1349 skip_copy:
1350                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1351                         tp->urg_data = 0;
1352                         tcp_fast_path_check(sk, tp);
1353                 }
1354                 if (used + offset < skb->len)
1355                         continue;
1356
1357                 if (skb->h.th->fin)
1358                         goto found_fin_ok;
1359                 if (!(flags & MSG_PEEK))
1360                         sk_eat_skb(sk, skb);
1361                 continue;
1362
1363         found_fin_ok:
1364                 /* Process the FIN. */
1365                 ++*seq;
1366                 if (!(flags & MSG_PEEK))
1367                         sk_eat_skb(sk, skb);
1368                 break;
1369         } while (len > 0);
1370
1371         if (user_recv) {
1372                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1373                         int chunk;
1374
1375                         tp->ucopy.len = copied > 0 ? len : 0;
1376
1377                         tcp_prequeue_process(sk);
1378
1379                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1380                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1381                                 len -= chunk;
1382                                 copied += chunk;
1383                         }
1384                 }
1385
1386                 tp->ucopy.task = NULL;
1387                 tp->ucopy.len = 0;
1388         }
1389
1390         /* According to UNIX98, msg_name/msg_namelen are ignored
1391          * on a connected socket. I was just happy when I found this 8) --ANK
1392          */
1393
1394         /* Clean up the data we have read: this will send ACK frames. */
1395         cleanup_rbuf(sk, copied);
1396
1397         TCP_CHECK_TIMER(sk);
1398         release_sock(sk);
1399         return copied;
1400
1401 out:
1402         TCP_CHECK_TIMER(sk);
1403         release_sock(sk);
1404         return err;
1405
1406 recv_urg:
1407         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1408         goto out;
1409 }
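/*
 * A minimal user-space sketch (not kernel code; the helper name is just
 * illustrative) of the -EAGAIN path above: on a non-blocking socket the
 * receive timeout is zero, so when nothing is queued tcp_recvmsg() bails
 * out with -EAGAIN, which recv() reports as -1 with errno == EAGAIN.
 *
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	static ssize_t poll_once(int fd, void *buf, size_t len)
 *	{
 *		ssize_t n;
 *
 *		fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);
 *		n = recv(fd, buf, len, 0);
 *		if (n < 0 && errno == EAGAIN)
 *			fprintf(stderr, "no data queued yet\n");
 *		return n;
 *	}
 */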
1410
1411 /*
1412  *      State processing on a close. This implements the state shift for
1413  *      sending our FIN frame. Note that we only send a FIN for some
1414  *      states. A shutdown() may have already sent the FIN, or we may be
1415  *      closed.
1416  */
1417
1418 static const unsigned char new_state[16] = {
1419   /* current state:        new state:      action:      */
1420   /* (Invalid)          */ TCP_CLOSE,
1421   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1422   /* TCP_SYN_SENT       */ TCP_CLOSE,
1423   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1424   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1425   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1426   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1427   /* TCP_CLOSE          */ TCP_CLOSE,
1428   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1429   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1430   /* TCP_LISTEN         */ TCP_CLOSE,
1431   /* TCP_CLOSING        */ TCP_CLOSING,
1432 };
1433
1434 static int tcp_close_state(struct sock *sk)
1435 {
1436         int next = (int)new_state[sk->sk_state];
1437         int ns = next & TCP_STATE_MASK;
1438
1439         tcp_set_state(sk, ns);
1440
1441         return next & TCP_ACTION_FIN;
1442 }
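/*
 * Illustrative decomposition (a sketch, not additional kernel logic):
 * each new_state[] entry packs the next state in the low bits and an
 * optional "queue a FIN" flag in a high bit, so a single table lookup
 * answers both questions, e.g.:
 *
 *	int next = new_state[TCP_ESTABLISHED];	// TCP_FIN_WAIT1 | TCP_ACTION_FIN
 *	int ns   = next & TCP_STATE_MASK;	// new state: TCP_FIN_WAIT1
 *	int fin  = next & TCP_ACTION_FIN;	// non-zero: a FIN must be sent
 */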
1443
1444 /*
1445  *      Shutdown the sending side of a connection. Much like close except
1446  *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1447  */
1448
1449 void tcp_shutdown(struct sock *sk, int how)
1450 {
1451         /*      We need to grab some memory, and put together a FIN,
1452          *      and then put it into the queue to be sent.
1453          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1454          */
1455         if (!(how & SEND_SHUTDOWN))
1456                 return;
1457
1458         /* If we've already sent a FIN, or it's a closed state, skip this. */
1459         if ((1 << sk->sk_state) &
1460             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1461              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1462                 /* Clear out any half completed packets.  FIN if needed. */
1463                 if (tcp_close_state(sk))
1464                         tcp_send_fin(sk);
1465         }
1466 }
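/*
 * A hedged user-space sketch of reaching the function above: a
 * shutdown(fd, SHUT_WR) call arrives here with SEND_SHUTDOWN set,
 * queueing a FIN while reads remain possible until the peer also closes.
 *
 *	#include <sys/socket.h>
 *
 *	static void finish_sending(int fd)
 *	{
 *		shutdown(fd, SHUT_WR);	// send our FIN, keep receiving
 *	}
 */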
1467
1468 void tcp_close(struct sock *sk, long timeout)
1469 {
1470         struct sk_buff *skb;
1471         int data_was_unread = 0;
1472         int state;
1473
1474         lock_sock(sk);
1475         sk->sk_shutdown = SHUTDOWN_MASK;
1476
1477         if (sk->sk_state == TCP_LISTEN) {
1478                 tcp_set_state(sk, TCP_CLOSE);
1479
1480                 /* Special case. */
1481                 inet_csk_listen_stop(sk);
1482
1483                 goto adjudge_to_death;
1484         }
1485
1486         /*  We need to flush the recv. buffs.  We do this only on the
1487          *  descriptor close, not protocol-sourced closes, because the
1488          *  reader process may not have drained the data yet!
1489          */
1490         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1491                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1492                           skb->h.th->fin;
1493                 data_was_unread += len;
1494                 __kfree_skb(skb);
1495         }
1496
1497         sk_stream_mem_reclaim(sk);
1498
1499         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1500          * 3.10, we send a RST here because data was lost.  To
1501          * witness the awful effects of the old behavior of always
1502          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1503          * a bulk GET in an FTP client, suspend the process, wait
1504          * for the client to advertise a zero window, then kill -9
1505          * the FTP client, wheee...  Note: timeout is always zero
1506          * in such a case.
1507          */
1508         if (data_was_unread) {
1509                 /* Unread data was tossed, zap the connection. */
1510                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1511                 tcp_set_state(sk, TCP_CLOSE);
1512                 tcp_send_active_reset(sk, GFP_KERNEL);
1513         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1514                 /* Check zero linger _after_ checking for unread data. */
1515                 sk->sk_prot->disconnect(sk, 0);
1516                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1517         } else if (tcp_close_state(sk)) {
1518                 /* We FIN if the application ate all the data before
1519                  * zapping the connection.
1520                  */
1521
1522                 /* RED-PEN. Formally speaking, we have broken the TCP state
1523                  * machine. State transitions:
1524                  *
1525                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1526                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1527                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1528                  *
1529                  * are legal only when the FIN has been sent (i.e. is in the window),
1530                  * rather than queued out of window. Purists may blame us.
1531                  *
1532                  * E.g. the "RFC state" is ESTABLISHED
1533                  * if the Linux state is FIN-WAIT-1 but the FIN has not been sent yet.
1534                  *
1535                  * The visible deviations are that we sometimes
1536                  * enter the time-wait state when it is not really required
1537                  * (harmless), and do not send active resets when they are
1538                  * required by the specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when
1539                  * they look like CLOSING or LAST_ACK to Linux).
1540                  * Probably I missed some more loopholes.
1541                  *                                              --ANK
1542                  */
1543                 tcp_send_fin(sk);
1544         }
1545
1546         sk_stream_wait_close(sk, timeout);
1547
1548 adjudge_to_death:
1549         state = sk->sk_state;
1550         sock_hold(sk);
1551         sock_orphan(sk);
1552         atomic_inc(sk->sk_prot->orphan_count);
1553
1554         /* It is the last release_sock in its life. It will remove backlog. */
1555         release_sock(sk);
1556
1557
1558         /* Now the socket is owned by the kernel and we acquire the BH lock
1559          * to finish the close. No need to check for user refs.
1560          */
1561         local_bh_disable();
1562         bh_lock_sock(sk);
1563         BUG_TRAP(!sock_owned_by_user(sk));
1564
1565         /* Have we already been destroyed by a softirq or backlog? */
1566         if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
1567                 goto out;
1568
1569         /*      This is a (useful) BSD-style violation of the RFC. There is a
1570          *      problem with TCP as specified in that the other end could
1571          *      keep a socket open forever with no application left at this end.
1572          *      We use a 3 minute timeout (about the same as BSD) and then kill
1573          *      our end. If they send after that then tough - BUT: long enough
1574          *      that we won't make the old 4*rto = almost-no-time - whoops-reset
1575          *      mistake.
1576          *
1577          *      Nope, it was not a mistake. It is really the desired behaviour,
1578          *      e.g. on http servers, where such sockets are useless but
1579          *      consume significant resources. Let's do it with the special
1580          *      linger2 option.                                 --ANK
1581          */
1582
1583         if (sk->sk_state == TCP_FIN_WAIT2) {
1584                 struct tcp_sock *tp = tcp_sk(sk);
1585                 if (tp->linger2 < 0) {
1586                         tcp_set_state(sk, TCP_CLOSE);
1587                         tcp_send_active_reset(sk, GFP_ATOMIC);
1588                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1589                 } else {
1590                         const int tmo = tcp_fin_time(sk);
1591
1592                         if (tmo > TCP_TIMEWAIT_LEN) {
1593                                 inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
1594                         } else {
1595                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1596                                 goto out;
1597                         }
1598                 }
1599         }
1600         if (sk->sk_state != TCP_CLOSE) {
1601                 sk_stream_mem_reclaim(sk);
1602                 if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1603                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1604                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1605                         if (net_ratelimit())
1606                                 printk(KERN_INFO "TCP: too many orphaned "
1607                                        "sockets\n");
1608                         tcp_set_state(sk, TCP_CLOSE);
1609                         tcp_send_active_reset(sk, GFP_ATOMIC);
1610                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1611                 }
1612         }
1613
1614         if (sk->sk_state == TCP_CLOSE)
1615                 inet_csk_destroy_sock(sk);
1616         /* Otherwise, socket is reprieved until protocol close. */
1617
1618 out:
1619         bh_unlock_sock(sk);
1620         local_bh_enable();
1621         sock_put(sk);
1622 }
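/*
 * A user-space sketch (illustrative only) of the zero-linger branch
 * above: SO_LINGER with l_linger == 0 makes close() take the disconnect
 * path, so an established connection is aborted with a RST instead of a
 * graceful FIN exchange.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void abortive_close(int fd)
 *	{
 *		struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *
 *		setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *		close(fd);	// connection is reset, not gracefully closed
 *	}
 */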
1623
1624 /* These states need RST on ABORT according to RFC793 */
1625
1626 static inline int tcp_need_reset(int state)
1627 {
1628         return (1 << state) &
1629                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1630                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1631 }
1632
1633 int tcp_disconnect(struct sock *sk, int flags)
1634 {
1635         struct inet_sock *inet = inet_sk(sk);
1636         struct inet_connection_sock *icsk = inet_csk(sk);
1637         struct tcp_sock *tp = tcp_sk(sk);
1638         int err = 0;
1639         int old_state = sk->sk_state;
1640
1641         if (old_state != TCP_CLOSE)
1642                 tcp_set_state(sk, TCP_CLOSE);
1643
1644         /* ABORT function of RFC793 */
1645         if (old_state == TCP_LISTEN) {
1646                 inet_csk_listen_stop(sk);
1647         } else if (tcp_need_reset(old_state) ||
1648                    (tp->snd_nxt != tp->write_seq &&
1649                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1650                 /* The last check adjusts for the discrepancy between Linux
1651                  * and RFC 793 states.
1652                  */
1653                 tcp_send_active_reset(sk, gfp_any());
1654                 sk->sk_err = ECONNRESET;
1655         } else if (old_state == TCP_SYN_SENT)
1656                 sk->sk_err = ECONNRESET;
1657
1658         tcp_clear_xmit_timers(sk);
1659         __skb_queue_purge(&sk->sk_receive_queue);
1660         sk_stream_writequeue_purge(sk);
1661         __skb_queue_purge(&tp->out_of_order_queue);
1662
1663         inet->dport = 0;
1664
1665         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1666                 inet_reset_saddr(sk);
1667
1668         sk->sk_shutdown = 0;
1669         sock_reset_flag(sk, SOCK_DONE);
1670         tp->srtt = 0;
1671         if ((tp->write_seq += tp->max_window + 2) == 0)
1672                 tp->write_seq = 1;
1673         icsk->icsk_backoff = 0;
1674         tp->snd_cwnd = 2;
1675         icsk->icsk_probes_out = 0;
1676         tp->packets_out = 0;
1677         tp->snd_ssthresh = 0x7fffffff;
1678         tp->snd_cwnd_cnt = 0;
1679         tp->bytes_acked = 0;
1680         tcp_set_ca_state(sk, TCP_CA_Open);
1681         tcp_clear_retrans(tp);
1682         inet_csk_delack_init(sk);
1683         sk->sk_send_head = NULL;
1684         tp->rx_opt.saw_tstamp = 0;
1685         tcp_sack_reset(&tp->rx_opt);
1686         __sk_dst_reset(sk);
1687
1688         BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1689
1690         sk->sk_error_report(sk);
1691         return err;
1692 }
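/*
 * A hedged user-space sketch: one common way to reach tcp_disconnect()
 * is connect() with an AF_UNSPEC address, which the inet stream layer
 * turns into a disconnect of the existing association.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int drop_association(int fd)
 *	{
 *		struct sockaddr sa;
 *
 *		memset(&sa, 0, sizeof(sa));
 *		sa.sa_family = AF_UNSPEC;
 *		return connect(fd, &sa, sizeof(sa));
 *	}
 */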
1693
1694 /*
1695  *      Socket option code for TCP.
1696  */
1697 static int do_tcp_setsockopt(struct sock *sk, int level,
1698                 int optname, char __user *optval, int optlen)
1699 {
1700         struct tcp_sock *tp = tcp_sk(sk);
1701         struct inet_connection_sock *icsk = inet_csk(sk);
1702         int val;
1703         int err = 0;
1704
1705         /* This is a string value; all the others are ints. */
1706         if (optname == TCP_CONGESTION) {
1707                 char name[TCP_CA_NAME_MAX];
1708
1709                 if (optlen < 1)
1710                         return -EINVAL;
1711
1712                 val = strncpy_from_user(name, optval,
1713                                         min(TCP_CA_NAME_MAX-1, optlen));
1714                 if (val < 0)
1715                         return -EFAULT;
1716                 name[val] = 0;
1717
1718                 lock_sock(sk);
1719                 err = tcp_set_congestion_control(sk, name);
1720                 release_sock(sk);
1721                 return err;
1722         }
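        /*
         * A hedged user-space sketch of the string-valued option handled
         * above (needs <netinet/tcp.h> and <string.h>; the name must match
         * an algorithm the running kernel has registered - "reno" is
         * registered in tcp_init() below):
         *
         *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
         *		   "reno", strlen("reno"));
         */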
1723
1724         if (optlen < sizeof(int))
1725                 return -EINVAL;
1726
1727         if (get_user(val, (int __user *)optval))
1728                 return -EFAULT;
1729
1730         lock_sock(sk);
1731
1732         switch (optname) {
1733         case TCP_MAXSEG:
1734                 /* Values greater than the interface MTU won't take effect. However,
1735                  * at the point when this call is done we typically don't yet
1736                  * know which interface is going to be used. */
1737                 if (val < 8 || val > MAX_TCP_WINDOW) {
1738                         err = -EINVAL;
1739                         break;
1740                 }
1741                 tp->rx_opt.user_mss = val;
1742                 break;
1743
1744         case TCP_NODELAY:
1745                 if (val) {
1746                         /* TCP_NODELAY is weaker than TCP_CORK, so that
1747                          * this option on corked socket is remembered, but
1748                          * it is not activated until cork is cleared.
1749                          *
1750                          * However, when TCP_NODELAY is set we make
1751                          * an explicit push, which overrides even TCP_CORK
1752                          * for currently queued segments.
1753                          */
1754                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1755                         tcp_push_pending_frames(sk, tp);
1756                 } else {
1757                         tp->nonagle &= ~TCP_NAGLE_OFF;
1758                 }
1759                 break;
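                /*
                 * User-space sketch (illustrative only; needs <netinet/tcp.h>
                 * and <sys/socket.h>): disabling Nagle so small writes are
                 * pushed out immediately, which is the TCP_NAGLE_OFF |
                 * TCP_NAGLE_PUSH handling above.
                 *
                 *	int one = 1;
                 *
                 *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
                 */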
1760
1761         case TCP_CORK:
1762                 /* When set, this indicates that non-full frames should always
1763                  * be queued. Later the user clears this option and we transmit
1764                  * any pending partial frames in the queue.  This is
1765                  * meant to be used alongside sendfile() to get properly
1766                  * filled frames when the user (for example) must write
1767                  * out headers with a write() call first and then use
1768                  * sendfile to send out the data parts.
1769                  *
1770                  * TCP_CORK can be set together with TCP_NODELAY and it is
1771                  * stronger than TCP_NODELAY.
1772                  */
1773                 if (val) {
1774                         tp->nonagle |= TCP_NAGLE_CORK;
1775                 } else {
1776                         tp->nonagle &= ~TCP_NAGLE_CORK;
1777                         if (tp->nonagle&TCP_NAGLE_OFF)
1778                                 tp->nonagle |= TCP_NAGLE_PUSH;
1779                         tcp_push_pending_frames(sk, tp);
1780                 }
1781                 break;
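                /*
                 * User-space sketch (illustrative, error handling omitted;
                 * hdr, hdr_len, file_fd and file_len are assumed to exist,
                 * and <netinet/tcp.h>, <sys/sendfile.h> are needed) of the
                 * header + sendfile() pattern described above: cork, write
                 * the headers, stream the body, then uncork so any remaining
                 * partial frame is pushed.
                 *
                 *	int on = 1, off = 0;
                 *
                 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
                 *	write(fd, hdr, hdr_len);
                 *	sendfile(fd, file_fd, NULL, file_len);
                 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
                 */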
1782
1783         case TCP_KEEPIDLE:
1784                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
1785                         err = -EINVAL;
1786                 else {
1787                         tp->keepalive_time = val * HZ;
1788                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
1789                             !((1 << sk->sk_state) &
1790                               (TCPF_CLOSE | TCPF_LISTEN))) {
1791                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1792                                 if (tp->keepalive_time > elapsed)
1793                                         elapsed = tp->keepalive_time - elapsed;
1794                                 else
1795                                         elapsed = 0;
1796                                 inet_csk_reset_keepalive_timer(sk, elapsed);
1797                         }
1798                 }
1799                 break;
1800         case TCP_KEEPINTVL:
1801                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
1802                         err = -EINVAL;
1803                 else
1804                         tp->keepalive_intvl = val * HZ;
1805                 break;
1806         case TCP_KEEPCNT:
1807                 if (val < 1 || val > MAX_TCP_KEEPCNT)
1808                         err = -EINVAL;
1809                 else
1810                         tp->keepalive_probes = val;
1811                 break;
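                /*
                 * User-space sketch tying the three keepalive knobs above
                 * together (values are only examples; needs <netinet/tcp.h>
                 * and <sys/socket.h>): probe a connection idle for 60s,
                 * every 10s, and give up after 5 unanswered probes. Note
                 * that SO_KEEPALIVE must also be enabled for the timer to run.
                 *
                 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
                 *
                 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
                 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
                 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
                 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
                 */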
1812         case TCP_SYNCNT:
1813                 if (val < 1 || val > MAX_TCP_SYNCNT)
1814                         err = -EINVAL;
1815                 else
1816                         icsk->icsk_syn_retries = val;
1817                 break;
1818
1819         case TCP_LINGER2:
1820                 if (val < 0)
1821                         tp->linger2 = -1;
1822                 else if (val > sysctl_tcp_fin_timeout / HZ)
1823                         tp->linger2 = 0;
1824                 else
1825                         tp->linger2 = val * HZ;
1826                 break;
1827
1828         case TCP_DEFER_ACCEPT:
1829                 icsk->icsk_accept_queue.rskq_defer_accept = 0;
1830                 if (val > 0) {
1831                         /* Translate value in seconds to number of
1832                          * retransmits */
1833                         while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
1834                                val > ((TCP_TIMEOUT_INIT / HZ) <<
1835                                        icsk->icsk_accept_queue.rskq_defer_accept))
1836                                 icsk->icsk_accept_queue.rskq_defer_accept++;
1837                         icsk->icsk_accept_queue.rskq_defer_accept++;
1838                 }
1839                 break;
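                /*
                 * Worked example of the seconds-to-retransmits translation
                 * above, assuming TCP_TIMEOUT_INIT/HZ == 3: for val == 10 the
                 * loop advances while 10 > 3 and 10 > 6, stops at 10 <= 12,
                 * and the final increment leaves rskq_defer_accept == 3.
                 * do_tcp_getsockopt() maps that back to 3 << (3 - 1) == 12s.
                 */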
1840
1841         case TCP_WINDOW_CLAMP:
1842                 if (!val) {
1843                         if (sk->sk_state != TCP_CLOSE) {
1844                                 err = -EINVAL;
1845                                 break;
1846                         }
1847                         tp->window_clamp = 0;
1848                 } else
1849                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
1850                                                 SOCK_MIN_RCVBUF / 2 : val;
1851                 break;
1852
1853         case TCP_QUICKACK:
1854                 if (!val) {
1855                         icsk->icsk_ack.pingpong = 1;
1856                 } else {
1857                         icsk->icsk_ack.pingpong = 0;
1858                         if ((1 << sk->sk_state) &
1859                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1860                             inet_csk_ack_scheduled(sk)) {
1861                                 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1862                                 cleanup_rbuf(sk, 1);
1863                                 if (!(val & 1))
1864                                         icsk->icsk_ack.pingpong = 1;
1865                         }
1866                 }
1867                 break;
1868
1869         default:
1870                 err = -ENOPROTOOPT;
1871                 break;
1872         }
1873         release_sock(sk);
1874         return err;
1875 }
1876
1877 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1878                    int optlen)
1879 {
1880         struct inet_connection_sock *icsk = inet_csk(sk);
1881
1882         if (level != SOL_TCP)
1883                 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
1884                                                      optval, optlen);
1885         return do_tcp_setsockopt(sk, level, optname, optval, optlen);
1886 }
1887
1888 #ifdef CONFIG_COMPAT
1889 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
1890                           char __user *optval, int optlen)
1891 {
1892         if (level != SOL_TCP)
1893                 return inet_csk_compat_setsockopt(sk, level, optname,
1894                                                   optval, optlen);
1895         return do_tcp_setsockopt(sk, level, optname, optval, optlen);
1896 }
1897
1898 EXPORT_SYMBOL(compat_tcp_setsockopt);
1899 #endif
1900
1901 /* Return information about state of tcp endpoint in API format. */
1902 void tcp_get_info(struct sock *sk, struct tcp_info *info)
1903 {
1904         struct tcp_sock *tp = tcp_sk(sk);
1905         const struct inet_connection_sock *icsk = inet_csk(sk);
1906         u32 now = tcp_time_stamp;
1907
1908         memset(info, 0, sizeof(*info));
1909
1910         info->tcpi_state = sk->sk_state;
1911         info->tcpi_ca_state = icsk->icsk_ca_state;
1912         info->tcpi_retransmits = icsk->icsk_retransmits;
1913         info->tcpi_probes = icsk->icsk_probes_out;
1914         info->tcpi_backoff = icsk->icsk_backoff;
1915
1916         if (tp->rx_opt.tstamp_ok)
1917                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1918         if (tp->rx_opt.sack_ok)
1919                 info->tcpi_options |= TCPI_OPT_SACK;
1920         if (tp->rx_opt.wscale_ok) {
1921                 info->tcpi_options |= TCPI_OPT_WSCALE;
1922                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
1923                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
1924         } 
1925
1926         if (tp->ecn_flags&TCP_ECN_OK)
1927                 info->tcpi_options |= TCPI_OPT_ECN;
1928
1929         info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
1930         info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
1931         info->tcpi_snd_mss = tp->mss_cache;
1932         info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
1933
1934         info->tcpi_unacked = tp->packets_out;
1935         info->tcpi_sacked = tp->sacked_out;
1936         info->tcpi_lost = tp->lost_out;
1937         info->tcpi_retrans = tp->retrans_out;
1938         info->tcpi_fackets = tp->fackets_out;
1939
1940         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
1941         info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
1942         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
1943
1944         info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
1945         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
1946         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
1947         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
1948         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
1949         info->tcpi_snd_cwnd = tp->snd_cwnd;
1950         info->tcpi_advmss = tp->advmss;
1951         info->tcpi_reordering = tp->reordering;
1952
1953         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
1954         info->tcpi_rcv_space = tp->rcvq_space.space;
1955
1956         info->tcpi_total_retrans = tp->total_retrans;
1957 }
1958
1959 EXPORT_SYMBOL_GPL(tcp_get_info);
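/*
 * A hedged user-space sketch of consuming tcp_get_info() through the
 * TCP_INFO getsockopt below. The kernel copies out at most the caller's
 * buffer size, so differing struct tcp_info layouts degrade gracefully.
 *
 *	#include <netinet/tcp.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	static void print_tcp_info(int fd)
 *	{
 *		struct tcp_info ti;
 *		socklen_t len = sizeof(ti);
 *
 *		if (!getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len))
 *			printf("srtt %u us, cwnd %u\n",
 *			       ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 *	}
 */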
1960
1961 static int do_tcp_getsockopt(struct sock *sk, int level,
1962                 int optname, char __user *optval, int __user *optlen)
1963 {
1964         struct inet_connection_sock *icsk = inet_csk(sk);
1965         struct tcp_sock *tp = tcp_sk(sk);
1966         int val, len;
1967
1968         if (get_user(len, optlen))
1969                 return -EFAULT;
1970
1971         len = min_t(unsigned int, len, sizeof(int));
1972
1973         if (len < 0)
1974                 return -EINVAL;
1975
1976         switch (optname) {
1977         case TCP_MAXSEG:
1978                 val = tp->mss_cache;
1979                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
1980                         val = tp->rx_opt.user_mss;
1981                 break;
1982         case TCP_NODELAY:
1983                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
1984                 break;
1985         case TCP_CORK:
1986                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
1987                 break;
1988         case TCP_KEEPIDLE:
1989                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
1990                 break;
1991         case TCP_KEEPINTVL:
1992                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
1993                 break;
1994         case TCP_KEEPCNT:
1995                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
1996                 break;
1997         case TCP_SYNCNT:
1998                 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
1999                 break;
2000         case TCP_LINGER2:
2001                 val = tp->linger2;
2002                 if (val >= 0)
2003                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2004                 break;
2005         case TCP_DEFER_ACCEPT:
2006                 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
2007                         ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
2008                 break;
2009         case TCP_WINDOW_CLAMP:
2010                 val = tp->window_clamp;
2011                 break;
2012         case TCP_INFO: {
2013                 struct tcp_info info;
2014
2015                 if (get_user(len, optlen))
2016                         return -EFAULT;
2017
2018                 tcp_get_info(sk, &info);
2019
2020                 len = min_t(unsigned int, len, sizeof(info));
2021                 if (put_user(len, optlen))
2022                         return -EFAULT;
2023                 if (copy_to_user(optval, &info, len))
2024                         return -EFAULT;
2025                 return 0;
2026         }
2027         case TCP_QUICKACK:
2028                 val = !icsk->icsk_ack.pingpong;
2029                 break;
2030
2031         case TCP_CONGESTION:
2032                 if (get_user(len, optlen))
2033                         return -EFAULT;
2034                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2035                 if (put_user(len, optlen))
2036                         return -EFAULT;
2037                 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2038                         return -EFAULT;
2039                 return 0;
2040         default:
2041                 return -ENOPROTOOPT;
2042         }
2043
2044         if (put_user(len, optlen))
2045                 return -EFAULT;
2046         if (copy_to_user(optval, &val, len))
2047                 return -EFAULT;
2048         return 0;
2049 }
2050
2051 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2052                    int __user *optlen)
2053 {
2054         struct inet_connection_sock *icsk = inet_csk(sk);
2055
2056         if (level != SOL_TCP)
2057                 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2058                                                      optval, optlen);
2059         return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2060 }
2061
2062 #ifdef CONFIG_COMPAT
2063 int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2064                           char __user *optval, int __user *optlen)
2065 {
2066         if (level != SOL_TCP)
2067                 return inet_csk_compat_getsockopt(sk, level, optname,
2068                                                   optval, optlen);
2069         return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2070 }
2071
2072 EXPORT_SYMBOL(compat_tcp_getsockopt);
2073 #endif
2074
2075 extern void __skb_cb_too_small_for_tcp(int, int);
2076 extern struct tcp_congestion_ops tcp_reno;
2077
2078 static __initdata unsigned long thash_entries;
2079 static int __init set_thash_entries(char *str)
2080 {
2081         if (!str)
2082                 return 0;
2083         thash_entries = simple_strtoul(str, &str, 0);
2084         return 1;
2085 }
2086 __setup("thash_entries=", set_thash_entries);
2087
2088 void __init tcp_init(void)
2089 {
2090         struct sk_buff *skb = NULL;
2091         unsigned long limit;
2092         int order, i, max_share;
2093
2094         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2095                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2096                                            sizeof(skb->cb));
2097
2098         tcp_hashinfo.bind_bucket_cachep =
2099                 kmem_cache_create("tcp_bind_bucket",
2100                                   sizeof(struct inet_bind_bucket), 0,
2101                                   SLAB_HWCACHE_ALIGN, NULL, NULL);
2102         if (!tcp_hashinfo.bind_bucket_cachep)
2103                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2104
2105         /* Size and allocate the main established and bind bucket
2106          * hash tables.
2107          *
2108          * The methodology is similar to that of the buffer cache.
2109          */
2110         tcp_hashinfo.ehash =
2111                 alloc_large_system_hash("TCP established",
2112                                         sizeof(struct inet_ehash_bucket),
2113                                         thash_entries,
2114                                         (num_physpages >= 128 * 1024) ?
2115                                         13 : 15,
2116                                         HASH_HIGHMEM,
2117                                         &tcp_hashinfo.ehash_size,
2118                                         NULL,
2119                                         0);
2120         tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2121         for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2122                 rwlock_init(&tcp_hashinfo.ehash[i].lock);
2123                 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2124         }
2125
2126         tcp_hashinfo.bhash =
2127                 alloc_large_system_hash("TCP bind",
2128                                         sizeof(struct inet_bind_hashbucket),
2129                                         tcp_hashinfo.ehash_size,
2130                                         (num_physpages >= 128 * 1024) ?
2131                                         13 : 15,
2132                                         HASH_HIGHMEM,
2133                                         &tcp_hashinfo.bhash_size,
2134                                         NULL,
2135                                         64 * 1024);
2136         tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2137         for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2138                 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2139                 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2140         }
2141
2142         /* Try to be a bit smarter and adjust defaults depending
2143          * on available memory.
2144          */
2145         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2146                         (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2147                         order++)
2148                 ;
2149         if (order >= 4) {
2150                 sysctl_local_port_range[0] = 32768;
2151                 sysctl_local_port_range[1] = 61000;
2152                 tcp_death_row.sysctl_max_tw_buckets = 180000;
2153                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2154                 sysctl_max_syn_backlog = 1024;
2155         } else if (order < 3) {
2156                 sysctl_local_port_range[0] = 1024 * (3 - order);
2157                 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2158                 sysctl_tcp_max_orphans >>= (3 - order);
2159                 sysctl_max_syn_backlog = 128;
2160         }
2161
2162         sysctl_tcp_mem[0] =  768 << order;
2163         sysctl_tcp_mem[1] = 1024 << order;
2164         sysctl_tcp_mem[2] = 1536 << order;
2165
2166         limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
2167         max_share = min(4UL*1024*1024, limit);
2168
2169         sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
2170         sysctl_tcp_wmem[1] = 16*1024;
2171         sysctl_tcp_wmem[2] = max(64*1024, max_share);
2172
2173         sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
2174         sysctl_tcp_rmem[1] = 87380;
2175         sysctl_tcp_rmem[2] = max(87380, max_share);
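        /*
         * Worked example of the sizing above (assumptions: order == 4 and
         * 4 KiB pages, so PAGE_SHIFT == 12): sysctl_tcp_mem[1] = 1024 << 4
         * = 16384 pages, limit = 16384 << (12 - 7) = 524288 bytes, so
         * max_share = min(4 MiB, 512 KiB) = 512 KiB, which becomes both
         * sysctl_tcp_wmem[2] and sysctl_tcp_rmem[2].
         */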
2176
2177         printk(KERN_INFO "TCP: Hash tables configured "
2178                "(established %d bind %d)\n",
2179                tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2180
2181         tcp_register_congestion_control(&tcp_reno);
2182 }
2183
2184 EXPORT_SYMBOL(tcp_close);
2185 EXPORT_SYMBOL(tcp_disconnect);
2186 EXPORT_SYMBOL(tcp_getsockopt);
2187 EXPORT_SYMBOL(tcp_ioctl);
2188 EXPORT_SYMBOL(tcp_poll);
2189 EXPORT_SYMBOL(tcp_read_sock);
2190 EXPORT_SYMBOL(tcp_recvmsg);
2191 EXPORT_SYMBOL(tcp_sendmsg);
2192 EXPORT_SYMBOL(tcp_sendpage);
2193 EXPORT_SYMBOL(tcp_setsockopt);
2194 EXPORT_SYMBOL(tcp_shutdown);
2195 EXPORT_SYMBOL(tcp_statistics);
2196 EXPORT_SYMBOL_GPL(cleanup_rbuf);