net/ipv4/tcp.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
   29  *                                      pointers passed were wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It
  33  *                                      wakes people on errors. poll
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_send_reset() fixed to work for
  37  *                                      everything not just packets for
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames.
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst
  46  *                                      receive otherwise odd bits of prattle
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug.
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries.
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks,
  69  *                                      so the kernel can layer network
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics.
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle poll() after URG properly in
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in
 110  *                                      tcp_readable(), poll() after URG
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in polling before an
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on
 137  *                                      the RFC's for other useful protocol
 138  *                                      references see: Comer, KA9Q NOS, and
 139  *                                      for a reference on the difference
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC
 147  *                                      and using multiple timers for sanity.
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       poll()->select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if state is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but it's a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
 183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in
 190  *                                      tcp_do_retransmit()
 191  *              Eric Schenk     :       Changed receiver side silly window
 192  *                                      avoidance algorithm to BSD style
 193  *                                      algorithm. This doubles throughput
 194  *                                      against machines running Solaris,
 195  *                                      and seems to result in general
 196  *                                      improvement.
 197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198  *      Willy Konynenberg       :       Transparent proxying support.
 199  *      Mike McLagan            :       Routing by source
 200  *              Keith Owens     :       Do proper merging with partial SKB's in
 201  *                                      tcp_do_sendmsg to avoid burstiness.
 202  *              Eric Schenk     :       Fix fast close down bug with
 203  *                                      shutdown() followed by close().
 204  *              Andi Kleen      :       Make poll agree with SIGIO
 205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
 206  *                                      lingertime == 0 (RFC 793 ABORT Call)
 207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
 208  *                                      csum_and_copy_from_user() if possible.
 209  *
 210  *              This program is free software; you can redistribute it and/or
 211  *              modify it under the terms of the GNU General Public License
 212  *              as published by the Free Software Foundation; either version
 213  *              2 of the License, or(at your option) any later version.
 214  *
 215  * Description of States:
 216  *
 217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 218  *
 219  *      TCP_SYN_RECV            received a connection request, sent ack,
 220  *                              waiting for final ack in three-way handshake.
 221  *
 222  *      TCP_ESTABLISHED         connection established
 223  *
 224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 225  *                              transmission of remaining buffered data
 226  *
 227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 228  *                              to shutdown
 229  *
 230  *      TCP_CLOSING             both sides have shutdown but we still have
 231  *                              data we have to finish sending
 232  *
 233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 234  *                              closed, can only be entered from FIN_WAIT2
 235  *                              or CLOSING.  Required because the other end
 236  *                              may not have gotten our last ACK causing it
 237  *                              to retransmit the data packet (which we ignore)
 238  *
 239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 240  *                              us to finish writing our data and to shutdown
 241  *                              (we have to close() to move on to LAST_ACK)
 242  *
  243  *      TCP_LAST_ACK            our side has shutdown after remote has
 244  *                              shutdown.  There may still be data in our
 245  *                              buffer that we have to finish sending
 246  *
 247  *      TCP_CLOSE               socket is finished
 248  */
 249
 250 #include <linux/config.h>
 251 #include <linux/module.h>
 252 #include <linux/types.h>
 253 #include <linux/fcntl.h>
 254 #include <linux/poll.h>
 255 #include <linux/init.h>
 256 #include <linux/smp_lock.h>
 257 #include <linux/fs.h>
 258 #include <linux/random.h>
 259 #include <linux/bootmem.h>
 260 #include <linux/cache.h>
 261 #include <linux/err.h>
 262 #include <linux/in.h>
 263
 264 #include <net/icmp.h>
 265 #include <net/tcp.h>
 266 #include <net/xfrm.h>
 267 #include <net/ip.h>
 268
 269
 270 #include <asm/uaccess.h>
 271 #include <asm/ioctls.h>
 272
  273 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; /* starts out at the compile-time TCP_FIN_TIMEOUT value */
  274
      /* TCP SNMP counters (struct tcp_mib), defined through DEFINE_SNMP_STAT. */
  275 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
  276
      /* Atomic counter starting at 0; exported to GPL modules only. */
  277 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
  278
  279 EXPORT_SYMBOL_GPL(tcp_orphan_count);
  280
      /*
       * Three-element tunables for TCP memory, write buffers and read buffers.
       * NOTE(review): no initialiser is visible here -- presumably the values
       * are filled in by init code elsewhere in the file; confirm.
       */
  281 int sysctl_tcp_mem[3] __read_mostly;
  282 int sysctl_tcp_wmem[3] __read_mostly;
  283 int sysctl_tcp_rmem[3] __read_mostly;
  284
  285 EXPORT_SYMBOL(sysctl_tcp_mem);
  286 EXPORT_SYMBOL(sysctl_tcp_rmem);
  287 EXPORT_SYMBOL(sysctl_tcp_wmem);
  288
  289 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
  290 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
  291
  292 EXPORT_SYMBOL(tcp_memory_allocated);
  293 EXPORT_SYMBOL(tcp_sockets_allocated);
  294
  295 /*
  296  * Pressure flag: try to collapse.
  297  * Technical note: it is used by multiple contexts non atomically.
  298  * All the sk_stream_mem_schedule() is of this nature: accounting
  299  * is strict, actions are advisory and have some latency.
  300  */
  301 int tcp_memory_pressure;
  302
  303 EXPORT_SYMBOL(tcp_memory_pressure);
 304
 305 void tcp_enter_memory_pressure(void)
 306 {
 307         if (!tcp_memory_pressure) {
 308                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
 309                 tcp_memory_pressure = 1;
 310         }
 311 }
 312
 313 EXPORT_SYMBOL(tcp_enter_memory_pressure);
 314
  315 /*
  316  *      Wait for a TCP event.
  317  *
  318  *      Note that we don't need to lock the socket, as the upper poll layers
  319  *      take care of normal races (between the test and the event) and we don't
  320  *      go look at any of the socket buffers directly.
       *
       *      Registers the caller on sk->sk_sleep through poll_wait() and
       *      returns the POLL* event mask that applies to the socket right
       *      now.  A socket in TCP_LISTEN is handed to inet_csk_listen_poll()
       *      and its result is returned unchanged.
  321  */
  322 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
  323 {
  324         unsigned int mask;
  325         struct sock *sk = sock->sk;
  326         struct tcp_sock *tp = tcp_sk(sk);
  327
  328         poll_wait(file, sk->sk_sleep, wait);
  329         if (sk->sk_state == TCP_LISTEN)
  330                 return inet_csk_listen_poll(sk);
  331
  332         /* Socket is not locked. We are protected from async events
  333            by poll logic and correct handling of state changes
  334            made by other threads is impossible in any case.
  335          */
  336
  337         mask = 0;
  338         if (sk->sk_err)
  339                 mask = POLLERR;
  340
  341         /*
  342          * POLLHUP is certainly not done right. But poll() doesn't
  343          * have a notion of HUP in just one direction, and for a
  344          * socket the read side is more interesting.
  345          *
  346          * Some poll() documentation says that POLLHUP is incompatible
  347          * with the POLLOUT/POLLWR flags, so somebody should check this
  348          * all. But careful, it tends to be safer to return too many
  349          * bits than too few, and you can easily break real applications
  350          * if you don't tell them that something has hung up!
  351          *
  352          * Check-me.
  353          *
  354          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
  355          * our fs/select.c). It means that after we received EOF,
  356          * poll always returns immediately, making impossible poll() on write()
  357          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
  358          * if and only if shutdown has been made in both directions.
  359          * Actually, it is interesting to look how Solaris and DUX
  360          * solve this dilemma. I would prefer, if POLLHUP were maskable,
  361          * then we could set it on SND_SHUTDOWN. BTW examples given
  362          * in Stevens' books assume exactly this behaviour, it explains
  363          * why POLLHUP is incompatible with POLLOUT.    --ANK
  364          *
  365          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
  366          * blocking on fresh not-connected or disconnected socket. --ANK
  367          */
  368         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
  369                 mask |= POLLHUP;
              /* Receive direction shut down: report readable and flag POLLRDHUP. */
  370         if (sk->sk_shutdown & RCV_SHUTDOWN)
  371                 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
  372
  373         /* Connected? */
              /* i.e. any state other than SYN_SENT / SYN_RECV. */
  374         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
  375                 /* Potential race condition. If read of tp below will
  376                  * escape above sk->sk_state, we can be illegally awakened
  377                  * in SYN_* states. */
                      /*
                       * Readable when there is unread data (rcv_nxt differs
                       * from copied_seq), unless the only unread byte is the
                       * pending urgent byte and it is not delivered inline.
                       */
  378                 if ((tp->rcv_nxt != tp->copied_seq) &&
  379                     (tp->urg_seq != tp->copied_seq ||
  380                      tp->rcv_nxt != tp->copied_seq + 1 ||
  381                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
  382                         mask |= POLLIN | POLLRDNORM;
  383
                      /* Writability is only reported while the send side is open. */
  384                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
  385                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
  386                                 mask |= POLLOUT | POLLWRNORM;
  387                         } else {  /* send SIGIO later */
  388                                 set_bit(SOCK_ASYNC_NOSPACE,
  389                                         &sk->sk_socket->flags);
  390                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
  391
  392                                 /* Race breaker. If space is freed after
  393                                  * wspace test but before the flags are set,
  394                                  * IO signal will be lost.
  395                                  */
  396                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
  397                                         mask |= POLLOUT | POLLWRNORM;
  398                         }
  399                 }
  400
                      /* Valid urgent data pending: report it as priority data. */
  401                 if (tp->urg_data & TCP_URG_VALID)
  402                         mask |= POLLPRI;
  403         }
  404         return mask;
  405 }
 406
 407 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 408 {
 409         struct tcp_sock *tp = tcp_sk(sk);
 410         int answ;
 411
 412         switch (cmd) {
 413         case SIOCINQ:
 414                 if (sk->sk_state == TCP_LISTEN)
 415                         return -EINVAL;
 416
 417                 lock_sock(sk);
 418                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 419                         answ = 0;
 420                 else if (sock_flag(sk, SOCK_URGINLINE) ||
 421                          !tp->urg_data ||
 422                          before(tp->urg_seq, tp->copied_seq) ||
 423                          !before(tp->urg_seq, tp->rcv_nxt)) {
 424                         answ = tp->rcv_nxt - tp->copied_seq;
 425
 426                         /* Subtract 1, if FIN is in queue. */
 427                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
 428                                 answ -=
 429                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
 430                 } else
 431                         answ = tp->urg_seq - tp->copied_seq;
 432                 release_sock(sk);
 433                 break;
 434         case SIOCATMARK:
 435                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 436                 break;
 437         case SIOCOUTQ:
 438                 if (sk->sk_state == TCP_LISTEN)
 439                         return -EINVAL;
 440
 441                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 442                         answ = 0;
 443                 else
 444                         answ = tp->write_seq - tp->snd_una;
 445                 break;
 446         default:
 447                 return -ENOIOCTLCMD;
 448         };
 449
 450         return put_user(answ, (int __user *)arg);
 451 }
 452
 453 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 454 {
 455         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 456         tp->pushed_seq = tp->write_seq;
 457 }
 458
 459 static inline int forced_push(struct tcp_sock *tp)
 460 {
 461         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
 462 }
 463
 464 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
 465                               struct sk_buff *skb)
 466 {
 467         skb->csum = 0;
 468         TCP_SKB_CB(skb)->seq = tp->write_seq;
 469         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
 470         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
 471         TCP_SKB_CB(skb)->sacked = 0;
 472         skb_header_release(skb);
 473         __skb_queue_tail(&sk->sk_write_queue, skb);
 474         sk_charge_skb(sk, skb);
 475         if (!sk->sk_send_head)
 476                 sk->sk_send_head = skb;
 477         if (tp->nonagle & TCP_NAGLE_PUSH)
 478                 tp->nonagle &= ~TCP_NAGLE_PUSH;
 479 }
 480
 481 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
 482                                 struct sk_buff *skb)
 483 {
 484         if (flags & MSG_OOB) {
 485                 tp->urg_mode = 1;
 486                 tp->snd_up = tp->write_seq;
 487                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
 488         }
 489 }
 490
 491 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
 492                             int mss_now, int nonagle)
 493 {
 494         if (sk->sk_send_head) {
 495                 struct sk_buff *skb = sk->sk_write_queue.prev;
 496                 if (!(flags & MSG_MORE) || forced_push(tp))
 497                         tcp_mark_push(tp, skb);
 498                 tcp_mark_urg(tp, flags, skb);
 499                 __tcp_push_pending_frames(sk, tp, mss_now,
 500                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
 501         }
 502 }
  503
      /*
       * Queue up to @psize bytes from @pages, starting at byte offset
       * @poffset, on the socket's write queue.  The data is not copied:
       * each chunk is attached to an skb as a page fragment (a page
       * reference is taken with get_page()) or merged into the skb's last
       * fragment when skb_can_coalesce() allows it.
       *
       * tcp_sendpage() calls this with the socket locked.
       *
       * Returns the number of bytes queued.  If an error occurs before any
       * byte was queued, the result of sk_stream_error() is returned
       * instead; if some bytes were already queued, that count is returned
       * and the error is not reported by this call.
       */
  504 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
  505                          size_t psize, int flags)
  506 {
  507         struct tcp_sock *tp = tcp_sk(sk);
  508         int mss_now, size_goal;
  509         int err;
  510         ssize_t copied;
  511         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
  512
  513         /* Wait for a connection to finish. */
  514         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
  515                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
  516                         goto out_err;
  517
  518         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
  519
  520         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
  521         size_goal = tp->xmit_size_goal;
  522         copied = 0;
  523
              /* A pending socket error or a closed send side fails with -EPIPE. */
  524         err = -EPIPE;
  525         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
  526                 goto do_error;
  527
  528         while (psize > 0) {
                      /*
                       * skb is the queue tail; it is only dereferenced below
                       * when sk_send_head is non-NULL (short-circuit in the
                       * test that follows).
                       */
  529                 struct sk_buff *skb = sk->sk_write_queue.prev;
  530                 struct page *page = pages[poffset / PAGE_SIZE];
  531                 int copy, i, can_coalesce;
  532                 int offset = poffset % PAGE_SIZE;
                      /* Bytes available from this page: limited by psize and the page end. */
  533                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
  534
                      /*
                       * copy = room left in the tail skb before it reaches
                       * size_goal.  Start a new skb when nothing is waiting
                       * to be sent or the tail skb is already full.
                       */
  535                 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
  536 new_segment:
  537                         if (!sk_stream_memory_free(sk))
  538                                 goto wait_for_sndbuf;
  539
  540                         skb = sk_stream_alloc_pskb(sk, 0, 0,
  541                                                    sk->sk_allocation);
  542                         if (!skb)
  543                                 goto wait_for_memory;
  544
  545                         skb_entail(sk, tp, skb);
  546                         copy = size_goal;
  547                 }
  548
  549                 if (copy > size)
  550                         copy = size;
  551
  552                 i = skb_shinfo(skb)->nr_frags;
  553                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
                      /* Fragment table full and no merge possible: close this skb, start another. */
  554                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
  555                         tcp_mark_push(tp, skb);
  556                         goto new_segment;
  557                 }
  558                 if (!sk_stream_wmem_schedule(sk, copy))
  559                         goto wait_for_memory;
  560
  561                 if (can_coalesce) {
                              /* Extend the last fragment; no extra page reference is taken. */
  562                         skb_shinfo(skb)->frags[i - 1].size += copy;
  563                 } else {
  564                         get_page(page);
  565                         skb_fill_page_desc(skb, i, page, offset, copy);
  566                 }
  567
                      /* Account the new bytes on the skb, the socket and the sequence space. */
  568                 skb->len += copy;
  569                 skb->data_len += copy;
  570                 skb->truesize += copy;
  571                 sk->sk_wmem_queued += copy;
  572                 sk->sk_forward_alloc -= copy;
                      /*
                       * NOTE(review): CHECKSUM_HW presumably leaves the checksum
                       * to the device -- tcp_sendpage() only takes this path when
                       * the route advertises a checksum capability; confirm.
                       */
  573                 skb->ip_summed = CHECKSUM_HW;
  574                 tp->write_seq += copy;
  575                 TCP_SKB_CB(skb)->end_seq += copy;
                      /* NOTE(review): presumably forces gso_segs to be recomputed later -- confirm. */
  576                 skb_shinfo(skb)->gso_segs = 0;
  577
                      /* First chunk of this call: clear PSH on the skb it landed in. */
  578                 if (!copied)
  579                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
  580
  581                 copied += copy;
  582                 poffset += copy;
  583                 if (!(psize -= copy))
  584                         goto out;
  585
                      /* Keep filling while the skb is below one MSS, and always for MSG_OOB. */
  586                 if (skb->len < mss_now || (flags & MSG_OOB))
  587                         continue;
  588
  589                 if (forced_push(tp)) {
  590                         tcp_mark_push(tp, skb);
  591                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
  592                 } else if (skb == sk->sk_send_head)
  593                         tcp_push_one(sk, mss_now);
  594                 continue;
  595
                      /* The two labels below are reached only through goto. */
  596 wait_for_sndbuf:
  597                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
  598 wait_for_memory:
                      /* Push out what is already queued (ignoring MSG_MORE) before sleeping. */
  599                 if (copied)
  600                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
  601
  602                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
  603                         goto do_error;
  604
                      /* Re-read the MSS and size goal after the wait. */
  605                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
  606                 size_goal = tp->xmit_size_goal;
  607         }
  608
  609 out:
  610         if (copied)
  611                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
  612         return copied;
  613
  614 do_error:
              /* Partial progress is reported as success; err is dropped here. */
  615         if (copied)
  616                 goto out;
  617 out_err:
  618         return sk_stream_error(sk, flags, err);
  619 }
 620
 621 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 622                      size_t size, int flags)
 623 {
 624         ssize_t res;
 625         struct sock *sk = sock->sk;
 626
 627         if (!(sk->sk_route_caps & NETIF_F_SG) ||
 628             !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
 629                 return sock_no_sendpage(sock, page, offset, size, flags);
 630
 631         lock_sock(sk);
 632         TCP_CHECK_TIMER(sk);
 633         res = do_tcp_sendpages(sk, &page, offset, size, flags);
 634         TCP_CHECK_TIMER(sk);
 635         release_sock(sk);
 636         return res;
 637 }
 638
 639 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
 640 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
 641
 642 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
 643 {
 644         int tmp = tp->mss_cache;
 645
 646         if (sk->sk_route_caps & NETIF_F_SG) {
 647                 if (sk->sk_route_caps & NETIF_F_TSO)
 648                         tmp = 0;
 649                 else {
 650                         int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
 651
 652                         if (tmp >= pgbreak &&
 653                             tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
 654                                 tmp = pgbreak;
 655                 }
 656         }
 657
 658         return tmp;
 659 }
 660
 661 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 662                 size_t size)
 663 {
             /*
              * Copy the user data described by msg->msg_iov into the socket's
              * write queue and push segments out as they fill.
              *
              * The whole function runs between lock_sock() and release_sock().
              * Unless the socket is ESTABLISHED or CLOSE_WAIT it first waits in
              * sk_stream_wait_connect().  Data is appended to the skb at the
              * tail of sk_write_queue, or to a newly allocated segment, either
              * in the skb's linear tailroom or in a page fragment backed by the
              * per-socket page TCP_PAGE(sk)/TCP_OFF(sk).
              *
              * Returns the number of bytes queued when at least one byte was
              * copied; otherwise the value of sk_stream_error(sk, flags, err).
              * The iocb and size arguments are not referenced in this body.
              */
 664         struct iovec *iov;
 665         struct tcp_sock *tp = tcp_sk(sk);
 666         struct sk_buff *skb;
 667         int iovlen, flags;
 668         int mss_now, size_goal;
 669         int err, copied;
 670         long timeo;
 671
 672         lock_sock(sk);
 673         TCP_CHECK_TIMER(sk);
 674
 675         flags = msg->msg_flags;
 676         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 677
 678         /* Wait for a connection to finish. */
 679         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 680                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 681                         goto out_err;
 682
 683         /* This should be in poll */
 684         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 685
 686         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 687         size_goal = tp->xmit_size_goal;
 688
 689         /* Ok commence sending. */
 690         iovlen = msg->msg_iovlen;
 691         iov = msg->msg_iov;
 692         copied = 0;
 693
 694         err = -EPIPE;
 695         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 696                 goto do_error;
 697
             /* Outer loop: one iteration per iovec entry. */
 698         while (--iovlen >= 0) {
 699                 int seglen = iov->iov_len;
 700                 unsigned char __user *from = iov->iov_base;
 701
 702                 iov++;
 703
                     /* Inner loop: one iteration per chunk copied from this entry. */
 704                 while (seglen > 0) {
 705                         int copy;
 706
                             /* Candidate skb is the one at the tail of the write
                              * queue.  A new segment is started when nothing is
                              * pending (sk_send_head is NULL) or when that skb
                              * already holds size_goal bytes.
                              */
 707                         skb = sk->sk_write_queue.prev;
 708
 709                         if (!sk->sk_send_head ||
 710                             (copy = size_goal - skb->len) <= 0) {
 711
 712 new_segment:
 713                                 /* Allocate new segment. If the interface is SG,
 714                                  * allocate skb fitting to single page.
 715                                  */
 716                                 if (!sk_stream_memory_free(sk))
 717                                         goto wait_for_sndbuf;
 718
 719                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
 720                                                            0, sk->sk_allocation);
 721                                 if (!skb)
 722                                         goto wait_for_memory;
 723
 724                                 /*
 725                                  * Check whether we can use HW checksum.
 726                                  */
 727                                 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
 728                                         skb->ip_summed = CHECKSUM_HW;
 729
 730                                 skb_entail(sk, tp, skb);
 731                                 copy = size_goal;
 732                         }
 733
 734                         /* Try to append data to the end of skb. */
 735                         if (copy > seglen)
 736                                 copy = seglen;
 737
 738                         /* Where to copy to? */
 739                         if (skb_tailroom(skb) > 0) {
 740                                 /* We have some space in skb head. Superb! */
 741                                 if (copy > skb_tailroom(skb))
 742                                         copy = skb_tailroom(skb);
 743                                 if ((err = skb_add_data(skb, from, copy)) != 0)
 744                                         goto do_fault;
 745                         } else {
 746                                 int merge = 0;
 747                                 int i = skb_shinfo(skb)->nr_frags;
 748                                 struct page *page = TCP_PAGE(sk);
 749                                 int off = TCP_OFF(sk);
 750
 751                                 if (skb_can_coalesce(skb, i, page, off) &&
 752                                     off != PAGE_SIZE) {
 753                                         /* We can extend the last page
 754                                          * fragment. */
 755                                         merge = 1;
 756                                 } else if (i == MAX_SKB_FRAGS ||
 757                                            (!i &&
 758                                            !(sk->sk_route_caps & NETIF_F_SG))) {
 759                                         /* Need to add new fragment and cannot
 760                                          * do this because interface is non-SG,
 761                                          * or because all the page slots are
 762                                          * busy. */
 763                                         tcp_mark_push(tp, skb);
 764                                         goto new_segment;
 765                                 } else if (page) {
                                             /* The cached page is full: drop the
                                              * socket's reference and forget it so
                                              * that a new page is allocated below.
                                              */
 766                                         if (off == PAGE_SIZE) {
 767                                                 put_page(page);
 768                                                 TCP_PAGE(sk) = page = NULL;
 769                                                 off = 0;
 770                                         }
 771                                 } else
 772                                         off = 0;
 773
                                     /* A single copy never crosses the end of the page. */
 774                                 if (copy > PAGE_SIZE - off)
 775                                         copy = PAGE_SIZE - off;
 776
 777                                 if (!sk_stream_wmem_schedule(sk, copy))
 778                                         goto wait_for_memory;
 779
 780                                 if (!page) {
 781                                         /* Allocate new cache page. */
 782                                         if (!(page = sk_stream_alloc_page(sk)))
 783                                                 goto wait_for_memory;
 784                                 }
 785
 786                                 /* Time to copy data. We are close to
 787                                  * the end! */
 788                                 err = skb_copy_to_page(sk, from, skb, page,
 789                                                        off, copy);
 790                                 if (err) {
 791                                         /* If this page was new, give it to the
 792                                          * socket so it does not get leaked.
 793                                          */
 794                                         if (!TCP_PAGE(sk)) {
 795                                                 TCP_PAGE(sk) = page;
 796                                                 TCP_OFF(sk) = 0;
 797                                         }
 798                                         goto do_error;
 799                                 }
 800
 801                                 /* Update the skb. */
 802                                 if (merge) {
 803                                         skb_shinfo(skb)->frags[i - 1].size +=
 804                                                                         copy;
 805                                 } else {
                                             /* A new fragment now refers to the page.
                                              * If the page is already cached on the
                                              * socket, take one more reference for the
                                              * fragment.  A fresh page is cached (with
                                              * an extra reference) only when this copy
                                              * leaves room in it.
                                              * NOTE(review): presumes the page from
                                              * sk_stream_alloc_page() arrives with one
                                              * reference that the fragment takes over -
                                              * confirm.
                                              */
 806                                         skb_fill_page_desc(skb, i, page, off, copy);
 807                                         if (TCP_PAGE(sk)) {
 808                                                 get_page(page);
 809                                         } else if (off + copy < PAGE_SIZE) {
 810                                                 get_page(page);
 811                                                 TCP_PAGE(sk) = page;
 812                                         }
 813                                 }
 814
 815                                 TCP_OFF(sk) = off + copy;
 816                         }
 817
                             /* First chunk of this call: clear PSH on the skb. */
 818                         if (!copied)
 819                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
 820
 821                         tp->write_seq += copy;
 822                         TCP_SKB_CB(skb)->end_seq += copy;
 823                         skb_shinfo(skb)->gso_segs = 0;
 824
 825                         from += copy;
 826                         copied += copy;
                             /* Last byte of the last iovec entry consumed: done. */
 827                         if ((seglen -= copy) == 0 && iovlen == 0)
 828                                 goto out;
 829
                             /* Keep filling while the skb is below one MSS, and
                              * never push from here for MSG_OOB.
                              */
 830                         if (skb->len < mss_now || (flags & MSG_OOB))
 831                                 continue;
 832
 833                         if (forced_push(tp)) {
 834                                 tcp_mark_push(tp, skb);
 835                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
 836                         } else if (skb == sk->sk_send_head)
 837                                 tcp_push_one(sk, mss_now);
 838                         continue;
 839
                             /* Out of send buffer or memory.  Push what has been
                              * queued so far (with MSG_MORE masked off), wait in
                              * sk_stream_wait_memory(), then re-read mss_now and
                              * size_goal before retrying the inner loop.
                              */
 840 wait_for_sndbuf:
 841                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 842 wait_for_memory:
 843                         if (copied)
 844                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 845
 846                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 847                                 goto do_error;
 848
 849                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 850                         size_goal = tp->xmit_size_goal;
 851                 }
 852         }
 853
 854 out:
 855         if (copied)
 856                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
 857         TCP_CHECK_TIMER(sk);
 858         release_sock(sk);
 859         return copied;
 860
             /* skb_add_data() failed.  If the skb holds no data, take it off
              * the write queue (clearing sk_send_head if it pointed at it) and
              * free it.
              */
 861 do_fault:
 862         if (!skb->len) {
 863                 if (sk->sk_send_head == skb)
 864                         sk->sk_send_head = NULL;
 865                 __skb_unlink(skb, &sk->sk_write_queue);
 866                 sk_stream_free_skb(sk, skb);
 867         }
 868
             /* If anything was queued, report that byte count, not the error. */
 869 do_error:
 870         if (copied)
 871                 goto out;
 872 out_err:
 873         err = sk_stream_error(sk, flags, err);
 874         TCP_CHECK_TIMER(sk);
 875         release_sock(sk);
 876         return err;
 877 }
 878
 879 /*
 880  *      Handle reading urgent data. BSD has very simple semantics for
 881  *      this, no blocking and very strange errors 8)
 882  */
 883
 884 static int tcp_recv_urg(struct sock *sk, long timeo,
 885                         struct msghdr *msg, int len, int flags,
 886                         int *addr_len)
 887 {
 888         struct tcp_sock *tp = tcp_sk(sk);
 889
 890         /* No URG data to read. */
 891         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
 892             tp->urg_data == TCP_URG_READ)
 893                 return -EINVAL; /* Yes this is right ! */
 894
 895         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
 896                 return -ENOTCONN;
 897
 898         if (tp->urg_data & TCP_URG_VALID) {
 899                 int err = 0;
 900                 char c = tp->urg_data;
 901
 902                 if (!(flags & MSG_PEEK))
 903                         tp->urg_data = TCP_URG_READ;
 904
 905                 /* Read urgent data. */
 906                 msg->msg_flags |= MSG_OOB;
 907
 908                 if (len > 0) {
 909                         if (!(flags & MSG_TRUNC))
 910                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
 911                         len = 1;
 912                 } else
 913                         msg->msg_flags |= MSG_TRUNC;
 914
 915                 return err ? -EFAULT : len;
 916         }
 917
 918         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
 919                 return 0;
 920
 921         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
 922          * the available implementations agree in this case:
 923          * this call should never block, independent of the
 924          * blocking state of the socket.
 925          * Mike <pall@rz.uni-karlsruhe.de>
 926          */
 927         return -EAGAIN;
 928 }
 929
 930 /* Clean up the receive buffer for full frames taken by the user,
 931  * then send an ACK if necessary.  COPIED is the number of bytes
 932  * tcp_recvmsg has given to the user so far, it speeds up the
 933  * calculation of whether or not we must ACK for the sake of
 934  * a window update.
 935  */
 936 void cleanup_rbuf(struct sock *sk, int copied)
 937 {
 938         struct tcp_sock *tp = tcp_sk(sk);
 939         int time_to_ack = 0;
 940
 941 #if TCP_DEBUG
 942         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
 943
 944         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
 945 #endif
 946
 947         if (inet_csk_ack_scheduled(sk)) {
 948                 const struct inet_connection_sock *icsk = inet_csk(sk);
 949                    /* Delayed ACKs frequently hit locked sockets during bulk
 950                     * receive. */
 951                 if (icsk->icsk_ack.blocked ||
 952                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
 953                     tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
 954                     /*
 955                      * If this read emptied read buffer, we send ACK, if
 956                      * connection is not bidirectional, user drained
 957                      * receive buffer and there was a small segment
 958                      * in queue.
 959                      */
 960                     (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
 961                      !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
 962                         time_to_ack = 1;
 963         }
 964
 965         /* We send an ACK if we can now advertise a non-zero window
 966          * which has been raised "significantly".
 967          *
 968          * Even if window raised up to infinity, do not send window open ACK
 969          * in states, where we will not receive more. It is useless.
 970          */
 971         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
 972                 __u32 rcv_window_now = tcp_receive_window(tp);
 973
 974                 /* Optimize, __tcp_select_window() is not cheap. */
 975                 if (2*rcv_window_now <= tp->window_clamp) {
 976                         __u32 new_window = __tcp_select_window(sk);
 977
 978                         /* Send ACK now, if this read freed lots of space
 979                          * in our buffer. Certainly, new_window is new window.
 980                          * We can advertise it now, if it is not less than current one.
 981                          * "Lots" means "at least twice" here.
 982                          */
 983                         if (new_window && new_window >= 2 * rcv_window_now)
 984                                 time_to_ack = 1;
 985                 }
 986         }
 987         if (time_to_ack)
 988                 tcp_send_ack(sk);
 989 }
 990
 991 static void tcp_prequeue_process(struct sock *sk)
 992 {
 993         struct sk_buff *skb;
 994         struct tcp_sock *tp = tcp_sk(sk);
 995
 996         NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
 997
 998         /* RX process wants to run with disabled BHs, though it is not
 999          * necessary */
1000         local_bh_disable();
1001         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1002                 sk->sk_backlog_rcv(sk, skb);
1003         local_bh_enable();
1004
1005         /* Clear memory counter. */
1006         tp->ucopy.memory = 0;
1007 }
1008
1009 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1010 {
1011         struct sk_buff *skb;
1012         u32 offset;
1013
1014         skb_queue_walk(&sk->sk_receive_queue, skb) {
1015                 offset = seq - TCP_SKB_CB(skb)->seq;
1016                 if (skb->h.th->syn)
1017                         offset--;
1018                 if (offset < skb->len || skb->h.th->fin) {
1019                         *off = offset;
1020                         return skb;
1021                 }
1022         }
1023         return NULL;
1024 }
1025
1026 /*
1027  * This routine provides an alternative to tcp_recvmsg() for routines
1028  * that would like to handle copying from skbuffs directly in 'sendfile'
1029  * fashion.
1030  * Note:
1031  *      - It is assumed that the socket was locked by the caller.
1032  *      - The routine does not block.
1033  *      - At present, there is no support for reading OOB data
1034  *        or for 'peeking' the socket using this routine
1035  *        (although both would be easy to implement).
1036  */
1037 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1038                   sk_read_actor_t recv_actor)
1039 {
1040         struct sk_buff *skb;
1041         struct tcp_sock *tp = tcp_sk(sk);
1042         u32 seq = tp->copied_seq;
1043         u32 offset;
1044         int copied = 0;
1045
1046         if (sk->sk_state == TCP_LISTEN)
1047                 return -ENOTCONN;
1048         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1049                 if (offset < skb->len) {
1050                         size_t used, len;
1051
1052                         len = skb->len - offset;
1053                         /* Stop reading if we hit a patch of urgent data */
1054                         if (tp->urg_data) {
1055                                 u32 urg_offset = tp->urg_seq - seq;
1056                                 if (urg_offset < len)
1057                                         len = urg_offset;
1058                                 if (!len)
1059                                         break;
1060                         }
1061                         used = recv_actor(desc, skb, offset, len);
1062                         if (used <= len) {
1063                                 seq += used;
1064                                 copied += used;
1065                                 offset += used;
1066                         }
1067                         if (offset != skb->len)
1068                                 break;
1069                 }
1070                 if (skb->h.th->fin) {
1071                         sk_eat_skb(sk, skb);
1072                         ++seq;
1073                         break;
1074                 }
1075                 sk_eat_skb(sk, skb);
1076                 if (!desc->count)
1077                         break;
1078         }
1079         tp->copied_seq = seq;
1080
1081         tcp_rcv_space_adjust(sk);
1082
1083         /* Clean up data we have read: This will do ACK frames. */
1084         if (copied)
1085                 cleanup_rbuf(sk, copied);
1086         return copied;
1087 }
1088
1089 /*
1090  *      This routine copies from a sock struct into the user buffer.
1091  *
1092  *      Technical note: in 2.3 we work on _locked_ socket, so that
1093  *      tricks with *seq access order and skb->users are not required.
1094  *      Probably, code can be easily improved even more.
1095  */
1096
1097 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1098                 size_t len, int nonblock, int flags, int *addr_len)
1099 {
             /*
              * Arguments as used below:
              *   msg      - msg->msg_iov receives the data
              *   len      - maximum number of bytes to deliver
              *   nonblock - handed to sock_rcvtimeo() to derive the wait timeout
              *   flags    - MSG_OOB, MSG_PEEK, MSG_TRUNC and MSG_WAITALL are examined
              *   addr_len - only forwarded to tcp_recv_urg()
              *   iocb     - not referenced in this body
              *
              * Returns the value of 'copied' (a byte count, or a negative errno
              * stored in it when nothing was copied), or 'err' on the LISTEN
              * and MSG_OOB paths.  The socket is locked for the whole call
              * except where release_sock()/sk_wait_data() are invoked below.
              */
1100         struct tcp_sock *tp = tcp_sk(sk);
1101         int copied = 0;
1102         u32 peek_seq;
1103         u32 *seq;
1104         unsigned long used;
1105         int err;
1106         int target;             /* Read at least this many bytes */
1107         long timeo;
1108         struct task_struct *user_recv = NULL;
1109
1110         lock_sock(sk);
1111
1112         TCP_CHECK_TIMER(sk);
1113
1114         err = -ENOTCONN;
1115         if (sk->sk_state == TCP_LISTEN)
1116                 goto out;
1117
1118         timeo = sock_rcvtimeo(sk, nonblock);
1119
1120         /* Urgent data needs to be handled specially. */
1121         if (flags & MSG_OOB)
1122                 goto recv_urg;
1123
             /* seq is the read cursor that gets advanced: the socket's own
              * tp->copied_seq, or a private copy when peeking so that
              * tp->copied_seq is not moved.
              */
1124         seq = &tp->copied_seq;
1125         if (flags & MSG_PEEK) {
1126                 peek_seq = tp->copied_seq;
1127                 seq = &peek_seq;
1128         }
1129
1130         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1131
1132         do {
1133                 struct sk_buff *skb;
1134                 u32 offset;
1135
1136                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1137                 if (tp->urg_data && tp->urg_seq == *seq) {
1138                         if (copied)
1139                                 break;
1140                         if (signal_pending(current)) {
1141                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1142                                 break;
1143                         }
1144                 }
1145
1146                 /* Next get a buffer. */
1147
1148                 skb = skb_peek(&sk->sk_receive_queue);
1149                 do {
1150                         if (!skb)
1151                                 break;
1152
1153                         /* Now that we have two receive queues this
1154                          * shouldn't happen.
1155                          */
1156                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1157                                 printk(KERN_INFO "recvmsg bug: copied %X "
1158                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1159                                 break;
1160                         }
                             /* Position of *seq inside this skb; an skb whose
                              * header has SYN set has its offset reduced by one.
                              */
1161                         offset = *seq - TCP_SKB_CB(skb)->seq;
1162                         if (skb->h.th->syn)
1163                                 offset--;
1164                         if (offset < skb->len)
1165                                 goto found_ok_skb;
1166                         if (skb->h.th->fin)
1167                                 goto found_fin_ok;
                             /* A fully consumed skb is only expected to still
                              * be queued when peeking.
                              */
1168                         BUG_TRAP(flags & MSG_PEEK);
1169                         skb = skb->next;
1170                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1171
1172                 /* Well, if we have backlog, try to process it now yet. */
1173
1174                 if (copied >= target && !sk->sk_backlog.tail)
1175                         break;
1176
                     /* No usable skb in the receive queue.  With data already
                      * copied, any of the conditions below ends the call with
                      * that byte count; with nothing copied, pick the error or
                      * end-of-stream result to report.
                      */
1177                 if (copied) {
1178                         if (sk->sk_err ||
1179                             sk->sk_state == TCP_CLOSE ||
1180                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1181                             !timeo ||
1182                             signal_pending(current) ||
1183                             (flags & MSG_PEEK))
1184                                 break;
1185                 } else {
1186                         if (sock_flag(sk, SOCK_DONE))
1187                                 break;
1188
1189                         if (sk->sk_err) {
1190                                 copied = sock_error(sk);
1191                                 break;
1192                         }
1193
1194                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1195                                 break;
1196
1197                         if (sk->sk_state == TCP_CLOSE) {
1198                                 if (!sock_flag(sk, SOCK_DONE)) {
1199                                         /* This occurs when user tries to read
1200                                          * from never connected socket.
1201                                          */
1202                                         copied = -ENOTCONN;
1203                                         break;
1204                                 }
1205                                 break;
1206                         }
1207
1208                         if (!timeo) {
1209                                 copied = -EAGAIN;
1210                                 break;
1211                         }
1212
1213                         if (signal_pending(current)) {
1214                                 copied = sock_intr_errno(timeo);
1215                                 break;
1216                         }
1217                 }
1218
1219                 cleanup_rbuf(sk, copied);
1220
1221                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1222                         /* Install new reader */
1223                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1224                                 user_recv = current;
1225                                 tp->ucopy.task = user_recv;
1226                                 tp->ucopy.iov = msg->msg_iov;
1227                         }
1228
                             /* Publish the room left in the user buffer.  After
                              * the wait below, len - tp->ucopy.len tells how many
                              * bytes were placed into the iovec meanwhile.
                              */
1229                         tp->ucopy.len = len;
1230
1231                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1232                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1233
1234                         /* Ugly... If prequeue is not empty, we have to
1235                          * process it before releasing socket, otherwise
1236                          * order will be broken at second iteration.
1237                          * More elegant solution is required!!!
1238                          *
1239                          * Look: we have the following (pseudo)queues:
1240                          *
1241                          * 1. packets in flight
1242                          * 2. backlog
1243                          * 3. prequeue
1244                          * 4. receive_queue
1245                          *
1246                          * Each queue can be processed only if the next ones
1247                          * are empty. At this point we have empty receive_queue.
1248                          * But prequeue _can_ be not empty after 2nd iteration,
1249                          * when we jumped to start of loop because backlog
1250                          * processing added something to receive_queue.
1251                          * We cannot release_sock(), because backlog contains
1252                          * packets arrived _after_ prequeued ones.
1253                          *
1254                          * Shortly, algorithm is clear --- to process all
1255                          * the queues in order. We could make it more directly,
1256                          * requeueing packets from backlog to prequeue, if
1257                          * is not empty. It is more elegant, but eats cycles,
1258                          * unfortunately.
1259                          */
1260                         if (!skb_queue_empty(&tp->ucopy.prequeue))
1261                                 goto do_prequeue;
1262
1263                         /* __ Set realtime policy in scheduler __ */
1264                 }
1265
1266                 if (copied >= target) {
1267                         /* Do not sleep, just process backlog. */
1268                         release_sock(sk);
1269                         lock_sock(sk);
1270                 } else
1271                         sk_wait_data(sk, &timeo);
1272
1273                 if (user_recv) {
1274                         int chunk;
1275
1276                         /* __ Restore normal policy in scheduler __ */
1277
                             /* tp->ucopy.len shrank while the lock was dropped or
                              * we slept: account for those bytes (presumably copied
                              * during backlog processing, as the statistic name
                              * suggests).
                              */
1278                         if ((chunk = len - tp->ucopy.len) != 0) {
1279                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1280                                 len -= chunk;
1281                                 copied += chunk;
1282                         }
1283
1284                         if (tp->rcv_nxt == tp->copied_seq &&
1285                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1286 do_prequeue:
1287                                 tcp_prequeue_process(sk);
1288
1289                                 if ((chunk = len - tp->ucopy.len) != 0) {
1290                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1291                                         len -= chunk;
1292                                         copied += chunk;
1293                                 }
1294                         }
1295                 }
                     /* tp->copied_seq changed while we were peeking: log it
                      * (rate limited) and resynchronise the private cursor.
                      */
1296                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1297                         if (net_ratelimit())
1298                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1299                                        current->comm, current->pid);
1300                         peek_seq = tp->copied_seq;
1301                 }
1302                 continue;
1303
1304         found_ok_skb:
1305                 /* Ok so how much can we use? */
1306                 used = skb->len - offset;
1307                 if (len < used)
1308                         used = len;
1309
1310                 /* Do we have urgent data here? */
1311                 if (tp->urg_data) {
1312                         u32 urg_offset = tp->urg_seq - *seq;
                             /* The urgent byte lies inside the range about to be
                              * copied.  If it is the very next byte and is not
                              * delivered inline, step over it; if it lies further
                              * ahead, copy only up to it.
                              */
1313                         if (urg_offset < used) {
1314                                 if (!urg_offset) {
1315                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1316                                                 ++*seq;
1317                                                 offset++;
1318                                                 used--;
1319                                                 if (!used)
1320                                                         goto skip_copy;
1321                                         }
1322                                 } else
1323                                         used = urg_offset;
1324                         }
1325                 }
1326
                     /* With MSG_TRUNC the bytes are accounted for below but
                      * not copied to the iovec.
                      */
1327                 if (!(flags & MSG_TRUNC)) {
1328                         err = skb_copy_datagram_iovec(skb, offset,
1329                                                       msg->msg_iov, used);
1330                         if (err) {
1331                                 /* Exception. Bailout! */
1332                                 if (!copied)
1333                                         copied = -EFAULT;
1334                                 break;
1335                         }
1336                 }
1337
1338                 *seq += used;
1339                 copied += used;
1340                 len -= used;
1341
1342                 tcp_rcv_space_adjust(sk);
1343
1344 skip_copy:
                     /* Once tp->copied_seq has moved past urg_seq, clear the
                      * urgent state and re-evaluate the fast path.
                      */
1345                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1346                         tp->urg_data = 0;
1347                         tcp_fast_path_check(sk, tp);
1348                 }
                     /* This skb still has unread bytes: keep it queued. */
1349                 if (used + offset < skb->len)
1350                         continue;
1351
1352                 if (skb->h.th->fin)
1353                         goto found_fin_ok;
1354                 if (!(flags & MSG_PEEK))
1355                         sk_eat_skb(sk, skb);
1356                 continue;
1357
1358         found_fin_ok:
1359                 /* Process the FIN. */
1360                 ++*seq;
1361                 if (!(flags & MSG_PEEK))
1362                         sk_eat_skb(sk, skb);
1363                 break;
1364         } while (len > 0);
1365
             /* We registered as tp->ucopy.task above: process what is left on
              * the prequeue, then unregister.  When nothing (or only an error)
              * has been copied so far, ucopy.len is set to 0 for that pass.
              */
1366         if (user_recv) {
1367                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1368                         int chunk;
1369
1370                         tp->ucopy.len = copied > 0 ? len : 0;
1371
1372                         tcp_prequeue_process(sk);
1373
1374                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1375                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1376                                 len -= chunk;
1377                                 copied += chunk;
1378                         }
1379                 }
1380
1381                 tp->ucopy.task = NULL;
1382                 tp->ucopy.len = 0;
1383         }
1384
1385         /* According to UNIX98, msg_name/msg_namelen are ignored
1386          * on connected socket. I was just happy when found this 8) --ANK
1387          */
1388
1389         /* Clean up data we have read: This will do ACK frames. */
1390         cleanup_rbuf(sk, copied);
1391
1392         TCP_CHECK_TIMER(sk);
1393         release_sock(sk);
1394         return copied;
1395
             /* Exit used by the LISTEN check and by the MSG_OOB path: the
              * result is in err rather than in copied.
              */
1396 out:
1397         TCP_CHECK_TIMER(sk);
1398         release_sock(sk);
1399         return err;
1400
1401 recv_urg:
1402         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1403         goto out;
1404 }
1405
1406 /*
1407  *      State processing on a close. This implements the state shift for
1408  *      sending our FIN frame. Note that we only send a FIN for some
1409  *      states. A shutdown() may have already sent the FIN, or we may be
1410  *      closed.
1411  */
1412
1413 static const unsigned char new_state[16] = {
1414   /* current state:        new state:      action:      */
1415   /* (Invalid)          */ TCP_CLOSE,
1416   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1417   /* TCP_SYN_SENT       */ TCP_CLOSE,
1418   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1419   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1420   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1421   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1422   /* TCP_CLOSE          */ TCP_CLOSE,
1423   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1424   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1425   /* TCP_LISTEN         */ TCP_CLOSE,
1426   /* TCP_CLOSING        */ TCP_CLOSING,
1427 };
1428
1429 static int tcp_close_state(struct sock *sk)
1430 {
1431         int next = (int)new_state[sk->sk_state];
1432         int ns = next & TCP_STATE_MASK;
1433
1434         tcp_set_state(sk, ns);
1435
1436         return next & TCP_ACTION_FIN;
1437 }
1438
1439 /*
1440  *      Shutdown the sending side of a connection. Much like close except
1441  *      that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1442  */
1443
1444 void tcp_shutdown(struct sock *sk, int how)
1445 {
1446         /*      We need to grab some memory, and put together a FIN,
1447          *      and then put it into the queue to be sent.
1448          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1449          */
1450         if (!(how & SEND_SHUTDOWN))
1451                 return;
1452
1453         /* If we've already sent a FIN, or it's a closed state, skip this. */
1454         if ((1 << sk->sk_state) &
1455             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1456              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1457                 /* Clear out any half completed packets.  FIN if needed. */
1458                 if (tcp_close_state(sk))
1459                         tcp_send_fin(sk);
1460         }
1461 }
1462
1463 void tcp_close(struct sock *sk, long timeout)
1464 {
1465         struct sk_buff *skb;
1466         int data_was_unread = 0;
1467         int state;
1468
1469         lock_sock(sk);
1470         sk->sk_shutdown = SHUTDOWN_MASK;
1471
1472         if (sk->sk_state == TCP_LISTEN) {
1473                 tcp_set_state(sk, TCP_CLOSE);
1474
1475                 /* Special case. */
1476                 inet_csk_listen_stop(sk);
1477
1478                 goto adjudge_to_death;
1479         }
1480
1481         /*  We need to flush the recv. buffs.  We do this only on the
1482          *  descriptor close, not protocol-sourced closes, because the
1483          *  reader process may not have drained the data yet!
1484          */
1485         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1486                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1487                           skb->h.th->fin;
1488                 data_was_unread += len;
1489                 __kfree_skb(skb);
1490         }
1491
1492         sk_stream_mem_reclaim(sk);
1493
1494         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1495          * 3.10, we send a RST here because data was lost.  To
1496          * witness the awful effects of the old behavior of always
1497          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1498          * a bulk GET in an FTP client, suspend the process, wait
1499          * for the client to advertise a zero window, then kill -9
1500          * the FTP client, wheee...  Note: timeout is always zero
1501          * in such a case.
1502          */
1503         if (data_was_unread) {
1504                 /* Unread data was tossed, zap the connection. */
1505                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1506                 tcp_set_state(sk, TCP_CLOSE);
1507                 tcp_send_active_reset(sk, GFP_KERNEL);
1508         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1509                 /* Check zero linger _after_ checking for unread data. */
1510                 sk->sk_prot->disconnect(sk, 0);
1511                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1512         } else if (tcp_close_state(sk)) {
1513                 /* We FIN if the application ate all the data before
1514                  * zapping the connection.
1515                  */
1516
1517                 /* RED-PEN. Formally speaking, we have broken TCP state
1518                  * machine. State transitions:
1519                  *
1520                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1521                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1522                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1523                  *
1524                  * are legal only when FIN has been sent (i.e. in window),
1525                  * rather than queued out of window. Purists blame.
1526                  *
1527                  * F.e. "RFC state" is ESTABLISHED,
1528                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1529                  *
1530                  * The visible declinations are that sometimes
1531                  * we enter time-wait state, when it is not required really
1532                  * (harmless), do not send active resets, when they are
1533                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1534                  * they look as CLOSING or LAST_ACK for Linux)
1535                  * Probably, I missed some more holelets.
1536                  *                                              --ANK
1537                  */
1538                 tcp_send_fin(sk);
1539         }
1540
1541         sk_stream_wait_close(sk, timeout);
1542
1543 adjudge_to_death:
1544         state = sk->sk_state;
1545         sock_hold(sk);
1546         sock_orphan(sk);
1547         atomic_inc(sk->sk_prot->orphan_count);
1548
1549         /* It is the last release_sock in its life. It will remove backlog. */
1550         release_sock(sk);
1551
1552
1553         /* Now socket is owned by kernel and we acquire BH lock
1554            to finish close. No need to check for user refs.
1555          */
1556         local_bh_disable();
1557         bh_lock_sock(sk);
1558         BUG_TRAP(!sock_owned_by_user(sk));
1559
1560         /* Have we already been destroyed by a softirq or backlog? */
1561         if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
1562                 goto out;
1563
1564         /*      This is a (useful) BSD violating of the RFC. There is a
1565          *      problem with TCP as specified in that the other end could
1566          *      keep a socket open forever with no application left this end.
1567          *      We use a 3 minute timeout (about the same as BSD) then kill
1568          *      our end. If they send after that then tough - BUT: long enough
1569          *      that we won't make the old 4*rto = almost no time - whoops
1570          *      reset mistake.
1571          *
1572          *      Nope, it was not mistake. It is really desired behaviour
1573          *      f.e. on http servers, when such sockets are useless, but
1574          *      consume significant resources. Let's do it with special
1575          *      linger2 option.                                 --ANK
1576          */
1577
1578         if (sk->sk_state == TCP_FIN_WAIT2) {
1579                 struct tcp_sock *tp = tcp_sk(sk);
1580                 if (tp->linger2 < 0) {
1581                         tcp_set_state(sk, TCP_CLOSE);
1582                         tcp_send_active_reset(sk, GFP_ATOMIC);
1583                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1584                 } else {
1585                         const int tmo = tcp_fin_time(sk);
1586
1587                         if (tmo > TCP_TIMEWAIT_LEN) {
1588                                 inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
1589                         } else {
1590                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1591                                 goto out;
1592                         }
1593                 }
1594         }
1595         if (sk->sk_state != TCP_CLOSE) {
1596                 sk_stream_mem_reclaim(sk);
1597                 if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1598                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1599                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1600                         if (net_ratelimit())
1601                                 printk(KERN_INFO "TCP: too many of orphaned "
1602                                        "sockets\n");
1603                         tcp_set_state(sk, TCP_CLOSE);
1604                         tcp_send_active_reset(sk, GFP_ATOMIC);
1605                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1606                 }
1607         }
1608
1609         if (sk->sk_state == TCP_CLOSE)
1610                 inet_csk_destroy_sock(sk);
1611         /* Otherwise, socket is reprieved until protocol close. */
1612
1613 out:
1614         bh_unlock_sock(sk);
1615         local_bh_enable();
1616         sock_put(sk);
1617 }
1618
1619 /* These states need RST on ABORT according to RFC793 */
1620
1621 static inline int tcp_need_reset(int state)
1622 {
1623         return (1 << state) &
1624                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1625                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1626 }
1627
1628 int tcp_disconnect(struct sock *sk, int flags)
1629 {
1630         struct inet_sock *inet = inet_sk(sk);
1631         struct inet_connection_sock *icsk = inet_csk(sk);
1632         struct tcp_sock *tp = tcp_sk(sk);
1633         int err = 0;
1634         int old_state = sk->sk_state;
1635
1636         if (old_state != TCP_CLOSE)
1637                 tcp_set_state(sk, TCP_CLOSE);
1638
1639         /* ABORT function of RFC793 */
1640         if (old_state == TCP_LISTEN) {
1641                 inet_csk_listen_stop(sk);
1642         } else if (tcp_need_reset(old_state) ||
1643                    (tp->snd_nxt != tp->write_seq &&
1644                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1645                 /* The last check adjusts for discrepancy of Linux wrt. RFC
1646                  * states
1647                  */
1648                 tcp_send_active_reset(sk, gfp_any());
1649                 sk->sk_err = ECONNRESET;
1650         } else if (old_state == TCP_SYN_SENT)
1651                 sk->sk_err = ECONNRESET;
1652
1653         tcp_clear_xmit_timers(sk);
1654         __skb_queue_purge(&sk->sk_receive_queue);
1655         sk_stream_writequeue_purge(sk);
1656         __skb_queue_purge(&tp->out_of_order_queue);
1657
1658         inet->dport = 0;
1659
1660         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1661                 inet_reset_saddr(sk);
1662
1663         sk->sk_shutdown = 0;
1664         sock_reset_flag(sk, SOCK_DONE);
1665         tp->srtt = 0;
1666         if ((tp->write_seq += tp->max_window + 2) == 0)
1667                 tp->write_seq = 1;
1668         icsk->icsk_backoff = 0;
1669         tp->snd_cwnd = 2;
1670         icsk->icsk_probes_out = 0;
1671         tp->packets_out = 0;
1672         tp->snd_ssthresh = 0x7fffffff;
1673         tp->snd_cwnd_cnt = 0;
1674         tp->bytes_acked = 0;
1675         tcp_set_ca_state(sk, TCP_CA_Open);
1676         tcp_clear_retrans(tp);
1677         inet_csk_delack_init(sk);
1678         sk->sk_send_head = NULL;
1679         tp->rx_opt.saw_tstamp = 0;
1680         tcp_sack_reset(&tp->rx_opt);
1681         __sk_dst_reset(sk);
1682
1683         BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1684
1685         sk->sk_error_report(sk);
1686         return err;
1687 }
1688
1689 /*
1690  *      Socket option code for TCP.
1691  */
1692 static int do_tcp_setsockopt(struct sock *sk, int level,
1693                 int optname, char __user *optval, int optlen)
1694 {
1695         struct tcp_sock *tp = tcp_sk(sk);
1696         struct inet_connection_sock *icsk = inet_csk(sk);
1697         int val;
1698         int err = 0;
1699
1700         /* This is a string value all the others are int's */
1701         if (optname == TCP_CONGESTION) {
1702                 char name[TCP_CA_NAME_MAX];
1703
1704                 if (optlen < 1)
1705                         return -EINVAL;
1706
1707                 val = strncpy_from_user(name, optval,
1708                                         min(TCP_CA_NAME_MAX-1, optlen));
1709                 if (val < 0)
1710                         return -EFAULT;
1711                 name[val] = 0;
1712
1713                 lock_sock(sk);
1714                 err = tcp_set_congestion_control(sk, name);
1715                 release_sock(sk);
1716                 return err;
1717         }
1718
1719         if (optlen < sizeof(int))
1720                 return -EINVAL;
1721
1722         if (get_user(val, (int __user *)optval))
1723                 return -EFAULT;
1724
1725         lock_sock(sk);
1726
1727         switch (optname) {
1728         case TCP_MAXSEG:
1729                 /* Values greater than interface MTU won't take effect. However
1730                  * at the point when this call is done we typically don't yet
1731                  * know which interface is going to be used */
1732                 if (val < 8 || val > MAX_TCP_WINDOW) {
1733                         err = -EINVAL;
1734                         break;
1735                 }
1736                 tp->rx_opt.user_mss = val;
1737                 break;
1738
1739         case TCP_NODELAY:
1740                 if (val) {
1741                         /* TCP_NODELAY is weaker than TCP_CORK, so that
1742                          * this option on corked socket is remembered, but
1743                          * it is not activated until cork is cleared.
1744                          *
1745                          * However, when TCP_NODELAY is set we make
1746                          * an explicit push, which overrides even TCP_CORK
1747                          * for currently queued segments.
1748                          */
1749                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1750                         tcp_push_pending_frames(sk, tp);
1751                 } else {
1752                         tp->nonagle &= ~TCP_NAGLE_OFF;
1753                 }
1754                 break;
1755
1756         case TCP_CORK:
1757                 /* When set indicates to always queue non-full frames.
1758                  * Later the user clears this option and we transmit
1759                  * any pending partial frames in the queue.  This is
1760                  * meant to be used alongside sendfile() to get properly
1761                  * filled frames when the user (for example) must write
1762                  * out headers with a write() call first and then use
1763                  * sendfile to send out the data parts.
1764                  *
1765                  * TCP_CORK can be set together with TCP_NODELAY and it is
1766                  * stronger than TCP_NODELAY.
1767                  */
1768                 if (val) {
1769                         tp->nonagle |= TCP_NAGLE_CORK;
1770                 } else {
1771                         tp->nonagle &= ~TCP_NAGLE_CORK;
1772                         if (tp->nonagle&TCP_NAGLE_OFF)
1773                                 tp->nonagle |= TCP_NAGLE_PUSH;
1774                         tcp_push_pending_frames(sk, tp);
1775                 }
1776                 break;
1777
1778         case TCP_KEEPIDLE:
1779                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
1780                         err = -EINVAL;
1781                 else {
1782                         tp->keepalive_time = val * HZ;
1783                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
1784                             !((1 << sk->sk_state) &
1785                               (TCPF_CLOSE | TCPF_LISTEN))) {
1786                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1787                                 if (tp->keepalive_time > elapsed)
1788                                         elapsed = tp->keepalive_time - elapsed;
1789                                 else
1790                                         elapsed = 0;
1791                                 inet_csk_reset_keepalive_timer(sk, elapsed);
1792                         }
1793                 }
1794                 break;
1795         case TCP_KEEPINTVL:
1796                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
1797                         err = -EINVAL;
1798                 else
1799                         tp->keepalive_intvl = val * HZ;
1800                 break;
1801         case TCP_KEEPCNT:
1802                 if (val < 1 || val > MAX_TCP_KEEPCNT)
1803                         err = -EINVAL;
1804                 else
1805                         tp->keepalive_probes = val;
1806                 break;
1807         case TCP_SYNCNT:
1808                 if (val < 1 || val > MAX_TCP_SYNCNT)
1809                         err = -EINVAL;
1810                 else
1811                         icsk->icsk_syn_retries = val;
1812                 break;
1813
1814         case TCP_LINGER2:
1815                 if (val < 0)
1816                         tp->linger2 = -1;
1817                 else if (val > sysctl_tcp_fin_timeout / HZ)
1818                         tp->linger2 = 0;
1819                 else
1820                         tp->linger2 = val * HZ;
1821                 break;
1822
1823         case TCP_DEFER_ACCEPT:
1824                 icsk->icsk_accept_queue.rskq_defer_accept = 0;
1825                 if (val > 0) {
1826                         /* Translate value in seconds to number of
1827                          * retransmits */
1828                         while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
1829                                val > ((TCP_TIMEOUT_INIT / HZ) <<
1830                                        icsk->icsk_accept_queue.rskq_defer_accept))
1831                                 icsk->icsk_accept_queue.rskq_defer_accept++;
1832                         icsk->icsk_accept_queue.rskq_defer_accept++;
1833                 }
1834                 break;
1835
1836         case TCP_WINDOW_CLAMP:
1837                 if (!val) {
1838                         if (sk->sk_state != TCP_CLOSE) {
1839                                 err = -EINVAL;
1840                                 break;
1841                         }
1842                         tp->window_clamp = 0;
1843                 } else
1844                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
1845                                                 SOCK_MIN_RCVBUF / 2 : val;
1846                 break;
1847
1848         case TCP_QUICKACK:
1849                 if (!val) {
1850                         icsk->icsk_ack.pingpong = 1;
1851                 } else {
1852                         icsk->icsk_ack.pingpong = 0;
1853                         if ((1 << sk->sk_state) &
1854                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1855                             inet_csk_ack_scheduled(sk)) {
1856                                 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1857                                 cleanup_rbuf(sk, 1);
1858                                 if (!(val & 1))
1859                                         icsk->icsk_ack.pingpong = 1;
1860                         }
1861                 }
1862                 break;
1863
1864         default:
1865                 err = -ENOPROTOOPT;
1866                 break;
1867         };
1868         release_sock(sk);
1869         return err;
1870 }
1871
1872 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1873                    int optlen)
1874 {
1875         struct inet_connection_sock *icsk = inet_csk(sk);
1876
1877         if (level != SOL_TCP)
1878                 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
1879                                                      optval, optlen);
1880         return do_tcp_setsockopt(sk, level, optname, optval, optlen);
1881 }
1882
1883 #ifdef CONFIG_COMPAT
1884 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
1885                           char __user *optval, int optlen)
1886 {
1887         if (level != SOL_TCP)
1888                 return inet_csk_compat_setsockopt(sk, level, optname,
1889                                                   optval, optlen);
1890         return do_tcp_setsockopt(sk, level, optname, optval, optlen);
1891 }
1892
1893 EXPORT_SYMBOL(compat_tcp_setsockopt);
1894 #endif
1895
1896 /* Return information about state of tcp endpoint in API format. */
1897 void tcp_get_info(struct sock *sk, struct tcp_info *info)
1898 {
1899         struct tcp_sock *tp = tcp_sk(sk);
1900         const struct inet_connection_sock *icsk = inet_csk(sk);
1901         u32 now = tcp_time_stamp;
1902
1903         memset(info, 0, sizeof(*info));
1904
1905         info->tcpi_state = sk->sk_state;
1906         info->tcpi_ca_state = icsk->icsk_ca_state;
1907         info->tcpi_retransmits = icsk->icsk_retransmits;
1908         info->tcpi_probes = icsk->icsk_probes_out;
1909         info->tcpi_backoff = icsk->icsk_backoff;
1910
1911         if (tp->rx_opt.tstamp_ok)
1912                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1913         if (tp->rx_opt.sack_ok)
1914                 info->tcpi_options |= TCPI_OPT_SACK;
1915         if (tp->rx_opt.wscale_ok) {
1916                 info->tcpi_options |= TCPI_OPT_WSCALE;
1917                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
1918                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
1919         }
1920
1921         if (tp->ecn_flags&TCP_ECN_OK)
1922                 info->tcpi_options |= TCPI_OPT_ECN;
1923
1924         info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
1925         info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
1926         info->tcpi_snd_mss = tp->mss_cache;
1927         info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
1928
1929         info->tcpi_unacked = tp->packets_out;
1930         info->tcpi_sacked = tp->sacked_out;
1931         info->tcpi_lost = tp->lost_out;
1932         info->tcpi_retrans = tp->retrans_out;
1933         info->tcpi_fackets = tp->fackets_out;
1934
1935         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
1936         info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
1937         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
1938
1939         info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
1940         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
1941         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
1942         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
1943         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
1944         info->tcpi_snd_cwnd = tp->snd_cwnd;
1945         info->tcpi_advmss = tp->advmss;
1946         info->tcpi_reordering = tp->reordering;
1947
1948         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
1949         info->tcpi_rcv_space = tp->rcvq_space.space;
1950
1951         info->tcpi_total_retrans = tp->total_retrans;
1952 }
1953
1954 EXPORT_SYMBOL_GPL(tcp_get_info);
1955
1956 static int do_tcp_getsockopt(struct sock *sk, int level,
1957                 int optname, char __user *optval, int __user *optlen)
1958 {
1959         struct inet_connection_sock *icsk = inet_csk(sk);
1960         struct tcp_sock *tp = tcp_sk(sk);
1961         int val, len;
1962
1963         if (get_user(len, optlen))
1964                 return -EFAULT;
1965
1966         len = min_t(unsigned int, len, sizeof(int));
1967
1968         if (len < 0)
1969                 return -EINVAL;
1970
1971         switch (optname) {
1972         case TCP_MAXSEG:
1973                 val = tp->mss_cache;
1974                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
1975                         val = tp->rx_opt.user_mss;
1976                 break;
1977         case TCP_NODELAY:
1978                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
1979                 break;
1980         case TCP_CORK:
1981                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
1982                 break;
1983         case TCP_KEEPIDLE:
1984                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
1985                 break;
1986         case TCP_KEEPINTVL:
1987                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
1988                 break;
1989         case TCP_KEEPCNT:
1990                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
1991                 break;
1992         case TCP_SYNCNT:
1993                 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
1994                 break;
1995         case TCP_LINGER2:
1996                 val = tp->linger2;
1997                 if (val >= 0)
1998                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
1999                 break;
2000         case TCP_DEFER_ACCEPT:
2001                 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
2002                         ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
2003                 break;
2004         case TCP_WINDOW_CLAMP:
2005                 val = tp->window_clamp;
2006                 break;
2007         case TCP_INFO: {
2008                 struct tcp_info info;
2009
2010                 if (get_user(len, optlen))
2011                         return -EFAULT;
2012
2013                 tcp_get_info(sk, &info);
2014
2015                 len = min_t(unsigned int, len, sizeof(info));
2016                 if (put_user(len, optlen))
2017                         return -EFAULT;
2018                 if (copy_to_user(optval, &info, len))
2019                         return -EFAULT;
2020                 return 0;
2021         }
2022         case TCP_QUICKACK:
2023                 val = !icsk->icsk_ack.pingpong;
2024                 break;
2025
2026         case TCP_CONGESTION:
2027                 if (get_user(len, optlen))
2028                         return -EFAULT;
2029                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2030                 if (put_user(len, optlen))
2031                         return -EFAULT;
2032                 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2033                         return -EFAULT;
2034                 return 0;
2035         default:
2036                 return -ENOPROTOOPT;
2037         };
2038
2039         if (put_user(len, optlen))
2040                 return -EFAULT;
2041         if (copy_to_user(optval, &val, len))
2042                 return -EFAULT;
2043         return 0;
2044 }
2045
2046 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2047                    int __user *optlen)
2048 {
2049         struct inet_connection_sock *icsk = inet_csk(sk);
2050
2051         if (level != SOL_TCP)
2052                 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2053                                                      optval, optlen);
2054         return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2055 }
2056
2057 #ifdef CONFIG_COMPAT
2058 int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2059                           char __user *optval, int __user *optlen)
2060 {
2061         if (level != SOL_TCP)
2062                 return inet_csk_compat_getsockopt(sk, level, optname,
2063                                                   optval, optlen);
2064         return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2065 }
2066
2067 EXPORT_SYMBOL(compat_tcp_getsockopt);
2068 #endif
2069
2070 struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
2071 {
2072         struct sk_buff *segs = ERR_PTR(-EINVAL);
2073         struct tcphdr *th;
2074         unsigned thlen;
2075         unsigned int seq;
2076         unsigned int delta;
2077         unsigned int oldlen;
2078         unsigned int len;
2079
2080         if (!pskb_may_pull(skb, sizeof(*th)))
2081                 goto out;
2082
2083         th = skb->h.th;
2084         thlen = th->doff * 4;
2085         if (thlen < sizeof(*th))
2086                 goto out;
2087
2088         if (!pskb_may_pull(skb, thlen))
2089                 goto out;
2090
2091         oldlen = (u16)~skb->len;
2092         __skb_pull(skb, thlen);
2093
2094         if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
2095                 /* Packet is from an untrusted source, reset gso_segs. */
2096                 int mss = skb_shinfo(skb)->gso_size;
2097
2098                 skb_shinfo(skb)->gso_segs = (skb->len + mss - 1) / mss;
2099
2100                 segs = NULL;
2101                 goto out;
2102         }
2103
2104         segs = skb_segment(skb, features);
2105         if (IS_ERR(segs))
2106                 goto out;
2107
2108         len = skb_shinfo(skb)->gso_size;
2109         delta = htonl(oldlen + (thlen + len));
2110
2111         skb = segs;
2112         th = skb->h.th;
2113         seq = ntohl(th->seq);
2114
2115         do {
2116                 th->fin = th->psh = 0;
2117
2118                 th->check = ~csum_fold(th->check + delta);
2119                 if (skb->ip_summed != CHECKSUM_HW)
2120                         th->check = csum_fold(csum_partial(skb->h.raw, thlen,
2121                                                            skb->csum));
2122
2123                 seq += len;
2124                 skb = skb->next;
2125                 th = skb->h.th;
2126
2127                 th->seq = htonl(seq);
2128                 th->cwr = 0;
2129         } while (skb->next);
2130
2131         delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len);
2132         th->check = ~csum_fold(th->check + delta);
2133         if (skb->ip_summed != CHECKSUM_HW)
2134                 th->check = csum_fold(csum_partial(skb->h.raw, thlen,
2135                                                    skb->csum));
2136
2137 out:
2138         return segs;
2139 }
2140
2141 extern void __skb_cb_too_small_for_tcp(int, int);
2142 extern struct tcp_congestion_ops tcp_reno;
2143
2144 static __initdata unsigned long thash_entries;
2145 static int __init set_thash_entries(char *str)
2146 {
2147         if (!str)
2148                 return 0;
2149         thash_entries = simple_strtoul(str, &str, 0);
2150         return 1;
2151 }
2152 __setup("thash_entries=", set_thash_entries);
2153
2154 void __init tcp_init(void)
2155 {
2156         struct sk_buff *skb = NULL;
2157         unsigned long limit;
2158         int order, i, max_share;
2159
2160         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2161                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2162                                            sizeof(skb->cb));
2163
2164         tcp_hashinfo.bind_bucket_cachep =
2165                 kmem_cache_create("tcp_bind_bucket",
2166                                   sizeof(struct inet_bind_bucket), 0,
2167                                   SLAB_HWCACHE_ALIGN, NULL, NULL);
2168         if (!tcp_hashinfo.bind_bucket_cachep)
2169                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2170
2171         /* Size and allocate the main established and bind bucket
2172          * hash tables.
2173          *
2174          * The methodology is similar to that of the buffer cache.
2175          */
2176         tcp_hashinfo.ehash =
2177                 alloc_large_system_hash("TCP established",
2178                                         sizeof(struct inet_ehash_bucket),
2179                                         thash_entries,
2180                                         (num_physpages >= 128 * 1024) ?
2181                                         13 : 15,
2182                                         HASH_HIGHMEM,
2183                                         &tcp_hashinfo.ehash_size,
2184                                         NULL,
2185                                         0);
2186         tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2187         for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2188                 rwlock_init(&tcp_hashinfo.ehash[i].lock);
2189                 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2190         }
2191
2192         tcp_hashinfo.bhash =
2193                 alloc_large_system_hash("TCP bind",
2194                                         sizeof(struct inet_bind_hashbucket),
2195                                         tcp_hashinfo.ehash_size,
2196                                         (num_physpages >= 128 * 1024) ?
2197                                         13 : 15,
2198                                         HASH_HIGHMEM,
2199                                         &tcp_hashinfo.bhash_size,
2200                                         NULL,
2201                                         64 * 1024);
2202         tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2203         for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2204                 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2205                 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2206         }
2207
2208         /* Try to be a bit smarter and adjust defaults depending
2209          * on available memory.
2210          */
2211         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2212                         (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2213                         order++)
2214                 ;
2215         if (order >= 4) {
2216                 sysctl_local_port_range[0] = 32768;
2217                 sysctl_local_port_range[1] = 61000;
2218                 tcp_death_row.sysctl_max_tw_buckets = 180000;
2219                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2220                 sysctl_max_syn_backlog = 1024;
2221         } else if (order < 3) {
2222                 sysctl_local_port_range[0] = 1024 * (3 - order);
2223                 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2224                 sysctl_tcp_max_orphans >>= (3 - order);
2225                 sysctl_max_syn_backlog = 128;
2226         }
2227
2228         sysctl_tcp_mem[0] =  768 << order;
2229         sysctl_tcp_mem[1] = 1024 << order;
2230         sysctl_tcp_mem[2] = 1536 << order;
2231
2232         limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
2233         max_share = min(4UL*1024*1024, limit);
2234
2235         sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
2236         sysctl_tcp_wmem[1] = 16*1024;
2237         sysctl_tcp_wmem[2] = max(64*1024, max_share);
2238
2239         sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
2240         sysctl_tcp_rmem[1] = 87380;
2241         sysctl_tcp_rmem[2] = max(87380, max_share);
2242
2243         printk(KERN_INFO "TCP: Hash tables configured "
2244                "(established %d bind %d)\n",
2245                tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2246
2247         tcp_register_congestion_control(&tcp_reno);
2248 }
2249
2250 EXPORT_SYMBOL(tcp_close);
2251 EXPORT_SYMBOL(tcp_disconnect);
2252 EXPORT_SYMBOL(tcp_getsockopt);
2253 EXPORT_SYMBOL(tcp_ioctl);
2254 EXPORT_SYMBOL(tcp_poll);
2255 EXPORT_SYMBOL(tcp_read_sock);
2256 EXPORT_SYMBOL(tcp_recvmsg);
2257 EXPORT_SYMBOL(tcp_sendmsg);
2258 EXPORT_SYMBOL(tcp_sendpage);
2259 EXPORT_SYMBOL(tcp_setsockopt);
2260 EXPORT_SYMBOL(tcp_shutdown);
2261 EXPORT_SYMBOL(tcp_statistics);
2262 EXPORT_SYMBOL_GPL(cleanup_rbuf);