1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken;
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up with retrying without
154  *                                      getting any (even a no-space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
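
/*
 * A minimal user-space sketch of the orderly close paths described above,
 * illustrative only (plain sockets API, no error handling, helper names
 * arbitrary) and kept out of the build with #if 0.  The active closer walks
 * ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT, the passive closer
 * walks ESTABLISHED -> CLOSE_WAIT -> LAST_ACK.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static void active_close(int fd)
{
	char buf[128];

	shutdown(fd, SHUT_WR);			/* our FIN: -> FIN_WAIT1, then FIN_WAIT2 */
	while (read(fd, buf, sizeof(buf)) > 0)
		;				/* drain until the peer's FIN (EOF)      */
	close(fd);				/* -> TIME_WAIT, finally TCP_CLOSE       */
}

static void passive_close(int fd)
{
	char buf[128];

	while (read(fd, buf, sizeof(buf)) > 0)
		;				/* peer's FIN seen: -> CLOSE_WAIT        */
	close(fd);				/* our FIN: -> LAST_ACK, then TCP_CLOSE  */
}
#endif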
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265
266
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274 kmem_cache_t *tcp_openreq_cachep;
275 kmem_cache_t *tcp_bucket_cachep;
276 kmem_cache_t *tcp_timewait_cachep;
277
278 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
279
280 int sysctl_tcp_mem[3];
281 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
282 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
283
284 EXPORT_SYMBOL(sysctl_tcp_mem);
285 EXPORT_SYMBOL(sysctl_tcp_rmem);
286 EXPORT_SYMBOL(sysctl_tcp_wmem);
287
288 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
289 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
290
291 EXPORT_SYMBOL(tcp_memory_allocated);
292 EXPORT_SYMBOL(tcp_sockets_allocated);
293
294 /*
295  * Pressure flag: try to collapse.
296  * Technical note: it is used by multiple contexts non-atomically.
297  * All of sk_stream_mem_schedule() is of this nature: accounting
298  * is strict, actions are advisory and have some latency.
299  */
300 int tcp_memory_pressure;
301
302 EXPORT_SYMBOL(tcp_memory_pressure);
303
304 void tcp_enter_memory_pressure(void)
305 {
306         if (!tcp_memory_pressure) {
307                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
308                 tcp_memory_pressure = 1;
309         }
310 }
311
312 EXPORT_SYMBOL(tcp_enter_memory_pressure);
313
314 /*
315  * LISTEN is a special case for poll..
316  */
317 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
318                                                poll_table *wait)
319 {
320         return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
321 }
322
323 /*
324  *      Wait for a TCP event.
325  *
326  *      Note that we don't need to lock the socket, as the upper poll layers
327  *      take care of normal races (between the test and the event) and we don't
328  *      go look at any of the socket buffers directly.
329  */
330 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
331 {
332         unsigned int mask;
333         struct sock *sk = sock->sk;
334         struct tcp_sock *tp = tcp_sk(sk);
335
336         poll_wait(file, sk->sk_sleep, wait);
337         if (sk->sk_state == TCP_LISTEN)
338                 return tcp_listen_poll(sk, wait);
339
340         /* Socket is not locked. We are protected from async events
341            by poll logic and correct handling of state changes
342            made by other threads is impossible in any case.
343          */
344
345         mask = 0;
346         if (sk->sk_err)
347                 mask = POLLERR;
348
349         /*
350          * POLLHUP is certainly not done right. But poll() doesn't
351          * have a notion of HUP in just one direction, and for a
352          * socket the read side is more interesting.
353          *
354          * Some poll() documentation says that POLLHUP is incompatible
355          * with the POLLOUT/POLLWR flags, so somebody should check this
356          * all. But careful, it tends to be safer to return too many
357          * bits than too few, and you can easily break real applications
358          * if you don't tell them that something has hung up!
359          *
360          * Check-me.
361          *
362          * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
363          * our fs/select.c). It means that after we received EOF,
364          * poll always returns immediately, making it impossible to poll() for
365          * write() in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
366          * if and only if shutdown has been made in both directions.
367          * Actually, it is interesting to look at how Solaris and DUX
368          * solve this dilemma. I would prefer it if POLLHUP were maskable;
369          * then we could set it on SND_SHUTDOWN. BTW the examples given
370          * in Stevens' books assume exactly this behaviour, which explains
371          * why POLLHUP is incompatible with POLLOUT.    --ANK
372          *
373          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
374          * blocking on fresh not-connected or disconnected socket. --ANK
375          */
376         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
377                 mask |= POLLHUP;
378         if (sk->sk_shutdown & RCV_SHUTDOWN)
379                 mask |= POLLIN | POLLRDNORM;
380
381         /* Connected? */
382         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
383                 /* Potential race condition. If the read of tp below is
384                  * reordered above sk->sk_state, we can be illegally awakened
385                  * in SYN_* states. */
386                 if ((tp->rcv_nxt != tp->copied_seq) &&
387                     (tp->urg_seq != tp->copied_seq ||
388                      tp->rcv_nxt != tp->copied_seq + 1 ||
389                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
390                         mask |= POLLIN | POLLRDNORM;
391
392                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
393                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
394                                 mask |= POLLOUT | POLLWRNORM;
395                         } else {  /* send SIGIO later */
396                                 set_bit(SOCK_ASYNC_NOSPACE,
397                                         &sk->sk_socket->flags);
398                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
399
400                                 /* Race breaker. If space is freed after
401                                  * wspace test but before the flags are set,
402                                  * IO signal will be lost.
403                                  */
404                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
405                                         mask |= POLLOUT | POLLWRNORM;
406                         }
407                 }
408
409                 if (tp->urg_data & TCP_URG_VALID)
410                         mask |= POLLPRI;
411         }
412         return mask;
413 }
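
/*
 * A hedged user-space sketch of how the mask computed by tcp_poll() above is
 * typically consumed.  The helper name is arbitrary and error handling is
 * omitted; it is kept out of the build with #if 0.
 */
#if 0
#include <poll.h>

/* Returns 1 while the connection is still usable, 0 once it has hung up. */
static int wait_on_tcp_socket(int fd)
{
	struct pollfd pfd = {
		.fd     = fd,
		/* POLLPRI reports urgent data, POLLIN readable data or
		 * RCV_SHUTDOWN, POLLOUT enough write space as computed by
		 * sk_stream_min_wspace() in tcp_poll() above.
		 */
		.events = POLLIN | POLLOUT | POLLPRI,
	};

	poll(&pfd, 1, -1);

	/* POLLHUP is reported once both directions have shut down. */
	return !(pfd.revents & POLLHUP);
}
#endif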
414
415 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
416 {
417         struct tcp_sock *tp = tcp_sk(sk);
418         int answ;
419
420         switch (cmd) {
421         case SIOCINQ:
422                 if (sk->sk_state == TCP_LISTEN)
423                         return -EINVAL;
424
425                 lock_sock(sk);
426                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
427                         answ = 0;
428                 else if (sock_flag(sk, SOCK_URGINLINE) ||
429                          !tp->urg_data ||
430                          before(tp->urg_seq, tp->copied_seq) ||
431                          !before(tp->urg_seq, tp->rcv_nxt)) {
432                         answ = tp->rcv_nxt - tp->copied_seq;
433
434                         /* Subtract 1, if FIN is in queue. */
435                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
436                                 answ -=
437                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
438                 } else
439                         answ = tp->urg_seq - tp->copied_seq;
440                 release_sock(sk);
441                 break;
442         case SIOCATMARK:
443                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
444                 break;
445         case SIOCOUTQ:
446                 if (sk->sk_state == TCP_LISTEN)
447                         return -EINVAL;
448
449                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
450                         answ = 0;
451                 else
452                         answ = tp->write_seq - tp->snd_una;
453                 break;
454         default:
455                 return -ENOIOCTLCMD;
456         };
457
458         return put_user(answ, (int __user *)arg);
459 }
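
/*
 * A small user-space sketch of the ioctls handled above: SIOCINQ reports the
 * bytes queued for reading (a queued FIN excluded), SIOCOUTQ the bytes not
 * yet acknowledged (write_seq - snd_una), SIOCATMARK whether the read
 * pointer sits at the urgent mark.  Illustrative only, hence the #if 0.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/sockios.h>

static void report_tcp_queues(int fd, int *inq, int *outq, int *at_mark)
{
	ioctl(fd, SIOCINQ, inq);	/* unread bytes in the receive queue */
	ioctl(fd, SIOCOUTQ, outq);	/* unacked bytes in the send queue   */
	ioctl(fd, SIOCATMARK, at_mark);	/* at the out-of-band mark?          */
}
#endif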
460
461
462 int tcp_listen_start(struct sock *sk)
463 {
464         struct inet_sock *inet = inet_sk(sk);
465         struct tcp_sock *tp = tcp_sk(sk);
466         struct tcp_listen_opt *lopt;
467
468         sk->sk_max_ack_backlog = 0;
469         sk->sk_ack_backlog = 0;
470         tp->accept_queue = tp->accept_queue_tail = NULL;
471         rwlock_init(&tp->syn_wait_lock);
472         tcp_delack_init(tp);
473
474         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
475         if (!lopt)
476                 return -ENOMEM;
477
478         memset(lopt, 0, sizeof(struct tcp_listen_opt));
479         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
480                 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
481                         break;
482         get_random_bytes(&lopt->hash_rnd, 4);
483
484         write_lock_bh(&tp->syn_wait_lock);
485         tp->listen_opt = lopt;
486         write_unlock_bh(&tp->syn_wait_lock);
487
488         /* There is a race window here: we announce ourselves listening,
489          * but this transition is still not validated by get_port().
490          * It is OK, because this socket enters the hash table only
491          * after validation is complete.
492          */
493         sk->sk_state = TCP_LISTEN;
494         if (!sk->sk_prot->get_port(sk, inet->num)) {
495                 inet->sport = htons(inet->num);
496
497                 sk_dst_reset(sk);
498                 sk->sk_prot->hash(sk);
499                 return 0;
500         }
501
502         sk->sk_state = TCP_CLOSE;
503         write_lock_bh(&tp->syn_wait_lock);
504         tp->listen_opt = NULL;
505         write_unlock_bh(&tp->syn_wait_lock);
506         kfree(lopt);
507         return -EADDRINUSE;
508 }
509
510 /*
511  *      This routine closes sockets which have been at least partially
512  *      opened, but not yet accepted.
513  */
514
515 static void tcp_listen_stop (struct sock *sk)
516 {
517         struct tcp_sock *tp = tcp_sk(sk);
518         struct tcp_listen_opt *lopt = tp->listen_opt;
519         struct open_request *acc_req = tp->accept_queue;
520         struct open_request *req;
521         int i;
522
523         tcp_delete_keepalive_timer(sk);
524
525         /* make all the listen_opt local to us */
526         write_lock_bh(&tp->syn_wait_lock);
527         tp->listen_opt = NULL;
528         write_unlock_bh(&tp->syn_wait_lock);
529
530         tp->accept_queue_tail = NULL;
531         tp->accept_queue = NULL;
532
533         if (lopt->qlen) {
534                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
535                         while ((req = lopt->syn_table[i]) != NULL) {
536                                 lopt->syn_table[i] = req->dl_next;
537                                 lopt->qlen--;
538                                 tcp_openreq_free(req);
539
540                 /* Following the specs, it would be better either to send a FIN
541                  * (and enter FIN-WAIT-1, the normal close)
542                  * or to send an active reset (abort).
543                  * Certainly, it is pretty dangerous during a synflood, but that
544                  * is a bad justification for our negligence 8)
545                  * To be honest, we are not able to implement either
546                  * of the variants now.                 --ANK
547                  */
548                         }
549                 }
550         }
551         BUG_TRAP(!lopt->qlen);
552
553         kfree(lopt);
554
555         while ((req = acc_req) != NULL) {
556                 struct sock *child = req->sk;
557
558                 acc_req = req->dl_next;
559
560                 local_bh_disable();
561                 bh_lock_sock(child);
562                 BUG_TRAP(!sock_owned_by_user(child));
563                 sock_hold(child);
564
565                 tcp_disconnect(child, O_NONBLOCK);
566
567                 sock_orphan(child);
568
569                 atomic_inc(&tcp_orphan_count);
570
571                 tcp_destroy_sock(child);
572
573                 bh_unlock_sock(child);
574                 local_bh_enable();
575                 sock_put(child);
576
577                 sk_acceptq_removed(sk);
578                 tcp_openreq_fastfree(req);
579         }
580         BUG_TRAP(!sk->sk_ack_backlog);
581 }
582
583 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
584 {
585         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
586         tp->pushed_seq = tp->write_seq;
587 }
588
589 static inline int forced_push(struct tcp_sock *tp)
590 {
591         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
592 }
593
594 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
595                               struct sk_buff *skb)
596 {
597         skb->csum = 0;
598         TCP_SKB_CB(skb)->seq = tp->write_seq;
599         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
600         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
601         TCP_SKB_CB(skb)->sacked = 0;
602         skb_header_release(skb);
603         __skb_queue_tail(&sk->sk_write_queue, skb);
604         sk_charge_skb(sk, skb);
605         if (!sk->sk_send_head)
606                 sk->sk_send_head = skb;
607         else if (tp->nonagle&TCP_NAGLE_PUSH)
608                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
609 }
610
611 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
612                                 struct sk_buff *skb)
613 {
614         if (flags & MSG_OOB) {
615                 tp->urg_mode = 1;
616                 tp->snd_up = tp->write_seq;
617                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
618         }
619 }
620
621 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
622                             int mss_now, int nonagle)
623 {
624         if (sk->sk_send_head) {
625                 struct sk_buff *skb = sk->sk_write_queue.prev;
626                 if (!(flags & MSG_MORE) || forced_push(tp))
627                         tcp_mark_push(tp, skb);
628                 tcp_mark_urg(tp, flags, skb);
629                 __tcp_push_pending_frames(sk, tp, mss_now,
630                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
631         }
632 }
633
634 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
635                          size_t psize, int flags)
636 {
637         struct tcp_sock *tp = tcp_sk(sk);
638         int mss_now;
639         int err;
640         ssize_t copied;
641         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
642
643         /* Wait for a connection to finish. */
644         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
645                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
646                         goto out_err;
647
648         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
649
650         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
651         copied = 0;
652
653         err = -EPIPE;
654         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
655                 goto do_error;
656
657         while (psize > 0) {
658                 struct sk_buff *skb = sk->sk_write_queue.prev;
659                 struct page *page = pages[poffset / PAGE_SIZE];
660                 int copy, i, can_coalesce;
661                 int offset = poffset % PAGE_SIZE;
662                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
663
664                 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
665 new_segment:
666                         if (!sk_stream_memory_free(sk))
667                                 goto wait_for_sndbuf;
668
669                         skb = sk_stream_alloc_pskb(sk, 0, 0,
670                                                    sk->sk_allocation);
671                         if (!skb)
672                                 goto wait_for_memory;
673
674                         skb_entail(sk, tp, skb);
675                         copy = mss_now;
676                 }
677
678                 if (copy > size)
679                         copy = size;
680
681                 i = skb_shinfo(skb)->nr_frags;
682                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
683                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
684                         tcp_mark_push(tp, skb);
685                         goto new_segment;
686                 }
687                 if (sk->sk_forward_alloc < copy &&
688                     !sk_stream_mem_schedule(sk, copy, 0))
689                         goto wait_for_memory;
690                 
691                 if (can_coalesce) {
692                         skb_shinfo(skb)->frags[i - 1].size += copy;
693                 } else {
694                         get_page(page);
695                         skb_fill_page_desc(skb, i, page, offset, copy);
696                 }
697
698                 skb->len += copy;
699                 skb->data_len += copy;
700                 skb->truesize += copy;
701                 sk->sk_wmem_queued += copy;
702                 sk->sk_forward_alloc -= copy;
703                 skb->ip_summed = CHECKSUM_HW;
704                 tp->write_seq += copy;
705                 TCP_SKB_CB(skb)->end_seq += copy;
706                 skb_shinfo(skb)->tso_segs = 0;
707
708                 if (!copied)
709                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
710
711                 copied += copy;
712                 poffset += copy;
713                 if (!(psize -= copy))
714                         goto out;
715
716                 if (skb->len != mss_now || (flags & MSG_OOB))
717                         continue;
718
719                 if (forced_push(tp)) {
720                         tcp_mark_push(tp, skb);
721                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
722                 } else if (skb == sk->sk_send_head)
723                         tcp_push_one(sk, mss_now);
724                 continue;
725
726 wait_for_sndbuf:
727                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
728 wait_for_memory:
729                 if (copied)
730                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
731
732                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
733                         goto do_error;
734
735                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
736         }
737
738 out:
739         if (copied)
740                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
741         return copied;
742
743 do_error:
744         if (copied)
745                 goto out;
746 out_err:
747         return sk_stream_error(sk, flags, err);
748 }
749
750 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
751                      size_t size, int flags)
752 {
753         ssize_t res;
754         struct sock *sk = sock->sk;
755
756 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
757
758         if (!(sk->sk_route_caps & NETIF_F_SG) ||
759             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
760                 return sock_no_sendpage(sock, page, offset, size, flags);
761
762 #undef TCP_ZC_CSUM_FLAGS
763
764         lock_sock(sk);
765         TCP_CHECK_TIMER(sk);
766         res = do_tcp_sendpages(sk, &page, offset, size, flags);
767         TCP_CHECK_TIMER(sk);
768         release_sock(sk);
769         return res;
770 }
771
772 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
773 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
774
775 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
776 {
777         int tmp = tp->mss_cache_std;
778
779         if (sk->sk_route_caps & NETIF_F_SG) {
780                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
781
782                 if (tmp >= pgbreak &&
783                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
784                         tmp = pgbreak;
785         }
786         return tmp;
787 }
788
789 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
790                 size_t size)
791 {
792         struct iovec *iov;
793         struct tcp_sock *tp = tcp_sk(sk);
794         struct sk_buff *skb;
795         int iovlen, flags;
796         int mss_now;
797         int err, copied;
798         long timeo;
799
800         lock_sock(sk);
801         TCP_CHECK_TIMER(sk);
802
803         flags = msg->msg_flags;
804         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
805
806         /* Wait for a connection to finish. */
807         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
808                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
809                         goto out_err;
810
811         /* This should be in poll */
812         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
813
814         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
815
816         /* Ok commence sending. */
817         iovlen = msg->msg_iovlen;
818         iov = msg->msg_iov;
819         copied = 0;
820
821         err = -EPIPE;
822         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
823                 goto do_error;
824
825         while (--iovlen >= 0) {
826                 int seglen = iov->iov_len;
827                 unsigned char __user *from = iov->iov_base;
828
829                 iov++;
830
831                 while (seglen > 0) {
832                         int copy;
833
834                         skb = sk->sk_write_queue.prev;
835
836                         if (!sk->sk_send_head ||
837                             (copy = mss_now - skb->len) <= 0) {
838
839 new_segment:
840                                 /* Allocate new segment. If the interface is SG,
841                                  * allocate skb fitting to single page.
842                                  */
843                                 if (!sk_stream_memory_free(sk))
844                                         goto wait_for_sndbuf;
845
846                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
847                                                            0, sk->sk_allocation);
848                                 if (!skb)
849                                         goto wait_for_memory;
850
851                                 /*
852                                  * Check whether we can use HW checksum.
853                                  */
854                                 if (sk->sk_route_caps &
855                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
856                                      NETIF_F_HW_CSUM))
857                                         skb->ip_summed = CHECKSUM_HW;
858
859                                 skb_entail(sk, tp, skb);
860                                 copy = mss_now;
861                         }
862
863                         /* Try to append data to the end of skb. */
864                         if (copy > seglen)
865                                 copy = seglen;
866
867                         /* Where to copy to? */
868                         if (skb_tailroom(skb) > 0) {
869                                 /* We have some space in skb head. Superb! */
870                                 if (copy > skb_tailroom(skb))
871                                         copy = skb_tailroom(skb);
872                                 if ((err = skb_add_data(skb, from, copy)) != 0)
873                                         goto do_fault;
874                         } else {
875                                 int merge = 0;
876                                 int i = skb_shinfo(skb)->nr_frags;
877                                 struct page *page = TCP_PAGE(sk);
878                                 int off = TCP_OFF(sk);
879
880                                 if (skb_can_coalesce(skb, i, page, off) &&
881                                     off != PAGE_SIZE) {
882                                         /* We can extend the last page
883                                          * fragment. */
884                                         merge = 1;
885                                 } else if (i == MAX_SKB_FRAGS ||
886                                            (!i &&
887                                            !(sk->sk_route_caps & NETIF_F_SG))) {
888                                         /* Need to add new fragment and cannot
889                                          * do this because interface is non-SG,
890                                          * or because all the page slots are
891                                          * busy. */
892                                         tcp_mark_push(tp, skb);
893                                         goto new_segment;
894                                 } else if (page) {
895                                         /* If page is cached, align
896                                          * offset to L1 cache boundary
897                                          */
898                                         off = (off + L1_CACHE_BYTES - 1) &
899                                               ~(L1_CACHE_BYTES - 1);
900                                         if (off == PAGE_SIZE) {
901                                                 put_page(page);
902                                                 TCP_PAGE(sk) = page = NULL;
903                                         }
904                                 }
905
906                                 if (!page) {
907                                         /* Allocate new cache page. */
908                                         if (!(page = sk_stream_alloc_page(sk)))
909                                                 goto wait_for_memory;
910                                         off = 0;
911                                 }
912
913                                 if (copy > PAGE_SIZE - off)
914                                         copy = PAGE_SIZE - off;
915
916                                 /* Time to copy data. We are close to
917                                  * the end! */
918                                 err = skb_copy_to_page(sk, from, skb, page,
919                                                        off, copy);
920                                 if (err) {
921                                         /* If this page was new, give it to the
922                                          * socket so it does not get leaked.
923                                          */
924                                         if (!TCP_PAGE(sk)) {
925                                                 TCP_PAGE(sk) = page;
926                                                 TCP_OFF(sk) = 0;
927                                         }
928                                         goto do_error;
929                                 }
930
931                                 /* Update the skb. */
932                                 if (merge) {
933                                         skb_shinfo(skb)->frags[i - 1].size +=
934                                                                         copy;
935                                 } else {
936                                         skb_fill_page_desc(skb, i, page, off, copy);
937                                         if (TCP_PAGE(sk)) {
938                                                 get_page(page);
939                                         } else if (off + copy < PAGE_SIZE) {
940                                                 get_page(page);
941                                                 TCP_PAGE(sk) = page;
942                                         }
943                                 }
944
945                                 TCP_OFF(sk) = off + copy;
946                         }
947
948                         if (!copied)
949                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
950
951                         tp->write_seq += copy;
952                         TCP_SKB_CB(skb)->end_seq += copy;
953                         skb_shinfo(skb)->tso_segs = 0;
954
955                         from += copy;
956                         copied += copy;
957                         if ((seglen -= copy) == 0 && iovlen == 0)
958                                 goto out;
959
960                         if (skb->len != mss_now || (flags & MSG_OOB))
961                                 continue;
962
963                         if (forced_push(tp)) {
964                                 tcp_mark_push(tp, skb);
965                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
966                         } else if (skb == sk->sk_send_head)
967                                 tcp_push_one(sk, mss_now);
968                         continue;
969
970 wait_for_sndbuf:
971                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
972 wait_for_memory:
973                         if (copied)
974                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
975
976                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
977                                 goto do_error;
978
979                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
980                 }
981         }
982
983 out:
984         if (copied)
985                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
986         TCP_CHECK_TIMER(sk);
987         release_sock(sk);
988         return copied;
989
990 do_fault:
991         if (!skb->len) {
992                 if (sk->sk_send_head == skb)
993                         sk->sk_send_head = NULL;
994                 __skb_unlink(skb, skb->list);
995                 sk_stream_free_skb(sk, skb);
996         }
997
998 do_error:
999         if (copied)
1000                 goto out;
1001 out_err:
1002         err = sk_stream_error(sk, flags, err);
1003         TCP_CHECK_TIMER(sk);
1004         release_sock(sk);
1005         return err;
1006 }
1007
1008 /*
1009  *      Handle reading urgent data. BSD has very simple semantics for
1010  *      this, no blocking and very strange errors 8)
1011  */
1012
1013 static int tcp_recv_urg(struct sock *sk, long timeo,
1014                         struct msghdr *msg, int len, int flags,
1015                         int *addr_len)
1016 {
1017         struct tcp_sock *tp = tcp_sk(sk);
1018
1019         /* No URG data to read. */
1020         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1021             tp->urg_data == TCP_URG_READ)
1022                 return -EINVAL; /* Yes this is right ! */
1023
1024         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1025                 return -ENOTCONN;
1026
1027         if (tp->urg_data & TCP_URG_VALID) {
1028                 int err = 0;
1029                 char c = tp->urg_data;
1030
1031                 if (!(flags & MSG_PEEK))
1032                         tp->urg_data = TCP_URG_READ;
1033
1034                 /* Read urgent data. */
1035                 msg->msg_flags |= MSG_OOB;
1036
1037                 if (len > 0) {
1038                         if (!(flags & MSG_TRUNC))
1039                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1040                         len = 1;
1041                 } else
1042                         msg->msg_flags |= MSG_TRUNC;
1043
1044                 return err ? -EFAULT : len;
1045         }
1046
1047         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1048                 return 0;
1049
1050         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1051          * the available implementations agree in this case:
1052          * this call should never block, independent of the
1053          * blocking state of the socket.
1054          * Mike <pall@rz.uni-karlsruhe.de>
1055          */
1056         return -EAGAIN;
1057 }
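
/*
 * A user-space sketch of the BSD-style OOB semantics implemented above:
 * recv(..., MSG_OOB) never blocks, reads at most one byte, and fails with
 * errno set (typically EINVAL or EAGAIN) when there is no urgent byte to
 * deliver; with SO_OOBINLINE set the byte arrives in the normal stream
 * instead.  The helper name is arbitrary; kept out of the build with #if 0.
 */
#if 0
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>

static int read_urgent_byte(int fd, char *byte, int *at_mark)
{
	ioctl(fd, SIOCATMARK, at_mark);		/* reading position at the mark? */

	/* Returns 1 on success, -1 with errno set if no OOB byte is ready. */
	return recv(fd, byte, 1, MSG_OOB);
}
#endif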
1058
1059 /* Clean up the receive buffer for full frames taken by the user,
1060  * then send an ACK if necessary.  COPIED is the number of bytes
1061  * tcp_recvmsg has given to the user so far, it speeds up the
1062  * calculation of whether or not we must ACK for the sake of
1063  * a window update.
1064  */
1065 void cleanup_rbuf(struct sock *sk, int copied)
1066 {
1067         struct tcp_sock *tp = tcp_sk(sk);
1068         int time_to_ack = 0;
1069
1070 #if TCP_DEBUG
1071         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1072
1073         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1074 #endif
1075
1076         if (tcp_ack_scheduled(tp)) {
1077                    /* Delayed ACKs frequently hit locked sockets during bulk
1078                     * receive. */
1079                 if (tp->ack.blocked ||
1080                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1081                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1082                     /*
1083                      * If this read emptied the read buffer, we send an ACK if
1084                      * the connection is not bidirectional, the user drained
1085                      * the receive buffer and there was a small segment
1086                      * in the queue.
1087                      */
1088                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1089                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1090                         time_to_ack = 1;
1091         }
1092
1093         /* We send an ACK if we can now advertise a non-zero window
1094          * which has been raised "significantly".
1095          *
1096          * Even if window raised up to infinity, do not send window open ACK
1097          * in states, where we will not receive more. It is useless.
1098          */
1099         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1100                 __u32 rcv_window_now = tcp_receive_window(tp);
1101
1102                 /* Optimize, __tcp_select_window() is not cheap. */
1103                 if (2*rcv_window_now <= tp->window_clamp) {
1104                         __u32 new_window = __tcp_select_window(sk);
1105
1106                         /* Send an ACK now if this read freed lots of space
1107                          * in our buffer. new_window is the window we would
1108                          * advertise now; we can do so if it is not less than the
1109                          * current one. "Lots" means "at least twice" here.
1110                          */
1111                         if (new_window && new_window >= 2 * rcv_window_now)
1112                                 time_to_ack = 1;
1113                 }
1114         }
1115         if (time_to_ack)
1116                 tcp_send_ack(sk);
1117 }
1118
1119 static void tcp_prequeue_process(struct sock *sk)
1120 {
1121         struct sk_buff *skb;
1122         struct tcp_sock *tp = tcp_sk(sk);
1123
1124         NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1125
1126         /* RX process wants to run with disabled BHs, though it is not
1127          * necessary */
1128         local_bh_disable();
1129         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1130                 sk->sk_backlog_rcv(sk, skb);
1131         local_bh_enable();
1132
1133         /* Clear memory counter. */
1134         tp->ucopy.memory = 0;
1135 }
1136
1137 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1138 {
1139         struct sk_buff *skb;
1140         u32 offset;
1141
1142         skb_queue_walk(&sk->sk_receive_queue, skb) {
1143                 offset = seq - TCP_SKB_CB(skb)->seq;
1144                 if (skb->h.th->syn)
1145                         offset--;
1146                 if (offset < skb->len || skb->h.th->fin) {
1147                         *off = offset;
1148                         return skb;
1149                 }
1150         }
1151         return NULL;
1152 }
1153
1154 /*
1155  * This routine provides an alternative to tcp_recvmsg() for routines
1156  * that would like to handle copying from skbuffs directly in 'sendfile'
1157  * fashion.
1158  * Note:
1159  *      - It is assumed that the socket was locked by the caller.
1160  *      - The routine does not block.
1161  *      - At present, there is no support for reading OOB data
1162  *        or for 'peeking' the socket using this routine
1163  *        (although both would be easy to implement).
1164  */
1165 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1166                   sk_read_actor_t recv_actor)
1167 {
1168         struct sk_buff *skb;
1169         struct tcp_sock *tp = tcp_sk(sk);
1170         u32 seq = tp->copied_seq;
1171         u32 offset;
1172         int copied = 0;
1173
1174         if (sk->sk_state == TCP_LISTEN)
1175                 return -ENOTCONN;
1176         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1177                 if (offset < skb->len) {
1178                         size_t used, len;
1179
1180                         len = skb->len - offset;
1181                         /* Stop reading if we hit a patch of urgent data */
1182                         if (tp->urg_data) {
1183                                 u32 urg_offset = tp->urg_seq - seq;
1184                                 if (urg_offset < len)
1185                                         len = urg_offset;
1186                                 if (!len)
1187                                         break;
1188                         }
1189                         used = recv_actor(desc, skb, offset, len);
1190                         if (used <= len) {
1191                                 seq += used;
1192                                 copied += used;
1193                                 offset += used;
1194                         }
1195                         if (offset != skb->len)
1196                                 break;
1197                 }
1198                 if (skb->h.th->fin) {
1199                         sk_eat_skb(sk, skb);
1200                         ++seq;
1201                         break;
1202                 }
1203                 sk_eat_skb(sk, skb);
1204                 if (!desc->count)
1205                         break;
1206         }
1207         tp->copied_seq = seq;
1208
1209         tcp_rcv_space_adjust(sk);
1210
1211         /* Clean up data we have read: This will do ACK frames. */
1212         if (copied)
1213                 cleanup_rbuf(sk, copied);
1214         return copied;
1215 }
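
/*
 * A minimal sketch of a recv_actor callback as consumed by tcp_read_sock()
 * above.  The struct and function names are illustrative only; it copies
 * linear and paged skb data into a caller-supplied buffer hung off
 * desc->arg.data and returns how many bytes it consumed.  Kept out of the
 * build with #if 0.
 */
#if 0
struct example_read_state {
	char	*buf;
	size_t	 len;
};

static int example_recv_actor(read_descriptor_t *desc, struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	struct example_read_state *st = desc->arg.data;
	size_t want = min(len, st->len);

	if (skb_copy_bits(skb, offset, st->buf, want))
		return 0;			/* nothing consumed on error */

	st->buf     += want;
	st->len     -= want;
	desc->count -= want;			/* tcp_read_sock() stops at zero */
	return want;
}
#endif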
1216
1217 /*
1218  *      This routine copies from a sock struct into the user buffer.
1219  *
1220  *      Technical note: in 2.3 we work on _locked_ socket, so that
1221  *      tricks with *seq access order and skb->users are not required.
1222  *      Probably, code can be easily improved even more.
1223  */
1224
1225 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1226                 size_t len, int nonblock, int flags, int *addr_len)
1227 {
1228         struct tcp_sock *tp = tcp_sk(sk);
1229         int copied = 0;
1230         u32 peek_seq;
1231         u32 *seq;
1232         unsigned long used;
1233         int err;
1234         int target;             /* Read at least this many bytes */
1235         long timeo;
1236         struct task_struct *user_recv = NULL;
1237
1238         lock_sock(sk);
1239
1240         TCP_CHECK_TIMER(sk);
1241
1242         err = -ENOTCONN;
1243         if (sk->sk_state == TCP_LISTEN)
1244                 goto out;
1245
1246         timeo = sock_rcvtimeo(sk, nonblock);
1247
1248         /* Urgent data needs to be handled specially. */
1249         if (flags & MSG_OOB)
1250                 goto recv_urg;
1251
1252         seq = &tp->copied_seq;
1253         if (flags & MSG_PEEK) {
1254                 peek_seq = tp->copied_seq;
1255                 seq = &peek_seq;
1256         }
1257
1258         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1259
1260         do {
1261                 struct sk_buff *skb;
1262                 u32 offset;
1263
1264                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1265                 if (tp->urg_data && tp->urg_seq == *seq) {
1266                         if (copied)
1267                                 break;
1268                         if (signal_pending(current)) {
1269                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1270                                 break;
1271                         }
1272                 }
1273
1274                 /* Next get a buffer. */
1275
1276                 skb = skb_peek(&sk->sk_receive_queue);
1277                 do {
1278                         if (!skb)
1279                                 break;
1280
1281                         /* Now that we have two receive queues this
1282                          * shouldn't happen.
1283                          */
1284                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1285                                 printk(KERN_INFO "recvmsg bug: copied %X "
1286                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1287                                 break;
1288                         }
1289                         offset = *seq - TCP_SKB_CB(skb)->seq;
1290                         if (skb->h.th->syn)
1291                                 offset--;
1292                         if (offset < skb->len)
1293                                 goto found_ok_skb;
1294                         if (skb->h.th->fin)
1295                                 goto found_fin_ok;
1296                         BUG_TRAP(flags & MSG_PEEK);
1297                         skb = skb->next;
1298                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1299
1300                 /* Well, if we have backlog, try to process it now. */
1301
1302                 if (copied >= target && !sk->sk_backlog.tail)
1303                         break;
1304
1305                 if (copied) {
1306                         if (sk->sk_err ||
1307                             sk->sk_state == TCP_CLOSE ||
1308                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1309                             !timeo ||
1310                             signal_pending(current) ||
1311                             (flags & MSG_PEEK))
1312                                 break;
1313                 } else {
1314                         if (sock_flag(sk, SOCK_DONE))
1315                                 break;
1316
1317                         if (sk->sk_err) {
1318                                 copied = sock_error(sk);
1319                                 break;
1320                         }
1321
1322                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1323                                 break;
1324
1325                         if (sk->sk_state == TCP_CLOSE) {
1326                                 if (!sock_flag(sk, SOCK_DONE)) {
1327                                         /* This occurs when the user tries to
1328                                          * read from a never-connected socket.
1329                                          */
1330                                         copied = -ENOTCONN;
1331                                         break;
1332                                 }
1333                                 break;
1334                         }
1335
1336                         if (!timeo) {
1337                                 copied = -EAGAIN;
1338                                 break;
1339                         }
1340
1341                         if (signal_pending(current)) {
1342                                 copied = sock_intr_errno(timeo);
1343                                 break;
1344                         }
1345                 }
1346
1347                 cleanup_rbuf(sk, copied);
1348
1349                 if (tp->ucopy.task == user_recv) {
1350                         /* Install new reader */
1351                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1352                                 user_recv = current;
1353                                 tp->ucopy.task = user_recv;
1354                                 tp->ucopy.iov = msg->msg_iov;
1355                         }
1356
1357                         tp->ucopy.len = len;
1358
1359                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1360                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1361
1362                         /* Ugly... If the prequeue is not empty, we have to
1363                          * process it before releasing the socket, otherwise
1364                          * ordering will be broken on the second iteration.
1365                          * A more elegant solution is required!!!
1366                          *
1367                          * Look: we have the following (pseudo)queues:
1368                          *
1369                          * 1. packets in flight
1370                          * 2. backlog
1371                          * 3. prequeue
1372                          * 4. receive_queue
1373                          *
1374                          * Each queue can be processed only if the next ones are
1375                          * empty. At this point we have an empty receive_queue, but
1376                          * the prequeue _can_ be non-empty after the 2nd iteration,
1377                          * when we jumped to the start of the loop because backlog
1378                          * processing added something to the receive_queue.
1379                          * We cannot release_sock(), because the backlog contains
1380                          * packets that arrived _after_ the prequeued ones.
1381                          *
1382                          * In short, the algorithm is clear: process all the queues
1383                          * in order. We could do it more directly, by requeueing
1384                          * packets from the backlog to the prequeue when the latter
1385                          * is not empty. That would be more elegant, but it eats
1386                          * cycles, unfortunately.
1387                          */
1388                         if (skb_queue_len(&tp->ucopy.prequeue))
1389                                 goto do_prequeue;
1390
1391                         /* __ Set realtime policy in scheduler __ */
1392                 }
1393
1394                 if (copied >= target) {
1395                         /* Do not sleep, just process backlog. */
1396                         release_sock(sk);
1397                         lock_sock(sk);
1398                 } else
1399                         sk_wait_data(sk, &timeo);
1400
1401                 if (user_recv) {
1402                         int chunk;
1403
1404                         /* __ Restore normal policy in scheduler __ */
1405
1406                         if ((chunk = len - tp->ucopy.len) != 0) {
1407                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1408                                 len -= chunk;
1409                                 copied += chunk;
1410                         }
1411
1412                         if (tp->rcv_nxt == tp->copied_seq &&
1413                             skb_queue_len(&tp->ucopy.prequeue)) {
1414 do_prequeue:
1415                                 tcp_prequeue_process(sk);
1416
1417                                 if ((chunk = len - tp->ucopy.len) != 0) {
1418                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1419                                         len -= chunk;
1420                                         copied += chunk;
1421                                 }
1422                         }
1423                 }
1424                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1425                         if (net_ratelimit())
1426                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1427                                        current->comm, current->pid);
1428                         peek_seq = tp->copied_seq;
1429                 }
1430                 continue;
1431
1432         found_ok_skb:
1433                 /* Ok so how much can we use? */
1434                 used = skb->len - offset;
1435                 if (len < used)
1436                         used = len;
1437
1438                 /* Do we have urgent data here? */
1439                 if (tp->urg_data) {
1440                         u32 urg_offset = tp->urg_seq - *seq;
1441                         if (urg_offset < used) {
1442                                 if (!urg_offset) {
1443                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1444                                                 ++*seq;
1445                                                 offset++;
1446                                                 used--;
1447                                                 if (!used)
1448                                                         goto skip_copy;
1449                                         }
1450                                 } else
1451                                         used = urg_offset;
1452                         }
1453                 }
1454
1455                 if (!(flags & MSG_TRUNC)) {
1456                         err = skb_copy_datagram_iovec(skb, offset,
1457                                                       msg->msg_iov, used);
1458                         if (err) {
1459                                 /* Exception. Bailout! */
1460                                 if (!copied)
1461                                         copied = -EFAULT;
1462                                 break;
1463                         }
1464                 }
1465
1466                 *seq += used;
1467                 copied += used;
1468                 len -= used;
1469
1470                 tcp_rcv_space_adjust(sk);
1471
1472 skip_copy:
1473                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1474                         tp->urg_data = 0;
1475                         tcp_fast_path_check(sk, tp);
1476                 }
1477                 if (used + offset < skb->len)
1478                         continue;
1479
1480                 if (skb->h.th->fin)
1481                         goto found_fin_ok;
1482                 if (!(flags & MSG_PEEK))
1483                         sk_eat_skb(sk, skb);
1484                 continue;
1485
1486         found_fin_ok:
1487                 /* Process the FIN. */
1488                 ++*seq;
1489                 if (!(flags & MSG_PEEK))
1490                         sk_eat_skb(sk, skb);
1491                 break;
1492         } while (len > 0);
1493
1494         if (user_recv) {
1495                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1496                         int chunk;
1497
1498                         tp->ucopy.len = copied > 0 ? len : 0;
1499
1500                         tcp_prequeue_process(sk);
1501
1502                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1503                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1504                                 len -= chunk;
1505                                 copied += chunk;
1506                         }
1507                 }
1508
1509                 tp->ucopy.task = NULL;
1510                 tp->ucopy.len = 0;
1511         }
1512
1513         /* According to UNIX98, msg_name/msg_namelen are ignored
1514          * on a connected socket. I was just happy when I found this 8) --ANK
1515          */
1516
1517         /* Clean up data we have read: This will do ACK frames. */
1518         cleanup_rbuf(sk, copied);
1519
1520         TCP_CHECK_TIMER(sk);
1521         release_sock(sk);
1522         return copied;
1523
1524 out:
1525         TCP_CHECK_TIMER(sk);
1526         release_sock(sk);
1527         return err;
1528
1529 recv_urg:
1530         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1531         goto out;
1532 }
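
/* Illustrative sketch only (user space, hypothetical helper): MSG_PEEK leaves
 * the data queued (note the copied_seq handling above), while MSG_WAITALL
 * raises the low-water "target" so the call does not return short unless an
 * error, EOF or a signal intervenes.
 *
 *	#include <sys/socket.h>
 *
 *	static ssize_t read_header(int fd, void *hdr, size_t hdrlen)
 *	{
 *		char peek[4];
 *
 *		recv(fd, peek, sizeof(peek), MSG_PEEK);     // data stays queued
 *		return recv(fd, hdr, hdrlen, MSG_WAITALL);  // full hdrlen, or error/EOF
 *	}
 */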
1533
1534 /*
1535  *      State processing on a close. This implements the state shift for
1536  *      sending our FIN frame. Note that we only send a FIN for some
1537  *      states. A shutdown() may have already sent the FIN, or we may be
1538  *      closed.
1539  */
1540
1541 static unsigned char new_state[16] = {
1542   /* current state:        new state:      action:      */
1543   /* (Invalid)          */ TCP_CLOSE,
1544   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1545   /* TCP_SYN_SENT       */ TCP_CLOSE,
1546   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1547   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1548   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1549   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1550   /* TCP_CLOSE          */ TCP_CLOSE,
1551   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1552   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1553   /* TCP_LISTEN         */ TCP_CLOSE,
1554   /* TCP_CLOSING        */ TCP_CLOSING,
1555 };
1556
1557 static int tcp_close_state(struct sock *sk)
1558 {
1559         int next = (int)new_state[sk->sk_state];
1560         int ns = next & TCP_STATE_MASK;
1561
1562         tcp_set_state(sk, ns);
1563
1564         return next & TCP_ACTION_FIN;
1565 }
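
/* For example, new_state[TCP_ESTABLISHED] is TCP_FIN_WAIT1 | TCP_ACTION_FIN,
 * so tcp_close_state() moves the socket to FIN_WAIT1 and returns non-zero,
 * telling the caller to send a FIN:
 *
 *	if (tcp_close_state(sk))	// e.g. ESTABLISHED -> FIN_WAIT1,
 *		tcp_send_fin(sk);	//      CLOSE_WAIT  -> LAST_ACK
 *
 * whereas new_state[TCP_LISTEN] is plain TCP_CLOSE, so no FIN is sent.
 */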
1566
1567 /*
1568  *      Shutdown the sending side of a connection. Much like close except
1569  *      that we don't shut down the receive side or call sock_set_flag(sk, SOCK_DEAD).
1570  */
1571
1572 void tcp_shutdown(struct sock *sk, int how)
1573 {
1574         /*      We need to grab some memory, and put together a FIN,
1575          *      and then put it into the queue to be sent.
1576          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1577          */
1578         if (!(how & SEND_SHUTDOWN))
1579                 return;
1580
1581         /* If we've already sent a FIN, or it's a closed state, skip this. */
1582         if ((1 << sk->sk_state) &
1583             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1584              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1585                 /* Clear out any half completed packets.  FIN if needed. */
1586                 if (tcp_close_state(sk))
1587                         tcp_send_fin(sk);
1588         }
1589 }
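
/* Illustrative sketch only (user space): shutdown(fd, SHUT_WR) reaches this
 * function as SEND_SHUTDOWN and queues a FIN, while the receive side stays
 * open so the peer's remaining data can still be read (half-close).
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void half_close(int fd)
 *	{
 *		char buf[4096];
 *
 *		shutdown(fd, SHUT_WR);			// our FIN goes out
 *		while (read(fd, buf, sizeof(buf)) > 0)
 *			;				// drain until the peer closes
 *		close(fd);
 *	}
 */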
1590
1591 /*
1592  * At this point, there should be no process reference to this
1593  * socket, and thus no user references at all.  Therefore we
1594  * can assume the socket waitqueue is inactive and nobody will
1595  * try to jump onto it.
1596  */
1597 void tcp_destroy_sock(struct sock *sk)
1598 {
1599         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1600         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1601
1602         /* It cannot be in hash table! */
1603         BUG_TRAP(sk_unhashed(sk));
1604
1605         /* If it has a non-zero inet_sk(sk)->num, it must be bound */
1606         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1607
1608         sk->sk_prot->destroy(sk);
1609
1610         sk_stream_kill_queues(sk);
1611
1612         xfrm_sk_free_policy(sk);
1613
1614 #ifdef INET_REFCNT_DEBUG
1615         if (atomic_read(&sk->sk_refcnt) != 1) {
1616                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1617                        sk, atomic_read(&sk->sk_refcnt));
1618         }
1619 #endif
1620
1621         atomic_dec(&tcp_orphan_count);
1622         sock_put(sk);
1623 }
1624
1625 void tcp_close(struct sock *sk, long timeout)
1626 {
1627         struct sk_buff *skb;
1628         int data_was_unread = 0;
1629
1630         lock_sock(sk);
1631         sk->sk_shutdown = SHUTDOWN_MASK;
1632
1633         if (sk->sk_state == TCP_LISTEN) {
1634                 tcp_set_state(sk, TCP_CLOSE);
1635
1636                 /* Special case. */
1637                 tcp_listen_stop(sk);
1638
1639                 goto adjudge_to_death;
1640         }
1641
1642         /*  We need to flush the recv. buffs.  We do this only on the
1643          *  descriptor close, not protocol-sourced closes, because the
1644          *  reader process may not have drained the data yet!
1645          */
1646         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1647                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1648                           skb->h.th->fin;
1649                 data_was_unread += len;
1650                 __kfree_skb(skb);
1651         }
1652
1653         sk_stream_mem_reclaim(sk);
1654
1655         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1656          * 3.10, we send a RST here because data was lost.  To
1657          * witness the awful effects of the old behavior of always
1658          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1659          * a bulk GET in an FTP client, suspend the process, wait
1660          * for the client to advertise a zero window, then kill -9
1661          * the FTP client, wheee...  Note: timeout is always zero
1662          * in such a case.
1663          */
1664         if (data_was_unread) {
1665                 /* Unread data was tossed, zap the connection. */
1666                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1667                 tcp_set_state(sk, TCP_CLOSE);
1668                 tcp_send_active_reset(sk, GFP_KERNEL);
1669         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1670                 /* Check zero linger _after_ checking for unread data. */
1671                 sk->sk_prot->disconnect(sk, 0);
1672                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1673         } else if (tcp_close_state(sk)) {
1674                 /* We FIN if the application ate all the data before
1675                  * zapping the connection.
1676                  */
1677
1678                 /* RED-PEN. Formally speaking, we have broken the TCP
1679                  * state machine. The state transitions:
1680                  *
1681                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1682                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1683                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1684                  *
1685                  * are legal only when the FIN has actually been sent (i.e. is
1686                  * in window) rather than queued out of window. Purists may complain.
1687                  *
1688                  * E.g. the "RFC state" is ESTABLISHED if the Linux state
1689                  * is FIN-WAIT-1 but the FIN has still not been sent.
1690                  *
1691                  * The visible deviations are that we sometimes enter the
1692                  * time-wait state when it is not really required (harmless),
1693                  * and do not send active resets when they are required by
1694                  * the specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, which look
1695                  * like CLOSING or LAST_ACK to Linux).
1696                  * Probably I missed some more small holes.
1697                  *                                              --ANK
1698                  */
1699                 tcp_send_fin(sk);
1700         }
1701
1702         sk_stream_wait_close(sk, timeout);
1703
1704 adjudge_to_death:
1705         /* This is the last release_sock of its life. It will process the backlog. */
1706         release_sock(sk);
1707
1708
1709         /* Now the socket is owned by the kernel and we acquire the BH lock
1710            to finish the close. No need to check for user refs.
1711          */
1712         local_bh_disable();
1713         bh_lock_sock(sk);
1714         BUG_TRAP(!sock_owned_by_user(sk));
1715
1716         sock_hold(sk);
1717         sock_orphan(sk);
1718
1719         /*      This is a (useful) BSD violation of the RFC. There is a
1720          *      problem with TCP as specified, in that the other end could
1721          *      keep a socket open forever with no application left at this end.
1722          *      We use a 3 minute timeout (about the same as BSD) and then kill
1723          *      our end. If they send after that then tough - BUT: long enough
1724          *      that we won't make the old "4*rto == almost no time" whoops-reset
1725          *      mistake.
1726          *
1727          *      Nope, it was not a mistake. It is really the desired behaviour,
1728          *      e.g. on HTTP servers, where such sockets are useless but consume
1729          *      significant resources. Let's do it with the special
1730          *      linger2 option.                                 --ANK
1731          */
1732
1733         if (sk->sk_state == TCP_FIN_WAIT2) {
1734                 struct tcp_sock *tp = tcp_sk(sk);
1735                 if (tp->linger2 < 0) {
1736                         tcp_set_state(sk, TCP_CLOSE);
1737                         tcp_send_active_reset(sk, GFP_ATOMIC);
1738                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1739                 } else {
1740                         int tmo = tcp_fin_time(tp);
1741
1742                         if (tmo > TCP_TIMEWAIT_LEN) {
1743                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1744                         } else {
1745                                 atomic_inc(&tcp_orphan_count);
1746                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1747                                 goto out;
1748                         }
1749                 }
1750         }
1751         if (sk->sk_state != TCP_CLOSE) {
1752                 sk_stream_mem_reclaim(sk);
1753                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1754                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1755                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1756                         if (net_ratelimit())
1757                                 printk(KERN_INFO "TCP: too many orphaned "
1758                                        "sockets\n");
1759                         tcp_set_state(sk, TCP_CLOSE);
1760                         tcp_send_active_reset(sk, GFP_ATOMIC);
1761                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1762                 }
1763         }
1764         atomic_inc(&tcp_orphan_count);
1765
1766         if (sk->sk_state == TCP_CLOSE)
1767                 tcp_destroy_sock(sk);
1768         /* Otherwise, socket is reprieved until protocol close. */
1769
1770 out:
1771         bh_unlock_sock(sk);
1772         local_bh_enable();
1773         sock_put(sk);
1774 }
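
/* Illustrative sketch only (user space): two ways to hit the abortive paths
 * above. Closing with unread data in the receive queue sends a RST, and
 * enabling SO_LINGER with a zero timeout typically does the same via the
 * disconnect path; neither performs the normal FIN handshake.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void abortive_close(int fd)
 *	{
 *		struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *
 *		setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *		close(fd);	// zero linger: disconnect(), peer sees RST
 *	}
 */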
1775
1776 /* These states need RST on ABORT according to RFC793 */
1777
1778 static inline int tcp_need_reset(int state)
1779 {
1780         return (1 << state) &
1781                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1782                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1783 }
1784
1785 int tcp_disconnect(struct sock *sk, int flags)
1786 {
1787         struct inet_sock *inet = inet_sk(sk);
1788         struct tcp_sock *tp = tcp_sk(sk);
1789         int err = 0;
1790         int old_state = sk->sk_state;
1791
1792         if (old_state != TCP_CLOSE)
1793                 tcp_set_state(sk, TCP_CLOSE);
1794
1795         /* ABORT function of RFC793 */
1796         if (old_state == TCP_LISTEN) {
1797                 tcp_listen_stop(sk);
1798         } else if (tcp_need_reset(old_state) ||
1799                    (tp->snd_nxt != tp->write_seq &&
1800                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1801                 /* The last check adjusts for the discrepancy between Linux
1802                  * and the RFC states.
1803                  */
1804                 tcp_send_active_reset(sk, gfp_any());
1805                 sk->sk_err = ECONNRESET;
1806         } else if (old_state == TCP_SYN_SENT)
1807                 sk->sk_err = ECONNRESET;
1808
1809         tcp_clear_xmit_timers(sk);
1810         __skb_queue_purge(&sk->sk_receive_queue);
1811         sk_stream_writequeue_purge(sk);
1812         __skb_queue_purge(&tp->out_of_order_queue);
1813
1814         inet->dport = 0;
1815
1816         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1817                 inet_reset_saddr(sk);
1818
1819         sk->sk_shutdown = 0;
1820         sock_reset_flag(sk, SOCK_DONE);
1821         tp->srtt = 0;
1822         if ((tp->write_seq += tp->max_window + 2) == 0)
1823                 tp->write_seq = 1;
1824         tp->backoff = 0;
1825         tp->snd_cwnd = 2;
1826         tp->probes_out = 0;
1827         tp->packets_out = 0;
1828         tp->snd_ssthresh = 0x7fffffff;
1829         tp->snd_cwnd_cnt = 0;
1830         tcp_set_ca_state(tp, TCP_CA_Open);
1831         tcp_clear_retrans(tp);
1832         tcp_delack_init(tp);
1833         sk->sk_send_head = NULL;
1834         tp->rx_opt.saw_tstamp = 0;
1835         tcp_sack_reset(&tp->rx_opt);
1836         __sk_dst_reset(sk);
1837
1838         BUG_TRAP(!inet->num || tp->bind_hash);
1839
1840         sk->sk_error_report(sk);
1841         return err;
1842 }
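
/* Illustrative sketch only (user space, hypothetical helper): tcp_disconnect()
 * is normally reached when an application dissolves the association by calling
 * connect() with an AF_UNSPEC address, e.g. to reuse the socket after a failed
 * connection attempt.
 *
 *	#include <sys/socket.h>
 *	#include <string.h>
 *
 *	static int tcp_dissolve(int fd)
 *	{
 *		struct sockaddr unspec;
 *
 *		memset(&unspec, 0, sizeof(unspec));
 *		unspec.sa_family = AF_UNSPEC;
 *		return connect(fd, &unspec, sizeof(unspec));
 *	}
 */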
1843
1844 /*
1845  *      Wait for an incoming connection, avoid race
1846  *      conditions. This must be called with the socket locked.
1847  */
1848 static int wait_for_connect(struct sock *sk, long timeo)
1849 {
1850         struct tcp_sock *tp = tcp_sk(sk);
1851         DEFINE_WAIT(wait);
1852         int err;
1853
1854         /*
1855          * True wake-one mechanism for incoming connections: only
1856          * one process gets woken up, not the 'whole herd'.
1857          * Since we do not 'race & poll' for established sockets
1858          * anymore, the common case will execute the loop only once.
1859          *
1860          * Subtle issue: "add_wait_queue_exclusive()" will be added
1861          * after any current non-exclusive waiters, and we know that
1862          * it will always _stay_ after any new non-exclusive waiters
1863          * because all non-exclusive waiters are added at the
1864          * beginning of the wait-queue. As such, it's ok to "drop"
1865          * our exclusiveness temporarily when we get woken up without
1866          * having to remove and re-insert us on the wait queue.
1867          */
1868         for (;;) {
1869                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1870                                           TASK_INTERRUPTIBLE);
1871                 release_sock(sk);
1872                 if (!tp->accept_queue)
1873                         timeo = schedule_timeout(timeo);
1874                 lock_sock(sk);
1875                 err = 0;
1876                 if (tp->accept_queue)
1877                         break;
1878                 err = -EINVAL;
1879                 if (sk->sk_state != TCP_LISTEN)
1880                         break;
1881                 err = sock_intr_errno(timeo);
1882                 if (signal_pending(current))
1883                         break;
1884                 err = -EAGAIN;
1885                 if (!timeo)
1886                         break;
1887         }
1888         finish_wait(sk->sk_sleep, &wait);
1889         return err;
1890 }
1891
1892 /*
1893  *      This will accept the next outstanding connection.
1894  */
1895
1896 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1897 {
1898         struct tcp_sock *tp = tcp_sk(sk);
1899         struct open_request *req;
1900         struct sock *newsk;
1901         int error;
1902
1903         lock_sock(sk);
1904
1905         /* We need to make sure that this socket is listening,
1906          * and that it has something pending.
1907          */
1908         error = -EINVAL;
1909         if (sk->sk_state != TCP_LISTEN)
1910                 goto out;
1911
1912         /* Find already established connection */
1913         if (!tp->accept_queue) {
1914                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1915                 /* If this is a non-blocking socket, don't sleep */
1916                 error = -EAGAIN;
1917                 if (!timeo)
1918                         goto out;
1919
1920                 error = wait_for_connect(sk, timeo);
1921                 if (error)
1922                         goto out;
1923         }
1924
1925         req = tp->accept_queue;
1926         if ((tp->accept_queue = req->dl_next) == NULL)
1927                 tp->accept_queue_tail = NULL;
1928         newsk = req->sk;
1929         sk_acceptq_removed(sk);
1930         tcp_openreq_fastfree(req);
1931         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1932         release_sock(sk);
1933         return newsk;
1934
1935 out:
1936         release_sock(sk);
1937         *err = error;
1938         return NULL;
1939 }
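
/* Illustrative sketch only (user space): the -EAGAIN return above is what a
 * non-blocking listener sees when the accept queue is empty; a typical loop
 * waits for readability and retries.
 *
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *	#include <poll.h>
 *
 *	static int accept_one(int lfd)
 *	{
 *		struct pollfd pfd = { .fd = lfd, .events = POLLIN };
 *		int cfd;
 *
 *		for (;;) {
 *			cfd = accept(lfd, NULL, NULL);
 *			if (cfd >= 0 || errno != EAGAIN)
 *				return cfd;
 *			poll(&pfd, 1, -1);	// wait for a pending connection
 *		}
 *	}
 */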
1940
1941
1942 /*
1943  *      Socket option code for TCP.
1944  */
1945 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1946                    int optlen)
1947 {
1948         struct tcp_sock *tp = tcp_sk(sk);
1949         int val;
1950         int err = 0;
1951
1952         if (level != SOL_TCP)
1953                 return tp->af_specific->setsockopt(sk, level, optname,
1954                                                    optval, optlen);
1955
1956         if (optlen < sizeof(int))
1957                 return -EINVAL;
1958
1959         if (get_user(val, (int __user *)optval))
1960                 return -EFAULT;
1961
1962         lock_sock(sk);
1963
1964         switch (optname) {
1965         case TCP_MAXSEG:
1966                 /* Values greater than the interface MTU won't take effect. However,
1967                  * at the point when this call is done we typically don't yet
1968                  * know which interface is going to be used. */
1969                 if (val < 8 || val > MAX_TCP_WINDOW) {
1970                         err = -EINVAL;
1971                         break;
1972                 }
1973                 tp->rx_opt.user_mss = val;
1974                 break;
1975
1976         case TCP_NODELAY:
1977                 if (val) {
1978                         /* TCP_NODELAY is weaker than TCP_CORK, so that
1979                          * this option on corked socket is remembered, but
1980                          * it is not activated until cork is cleared.
1981                          *
1982                          * However, when TCP_NODELAY is set we make
1983                          * an explicit push, which overrides even TCP_CORK
1984                          * for currently queued segments.
1985                          */
1986                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1987                         tcp_push_pending_frames(sk, tp);
1988                 } else {
1989                         tp->nonagle &= ~TCP_NAGLE_OFF;
1990                 }
1991                 break;
1992
1993         case TCP_CORK:
1994                 /* When set, this option makes us always queue non-full frames.
1995                  * Later the user clears this option and we transmit
1996                  * any pending partial frames in the queue.  This is
1997                  * meant to be used alongside sendfile() to get properly
1998                  * filled frames when the user (for example) must write
1999                  * out headers with a write() call first and then use
2000                  * sendfile to send out the data parts.
2001                  *
2002                  * TCP_CORK can be set together with TCP_NODELAY and it is
2003                  * stronger than TCP_NODELAY.
2004                  */
2005                 if (val) {
2006                         tp->nonagle |= TCP_NAGLE_CORK;
2007                 } else {
2008                         tp->nonagle &= ~TCP_NAGLE_CORK;
2009                         if (tp->nonagle&TCP_NAGLE_OFF)
2010                                 tp->nonagle |= TCP_NAGLE_PUSH;
2011                         tcp_push_pending_frames(sk, tp);
2012                 }
2013                 break;
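        /* Illustrative sketch only (user space, hypothetical helper): the
         * header-plus-sendfile pattern described above.
         *
         *	#include <netinet/tcp.h>
         *	#include <sys/sendfile.h>
         *	#include <sys/socket.h>
         *	#include <unistd.h>
         *
         *	static void send_response(int sock, int filefd, const char *hdr,
         *				  size_t hdrlen, size_t filelen)
         *	{
         *		int on = 1, off = 0;
         *
         *		setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
         *		write(sock, hdr, hdrlen);		// queued, not pushed yet
         *		sendfile(sock, filefd, NULL, filelen);	// body in full frames
         *		setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
         *	}
         */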
2014
2015         case TCP_KEEPIDLE:
2016                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2017                         err = -EINVAL;
2018                 else {
2019                         tp->keepalive_time = val * HZ;
2020                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2021                             !((1 << sk->sk_state) &
2022                               (TCPF_CLOSE | TCPF_LISTEN))) {
2023                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2024                                 if (tp->keepalive_time > elapsed)
2025                                         elapsed = tp->keepalive_time - elapsed;
2026                                 else
2027                                         elapsed = 0;
2028                                 tcp_reset_keepalive_timer(sk, elapsed);
2029                         }
2030                 }
2031                 break;
2032         case TCP_KEEPINTVL:
2033                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2034                         err = -EINVAL;
2035                 else
2036                         tp->keepalive_intvl = val * HZ;
2037                 break;
2038         case TCP_KEEPCNT:
2039                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2040                         err = -EINVAL;
2041                 else
2042                         tp->keepalive_probes = val;
2043                 break;
2044         case TCP_SYNCNT:
2045                 if (val < 1 || val > MAX_TCP_SYNCNT)
2046                         err = -EINVAL;
2047                 else
2048                         tp->syn_retries = val;
2049                 break;
2050
2051         case TCP_LINGER2:
2052                 if (val < 0)
2053                         tp->linger2 = -1;
2054                 else if (val > sysctl_tcp_fin_timeout / HZ)
2055                         tp->linger2 = 0;
2056                 else
2057                         tp->linger2 = val * HZ;
2058                 break;
2059
2060         case TCP_DEFER_ACCEPT:
2061                 tp->defer_accept = 0;
2062                 if (val > 0) {
2063                         /* Translate value in seconds to number of
2064                          * retransmits */
2065                         while (tp->defer_accept < 32 &&
2066                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2067                                        tp->defer_accept))
2068                                 tp->defer_accept++;
2069                         tp->defer_accept++;
2070                 }
2071                 break;
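        /* Worked example (assuming TCP_TIMEOUT_INIT is 3 seconds): a request
         * of 10 seconds stops the loop at defer_accept == 2 (since 10 is not
         * greater than 3 << 2 == 12) and the final increment stores 3;
         * getsockopt() later reports it back as (3 << (3 - 1)) == 12 seconds,
         * i.e. the value is rounded up to a whole number of SYN-ACK
         * retransmission intervals.
         */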
2072
2073         case TCP_WINDOW_CLAMP:
2074                 if (!val) {
2075                         if (sk->sk_state != TCP_CLOSE) {
2076                                 err = -EINVAL;
2077                                 break;
2078                         }
2079                         tp->window_clamp = 0;
2080                 } else
2081                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2082                                                 SOCK_MIN_RCVBUF / 2 : val;
2083                 break;
2084
2085         case TCP_QUICKACK:
2086                 if (!val) {
2087                         tp->ack.pingpong = 1;
2088                 } else {
2089                         tp->ack.pingpong = 0;
2090                         if ((1 << sk->sk_state) &
2091                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2092                             tcp_ack_scheduled(tp)) {
2093                                 tp->ack.pending |= TCP_ACK_PUSHED;
2094                                 cleanup_rbuf(sk, 1);
2095                                 if (!(val & 1))
2096                                         tp->ack.pingpong = 1;
2097                         }
2098                 }
2099                 break;
2100
2101         default:
2102                 err = -ENOPROTOOPT;
2103                 break;
2104         }
2105         release_sock(sk);
2106         return err;
2107 }
2108
2109 /* Return information about state of tcp endpoint in API format. */
2110 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2111 {
2112         struct tcp_sock *tp = tcp_sk(sk);
2113         u32 now = tcp_time_stamp;
2114
2115         memset(info, 0, sizeof(*info));
2116
2117         info->tcpi_state = sk->sk_state;
2118         info->tcpi_ca_state = tp->ca_state;
2119         info->tcpi_retransmits = tp->retransmits;
2120         info->tcpi_probes = tp->probes_out;
2121         info->tcpi_backoff = tp->backoff;
2122
2123         if (tp->rx_opt.tstamp_ok)
2124                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2125         if (tp->rx_opt.sack_ok)
2126                 info->tcpi_options |= TCPI_OPT_SACK;
2127         if (tp->rx_opt.wscale_ok) {
2128                 info->tcpi_options |= TCPI_OPT_WSCALE;
2129                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2130                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2131         } 
2132
2133         if (tp->ecn_flags&TCP_ECN_OK)
2134                 info->tcpi_options |= TCPI_OPT_ECN;
2135
2136         info->tcpi_rto = jiffies_to_usecs(tp->rto);
2137         info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2138         info->tcpi_snd_mss = tp->mss_cache_std;
2139         info->tcpi_rcv_mss = tp->ack.rcv_mss;
2140
2141         info->tcpi_unacked = tp->packets_out;
2142         info->tcpi_sacked = tp->sacked_out;
2143         info->tcpi_lost = tp->lost_out;
2144         info->tcpi_retrans = tp->retrans_out;
2145         info->tcpi_fackets = tp->fackets_out;
2146
2147         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2148         info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2149         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2150
2151         info->tcpi_pmtu = tp->pmtu_cookie;
2152         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2153         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2154         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2155         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2156         info->tcpi_snd_cwnd = tp->snd_cwnd;
2157         info->tcpi_advmss = tp->advmss;
2158         info->tcpi_reordering = tp->reordering;
2159
2160         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2161         info->tcpi_rcv_space = tp->rcvq_space.space;
2162
2163         info->tcpi_total_retrans = tp->total_retrans;
2164 }
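
/* Illustrative sketch only (user space): a typical consumer of the TCP_INFO
 * getsockopt, which returns the structure filled in above; RTT values are
 * reported in microseconds.
 *
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *	#include <stdio.h>
 *
 *	static void print_tcp_info(int fd)
 *	{
 *		struct tcp_info info;
 *		socklen_t len = sizeof(info);
 *
 *		if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
 *			printf("srtt %u us, cwnd %u\n",
 *			       info.tcpi_rtt, info.tcpi_snd_cwnd);
 *	}
 */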
2165
2166 EXPORT_SYMBOL_GPL(tcp_get_info);
2167
2168 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2169                    int __user *optlen)
2170 {
2171         struct tcp_sock *tp = tcp_sk(sk);
2172         int val, len;
2173
2174         if (level != SOL_TCP)
2175                 return tp->af_specific->getsockopt(sk, level, optname,
2176                                                    optval, optlen);
2177
2178         if (get_user(len, optlen))
2179                 return -EFAULT;
2180
2181         len = min_t(unsigned int, len, sizeof(int));
2182
2183         if (len < 0)
2184                 return -EINVAL;
2185
2186         switch (optname) {
2187         case TCP_MAXSEG:
2188                 val = tp->mss_cache_std;
2189                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2190                         val = tp->rx_opt.user_mss;
2191                 break;
2192         case TCP_NODELAY:
2193                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2194                 break;
2195         case TCP_CORK:
2196                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2197                 break;
2198         case TCP_KEEPIDLE:
2199                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2200                 break;
2201         case TCP_KEEPINTVL:
2202                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2203                 break;
2204         case TCP_KEEPCNT:
2205                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2206                 break;
2207         case TCP_SYNCNT:
2208                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2209                 break;
2210         case TCP_LINGER2:
2211                 val = tp->linger2;
2212                 if (val >= 0)
2213                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2214                 break;
2215         case TCP_DEFER_ACCEPT:
2216                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2217                                                (tp->defer_accept - 1));
2218                 break;
2219         case TCP_WINDOW_CLAMP:
2220                 val = tp->window_clamp;
2221                 break;
2222         case TCP_INFO: {
2223                 struct tcp_info info;
2224
2225                 if (get_user(len, optlen))
2226                         return -EFAULT;
2227
2228                 tcp_get_info(sk, &info);
2229
2230                 len = min_t(unsigned int, len, sizeof(info));
2231                 if (put_user(len, optlen))
2232                         return -EFAULT;
2233                 if (copy_to_user(optval, &info, len))
2234                         return -EFAULT;
2235                 return 0;
2236         }
2237         case TCP_QUICKACK:
2238                 val = !tp->ack.pingpong;
2239                 break;
2240
2241         default:
2242                 return -ENOPROTOOPT;
2243         }
2244
2245         if (put_user(len, optlen))
2246                 return -EFAULT;
2247         if (copy_to_user(optval, &val, len))
2248                 return -EFAULT;
2249         return 0;
2250 }
2251
2252
2253 extern void __skb_cb_too_small_for_tcp(int, int);
2254 extern void tcpdiag_init(void);
2255
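/* "thash_entries=N" on the kernel command line overrides the automatic sizing
 * of the established-connection hash table allocated in tcp_init() below.
 */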
2256 static __initdata unsigned long thash_entries;
2257 static int __init set_thash_entries(char *str)
2258 {
2259         if (!str)
2260                 return 0;
2261         thash_entries = simple_strtoul(str, &str, 0);
2262         return 1;
2263 }
2264 __setup("thash_entries=", set_thash_entries);
2265
2266 void __init tcp_init(void)
2267 {
2268         struct sk_buff *skb = NULL;
2269         int order, i;
2270
2271         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2272                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2273                                            sizeof(skb->cb));
2274
2275         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2276                                                    sizeof(struct open_request),
2277                                                0, SLAB_HWCACHE_ALIGN,
2278                                                NULL, NULL);
2279         if (!tcp_openreq_cachep)
2280                 panic("tcp_init: Cannot alloc open_request cache.");
2281
2282         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2283                                               sizeof(struct tcp_bind_bucket),
2284                                               0, SLAB_HWCACHE_ALIGN,
2285                                               NULL, NULL);
2286         if (!tcp_bucket_cachep)
2287                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2288
2289         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2290                                                 sizeof(struct tcp_tw_bucket),
2291                                                 0, SLAB_HWCACHE_ALIGN,
2292                                                 NULL, NULL);
2293         if (!tcp_timewait_cachep)
2294                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2295
2296         /* Size and allocate the main established and bind bucket
2297          * hash tables.
2298          *
2299          * The methodology is similar to that of the buffer cache.
2300          */
2301         tcp_ehash = (struct tcp_ehash_bucket *)
2302                 alloc_large_system_hash("TCP established",
2303                                         sizeof(struct tcp_ehash_bucket),
2304                                         thash_entries,
2305                                         (num_physpages >= 128 * 1024) ?
2306                                                 (25 - PAGE_SHIFT) :
2307                                                 (27 - PAGE_SHIFT),
2308                                         HASH_HIGHMEM,
2309                                         &tcp_ehash_size,
2310                                         NULL,
2311                                         0);
2312         tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2313         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2314                 rwlock_init(&tcp_ehash[i].lock);
2315                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2316         }
2317
2318         tcp_bhash = (struct tcp_bind_hashbucket *)
2319                 alloc_large_system_hash("TCP bind",
2320                                         sizeof(struct tcp_bind_hashbucket),
2321                                         tcp_ehash_size,
2322                                         (num_physpages >= 128 * 1024) ?
2323                                                 (25 - PAGE_SHIFT) :
2324                                                 (27 - PAGE_SHIFT),
2325                                         HASH_HIGHMEM,
2326                                         &tcp_bhash_size,
2327                                         NULL,
2328                                         64 * 1024);
2329         tcp_bhash_size = 1 << tcp_bhash_size;
2330         for (i = 0; i < tcp_bhash_size; i++) {
2331                 spin_lock_init(&tcp_bhash[i].lock);
2332                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2333         }
2334
2335         /* Try to be a bit smarter and adjust defaults depending
2336          * on available memory.
2337          */
2338         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2339                         (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2340                         order++)
2341                 ;
2342         if (order >= 4) {
2343                 sysctl_local_port_range[0] = 32768;
2344                 sysctl_local_port_range[1] = 61000;
2345                 sysctl_tcp_max_tw_buckets = 180000;
2346                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2347                 sysctl_max_syn_backlog = 1024;
2348         } else if (order < 3) {
2349                 sysctl_local_port_range[0] = 1024 * (3 - order);
2350                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2351                 sysctl_tcp_max_orphans >>= (3 - order);
2352                 sysctl_max_syn_backlog = 128;
2353         }
2354         tcp_port_rover = sysctl_local_port_range[0] - 1;
2355
2356         sysctl_tcp_mem[0] =  768 << order;
2357         sysctl_tcp_mem[1] = 1024 << order;
2358         sysctl_tcp_mem[2] = 1536 << order;
2359
2360         if (order < 3) {
2361                 sysctl_tcp_wmem[2] = 64 * 1024;
2362                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2363                 sysctl_tcp_rmem[1] = 43689;
2364                 sysctl_tcp_rmem[2] = 2 * 43689;
2365         }
2366
2367         printk(KERN_INFO "TCP: Hash tables configured "
2368                "(established %d bind %d)\n",
2369                tcp_ehash_size << 1, tcp_bhash_size);
2370 }
2371
2372 EXPORT_SYMBOL(tcp_accept);
2373 EXPORT_SYMBOL(tcp_close);
2374 EXPORT_SYMBOL(tcp_destroy_sock);
2375 EXPORT_SYMBOL(tcp_disconnect);
2376 EXPORT_SYMBOL(tcp_getsockopt);
2377 EXPORT_SYMBOL(tcp_ioctl);
2378 EXPORT_SYMBOL(tcp_openreq_cachep);
2379 EXPORT_SYMBOL(tcp_poll);
2380 EXPORT_SYMBOL(tcp_read_sock);
2381 EXPORT_SYMBOL(tcp_recvmsg);
2382 EXPORT_SYMBOL(tcp_sendmsg);
2383 EXPORT_SYMBOL(tcp_sendpage);
2384 EXPORT_SYMBOL(tcp_setsockopt);
2385 EXPORT_SYMBOL(tcp_shutdown);
2386 EXPORT_SYMBOL(tcp_statistics);
2387 EXPORT_SYMBOL(tcp_timewait_cachep);
2388 EXPORT_SYMBOL_GPL(cleanup_rbuf);