net/ipv4/tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken;
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive, otherwise odd bits of prattle
47  *                                      still escape
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFCs. For other useful protocol
138  *                                      references see Comer and KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between the specifications and how BSD
141  *                                      works, see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up with retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries get used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or (at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
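/*
 * For reference, the usual progressions through the states above
 * (per RFC 793) are:
 *
 *      active open:    CLOSE -> SYN_SENT -> ESTABLISHED
 *      passive open:   LISTEN -> SYN_RECV -> ESTABLISHED
 *      active close:   ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE
 *      passive close:  ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE
 *
 * A simultaneous close passes through CLOSING (instead of FIN_WAIT2)
 * on its way to TIME_WAIT.
 */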
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265
266
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274 kmem_cache_t *tcp_openreq_cachep;
275 kmem_cache_t *tcp_bucket_cachep;
276 kmem_cache_t *tcp_timewait_cachep;
277
278 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
279
280 int sysctl_tcp_mem[3];
281 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
282 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
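/* The triples follow the usual convention: tcp_mem is { low, pressure, high }
 * measured in pages of memory, while tcp_wmem and tcp_rmem are
 * { min, default, max } in bytes per socket.
 */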
283
284 EXPORT_SYMBOL(sysctl_tcp_mem);
285 EXPORT_SYMBOL(sysctl_tcp_rmem);
286 EXPORT_SYMBOL(sysctl_tcp_wmem);
287
288 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
289 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
290
291 EXPORT_SYMBOL(tcp_memory_allocated);
292 EXPORT_SYMBOL(tcp_sockets_allocated);
293
294 /*
295  * Pressure flag: try to collapse.
296  * Technical note: it is used by multiple contexts non-atomically.
297  * All of sk_stream_mem_schedule() is of this nature: accounting
298  * is strict, actions are advisory and have some latency.
299  */
300 int tcp_memory_pressure;
301
302 EXPORT_SYMBOL(tcp_memory_pressure);
303
304 void tcp_enter_memory_pressure(void)
305 {
306         if (!tcp_memory_pressure) {
307                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
308                 tcp_memory_pressure = 1;
309         }
310 }
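/* The pressure flag is cleared again by the generic stream memory accounting
 * (see sk_stream_mem_reclaim) once total TCP memory use drops back below
 * sysctl_tcp_mem[0].
 */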
311
312 EXPORT_SYMBOL(tcp_enter_memory_pressure);
313
314 /*
315  * LISTEN is a special case for poll().
316  */
317 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
318                                                poll_table *wait)
319 {
320         return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
321 }
322
323 /*
324  *      Wait for a TCP event.
325  *
326  *      Note that we don't need to lock the socket, as the upper poll layers
327  *      take care of normal races (between the test and the event) and we don't
328  *      go look at any of the socket buffers directly.
329  */
330 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
331 {
332         unsigned int mask;
333         struct sock *sk = sock->sk;
334         struct tcp_sock *tp = tcp_sk(sk);
335
336         poll_wait(file, sk->sk_sleep, wait);
337         if (sk->sk_state == TCP_LISTEN)
338                 return tcp_listen_poll(sk, wait);
339
340         /* Socket is not locked. We are protected from async events
341            by the poll logic, and correct handling of state changes
342            made by other threads is impossible in any case.
343          */
344
345         mask = 0;
346         if (sk->sk_err)
347                 mask = POLLERR;
348
349         /*
350          * POLLHUP is certainly not done right. But poll() doesn't
351          * have a notion of HUP in just one direction, and for a
352          * socket the read side is more interesting.
353          *
354          * Some poll() documentation says that POLLHUP is incompatible
355          * with the POLLOUT/POLLWR flags, so somebody should check all of
356          * this. But be careful: it tends to be safer to return too many
357          * bits than too few, and you can easily break real applications
358          * if you don't tell them that something has hung up!
359          *
360          * Check-me.
361          *
362          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
363          * our fs/select.c). It means that after we received EOF,
364          * poll always returns immediately, making poll() on write() impossible
365          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
366          * if and only if shutdown has been made in both directions.
367          * Actually, it is interesting to look at how Solaris and DUX
368          * solve this dilemma. I would prefer, if POLLHUP were maskable,
369          * then we could set it on SND_SHUTDOWN. BTW examples given
370          * in Stevens' books assume exactly this behaviour, which explains
371          * why POLLHUP is incompatible with POLLOUT.    --ANK
372          *
373          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
374          * blocking on fresh not-connected or disconnected socket. --ANK
375          */
376         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
377                 mask |= POLLHUP;
378         if (sk->sk_shutdown & RCV_SHUTDOWN)
379                 mask |= POLLIN | POLLRDNORM;
380
381         /* Connected? */
382         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
383                 /* Potential race condition. If the read of tp below is
384                  * reordered above the sk->sk_state check, we can be illegally
385                  * awakened in SYN_* states. */
386                 if ((tp->rcv_nxt != tp->copied_seq) &&
387                     (tp->urg_seq != tp->copied_seq ||
388                      tp->rcv_nxt != tp->copied_seq + 1 ||
389                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
390                         mask |= POLLIN | POLLRDNORM;
391
392                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
393                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
394                                 mask |= POLLOUT | POLLWRNORM;
395                         } else {  /* send SIGIO later */
396                                 set_bit(SOCK_ASYNC_NOSPACE,
397                                         &sk->sk_socket->flags);
398                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
399
400                                 /* Race breaker. If space is freed after
401                                  * wspace test but before the flags are set,
402                                  * IO signal will be lost.
403                                  */
404                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
405                                         mask |= POLLOUT | POLLWRNORM;
406                         }
407                 }
408
409                 if (tp->urg_data & TCP_URG_VALID)
410                         mask |= POLLPRI;
411         }
412         return mask;
413 }
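/* Illustrative userspace view (not kernel code): the mask computed above
 * means a poll() caller sees POLLIN once data or a FIN is queued, POLLOUT
 * while send buffer space is available, POLLPRI for urgent data, and
 * POLLHUP only after both directions have shut down:
 *
 *      struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };
 *      int n = poll(&pfd, 1, timeout_ms);
 */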
414
415 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
416 {
417         struct tcp_sock *tp = tcp_sk(sk);
418         int answ;
419
420         switch (cmd) {
421         case SIOCINQ:
422                 if (sk->sk_state == TCP_LISTEN)
423                         return -EINVAL;
424
425                 lock_sock(sk);
426                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
427                         answ = 0;
428                 else if (sock_flag(sk, SOCK_URGINLINE) ||
429                          !tp->urg_data ||
430                          before(tp->urg_seq, tp->copied_seq) ||
431                          !before(tp->urg_seq, tp->rcv_nxt)) {
432                         answ = tp->rcv_nxt - tp->copied_seq;
433
434                         /* Subtract 1, if FIN is in queue. */
435                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
436                                 answ -=
437                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
438                 } else
439                         answ = tp->urg_seq - tp->copied_seq;
440                 release_sock(sk);
441                 break;
442         case SIOCATMARK:
443                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
444                 break;
445         case SIOCOUTQ:
446                 if (sk->sk_state == TCP_LISTEN)
447                         return -EINVAL;
448
449                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
450                         answ = 0;
451                 else
452                         answ = tp->write_seq - tp->snd_una;
453                 break;
454         default:
455                 return -ENOIOCTLCMD;
456         };
457
458         return put_user(answ, (int __user *)arg);
459 }
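/* Illustrative userspace usage (not kernel code): SIOCINQ (== FIONREAD)
 * returns the amount of unread data, SIOCOUTQ (== TIOCOUTQ) the amount of
 * unsent data, and SIOCATMARK (see sockatmark(3)) whether the read pointer
 * is at the urgent mark:
 *
 *      int inq, outq;
 *      ioctl(fd, SIOCINQ, &inq);
 *      ioctl(fd, SIOCOUTQ, &outq);
 */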
460
461
462 int tcp_listen_start(struct sock *sk)
463 {
464         struct inet_sock *inet = inet_sk(sk);
465         struct tcp_sock *tp = tcp_sk(sk);
466         struct tcp_listen_opt *lopt;
467
468         sk->sk_max_ack_backlog = 0;
469         sk->sk_ack_backlog = 0;
470         tp->accept_queue = tp->accept_queue_tail = NULL;
471         rwlock_init(&tp->syn_wait_lock);
472         tcp_delack_init(tp);
473
474         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
475         if (!lopt)
476                 return -ENOMEM;
477
478         memset(lopt, 0, sizeof(struct tcp_listen_opt));
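        /* max_qlen_log is the log2 of the maximum number of queued SYNs:
         * the smallest power of two that is at least sysctl_max_syn_backlog,
         * and never less than 64.  hash_rnd seeds the syn_table hash so that
         * remote peers cannot easily aim at a single bucket.
         */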
479         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
480                 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
481                         break;
482         get_random_bytes(&lopt->hash_rnd, 4);
483
484         write_lock_bh(&tp->syn_wait_lock);
485         tp->listen_opt = lopt;
486         write_unlock_bh(&tp->syn_wait_lock);
487
488         /* There is a race window here: we announce ourselves as listening,
489          * but this transition is still not validated by get_port().
490          * It is OK, because this socket enters the hash table only
491          * after validation is complete.
492          */
493         sk->sk_state = TCP_LISTEN;
494         if (!sk->sk_prot->get_port(sk, inet->num)) {
495                 inet->sport = htons(inet->num);
496
497                 sk_dst_reset(sk);
498                 sk->sk_prot->hash(sk);
499
500                 return 0;
501         }
502
503         sk->sk_state = TCP_CLOSE;
504         write_lock_bh(&tp->syn_wait_lock);
505         tp->listen_opt = NULL;
506         write_unlock_bh(&tp->syn_wait_lock);
507         kfree(lopt);
508         return -EADDRINUSE;
509 }
510
511 /*
512  *      This routine closes sockets which have been at least partially
513  *      opened, but not yet accepted.
514  */
515
516 static void tcp_listen_stop (struct sock *sk)
517 {
518         struct tcp_sock *tp = tcp_sk(sk);
519         struct tcp_listen_opt *lopt = tp->listen_opt;
520         struct open_request *acc_req = tp->accept_queue;
521         struct open_request *req;
522         int i;
523
524         tcp_delete_keepalive_timer(sk);
525
526         /* make all the listen_opt local to us */
527         write_lock_bh(&tp->syn_wait_lock);
528         tp->listen_opt = NULL;
529         write_unlock_bh(&tp->syn_wait_lock);
530         tp->accept_queue = tp->accept_queue_tail = NULL;
531
532         if (lopt->qlen) {
533                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
534                         while ((req = lopt->syn_table[i]) != NULL) {
535                                 lopt->syn_table[i] = req->dl_next;
536                                 lopt->qlen--;
537                                 tcp_openreq_free(req);
538
539                 /* Following the specs, it would be better either to send a FIN
540                  * (and enter FIN-WAIT-1; it is a normal close)
541                  * or to send an active reset (abort).
542                  * Certainly, it is pretty dangerous during a synflood, but that is
543                  * a bad justification for our negligence 8)
544                  * To be honest, we are not able to implement either
545                  * of the variants now.                 --ANK
546                  */
547                         }
548                 }
549         }
550         BUG_TRAP(!lopt->qlen);
551
552         kfree(lopt);
553
554         while ((req = acc_req) != NULL) {
555                 struct sock *child = req->sk;
556
557                 acc_req = req->dl_next;
558
559                 local_bh_disable();
560                 bh_lock_sock(child);
561                 BUG_TRAP(!sock_owned_by_user(child));
562                 sock_hold(child);
563
564                 tcp_disconnect(child, O_NONBLOCK);
565
566                 sock_orphan(child);
567
568                 atomic_inc(&tcp_orphan_count);
569
570                 tcp_destroy_sock(child);
571
572                 bh_unlock_sock(child);
573                 local_bh_enable();
574                 sock_put(child);
575
576                 sk_acceptq_removed(sk);
577                 tcp_openreq_fastfree(req);
578         }
579         BUG_TRAP(!sk->sk_ack_backlog);
580 }
581
582 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
583 {
584         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
585         tp->pushed_seq = tp->write_seq;
586 }
587
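/* A "forced push" is due once more than half of the peer's largest
 * advertised window has been written since the last segment marked PSH;
 * this keeps the receiver from sitting on a big backlog of unpushed data.
 */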
588 static inline int forced_push(struct tcp_sock *tp)
589 {
590         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
591 }
592
593 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
594                               struct sk_buff *skb)
595 {
596         skb->csum = 0;
597         TCP_SKB_CB(skb)->seq = tp->write_seq;
598         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
599         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
600         TCP_SKB_CB(skb)->sacked = 0;
601         __skb_queue_tail(&sk->sk_write_queue, skb);
602         sk_charge_skb(sk, skb);
603         if (!sk->sk_send_head)
604                 sk->sk_send_head = skb;
605         else if (tp->nonagle&TCP_NAGLE_PUSH)
606                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
607 }
608
609 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
610                                 struct sk_buff *skb)
611 {
612         if (flags & MSG_OOB) {
613                 tp->urg_mode = 1;
614                 tp->snd_up = tp->write_seq;
615                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
616         }
617 }
618
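/* Push pending data out to the network.  The last skb gets PSH unless
 * MSG_MORE was requested (and no forced push is due), URG is set for
 * MSG_OOB, and MSG_MORE makes the push behave as if the socket were corked.
 */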
619 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
620                             int mss_now, int nonagle)
621 {
622         if (sk->sk_send_head) {
623                 struct sk_buff *skb = sk->sk_write_queue.prev;
624                 if (!(flags & MSG_MORE) || forced_push(tp))
625                         tcp_mark_push(tp, skb);
626                 tcp_mark_urg(tp, flags, skb);
627                 __tcp_push_pending_frames(sk, tp, mss_now,
628                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
629         }
630 }
631
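/* Zero-copy transmit path used by tcp_sendpage(): instead of copying data
 * into the skb, the caller's pages are attached as page fragments (taking a
 * reference on each page), so the route must support scatter-gather and
 * hardware checksumming; see the capability check in tcp_sendpage() below.
 */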
632 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
633                          size_t psize, int flags)
634 {
635         struct tcp_sock *tp = tcp_sk(sk);
636         int mss_now;
637         int err;
638         ssize_t copied;
639         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
640
641         /* Wait for a connection to finish. */
642         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
643                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
644                         goto out_err;
645
646         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
647
648         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
649         copied = 0;
650
651         err = -EPIPE;
652         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
653                 goto do_error;
654
655         while (psize > 0) {
656                 struct sk_buff *skb = sk->sk_write_queue.prev;
657                 struct page *page = pages[poffset / PAGE_SIZE];
658                 int copy, i, can_coalesce;
659                 int offset = poffset % PAGE_SIZE;
660                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
661
662                 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
663 new_segment:
664                         if (!sk_stream_memory_free(sk))
665                                 goto wait_for_sndbuf;
666
667                         skb = sk_stream_alloc_pskb(sk, 0, 0,
668                                                    sk->sk_allocation);
669                         if (!skb)
670                                 goto wait_for_memory;
671
672                         skb_entail(sk, tp, skb);
673                         copy = mss_now;
674                 }
675
676                 if (copy > size)
677                         copy = size;
678
679                 i = skb_shinfo(skb)->nr_frags;
680                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
681                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
682                         tcp_mark_push(tp, skb);
683                         goto new_segment;
684                 }
685                 if (sk->sk_forward_alloc < copy &&
686                     !sk_stream_mem_schedule(sk, copy, 0))
687                         goto wait_for_memory;
688                 
689                 if (can_coalesce) {
690                         skb_shinfo(skb)->frags[i - 1].size += copy;
691                 } else {
692                         get_page(page);
693                         skb_fill_page_desc(skb, i, page, offset, copy);
694                 }
695
696                 skb->len += copy;
697                 skb->data_len += copy;
698                 skb->truesize += copy;
699                 sk->sk_wmem_queued += copy;
700                 sk->sk_forward_alloc -= copy;
701                 skb->ip_summed = CHECKSUM_HW;
702                 tp->write_seq += copy;
703                 TCP_SKB_CB(skb)->end_seq += copy;
704                 skb_shinfo(skb)->tso_segs = 0;
705
706                 if (!copied)
707                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
708
709                 copied += copy;
710                 poffset += copy;
711                 if (!(psize -= copy))
712                         goto out;
713
714                 if (skb->len != mss_now || (flags & MSG_OOB))
715                         continue;
716
717                 if (forced_push(tp)) {
718                         tcp_mark_push(tp, skb);
719                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
720                 } else if (skb == sk->sk_send_head)
721                         tcp_push_one(sk, mss_now);
722                 continue;
723
724 wait_for_sndbuf:
725                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
726 wait_for_memory:
727                 if (copied)
728                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
729
730                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
731                         goto do_error;
732
733                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
734         }
735
736 out:
737         if (copied)
738                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
739         return copied;
740
741 do_error:
742         if (copied)
743                 goto out;
744 out_err:
745         return sk_stream_error(sk, flags, err);
746 }
747
748 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
749                      size_t size, int flags)
750 {
751         ssize_t res;
752         struct sock *sk = sock->sk;
753
754 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
755
756         if (!(sk->sk_route_caps & NETIF_F_SG) ||
757             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
758                 return sock_no_sendpage(sock, page, offset, size, flags);
759
760 #undef TCP_ZC_CSUM_FLAGS
761
762         lock_sock(sk);
763         TCP_CHECK_TIMER(sk);
764         res = do_tcp_sendpages(sk, &page, offset, size, flags);
765         TCP_CHECK_TIMER(sk);
766         release_sock(sk);
767         return res;
768 }
769
770 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
771 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
772
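/* Choose how much linear ("head") room to allocate for a fresh skb in
 * tcp_sendmsg(): normally a full MSS, but on scatter-gather devices the
 * head is clamped to what fits in a single page, since the remainder of
 * the data goes into page fragments anyway.
 */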
773 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
774 {
775         int tmp = tp->mss_cache_std;
776
777         if (sk->sk_route_caps & NETIF_F_SG) {
778                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
779
780                 if (tmp >= pgbreak &&
781                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
782                         tmp = pgbreak;
783         }
784         return tmp;
785 }
786
787 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
788                 size_t size)
789 {
790         struct iovec *iov;
791         struct tcp_sock *tp = tcp_sk(sk);
792         struct sk_buff *skb;
793         int iovlen, flags;
794         int mss_now;
795         int err, copied;
796         long timeo;
797
798         lock_sock(sk);
799         TCP_CHECK_TIMER(sk);
800
801         flags = msg->msg_flags;
802         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
803
804         /* Wait for a connection to finish. */
805         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
806                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
807                         goto out_err;
808
809         /* This should be in poll */
810         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
811
812         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
813
814         /* Ok commence sending. */
815         iovlen = msg->msg_iovlen;
816         iov = msg->msg_iov;
817         copied = 0;
818
819         err = -EPIPE;
820         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
821                 goto do_error;
822
823         while (--iovlen >= 0) {
824                 int seglen = iov->iov_len;
825                 unsigned char __user *from = iov->iov_base;
826
827                 iov++;
828
829                 while (seglen > 0) {
830                         int copy;
831
832                         skb = sk->sk_write_queue.prev;
833
834                         if (!sk->sk_send_head ||
835                             (copy = mss_now - skb->len) <= 0) {
836
837 new_segment:
838                                 /* Allocate new segment. If the interface is SG,
839                                  * allocate skb fitting to single page.
840                                  */
841                                 if (!sk_stream_memory_free(sk))
842                                         goto wait_for_sndbuf;
843
844                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
845                                                            0, sk->sk_allocation);
846                                 if (!skb)
847                                         goto wait_for_memory;
848
849                                 /*
850                                  * Check whether we can use HW checksum.
851                                  */
852                                 if (sk->sk_route_caps &
853                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
854                                      NETIF_F_HW_CSUM))
855                                         skb->ip_summed = CHECKSUM_HW;
856
857                                 skb_entail(sk, tp, skb);
858                                 copy = mss_now;
859                         }
860
861                         /* Try to append data to the end of skb. */
862                         if (copy > seglen)
863                                 copy = seglen;
864
865                         /* Where to copy to? */
866                         if (skb_tailroom(skb) > 0) {
867                                 /* We have some space in skb head. Superb! */
868                                 if (copy > skb_tailroom(skb))
869                                         copy = skb_tailroom(skb);
870                                 if ((err = skb_add_data(skb, from, copy)) != 0)
871                                         goto do_fault;
872                         } else {
873                                 int merge = 0;
874                                 int i = skb_shinfo(skb)->nr_frags;
875                                 struct page *page = TCP_PAGE(sk);
876                                 int off = TCP_OFF(sk);
877
878                                 if (skb_can_coalesce(skb, i, page, off) &&
879                                     off != PAGE_SIZE) {
880                                         /* We can extend the last page
881                                          * fragment. */
882                                         merge = 1;
883                                 } else if (i == MAX_SKB_FRAGS ||
884                                            (!i &&
885                                            !(sk->sk_route_caps & NETIF_F_SG))) {
886                                         /* Need to add new fragment and cannot
887                                          * do this because interface is non-SG,
888                                          * or because all the page slots are
889                                          * busy. */
890                                         tcp_mark_push(tp, skb);
891                                         goto new_segment;
892                                 } else if (page) {
893                                         /* If page is cached, align
894                                          * offset to L1 cache boundary
895                                          */
896                                         off = (off + L1_CACHE_BYTES - 1) &
897                                               ~(L1_CACHE_BYTES - 1);
898                                         if (off == PAGE_SIZE) {
899                                                 put_page(page);
900                                                 TCP_PAGE(sk) = page = NULL;
901                                         }
902                                 }
903
904                                 if (!page) {
905                                         /* Allocate new cache page. */
906                                         if (!(page = sk_stream_alloc_page(sk)))
907                                                 goto wait_for_memory;
908                                         off = 0;
909                                 }
910
911                                 if (copy > PAGE_SIZE - off)
912                                         copy = PAGE_SIZE - off;
913
914                                 /* Time to copy data. We are close to
915                                  * the end! */
916                                 err = skb_copy_to_page(sk, from, skb, page,
917                                                        off, copy);
918                                 if (err) {
919                                         /* If this page was new, give it to the
920                                          * socket so it does not get leaked.
921                                          */
922                                         if (!TCP_PAGE(sk)) {
923                                                 TCP_PAGE(sk) = page;
924                                                 TCP_OFF(sk) = 0;
925                                         }
926                                         goto do_error;
927                                 }
928
929                                 /* Update the skb. */
930                                 if (merge) {
931                                         skb_shinfo(skb)->frags[i - 1].size +=
932                                                                         copy;
933                                 } else {
934                                         skb_fill_page_desc(skb, i, page, off, copy);
935                                         if (TCP_PAGE(sk)) {
936                                                 get_page(page);
937                                         } else if (off + copy < PAGE_SIZE) {
938                                                 get_page(page);
939                                                 TCP_PAGE(sk) = page;
940                                         }
941                                 }
942
943                                 TCP_OFF(sk) = off + copy;
944                         }
945
946                         if (!copied)
947                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
948
949                         tp->write_seq += copy;
950                         TCP_SKB_CB(skb)->end_seq += copy;
951                         skb_shinfo(skb)->tso_segs = 0;
952
953                         from += copy;
954                         copied += copy;
955                         if ((seglen -= copy) == 0 && iovlen == 0)
956                                 goto out;
957
958                         if (skb->len != mss_now || (flags & MSG_OOB))
959                                 continue;
960
961                         if (forced_push(tp)) {
962                                 tcp_mark_push(tp, skb);
963                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
964                         } else if (skb == sk->sk_send_head)
965                                 tcp_push_one(sk, mss_now);
966                         continue;
967
968 wait_for_sndbuf:
969                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
970 wait_for_memory:
971                         if (copied)
972                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
973
974                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
975                                 goto do_error;
976
977                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
978                 }
979         }
980
981 out:
982         if (copied)
983                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
984         TCP_CHECK_TIMER(sk);
985         release_sock(sk);
986         return copied;
987
988 do_fault:
989         if (!skb->len) {
990                 if (sk->sk_send_head == skb)
991                         sk->sk_send_head = NULL;
992                 __skb_unlink(skb, skb->list);
993                 sk_stream_free_skb(sk, skb);
994         }
995
996 do_error:
997         if (copied)
998                 goto out;
999 out_err:
1000         err = sk_stream_error(sk, flags, err);
1001         TCP_CHECK_TIMER(sk);
1002         release_sock(sk);
1003         return err;
1004 }
1005
1006 /*
1007  *      Handle reading urgent data. BSD has very simple semantics for
1008  *      this, no blocking and very strange errors 8)
1009  */
1010
1011 static int tcp_recv_urg(struct sock *sk, long timeo,
1012                         struct msghdr *msg, int len, int flags,
1013                         int *addr_len)
1014 {
1015         struct tcp_sock *tp = tcp_sk(sk);
1016
1017         /* No URG data to read. */
1018         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1019             tp->urg_data == TCP_URG_READ)
1020                 return -EINVAL; /* Yes this is right ! */
1021
1022         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1023                 return -ENOTCONN;
1024
1025         if (tp->urg_data & TCP_URG_VALID) {
1026                 int err = 0;
1027                 char c = tp->urg_data;
1028
1029                 if (!(flags & MSG_PEEK))
1030                         tp->urg_data = TCP_URG_READ;
1031
1032                 /* Read urgent data. */
1033                 msg->msg_flags |= MSG_OOB;
1034
1035                 if (len > 0) {
1036                         if (!(flags & MSG_TRUNC))
1037                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1038                         len = 1;
1039                 } else
1040                         msg->msg_flags |= MSG_TRUNC;
1041
1042                 return err ? -EFAULT : len;
1043         }
1044
1045         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1046                 return 0;
1047
1048         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1049          * the available implementations agree in this case:
1050          * this call should never block, independent of the
1051          * blocking state of the socket.
1052          * Mike <pall@rz.uni-karlsruhe.de>
1053          */
1054         return -EAGAIN;
1055 }
1056
1057 /* Clean up the receive buffer for full frames taken by the user,
1058  * then send an ACK if necessary.  COPIED is the number of bytes
1059  * tcp_recvmsg has given to the user so far, it speeds up the
1060  * calculation of whether or not we must ACK for the sake of
1061  * a window update.
1062  */
1063 static void cleanup_rbuf(struct sock *sk, int copied)
1064 {
1065         struct tcp_sock *tp = tcp_sk(sk);
1066         int time_to_ack = 0;
1067
1068 #if TCP_DEBUG
1069         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1070
1071         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1072 #endif
1073
1074         if (tcp_ack_scheduled(tp)) {
1075                    /* Delayed ACKs frequently hit locked sockets during bulk
1076                     * receive. */
1077                 if (tp->ack.blocked ||
1078                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1079                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1080                     /*
1081                      * If this read emptied the read buffer, we send an ACK when
1082                      * the connection is not bidirectional, the user has drained
1083                      * the receive buffer and there was a small segment
1084                      * in the queue.
1085                      */
1086                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1087                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1088                         time_to_ack = 1;
1089         }
1090
1091         /* We send an ACK if we can now advertise a non-zero window
1092          * which has been raised "significantly".
1093          *
1094  * Even if the window is raised up to infinity, do not send a window-opening
1095  * ACK in states where we will not receive any more data. It is useless.
1096          */
1097         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1098                 __u32 rcv_window_now = tcp_receive_window(tp);
1099
1100                 /* Optimize, __tcp_select_window() is not cheap. */
1101                 if (2*rcv_window_now <= tp->window_clamp) {
1102                         __u32 new_window = __tcp_select_window(sk);
1103
1104                         /* Send an ACK now if this read freed lots of space
1105                          * in our buffer. Certainly, new_window is the new window;
1106                          * we can advertise it now if it is not less than the current one.
1107                          * "Lots" means "at least twice" here.
1108                          */
1109                         if (new_window && new_window >= 2 * rcv_window_now)
1110                                 time_to_ack = 1;
1111                 }
1112         }
1113         if (time_to_ack)
1114                 tcp_send_ack(sk);
1115 }
1116
1117 static void tcp_prequeue_process(struct sock *sk)
1118 {
1119         struct sk_buff *skb;
1120         struct tcp_sock *tp = tcp_sk(sk);
1121
1122         NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1123
1124         /* The RX process wants to run with BHs disabled, though it is not
1125          * strictly necessary */
1126         local_bh_disable();
1127         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1128                 sk->sk_backlog_rcv(sk, skb);
1129         local_bh_enable();
1130
1131         /* Clear memory counter. */
1132         tp->ucopy.memory = 0;
1133 }
1134
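/* Find the skb in the receive queue that covers sequence number 'seq';
 * on success *off is set to the offset of 'seq' inside that skb.  A pure
 * FIN segment matches even though it carries no data.
 */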
1135 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1136 {
1137         struct sk_buff *skb;
1138         u32 offset;
1139
1140         skb_queue_walk(&sk->sk_receive_queue, skb) {
1141                 offset = seq - TCP_SKB_CB(skb)->seq;
1142                 if (skb->h.th->syn)
1143                         offset--;
1144                 if (offset < skb->len || skb->h.th->fin) {
1145                         *off = offset;
1146                         return skb;
1147                 }
1148         }
1149         return NULL;
1150 }
1151
1152 /*
1153  * This routine provides an alternative to tcp_recvmsg() for routines
1154  * that would like to handle copying from skbuffs directly in 'sendfile'
1155  * fashion.
1156  * Note:
1157  *      - It is assumed that the socket was locked by the caller.
1158  *      - The routine does not block.
1159  *      - At present, there is no support for reading OOB data
1160  *        or for 'peeking' the socket using this routine
1161  *        (although both would be easy to implement).
1162  */
1163 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1164                   sk_read_actor_t recv_actor)
1165 {
1166         struct sk_buff *skb;
1167         struct tcp_sock *tp = tcp_sk(sk);
1168         u32 seq = tp->copied_seq;
1169         u32 offset;
1170         int copied = 0;
1171
1172         if (sk->sk_state == TCP_LISTEN)
1173                 return -ENOTCONN;
1174         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1175                 if (offset < skb->len) {
1176                         size_t used, len;
1177
1178                         len = skb->len - offset;
1179                         /* Stop reading if we hit a patch of urgent data */
1180                         if (tp->urg_data) {
1181                                 u32 urg_offset = tp->urg_seq - seq;
1182                                 if (urg_offset < len)
1183                                         len = urg_offset;
1184                                 if (!len)
1185                                         break;
1186                         }
1187                         used = recv_actor(desc, skb, offset, len);
1188                         if (used <= len) {
1189                                 seq += used;
1190                                 copied += used;
1191                                 offset += used;
1192                         }
1193                         if (offset != skb->len)
1194                                 break;
1195                 }
1196                 if (skb->h.th->fin) {
1197                         sk_eat_skb(sk, skb);
1198                         ++seq;
1199                         break;
1200                 }
1201                 sk_eat_skb(sk, skb);
1202                 if (!desc->count)
1203                         break;
1204         }
1205         tp->copied_seq = seq;
1206
1207         tcp_rcv_space_adjust(sk);
1208
1209         /* Clean up data we have read: This will do ACK frames. */
1210         if (copied)
1211                 cleanup_rbuf(sk, copied);
1212         return copied;
1213 }
1214
1215 /*
1216  *      This routine copies from a sock struct into the user buffer.
1217  *
1218  *      Technical note: in 2.3 we work on a _locked_ socket, so that
1219  *      tricks with *seq access order and skb->users are not required.
1220  *      Probably, the code can easily be improved even more.
1221  */
1222
1223 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1224                 size_t len, int nonblock, int flags, int *addr_len)
1225 {
1226         struct tcp_sock *tp = tcp_sk(sk);
1227         int copied = 0;
1228         u32 peek_seq;
1229         u32 *seq;
1230         unsigned long used;
1231         int err;
1232         int target;             /* Read at least this many bytes */
1233         long timeo;
1234         struct task_struct *user_recv = NULL;
1235
1236         lock_sock(sk);
1237
1238         TCP_CHECK_TIMER(sk);
1239
1240         err = -ENOTCONN;
1241         if (sk->sk_state == TCP_LISTEN)
1242                 goto out;
1243
1244         timeo = sock_rcvtimeo(sk, nonblock);
1245
1246         /* Urgent data needs to be handled specially. */
1247         if (flags & MSG_OOB)
1248                 goto recv_urg;
1249
1250         seq = &tp->copied_seq;
1251         if (flags & MSG_PEEK) {
1252                 peek_seq = tp->copied_seq;
1253                 seq = &peek_seq;
1254         }
1255
1256         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1257
1258         do {
1259                 struct sk_buff *skb;
1260                 u32 offset;
1261
1262                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1263                 if (tp->urg_data && tp->urg_seq == *seq) {
1264                         if (copied)
1265                                 break;
1266                         if (signal_pending(current)) {
1267                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1268                                 break;
1269                         }
1270                 }
1271
1272                 /* Next get a buffer. */
1273
1274                 skb = skb_peek(&sk->sk_receive_queue);
1275                 do {
1276                         if (!skb)
1277                                 break;
1278
1279                         /* Now that we have two receive queues this
1280                          * shouldn't happen.
1281                          */
1282                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1283                                 printk(KERN_INFO "recvmsg bug: copied %X "
1284                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1285                                 break;
1286                         }
1287                         offset = *seq - TCP_SKB_CB(skb)->seq;
1288                         if (skb->h.th->syn)
1289                                 offset--;
1290                         if (offset < skb->len)
1291                                 goto found_ok_skb;
1292                         if (skb->h.th->fin)
1293                                 goto found_fin_ok;
1294                         BUG_TRAP(flags & MSG_PEEK);
1295                         skb = skb->next;
1296                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1297
1298                 /* Well, if we have backlog, try to process it now. */
1299
1300                 if (copied >= target && !sk->sk_backlog.tail)
1301                         break;
1302
1303                 if (copied) {
1304                         if (sk->sk_err ||
1305                             sk->sk_state == TCP_CLOSE ||
1306                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1307                             !timeo ||
1308                             signal_pending(current) ||
1309                             (flags & MSG_PEEK))
1310                                 break;
1311                 } else {
1312                         if (sock_flag(sk, SOCK_DONE))
1313                                 break;
1314
1315                         if (sk->sk_err) {
1316                                 copied = sock_error(sk);
1317                                 break;
1318                         }
1319
1320                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1321                                 break;
1322
1323                         if (sk->sk_state == TCP_CLOSE) {
1324                                 if (!sock_flag(sk, SOCK_DONE)) {
1325                                         /* This occurs when the user tries to read
1326                                          * from a never-connected socket.
1327                                          */
1328                                         copied = -ENOTCONN;
1329                                         break;
1330                                 }
1331                                 break;
1332                         }
1333
1334                         if (!timeo) {
1335                                 copied = -EAGAIN;
1336                                 break;
1337                         }
1338
1339                         if (signal_pending(current)) {
1340                                 copied = sock_intr_errno(timeo);
1341                                 break;
1342                         }
1343                 }
1344
1345                 cleanup_rbuf(sk, copied);
1346
1347                 if (tp->ucopy.task == user_recv) {
1348                         /* Install new reader */
1349                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1350                                 user_recv = current;
1351                                 tp->ucopy.task = user_recv;
1352                                 tp->ucopy.iov = msg->msg_iov;
1353                         }
1354
1355                         tp->ucopy.len = len;
1356
1357                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1358                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1359
1360                         /* Ugly... If the prequeue is not empty, we have to
1361                          * process it before releasing the socket; otherwise
1362                          * the ordering will be broken on the second iteration.
1363                          * A more elegant solution is required!!!
1364                          *
1365                          * Look: we have the following (pseudo)queues:
1366                          *
1367                          * 1. packets in flight
1368                          * 2. backlog
1369                          * 3. prequeue
1370                          * 4. receive_queue
1371                          *
1372                          * Each queue can be processed only if the next ones
1373                          * are empty. At this point the receive_queue is empty.
1374                          * But the prequeue _can_ be non-empty after the 2nd
1375                          * iteration, when we jumped to the start of the loop
1376                          * because backlog processing added something to the
1377                          * receive_queue. We cannot release_sock(), because the
1378                          * backlog contains packets that arrived _after_ the
1379                          * prequeued ones.
1380                          *
1381                          * In short, the algorithm is clear: process all the
1382                          * queues in order. We could do it more directly, by
1383                          * requeueing packets from the backlog to the prequeue
1384                          * when it is not empty; more elegant, but it eats cycles.
1385                          */
1386                         if (skb_queue_len(&tp->ucopy.prequeue))
1387                                 goto do_prequeue;
1388
1389                         /* __ Set realtime policy in scheduler __ */
1390                 }
1391
1392                 if (copied >= target) {
1393                         /* Do not sleep, just process backlog. */
1394                         release_sock(sk);
1395                         lock_sock(sk);
1396                 } else
1397                         sk_wait_data(sk, &timeo);
1398
1399                 if (user_recv) {
1400                         int chunk;
1401
1402                         /* __ Restore normal policy in scheduler __ */
1403
1404                         if ((chunk = len - tp->ucopy.len) != 0) {
1405                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1406                                 len -= chunk;
1407                                 copied += chunk;
1408                         }
1409
1410                         if (tp->rcv_nxt == tp->copied_seq &&
1411                             skb_queue_len(&tp->ucopy.prequeue)) {
1412 do_prequeue:
1413                                 tcp_prequeue_process(sk);
1414
1415                                 if ((chunk = len - tp->ucopy.len) != 0) {
1416                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1417                                         len -= chunk;
1418                                         copied += chunk;
1419                                 }
1420                         }
1421                 }
1422                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1423                         if (net_ratelimit())
1424                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1425                                        current->comm, current->pid);
1426                         peek_seq = tp->copied_seq;
1427                 }
1428                 continue;
1429
1430         found_ok_skb:
1431                 /* Ok so how much can we use? */
1432                 used = skb->len - offset;
1433                 if (len < used)
1434                         used = len;
1435
1436                 /* Do we have urgent data here? */
1437                 if (tp->urg_data) {
1438                         u32 urg_offset = tp->urg_seq - *seq;
1439                         if (urg_offset < used) {
1440                                 if (!urg_offset) {
1441                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1442                                                 ++*seq;
1443                                                 offset++;
1444                                                 used--;
1445                                                 if (!used)
1446                                                         goto skip_copy;
1447                                         }
1448                                 } else
1449                                         used = urg_offset;
1450                         }
1451                 }
1452
1453                 if (!(flags & MSG_TRUNC)) {
1454                         err = skb_copy_datagram_iovec(skb, offset,
1455                                                       msg->msg_iov, used);
1456                         if (err) {
1457                                 /* Exception. Bailout! */
1458                                 if (!copied)
1459                                         copied = -EFAULT;
1460                                 break;
1461                         }
1462                 }
1463
1464                 *seq += used;
1465                 copied += used;
1466                 len -= used;
1467
1468                 tcp_rcv_space_adjust(sk);
1469
1470 skip_copy:
1471                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1472                         tp->urg_data = 0;
1473                         tcp_fast_path_check(sk, tp);
1474                 }
1475                 if (used + offset < skb->len)
1476                         continue;
1477
1478                 if (skb->h.th->fin)
1479                         goto found_fin_ok;
1480                 if (!(flags & MSG_PEEK))
1481                         sk_eat_skb(sk, skb);
1482                 continue;
1483
1484         found_fin_ok:
1485                 /* Process the FIN. */
1486                 ++*seq;
1487                 if (!(flags & MSG_PEEK))
1488                         sk_eat_skb(sk, skb);
1489                 break;
1490         } while (len > 0);
1491
1492         if (user_recv) {
1493                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1494                         int chunk;
1495
1496                         tp->ucopy.len = copied > 0 ? len : 0;
1497
1498                         tcp_prequeue_process(sk);
1499
1500                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1501                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1502                                 len -= chunk;
1503                                 copied += chunk;
1504                         }
1505                 }
1506
1507                 tp->ucopy.task = NULL;
1508                 tp->ucopy.len = 0;
1509         }
1510
1511         /* According to UNIX98, msg_name/msg_namelen are ignored
1512          * on a connected socket. I was just happy when I found this 8) --ANK
1513          */
1514
1515         /* Clean up the data we have read: this will send ACKs as needed. */
1516         cleanup_rbuf(sk, copied);
1517
1518         TCP_CHECK_TIMER(sk);
1519         release_sock(sk);
1520         return copied;
1521
1522 out:
1523         TCP_CHECK_TIMER(sk);
1524         release_sock(sk);
1525         return err;
1526
1527 recv_urg:
1528         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1529         goto out;
1530 }
1531
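/*
 * A minimal userspace sketch of the receive flags handled above,
 * assuming a connected TCP socket 'fd' (a hypothetical name): MSG_PEEK
 * copies data without advancing copied_seq, and MSG_WAITALL raises the
 * 'target' computed via sock_rcvlowat() so the call keeps blocking
 * until the whole buffer is filled, EOF, an error or a signal.
 *
 *	#include <sys/socket.h>
 *
 *	char hdr[4], body[64];
 *	ssize_t n;
 *
 *	n = recv(fd, hdr, sizeof(hdr), MSG_PEEK);	(peek, nothing consumed)
 *	n = recv(fd, body, sizeof(body), MSG_WAITALL);	(wait for all 64 bytes)
 */
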
1532 /*
1533  *      State processing on a close. This implements the state shift for
1534  *      sending our FIN frame. Note that we only send a FIN for some
1535  *      states. A shutdown() may have already sent the FIN, or we may be
1536  *      closed.
1537  */
1538
1539 static unsigned char new_state[16] = {
1540   /* current state:        new state:      action:      */
1541   /* (Invalid)          */ TCP_CLOSE,
1542   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1543   /* TCP_SYN_SENT       */ TCP_CLOSE,
1544   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1545   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1546   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1547   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1548   /* TCP_CLOSE          */ TCP_CLOSE,
1549   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1550   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1551   /* TCP_LISTEN         */ TCP_CLOSE,
1552   /* TCP_CLOSING        */ TCP_CLOSING,
1553 };
1554
1555 static int tcp_close_state(struct sock *sk)
1556 {
1557         int next = (int)new_state[sk->sk_state];
1558         int ns = next & TCP_STATE_MASK;
1559
1560         tcp_set_state(sk, ns);
1561
1562         return next & TCP_ACTION_FIN;
1563 }
1564
1565 /*
1566  *      Shutdown the sending side of a connection. Much like close except
1567  *      that we don't receive a shutdown or set sock_flag(sk, SOCK_DEAD).
1568  */
1569
1570 void tcp_shutdown(struct sock *sk, int how)
1571 {
1572         /*      We need to grab some memory, and put together a FIN,
1573          *      and then put it into the queue to be sent.
1574          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1575          */
1576         if (!(how & SEND_SHUTDOWN))
1577                 return;
1578
1579         /* If we've already sent a FIN, or it's a closed state, skip this. */
1580         if ((1 << sk->sk_state) &
1581             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1582              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1583                 /* Clear out any half completed packets.  FIN if needed. */
1584                 if (tcp_close_state(sk))
1585                         tcp_send_fin(sk);
1586         }
1587 }
1588
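/*
 * Userspace view of the above, assuming a connected socket 'fd'
 * (hypothetical name): shutdown(fd, SHUT_WR) maps to SEND_SHUTDOWN and
 * ends up here, so our FIN goes out while the receive side stays open.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	char req[] = "GET / HTTP/1.0\r\n\r\n", buf[4096];
 *	ssize_t n;
 *
 *	write(fd, req, sizeof(req) - 1);	(send the request)
 *	shutdown(fd, SHUT_WR);			(half-close: our FIN is sent)
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		;				(the reply can still be read)
 *	close(fd);
 */
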
1589 /*
1590  * At this point, there should be no process reference to this
1591  * socket, and thus no user references at all.  Therefore we
1592  * can assume the socket waitqueue is inactive and nobody will
1593  * try to jump onto it.
1594  */
1595 void tcp_destroy_sock(struct sock *sk)
1596 {
1597         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1598         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1599
1600         /* It cannot be in hash table! */
1601         BUG_TRAP(sk_unhashed(sk));
1602
1603         /* If inet_sk(sk)->num is non-zero, it must be bound */
1604         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1605
1606         sk->sk_prot->destroy(sk);
1607
1608         sk_stream_kill_queues(sk);
1609
1610         xfrm_sk_free_policy(sk);
1611
1612 #ifdef INET_REFCNT_DEBUG
1613         if (atomic_read(&sk->sk_refcnt) != 1) {
1614                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1615                        sk, atomic_read(&sk->sk_refcnt));
1616         }
1617 #endif
1618
1619         atomic_dec(&tcp_orphan_count);
1620         sock_put(sk);
1621 }
1622
1623 void tcp_close(struct sock *sk, long timeout)
1624 {
1625         struct sk_buff *skb;
1626         int data_was_unread = 0;
1627
1628         lock_sock(sk);
1629         sk->sk_shutdown = SHUTDOWN_MASK;
1630
1631         if (sk->sk_state == TCP_LISTEN) {
1632                 tcp_set_state(sk, TCP_CLOSE);
1633
1634                 /* Special case. */
1635                 tcp_listen_stop(sk);
1636
1637                 goto adjudge_to_death;
1638         }
1639
1640         /*  We need to flush the recv. buffs.  We do this only on the
1641          *  descriptor close, not protocol-sourced closes, because the
1642          *  reader process may not have drained the data yet!
1643          */
1644         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1645                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1646                           skb->h.th->fin;
1647                 data_was_unread += len;
1648                 __kfree_skb(skb);
1649         }
1650
1651         sk_stream_mem_reclaim(sk);
1652
1653         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1654          * 3.10, we send a RST here because data was lost.  To
1655          * witness the awful effects of the old behavior of always
1656          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1657          * a bulk GET in an FTP client, suspend the process, wait
1658          * for the client to advertise a zero window, then kill -9
1659          * the FTP client, wheee...  Note: timeout is always zero
1660          * in such a case.
1661          */
1662         if (data_was_unread) {
1663                 /* Unread data was tossed, zap the connection. */
1664                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1665                 tcp_set_state(sk, TCP_CLOSE);
1666                 tcp_send_active_reset(sk, GFP_KERNEL);
1667         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1668                 /* Check zero linger _after_ checking for unread data. */
1669                 sk->sk_prot->disconnect(sk, 0);
1670                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1671         } else if (tcp_close_state(sk)) {
1672                 /* We FIN if the application ate all the data before
1673                  * zapping the connection.
1674                  */
1675
1676                 /* RED-PEN. Formally speaking, we have broken TCP state
1677                  * machine. State transitions:
1678                  *
1679                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1680                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1681                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1682                  *
1683                  * are legal only when FIN has been sent (i.e. in window),
1684                  * rather than queued out of window. Purists may blame us.
1685                  *
1686                  * E.g. the "RFC state" is ESTABLISHED
1687                  * if the Linux state is FIN-WAIT-1 but the FIN has not been sent yet.
1688                  *
1689                  * The visible deviations are that we sometimes enter the
1690                  * time-wait state when it is not really required (harmless),
1691                  * and do not send active resets when the specs require them
1692                  * (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when they look like
1693                  * CLOSING or LAST_ACK to Linux).
1694                  * Probably I have missed some more small holes.
1695                  *                                              --ANK
1696                  */
1697                 tcp_send_fin(sk);
1698         }
1699
1700         sk_stream_wait_close(sk, timeout);
1701
1702 adjudge_to_death:
1703         /* It is the last release_sock in its life. It will remove backlog. */
1704         release_sock(sk);
1705
1706
1707         /* Now socket is owned by kernel and we acquire BH lock
1708            to finish close. No need to check for user refs.
1709          */
1710         local_bh_disable();
1711         bh_lock_sock(sk);
1712         BUG_TRAP(!sock_owned_by_user(sk));
1713
1714         sock_hold(sk);
1715         sock_orphan(sk);
1716
1717         /*      This is a (useful) BSD violation of the RFC. There is a
1718          *      problem with TCP as specified, in that the other end could
1719          *      keep a socket open forever with no application left on this end.
1720          *      We use a 3 minute timeout (about the same as BSD) and then kill
1721          *      our end. If they send after that then tough - BUT: long enough
1722          *      that we won't repeat the old "4*rto = almost no time -
1723          *      whoops reset" mistake.
1724          *
1725          *      Nope, it was not a mistake. It is really the desired behaviour,
1726          *      e.g. on HTTP servers, where such sockets are useless but
1727          *      consume significant resources. Let's do it with the special
1728          *      linger2 option.                                 --ANK
1729          */
1730
1731         if (sk->sk_state == TCP_FIN_WAIT2) {
1732                 struct tcp_sock *tp = tcp_sk(sk);
1733                 if (tp->linger2 < 0) {
1734                         tcp_set_state(sk, TCP_CLOSE);
1735                         tcp_send_active_reset(sk, GFP_ATOMIC);
1736                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1737                 } else {
1738                         int tmo = tcp_fin_time(tp);
1739
1740                         if (tmo > TCP_TIMEWAIT_LEN) {
1741                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1742                         } else {
1743                                 atomic_inc(&tcp_orphan_count);
1744                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1745                                 goto out;
1746                         }
1747                 }
1748         }
1749         if (sk->sk_state != TCP_CLOSE) {
1750                 sk_stream_mem_reclaim(sk);
1751                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1752                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1753                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1754                         if (net_ratelimit())
1755                                 printk(KERN_INFO "TCP: too many orphaned "
1756                                        "sockets\n");
1757                         tcp_set_state(sk, TCP_CLOSE);
1758                         tcp_send_active_reset(sk, GFP_ATOMIC);
1759                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1760                 }
1761         }
1762         atomic_inc(&tcp_orphan_count);
1763
1764         if (sk->sk_state == TCP_CLOSE)
1765                 tcp_destroy_sock(sk);
1766         /* Otherwise, socket is reprieved until protocol close. */
1767
1768 out:
1769         bh_unlock_sock(sk);
1770         local_bh_enable();
1771         sock_put(sk);
1772 }
1773
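/*
 * A hedged userspace sketch of the zero-linger path tested above
 * (SOCK_LINGER set with sk_lingertime == 0): close() then disconnects
 * the socket immediately instead of going through the normal FIN
 * sequence.  'fd' is a hypothetical connected TCP socket.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	struct linger lng = { 1, 0 };		(l_onoff = 1, l_linger = 0)
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng));
 *	close(fd);				(abortive close, see above)
 */
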
1774 /* These states need RST on ABORT according to RFC793 */
1775
1776 static inline int tcp_need_reset(int state)
1777 {
1778         return (1 << state) &
1779                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1780                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1781 }
1782
1783 int tcp_disconnect(struct sock *sk, int flags)
1784 {
1785         struct inet_sock *inet = inet_sk(sk);
1786         struct tcp_sock *tp = tcp_sk(sk);
1787         int err = 0;
1788         int old_state = sk->sk_state;
1789
1790         if (old_state != TCP_CLOSE)
1791                 tcp_set_state(sk, TCP_CLOSE);
1792
1793         /* ABORT function of RFC793 */
1794         if (old_state == TCP_LISTEN) {
1795                 tcp_listen_stop(sk);
1796         } else if (tcp_need_reset(old_state) ||
1797                    (tp->snd_nxt != tp->write_seq &&
1798                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1799                 /* The last check adjusts for the discrepancy between Linux
1800                  * and the RFC states.
1801                  */
1802                 tcp_send_active_reset(sk, gfp_any());
1803                 sk->sk_err = ECONNRESET;
1804         } else if (old_state == TCP_SYN_SENT)
1805                 sk->sk_err = ECONNRESET;
1806
1807         tcp_clear_xmit_timers(sk);
1808         __skb_queue_purge(&sk->sk_receive_queue);
1809         sk_stream_writequeue_purge(sk);
1810         __skb_queue_purge(&tp->out_of_order_queue);
1811
1812         inet->dport = 0;
1813
1814         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1815                 inet_reset_saddr(sk);
1816
1817         sk->sk_shutdown = 0;
1818         sock_reset_flag(sk, SOCK_DONE);
1819         tp->srtt = 0;
1820         if ((tp->write_seq += tp->max_window + 2) == 0)
1821                 tp->write_seq = 1;
1822         tp->backoff = 0;
1823         tp->snd_cwnd = 2;
1824         tp->probes_out = 0;
1825         tp->packets_out = 0;
1826         tp->snd_ssthresh = 0x7fffffff;
1827         tp->snd_cwnd_cnt = 0;
1828         tcp_set_ca_state(tp, TCP_CA_Open);
1829         tcp_clear_retrans(tp);
1830         tcp_delack_init(tp);
1831         sk->sk_send_head = NULL;
1832         tp->rx_opt.saw_tstamp = 0;
1833         tcp_sack_reset(&tp->rx_opt);
1834         __sk_dst_reset(sk);
1835
1836         BUG_TRAP(!inet->num || tp->bind_hash);
1837
1838         sk->sk_error_report(sk);
1839         return err;
1840 }
1841
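/*
 * From userspace this path is typically reached via connect() with an
 * AF_UNSPEC address, which inet_stream_connect() turns into a
 * sk->sk_prot->disconnect() call.  A rough sketch, with 'fd' being a
 * hypothetical connected TCP socket:
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	struct sockaddr sa;
 *
 *	memset(&sa, 0, sizeof(sa));
 *	sa.sa_family = AF_UNSPEC;
 *	connect(fd, &sa, sizeof(sa));		(dissolve the association)
 */
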
1842 /*
1843  *      Wait for an incoming connection, avoid race
1844  *      conditions. This must be called with the socket locked.
1845  */
1846 static int wait_for_connect(struct sock *sk, long timeo)
1847 {
1848         struct tcp_sock *tp = tcp_sk(sk);
1849         DEFINE_WAIT(wait);
1850         int err;
1851
1852         /*
1853          * True wake-one mechanism for incoming connections: only
1854          * one process gets woken up, not the 'whole herd'.
1855          * Since we do not 'race & poll' for established sockets
1856          * anymore, the common case will execute the loop only once.
1857          *
1858          * Subtle issue: "add_wait_queue_exclusive()" will be added
1859          * after any current non-exclusive waiters, and we know that
1860          * it will always _stay_ after any new non-exclusive waiters
1861          * because all non-exclusive waiters are added at the
1862          * beginning of the wait-queue. As such, it's ok to "drop"
1863          * our exclusiveness temporarily when we get woken up without
1864          * having to remove and re-insert us on the wait queue.
1865          */
1866         for (;;) {
1867                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1868                                           TASK_INTERRUPTIBLE);
1869                 release_sock(sk);
1870                 if (!tp->accept_queue)
1871                         timeo = schedule_timeout(timeo);
1872                 lock_sock(sk);
1873                 err = 0;
1874                 if (tp->accept_queue)
1875                         break;
1876                 err = -EINVAL;
1877                 if (sk->sk_state != TCP_LISTEN)
1878                         break;
1879                 err = sock_intr_errno(timeo);
1880                 if (signal_pending(current))
1881                         break;
1882                 err = -EAGAIN;
1883                 if (!timeo)
1884                         break;
1885         }
1886         finish_wait(sk->sk_sleep, &wait);
1887         return err;
1888 }
1889
1890 /*
1891  *      This will accept the next outstanding connection.
1892  */
1893
1894 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1895 {
1896         struct tcp_sock *tp = tcp_sk(sk);
1897         struct open_request *req;
1898         struct sock *newsk;
1899         int error;
1900
1901         lock_sock(sk);
1902
1903         /* We need to make sure that this socket is listening,
1904          * and that it has something pending.
1905          */
1906         error = -EINVAL;
1907         if (sk->sk_state != TCP_LISTEN)
1908                 goto out;
1909
1910         /* Find already established connection */
1911         if (!tp->accept_queue) {
1912                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1913
1914                 /* If this is a non-blocking socket, don't sleep */
1915                 error = -EAGAIN;
1916                 if (!timeo)
1917                         goto out;
1918
1919                 error = wait_for_connect(sk, timeo);
1920                 if (error)
1921                         goto out;
1922         }
1923
1924         req = tp->accept_queue;
1925         if ((tp->accept_queue = req->dl_next) == NULL)
1926                 tp->accept_queue_tail = NULL;
1927
1928         newsk = req->sk;
1929         sk_acceptq_removed(sk);
1930         tcp_openreq_fastfree(req);
1931         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1932         release_sock(sk);
1933         return newsk;
1934
1935 out:
1936         release_sock(sk);
1937         *err = error;
1938         return NULL;
1939 }
1940
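/*
 * Userspace sketch of the non-blocking case above (timeo == 0 yields
 * -EAGAIN): retry accept() when it reports EAGAIN/EWOULDBLOCK, usually
 * after poll()/select() says the listener is readable.  'lfd' is a
 * hypothetical O_NONBLOCK listening socket.
 *
 *	#include <errno.h>
 *	#include <sys/socket.h>
 *
 *	int cfd = accept(lfd, NULL, NULL);
 *
 *	if (cfd < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
 *		;				(nothing queued yet, poll and retry)
 */
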
1941 /*
1942  *      Socket option code for TCP.
1943  */
1944 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1945                    int optlen)
1946 {
1947         struct tcp_sock *tp = tcp_sk(sk);
1948         int val;
1949         int err = 0;
1950
1951         if (level != SOL_TCP)
1952                 return tp->af_specific->setsockopt(sk, level, optname,
1953                                                    optval, optlen);
1954
1955         if (optlen < sizeof(int))
1956                 return -EINVAL;
1957
1958         if (get_user(val, (int __user *)optval))
1959                 return -EFAULT;
1960
1961         lock_sock(sk);
1962
1963         switch (optname) {
1964         case TCP_MAXSEG:
1965                 /* Values greater than the interface MTU won't take effect.
1966                  * However, at the point when this call is made we typically
1967                  * don't yet know which interface is going to be used. */
1968                 if (val < 8 || val > MAX_TCP_WINDOW) {
1969                         err = -EINVAL;
1970                         break;
1971                 }
1972                 tp->rx_opt.user_mss = val;
1973                 break;
1974
1975         case TCP_NODELAY:
1976                 if (val) {
1977                         /* TCP_NODELAY is weaker than TCP_CORK, so that
1978                          * this option on corked socket is remembered, but
1979                          * it is not activated until cork is cleared.
1980                          *
1981                          * However, when TCP_NODELAY is set we make
1982                          * an explicit push, which overrides even TCP_CORK
1983                          * for currently queued segments.
1984                          */
1985                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1986                         tcp_push_pending_frames(sk, tp);
1987                 } else {
1988                         tp->nonagle &= ~TCP_NAGLE_OFF;
1989                 }
1990                 break;
1991
1992         case TCP_CORK:
1993                 /* When set indicates to always queue non-full frames.
1994                  * Later the user clears this option and we transmit
1995                  * any pending partial frames in the queue.  This is
1996                  * meant to be used alongside sendfile() to get properly
1997                  * filled frames when the user (for example) must write
1998                  * out headers with a write() call first and then use
1999                  * sendfile to send out the data parts.
2000                  *
2001                  * TCP_CORK can be set together with TCP_NODELAY and it is
2002                  * stronger than TCP_NODELAY.
2003                  */
2004                 if (val) {
2005                         tp->nonagle |= TCP_NAGLE_CORK;
2006                 } else {
2007                         tp->nonagle &= ~TCP_NAGLE_CORK;
2008                         if (tp->nonagle&TCP_NAGLE_OFF)
2009                                 tp->nonagle |= TCP_NAGLE_PUSH;
2010                         tcp_push_pending_frames(sk, tp);
2011                 }
2012                 break;
2013
2014         case TCP_KEEPIDLE:
2015                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2016                         err = -EINVAL;
2017                 else {
2018                         tp->keepalive_time = val * HZ;
2019                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2020                             !((1 << sk->sk_state) &
2021                               (TCPF_CLOSE | TCPF_LISTEN))) {
2022                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2023                                 if (tp->keepalive_time > elapsed)
2024                                         elapsed = tp->keepalive_time - elapsed;
2025                                 else
2026                                         elapsed = 0;
2027                                 tcp_reset_keepalive_timer(sk, elapsed);
2028                         }
2029                 }
2030                 break;
2031         case TCP_KEEPINTVL:
2032                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2033                         err = -EINVAL;
2034                 else
2035                         tp->keepalive_intvl = val * HZ;
2036                 break;
2037         case TCP_KEEPCNT:
2038                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2039                         err = -EINVAL;
2040                 else
2041                         tp->keepalive_probes = val;
2042                 break;
2043         case TCP_SYNCNT:
2044                 if (val < 1 || val > MAX_TCP_SYNCNT)
2045                         err = -EINVAL;
2046                 else
2047                         tp->syn_retries = val;
2048                 break;
2049
2050         case TCP_LINGER2:
2051                 if (val < 0)
2052                         tp->linger2 = -1;
2053                 else if (val > sysctl_tcp_fin_timeout / HZ)
2054                         tp->linger2 = 0;
2055                 else
2056                         tp->linger2 = val * HZ;
2057                 break;
2058
2059         case TCP_DEFER_ACCEPT:
2060                 tp->defer_accept = 0;
2061                 if (val > 0) {
2062                         /* Translate value in seconds to number of
2063                          * retransmits */
2064                         while (tp->defer_accept < 32 &&
2065                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2066                                        tp->defer_accept))
2067                                 tp->defer_accept++;
2068                         tp->defer_accept++;
2069                 }
2070                 break;
2071
2072         case TCP_WINDOW_CLAMP:
2073                 if (!val) {
2074                         if (sk->sk_state != TCP_CLOSE) {
2075                                 err = -EINVAL;
2076                                 break;
2077                         }
2078                         tp->window_clamp = 0;
2079                 } else
2080                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2081                                                 SOCK_MIN_RCVBUF / 2 : val;
2082                 break;
2083
2084         case TCP_QUICKACK:
2085                 if (!val) {
2086                         tp->ack.pingpong = 1;
2087                 } else {
2088                         tp->ack.pingpong = 0;
2089                         if ((1 << sk->sk_state) &
2090                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2091                             tcp_ack_scheduled(tp)) {
2092                                 tp->ack.pending |= TCP_ACK_PUSHED;
2093                                 cleanup_rbuf(sk, 1);
2094                                 if (!(val & 1))
2095                                         tp->ack.pingpong = 1;
2096                         }
2097                 }
2098                 break;
2099
2100         default:
2101                 err = -ENOPROTOOPT;
2102                 break;
2103         };
2104         release_sock(sk);
2105         return err;
2106 }
2107
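/*
 * A minimal userspace sketch of the TCP_CORK pattern described in the
 * TCP_CORK case above: cork, write the headers, sendfile() the body,
 * then uncork so any partial frame is pushed out.  'fd', 'filefd',
 * 'hdr', 'hdr_len' and 'filesize' are hypothetical.
 *
 *	#include <netinet/tcp.h>
 *	#include <sys/sendfile.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int one = 1, zero = 0;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &one, sizeof(one));
 *	write(fd, hdr, hdr_len);		(headers are queued)
 *	sendfile(fd, filefd, NULL, filesize);	(body)
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &zero, sizeof(zero));
 */
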
2108 /* Return information about the state of a TCP endpoint in API format. */
2109 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2110 {
2111         struct tcp_sock *tp = tcp_sk(sk);
2112         u32 now = tcp_time_stamp;
2113
2114         memset(info, 0, sizeof(*info));
2115
2116         info->tcpi_state = sk->sk_state;
2117         info->tcpi_ca_state = tp->ca_state;
2118         info->tcpi_retransmits = tp->retransmits;
2119         info->tcpi_probes = tp->probes_out;
2120         info->tcpi_backoff = tp->backoff;
2121
2122         if (tp->rx_opt.tstamp_ok)
2123                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2124         if (tp->rx_opt.sack_ok)
2125                 info->tcpi_options |= TCPI_OPT_SACK;
2126         if (tp->rx_opt.wscale_ok) {
2127                 info->tcpi_options |= TCPI_OPT_WSCALE;
2128                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2129                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2130         } 
2131
2132         if (tp->ecn_flags&TCP_ECN_OK)
2133                 info->tcpi_options |= TCPI_OPT_ECN;
2134
2135         info->tcpi_rto = jiffies_to_usecs(tp->rto);
2136         info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2137         info->tcpi_snd_mss = tp->mss_cache_std;
2138         info->tcpi_rcv_mss = tp->ack.rcv_mss;
2139
2140         info->tcpi_unacked = tp->packets_out;
2141         info->tcpi_sacked = tp->sacked_out;
2142         info->tcpi_lost = tp->lost_out;
2143         info->tcpi_retrans = tp->retrans_out;
2144         info->tcpi_fackets = tp->fackets_out;
2145
2146         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2147         info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2148         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2149
2150         info->tcpi_pmtu = tp->pmtu_cookie;
2151         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2152         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2153         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2154         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2155         info->tcpi_snd_cwnd = tp->snd_cwnd;
2156         info->tcpi_advmss = tp->advmss;
2157         info->tcpi_reordering = tp->reordering;
2158
2159         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2160         info->tcpi_rcv_space = tp->rcvq_space.space;
2161
2162         info->tcpi_total_retrans = tp->total_retrans;
2163 }
2164
2165 EXPORT_SYMBOL_GPL(tcp_get_info);
2166
2167 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2168                    int __user *optlen)
2169 {
2170         struct tcp_sock *tp = tcp_sk(sk);
2171         int val, len;
2172
2173         if (level != SOL_TCP)
2174                 return tp->af_specific->getsockopt(sk, level, optname,
2175                                                    optval, optlen);
2176
2177         if (get_user(len, optlen))
2178                 return -EFAULT;
2179
2180         len = min_t(unsigned int, len, sizeof(int));
2181
2182         if (len < 0)
2183                 return -EINVAL;
2184
2185         switch (optname) {
2186         case TCP_MAXSEG:
2187                 val = tp->mss_cache_std;
2188                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2189                         val = tp->rx_opt.user_mss;
2190                 break;
2191         case TCP_NODELAY:
2192                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2193                 break;
2194         case TCP_CORK:
2195                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2196                 break;
2197         case TCP_KEEPIDLE:
2198                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2199                 break;
2200         case TCP_KEEPINTVL:
2201                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2202                 break;
2203         case TCP_KEEPCNT:
2204                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2205                 break;
2206         case TCP_SYNCNT:
2207                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2208                 break;
2209         case TCP_LINGER2:
2210                 val = tp->linger2;
2211                 if (val >= 0)
2212                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2213                 break;
2214         case TCP_DEFER_ACCEPT:
2215                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2216                                                (tp->defer_accept - 1));
2217                 break;
2218         case TCP_WINDOW_CLAMP:
2219                 val = tp->window_clamp;
2220                 break;
2221         case TCP_INFO: {
2222                 struct tcp_info info;
2223
2224                 if (get_user(len, optlen))
2225                         return -EFAULT;
2226
2227                 tcp_get_info(sk, &info);
2228
2229                 len = min_t(unsigned int, len, sizeof(info));
2230                 if (put_user(len, optlen))
2231                         return -EFAULT;
2232                 if (copy_to_user(optval, &info, len))
2233                         return -EFAULT;
2234                 return 0;
2235         }
2236         case TCP_QUICKACK:
2237                 val = !tp->ack.pingpong;
2238                 break;
2239         default:
2240                 return -ENOPROTOOPT;
2241         };
2242
2243         if (put_user(len, optlen))
2244                 return -EFAULT;
2245         if (copy_to_user(optval, &val, len))
2246                 return -EFAULT;
2247         return 0;
2248 }
2249
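/*
 * Userspace sketch of the TCP_INFO case above: getsockopt() returns a
 * struct tcp_info filled in by tcp_get_info().  'fd' is a hypothetical
 * connected TCP socket.
 *
 *	#include <netinet/tcp.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt %u us, snd_cwnd %u\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 */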
2250
2251 extern void __skb_cb_too_small_for_tcp(int, int);
2252 extern void tcpdiag_init(void);
2253
2254 static __initdata unsigned long thash_entries;
2255 static int __init set_thash_entries(char *str)
2256 {
2257         if (!str)
2258                 return 0;
2259         thash_entries = simple_strtoul(str, &str, 0);
2260         return 1;
2261 }
2262 __setup("thash_entries=", set_thash_entries);
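
/*
 * The size of the established hash table can be overridden at boot by
 * appending, for example,
 *
 *	thash_entries=16384
 *
 * to the kernel command line; the value parsed here is handed to
 * alloc_large_system_hash() in tcp_init() below.
 */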
2263
2264 void __init tcp_init(void)
2265 {
2266         struct sk_buff *skb = NULL;
2267         int order, i;
2268
2269         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2270                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2271                                            sizeof(skb->cb));
2272
2273         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2274                                                    sizeof(struct open_request),
2275                                                0, SLAB_HWCACHE_ALIGN,
2276                                                NULL, NULL);
2277         if (!tcp_openreq_cachep)
2278                 panic("tcp_init: Cannot alloc open_request cache.");
2279
2280         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2281                                               sizeof(struct tcp_bind_bucket),
2282                                               0, SLAB_HWCACHE_ALIGN,
2283                                               NULL, NULL);
2284         if (!tcp_bucket_cachep)
2285                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2286
2287         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2288                                                 sizeof(struct tcp_tw_bucket),
2289                                                 0, SLAB_HWCACHE_ALIGN,
2290                                                 NULL, NULL);
2291         if (!tcp_timewait_cachep)
2292                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2293
2294         /* Size and allocate the main established and bind bucket
2295          * hash tables.
2296          *
2297          * The methodology is similar to that of the buffer cache.
2298          */
2299         tcp_ehash = (struct tcp_ehash_bucket *)
2300                 alloc_large_system_hash("TCP established",
2301                                         sizeof(struct tcp_ehash_bucket),
2302                                         thash_entries,
2303                                         (num_physpages >= 128 * 1024) ?
2304                                                 (25 - PAGE_SHIFT) :
2305                                                 (27 - PAGE_SHIFT),
2306                                         HASH_HIGHMEM,
2307                                         &tcp_ehash_size,
2308                                         NULL,
2309                                         0);
2310         tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2311         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2312                 rwlock_init(&tcp_ehash[i].lock);
2313                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2314         }
2315
2316         tcp_bhash = (struct tcp_bind_hashbucket *)
2317                 alloc_large_system_hash("TCP bind",
2318                                         sizeof(struct tcp_bind_hashbucket),
2319                                         tcp_ehash_size,
2320                                         (num_physpages >= 128 * 1024) ?
2321                                                 (25 - PAGE_SHIFT) :
2322                                                 (27 - PAGE_SHIFT),
2323                                         HASH_HIGHMEM,
2324                                         &tcp_bhash_size,
2325                                         NULL,
2326                                         64 * 1024);
2327         tcp_bhash_size = 1 << tcp_bhash_size;
2328         for (i = 0; i < tcp_bhash_size; i++) {
2329                 spin_lock_init(&tcp_bhash[i].lock);
2330                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2331         }
2332
2333         /* Try to be a bit smarter and adjust defaults depending
2334          * on available memory.
2335          */
2336         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2337                         (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2338                         order++)
2339                 ;
2340         if (order > 4) {
2341                 sysctl_local_port_range[0] = 32768;
2342                 sysctl_local_port_range[1] = 61000;
2343                 sysctl_tcp_max_tw_buckets = 180000;
2344                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2345                 sysctl_max_syn_backlog = 1024;
2346         } else if (order < 3) {
2347                 sysctl_local_port_range[0] = 1024 * (3 - order);
2348                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2349                 sysctl_tcp_max_orphans >>= (3 - order);
2350                 sysctl_max_syn_backlog = 128;
2351         }
2352         tcp_port_rover = sysctl_local_port_range[0] - 1;
2353
2354         sysctl_tcp_mem[0] =  768 << order;
2355         sysctl_tcp_mem[1] = 1024 << order;
2356         sysctl_tcp_mem[2] = 1536 << order;
2357
2358         if (order < 3) {
2359                 sysctl_tcp_wmem[2] = 64 * 1024;
2360                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2361                 sysctl_tcp_rmem[1] = 43689;
2362                 sysctl_tcp_rmem[2] = 2 * 43689;
2363         }
2364
2365         printk(KERN_INFO "TCP: Hash tables configured "
2366                "(established %d bind %d)\n",
2367                tcp_ehash_size << 1, tcp_bhash_size);
2368 }
2369
2370 EXPORT_SYMBOL(tcp_accept);
2371 EXPORT_SYMBOL(tcp_close);
2372 EXPORT_SYMBOL(tcp_destroy_sock);
2373 EXPORT_SYMBOL(tcp_disconnect);
2374 EXPORT_SYMBOL(tcp_getsockopt);
2375 EXPORT_SYMBOL(tcp_ioctl);
2376 EXPORT_SYMBOL(tcp_openreq_cachep);
2377 EXPORT_SYMBOL(tcp_poll);
2378 EXPORT_SYMBOL(tcp_read_sock);
2379 EXPORT_SYMBOL(tcp_recvmsg);
2380 EXPORT_SYMBOL(tcp_sendmsg);
2381 EXPORT_SYMBOL(tcp_sendpage);
2382 EXPORT_SYMBOL(tcp_setsockopt);
2383 EXPORT_SYMBOL(tcp_shutdown);
2384 EXPORT_SYMBOL(tcp_statistics);
2385 EXPORT_SYMBOL(tcp_timewait_cachep);