net/ipv4/tcp_ipv4.c  (linux-2.6.git, vserver 1.9.3)
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Added support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      open_request handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
47  *                                      year-long coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/tcp.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/xfrm.h>
71
72 #include <linux/inet.h>
73 #include <linux/ipv6.h>
74 #include <linux/stddef.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77 #include <linux/vserver/debug.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
92 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
94         .__tcp_lhash_users      =       ATOMIC_INIT(0),
95         .__tcp_lhash_wait
96           = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
97         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
98 };
99
100 /*
101  * This array holds the first and last local port number.
102  * For high-usage systems, use sysctl to change this to
103  * 32768-61000
104  */
105 int sysctl_local_port_range[2] = { 1024, 4999 };
106 int tcp_port_rover = 1024 - 1;
107
108 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
109                                  __u32 faddr, __u16 fport)
110 {
111         int h = (laddr ^ lport) ^ (faddr ^ fport);
112         h ^= h >> 16;
113         h ^= h >> 8;
114         return h & (tcp_ehash_size - 1);
115 }
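/*
 * A note on tcp_hashfn() above: the ports only touch the low 16 bits of
 * the XOR, so "h ^= h >> 16" folds the high address bits down onto them
 * and "h ^= h >> 8" spreads them a little further before the result is
 * masked with tcp_ehash_size - 1 (which assumes the established table
 * size is a power of two, otherwise the mask would not reach every bucket).
 */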
116
117 static __inline__ int tcp_sk_hashfn(struct sock *sk)
118 {
119         struct inet_opt *inet = inet_sk(sk);
120         __u32 laddr = inet->rcv_saddr;
121         __u16 lport = inet->num;
122         __u32 faddr = inet->daddr;
123         __u16 fport = inet->dport;
124
125         return tcp_hashfn(laddr, lport, faddr, fport);
126 }
127
128 /* Allocate and initialize a new TCP local port bind bucket.
129  * The bindhash mutex for snum's hash chain must be held here.
130  */
131 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
132                                           unsigned short snum)
133 {
134         struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
135                                                       SLAB_ATOMIC);
136         if (tb) {
137                 tb->port = snum;
138                 tb->fastreuse = 0;
139                 INIT_HLIST_HEAD(&tb->owners);
140                 hlist_add_head(&tb->node, &head->chain);
141         }
142         return tb;
143 }
144
145 /* Caller must hold hashbucket lock for this tb with local BH disabled */
146 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
147 {
148         if (hlist_empty(&tb->owners)) {
149                 __hlist_del(&tb->node);
150                 kmem_cache_free(tcp_bucket_cachep, tb);
151         }
152 }
153
154 /* Caller must disable local BH processing. */
155 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
156 {
157         struct tcp_bind_hashbucket *head =
158                                 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
159         struct tcp_bind_bucket *tb;
160
161         spin_lock(&head->lock);
162         tb = tcp_sk(sk)->bind_hash;
163         sk_add_bind_node(child, &tb->owners);
164         tcp_sk(child)->bind_hash = tb;
165         spin_unlock(&head->lock);
166 }
167
168 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
169 {
170         local_bh_disable();
171         __tcp_inherit_port(sk, child);
172         local_bh_enable();
173 }
174
175 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
176                    unsigned short snum)
177 {
178         inet_sk(sk)->num = snum;
179         sk_add_bind_node(sk, &tb->owners);
180         tcp_sk(sk)->bind_hash = tb;
181 }
182
183 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
184 {
185         struct sock *sk2;
186         struct hlist_node *node;
187         int reuse = sk->sk_reuse;
188
189         sk_for_each_bound(sk2, node, &tb->owners) {
190                 if (sk != sk2 &&
191                     !tcp_v6_ipv6only(sk2) &&
192                     (!sk->sk_bound_dev_if ||
193                      !sk2->sk_bound_dev_if ||
194                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195                         if (!reuse || !sk2->sk_reuse ||
196                             sk2->sk_state == TCP_LISTEN) {
197                                 if (nx_addr_conflict(sk->sk_nx_info,
198                                         tcp_v4_rcv_saddr(sk), sk2))
199                                         break;
200                         }
201                 }
202         }
203         return node != NULL;
204 }
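/*
 * tcp_bind_conflict() above leans on the hlist cursor: the loop only
 * break()s out when it finds a socket that really conflicts, so
 * "node != NULL" on exit means "conflict found".  Roughly, two sockets
 * conflict only if they could see the same traffic (same or wildcard
 * bound device), the new socket or the existing one lacks SO_REUSEADDR
 * (or the existing one is listening), and their local addresses overlap;
 * in this tree the overlap test is delegated to nx_addr_conflict(),
 * which also takes the socket's network context (nx_info) into account.
 */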
205
206 /* Obtain a reference to a local port for the given sock,
207  * if snum is zero it means select any available local port.
208  */
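/*
 * Roughly: with snum == 0 the search below starts one past tcp_port_rover
 * and walks the sysctl range looking for a port with no bind bucket at
 * all (a port with any existing bucket, shareable or not, is skipped);
 * with the default 1024-4999 range that is at most (4999 - 1024) + 1 =
 * 3976 candidates.  With an explicit snum the bucket is looked up (or
 * created) and the fastreuse / tcp_bind_conflict() rules decide whether
 * the caller may share it.
 */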
209 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
210 {
211         struct tcp_bind_hashbucket *head;
212         struct hlist_node *node;
213         struct tcp_bind_bucket *tb;
214         int ret;
215
216         local_bh_disable();
217         if (!snum) {
218                 int low = sysctl_local_port_range[0];
219                 int high = sysctl_local_port_range[1];
220                 int remaining = (high - low) + 1;
221                 int rover;
222
223                 spin_lock(&tcp_portalloc_lock);
224                 rover = tcp_port_rover;
225                 do {
226                         rover++;
227                         if (rover < low || rover > high)
228                                 rover = low;
229                         head = &tcp_bhash[tcp_bhashfn(rover)];
230                         spin_lock(&head->lock);
231                         tb_for_each(tb, node, &head->chain)
232                                 if (tb->port == rover)
233                                         goto next;
234                         break;
235                 next:
236                         spin_unlock(&head->lock);
237                 } while (--remaining > 0);
238                 tcp_port_rover = rover;
239                 spin_unlock(&tcp_portalloc_lock);
240
241                 /* Exhausted local port range during search? */
242                 ret = 1;
243                 if (remaining <= 0)
244                         goto fail;
245
246                 /* OK, here is the one we will use.  HEAD is
247                  * non-NULL and we hold its mutex.
248                  */
249                 snum = rover;
250         } else {
251                 head = &tcp_bhash[tcp_bhashfn(snum)];
252                 spin_lock(&head->lock);
253                 tb_for_each(tb, node, &head->chain)
254                         if (tb->port == snum)
255                                 goto tb_found;
256         }
257         tb = NULL;
258         goto tb_not_found;
259 tb_found:
260         if (!hlist_empty(&tb->owners)) {
261                 if (sk->sk_reuse > 1)
262                         goto success;
263                 if (tb->fastreuse > 0 &&
264                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
265                         goto success;
266                 } else {
267                         ret = 1;
268                         if (tcp_bind_conflict(sk, tb))
269                                 goto fail_unlock;
270                 }
271         }
272 tb_not_found:
273         ret = 1;
274         if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
275                 goto fail_unlock;
276         if (hlist_empty(&tb->owners)) {
277                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
278                         tb->fastreuse = 1;
279                 else
280                         tb->fastreuse = 0;
281         } else if (tb->fastreuse &&
282                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
283                 tb->fastreuse = 0;
284 success:
285         if (!tcp_sk(sk)->bind_hash)
286                 tcp_bind_hash(sk, tb, snum);
287         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
288         ret = 0;
289
290 fail_unlock:
291         spin_unlock(&head->lock);
292 fail:
293         local_bh_enable();
294         return ret;
295 }
296
297 /* Get rid of any references to a local port held by the
298  * given sock.
299  */
300 static void __tcp_put_port(struct sock *sk)
301 {
302         struct inet_opt *inet = inet_sk(sk);
303         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
304         struct tcp_bind_bucket *tb;
305
306         spin_lock(&head->lock);
307         tb = tcp_sk(sk)->bind_hash;
308         __sk_del_bind_node(sk);
309         tcp_sk(sk)->bind_hash = NULL;
310         inet->num = 0;
311         tcp_bucket_destroy(tb);
312         spin_unlock(&head->lock);
313 }
314
315 void tcp_put_port(struct sock *sk)
316 {
317         local_bh_disable();
318         __tcp_put_port(sk);
319         local_bh_enable();
320 }
321
322 /* This lock without WQ_FLAG_EXCLUSIVE is fine on UP, but it can be very bad on SMP.
323  * Look: when several writers sleep and a reader wakes them up, all but one
324  * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
325  * this, _but_ remember, it adds useless work on UP machines (a wake up on each
326  * exclusive lock release). It should really be ifdefed.
327  */
328
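/*
 * tcp_listen_wlock() takes the listening-hash write lock and then waits,
 * on an exclusive waitqueue entry, until __tcp_lhash_users drops to zero,
 * i.e. until every lockless reader of the listening hash has finished
 * (readers bump that counter elsewhere, presumably via tcp_listen_lock()
 * in the header).  While waiting it drops and retakes the lock so those
 * readers can actually drain; the matching wake_up(&tcp_lhash_wait) is
 * done after the hash update, see __tcp_v4_hash() and tcp_unhash() below.
 */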
329 void tcp_listen_wlock(void)
330 {
331         write_lock(&tcp_lhash_lock);
332
333         if (atomic_read(&tcp_lhash_users)) {
334                 DEFINE_WAIT(wait);
335
336                 for (;;) {
337                         prepare_to_wait_exclusive(&tcp_lhash_wait,
338                                                 &wait, TASK_UNINTERRUPTIBLE);
339                         if (!atomic_read(&tcp_lhash_users))
340                                 break;
341                         write_unlock_bh(&tcp_lhash_lock);
342                         schedule();
343                         write_lock_bh(&tcp_lhash_lock);
344                 }
345
346                 finish_wait(&tcp_lhash_wait, &wait);
347         }
348 }
349
350 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
351 {
352         struct hlist_head *list;
353         rwlock_t *lock;
354
355         BUG_TRAP(sk_unhashed(sk));
356         if (listen_possible && sk->sk_state == TCP_LISTEN) {
357                 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
358                 lock = &tcp_lhash_lock;
359                 tcp_listen_wlock();
360         } else {
361                 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
362                 lock = &tcp_ehash[sk->sk_hashent].lock;
363                 write_lock(lock);
364         }
365         __sk_add_node(sk, list);
366         sock_prot_inc_use(sk->sk_prot);
367         write_unlock(lock);
368         if (listen_possible && sk->sk_state == TCP_LISTEN)
369                 wake_up(&tcp_lhash_wait);
370 }
371
372 static void tcp_v4_hash(struct sock *sk)
373 {
374         if (sk->sk_state != TCP_CLOSE) {
375                 local_bh_disable();
376                 __tcp_v4_hash(sk, 1);
377                 local_bh_enable();
378         }
379 }
380
381 void tcp_unhash(struct sock *sk)
382 {
383         rwlock_t *lock;
384
385         if (sk_unhashed(sk))
386                 goto ende;
387
388         if (sk->sk_state == TCP_LISTEN) {
389                 local_bh_disable();
390                 tcp_listen_wlock();
391                 lock = &tcp_lhash_lock;
392         } else {
393                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
394                 lock = &head->lock;
395                 write_lock_bh(&head->lock);
396         }
397
398         if (__sk_del_node_init(sk))
399                 sock_prot_dec_use(sk->sk_prot);
400         write_unlock_bh(lock);
401
402  ende:
403         if (sk->sk_state == TCP_LISTEN)
404                 wake_up(&tcp_lhash_wait);
405 }
406
407
408 /*
409  *      Check if a given address matches for a tcp socket
410  *
411  *      nxi:    the socket's nx_info if any
412  *      addr:   to be verified address
413  *      saddr:  socket addresses
414  */
415 static inline int tcp_addr_match (
416         struct nx_info *nxi,
417         uint32_t addr,
418         uint32_t saddr)
419 {
420         if (addr && (saddr == addr))
421                 return 1;
422         if (!saddr)
423                 return addr_in_nx_info(nxi, addr);
424         return 0;
425 }
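/*
 * In other words: a socket bound to a specific local address only matches
 * that exact address, while an unbound socket (saddr == 0) matches any
 * address that belongs to its network context -- addr_in_nx_info()
 * (defined elsewhere in the vserver patch, not in this file) is what
 * replaces the classic "INADDR_ANY matches everything" rule here.
 */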
426
427 /* Don't inline this cruft.  There are some nice properties to
428  * exploit here.  The BSD API does not allow a listening TCP
429  * to specify the remote port nor the remote address for the
430  * connection.  So always assume those are both wildcarded
431  * during the search since they can never be otherwise.
432  */
433 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
434                                              unsigned short hnum, int dif)
435 {
436         struct sock *result = NULL, *sk;
437         struct hlist_node *node;
438         int score, hiscore;
439
440         hiscore=-1;
441         sk_for_each(sk, node, head) {
442                 struct inet_opt *inet = inet_sk(sk);
443
444                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
445                         __u32 rcv_saddr = inet->rcv_saddr;
446
447                         score = (sk->sk_family == PF_INET ? 1 : 0);
448                         if (tcp_addr_match(sk->sk_nx_info, daddr, rcv_saddr))
449                                 score+=2;
450                         else
451                                 continue;
452                         if (sk->sk_bound_dev_if) {
453                                 if (sk->sk_bound_dev_if != dif)
454                                         continue;
455                                 score+=2;
456                         }
457                         if (score == 5)
458                                 return sk;
459                         if (score > hiscore) {
460                                 hiscore = score;
461                                 result = sk;
462                         }
463                 }
464         }
465         return result;
466 }
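/*
 * The scoring above: an AF_INET socket starts with 1 point (so a plain
 * IPv4 listener beats an IPv6 one accepting mapped addresses), a local
 * address match adds 2, and a matching bound interface adds another 2.
 * A perfect score of 5 is returned immediately; otherwise the highest
 * scoring, i.e. most specific, listener wins.
 */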
467
468 /* Optimize the common listener case. */
469 inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
470                                            int dif)
471 {
472         struct sock *sk = NULL;
473         struct hlist_head *head;
474
475         read_lock(&tcp_lhash_lock);
476         head = &tcp_listening_hash[tcp_lhashfn(hnum)];
477         if (!hlist_empty(head)) {
478                 struct inet_opt *inet = inet_sk((sk = __sk_head(head)));
479
480                 if (inet->num == hnum && !sk->sk_node.next &&
481                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
482                     tcp_addr_match(sk->sk_nx_info, daddr, inet->rcv_saddr) &&
483                     !sk->sk_bound_dev_if)
484                         goto sherry_cache;
485                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
486         }
487         if (sk) {
488 sherry_cache:
489                 sock_hold(sk);
490         }
491         read_unlock(&tcp_lhash_lock);
492         return sk;
493 }
494
495 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
496  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
497  *
498  * Local BH must be disabled here.
499  */
500
501 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
502                                                        u32 daddr, u16 hnum,
503                                                        int dif)
504 {
505         struct tcp_ehash_bucket *head;
506         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
507         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
508         struct sock *sk;
509         struct hlist_node *node;
510         /* Optimize here for direct hit, only listening connections can
511          * have wildcards anyway.
512          */
513         int hash = tcp_hashfn(daddr, hnum, saddr, sport);
514         head = &tcp_ehash[hash];
515         read_lock(&head->lock);
516         sk_for_each(sk, node, &head->chain) {
517                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
518                         goto hit; /* You sunk my battleship! */
519         }
520
521         /* Must check for a TIME_WAIT'er before going to listener hash. */
522         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
523                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
524                         goto hit;
525         }
526         sk = NULL;
527 out:
528         read_unlock(&head->lock);
529         return sk;
530 hit:
531         sock_hold(sk);
532         goto out;
533 }
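/*
 * Note the second loop above: as the changelog at the top of the file
 * says, the established hash table is split in half, and TIME_WAIT
 * buckets live in the chain at (head + tcp_ehash_size), i.e. in the
 * upper half, walked here under the same per-bucket lock as the
 * corresponding established chain.
 */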
534
535 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
536                                            u32 daddr, u16 hnum, int dif)
537 {
538         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
539                                                       daddr, hnum, dif);
540
541         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
542 }
543
544 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
545                                   u16 dport, int dif)
546 {
547         struct sock *sk;
548
549         local_bh_disable();
550         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
551         local_bh_enable();
552
553         return sk;
554 }
555
556 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
557 {
558         return secure_tcp_sequence_number(skb->nh.iph->daddr,
559                                           skb->nh.iph->saddr,
560                                           skb->h.th->dest,
561                                           skb->h.th->source);
562 }
563
564 /* called with local bh disabled */
565 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
566                                       struct tcp_tw_bucket **twp)
567 {
568         struct inet_opt *inet = inet_sk(sk);
569         u32 daddr = inet->rcv_saddr;
570         u32 saddr = inet->daddr;
571         int dif = sk->sk_bound_dev_if;
572         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
573         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
574         int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
575         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
576         struct sock *sk2;
577         struct hlist_node *node;
578         struct tcp_tw_bucket *tw;
579
580         write_lock(&head->lock);
581
582         /* Check TIME-WAIT sockets first. */
583         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
584                 tw = (struct tcp_tw_bucket *)sk2;
585
586                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
587                         struct tcp_opt *tp = tcp_sk(sk);
588
589                         /* With PAWS, it is safe from the viewpoint
590                            of data integrity. Even without PAWS it
591                            is safe provided sequence spaces do not
592                            overlap i.e. at data rates <= 80Mbit/sec.
593
594                            Actually, the idea is close to VJ's, only the
595                            timestamp cache is held not per host but per
596                            port pair, and the TW bucket is used as the
597                            state holder.
598
599                            If the TW bucket has already been destroyed we
600                            fall back to VJ's scheme and use the initial
601                            timestamp retrieved from the peer table.
602                          */
603                         if (tw->tw_ts_recent_stamp &&
604                             (!twp || (sysctl_tcp_tw_reuse &&
605                                       xtime.tv_sec -
606                                       tw->tw_ts_recent_stamp > 1))) {
607                                 if ((tp->write_seq =
608                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
609                                         tp->write_seq = 1;
610                                 tp->ts_recent       = tw->tw_ts_recent;
611                                 tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
612                                 sock_hold(sk2);
613                                 goto unique;
614                         } else
615                                 goto not_unique;
616                 }
617         }
618         tw = NULL;
619
620         /* And established part... */
621         sk_for_each(sk2, node, &head->chain) {
622                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
623                         goto not_unique;
624         }
625
626 unique:
627         /* Must record num and sport now. Otherwise we will see a
628          * socket with a funny identity in the hash table. */
629         inet->num = lport;
630         inet->sport = htons(lport);
631         sk->sk_hashent = hash;
632         BUG_TRAP(sk_unhashed(sk));
633         __sk_add_node(sk, &head->chain);
634         sock_prot_inc_use(sk->sk_prot);
635         write_unlock(&head->lock);
636
637         if (twp) {
638                 *twp = tw;
639                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
640         } else if (tw) {
641                 /* Silly. Should hash-dance instead... */
642                 tcp_tw_deschedule(tw);
643                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
644
645                 tcp_tw_put(tw);
646         }
647
648         return 0;
649
650 not_unique:
651         write_unlock(&head->lock);
652         return -EADDRNOTAVAIL;
653 }
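/*
 * The TIME-WAIT recycling above: a connect() may take over an old
 * TIME-WAIT socket with the same four-tuple when the bucket still carries
 * a timestamp (and, for the sysctl_tcp_tw_reuse case, that timestamp is
 * more than a second old).  The new write_seq is seeded at
 * tw_snd_nxt + 65535 + 2, i.e. past the largest unscaled window the
 * previous incarnation could still have in flight, and ts_recent is
 * inherited so PAWS keeps stray old segments out.
 */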
654
655 /*
656  * Bind a port for a connect operation and hash it.
657  */
658 static int tcp_v4_hash_connect(struct sock *sk)
659 {
660         unsigned short snum = inet_sk(sk)->num;
661         struct tcp_bind_hashbucket *head;
662         struct tcp_bind_bucket *tb;
663         int ret;
664
665         if (!snum) {
666                 int rover;
667                 int low = sysctl_local_port_range[0];
668                 int high = sysctl_local_port_range[1];
669                 int remaining = (high - low) + 1;
670                 struct hlist_node *node;
671                 struct tcp_tw_bucket *tw = NULL;
672
673                 local_bh_disable();
674
675                 /* TODO. Actually it is not such a bad idea to remove
676                  * tcp_portalloc_lock before the next submission to Linus.
677                  * As soon as we touch this place at all it is time to think.
678                  *
679                  * Right now it protects a single _advisory_ variable,
680                  * tcp_port_rover, hence it is mostly useless.
681                  * The code will work nicely if we just delete it, but
682                  * I am afraid that in the contended case it will work no
683                  * better or even worse: another cpu will just hit the same
684                  * bucket and spin there.
685                  * So some per-cpu salt could remove both the contention and
686                  * the memory pingpong. Any ideas how to do this in a nice way?
687                  */
688                 spin_lock(&tcp_portalloc_lock);
689                 rover = tcp_port_rover;
690
691                 do {
692                         rover++;
693                         if ((rover < low) || (rover > high))
694                                 rover = low;
695                         head = &tcp_bhash[tcp_bhashfn(rover)];
696                         spin_lock(&head->lock);
697
698                         /* Does not bother with rcv_saddr checks,
699                          * because the established check is already
700                          * unique enough.
701                          */
702                         tb_for_each(tb, node, &head->chain) {
703                                 if (tb->port == rover) {
704                                         BUG_TRAP(!hlist_empty(&tb->owners));
705                                         if (tb->fastreuse >= 0)
706                                                 goto next_port;
707                                         if (!__tcp_v4_check_established(sk,
708                                                                         rover,
709                                                                         &tw))
710                                                 goto ok;
711                                         goto next_port;
712                                 }
713                         }
714
715                         tb = tcp_bucket_create(head, rover);
716                         if (!tb) {
717                                 spin_unlock(&head->lock);
718                                 break;
719                         }
720                         tb->fastreuse = -1;
721                         goto ok;
722
723                 next_port:
724                         spin_unlock(&head->lock);
725                 } while (--remaining > 0);
726                 tcp_port_rover = rover;
727                 spin_unlock(&tcp_portalloc_lock);
728
729                 local_bh_enable();
730
731                 return -EADDRNOTAVAIL;
732
733 ok:
734                 /* All locks still held and bhs disabled */
735                 tcp_port_rover = rover;
736                 spin_unlock(&tcp_portalloc_lock);
737
738                 tcp_bind_hash(sk, tb, rover);
739                 if (sk_unhashed(sk)) {
740                         inet_sk(sk)->sport = htons(rover);
741                         __tcp_v4_hash(sk, 0);
742                 }
743                 spin_unlock(&head->lock);
744
745                 if (tw) {
746                         tcp_tw_deschedule(tw);
747                         tcp_tw_put(tw);
748                 }
749
750                 ret = 0;
751                 goto out;
752         }
753
754         head  = &tcp_bhash[tcp_bhashfn(snum)];
755         tb  = tcp_sk(sk)->bind_hash;
756         spin_lock_bh(&head->lock);
757         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
758                 __tcp_v4_hash(sk, 0);
759                 spin_unlock_bh(&head->lock);
760                 return 0;
761         } else {
762                 spin_unlock(&head->lock);
763                 /* No definite answer... Walk to established hash table */
764                 ret = __tcp_v4_check_established(sk, snum, NULL);
765 out:
766                 local_bh_enable();
767                 return ret;
768         }
769 }
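/*
 * The fastreuse = -1 marking above is what keeps connect()-allocated ports
 * apart from bind()-allocated ones: the rover walk skips every bucket with
 * fastreuse >= 0, so an ephemeral port is only shared with other outgoing
 * connections, and then only after __tcp_v4_check_established() has proven
 * the full four-tuple unique (possibly by recycling a TIME-WAIT bucket).
 */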
770
771 /* This will initiate an outgoing connection. */
772 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
773 {
774         struct inet_opt *inet = inet_sk(sk);
775         struct tcp_opt *tp = tcp_sk(sk);
776         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
777         struct rtable *rt;
778         u32 daddr, nexthop;
779         int tmp;
780         int err;
781
782         if (addr_len < sizeof(struct sockaddr_in))
783                 return -EINVAL;
784
785         if (usin->sin_family != AF_INET)
786                 return -EAFNOSUPPORT;
787
788         nexthop = daddr = usin->sin_addr.s_addr;
789         if (inet->opt && inet->opt->srr) {
790                 if (!daddr)
791                         return -EINVAL;
792                 nexthop = inet->opt->faddr;
793         }
794
795         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
796                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
797                                IPPROTO_TCP,
798                                inet->sport, usin->sin_port, sk);
799         if (tmp < 0)
800                 return tmp;
801
802         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
803                 ip_rt_put(rt);
804                 return -ENETUNREACH;
805         }
806
807         if (!inet->opt || !inet->opt->srr)
808                 daddr = rt->rt_dst;
809
810         if (!inet->saddr)
811                 inet->saddr = rt->rt_src;
812         inet->rcv_saddr = inet->saddr;
813
814         if (tp->ts_recent_stamp && inet->daddr != daddr) {
815                 /* Reset inherited state */
816                 tp->ts_recent       = 0;
817                 tp->ts_recent_stamp = 0;
818                 tp->write_seq       = 0;
819         }
820
821         if (sysctl_tcp_tw_recycle &&
822             !tp->ts_recent_stamp && rt->rt_dst == daddr) {
823                 struct inet_peer *peer = rt_get_peer(rt);
824
825                 /* VJ's idea. We save the last timestamp seen from
826                  * the destination in the peer table when entering state TIME-WAIT,
827                  * and initialize ts_recent from it when trying a new connection.
828                  */
829
830                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
831                         tp->ts_recent_stamp = peer->tcp_ts_stamp;
832                         tp->ts_recent = peer->tcp_ts;
833                 }
834         }
835
836         inet->dport = usin->sin_port;
837         inet->daddr = daddr;
838
839         tp->ext_header_len = 0;
840         if (inet->opt)
841                 tp->ext_header_len = inet->opt->optlen;
842
843         tp->mss_clamp = 536;
844
845         /* Socket identity is still unknown (sport may be zero).
846          * However we set the state to SYN-SENT and, without releasing the
847          * socket lock, select a source port, enter ourselves into the hash
848          * tables and complete initialization after this.
849          */
850         tcp_set_state(sk, TCP_SYN_SENT);
851         err = tcp_v4_hash_connect(sk);
852         if (err)
853                 goto failure;
854
855         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
856         if (err)
857                 goto failure;
858
859         /* OK, now commit destination to socket.  */
860         __sk_dst_set(sk, &rt->u.dst);
861         tcp_v4_setup_caps(sk, &rt->u.dst);
862         tp->ext2_header_len = rt->u.dst.header_len;
863
864         if (!tp->write_seq)
865                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
866                                                            inet->daddr,
867                                                            inet->sport,
868                                                            usin->sin_port);
869
870         inet->id = tp->write_seq ^ jiffies;
871
872         err = tcp_connect(sk);
873         rt = NULL;
874         if (err)
875                 goto failure;
876
877         return 0;
878
879 failure:
880         /* This unhashes the socket and releases the local port, if necessary. */
881         tcp_set_state(sk, TCP_CLOSE);
882         ip_rt_put(rt);
883         sk->sk_route_caps = 0;
884         inet->dport = 0;
885         return err;
886 }
887
888 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
889 {
890         return ((struct rtable *)skb->dst)->rt_iif;
891 }
892
893 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
894 {
895         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
896 }
897
898 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
899                                               struct open_request ***prevp,
900                                               __u16 rport,
901                                               __u32 raddr, __u32 laddr)
902 {
903         struct tcp_listen_opt *lopt = tp->listen_opt;
904         struct open_request *req, **prev;
905
906         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
907              (req = *prev) != NULL;
908              prev = &req->dl_next) {
909                 if (req->rmt_port == rport &&
910                     req->af.v4_req.rmt_addr == raddr &&
911                     req->af.v4_req.loc_addr == laddr &&
912                     TCP_INET_FAMILY(req->class->family)) {
913                         BUG_TRAP(!req->sk);
914                         *prevp = prev;
915                         break;
916                 }
917         }
918
919         return req;
920 }
921
922 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
923 {
924         struct tcp_opt *tp = tcp_sk(sk);
925         struct tcp_listen_opt *lopt = tp->listen_opt;
926         u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
927
928         req->expires = jiffies + TCP_TIMEOUT_INIT;
929         req->retrans = 0;
930         req->sk = NULL;
931         req->dl_next = lopt->syn_table[h];
932
933         write_lock(&tp->syn_wait_lock);
934         lopt->syn_table[h] = req;
935         write_unlock(&tp->syn_wait_lock);
936
937         tcp_synq_added(sk);
938 }
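/*
 * New open requests are prepended to their jhash bucket above (keyed by
 * remote address and port, salted with the per-listener hash_rnd) under
 * syn_wait_lock, stamped with an initial TCP_TIMEOUT_INIT expiry, and
 * accounted via tcp_synq_added(), which is what tcp_synq_is_full() and
 * tcp_synq_young() consult later in tcp_v4_conn_request().
 */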
939
940
941 /*
942  * This routine does path mtu discovery as defined in RFC1191.
943  */
944 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
945                                      u32 mtu)
946 {
947         struct dst_entry *dst;
948         struct inet_opt *inet = inet_sk(sk);
949         struct tcp_opt *tp = tcp_sk(sk);
950
951         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
952          * sent out by Linux are always <576 bytes so they should go through
953          * unfragmented).
954          */
955         if (sk->sk_state == TCP_LISTEN)
956                 return;
957
958         /* We don't check in the dst entry if pmtu discovery is forbidden
959          * on this route. We just assume that no packet-too-big packets
960          * are sent back when pmtu discovery is not active.
961          * There is a small race when the user changes this flag in the
962          * route, but I think that's acceptable.
963          */
964         if ((dst = __sk_dst_check(sk, 0)) == NULL)
965                 return;
966
967         dst->ops->update_pmtu(dst, mtu);
968
969         /* Something is about to go wrong... Remember the soft error
970          * in case this connection is not able to recover.
971          */
972         if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
973                 sk->sk_err_soft = EMSGSIZE;
974
975         mtu = dst_pmtu(dst);
976
977         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
978             tp->pmtu_cookie > mtu) {
979                 tcp_sync_mss(sk, mtu);
980
981                 /* Resend the TCP packet because it's
982                  * clear that the old packet has been
983                  * dropped. This is the new "fast" path mtu
984                  * discovery.
985                  */
986                 tcp_simple_retransmit(sk);
987         } /* else let the usual retransmit timer handle it */
988 }
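/*
 * So the "fast path" MTU discovery mentioned in the changelog amounts to:
 * push the reported MTU into the socket's cached route, record a soft
 * EMSGSIZE when we set DF but the route will not go as low as the report,
 * and, unless PMTU discovery is disabled on the socket, shrink the MSS
 * with tcp_sync_mss() and retransmit at once via tcp_simple_retransmit()
 * instead of waiting for the retransmit timer.
 */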
989
990 /*
991  * This routine is called by the ICMP module when it gets some
992  * sort of error condition.  If err < 0 then the socket should
993  * be closed and the error returned to the user.  If err > 0
994  * it's just the icmp type << 8 | icmp code.  After adjustment
995  * header points to the first 8 bytes of the tcp header.  We need
996  * to find the appropriate port.
997  *
998  * The locking strategy used here is very "optimistic". When
999  * someone else accesses the socket the ICMP is just dropped
1000  * and for some paths there is no check at all.
1001  * A more general error queue to queue errors for later handling
1002  * is probably better.
1003  *
1004  */
1005
1006 void tcp_v4_err(struct sk_buff *skb, u32 info)
1007 {
1008         struct iphdr *iph = (struct iphdr *)skb->data;
1009         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
1010         struct tcp_opt *tp;
1011         struct inet_opt *inet;
1012         int type = skb->h.icmph->type;
1013         int code = skb->h.icmph->code;
1014         struct sock *sk;
1015         __u32 seq;
1016         int err;
1017
1018         if (skb->len < (iph->ihl << 2) + 8) {
1019                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1020                 return;
1021         }
1022
1023         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
1024                            th->source, tcp_v4_iif(skb));
1025         if (!sk) {
1026                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1027                 return;
1028         }
1029         if (sk->sk_state == TCP_TIME_WAIT) {
1030                 tcp_tw_put((struct tcp_tw_bucket *)sk);
1031                 return;
1032         }
1033
1034         bh_lock_sock(sk);
1035         /* If too many ICMPs get dropped on busy
1036          * servers this needs to be solved differently.
1037          */
1038         if (sock_owned_by_user(sk))
1039                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1040
1041         if (sk->sk_state == TCP_CLOSE)
1042                 goto out;
1043
1044         tp = tcp_sk(sk);
1045         seq = ntohl(th->seq);
1046         if (sk->sk_state != TCP_LISTEN &&
1047             !between(seq, tp->snd_una, tp->snd_nxt)) {
1048                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1049                 goto out;
1050         }
1051
1052         switch (type) {
1053         case ICMP_SOURCE_QUENCH:
1054                 /* Just silently ignore these. */
1055                 goto out;
1056         case ICMP_PARAMETERPROB:
1057                 err = EPROTO;
1058                 break;
1059         case ICMP_DEST_UNREACH:
1060                 if (code > NR_ICMP_UNREACH)
1061                         goto out;
1062
1063                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1064                         if (!sock_owned_by_user(sk))
1065                                 do_pmtu_discovery(sk, iph, info);
1066                         goto out;
1067                 }
1068
1069                 err = icmp_err_convert[code].errno;
1070                 break;
1071         case ICMP_TIME_EXCEEDED:
1072                 err = EHOSTUNREACH;
1073                 break;
1074         default:
1075                 goto out;
1076         }
1077
1078         switch (sk->sk_state) {
1079                 struct open_request *req, **prev;
1080         case TCP_LISTEN:
1081                 if (sock_owned_by_user(sk))
1082                         goto out;
1083
1084                 req = tcp_v4_search_req(tp, &prev, th->dest,
1085                                         iph->daddr, iph->saddr);
1086                 if (!req)
1087                         goto out;
1088
1089                 /* ICMPs are not backlogged, hence we cannot get
1090                    an established socket here.
1091                  */
1092                 BUG_TRAP(!req->sk);
1093
1094                 if (seq != req->snt_isn) {
1095                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1096                         goto out;
1097                 }
1098
1099                 /*
1100                  * Still in SYN_RECV, just remove it silently.
1101                  * There is no good way to pass the error to the newly
1102                  * created socket, and POSIX does not want network
1103                  * errors returned from accept().
1104                  */
1105                 tcp_synq_drop(sk, req, prev);
1106                 goto out;
1107
1108         case TCP_SYN_SENT:
1109         case TCP_SYN_RECV:  /* Cannot normally happen.
1110                                It can, for example, if SYNs crossed.
1111                              */
1112                 if (!sock_owned_by_user(sk)) {
1113                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1114                         sk->sk_err = err;
1115
1116                         sk->sk_error_report(sk);
1117
1118                         tcp_done(sk);
1119                 } else {
1120                         sk->sk_err_soft = err;
1121                 }
1122                 goto out;
1123         }
1124
1125         /* If we've already connected we will keep trying
1126          * until we time out, or the user gives up.
1127          *
1128          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
1129          * to be considered hard errors (well, FRAG_FAILED too,
1130          * but it is obsoleted by pmtu discovery).
1131          *
1132          * Note that in the modern internet, where routing is unreliable
1133          * and broken firewalls sit in every dark corner, sending random
1134          * errors ordered by their masters, even these two messages finally
1135          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
1136          *
1137          * Now we are in compliance with RFCs.
1138          *                                                      --ANK (980905)
1139          */
1140
1141         inet = inet_sk(sk);
1142         if (!sock_owned_by_user(sk) && inet->recverr) {
1143                 sk->sk_err = err;
1144                 sk->sk_error_report(sk);
1145         } else  { /* Only an error on timeout */
1146                 sk->sk_err_soft = err;
1147         }
1148
1149 out:
1150         bh_unlock_sock(sk);
1151         sock_put(sk);
1152 }
1153
1154 /* This routine computes an IPv4 TCP checksum. */
1155 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1156                        struct sk_buff *skb)
1157 {
1158         struct inet_opt *inet = inet_sk(sk);
1159
1160         if (skb->ip_summed == CHECKSUM_HW) {
1161                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1162                 skb->csum = offsetof(struct tcphdr, check);
1163         } else {
1164                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1165                                          csum_partial((char *)th,
1166                                                       th->doff << 2,
1167                                                       skb->csum));
1168         }
1169 }
1170
1171 /*
1172  *      This routine will send an RST to the other tcp.
1173  *
1174  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1175  *                    for the reset?
1176  *      Answer: if a packet caused the RST, it is not for a socket
1177  *              existing in our system; if it is matched to a socket,
1178  *              it is just a duplicate segment or a bug in the other side's TCP.
1179  *              So we build the reply based only on the parameters
1180  *              that arrived with the segment.
1181  *      Exception: precedence violation. We do not implement it in any case.
1182  */
1183
1184 static void tcp_v4_send_reset(struct sk_buff *skb)
1185 {
1186         struct tcphdr *th = skb->h.th;
1187         struct tcphdr rth;
1188         struct ip_reply_arg arg;
1189
1190         /* Never send a reset in response to a reset. */
1191         if (th->rst)
1192                 return;
1193
1194         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1195                 return;
1196
1197         /* Swap the send and the receive. */
1198         memset(&rth, 0, sizeof(struct tcphdr));
1199         rth.dest   = th->source;
1200         rth.source = th->dest;
1201         rth.doff   = sizeof(struct tcphdr) / 4;
1202         rth.rst    = 1;
1203
1204         if (th->ack) {
1205                 rth.seq = th->ack_seq;
1206         } else {
1207                 rth.ack = 1;
1208                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1209                                     skb->len - (th->doff << 2));
1210         }
1211
1212         memset(&arg, 0, sizeof arg);
1213         arg.iov[0].iov_base = (unsigned char *)&rth;
1214         arg.iov[0].iov_len  = sizeof rth;
1215         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1216                                       skb->nh.iph->saddr, /*XXX*/
1217                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1218         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1219
1220         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1221
1222         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1223         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1224 }
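/*
 * The reply above follows the classic RFC 793 reset rules: if the
 * offending segment carried an ACK, the RST takes its sequence number
 * from that ACK value and carries no ACK itself; otherwise the RST has
 * sequence number zero and ACKs everything the segment occupied
 * (SYN + FIN + payload), so the other end can match it to what it sent.
 */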
1225
1226 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1227    outside of socket context, is certainly ugly. What can I do?
1228  */
1229
1230 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1231                             u32 win, u32 ts)
1232 {
1233         struct tcphdr *th = skb->h.th;
1234         struct {
1235                 struct tcphdr th;
1236                 u32 tsopt[3];
1237         } rep;
1238         struct ip_reply_arg arg;
1239
1240         memset(&rep.th, 0, sizeof(struct tcphdr));
1241         memset(&arg, 0, sizeof arg);
1242
1243         arg.iov[0].iov_base = (unsigned char *)&rep;
1244         arg.iov[0].iov_len  = sizeof(rep.th);
1245         if (ts) {
1246                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1247                                      (TCPOPT_TIMESTAMP << 8) |
1248                                      TCPOLEN_TIMESTAMP);
1249                 rep.tsopt[1] = htonl(tcp_time_stamp);
1250                 rep.tsopt[2] = htonl(ts);
1251                 arg.iov[0].iov_len = sizeof(rep);
1252         }
1253
1254         /* Swap the send and the receive. */
1255         rep.th.dest    = th->source;
1256         rep.th.source  = th->dest;
1257         rep.th.doff    = arg.iov[0].iov_len / 4;
1258         rep.th.seq     = htonl(seq);
1259         rep.th.ack_seq = htonl(ack);
1260         rep.th.ack     = 1;
1261         rep.th.window  = htons(win);
1262
1263         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1264                                       skb->nh.iph->saddr, /*XXX*/
1265                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1266         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1267
1268         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1269
1270         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1271 }
1272
1273 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1274 {
1275         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1276
1277         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1278                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1279
1280         tcp_tw_put(tw);
1281 }
1282
1283 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1284 {
1285         tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1286                         req->ts_recent);
1287 }
1288
1289 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1290                                           struct open_request *req)
1291 {
1292         struct rtable *rt;
1293         struct ip_options *opt = req->af.v4_req.opt;
1294         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1295                             .nl_u = { .ip4_u =
1296                                       { .daddr = ((opt && opt->srr) ?
1297                                                   opt->faddr :
1298                                                   req->af.v4_req.rmt_addr),
1299                                         .saddr = req->af.v4_req.loc_addr,
1300                                         .tos = RT_CONN_FLAGS(sk) } },
1301                             .proto = IPPROTO_TCP,
1302                             .uli_u = { .ports =
1303                                        { .sport = inet_sk(sk)->sport,
1304                                          .dport = req->rmt_port } } };
1305
1306         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1307                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1308                 return NULL;
1309         }
1310         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1311                 ip_rt_put(rt);
1312                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1313                 return NULL;
1314         }
1315         return &rt->u.dst;
1316 }
1317
1318 /*
1319  *      Send a SYN-ACK in response to a SYN (also used for retransmits).
1320  *      This still operates on an open_request only, not on a big
1321  *      socket.
1322  */
1323 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1324                               struct dst_entry *dst)
1325 {
1326         int err = -1;
1327         struct sk_buff * skb;
1328
1329         /* First, grab a route. */
1330         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1331                 goto out;
1332
1333         skb = tcp_make_synack(sk, dst, req);
1334
1335         if (skb) {
1336                 struct tcphdr *th = skb->h.th;
1337
1338                 th->check = tcp_v4_check(th, skb->len,
1339                                          req->af.v4_req.loc_addr,
1340                                          req->af.v4_req.rmt_addr,
1341                                          csum_partial((char *)th, skb->len,
1342                                                       skb->csum));
1343
1344                 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1345                                             req->af.v4_req.rmt_addr,
1346                                             req->af.v4_req.opt);
1347                 if (err == NET_XMIT_CN)
1348                         err = 0;
1349         }
1350
1351 out:
1352         dst_release(dst);
1353         return err;
1354 }
1355
1356 /*
1357  *      IPv4 open_request destructor.
1358  */
1359 static void tcp_v4_or_free(struct open_request *req)
1360 {
1361         if (req->af.v4_req.opt)
1362                 kfree(req->af.v4_req.opt);
1363 }
1364
1365 static inline void syn_flood_warning(struct sk_buff *skb)
1366 {
1367         static unsigned long warntime;
1368
1369         if (time_after(jiffies, (warntime + HZ * 60))) {
1370                 warntime = jiffies;
1371                 printk(KERN_INFO
1372                        "possible SYN flooding on port %d. Sending cookies.\n",
1373                        ntohs(skb->h.th->dest));
1374         }
1375 }
1376
1377 /*
1378  * Save and compile IPv4 options into the open_request if needed.
1379  */
1380 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1381                                                      struct sk_buff *skb)
1382 {
1383         struct ip_options *opt = &(IPCB(skb)->opt);
1384         struct ip_options *dopt = NULL;
1385
1386         if (opt && opt->optlen) {
1387                 int opt_size = optlength(opt);
1388                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1389                 if (dopt) {
1390                         if (ip_options_echo(dopt, skb)) {
1391                                 kfree(dopt);
1392                                 dopt = NULL;
1393                         }
1394                 }
1395         }
1396         return dopt;
1397 }
1398
1399 /*
1400  * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1401  * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1402  * It would be better to replace it with a global counter for all sockets
1403  * but then some measure against one socket starving all other sockets
1404  * would be needed.
1405  *
1406  * It was 128 by default. Experiments with real servers show that this
1407  * is absolutely not enough even at 100 conn/sec. 256 cures most
1408  * of the problems. This value is adjusted to 128 for very small machines
1409  * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1410  * Further increases require changing the hash table size.
1411  */
1412 int sysctl_max_syn_backlog = 256;
1413
1414 struct or_calltable or_ipv4 = {
1415         .family         =       PF_INET,
1416         .rtx_syn_ack    =       tcp_v4_send_synack,
1417         .send_ack       =       tcp_v4_or_send_ack,
1418         .destructor     =       tcp_v4_or_free,
1419         .send_reset     =       tcp_v4_send_reset,
1420 };
1421
1422 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1423 {
1424         struct tcp_opt tp;
1425         struct open_request *req;
1426         __u32 saddr = skb->nh.iph->saddr;
1427         __u32 daddr = skb->nh.iph->daddr;
1428         __u32 isn = TCP_SKB_CB(skb)->when;
1429         struct dst_entry *dst = NULL;
1430 #ifdef CONFIG_SYN_COOKIES
1431         int want_cookie = 0;
1432 #else
1433 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1434 #endif
1435
1436         /* Never answer SYNs sent to broadcast or multicast */
1437         if (((struct rtable *)skb->dst)->rt_flags &
1438             (RTCF_BROADCAST | RTCF_MULTICAST))
1439                 goto drop;
1440
1441         /* TW buckets are converted to open requests without
1442          * limitation, since they conserve resources and the peer is
1443          * evidently a real one.
1444          */
1445         if (tcp_synq_is_full(sk) && !isn) {
1446 #ifdef CONFIG_SYN_COOKIES
1447                 if (sysctl_tcp_syncookies) {
1448                         want_cookie = 1;
1449                 } else
1450 #endif
1451                 goto drop;
1452         }
1453
1454         /* Accept backlog is full. If we have already queued enough
1455          * warm entries in the syn queue, drop the request. That is better than
1456          * clogging the syn queue with openreqs with exponentially increasing
1457          * timeout.
1458          */
1459         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1460                 goto drop;
1461
1462         req = tcp_openreq_alloc();
1463         if (!req)
1464                 goto drop;
1465
1466         tcp_clear_options(&tp);
1467         tp.mss_clamp = 536;
1468         tp.user_mss  = tcp_sk(sk)->user_mss;
1469
1470         tcp_parse_options(skb, &tp, 0);
1471
1472         if (want_cookie) {
1473                 tcp_clear_options(&tp);
1474                 tp.saw_tstamp = 0;
1475         }
1476
1477         if (tp.saw_tstamp && !tp.rcv_tsval) {
1478                 /* Some OSes (unknown ones, but seen in the wild on web
1479                  * servers carrying content of interest only to Windows
1480                  * users) do not send their timestamp in the SYN. It is the
1481                  * easy case: we simply do not advertise TS support.
1482                  */
1483                 tp.saw_tstamp = 0;
1484                 tp.tstamp_ok  = 0;
1485         }
1486         tp.tstamp_ok = tp.saw_tstamp;
1487
1488         tcp_openreq_init(req, &tp, skb);
1489
1490         req->af.v4_req.loc_addr = daddr;
1491         req->af.v4_req.rmt_addr = saddr;
1492         req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1493         req->class = &or_ipv4;
1494         if (!want_cookie)
1495                 TCP_ECN_create_request(req, skb->h.th);
1496
1497         if (want_cookie) {
1498 #ifdef CONFIG_SYN_COOKIES
1499                 syn_flood_warning(skb);
1500 #endif
1501                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1502         } else if (!isn) {
1503                 struct inet_peer *peer = NULL;
1504
1505                 /* VJ's idea: we save the last timestamp seen
1506                  * from the destination in the peer table when entering
1507                  * TIME-WAIT state, and check against it before
1508                  * accepting a new connection request.
1509                  *
1510                  * If "isn" is not zero, this request hit a live
1511                  * timewait bucket, so all the necessary checks
1512                  * were already made when the timewait state was processed.
1513                  */
1514                 if (tp.saw_tstamp &&
1515                     sysctl_tcp_tw_recycle &&
1516                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1517                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1518                     peer->v4daddr == saddr) {
1519                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1520                             (s32)(peer->tcp_ts - req->ts_recent) >
1521                                                         TCP_PAWS_WINDOW) {
1522                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1523                                 dst_release(dst);
1524                                 goto drop_and_free;
1525                         }
1526                 }
1527                 /* Kill the following clause if you dislike this approach. */
1528                 else if (!sysctl_tcp_syncookies &&
1529                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1530                           (sysctl_max_syn_backlog >> 2)) &&
1531                          (!peer || !peer->tcp_ts_stamp) &&
1532                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1533                         /* Without syncookies the last quarter of the
1534                          * backlog is reserved for destinations proven
1535                          * to be alive.
1536                          * This means we keep communicating with
1537                          * destinations already remembered at the
1538                          * moment the SYN flood started.
1539                          */
1540                         NETDEBUG(if (net_ratelimit()) \
1541                                         printk(KERN_DEBUG "TCP: drop open "
1542                                                           "request from %u.%u."
1543                                                           "%u.%u/%u\n", \
1544                                                NIPQUAD(saddr),
1545                                                ntohs(skb->h.th->source)));
1546                         dst_release(dst);
1547                         goto drop_and_free;
1548                 }
1549
1550                 isn = tcp_v4_init_sequence(sk, skb);
1551         }
1552         req->snt_isn = isn;
1553
1554         if (tcp_v4_send_synack(sk, req, dst))
1555                 goto drop_and_free;
1556
1557         if (want_cookie) {
1558                 tcp_openreq_free(req);
1559         } else {
1560                 tcp_v4_synq_add(sk, req);
1561         }
1562         return 0;
1563
1564 drop_and_free:
1565         tcp_openreq_free(req);
1566 drop:
1567         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1568         return 0;
1569 }
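
/*
 * Editor's note (illustrative addition, not in the original file): the
 * accept queue limit consulted by sk_acceptq_is_full() above is the
 * backlog the application passed to listen().  A minimal userspace sketch
 * of the listener side (port number purely illustrative):
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in addr;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        memset(&addr, 0, sizeof(addr));
        addr.sin_family      = AF_INET;
        addr.sin_port        = htons(8080);
        addr.sin_addr.s_addr = htonl(INADDR_ANY);

        if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                return 1;
        listen(fd, 128);        /* backlog caps the accept queue */
        close(fd);
        return 0;
}
#endif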
1570
1571
1572 /*
1573  * The three way handshake has completed - we got a valid synack -
1574  * now create the new socket.
1575  */
1576 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1577                                   struct open_request *req,
1578                                   struct dst_entry *dst)
1579 {
1580         struct inet_opt *newinet;
1581         struct tcp_opt *newtp;
1582         struct sock *newsk;
1583
1584         if (sk_acceptq_is_full(sk))
1585                 goto exit_overflow;
1586
1587         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1588                 goto exit;
1589
1590         newsk = tcp_create_openreq_child(sk, req, skb);
1591         if (!newsk)
1592                 goto exit;
1593
1594         newsk->sk_dst_cache = dst;
1595         tcp_v4_setup_caps(newsk, dst);
1596
1597         newtp                 = tcp_sk(newsk);
1598         newinet               = inet_sk(newsk);
1599         newinet->daddr        = req->af.v4_req.rmt_addr;
1600         newinet->rcv_saddr    = req->af.v4_req.loc_addr;
1601         newinet->saddr        = req->af.v4_req.loc_addr;
1602         newinet->opt          = req->af.v4_req.opt;
1603         req->af.v4_req.opt    = NULL;
1604         newinet->mc_index     = tcp_v4_iif(skb);
1605         newinet->mc_ttl       = skb->nh.iph->ttl;
1606         newtp->ext_header_len = 0;
1607         if (newinet->opt)
1608                 newtp->ext_header_len = newinet->opt->optlen;
1609         newtp->ext2_header_len = dst->header_len;
1610         newinet->id = newtp->write_seq ^ jiffies;
1611
1612         tcp_sync_mss(newsk, dst_pmtu(dst));
1613         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1614         tcp_initialize_rcv_mss(newsk);
1615
1616         __tcp_v4_hash(newsk, 0);
1617         __tcp_inherit_port(sk, newsk);
1618
1619         return newsk;
1620
1621 exit_overflow:
1622         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1623 exit:
1624         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1625         dst_release(dst);
1626         return NULL;
1627 }
1628
1629 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1630 {
1631         struct tcphdr *th = skb->h.th;
1632         struct iphdr *iph = skb->nh.iph;
1633         struct tcp_opt *tp = tcp_sk(sk);
1634         struct sock *nsk;
1635         struct open_request **prev;
1636         /* Find possible connection requests. */
1637         struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1638                                                      iph->saddr, iph->daddr);
1639         if (req)
1640                 return tcp_check_req(sk, skb, req, prev);
1641
1642         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1643                                           th->source,
1644                                           skb->nh.iph->daddr,
1645                                           ntohs(th->dest),
1646                                           tcp_v4_iif(skb));
1647
1648         if (nsk) {
1649                 if (nsk->sk_state != TCP_TIME_WAIT) {
1650                         bh_lock_sock(nsk);
1651                         return nsk;
1652                 }
1653                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1654                 return NULL;
1655         }
1656
1657 #ifdef CONFIG_SYN_COOKIES
1658         if (!th->rst && !th->syn && th->ack)
1659                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1660 #endif
1661         return sk;
1662 }
1663
1664 static int tcp_v4_checksum_init(struct sk_buff *skb)
1665 {
1666         if (skb->ip_summed == CHECKSUM_HW) {
1667                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1668                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1669                                   skb->nh.iph->daddr, skb->csum))
1670                         return 0;
1671
1672                 NETDEBUG(if (net_ratelimit())
1673                                 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1674                 skb->ip_summed = CHECKSUM_NONE;
1675         }
1676         if (skb->len <= 76) {
1677                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1678                                  skb->nh.iph->daddr,
1679                                  skb_checksum(skb, 0, skb->len, 0)))
1680                         return -1;
1681                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1682         } else {
1683                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1684                                           skb->nh.iph->saddr,
1685                                           skb->nh.iph->daddr, 0);
1686         }
1687         return 0;
1688 }
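
/*
 * Editor's note (illustrative addition, not in the original file): the
 * helpers above build on the classic RFC 1071 one's-complement sum.
 * A self-contained userspace sketch of that algorithm (not the kernel's
 * optimized csum code):
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static uint16_t rfc1071_checksum(const void *data, size_t len)
{
        const uint8_t *p = data;
        uint32_t sum = 0;

        while (len > 1) {               /* sum 16-bit words */
                sum += ((uint32_t)p[0] << 8) | p[1];
                p += 2;
                len -= 2;
        }
        if (len)                        /* pad a trailing odd byte */
                sum += (uint32_t)p[0] << 8;
        while (sum >> 16)               /* fold carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}
#endif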
1689
1690
1691 /* The socket must have its spinlock held when we get
1692  * here.
1693  *
1694  * We have a potential double-lock case here, so even when
1695  * doing backlog processing we use the BH locking scheme.
1696  * This is because we cannot sleep with the original spinlock
1697  * held.
1698  */
1699 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1700 {
1701         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1702                 TCP_CHECK_TIMER(sk);
1703                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1704                         goto reset;
1705                 TCP_CHECK_TIMER(sk);
1706                 return 0;
1707         }
1708
1709         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1710                 goto csum_err;
1711
1712         if (sk->sk_state == TCP_LISTEN) {
1713                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1714                 if (!nsk)
1715                         goto discard;
1716
1717                 if (nsk != sk) {
1718                         if (tcp_child_process(sk, nsk, skb))
1719                                 goto reset;
1720                         return 0;
1721                 }
1722         }
1723
1724         TCP_CHECK_TIMER(sk);
1725         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1726                 goto reset;
1727         TCP_CHECK_TIMER(sk);
1728         return 0;
1729
1730 reset:
1731         tcp_v4_send_reset(skb);
1732 discard:
1733         kfree_skb(skb);
1734         /* Be careful here. If this function gets more complicated and
1735          * gcc suffers from register pressure on the x86, sk (in %ebx)
1736          * might be destroyed here. This current version compiles correctly,
1737          * but you have been warned.
1738          */
1739         return 0;
1740
1741 csum_err:
1742         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1743         goto discard;
1744 }
1745
1746 /*
1747  *      From tcp_input.c
1748  */
1749
1750 int tcp_v4_rcv(struct sk_buff *skb)
1751 {
1752         struct tcphdr *th;
1753         struct sock *sk;
1754         int ret;
1755
1756         if (skb->pkt_type != PACKET_HOST)
1757                 goto discard_it;
1758
1759         /* Count it even if it's bad */
1760         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1761
1762         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1763                 goto discard_it;
1764
1765         th = skb->h.th;
1766
1767         if (th->doff < sizeof(struct tcphdr) / 4)
1768                 goto bad_packet;
1769         if (!pskb_may_pull(skb, th->doff * 4))
1770                 goto discard_it;
1771
1772         /* An explanation is required here, I think.
1773          * Packet length and doff are validated by header prediction,
1774          * provided the case of th->doff == 0 is eliminated.
1775          * So we defer the checks. */
1776         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1777              tcp_v4_checksum_init(skb) < 0))
1778                 goto bad_packet;
1779
1780         th = skb->h.th;
1781         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1782         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1783                                     skb->len - th->doff * 4);
1784         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1785         TCP_SKB_CB(skb)->when    = 0;
1786         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1787         TCP_SKB_CB(skb)->sacked  = 0;
1788
1789         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1790                              skb->nh.iph->daddr, ntohs(th->dest),
1791                              tcp_v4_iif(skb));
1792
1793         if (!sk)
1794                 goto no_tcp_socket;
1795
1796 process:
1797         if (sk->sk_state == TCP_TIME_WAIT)
1798                 goto do_time_wait;
1799
1800         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1801                 goto discard_and_relse;
1802
1803         if (sk_filter(sk, skb, 0))
1804                 goto discard_and_relse;
1805
1806         skb->dev = NULL;
1807
1808         bh_lock_sock(sk);
1809         ret = 0;
1810         if (!sock_owned_by_user(sk)) {
1811                 if (!tcp_prequeue(sk, skb))
1812                         ret = tcp_v4_do_rcv(sk, skb);
1813         } else
1814                 sk_add_backlog(sk, skb);
1815         bh_unlock_sock(sk);
1816
1817         sock_put(sk);
1818
1819         return ret;
1820
1821 no_tcp_socket:
1822         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1823                 goto discard_it;
1824
1825         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1826 bad_packet:
1827                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1828         } else {
1829                 tcp_v4_send_reset(skb);
1830         }
1831
1832 discard_it:
1833         /* Discard frame. */
1834         kfree_skb(skb);
1835         return 0;
1836
1837 discard_and_relse:
1838         sock_put(sk);
1839         goto discard_it;
1840
1841 do_time_wait:
1842         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1843                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1844                 goto discard_it;
1845         }
1846
1847         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1848                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1849                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1850                 goto discard_it;
1851         }
1852         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1853                                            skb, th, skb->len)) {
1854         case TCP_TW_SYN: {
1855                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1856                                                           ntohs(th->dest),
1857                                                           tcp_v4_iif(skb));
1858                 if (sk2) {
1859                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1860                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1861                         sk = sk2;
1862                         goto process;
1863                 }
1864                 /* Fall through to ACK */
1865         }
1866         case TCP_TW_ACK:
1867                 tcp_v4_timewait_ack(sk, skb);
1868                 break;
1869         case TCP_TW_RST:
1870                 goto no_tcp_socket;
1871         case TCP_TW_SUCCESS:;
1872         }
1873         goto discard_it;
1874 }
1875
1876 /* With per-bucket locks this operation is not atomic, so
1877  * this version is no worse.
1878  */
1879 static void __tcp_v4_rehash(struct sock *sk)
1880 {
1881         sk->sk_prot->unhash(sk);
1882         sk->sk_prot->hash(sk);
1883 }
1884
1885 static int tcp_v4_reselect_saddr(struct sock *sk)
1886 {
1887         struct inet_opt *inet = inet_sk(sk);
1888         int err;
1889         struct rtable *rt;
1890         __u32 old_saddr = inet->saddr;
1891         __u32 new_saddr;
1892         __u32 daddr = inet->daddr;
1893
1894         if (inet->opt && inet->opt->srr)
1895                 daddr = inet->opt->faddr;
1896
1897         /* Query new route. */
1898         err = ip_route_connect(&rt, daddr, 0,
1899                                RT_TOS(inet->tos) | sk->sk_localroute,
1900                                sk->sk_bound_dev_if,
1901                                IPPROTO_TCP,
1902                                inet->sport, inet->dport, sk);
1903         if (err)
1904                 return err;
1905
1906         __sk_dst_set(sk, &rt->u.dst);
1907         tcp_v4_setup_caps(sk, &rt->u.dst);
1908         tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1909
1910         new_saddr = rt->rt_src;
1911
1912         if (new_saddr == old_saddr)
1913                 return 0;
1914
1915         if (sysctl_ip_dynaddr > 1) {
1916                 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1917                                  "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1918                        NIPQUAD(old_saddr),
1919                        NIPQUAD(new_saddr));
1920         }
1921
1922         inet->saddr = new_saddr;
1923         inet->rcv_saddr = new_saddr;
1924
1925         /* XXX The only ugly spot where we really need to
1926          * XXX change the socket's identity after it has
1927          * XXX entered the hashes. -DaveM
1928          *
1929          * Besides that, it does not check for connection
1930          * uniqueness. Expect trouble.
1931          */
1932         __tcp_v4_rehash(sk);
1933         return 0;
1934 }
1935
1936 int tcp_v4_rebuild_header(struct sock *sk)
1937 {
1938         struct inet_opt *inet = inet_sk(sk);
1939         struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1940         u32 daddr;
1941         int err;
1942
1943         /* Route is OK, nothing to do. */
1944         if (rt)
1945                 return 0;
1946
1947         /* Reroute. */
1948         daddr = inet->daddr;
1949         if (inet->opt && inet->opt->srr)
1950                 daddr = inet->opt->faddr;
1951
1952         {
1953                 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1954                                     .nl_u = { .ip4_u =
1955                                               { .daddr = daddr,
1956                                                 .saddr = inet->saddr,
1957                                                 .tos = RT_CONN_FLAGS(sk) } },
1958                                     .proto = IPPROTO_TCP,
1959                                     .uli_u = { .ports =
1960                                                { .sport = inet->sport,
1961                                                  .dport = inet->dport } } };
1962                                                 
1963                 err = ip_route_output_flow(&rt, &fl, sk, 0);
1964         }
1965         if (!err) {
1966                 __sk_dst_set(sk, &rt->u.dst);
1967                 tcp_v4_setup_caps(sk, &rt->u.dst);
1968                 tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1969                 return 0;
1970         }
1971
1972         /* Routing failed... */
1973         sk->sk_route_caps = 0;
1974
1975         if (!sysctl_ip_dynaddr ||
1976             sk->sk_state != TCP_SYN_SENT ||
1977             (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1978             (err = tcp_v4_reselect_saddr(sk)) != 0)
1979                 sk->sk_err_soft = -err;
1980
1981         return err;
1982 }
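
/*
 * Editor's note (illustrative addition, not in the original file): the
 * source address reselection above only runs when ip_dynaddr is enabled
 * (and only for sockets still in SYN_SENT).  A minimal userspace sketch
 * for enabling it, assuming the standard /proc/sys/net/ipv4/ip_dynaddr
 * entry exists:
 */
#if 0
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/net/ipv4/ip_dynaddr", "w");

        if (!f)
                return 1;
        fputs("1\n", f);        /* allow rewriting the source address */
        fclose(f);
        return 0;
}
#endif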
1983
1984 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1985 {
1986         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1987         struct inet_opt *inet = inet_sk(sk);
1988
1989         sin->sin_family         = AF_INET;
1990         sin->sin_addr.s_addr    = inet->daddr;
1991         sin->sin_port           = inet->dport;
1992 }
1993
1994 /* VJ's idea: save the last timestamp seen from this destination
1995  * and hold it for at least the normal timewait interval, to use for duplicate
1996  * segment detection in subsequent connections before they enter the
1997  * synchronized state.
1998  */
1999
2000 int tcp_v4_remember_stamp(struct sock *sk)
2001 {
2002         struct inet_opt *inet = inet_sk(sk);
2003         struct tcp_opt *tp = tcp_sk(sk);
2004         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
2005         struct inet_peer *peer = NULL;
2006         int release_it = 0;
2007
2008         if (!rt || rt->rt_dst != inet->daddr) {
2009                 peer = inet_getpeer(inet->daddr, 1);
2010                 release_it = 1;
2011         } else {
2012                 if (!rt->peer)
2013                         rt_bind_peer(rt, 1);
2014                 peer = rt->peer;
2015         }
2016
2017         if (peer) {
2018                 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
2019                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2020                      peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
2021                         peer->tcp_ts_stamp = tp->ts_recent_stamp;
2022                         peer->tcp_ts = tp->ts_recent;
2023                 }
2024                 if (release_it)
2025                         inet_putpeer(peer);
2026                 return 1;
2027         }
2028
2029         return 0;
2030 }
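
/*
 * Editor's note (illustrative addition, not in the original file): the
 * (s32)(a - b) comparisons above are wraparound-safe serial-number
 * arithmetic on 32-bit timestamps.  A userspace demonstration:
 */
#if 0
#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t older = 0xfffffff0u;   /* shortly before the wrap */
        uint32_t newer = 0x00000010u;   /* shortly after the wrap  */

        assert((int32_t)(newer - older) > 0);   /* newer still compares newer */
        assert((int32_t)(older - newer) < 0);
        return 0;
}
#endif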
2031
2032 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2033 {
2034         struct inet_peer *peer = NULL;
2035
2036         peer = inet_getpeer(tw->tw_daddr, 1);
2037
2038         if (peer) {
2039                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2040                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2041                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2042                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2043                         peer->tcp_ts = tw->tw_ts_recent;
2044                 }
2045                 inet_putpeer(peer);
2046                 return 1;
2047         }
2048
2049         return 0;
2050 }
2051
2052 struct tcp_func ipv4_specific = {
2053         .queue_xmit     =       ip_queue_xmit,
2054         .send_check     =       tcp_v4_send_check,
2055         .rebuild_header =       tcp_v4_rebuild_header,
2056         .conn_request   =       tcp_v4_conn_request,
2057         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2058         .remember_stamp =       tcp_v4_remember_stamp,
2059         .net_header_len =       sizeof(struct iphdr),
2060         .setsockopt     =       ip_setsockopt,
2061         .getsockopt     =       ip_getsockopt,
2062         .addr2sockaddr  =       v4_addr2sockaddr,
2063         .sockaddr_len   =       sizeof(struct sockaddr_in),
2064 };
2065
2066 /* NOTE: A lot of things are set to zero explicitly by the call to
2067  *       sk_alloc(), so they need not be done here.
2068  */
2069 static int tcp_v4_init_sock(struct sock *sk)
2070 {
2071         struct tcp_opt *tp = tcp_sk(sk);
2072
2073         skb_queue_head_init(&tp->out_of_order_queue);
2074         tcp_init_xmit_timers(sk);
2075         tcp_prequeue_init(tp);
2076
2077         tp->rto  = TCP_TIMEOUT_INIT;
2078         tp->mdev = TCP_TIMEOUT_INIT;
2079
2080         /* So many TCP implementations out there (incorrectly) count the
2081          * initial SYN frame in their delayed-ACK and congestion control
2082          * algorithms that we must have the following bandaid to talk
2083          * efficiently to them.  -DaveM
2084          */
2085         tp->snd_cwnd = 2;
2086
2087         /* See draft-stevens-tcpca-spec-01 for discussion of the
2088          * initialization of these values.
2089          */
2090         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2091         tp->snd_cwnd_clamp = ~0;
2092         tp->mss_cache_std = tp->mss_cache = 536;
2093
2094         tp->reordering = sysctl_tcp_reordering;
2095
2096         sk->sk_state = TCP_CLOSE;
2097
2098         sk->sk_write_space = sk_stream_write_space;
2099         sk->sk_use_write_queue = 1;
2100
2101         tp->af_specific = &ipv4_specific;
2102
2103         sk->sk_sndbuf = sysctl_tcp_wmem[1];
2104         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2105
2106         atomic_inc(&tcp_sockets_allocated);
2107
2108         return 0;
2109 }
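
/*
 * Editor's note (illustrative addition, not in the original file):
 * tcp_v4_init_sock() is reached through the .init hook of tcp_prot below,
 * i.e. simply creating an AF_INET stream socket from userspace ends up
 * here.  A minimal sketch:
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

        if (fd < 0)
                return 1;
        close(fd);              /* tears the socket back down */
        return 0;
}
#endif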
2110
2111 int tcp_v4_destroy_sock(struct sock *sk)
2112 {
2113         struct tcp_opt *tp = tcp_sk(sk);
2114
2115         tcp_clear_xmit_timers(sk);
2116
2117         /* Clean up the write buffer. */
2118         sk_stream_writequeue_purge(sk);
2119
2120         /* Clean up our (hopefully empty) out_of_order_queue. */
2121         __skb_queue_purge(&tp->out_of_order_queue);
2122
2123         /* Clean the prequeue; it really must be empty. */
2124         __skb_queue_purge(&tp->ucopy.prequeue);
2125
2126         /* Clean up a referenced TCP bind bucket. */
2127         if (tp->bind_hash)
2128                 tcp_put_port(sk);
2129
2130         /*
2131          * If a cached sendmsg page exists, toss it.
2132          */
2133         if (sk->sk_sndmsg_page) {
2134                 __free_page(sk->sk_sndmsg_page);
2135                 sk->sk_sndmsg_page = NULL;
2136         }
2137
2138         atomic_dec(&tcp_sockets_allocated);
2139
2140         return 0;
2141 }
2142
2143 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2144
2145 #ifdef CONFIG_PROC_FS
2146 /* Proc filesystem TCP sock list dumping. */
2147
2148 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2149 {
2150         return hlist_empty(head) ? NULL :
2151                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2152 }
2153
2154 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2155 {
2156         return tw->tw_node.next ?
2157                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2158 }
2159
2160 static void *listening_get_next(struct seq_file *seq, void *cur)
2161 {
2162         struct tcp_opt *tp;
2163         struct hlist_node *node;
2164         struct sock *sk = cur;
2165         struct tcp_iter_state* st = seq->private;
2166
2167         if (!sk) {
2168                 st->bucket = 0;
2169                 sk = sk_head(&tcp_listening_hash[0]);
2170                 goto get_sk;
2171         }
2172
2173         ++st->num;
2174
2175         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2176                 struct open_request *req = cur;
2177
2178                 tp = tcp_sk(st->syn_wait_sk);
2179                 req = req->dl_next;
2180                 while (1) {
2181                         while (req) {
2182                                 vxdprintk(VXD_CBIT(net, 6),
2183                                         "sk,req: %p [#%d] (from %d)", req->sk,
2184                                         (req->sk)?req->sk->sk_xid:0, current->xid);
2185                                 if (req->sk &&
2186                                         !vx_check(req->sk->sk_xid, VX_IDENT|VX_WATCH))
2187                                         continue;
2188                                 if (req->class->family == st->family) {
2189                                         cur = req;
2190                                         goto out;
2191                                 }
2192                                 req = req->dl_next;
2193                         }
2194                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2195                                 break;
2196 get_req:
2197                         req = tp->listen_opt->syn_table[st->sbucket];
2198                 }
2199                 sk        = sk_next(st->syn_wait_sk);
2200                 st->state = TCP_SEQ_STATE_LISTENING;
2201                 read_unlock_bh(&tp->syn_wait_lock);
2202         } else {
2203                 tp = tcp_sk(sk);
2204                 read_lock_bh(&tp->syn_wait_lock);
2205                 if (tp->listen_opt && tp->listen_opt->qlen)
2206                         goto start_req;
2207                 read_unlock_bh(&tp->syn_wait_lock);
2208                 sk = sk_next(sk);
2209         }
2210 get_sk:
2211         sk_for_each_from(sk, node) {
2212                 vxdprintk(VXD_CBIT(net, 6), "sk: %p [#%d] (from %d)",
2213                         sk, sk->sk_xid, current->xid);
2214                 if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2215                         continue;
2216                 if (sk->sk_family == st->family) {
2217                         cur = sk;
2218                         goto out;
2219                 }
2220                 tp = tcp_sk(sk);
2221                 read_lock_bh(&tp->syn_wait_lock);
2222                 if (tp->listen_opt && tp->listen_opt->qlen) {
2223 start_req:
2224                         st->uid         = sock_i_uid(sk);
2225                         st->syn_wait_sk = sk;
2226                         st->state       = TCP_SEQ_STATE_OPENREQ;
2227                         st->sbucket     = 0;
2228                         goto get_req;
2229                 }
2230                 read_unlock_bh(&tp->syn_wait_lock);
2231         }
2232         if (++st->bucket < TCP_LHTABLE_SIZE) {
2233                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2234                 goto get_sk;
2235         }
2236         cur = NULL;
2237 out:
2238         return cur;
2239 }
2240
2241 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2242 {
2243         void *rc = listening_get_next(seq, NULL);
2244
2245         while (rc && *pos) {
2246                 rc = listening_get_next(seq, rc);
2247                 --*pos;
2248         }
2249         return rc;
2250 }
2251
2252 static void *established_get_first(struct seq_file *seq)
2253 {
2254         struct tcp_iter_state* st = seq->private;
2255         void *rc = NULL;
2256
2257         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2258                 struct sock *sk;
2259                 struct hlist_node *node;
2260                 struct tcp_tw_bucket *tw;
2261                
2262                 read_lock(&tcp_ehash[st->bucket].lock);
2263                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2264                         vxdprintk(VXD_CBIT(net, 6),
2265                                 "sk,egf: %p [#%d] (from %d)",
2266                                 sk, sk->sk_xid, current->xid);
2267                         if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2268                                 continue;
2269                         if (sk->sk_family != st->family)
2270                                 continue;
2271                         rc = sk;
2272                         goto out;
2273                 }
2274                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2275                 tw_for_each(tw, node,
2276                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2277                         vxdprintk(VXD_CBIT(net, 6),
2278                                 "tw: %p [#%d] (from %d)",
2279                                 tw, tw->tw_xid, current->xid);
2280                         if (!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))
2281                                 continue;
2282                         if (tw->tw_family != st->family)
2283                                 continue;
2284                         rc = tw;
2285                         goto out;
2286                 }
2287                 read_unlock(&tcp_ehash[st->bucket].lock);
2288                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2289         }
2290 out:
2291         return rc;
2292 }
2293
2294 static void *established_get_next(struct seq_file *seq, void *cur)
2295 {
2296         struct sock *sk = cur;
2297         struct tcp_tw_bucket *tw;
2298         struct hlist_node *node;
2299         struct tcp_iter_state* st = seq->private;
2300
2301         ++st->num;
2302
2303         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2304                 tw = cur;
2305                 tw = tw_next(tw);
2306 get_tw:
2307                 while (tw && (tw->tw_family != st->family ||
2308                         !vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))) {
2309                         tw = tw_next(tw);
2310                 }
2311                 if (tw) {
2312                         cur = tw;
2313                         goto out;
2314                 }
2315                 read_unlock(&tcp_ehash[st->bucket].lock);
2316                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2317                 if (++st->bucket < tcp_ehash_size) {
2318                         read_lock(&tcp_ehash[st->bucket].lock);
2319                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2320                 } else {
2321                         cur = NULL;
2322                         goto out;
2323                 }
2324         } else
2325                 sk = sk_next(sk);
2326
2327         sk_for_each_from(sk, node) {
2328                 vxdprintk(VXD_CBIT(net, 6),
2329                         "sk,egn: %p [#%d] (from %d)",
2330                         sk, sk->sk_xid, current->xid);
2331                 if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2332                         continue;
2333                 if (sk->sk_family == st->family)
2334                         goto found;
2335         }
2336
2337         st->state = TCP_SEQ_STATE_TIME_WAIT;
2338         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2339         goto get_tw;
2340 found:
2341         cur = sk;
2342 out:
2343         return cur;
2344 }
2345
2346 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2347 {
2348         void *rc = established_get_first(seq);
2349
2350         while (rc && pos) {
2351                 rc = established_get_next(seq, rc);
2352                 --pos;
2353         }               
2354         return rc;
2355 }
2356
2357 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2358 {
2359         void *rc;
2360         struct tcp_iter_state* st = seq->private;
2361
2362         tcp_listen_lock();
2363         st->state = TCP_SEQ_STATE_LISTENING;
2364         rc        = listening_get_idx(seq, &pos);
2365
2366         if (!rc) {
2367                 tcp_listen_unlock();
2368                 local_bh_disable();
2369                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2370                 rc        = established_get_idx(seq, pos);
2371         }
2372
2373         return rc;
2374 }
2375
2376 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2377 {
2378         struct tcp_iter_state* st = seq->private;
2379         st->state = TCP_SEQ_STATE_LISTENING;
2380         st->num = 0;
2381         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2382 }
2383
2384 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2385 {
2386         void *rc = NULL;
2387         struct tcp_iter_state* st;
2388
2389         if (v == SEQ_START_TOKEN) {
2390                 rc = tcp_get_idx(seq, 0);
2391                 goto out;
2392         }
2393         st = seq->private;
2394
2395         switch (st->state) {
2396         case TCP_SEQ_STATE_OPENREQ:
2397         case TCP_SEQ_STATE_LISTENING:
2398                 rc = listening_get_next(seq, v);
2399                 if (!rc) {
2400                         tcp_listen_unlock();
2401                         local_bh_disable();
2402                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2403                         rc        = established_get_first(seq);
2404                 }
2405                 break;
2406         case TCP_SEQ_STATE_ESTABLISHED:
2407         case TCP_SEQ_STATE_TIME_WAIT:
2408                 rc = established_get_next(seq, v);
2409                 break;
2410         }
2411 out:
2412         ++*pos;
2413         return rc;
2414 }
2415
2416 static void tcp_seq_stop(struct seq_file *seq, void *v)
2417 {
2418         struct tcp_iter_state* st = seq->private;
2419
2420         switch (st->state) {
2421         case TCP_SEQ_STATE_OPENREQ:
2422                 if (v) {
2423                         struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
2424                         read_unlock_bh(&tp->syn_wait_lock);
2425                 }
2426         case TCP_SEQ_STATE_LISTENING:
2427                 if (v != SEQ_START_TOKEN)
2428                         tcp_listen_unlock();
2429                 break;
2430         case TCP_SEQ_STATE_TIME_WAIT:
2431         case TCP_SEQ_STATE_ESTABLISHED:
2432                 if (v)
2433                         read_unlock(&tcp_ehash[st->bucket].lock);
2434                 local_bh_enable();
2435                 break;
2436         }
2437 }
2438
2439 static int tcp_seq_open(struct inode *inode, struct file *file)
2440 {
2441         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2442         struct seq_file *seq;
2443         struct tcp_iter_state *s;
2444         int rc;
2445
2446         if (unlikely(afinfo == NULL))
2447                 return -EINVAL;
2448
2449         s = kmalloc(sizeof(*s), GFP_KERNEL);
2450         if (!s)
2451                 return -ENOMEM;
2452         memset(s, 0, sizeof(*s));
2453         s->family               = afinfo->family;
2454         s->seq_ops.start        = tcp_seq_start;
2455         s->seq_ops.next         = tcp_seq_next;
2456         s->seq_ops.show         = afinfo->seq_show;
2457         s->seq_ops.stop         = tcp_seq_stop;
2458
2459         rc = seq_open(file, &s->seq_ops);
2460         if (rc)
2461                 goto out_kfree;
2462         seq          = file->private_data;
2463         seq->private = s;
2464 out:
2465         return rc;
2466 out_kfree:
2467         kfree(s);
2468         goto out;
2469 }
2470
2471 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2472 {
2473         int rc = 0;
2474         struct proc_dir_entry *p;
2475
2476         if (!afinfo)
2477                 return -EINVAL;
2478         afinfo->seq_fops->owner         = afinfo->owner;
2479         afinfo->seq_fops->open          = tcp_seq_open;
2480         afinfo->seq_fops->read          = seq_read;
2481         afinfo->seq_fops->llseek        = seq_lseek;
2482         afinfo->seq_fops->release       = seq_release_private;
2483         
2484         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2485         if (p)
2486                 p->data = afinfo;
2487         else
2488                 rc = -ENOMEM;
2489         return rc;
2490 }
2491
2492 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2493 {
2494         if (!afinfo)
2495                 return;
2496         proc_net_remove(afinfo->name);
2497         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2498 }
2499
2500 static void get_openreq4(struct sock *sk, struct open_request *req,
2501                          char *tmpbuf, int i, int uid)
2502 {
2503         int ttd = req->expires - jiffies;
2504
2505         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2506                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2507                 i,
2508                 req->af.v4_req.loc_addr,
2509                 ntohs(inet_sk(sk)->sport),
2510                 req->af.v4_req.rmt_addr,
2511                 ntohs(req->rmt_port),
2512                 TCP_SYN_RECV,
2513                 0, 0, /* could print option size, but that is af dependent. */
2514                 1,    /* timers active (only the expire timer) */
2515                 jiffies_to_clock_t(ttd),
2516                 req->retrans,
2517                 uid,
2518                 0,  /* non standard timer */
2519                 0, /* open_requests have no inode */
2520                 atomic_read(&sk->sk_refcnt),
2521                 req);
2522 }
2523
2524 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2525 {
2526         int timer_active;
2527         unsigned long timer_expires;
2528         struct tcp_opt *tp = tcp_sk(sp);
2529         struct inet_opt *inet = inet_sk(sp);
2530         unsigned int dest = inet->daddr;
2531         unsigned int src = inet->rcv_saddr;
2532         __u16 destp = ntohs(inet->dport);
2533         __u16 srcp = ntohs(inet->sport);
2534
2535         if (tp->pending == TCP_TIME_RETRANS) {
2536                 timer_active    = 1;
2537                 timer_expires   = tp->timeout;
2538         } else if (tp->pending == TCP_TIME_PROBE0) {
2539                 timer_active    = 4;
2540                 timer_expires   = tp->timeout;
2541         } else if (timer_pending(&sp->sk_timer)) {
2542                 timer_active    = 2;
2543                 timer_expires   = sp->sk_timer.expires;
2544         } else {
2545                 timer_active    = 0;
2546                 timer_expires = jiffies;
2547         }
2548
2549         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2550                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2551                 i, src, srcp, dest, destp, sp->sk_state,
2552                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2553                 timer_active,
2554                 jiffies_to_clock_t(timer_expires - jiffies),
2555                 tp->retransmits,
2556                 sock_i_uid(sp),
2557                 tp->probes_out,
2558                 sock_i_ino(sp),
2559                 atomic_read(&sp->sk_refcnt), sp,
2560                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2561                 tp->snd_cwnd,
2562                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2563 }
2564
2565 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2566 {
2567         unsigned int dest, src;
2568         __u16 destp, srcp;
2569         int ttd = tw->tw_ttd - jiffies;
2570
2571         if (ttd < 0)
2572                 ttd = 0;
2573
2574         dest  = tw->tw_daddr;
2575         src   = tw->tw_rcv_saddr;
2576         destp = ntohs(tw->tw_dport);
2577         srcp  = ntohs(tw->tw_sport);
2578
2579         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2580                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2581                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2582                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2583                 atomic_read(&tw->tw_refcnt), tw);
2584 }
2585
2586 #define TMPSZ 150
2587
2588 static int tcp4_seq_show(struct seq_file *seq, void *v)
2589 {
2590         struct tcp_iter_state* st;
2591         char tmpbuf[TMPSZ + 1];
2592
2593         if (v == SEQ_START_TOKEN) {
2594                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2595                            "  sl  local_address rem_address   st tx_queue "
2596                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2597                            "inode");
2598                 goto out;
2599         }
2600         st = seq->private;
2601
2602         switch (st->state) {
2603         case TCP_SEQ_STATE_LISTENING:
2604         case TCP_SEQ_STATE_ESTABLISHED:
2605                 get_tcp4_sock(v, tmpbuf, st->num);
2606                 break;
2607         case TCP_SEQ_STATE_OPENREQ:
2608                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2609                 break;
2610         case TCP_SEQ_STATE_TIME_WAIT:
2611                 get_timewait4_sock(v, tmpbuf, st->num);
2612                 break;
2613         }
2614         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2615 out:
2616         return 0;
2617 }
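
/*
 * Editor's note (illustrative addition, not in the original file): the
 * seq_file plumbing above is what backs /proc/net/tcp.  A minimal
 * userspace sketch that simply dumps it:
 */
#if 0
#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/net/tcp", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}
#endif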
2618
2619 static struct file_operations tcp4_seq_fops;
2620 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2621         .owner          = THIS_MODULE,
2622         .name           = "tcp",
2623         .family         = AF_INET,
2624         .seq_show       = tcp4_seq_show,
2625         .seq_fops       = &tcp4_seq_fops,
2626 };
2627
2628 int __init tcp4_proc_init(void)
2629 {
2630         return tcp_proc_register(&tcp4_seq_afinfo);
2631 }
2632
2633 void tcp4_proc_exit(void)
2634 {
2635         tcp_proc_unregister(&tcp4_seq_afinfo);
2636 }
2637 #endif /* CONFIG_PROC_FS */
2638
2639 struct proto tcp_prot = {
2640         .name                   = "TCP",
2641         .close                  = tcp_close,
2642         .connect                = tcp_v4_connect,
2643         .disconnect             = tcp_disconnect,
2644         .accept                 = tcp_accept,
2645         .ioctl                  = tcp_ioctl,
2646         .init                   = tcp_v4_init_sock,
2647         .destroy                = tcp_v4_destroy_sock,
2648         .shutdown               = tcp_shutdown,
2649         .setsockopt             = tcp_setsockopt,
2650         .getsockopt             = tcp_getsockopt,
2651         .sendmsg                = tcp_sendmsg,
2652         .recvmsg                = tcp_recvmsg,
2653         .backlog_rcv            = tcp_v4_do_rcv,
2654         .hash                   = tcp_v4_hash,
2655         .unhash                 = tcp_unhash,
2656         .get_port               = tcp_v4_get_port,
2657         .enter_memory_pressure  = tcp_enter_memory_pressure,
2658         .sockets_allocated      = &tcp_sockets_allocated,
2659         .memory_allocated       = &tcp_memory_allocated,
2660         .memory_pressure        = &tcp_memory_pressure,
2661         .sysctl_mem             = sysctl_tcp_mem,
2662         .sysctl_wmem            = sysctl_tcp_wmem,
2663         .sysctl_rmem            = sysctl_tcp_rmem,
2664         .max_header             = MAX_TCP_HEADER,
2665         .slab_obj_size          = sizeof(struct tcp_sock),
2666 };
2667
2668
2669
2670 void __init tcp_v4_init(struct net_proto_family *ops)
2671 {
2672         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2673         if (err < 0)
2674                 panic("Failed to create the TCP control socket.\n");
2675         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2676         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2677
2678         /* Unhash it so that IP input processing does not even
2679          * see it; we do not wish this socket to see incoming
2680          * packets.
2681          */
2682         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2683 }
2684
2685 EXPORT_SYMBOL(ipv4_specific);
2686 EXPORT_SYMBOL(tcp_bind_hash);
2687 EXPORT_SYMBOL(tcp_bucket_create);
2688 EXPORT_SYMBOL(tcp_hashinfo);
2689 EXPORT_SYMBOL(tcp_inherit_port);
2690 EXPORT_SYMBOL(tcp_listen_wlock);
2691 EXPORT_SYMBOL(tcp_port_rover);
2692 EXPORT_SYMBOL(tcp_prot);
2693 EXPORT_SYMBOL(tcp_put_port);
2694 EXPORT_SYMBOL(tcp_unhash);
2695 EXPORT_SYMBOL(tcp_v4_conn_request);
2696 EXPORT_SYMBOL(tcp_v4_connect);
2697 EXPORT_SYMBOL(tcp_v4_do_rcv);
2698 EXPORT_SYMBOL(tcp_v4_lookup_listener);
2699 EXPORT_SYMBOL(tcp_v4_rebuild_header);
2700 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2701 EXPORT_SYMBOL(tcp_v4_send_check);
2702 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2703
2704 #ifdef CONFIG_PROC_FS
2705 EXPORT_SYMBOL(tcp_proc_register);
2706 EXPORT_SYMBOL(tcp_proc_unregister);
2707 #endif
2708 #ifdef CONFIG_SYSCTL
2709 EXPORT_SYMBOL(sysctl_local_port_range);
2710 EXPORT_SYMBOL(sysctl_max_syn_backlog);
2711 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2712 #endif