linux-2.6.git (ckrm-E13) - net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      open_request handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
 46  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
 47  *                                      year-long coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/tcp.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/xfrm.h>
71
72 #include <linux/inet.h>
73 #include <linux/ipv6.h>
74 #include <linux/stddef.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77
78 extern int sysctl_ip_dynaddr;
79 int sysctl_tcp_tw_reuse;
80 int sysctl_tcp_low_latency;
81
82 /* Check TCP sequence numbers in ICMP packets. */
83 #define ICMP_MIN_LENGTH 8
84
85 /* Socket used for sending RSTs */
86 static struct socket *tcp_socket;
87
88 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
89                        struct sk_buff *skb);
90
91 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
92         .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
93         .__tcp_lhash_users      =       ATOMIC_INIT(0),
94         .__tcp_lhash_wait
95           = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
96         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
97 };
98
99 /*
100  * This array holds the first and last local port number.
101  * For high-usage systems, use sysctl to change this to
102  * 32768-61000
103  */
104 int sysctl_local_port_range[2] = { 1024, 4999 };
105 int tcp_port_rover = 1024 - 1;
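/*
 * For illustration: on typical 2.6 kernels these two variables are exposed
 * through the net.ipv4.ip_local_port_range sysctl, so the range suggested
 * above can usually be applied at run time with something like:
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 *
 * (A sketch of common tuning practice, not something this file depends on.)
 */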
106
107 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
108                                  __u32 faddr, __u16 fport)
109 {
110         int h = (laddr ^ lport) ^ (faddr ^ fport);
111         h ^= h >> 16;
112         h ^= h >> 8;
113         return h & (tcp_ehash_size - 1);
114 }
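/*
 * Note on usage: tcp_sk_hashfn() below passes (rcv_saddr, num, daddr, dport),
 * i.e. the local address/port first, and the established-hash lookup in
 * __tcp_v4_lookup_established() calls tcp_hashfn(daddr, hnum, saddr, sport)
 * with the packet's destination (our local side) first as well, so both paths
 * land in the same tcp_ehash slot for a given connection.  The mask with
 * (tcp_ehash_size - 1) assumes tcp_ehash_size is a power of two.
 */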
115
116 static __inline__ int tcp_sk_hashfn(struct sock *sk)
117 {
118         struct inet_opt *inet = inet_sk(sk);
119         __u32 laddr = inet->rcv_saddr;
120         __u16 lport = inet->num;
121         __u32 faddr = inet->daddr;
122         __u16 fport = inet->dport;
123
124         return tcp_hashfn(laddr, lport, faddr, fport);
125 }
126
127 /* Allocate and initialize a new TCP local port bind bucket.
128  * The bindhash mutex for snum's hash chain must be held here.
129  */
130 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
131                                           unsigned short snum)
132 {
133         struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
134                                                       SLAB_ATOMIC);
135         if (tb) {
136                 tb->port = snum;
137                 tb->fastreuse = 0;
138                 INIT_HLIST_HEAD(&tb->owners);
139                 hlist_add_head(&tb->node, &head->chain);
140         }
141         return tb;
142 }
143
144 /* Caller must hold hashbucket lock for this tb with local BH disabled */
145 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
146 {
147         if (hlist_empty(&tb->owners)) {
148                 __hlist_del(&tb->node);
149                 kmem_cache_free(tcp_bucket_cachep, tb);
150         }
151 }
152
153 /* Caller must disable local BH processing. */
154 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
155 {
156         struct tcp_bind_hashbucket *head =
157                                 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
158         struct tcp_bind_bucket *tb;
159
160         spin_lock(&head->lock);
161         tb = tcp_sk(sk)->bind_hash;
162         sk_add_bind_node(child, &tb->owners);
163         tcp_sk(child)->bind_hash = tb;
164         spin_unlock(&head->lock);
165 }
166
167 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
168 {
169         local_bh_disable();
170         __tcp_inherit_port(sk, child);
171         local_bh_enable();
172 }
173
174 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
175                    unsigned short snum)
176 {
177         inet_sk(sk)->num = snum;
178         sk_add_bind_node(sk, &tb->owners);
179         tcp_sk(sk)->bind_hash = tb;
180 }
181
182 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
183 {
184         const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
185         struct sock *sk2;
186         struct hlist_node *node;
187         int reuse = sk->sk_reuse;
188
189         sk_for_each_bound(sk2, node, &tb->owners) {
190                 if (sk != sk2 &&
191                     !tcp_v6_ipv6only(sk2) &&
192                     (!sk->sk_bound_dev_if ||
193                      !sk2->sk_bound_dev_if ||
194                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195                         if (!reuse || !sk2->sk_reuse ||
196                             sk2->sk_state == TCP_LISTEN) {
197                                 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
198                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
199                                     sk2_rcv_saddr == sk_rcv_saddr)
200                                         break;
201                         }
202                 }
203         }
204         return node != NULL;
205 }
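/*
 * In short, two sockets on the same port conflict unless they are bound to
 * different non-wildcard devices, or both set SO_REUSEADDR with neither of
 * them in TCP_LISTEN, or their bound addresses are both non-wildcard and
 * different.  A minimal sketch of how this check plays out (the addresses
 * below are hypothetical):
 *
 *	bind(A, 10.0.0.1:80, SO_REUSEADDR)  +  bind(B, 10.0.0.2:80)          -> no conflict
 *	A already listening on *:80         +  bind(B, 0.0.0.0:80, REUSEADDR) -> conflict
 */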
206
207 /* Obtain a reference to a local port for the given sock,
208  * if snum is zero it means select any available local port.
209  */
210 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
211 {
212         struct tcp_bind_hashbucket *head;
213         struct hlist_node *node;
214         struct tcp_bind_bucket *tb;
215         int ret;
216
217         local_bh_disable();
218         if (!snum) {
219                 int low = sysctl_local_port_range[0];
220                 int high = sysctl_local_port_range[1];
221                 int remaining = (high - low) + 1;
222                 int rover;
223
224                 spin_lock(&tcp_portalloc_lock);
225                 rover = tcp_port_rover;
226                 do {
227                         rover++;
228                         if (rover < low || rover > high)
229                                 rover = low;
230                         head = &tcp_bhash[tcp_bhashfn(rover)];
231                         spin_lock(&head->lock);
232                         tb_for_each(tb, node, &head->chain)
233                                 if (tb->port == rover)
234                                         goto next;
235                         break;
236                 next:
237                         spin_unlock(&head->lock);
238                 } while (--remaining > 0);
239                 tcp_port_rover = rover;
240                 spin_unlock(&tcp_portalloc_lock);
241
242                 /* Exhausted local port range during search? */
243                 ret = 1;
244                 if (remaining <= 0)
245                         goto fail;
246
247                 /* OK, here is the one we will use.  HEAD is
248                  * non-NULL and we hold its lock.
249                  */
250                 snum = rover;
251         } else {
252                 head = &tcp_bhash[tcp_bhashfn(snum)];
253                 spin_lock(&head->lock);
254                 tb_for_each(tb, node, &head->chain)
255                         if (tb->port == snum)
256                                 goto tb_found;
257         }
258         tb = NULL;
259         goto tb_not_found;
260 tb_found:
261         if (!hlist_empty(&tb->owners)) {
262                 if (sk->sk_reuse > 1)
263                         goto success;
264                 if (tb->fastreuse > 0 &&
265                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
266                         goto success;
267                 } else {
268                         ret = 1;
269                         if (tcp_bind_conflict(sk, tb))
270                                 goto fail_unlock;
271                 }
272         }
273 tb_not_found:
274         ret = 1;
275         if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
276                 goto fail_unlock;
277         if (hlist_empty(&tb->owners)) {
278                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
279                         tb->fastreuse = 1;
280                 else
281                         tb->fastreuse = 0;
282         } else if (tb->fastreuse &&
283                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
284                 tb->fastreuse = 0;
285 success:
286         if (!tcp_sk(sk)->bind_hash)
287                 tcp_bind_hash(sk, tb, snum);
288         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
289         ret = 0;
290
291 fail_unlock:
292         spin_unlock(&head->lock);
293 fail:
294         local_bh_enable();
295         return ret;
296 }
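/*
 * Callers of tcp_v4_get_port() (it is wired up as the ->get_port hook of
 * tcp_prot) only care whether the result is zero: 0 means the port reference
 * was obtained and sk is linked into the bind hash, non-zero means the
 * bind/autobind must fail.
 */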
297
298 /* Get rid of any references to a local port held by the
299  * given sock.
300  */
301 static void __tcp_put_port(struct sock *sk)
302 {
303         struct inet_opt *inet = inet_sk(sk);
304         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
305         struct tcp_bind_bucket *tb;
306
307         spin_lock(&head->lock);
308         tb = tcp_sk(sk)->bind_hash;
309         __sk_del_bind_node(sk);
310         tcp_sk(sk)->bind_hash = NULL;
311         inet->num = 0;
312         tcp_bucket_destroy(tb);
313         spin_unlock(&head->lock);
314 }
315
316 void tcp_put_port(struct sock *sk)
317 {
318         local_bh_disable();
319         __tcp_put_port(sk);
320         local_bh_enable();
321 }
322
323 /* Waiting on this lock without WQ_FLAG_EXCLUSIVE is fine on UP but can be very bad on SMP.
324  * Look, when several writers sleep and the reader wakes them up, all but one
325  * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
326  * this, _but_ remember, it adds useless work on UP machines (a wake up on each
327  * exclusive lock release). It should really be ifdefed.
328  */
329
330 void tcp_listen_wlock(void)
331 {
332         write_lock(&tcp_lhash_lock);
333
334         if (atomic_read(&tcp_lhash_users)) {
335                 DEFINE_WAIT(wait);
336
337                 for (;;) {
338                         prepare_to_wait_exclusive(&tcp_lhash_wait,
339                                                 &wait, TASK_UNINTERRUPTIBLE);
340                         if (!atomic_read(&tcp_lhash_users))
341                                 break;
342                         write_unlock_bh(&tcp_lhash_lock);
343                         schedule();
344                         write_lock_bh(&tcp_lhash_lock);
345                 }
346
347                 finish_wait(&tcp_lhash_wait, &wait);
348         }
349 }
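/*
 * Writer-side sketch: tcp_listen_wlock() takes the listening-hash write lock
 * and then, if any lockless readers are still registered in tcp_lhash_users,
 * parks exclusively on tcp_lhash_wait (see the comment above) and re-takes the
 * lock until the reader count drops to zero.  The matching read side
 * (tcp_listen_lock()/tcp_listen_unlock(), defined in the header) is expected
 * to bump __tcp_lhash_users and wake tcp_lhash_wait when the last reader
 * drops out.
 */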
350
351 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
352 {
353         struct hlist_head *list;
354         rwlock_t *lock;
355
356         BUG_TRAP(sk_unhashed(sk));
357         if (listen_possible && sk->sk_state == TCP_LISTEN) {
358                 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
359                 lock = &tcp_lhash_lock;
360                 tcp_listen_wlock();
361         } else {
362                 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
363                 lock = &tcp_ehash[sk->sk_hashent].lock;
364                 write_lock(lock);
365         }
366         __sk_add_node(sk, list);
367         sock_prot_inc_use(sk->sk_prot);
368         write_unlock(lock);
369         if (listen_possible && sk->sk_state == TCP_LISTEN)
370                 wake_up(&tcp_lhash_wait);
371 }
372
373 static void tcp_v4_hash(struct sock *sk)
374 {
375         if (sk->sk_state != TCP_CLOSE) {
376                 local_bh_disable();
377                 __tcp_v4_hash(sk, 1);
378                 local_bh_enable();
379         }
380 }
381
382 void tcp_unhash(struct sock *sk)
383 {
384         rwlock_t *lock;
385
386         if (sk_unhashed(sk))
387                 goto ende;
388
389         if (sk->sk_state == TCP_LISTEN) {
390                 local_bh_disable();
391                 tcp_listen_wlock();
392                 lock = &tcp_lhash_lock;
393         } else {
394                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
395                 lock = &head->lock;
396                 write_lock_bh(&head->lock);
397         }
398
399         if (__sk_del_node_init(sk))
400                 sock_prot_dec_use(sk->sk_prot);
401         write_unlock_bh(lock);
402
403  ende:
404         if (sk->sk_state == TCP_LISTEN)
405                 wake_up(&tcp_lhash_wait);
406 }
407
408 /* Don't inline this cruft.  There are some nice properties to
409  * exploit here.  The BSD API does not allow a listening TCP
410  * to specify the remote port nor the remote address for the
411  * connection.  So always assume those are both wildcarded
412  * during the search since they can never be otherwise.
413  */
414 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
415                                              unsigned short hnum, int dif)
416 {
417         struct sock *result = NULL, *sk;
418         struct hlist_node *node;
419         int score, hiscore;
420
421         hiscore=-1;
422         sk_for_each(sk, node, head) {
423                 struct inet_opt *inet = inet_sk(sk);
424
425                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
426                         __u32 rcv_saddr = inet->rcv_saddr;
427
428                         score = (sk->sk_family == PF_INET ? 1 : 0);
429                         if (rcv_saddr) {
430                                 if (rcv_saddr != daddr)
431                                         continue;
432                                 score+=2;
433                         }
434                         if (sk->sk_bound_dev_if) {
435                                 if (sk->sk_bound_dev_if != dif)
436                                         continue;
437                                 score+=2;
438                         }
439                         if (score == 5)
440                                 return sk;
441                         if (score > hiscore) {
442                                 hiscore = score;
443                                 result = sk;
444                         }
445                 }
446         }
447         return result;
448 }
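/*
 * Scoring above: an AF_INET socket starts at 1 (an IPv6 socket accepting
 * v4-mapped connections starts at 0), +2 for a matching non-wildcard local
 * address and +2 for a matching bound device, so 5 is a perfect match and
 * terminates the walk early; otherwise the highest-scoring listener wins.
 */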
449
450 /* Optimize the common listener case. */
451 inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
452                                            int dif)
453 {
454         struct sock *sk = NULL;
455         struct hlist_head *head;
456
457         read_lock(&tcp_lhash_lock);
458         head = &tcp_listening_hash[tcp_lhashfn(hnum)];
459         if (!hlist_empty(head)) {
460                 struct inet_opt *inet = inet_sk((sk = __sk_head(head)));
461                 if (inet->num == hnum && !sk->sk_node.next &&
462                     (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
463                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
464                     !sk->sk_bound_dev_if)
465                         goto sherry_cache;
466                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
467         }
468         if (sk) {
469 sherry_cache:
470                 sock_hold(sk);
471         }
472         read_unlock(&tcp_lhash_lock);
473         return sk;
474 }
475
476 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
477  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
478  *
479  * Local BH must be disabled here.
480  */
481
482 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
483                                                        u32 daddr, u16 hnum,
484                                                        int dif)
485 {
486         struct tcp_ehash_bucket *head;
487         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
488         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
489         struct sock *sk;
490         struct hlist_node *node;
491         /* Optimize here for direct hit, only listening connections can
492          * have wildcards anyways.
493          */
494         int hash = tcp_hashfn(daddr, hnum, saddr, sport);
495         head = &tcp_ehash[hash];
496         read_lock(&head->lock);
497         sk_for_each(sk, node, &head->chain) {
498                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
499                         goto hit; /* You sunk my battleship! */
500         }
501
502         /* Must check for a TIME_WAIT'er before going to listener hash. */
503         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
504                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
505                         goto hit;
506         }
507         sk = NULL;
508 out:
509         read_unlock(&head->lock);
510         return sk;
511 hit:
512         sock_hold(sk);
513         goto out;
514 }
515
516 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
517                                            u32 daddr, u16 hnum, int dif)
518 {
519         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
520                                                       daddr, hnum, dif);
521
522         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
523 }
524
525 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
526                                   u16 dport, int dif)
527 {
528         struct sock *sk;
529
530         local_bh_disable();
531         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
532         local_bh_enable();
533
534         return sk;
535 }
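/*
 * Both lookup paths above take a reference (sock_hold()) on any socket they
 * return, so a caller such as tcp_v4_err() must drop it with sock_put() (or
 * tcp_tw_put() for a TIME_WAIT bucket) when it is done with the socket.
 */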
536
537 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
538 {
539         return secure_tcp_sequence_number(skb->nh.iph->daddr,
540                                           skb->nh.iph->saddr,
541                                           skb->h.th->dest,
542                                           skb->h.th->source);
543 }
544
545 /* called with local bh disabled */
546 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
547                                       struct tcp_tw_bucket **twp)
548 {
549         struct inet_opt *inet = inet_sk(sk);
550         u32 daddr = inet->rcv_saddr;
551         u32 saddr = inet->daddr;
552         int dif = sk->sk_bound_dev_if;
553         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
554         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
555         int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
556         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
557         struct sock *sk2;
558         struct hlist_node *node;
559         struct tcp_tw_bucket *tw;
560
561         write_lock(&head->lock);
562
563         /* Check TIME-WAIT sockets first. */
564         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
565                 tw = (struct tcp_tw_bucket *)sk2;
566
567                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
568                         struct tcp_opt *tp = tcp_sk(sk);
569
570                         /* With PAWS, it is safe from the viewpoint
571                            of data integrity. Even without PAWS it
572                            is safe provided sequence spaces do not
573                            overlap i.e. at data rates <= 80Mbit/sec.
574
575                            Actually, the idea is close to VJ's one,
576                            only the timestamp cache is held not per host
577                            but per port pair, and the TW bucket is used
578                            as the state holder.
579
580                            If the TW bucket has already been destroyed we
581                            fall back to VJ's scheme and use the initial
582                            timestamp retrieved from the peer table.
583                          */
584                         if (tw->tw_ts_recent_stamp &&
585                             (!twp || (sysctl_tcp_tw_reuse &&
586                                       xtime.tv_sec -
587                                       tw->tw_ts_recent_stamp > 1))) {
588                                 if ((tp->write_seq =
589                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
590                                         tp->write_seq = 1;
591                                 tp->ts_recent       = tw->tw_ts_recent;
592                                 tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
593                                 sock_hold(sk2);
594                                 goto unique;
595                         } else
596                                 goto not_unique;
597                 }
598         }
599         tw = NULL;
600
601         /* And established part... */
602         sk_for_each(sk2, node, &head->chain) {
603                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
604                         goto not_unique;
605         }
606
607 unique:
608         /* Must record num and sport now. Otherwise we will see
609          * a socket with a funny identity in the hash table. */
610         inet->num = lport;
611         inet->sport = htons(lport);
612         sk->sk_hashent = hash;
613         BUG_TRAP(sk_unhashed(sk));
614         __sk_add_node(sk, &head->chain);
615         sock_prot_inc_use(sk->sk_prot);
616         write_unlock(&head->lock);
617
618         if (twp) {
619                 *twp = tw;
620                 NET_INC_STATS_BH(TimeWaitRecycled);
621         } else if (tw) {
622                 /* Silly. Should hash-dance instead... */
623                 tcp_tw_deschedule(tw);
624                 NET_INC_STATS_BH(TimeWaitRecycled);
625
626                 tcp_tw_put(tw);
627         }
628
629         return 0;
630
631 not_unique:
632         write_unlock(&head->lock);
633         return -EADDRNOTAVAIL;
634 }
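/*
 * Recap of the TIME-WAIT reuse rule above: a matching tw bucket may be taken
 * over only when it has a recorded timestamp, and either the caller did not
 * ask to have the bucket handed back (twp == NULL, the explicitly-bound case
 * in tcp_v4_hash_connect) or sysctl_tcp_tw_reuse is set and the timestamp is
 * more than one second old.  The new write_seq is seeded 65537 bytes past the
 * old tw_snd_nxt so the new incarnation's sequence space cannot be confused
 * with the old one's.
 */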
635
636 /*
637  * Bind a port for a connect operation and hash it.
638  */
639 static int tcp_v4_hash_connect(struct sock *sk)
640 {
641         unsigned short snum = inet_sk(sk)->num;
642         struct tcp_bind_hashbucket *head;
643         struct tcp_bind_bucket *tb;
644         int ret;
645
646         if (!snum) {
647                 int rover;
648                 int low = sysctl_local_port_range[0];
649                 int high = sysctl_local_port_range[1];
650                 int remaining = (high - low) + 1;
651                 struct hlist_node *node;
652                 struct tcp_tw_bucket *tw = NULL;
653
654                 local_bh_disable();
655
656                 /* TODO. Actually it is not such a bad idea to remove
657                  * tcp_portalloc_lock before the next submission to Linus.
658                  * As soon as we touch this place at all it is time to think.
659                  *
660                  * Right now it protects a single _advisory_ variable,
661                  * tcp_port_rover, hence it is mostly useless.
662                  * The code will work nicely if we just delete it, but
663                  * I am afraid that in the contended case it will work no
664                  * better, or even worse: another cpu will just hit the same
665                  * bucket and spin there.
666                  * So some cpu salt could remove both contention and
667                  * memory pingpong. Any ideas how to do this in a nice way?
668                  */
669                 spin_lock(&tcp_portalloc_lock);
670                 rover = tcp_port_rover;
671
672                 do {
673                         rover++;
674                         if ((rover < low) || (rover > high))
675                                 rover = low;
676                         head = &tcp_bhash[tcp_bhashfn(rover)];
677                         spin_lock(&head->lock);
678
679                         /* Does not bother with rcv_saddr checks,
680                          * because the established check is already
681                          * unique enough.
682                          */
683                         tb_for_each(tb, node, &head->chain) {
684                                 if (tb->port == rover) {
685                                         BUG_TRAP(!hlist_empty(&tb->owners));
686                                         if (tb->fastreuse >= 0)
687                                                 goto next_port;
688                                         if (!__tcp_v4_check_established(sk,
689                                                                         rover,
690                                                                         &tw))
691                                                 goto ok;
692                                         goto next_port;
693                                 }
694                         }
695
696                         tb = tcp_bucket_create(head, rover);
697                         if (!tb) {
698                                 spin_unlock(&head->lock);
699                                 break;
700                         }
701                         tb->fastreuse = -1;
702                         goto ok;
703
704                 next_port:
705                         spin_unlock(&head->lock);
706                 } while (--remaining > 0);
707                 tcp_port_rover = rover;
708                 spin_unlock(&tcp_portalloc_lock);
709
710                 local_bh_enable();
711
712                 return -EADDRNOTAVAIL;
713
714 ok:
715                 /* All locks still held and bhs disabled */
716                 tcp_port_rover = rover;
717                 spin_unlock(&tcp_portalloc_lock);
718
719                 tcp_bind_hash(sk, tb, rover);
720                 if (sk_unhashed(sk)) {
721                         inet_sk(sk)->sport = htons(rover);
722                         __tcp_v4_hash(sk, 0);
723                 }
724                 spin_unlock(&head->lock);
725
726                 if (tw) {
727                         tcp_tw_deschedule(tw);
728                         tcp_tw_put(tw);
729                 }
730
731                 ret = 0;
732                 goto out;
733         }
734
735         head  = &tcp_bhash[tcp_bhashfn(snum)];
736         tb  = tcp_sk(sk)->bind_hash;
737         spin_lock_bh(&head->lock);
738         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
739                 __tcp_v4_hash(sk, 0);
740                 spin_unlock_bh(&head->lock);
741                 return 0;
742         } else {
743                 spin_unlock(&head->lock);
744                 /* No definite answer... Walk to established hash table */
745                 ret = __tcp_v4_check_established(sk, snum, NULL);
746 out:
747                 local_bh_enable();
748                 return ret;
749         }
750 }
751
752 /* This will initiate an outgoing connection. */
753 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
754 {
755         struct inet_opt *inet = inet_sk(sk);
756         struct tcp_opt *tp = tcp_sk(sk);
757         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
758         struct rtable *rt;
759         u32 daddr, nexthop;
760         int tmp;
761         int err;
762
763         if (addr_len < sizeof(struct sockaddr_in))
764                 return -EINVAL;
765
766         if (usin->sin_family != AF_INET)
767                 return -EAFNOSUPPORT;
768
769         nexthop = daddr = usin->sin_addr.s_addr;
770         if (inet->opt && inet->opt->srr) {
771                 if (!daddr)
772                         return -EINVAL;
773                 nexthop = inet->opt->faddr;
774         }
775
776         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
777                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
778                                IPPROTO_TCP,
779                                inet->sport, usin->sin_port, sk);
780         if (tmp < 0)
781                 return tmp;
782
783         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
784                 ip_rt_put(rt);
785                 return -ENETUNREACH;
786         }
787
788         if (!inet->opt || !inet->opt->srr)
789                 daddr = rt->rt_dst;
790
791         if (!inet->saddr)
792                 inet->saddr = rt->rt_src;
793         inet->rcv_saddr = inet->saddr;
794
795         if (tp->ts_recent_stamp && inet->daddr != daddr) {
796                 /* Reset inherited state */
797                 tp->ts_recent       = 0;
798                 tp->ts_recent_stamp = 0;
799                 tp->write_seq       = 0;
800         }
801
802         if (sysctl_tcp_tw_recycle &&
803             !tp->ts_recent_stamp && rt->rt_dst == daddr) {
804                 struct inet_peer *peer = rt_get_peer(rt);
805
806                 /* VJ's idea. We save the last timestamp seen from
807                  * the destination in the peer table when entering TIME-WAIT
808                  * state, and initialize ts_recent from it when trying a new
809                  * connection.
809                  */
810
811                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
812                         tp->ts_recent_stamp = peer->tcp_ts_stamp;
813                         tp->ts_recent = peer->tcp_ts;
814                 }
815         }
816
817         inet->dport = usin->sin_port;
818         inet->daddr = daddr;
819
820         tp->ext_header_len = 0;
821         if (inet->opt)
822                 tp->ext_header_len = inet->opt->optlen;
823
824         tp->mss_clamp = 536;
825
826         /* Socket identity is still unknown (sport may be zero).
827          * However we set the state to SYN-SENT and, without releasing the
828          * socket lock, select a source port, enter ourselves into the hash
829          * tables and complete initialization after this.
830          */
831         tcp_set_state(sk, TCP_SYN_SENT);
832         err = tcp_v4_hash_connect(sk);
833         if (err)
834                 goto failure;
835
836         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
837         if (err)
838                 goto failure;
839
840         /* OK, now commit destination to socket.  */
841         __sk_dst_set(sk, &rt->u.dst);
842         tcp_v4_setup_caps(sk, &rt->u.dst);
843         tp->ext2_header_len = rt->u.dst.header_len;
844
845         if (!tp->write_seq)
846                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
847                                                            inet->daddr,
848                                                            inet->sport,
849                                                            usin->sin_port);
850
851         inet->id = tp->write_seq ^ jiffies;
852
853         err = tcp_connect(sk);
854         rt = NULL;
855         if (err)
856                 goto failure;
857
858         return 0;
859
860 failure:
861         /* This unhashes the socket and releases the local port, if necessary. */
862         tcp_set_state(sk, TCP_CLOSE);
863         ip_rt_put(rt);
864         sk->sk_route_caps = 0;
865         inet->dport = 0;
866         return err;
867 }
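/*
 * Order of operations in tcp_v4_connect() above: route the destination, adopt
 * the source address and daddr from the route, move to SYN-SENT, bind and
 * hash via tcp_v4_hash_connect() (which may pick an ephemeral port), rebind
 * the route to the final port pair with ip_route_newports(), pick a secure
 * initial sequence number if one is not already set, and finally let
 * tcp_connect() build and send the SYN.  Any failure unwinds to TCP_CLOSE,
 * which also releases the local port.
 */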
868
869 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
870 {
871         return ((struct rtable *)skb->dst)->rt_iif;
872 }
873
874 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
875 {
876         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
877 }
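/*
 * The SYN queue is a simple chained hash of open_requests: jhash_2words()
 * mixes the peer's address and port with the per-listener hash_rnd salt
 * (randomizing the layout against remote collision attacks), and the result
 * is masked to TCP_SYNQ_HSIZE slots, which assumes TCP_SYNQ_HSIZE is a power
 * of two.
 */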
878
879 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
880                                               struct open_request ***prevp,
881                                               __u16 rport,
882                                               __u32 raddr, __u32 laddr)
883 {
884         struct tcp_listen_opt *lopt = tp->listen_opt;
885         struct open_request *req, **prev;
886
887         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
888              (req = *prev) != NULL;
889              prev = &req->dl_next) {
890                 if (req->rmt_port == rport &&
891                     req->af.v4_req.rmt_addr == raddr &&
892                     req->af.v4_req.loc_addr == laddr &&
893                     TCP_INET_FAMILY(req->class->family)) {
894                         BUG_TRAP(!req->sk);
895                         *prevp = prev;
896                         break;
897                 }
898         }
899
900         return req;
901 }
902
903 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
904 {
905         struct tcp_opt *tp = tcp_sk(sk);
906         struct tcp_listen_opt *lopt = tp->listen_opt;
907         u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
908
909         req->expires = jiffies + TCP_TIMEOUT_INIT;
910         req->retrans = 0;
911         req->sk = NULL;
912         req->dl_next = lopt->syn_table[h];
913
914         write_lock(&tp->syn_wait_lock);
915         lopt->syn_table[h] = req;
916         write_unlock(&tp->syn_wait_lock);
917
918 #ifdef CONFIG_ACCEPT_QUEUES
919         tcp_synq_added(sk, req);
920 #else
921         tcp_synq_added(sk);
922 #endif
923 }
924
925
926 /*
927  * This routine does path mtu discovery as defined in RFC1191.
928  */
929 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
930                                      u32 mtu)
931 {
932         struct dst_entry *dst;
933         struct inet_opt *inet = inet_sk(sk);
934         struct tcp_opt *tp = tcp_sk(sk);
935
936         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
937          * sent out by Linux are always < 576 bytes so they should go through
938          * unfragmented).
939          */
940         if (sk->sk_state == TCP_LISTEN)
941                 return;
942
943         /* We don't check in the dst entry whether pmtu discovery is forbidden
944          * on this route. We just assume that no packet-too-big packets
945          * are sent back when pmtu discovery is not active.
946          * There is a small race when the user changes this flag in the
947          * route, but I think that's acceptable.
948          */
949         if ((dst = __sk_dst_check(sk, 0)) == NULL)
950                 return;
951
952         dst->ops->update_pmtu(dst, mtu);
953
954         /* Something is about to go wrong... Remember the soft error
955          * in case this connection is not able to recover.
956          */
957         if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
958                 sk->sk_err_soft = EMSGSIZE;
959
960         mtu = dst_pmtu(dst);
961
962         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
963             tp->pmtu_cookie > mtu) {
964                 tcp_sync_mss(sk, mtu);
965
966                 /* Resend the TCP packet because it's
967                  * clear that the old packet has been
968                  * dropped. This is the new "fast" path mtu
969                  * discovery.
970                  */
971                 tcp_simple_retransmit(sk);
972         } /* else let the usual retransmit timer handle it */
973 }
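/*
 * Flow of the PMTU handler above: the ICMP-supplied mtu is pushed into the
 * cached route via update_pmtu(); a soft EMSGSIZE is remembered if we set DF
 * but the route refuses to come down to the reported value; and if the
 * socket's cached pmtu_cookie is now too large, tcp_sync_mss() shrinks the
 * mss and tcp_simple_retransmit() resends the segments that were dropped as
 * too big.
 */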
974
975 /*
976  * This routine is called by the ICMP module when it gets some
977  * sort of error condition.  If err < 0 then the socket should
978  * be closed and the error returned to the user.  If err > 0
979  * it's just the icmp type << 8 | icmp code.  After adjustment
980  * header points to the first 8 bytes of the tcp header.  We need
981  * to find the appropriate port.
982  *
983  * The locking strategy used here is very "optimistic". When
984  * someone else accesses the socket the ICMP is just dropped
985  * and for some paths there is no check at all.
986  * A more general error queue to queue errors for later handling
987  * is probably better.
988  *
989  */
990
991 void tcp_v4_err(struct sk_buff *skb, u32 info)
992 {
993         struct iphdr *iph = (struct iphdr *)skb->data;
994         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
995         struct tcp_opt *tp;
996         struct inet_opt *inet;
997         int type = skb->h.icmph->type;
998         int code = skb->h.icmph->code;
999         struct sock *sk;
1000         __u32 seq;
1001         int err;
1002
1003         if (skb->len < (iph->ihl << 2) + 8) {
1004                 ICMP_INC_STATS_BH(IcmpInErrors);
1005                 return;
1006         }
1007
1008         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
1009                            th->source, tcp_v4_iif(skb));
1010         if (!sk) {
1011                 ICMP_INC_STATS_BH(IcmpInErrors);
1012                 return;
1013         }
1014         if (sk->sk_state == TCP_TIME_WAIT) {
1015                 tcp_tw_put((struct tcp_tw_bucket *)sk);
1016                 return;
1017         }
1018
1019         bh_lock_sock(sk);
1020         /* If too many ICMPs get dropped on busy
1021          * servers this needs to be solved differently.
1022          */
1023         if (sock_owned_by_user(sk))
1024                 NET_INC_STATS_BH(LockDroppedIcmps);
1025
1026         if (sk->sk_state == TCP_CLOSE)
1027                 goto out;
1028
1029         tp = tcp_sk(sk);
1030         seq = ntohl(th->seq);
1031         if (sk->sk_state != TCP_LISTEN &&
1032             !between(seq, tp->snd_una, tp->snd_nxt)) {
1033                 NET_INC_STATS(OutOfWindowIcmps);
1034                 goto out;
1035         }
1036
1037         switch (type) {
1038         case ICMP_SOURCE_QUENCH:
1039                 /* This is deprecated, but if someone generated it,
1040                  * we have no reasons to ignore it.
1041                  */
1042                 if (!sock_owned_by_user(sk))
1043                         tcp_enter_cwr(tp);
1044                 goto out;
1045         case ICMP_PARAMETERPROB:
1046                 err = EPROTO;
1047                 break;
1048         case ICMP_DEST_UNREACH:
1049                 if (code > NR_ICMP_UNREACH)
1050                         goto out;
1051
1052                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1053                         if (!sock_owned_by_user(sk))
1054                                 do_pmtu_discovery(sk, iph, info);
1055                         goto out;
1056                 }
1057
1058                 err = icmp_err_convert[code].errno;
1059                 break;
1060         case ICMP_TIME_EXCEEDED:
1061                 err = EHOSTUNREACH;
1062                 break;
1063         default:
1064                 goto out;
1065         }
1066
1067         switch (sk->sk_state) {
1068                 struct open_request *req, **prev;
1069         case TCP_LISTEN:
1070                 if (sock_owned_by_user(sk))
1071                         goto out;
1072
1073                 req = tcp_v4_search_req(tp, &prev, th->dest,
1074                                         iph->daddr, iph->saddr);
1075                 if (!req)
1076                         goto out;
1077
1078                 /* ICMPs are not backlogged, hence we cannot get
1079                    an established socket here.
1080                  */
1081                 BUG_TRAP(!req->sk);
1082
1083                 if (seq != req->snt_isn) {
1084                         NET_INC_STATS_BH(OutOfWindowIcmps);
1085                         goto out;
1086                 }
1087
1088                 /*
1089                  * Still in SYN_RECV, just remove it silently.
1090                  * There is no good way to pass the error to the newly
1091                  * created socket, and POSIX does not want network
1092                  * errors returned from accept().
1093                  */
1094                 tcp_synq_drop(sk, req, prev);
1095                 goto out;
1096
1097         case TCP_SYN_SENT:
1098         case TCP_SYN_RECV:  /* Cannot happen.
1099                                It can, f.e., happen if SYNs crossed.
1100                              */
1101                 if (!sock_owned_by_user(sk)) {
1102                         TCP_INC_STATS_BH(TcpAttemptFails);
1103                         sk->sk_err = err;
1104
1105                         sk->sk_error_report(sk);
1106
1107                         tcp_done(sk);
1108                 } else {
1109                         sk->sk_err_soft = err;
1110                 }
1111                 goto out;
1112         }
1113
1114         /* If we've already connected we will keep trying
1115          * until we time out, or the user gives up.
1116          *
1117          * rfc1122 4.2.3.9 allows us to treat only PROTO_UNREACH and
1118          * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
1119          * but it is obsoleted by pmtu discovery).
1120          *
1121          * Note that in the modern internet, where routing is unreliable
1122          * and broken firewalls sit in every dark corner sending random
1123          * errors ordered by their masters, even these two messages finally
1124          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
1125          *
1126          * Now we are in compliance with RFCs.
1127          *                                                      --ANK (980905)
1128          */
1129
1130         inet = inet_sk(sk);
1131         if (!sock_owned_by_user(sk) && inet->recverr) {
1132                 sk->sk_err = err;
1133                 sk->sk_error_report(sk);
1134         } else  { /* Only an error on timeout */
1135                 sk->sk_err_soft = err;
1136         }
1137
1138 out:
1139         bh_unlock_sock(sk);
1140         sock_put(sk);
1141 }
1142
1143 /* This routine computes an IPv4 TCP checksum. */
1144 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1145                        struct sk_buff *skb)
1146 {
1147         struct inet_opt *inet = inet_sk(sk);
1148
1149         if (skb->ip_summed == CHECKSUM_HW) {
1150                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1151                 skb->csum = offsetof(struct tcphdr, check);
1152         } else {
1153                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1154                                          csum_partial((char *)th,
1155                                                       th->doff << 2,
1156                                                       skb->csum));
1157         }
1158 }
1159
1160 /*
1161  *      This routine will send an RST to the other tcp.
1162  *
1163  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1164  *                    for the reset.
1165  *      Answer: if a packet caused the RST, it is not for a socket
1166  *              existing in our system; if it is matched to a socket,
1167  *              it is just a duplicate segment or a bug in the other side's TCP.
1168  *              So we build the reply based only on the parameters that
1169  *              arrived with the segment.
1170  *      Exception: precedence violation. We do not implement it in any case.
1171  */
1172
1173 static void tcp_v4_send_reset(struct sk_buff *skb)
1174 {
1175         struct tcphdr *th = skb->h.th;
1176         struct tcphdr rth;
1177         struct ip_reply_arg arg;
1178
1179         /* Never send a reset in response to a reset. */
1180         if (th->rst)
1181                 return;
1182
1183         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1184                 return;
1185
1186         /* Swap the send and the receive. */
1187         memset(&rth, 0, sizeof(struct tcphdr));
1188         rth.dest   = th->source;
1189         rth.source = th->dest;
1190         rth.doff   = sizeof(struct tcphdr) / 4;
1191         rth.rst    = 1;
1192
1193         if (th->ack) {
1194                 rth.seq = th->ack_seq;
1195         } else {
1196                 rth.ack = 1;
1197                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1198                                     skb->len - (th->doff << 2));
1199         }
1200
1201         memset(&arg, 0, sizeof arg);
1202         arg.iov[0].iov_base = (unsigned char *)&rth;
1203         arg.iov[0].iov_len  = sizeof rth;
1204         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1205                                       skb->nh.iph->saddr, /*XXX*/
1206                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1207         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1208
1209         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1210
1211         TCP_INC_STATS_BH(TcpOutSegs);
1212         TCP_INC_STATS_BH(TcpOutRsts);
1213 }
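/*
 * Sequence number choice in the RST above, per RFC 793: if the offending
 * segment carried an ACK, the reset is sent with seq = that ack_seq and no
 * ACK bit; otherwise the reset has seq = 0 (from the memset) and acknowledges
 * everything the segment occupied (SYN + FIN + payload length), so the peer
 * will accept it.
 */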
1214
1215 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1216    outside socket context, is certainly ugly. What can I do?
1217  */
1218
1219 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1220                             u32 win, u32 ts)
1221 {
1222         struct tcphdr *th = skb->h.th;
1223         struct {
1224                 struct tcphdr th;
1225                 u32 tsopt[3];
1226         } rep;
1227         struct ip_reply_arg arg;
1228
1229         memset(&rep.th, 0, sizeof(struct tcphdr));
1230         memset(&arg, 0, sizeof arg);
1231
1232         arg.iov[0].iov_base = (unsigned char *)&rep;
1233         arg.iov[0].iov_len  = sizeof(rep.th);
1234         if (ts) {
1235                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1236                                      (TCPOPT_TIMESTAMP << 8) |
1237                                      TCPOLEN_TIMESTAMP);
1238                 rep.tsopt[1] = htonl(tcp_time_stamp);
1239                 rep.tsopt[2] = htonl(ts);
1240                 arg.iov[0].iov_len = sizeof(rep);
1241         }
1242
1243         /* Swap the send and the receive. */
1244         rep.th.dest    = th->source;
1245         rep.th.source  = th->dest;
1246         rep.th.doff    = arg.iov[0].iov_len / 4;
1247         rep.th.seq     = htonl(seq);
1248         rep.th.ack_seq = htonl(ack);
1249         rep.th.ack     = 1;
1250         rep.th.window  = htons(win);
1251
1252         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1253                                       skb->nh.iph->saddr, /*XXX*/
1254                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1255         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1256
1257         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1258
1259         TCP_INC_STATS_BH(TcpOutSegs);
1260 }
1261
1262 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1263 {
1264         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1265
1266         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1267                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1268
1269         tcp_tw_put(tw);
1270 }
1271
1272 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1273 {
1274         tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1275                         req->ts_recent);
1276 }
1277
1278 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1279                                           struct open_request *req)
1280 {
1281         struct rtable *rt;
1282         struct ip_options *opt = req->af.v4_req.opt;
1283         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1284                             .nl_u = { .ip4_u =
1285                                       { .daddr = ((opt && opt->srr) ?
1286                                                   opt->faddr :
1287                                                   req->af.v4_req.rmt_addr),
1288                                         .saddr = req->af.v4_req.loc_addr,
1289                                         .tos = RT_CONN_FLAGS(sk) } },
1290                             .proto = IPPROTO_TCP,
1291                             .uli_u = { .ports =
1292                                        { .sport = inet_sk(sk)->sport,
1293                                          .dport = req->rmt_port } } };
1294
1295         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1296                 IP_INC_STATS_BH(IpOutNoRoutes);
1297                 return NULL;
1298         }
1299         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1300                 ip_rt_put(rt);
1301                 IP_INC_STATS_BH(IpOutNoRoutes);
1302                 return NULL;
1303         }
1304         return &rt->u.dst;
1305 }
1306
1307 /*
1308  *      Send a SYN-ACK after having received a SYN.
1309  *      This still operates on an open_request only, not on a big
1310  *      socket.
1311  */
1312 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1313                               struct dst_entry *dst)
1314 {
1315         int err = -1;
1316         struct sk_buff * skb;
1317
1318         /* First, grab a route. */
1319         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1320                 goto out;
1321
1322         skb = tcp_make_synack(sk, dst, req);
1323
1324         if (skb) {
1325                 struct tcphdr *th = skb->h.th;
1326
1327                 th->check = tcp_v4_check(th, skb->len,
1328                                          req->af.v4_req.loc_addr,
1329                                          req->af.v4_req.rmt_addr,
1330                                          csum_partial((char *)th, skb->len,
1331                                                       skb->csum));
1332
1333                 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1334                                             req->af.v4_req.rmt_addr,
1335                                             req->af.v4_req.opt);
1336                 if (err == NET_XMIT_CN)
1337                         err = 0;
1338         }
1339
1340 out:
1341         dst_release(dst);
1342         return err;
1343 }
1344
1345 /*
1346  *      IPv4 open_request destructor.
1347  */
1348 static void tcp_v4_or_free(struct open_request *req)
1349 {
1350         if (req->af.v4_req.opt)
1351                 kfree(req->af.v4_req.opt);
1352 }
1353
1354 static inline void syn_flood_warning(struct sk_buff *skb)
1355 {
1356         static unsigned long warntime;
1357
1358         if (time_after(jiffies, (warntime + HZ * 60))) {
1359                 warntime = jiffies;
1360                 printk(KERN_INFO
1361                        "possible SYN flooding on port %d. Sending cookies.\n",
1362                        ntohs(skb->h.th->dest));
1363         }
1364 }
1365
1366 /*
1367  * Save and compile IPv4 options into the open_request if needed.
1368  */
1369 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1370                                                      struct sk_buff *skb)
1371 {
1372         struct ip_options *opt = &(IPCB(skb)->opt);
1373         struct ip_options *dopt = NULL;
1374
1375         if (opt && opt->optlen) {
1376                 int opt_size = optlength(opt);
1377                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1378                 if (dopt) {
1379                         if (ip_options_echo(dopt, skb)) {
1380                                 kfree(dopt);
1381                                 dopt = NULL;
1382                         }
1383                 }
1384         }
1385         return dopt;
1386 }
1387
1388 /*
1389  * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1390  * One SYN_RECV socket costs about 80bytes on a 32bit machine.
1391  * It would be better to replace it with a global counter for all sockets
1392  * but then some measure against one socket starving all other sockets
1393  * would be needed.
1394  *
1395  * It was 128 by default. Experiments with real servers show that
1396  * it is absolutely not enough even at 100 conn/sec. 256 cures most
1397  * of the problems. This value is adjusted to 128 for very small machines
1398  * (<= 32Mb of memory) and to 1024 on normal or better ones (>= 256Mb).
1399  * Increasing it further requires changing the hash table size.
1400  */
1401 int sysctl_max_syn_backlog = 256;
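/*
 * This is the variable normally exported as net.ipv4.tcp_max_syn_backlog;
 * for example (a typical tuning sketch, not something this code requires):
 *
 *	echo 1024 > /proc/sys/net/ipv4/tcp_max_syn_backlog
 */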
1402
1403 struct or_calltable or_ipv4 = {
1404         .family         =       PF_INET,
1405         .rtx_syn_ack    =       tcp_v4_send_synack,
1406         .send_ack       =       tcp_v4_or_send_ack,
1407         .destructor     =       tcp_v4_or_free,
1408         .send_reset     =       tcp_v4_send_reset,
1409 };
1410
1411 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1412 {
1413         struct tcp_opt tp;
1414         struct open_request *req;
1415         __u32 saddr = skb->nh.iph->saddr;
1416         __u32 daddr = skb->nh.iph->daddr;
1417         __u32 isn = TCP_SKB_CB(skb)->when;
1418         struct dst_entry *dst = NULL;
1419 #ifdef CONFIG_ACCEPT_QUEUES
1420         int class = 0;
1421 #endif
1422 #ifdef CONFIG_SYN_COOKIES
1423         int want_cookie = 0;
1424 #else
1425 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1426 #endif
1427
1428         /* Never answer to SYNs sent to broadcast or multicast */
1429         if (((struct rtable *)skb->dst)->rt_flags &
1430             (RTCF_BROADCAST | RTCF_MULTICAST))
1431                 goto drop;
1432
1433         /* TW buckets are converted to open requests without
1434          * limitations; they conserve resources and the peer is
1435          * evidently a real one.
1436          */
1437         if (tcp_synq_is_full(sk) && !isn) {
1438 #ifdef CONFIG_SYN_COOKIES
1439                 if (sysctl_tcp_syncookies) {
1440                         want_cookie = 1;
1441                 } else
1442 #endif
1443                 goto drop;
1444         }
1445
1446 #ifdef CONFIG_ACCEPT_QUEUES
1447         class = (skb->nfmark <= 0) ? 0 :
1448                 ((skb->nfmark >= NUM_ACCEPT_QUEUES) ? 0: skb->nfmark);
1449         /*
1450          * Accept only if the class has shares set or if the default class
1451          * i.e. class 0 has shares
1452          */
1453         if (!(tcp_sk(sk)->acceptq[class].aq_valid)) {
1454                 if (tcp_sk(sk)->acceptq[0].aq_valid) 
1455                         class = 0;
1456                 else
1457                         goto drop;
1458         }
1459 #endif
1460
1461         /* The accept backlog is full. If we have already queued enough
1462          * warm entries in the SYN queue, drop the request; that is better
1463          * than clogging the SYN queue with openreqs whose timeouts grow
1464          * exponentially.
1465          */
1466 #ifdef CONFIG_ACCEPT_QUEUES
1467         if (tcp_acceptq_is_full(sk, class) && tcp_synq_young(sk, class) > 1)
1468 #else
1469         if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1470 #endif
1471                 goto drop;
1472
1473         req = tcp_openreq_alloc();
1474         if (!req)
1475                 goto drop;
1476
1477         tcp_clear_options(&tp);
1478         tp.mss_clamp = 536;
1479         tp.user_mss  = tcp_sk(sk)->user_mss;
1480
1481         tcp_parse_options(skb, &tp, 0);
1482
1483         if (want_cookie) {
1484                 tcp_clear_options(&tp);
1485                 tp.saw_tstamp = 0;
1486         }
1487
1488         if (tp.saw_tstamp && !tp.rcv_tsval) {
1489                 /* Some OSes (unknown ones, seen in the wild on web servers
1490                  * aimed at Windows users) negotiate the timestamp option
1491                  * but send a zero timestamp in the SYN. That is the easy
1492                  * case: we simply do not advertise TS support.
1493                  */
1494                 tp.saw_tstamp = 0;
1495                 tp.tstamp_ok  = 0;
1496         }
1497         tp.tstamp_ok = tp.saw_tstamp;
1498
1499         tcp_openreq_init(req, &tp, skb);
1500 #ifdef CONFIG_ACCEPT_QUEUES
1501         req->acceptq_class = class;
1502         req->acceptq_time_stamp = jiffies;
1503 #endif
1504         req->af.v4_req.loc_addr = daddr;
1505         req->af.v4_req.rmt_addr = saddr;
1506         req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1507         req->class = &or_ipv4;
1508         if (!want_cookie)
1509                 TCP_ECN_create_request(req, skb->h.th);
1510
1511         if (want_cookie) {
1512 #ifdef CONFIG_SYN_COOKIES
1513                 syn_flood_warning(skb);
1514 #endif
1515                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1516         } else if (!isn) {
1517                 struct inet_peer *peer = NULL;
1518
1519                 /* VJ's idea: when entering TIME-WAIT we save the last
1520                  * timestamp seen from the destination in the peer table,
1521                  * and check against it before accepting a new connection
1522                  * request.
1523                  *
1524                  * If "isn" is not zero, this request hit a live TIME-WAIT
1525                  * bucket, and all the necessary checks were already made
1526                  * by the code processing the TIME-WAIT state.
1527                  */
1528                 if (tp.saw_tstamp &&
1529                     sysctl_tcp_tw_recycle &&
1530                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1531                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1532                     peer->v4daddr == saddr) {
1533                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1534                             (s32)(peer->tcp_ts - req->ts_recent) >
1535                                                         TCP_PAWS_WINDOW) {
1536                                 NET_INC_STATS_BH(PAWSPassiveRejected);
1537                                 dst_release(dst);
1538                                 goto drop_and_free;
1539                         }
1540                 }
1541                 /* Kill the following clause if you dislike this heuristic. */
1542                 else if (!sysctl_tcp_syncookies &&
1543                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1544                           (sysctl_max_syn_backlog >> 2)) &&
1545                          (!peer || !peer->tcp_ts_stamp) &&
1546                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1547                         /* Without syncookies, the last quarter of the
1548                          * backlog is reserved for destinations proven
1549                          * to be alive.
1550                          * That means we keep communicating with peers
1551                          * that were already remembered before the
1552                          * SYN flood began.
1553                          */
1554                         NETDEBUG(if (net_ratelimit()) \
1555                                         printk(KERN_DEBUG "TCP: drop open "
1556                                                           "request from %u.%u."
1557                                                           "%u.%u/%u\n", \
1558                                                NIPQUAD(saddr),
1559                                                ntohs(skb->h.th->source)));
1560                         dst_release(dst);
1561                         goto drop_and_free;
1562                 }
1563
1564                 isn = tcp_v4_init_sequence(sk, skb);
1565         }
1566         req->snt_isn = isn;
1567
1568         if (tcp_v4_send_synack(sk, req, dst))
1569                 goto drop_and_free;
1570
1571         if (want_cookie) {
1572                 tcp_openreq_free(req);
1573         } else {
1574                 tcp_v4_synq_add(sk, req);
1575         }
1576         return 0;
1577
1578 drop_and_free:
1579         tcp_openreq_free(req);
1580 drop:
1581         TCP_INC_STATS_BH(TcpAttemptFails);
1582         return 0;
1583 }
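
/*
 * Summary of the admission policy implemented by tcp_v4_conn_request() above,
 * as a comment-only sketch:
 *
 *	if the SYN queue is full and the SYN did not come via TIME-WAIT recycle:
 *		answer with a syncookie if enabled, otherwise drop;
 *	if the accept queue is full and few "young" (never retransmitted)
 *	openreqs remain:
 *		drop rather than let warm entries time out;
 *	allocate an openreq, parse options, choose the ISN
 *		(cookie, PAWS check against the peer cache, or random);
 *	send the SYN-ACK; keep the openreq in the SYN queue unless a cookie
 *		was used, in which case it is freed immediately.
 */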
1584
1585
1586 /*
1587  * The three-way handshake has completed - we received the final
1588  * valid ACK - now create the new socket.
1589  */
1590 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1591                                   struct open_request *req,
1592                                   struct dst_entry *dst)
1593 {
1594         struct inet_opt *newinet;
1595         struct tcp_opt *newtp;
1596         struct sock *newsk;
1597
1598 #ifdef CONFIG_ACCEPT_QUEUES
1599         if (tcp_acceptq_is_full(sk, req->acceptq_class))
1600 #else
1601         if (tcp_acceptq_is_full(sk))
1602 #endif
1603                 goto exit_overflow;
1604
1605         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1606                 goto exit;
1607
1608         newsk = tcp_create_openreq_child(sk, req, skb);
1609         if (!newsk)
1610                 goto exit;
1611
1612         newsk->sk_dst_cache = dst;
1613         tcp_v4_setup_caps(newsk, dst);
1614
1615         newtp                 = tcp_sk(newsk);
1616         newinet               = inet_sk(newsk);
1617         newinet->daddr        = req->af.v4_req.rmt_addr;
1618         newinet->rcv_saddr    = req->af.v4_req.loc_addr;
1619         newinet->saddr        = req->af.v4_req.loc_addr;
1620         newinet->opt          = req->af.v4_req.opt;
1621         req->af.v4_req.opt    = NULL;
1622         newinet->mc_index     = tcp_v4_iif(skb);
1623         newinet->mc_ttl       = skb->nh.iph->ttl;
1624         newtp->ext_header_len = 0;
1625         if (newinet->opt)
1626                 newtp->ext_header_len = newinet->opt->optlen;
1627         newtp->ext2_header_len = dst->header_len;
1628         newinet->id = newtp->write_seq ^ jiffies;
1629
1630         tcp_sync_mss(newsk, dst_pmtu(dst));
1631         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1632         tcp_initialize_rcv_mss(newsk);
1633
1634         __tcp_v4_hash(newsk, 0);
1635         __tcp_inherit_port(sk, newsk);
1636
1637         return newsk;
1638
1639 exit_overflow:
1640         NET_INC_STATS_BH(ListenOverflows);
1641 exit:
1642         NET_INC_STATS_BH(ListenDrops);
1643         dst_release(dst);
1644         return NULL;
1645 }
1646
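/*
 * Demultiplex a segment that arrived on a LISTEN socket: first look for a
 * matching open_request (SYN_RECV), then for an already established (or
 * TIME-WAIT) child created earlier, and finally, with syncookies enabled,
 * let a bare ACK that carries a valid cookie create the child.
 */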
1647 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1648 {
1649         struct tcphdr *th = skb->h.th;
1650         struct iphdr *iph = skb->nh.iph;
1651         struct tcp_opt *tp = tcp_sk(sk);
1652         struct sock *nsk;
1653         struct open_request **prev;
1654         /* Find possible connection requests. */
1655         struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1656                                                      iph->saddr, iph->daddr);
1657         if (req)
1658                 return tcp_check_req(sk, skb, req, prev);
1659
1660         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1661                                           th->source,
1662                                           skb->nh.iph->daddr,
1663                                           ntohs(th->dest),
1664                                           tcp_v4_iif(skb));
1665
1666         if (nsk) {
1667                 if (nsk->sk_state != TCP_TIME_WAIT) {
1668                         bh_lock_sock(nsk);
1669                         return nsk;
1670                 }
1671                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1672                 return NULL;
1673         }
1674
1675 #ifdef CONFIG_SYN_COOKIES
1676         if (!th->rst && !th->syn && th->ack)
1677                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1678 #endif
1679         return sk;
1680 }
1681
1682 static int tcp_v4_checksum_init(struct sk_buff *skb)
1683 {
1684         if (skb->ip_summed == CHECKSUM_HW) {
1685                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1686                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1687                                   skb->nh.iph->daddr, skb->csum))
1688                         return 0;
1689
1690                 NETDEBUG(if (net_ratelimit())
1691                                 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1692                 skb->ip_summed = CHECKSUM_NONE;
1693         }
1694         if (skb->len <= 76) {
1695                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1696                                  skb->nh.iph->daddr,
1697                                  skb_checksum(skb, 0, skb->len, 0)))
1698                         return -1;
1699                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1700         } else {
1701                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1702                                           skb->nh.iph->saddr,
1703                                           skb->nh.iph->daddr, 0);
1704         }
1705         return 0;
1706 }
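
/*
 * tcp_v4_check() above folds the IPv4 pseudo-header (source address,
 * destination address, protocol and TCP length) into the checksum.  A minimal
 * sketch of what it amounts to is shown below; the real helper lives in the
 * headers and may differ in detail:
 *
 *	static inline u16 example_tcp_v4_check(struct tcphdr *th, int len,
 *					       u32 saddr, u32 daddr,
 *					       unsigned int base)
 *	{
 *		return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);
 *	}
 */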
1707
1708
1709 /* The socket must have its spinlock held when we get
1710  * here.
1711  *
1712  * We have a potential double-lock case here, so even when
1713  * doing backlog processing we use the BH locking scheme.
1714  * This is because we cannot sleep with the original spinlock
1715  * held.
1716  */
1717 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1718 {
1719         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1720                 TCP_CHECK_TIMER(sk);
1721                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1722                         goto reset;
1723                 TCP_CHECK_TIMER(sk);
1724                 return 0;
1725         }
1726
1727         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1728                 goto csum_err;
1729
1730         if (sk->sk_state == TCP_LISTEN) {
1731                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1732                 if (!nsk)
1733                         goto discard;
1734
1735                 if (nsk != sk) {
1736                         if (tcp_child_process(sk, nsk, skb))
1737                                 goto reset;
1738                         return 0;
1739                 }
1740         }
1741
1742         TCP_CHECK_TIMER(sk);
1743         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1744                 goto reset;
1745         TCP_CHECK_TIMER(sk);
1746         return 0;
1747
1748 reset:
1749         tcp_v4_send_reset(skb);
1750 discard:
1751         kfree_skb(skb);
1752         /* Be careful here. If this function gets more complicated and
1753          * gcc suffers from register pressure on the x86, sk (in %ebx)
1754          * might be destroyed here. This current version compiles correctly,
1755          * but you have been warned.
1756          */
1757         return 0;
1758
1759 csum_err:
1760         TCP_INC_STATS_BH(TcpInErrs);
1761         goto discard;
1762 }
1763
1764 /*
1765  *      From tcp_input.c
1766  */
1767
1768 int tcp_v4_rcv(struct sk_buff *skb)
1769 {
1770         struct tcphdr *th;
1771         struct sock *sk;
1772         int ret;
1773
1774         if (skb->pkt_type != PACKET_HOST)
1775                 goto discard_it;
1776
1777         /* Count it even if it's bad */
1778         TCP_INC_STATS_BH(TcpInSegs);
1779
1780         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1781                 goto discard_it;
1782
1783         th = skb->h.th;
1784
1785         if (th->doff < sizeof(struct tcphdr) / 4)
1786                 goto bad_packet;
1787         if (!pskb_may_pull(skb, th->doff * 4))
1788                 goto discard_it;
1789
1790         /* An explanation is required here, I think.
1791          * Packet length and doff are validated by header prediction,
1792          * provided the case of th->doff == 0 is eliminated.
1793          * So we defer the checks. */
1794         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1795              tcp_v4_checksum_init(skb) < 0))
1796                 goto bad_packet;
1797
1798         th = skb->h.th;
1799         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1800         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1801                                     skb->len - th->doff * 4);
1802         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1803         TCP_SKB_CB(skb)->when    = 0;
1804         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1805         TCP_SKB_CB(skb)->sacked  = 0;
1806
1807         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1808                              skb->nh.iph->daddr, ntohs(th->dest),
1809                              tcp_v4_iif(skb));
1810
1811         if (!sk)
1812                 goto no_tcp_socket;
1813
1814 process:
1815         if (sk->sk_state == TCP_TIME_WAIT)
1816                 goto do_time_wait;
1817
1818         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1819                 goto discard_and_relse;
1820
1821         if (sk_filter(sk, skb, 0))
1822                 goto discard_and_relse;
1823
1824         skb->dev = NULL;
1825
1826         bh_lock_sock(sk);
1827         ret = 0;
1828         if (!sock_owned_by_user(sk)) {
1829                 if (!tcp_prequeue(sk, skb))
1830                         ret = tcp_v4_do_rcv(sk, skb);
1831         } else
1832                 sk_add_backlog(sk, skb);
1833         bh_unlock_sock(sk);
1834
1835         sock_put(sk);
1836
1837         return ret;
1838
1839 no_tcp_socket:
1840         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1841                 goto discard_it;
1842
1843         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1844 bad_packet:
1845                 TCP_INC_STATS_BH(TcpInErrs);
1846         } else {
1847                 tcp_v4_send_reset(skb);
1848         }
1849
1850 discard_it:
1851         /* Discard frame. */
1852         kfree_skb(skb);
1853         return 0;
1854
1855 discard_and_relse:
1856         sock_put(sk);
1857         goto discard_it;
1858
1859 do_time_wait:
1860         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1861                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1862                 goto discard_it;
1863         }
1864
1865         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1866                 TCP_INC_STATS_BH(TcpInErrs);
1867                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1868                 goto discard_it;
1869         }
1870         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1871                                            skb, th, skb->len)) {
1872         case TCP_TW_SYN: {
1873                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1874                                                           ntohs(th->dest),
1875                                                           tcp_v4_iif(skb));
1876                 if (sk2) {
1877                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1878                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1879                         sk = sk2;
1880                         goto process;
1881                 }
1882                 /* Fall through to ACK */
1883         }
1884         case TCP_TW_ACK:
1885                 tcp_v4_timewait_ack(sk, skb);
1886                 break;
1887         case TCP_TW_RST:
1888                 goto no_tcp_socket;
1889         case TCP_TW_SUCCESS:;
1890         }
1891         goto discard_it;
1892 }
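
/*
 * Delivery paths taken by tcp_v4_rcv() above, as a comment-only sketch:
 *
 *	if the socket is not owned by a user context:
 *		try tcp_prequeue() (defer processing to the reading task);
 *		otherwise call tcp_v4_do_rcv() right here in softirq context;
 *	else:
 *		sk_add_backlog(); the backlog is later drained through
 *		->backlog_rcv (tcp_v4_do_rcv) when release_sock() runs.
 */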
1893
1894 /* With per-bucket locks this operation is not atomic, so this
1895  * version is no worse.
1896  */
1897 static void __tcp_v4_rehash(struct sock *sk)
1898 {
1899         sk->sk_prot->unhash(sk);
1900         sk->sk_prot->hash(sk);
1901 }
1902
1903 static int tcp_v4_reselect_saddr(struct sock *sk)
1904 {
1905         struct inet_opt *inet = inet_sk(sk);
1906         int err;
1907         struct rtable *rt;
1908         __u32 old_saddr = inet->saddr;
1909         __u32 new_saddr;
1910         __u32 daddr = inet->daddr;
1911
1912         if (inet->opt && inet->opt->srr)
1913                 daddr = inet->opt->faddr;
1914
1915         /* Query new route. */
1916         err = ip_route_connect(&rt, daddr, 0,
1917                                RT_TOS(inet->tos) | sk->sk_localroute,
1918                                sk->sk_bound_dev_if,
1919                                IPPROTO_TCP,
1920                                inet->sport, inet->dport, sk);
1921         if (err)
1922                 return err;
1923
1924         __sk_dst_set(sk, &rt->u.dst);
1925         tcp_v4_setup_caps(sk, &rt->u.dst);
1926         tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1927
1928         new_saddr = rt->rt_src;
1929
1930         if (new_saddr == old_saddr)
1931                 return 0;
1932
1933         if (sysctl_ip_dynaddr > 1) {
1934                 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1935                                  "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1936                        NIPQUAD(old_saddr),
1937                        NIPQUAD(new_saddr));
1938         }
1939
1940         inet->saddr = new_saddr;
1941         inet->rcv_saddr = new_saddr;
1942
1943         /* XXX The only ugly spot where we really need to
1944          * XXX change the socket's identity after it has
1945          * XXX entered the hashes. -DaveM
1946          *
1947          * Besides that, it does not check for connection
1948          * uniqueness. Expect trouble.
1949          */
1950         __tcp_v4_rehash(sk);
1951         return 0;
1952 }
1953
1954 int tcp_v4_rebuild_header(struct sock *sk)
1955 {
1956         struct inet_opt *inet = inet_sk(sk);
1957         struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1958         u32 daddr;
1959         int err;
1960
1961         /* Route is OK, nothing to do. */
1962         if (rt)
1963                 return 0;
1964
1965         /* Reroute. */
1966         daddr = inet->daddr;
1967         if (inet->opt && inet->opt->srr)
1968                 daddr = inet->opt->faddr;
1969
1970         {
1971                 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1972                                     .nl_u = { .ip4_u =
1973                                               { .daddr = daddr,
1974                                                 .saddr = inet->saddr,
1975                                                 .tos = RT_CONN_FLAGS(sk) } },
1976                                     .proto = IPPROTO_TCP,
1977                                     .uli_u = { .ports =
1978                                                { .sport = inet->sport,
1979                                                  .dport = inet->dport } } };
1980                                                 
1981                 err = ip_route_output_flow(&rt, &fl, sk, 0);
1982         }
1983         if (!err) {
1984                 __sk_dst_set(sk, &rt->u.dst);
1985                 tcp_v4_setup_caps(sk, &rt->u.dst);
1986                 tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1987                 return 0;
1988         }
1989
1990         /* Routing failed... */
1991         sk->sk_route_caps = 0;
1992
1993         if (!sysctl_ip_dynaddr ||
1994             sk->sk_state != TCP_SYN_SENT ||
1995             (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1996             (err = tcp_v4_reselect_saddr(sk)) != 0)
1997                 sk->sk_err_soft = -err;
1998
1999         return err;
2000 }
2001
2002 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
2003 {
2004         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
2005         struct inet_opt *inet = inet_sk(sk);
2006
2007         sin->sin_family         = AF_INET;
2008         sin->sin_addr.s_addr    = inet->daddr;
2009         sin->sin_port           = inet->dport;
2010 }
2011
2012 /* VJ's idea. Save the last timestamp seen from this destination and hold
2013  * it for at least the normal TIME-WAIT interval, to use for duplicate-
2014  * segment detection in subsequent connections before they enter the
2015  * synchronized state.
2016  */
2017
2018 int tcp_v4_remember_stamp(struct sock *sk)
2019 {
2020         struct inet_opt *inet = inet_sk(sk);
2021         struct tcp_opt *tp = tcp_sk(sk);
2022         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
2023         struct inet_peer *peer = NULL;
2024         int release_it = 0;
2025
2026         if (!rt || rt->rt_dst != inet->daddr) {
2027                 peer = inet_getpeer(inet->daddr, 1);
2028                 release_it = 1;
2029         } else {
2030                 if (!rt->peer)
2031                         rt_bind_peer(rt, 1);
2032                 peer = rt->peer;
2033         }
2034
2035         if (peer) {
2036                 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
2037                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2038                      peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
2039                         peer->tcp_ts_stamp = tp->ts_recent_stamp;
2040                         peer->tcp_ts = tp->ts_recent;
2041                 }
2042                 if (release_it)
2043                         inet_putpeer(peer);
2044                 return 1;
2045         }
2046
2047         return 0;
2048 }
2049
2050 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2051 {
2052         struct inet_peer *peer = NULL;
2053
2054         peer = inet_getpeer(tw->tw_daddr, 1);
2055
2056         if (peer) {
2057                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2058                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2059                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2060                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2061                         peer->tcp_ts = tw->tw_ts_recent;
2062                 }
2063                 inet_putpeer(peer);
2064                 return 1;
2065         }
2066
2067         return 0;
2068 }
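
/*
 * The (tcp_ts, tcp_ts_stamp) pair cached by the two helpers above is consumed
 * in tcp_v4_conn_request(): with tcp_tw_recycle enabled, a SYN whose timestamp
 * is older than the one remembered for this peer is rejected, roughly:
 *
 *	if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
 *	    (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW)
 *		reject the request;	(NET_INC_STATS_BH(PAWSPassiveRejected))
 */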
2069
2070 struct tcp_func ipv4_specific = {
2071         .queue_xmit     =       ip_queue_xmit,
2072         .send_check     =       tcp_v4_send_check,
2073         .rebuild_header =       tcp_v4_rebuild_header,
2074         .conn_request   =       tcp_v4_conn_request,
2075         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2076         .remember_stamp =       tcp_v4_remember_stamp,
2077         .net_header_len =       sizeof(struct iphdr),
2078         .setsockopt     =       ip_setsockopt,
2079         .getsockopt     =       ip_getsockopt,
2080         .addr2sockaddr  =       v4_addr2sockaddr,
2081         .sockaddr_len   =       sizeof(struct sockaddr_in),
2082 };
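
/*
 * Address-family dispatch table: the protocol-independent TCP code reaches
 * IPv4 through tp->af_specific (for example, transmitted segments end up in
 * ->queue_xmit, i.e. ip_queue_xmit); tcp_v6 installs an analogous table for
 * AF_INET6 sockets.
 */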
2083
2084 /* NOTE: many fields are set to zero explicitly by the call to
2085  *       sk_alloc(), so they need not be initialized here.
2086  */
2087 static int tcp_v4_init_sock(struct sock *sk)
2088 {
2089         struct tcp_opt *tp = tcp_sk(sk);
2090
2091         skb_queue_head_init(&tp->out_of_order_queue);
2092         tcp_init_xmit_timers(sk);
2093         tcp_prequeue_init(tp);
2094
2095         tp->rto  = TCP_TIMEOUT_INIT;
2096         tp->mdev = TCP_TIMEOUT_INIT;
2097
2098         /* So many TCP implementations out there (incorrectly) count the
2099          * initial SYN frame in their delayed-ACK and congestion control
2100          * algorithms that we must have the following bandaid to talk
2101          * efficiently to them.  -DaveM
2102          */
2103         tp->snd_cwnd = 2;
2104
2105         /* See draft-stevens-tcpca-spec-01 for discussion of the
2106          * initialization of these values.
2107          */
2108         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2109         tp->snd_cwnd_clamp = ~0;
2110         tp->mss_cache = 536;
2111
2112         tp->reordering = sysctl_tcp_reordering;
2113
2114         sk->sk_state = TCP_CLOSE;
2115
2116         sk->sk_write_space = tcp_write_space;
2117         sk->sk_use_write_queue = 1;
2118
2119         tp->af_specific = &ipv4_specific;
2120
2121         sk->sk_sndbuf = sysctl_tcp_wmem[1];
2122         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2123
2124         atomic_inc(&tcp_sockets_allocated);
2125
2126         return 0;
2127 }
2128
2129 static int tcp_v4_destroy_sock(struct sock *sk)
2130 {
2131         struct tcp_opt *tp = tcp_sk(sk);
2132
2133         tcp_clear_xmit_timers(sk);
2134
2135         /* Clean up the write buffer. */
2136         tcp_writequeue_purge(sk);
2137
2138         /* Clean up our (hopefully empty) out_of_order_queue. */
2139         __skb_queue_purge(&tp->out_of_order_queue);
2140
2141         /* Clean the prequeue; it really should be empty by now. */
2142         __skb_queue_purge(&tp->ucopy.prequeue);
2143
2144         /* Clean up a referenced TCP bind bucket. */
2145         if (tp->bind_hash)
2146                 tcp_put_port(sk);
2147
2148         /* If sendmsg cached page exists, toss it. */
2149         if (inet_sk(sk)->sndmsg_page)
2150                 __free_page(inet_sk(sk)->sndmsg_page);
2151
2152         atomic_dec(&tcp_sockets_allocated);
2153
2154         return 0;
2155 }
2156
2157 #ifdef CONFIG_PROC_FS
2158 /* Proc filesystem TCP sock list dumping. */
2159
2160 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2161 {
2162         return hlist_empty(head) ? NULL :
2163                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2164 }
2165
2166 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2167 {
2168         return tw->tw_node.next ?
2169                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2170 }
2171
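/*
 * The /proc iterator below walks four states: LISTENING (the listen hash
 * buckets), OPENREQ (the SYN table hanging off one listener), ESTABLISHED
 * (the ehash chains) and TIME_WAIT (the second half of the ehash).
 * st->bucket, st->sbucket and st->syn_wait_sk remember where the previous
 * read stopped so the next one can resume there.
 */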
2172 static void *listening_get_next(struct seq_file *seq, void *cur)
2173 {
2174         struct tcp_opt *tp;
2175         struct hlist_node *node;
2176         struct sock *sk = cur;
2177         struct tcp_iter_state* st = seq->private;
2178
2179         if (!sk) {
2180                 st->bucket = 0;
2181                 sk = sk_head(&tcp_listening_hash[0]);
2182                 goto get_sk;
2183         }
2184
2185         ++st->num;
2186
2187         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2188                 struct open_request *req = cur;
2189
2190                 tp = tcp_sk(st->syn_wait_sk);
2191                 req = req->dl_next;
2192                 while (1) {
2193                         while (req) {
2194                                 if (req->class->family == st->family) {
2195                                         cur = req;
2196                                         goto out;
2197                                 }
2198                                 req = req->dl_next;
2199                         }
2200                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2201                                 break;
2202 get_req:
2203                         req = tp->listen_opt->syn_table[st->sbucket];
2204                 }
2205                 sk        = sk_next(st->syn_wait_sk);
2206                 st->state = TCP_SEQ_STATE_LISTENING;
2207                 read_unlock_bh(&tp->syn_wait_lock);
2208         } else
2209                 sk = sk_next(sk);
2210 get_sk:
2211         sk_for_each_from(sk, node) {
2212                 if (sk->sk_family == st->family) {
2213                         cur = sk;
2214                         goto out;
2215                 }
2216                 tp = tcp_sk(sk);
2217                 read_lock_bh(&tp->syn_wait_lock);
2218                 if (tp->listen_opt && tp->listen_opt->qlen) {
2219                         st->uid         = sock_i_uid(sk);
2220                         st->syn_wait_sk = sk;
2221                         st->state       = TCP_SEQ_STATE_OPENREQ;
2222                         st->sbucket     = 0;
2223                         goto get_req;
2224                 }
2225                 read_unlock_bh(&tp->syn_wait_lock);
2226         }
2227         if (++st->bucket < TCP_LHTABLE_SIZE) {
2228                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2229                 goto get_sk;
2230         }
2231         cur = NULL;
2232 out:
2233         return cur;
2234 }
2235
2236 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2237 {
2238         void *rc = listening_get_next(seq, NULL);
2239
2240         while (rc && *pos) {
2241                 rc = listening_get_next(seq, rc);
2242                 --*pos;
2243         }
2244         return rc;
2245 }
2246
2247 static void *established_get_first(struct seq_file *seq)
2248 {
2249         struct tcp_iter_state* st = seq->private;
2250         void *rc = NULL;
2251
2252         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2253                 struct sock *sk;
2254                 struct hlist_node *node;
2255                 struct tcp_tw_bucket *tw;
2256                
2257                 read_lock(&tcp_ehash[st->bucket].lock);
2258                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2259                         if (sk->sk_family != st->family) {
2260                                 continue;
2261                         }
2262                         rc = sk;
2263                         goto out;
2264                 }
2265                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2266                 tw_for_each(tw, node,
2267                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2268                         if (tw->tw_family != st->family) {
2269                                 continue;
2270                         }
2271                         rc = tw;
2272                         goto out;
2273                 }
2274                 read_unlock(&tcp_ehash[st->bucket].lock);
2275                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2276         }
2277 out:
2278         return rc;
2279 }
2280
2281 static void *established_get_next(struct seq_file *seq, void *cur)
2282 {
2283         struct sock *sk = cur;
2284         struct tcp_tw_bucket *tw;
2285         struct hlist_node *node;
2286         struct tcp_iter_state* st = seq->private;
2287
2288         ++st->num;
2289
2290         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2291                 tw = cur;
2292                 tw = tw_next(tw);
2293 get_tw:
2294                 while (tw && tw->tw_family != st->family) {
2295                         tw = tw_next(tw);
2296                 }
2297                 if (tw) {
2298                         cur = tw;
2299                         goto out;
2300                 }
2301                 read_unlock(&tcp_ehash[st->bucket].lock);
2302                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2303                 if (++st->bucket < tcp_ehash_size) {
2304                         read_lock(&tcp_ehash[st->bucket].lock);
2305                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2306                 } else {
2307                         cur = NULL;
2308                         goto out;
2309                 }
2310         } else
2311                 sk = sk_next(sk);
2312
2313         sk_for_each_from(sk, node) {
2314                 if (sk->sk_family == st->family)
2315                         goto found;
2316         }
2317
2318         st->state = TCP_SEQ_STATE_TIME_WAIT;
2319         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2320         goto get_tw;
2321 found:
2322         cur = sk;
2323 out:
2324         return cur;
2325 }
2326
2327 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2328 {
2329         void *rc = established_get_first(seq);
2330
2331         while (rc && pos) {
2332                 rc = established_get_next(seq, rc);
2333                 --pos;
2334         }               
2335         return rc;
2336 }
2337
2338 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2339 {
2340         void *rc;
2341         struct tcp_iter_state* st = seq->private;
2342
2343         tcp_listen_lock();
2344         st->state = TCP_SEQ_STATE_LISTENING;
2345         rc        = listening_get_idx(seq, &pos);
2346
2347         if (!rc) {
2348                 tcp_listen_unlock();
2349                 local_bh_disable();
2350                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2351                 rc        = established_get_idx(seq, pos);
2352         }
2353
2354         return rc;
2355 }
2356
2357 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2358 {
2359         struct tcp_iter_state* st = seq->private;
2360         st->state = TCP_SEQ_STATE_LISTENING;
2361         st->num = 0;
2362         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2363 }
2364
2365 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2366 {
2367         void *rc = NULL;
2368         struct tcp_iter_state* st;
2369
2370         if (v == SEQ_START_TOKEN) {
2371                 rc = tcp_get_idx(seq, 0);
2372                 goto out;
2373         }
2374         st = seq->private;
2375
2376         switch (st->state) {
2377         case TCP_SEQ_STATE_OPENREQ:
2378         case TCP_SEQ_STATE_LISTENING:
2379                 rc = listening_get_next(seq, v);
2380                 if (!rc) {
2381                         tcp_listen_unlock();
2382                         local_bh_disable();
2383                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2384                         rc        = established_get_first(seq);
2385                 }
2386                 break;
2387         case TCP_SEQ_STATE_ESTABLISHED:
2388         case TCP_SEQ_STATE_TIME_WAIT:
2389                 rc = established_get_next(seq, v);
2390                 break;
2391         }
2392 out:
2393         ++*pos;
2394         return rc;
2395 }
2396
2397 static void tcp_seq_stop(struct seq_file *seq, void *v)
2398 {
2399         struct tcp_iter_state* st = seq->private;
2400
2401         switch (st->state) {
2402         case TCP_SEQ_STATE_OPENREQ:
2403                 if (v) {
2404                         struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
2405                         read_unlock_bh(&tp->syn_wait_lock);
2406                 }
2407         case TCP_SEQ_STATE_LISTENING:
2408                 if (v != SEQ_START_TOKEN)
2409                         tcp_listen_unlock();
2410                 break;
2411         case TCP_SEQ_STATE_TIME_WAIT:
2412         case TCP_SEQ_STATE_ESTABLISHED:
2413                 if (v)
2414                         read_unlock(&tcp_ehash[st->bucket].lock);
2415                 local_bh_enable();
2416                 break;
2417         }
2418 }
2419
2420 static int tcp_seq_open(struct inode *inode, struct file *file)
2421 {
2422         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2423         struct seq_file *seq;
2424         struct tcp_iter_state *s;
2425         int rc;
2426
2427         if (unlikely(afinfo == NULL))
2428                 return -EINVAL;
2429
2430         s = kmalloc(sizeof(*s), GFP_KERNEL);
2431         if (!s)
2432                 return -ENOMEM;
2433         memset(s, 0, sizeof(*s));
2434         s->family               = afinfo->family;
2435         s->seq_ops.start        = tcp_seq_start;
2436         s->seq_ops.next         = tcp_seq_next;
2437         s->seq_ops.show         = afinfo->seq_show;
2438         s->seq_ops.stop         = tcp_seq_stop;
2439
2440         rc = seq_open(file, &s->seq_ops);
2441         if (rc)
2442                 goto out_kfree;
2443         seq          = file->private_data;
2444         seq->private = s;
2445 out:
2446         return rc;
2447 out_kfree:
2448         kfree(s);
2449         goto out;
2450 }
2451
2452 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2453 {
2454         int rc = 0;
2455         struct proc_dir_entry *p;
2456
2457         if (!afinfo)
2458                 return -EINVAL;
2459         afinfo->seq_fops->owner         = afinfo->owner;
2460         afinfo->seq_fops->open          = tcp_seq_open;
2461         afinfo->seq_fops->read          = seq_read;
2462         afinfo->seq_fops->llseek        = seq_lseek;
2463         afinfo->seq_fops->release       = seq_release_private;
2464         
2465         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2466         if (p)
2467                 p->data = afinfo;
2468         else
2469                 rc = -ENOMEM;
2470         return rc;
2471 }
2472
2473 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2474 {
2475         if (!afinfo)
2476                 return;
2477         proc_net_remove(afinfo->name);
2478         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2479 }
2480
2481 static void get_openreq4(struct sock *sk, struct open_request *req,
2482                          char *tmpbuf, int i, int uid)
2483 {
2484         int ttd = req->expires - jiffies;
2485
2486         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2487                 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
2488                 i,
2489                 req->af.v4_req.loc_addr,
2490                 ntohs(inet_sk(sk)->sport),
2491                 req->af.v4_req.rmt_addr,
2492                 ntohs(req->rmt_port),
2493                 TCP_SYN_RECV,
2494                 0, 0, /* could print option size, but that is af dependent. */
2495                 1,    /* timers active (only the expire timer) */
2496                 jiffies_to_clock_t(ttd),
2497                 req->retrans,
2498                 uid,
2499                 0,  /* non standard timer */
2500                 0, /* open_requests have no inode */
2501                 atomic_read(&sk->sk_refcnt),
2502                 req);
2503 }
2504
2505 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2506 {
2507         int timer_active;
2508         unsigned long timer_expires;
2509         struct tcp_opt *tp = tcp_sk(sp);
2510         struct inet_opt *inet = inet_sk(sp);
2511         unsigned int dest = inet->daddr;
2512         unsigned int src = inet->rcv_saddr;
2513         __u16 destp = ntohs(inet->dport);
2514         __u16 srcp = ntohs(inet->sport);
2515
2516         if (tp->pending == TCP_TIME_RETRANS) {
2517                 timer_active    = 1;
2518                 timer_expires   = tp->timeout;
2519         } else if (tp->pending == TCP_TIME_PROBE0) {
2520                 timer_active    = 4;
2521                 timer_expires   = tp->timeout;
2522         } else if (timer_pending(&sp->sk_timer)) {
2523                 timer_active    = 2;
2524                 timer_expires   = sp->sk_timer.expires;
2525         } else {
2526                 timer_active    = 0;
2527                 timer_expires = jiffies;
2528         }
2529
2530         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2531                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2532                 i, src, srcp, dest, destp, sp->sk_state,
2533                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2534                 timer_active,
2535                 jiffies_to_clock_t(timer_expires - jiffies),
2536                 tp->retransmits,
2537                 sock_i_uid(sp),
2538                 tp->probes_out,
2539                 sock_i_ino(sp),
2540                 atomic_read(&sp->sk_refcnt), sp,
2541                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2542                 tp->snd_cwnd,
2543                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2544 }
2545
2546 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2547 {
2548         unsigned int dest, src;
2549         __u16 destp, srcp;
2550         int ttd = tw->tw_ttd - jiffies;
2551
2552         if (ttd < 0)
2553                 ttd = 0;
2554
2555         dest  = tw->tw_daddr;
2556         src   = tw->tw_rcv_saddr;
2557         destp = ntohs(tw->tw_dport);
2558         srcp  = ntohs(tw->tw_sport);
2559
2560         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2561                 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2562                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2563                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2564                 atomic_read(&tw->tw_refcnt), tw);
2565 }
2566
2567 #define TMPSZ 150
2568
2569 static int tcp4_seq_show(struct seq_file *seq, void *v)
2570 {
2571         struct tcp_iter_state* st;
2572         char tmpbuf[TMPSZ + 1];
2573
2574         if (v == SEQ_START_TOKEN) {
2575                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2576                            "  sl  local_address rem_address   st tx_queue "
2577                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2578                            "inode");
2579                 goto out;
2580         }
2581         st = seq->private;
2582
2583         switch (st->state) {
2584         case TCP_SEQ_STATE_LISTENING:
2585         case TCP_SEQ_STATE_ESTABLISHED:
2586                 get_tcp4_sock(v, tmpbuf, st->num);
2587                 break;
2588         case TCP_SEQ_STATE_OPENREQ:
2589                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2590                 break;
2591         case TCP_SEQ_STATE_TIME_WAIT:
2592                 get_timewait4_sock(v, tmpbuf, st->num);
2593                 break;
2594         }
2595         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2596 out:
2597         return 0;
2598 }
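
/*
 * Example of the resulting /proc/net/tcp layout (the data line is
 * illustrative only).  Addresses and ports are printed in hex; the address
 * is the raw stored 32-bit value, so 127.0.0.1 appears as 0100007F on a
 * little-endian machine, and state 0A is TCP_LISTEN:
 *
 *   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 1234 1 c0de0000 300 0 0 2 -1
 */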
2599
2600 static struct file_operations tcp4_seq_fops;
2601 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2602         .owner          = THIS_MODULE,
2603         .name           = "tcp",
2604         .family         = AF_INET,
2605         .seq_show       = tcp4_seq_show,
2606         .seq_fops       = &tcp4_seq_fops,
2607 };
2608
2609 int __init tcp4_proc_init(void)
2610 {
2611         return tcp_proc_register(&tcp4_seq_afinfo);
2612 }
2613
2614 void tcp4_proc_exit(void)
2615 {
2616         tcp_proc_unregister(&tcp4_seq_afinfo);
2617 }
2618 #endif /* CONFIG_PROC_FS */
2619
2620 struct proto tcp_prot = {
2621         .name           =       "TCP",
2622         .close          =       tcp_close,
2623         .connect        =       tcp_v4_connect,
2624         .disconnect     =       tcp_disconnect,
2625         .accept         =       tcp_accept,
2626         .ioctl          =       tcp_ioctl,
2627         .init           =       tcp_v4_init_sock,
2628         .destroy        =       tcp_v4_destroy_sock,
2629         .shutdown       =       tcp_shutdown,
2630         .setsockopt     =       tcp_setsockopt,
2631         .getsockopt     =       tcp_getsockopt,
2632         .sendmsg        =       tcp_sendmsg,
2633         .recvmsg        =       tcp_recvmsg,
2634         .backlog_rcv    =       tcp_v4_do_rcv,
2635         .hash           =       tcp_v4_hash,
2636         .unhash         =       tcp_unhash,
2637         .get_port       =       tcp_v4_get_port,
2638 };
2639
2640
2641
2642 void __init tcp_v4_init(struct net_proto_family *ops)
2643 {
2644         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2645         if (err < 0)
2646                 panic("Failed to create the TCP control socket.\n");
2647         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2648         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2649
2650         /* Unhash it so that IP input processing does not even
2651          * see it; we do not want this socket to receive incoming
2652          * packets.
2653          */
2654         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2655 }
2656
2657 EXPORT_SYMBOL(ipv4_specific);
2658 EXPORT_SYMBOL(tcp_bind_hash);
2659 EXPORT_SYMBOL(tcp_bucket_create);
2660 EXPORT_SYMBOL(tcp_hashinfo);
2661 EXPORT_SYMBOL(tcp_inherit_port);
2662 EXPORT_SYMBOL(tcp_listen_wlock);
2663 EXPORT_SYMBOL(tcp_port_rover);
2664 EXPORT_SYMBOL(tcp_prot);
2665 EXPORT_SYMBOL(tcp_put_port);
2666 EXPORT_SYMBOL(tcp_unhash);
2667 EXPORT_SYMBOL(tcp_v4_conn_request);
2668 EXPORT_SYMBOL(tcp_v4_connect);
2669 EXPORT_SYMBOL(tcp_v4_do_rcv);
2670 EXPORT_SYMBOL(tcp_v4_lookup_listener);
2671 EXPORT_SYMBOL(tcp_v4_rebuild_header);
2672 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2673 EXPORT_SYMBOL(tcp_v4_send_check);
2674 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2675
2676 #ifdef CONFIG_PROC_FS
2677 EXPORT_SYMBOL(tcp_proc_register);
2678 EXPORT_SYMBOL(tcp_proc_unregister);
2679 #endif
2680 #ifdef CONFIG_SYSCTL
2681 EXPORT_SYMBOL(sysctl_local_port_range);
2682 EXPORT_SYMBOL(sysctl_max_syn_backlog);
2683 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2684 #endif