/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);
struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	.__tcp_lhash_lock	= RW_LOCK_UNLOCKED,
	.__tcp_lhash_users	= ATOMIC_INIT(0),
	.__tcp_lhash_wait
		= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	.__tcp_portalloc_lock	= SPIN_LOCK_UNLOCKED
};
/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000.
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = 1024 - 1;
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
{
	int h = (laddr ^ lport) ^ (faddr ^ fport);
	h ^= h >> 16;
	h ^= h >> 8;
	return h & (tcp_ehash_size - 1);
}
static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	__u32 laddr = inet->rcv_saddr;
	__u16 lport = inet->num;
	__u32 faddr = inet->daddr;
	__u16 fport = inet->dport;

	return tcp_hashfn(laddr, lport, faddr, fport);
}
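/*
 * Illustration only, not part of the kernel source: a minimal userspace
 * sketch of the XOR-fold hash above, assuming a power-of-two table size
 * (EHASH_SIZE is a made-up stand-in for tcp_ehash_size).  The two shifts
 * fold the high bits down so that the ports, which only occupy the low
 * 16 bits, still influence every bucket bit.
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>
#include <stdio.h>

#define EHASH_SIZE 512			/* must be a power of two */

static int demo_hashfn(uint32_t laddr, uint16_t lport,
		       uint32_t faddr, uint16_t fport)
{
	uint32_t h = (laddr ^ lport) ^ (faddr ^ fport);

	h ^= h >> 16;			/* fold the top half in */
	h ^= h >> 8;			/* and once more */
	return (int)(h & (EHASH_SIZE - 1));	/* mask instead of modulo */
}

int main(void)
{
	/* 10.0.0.1:80 <-> 10.0.0.2:12345, addresses as host-order words */
	printf("bucket=%d\n",
	       demo_hashfn(0x0a000001, 80, 0x0a000002, 12345));
	return 0;
}
#endif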
/* Allocate and initialize a new TCP local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
					  unsigned short snum)
{
	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
						      SLAB_ATOMIC);
	if (tb) {
		tb->port = snum;
		tb->fastreuse = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/* Caller must hold hashbucket lock for this tb with local BH disabled */
void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(tcp_bucket_cachep, tb);
	}
}
/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct tcp_bind_hashbucket *head =
				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	sk_add_bind_node(child, &tb->owners);
	tcp_sk(child)->bind_hash = tb;
	spin_unlock(&head->lock);
}

inline void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}

void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
		   unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tcp_sk(sk)->bind_hash = tb;
}
/*
 *	Return 1 if addr matches the socket's IP list,
 *	or if the socket is bound to INADDR_ANY.
 */
static inline int tcp_in_list(struct sock *sk, u32 addr)
{
	struct nx_info *nxi = sk->sk_nx_info;

	vxdprintk("tcp_in_list(%p) %p,%p;%lx\n",
		sk, nxi, sk->sk_socket,
		(sk->sk_socket ? sk->sk_socket->flags : 0));

	if (nxi) {
		int n = nxi->nbipv4;
		int i;

		for (i = 0; i < n; i++)
			if (nxi->ipv4[i] == addr)
				return 1;
	}
	else if (!tcp_v4_rcv_saddr(sk) || tcp_v4_rcv_saddr(sk) == addr)
		return 1;
	return 0;
}

/*
 *	Check if the addresses in sk1 conflict with those in sk2.
 */
int tcp_ipv4_addr_conflict(struct sock *sk1, struct sock *sk2)
{
	nxdprintk("inet_bind(%p,%p) %p,%p;%lx %p,%p;%lx\n",
		sk1, sk2,
		sk1->sk_nx_info, sk1->sk_socket,
		(sk1->sk_socket ? sk1->sk_socket->flags : 0),
		sk2->sk_nx_info, sk2->sk_socket,
		(sk2->sk_socket ? sk2->sk_socket->flags : 0));

	if (tcp_v4_rcv_saddr(sk1)) {
		/* Bind to one address only */
		return tcp_in_list(sk2, tcp_v4_rcv_saddr(sk1));
	} else if (sk1->sk_nx_info) {
		/* A restricted bind(any) */
		struct nx_info *nxi = sk1->sk_nx_info;
		int n = nxi->nbipv4;
		int i;

		for (i = 0; i < n; i++)
			if (tcp_in_list(sk2, nxi->ipv4[i]))
				return 1;
	} else	/* A bind(any) does not allow another bind on the same port */
		return 1;
	return 0;
}
static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
{
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !tcp_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				if (tcp_ipv4_addr_conflict(sk, sk2))
					break;
			}
		}
	}
	return node != NULL;
}
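/*
 * Illustration only: the bind-conflict rule above, reduced to a
 * self-contained sketch that assumes two plain IPv4 sockets with no
 * device binding and no network contexts.  Two owners of the same port
 * may coexist when both set SO_REUSEADDR and the existing owner is not
 * listening, or when their bound addresses simply do not overlap.
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>

struct demo_sock {
	uint32_t rcv_saddr;	/* 0 means INADDR_ANY */
	int	 reuse;		/* SO_REUSEADDR set? */
	int	 listening;
};

static int demo_bind_conflict(const struct demo_sock *a,
			      const struct demo_sock *b)
{
	/* A wildcard overlaps everything; otherwise exact match required. */
	int addr_overlap = !a->rcv_saddr || !b->rcv_saddr ||
			   a->rcv_saddr == b->rcv_saddr;

	if (!addr_overlap)
		return 0;
	/* SO_REUSEADDR on both sides rescues the overlap, unless the
	 * existing owner is a listener. */
	if (a->reuse && b->reuse && !b->listening)
		return 0;
	return 1;
}
#endif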
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_hashbucket *head;
	struct hlist_node *node;
	struct tcp_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
		do {
			rover++;
			if (rover < low || rover > high)
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			tb_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search? */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
	} else {
		head = &tcp_bhash[tcp_bhashfn(snum)];
		spin_lock(&head->lock);
		tb_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (tcp_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!tcp_sk(sk)->bind_hash)
		tcp_bind_hash(sk, tb, snum);
	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
/* Get rid of any references to a local port held by the
 * given sock.
 */
static void __tcp_put_port(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	__sk_del_bind_node(sk);
	tcp_sk(sk)->bind_hash = NULL;
	inet->num = 0;
	tcp_bucket_destroy(tb);
	spin_unlock(&head->lock);
}

void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look, when several writers sleep and a reader wakes them up, all but one
 * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up each
 * exclusive lock release). It should be ifdefed really.
 */

void tcp_listen_wlock(void)
{
	write_lock(&tcp_lhash_lock);

	if (atomic_read(&tcp_lhash_users)) {
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&tcp_lhash_wait,
						&wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&tcp_lhash_users))
				break;
			write_unlock_bh(&tcp_lhash_lock);
			schedule();
			write_lock_bh(&tcp_lhash_lock);
		}

		finish_wait(&tcp_lhash_wait, &wait);
	}
}
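/*
 * Illustration only: a minimal userspace analogue of the exclusive-wakeup
 * pattern above, assuming POSIX threads.  pthread_cond_signal() wakes a
 * single waiter, which is exactly what prepare_to_wait_exclusive() buys
 * the kernel: a release does not stampede every sleeping writer at once.
 */
#if 0	/* standalone demo, compile separately with -lpthread */
#include <pthread.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  demo_wait = PTHREAD_COND_INITIALIZER;
static int demo_users;			/* analogue of tcp_lhash_users */

static void demo_writer_enter(void)
{
	pthread_mutex_lock(&demo_lock);
	while (demo_users)		/* wait until readers drain */
		pthread_cond_wait(&demo_wait, &demo_lock);
	/* ... exclusive section ... */
	pthread_mutex_unlock(&demo_lock);
}

static void demo_reader_exit(void)
{
	pthread_mutex_lock(&demo_lock);
	if (--demo_users == 0)
		pthread_cond_signal(&demo_wait);	/* wake ONE writer */
	pthread_mutex_unlock(&demo_lock);
}
#endif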
static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
	struct hlist_head *list;
	rwlock_t *lock;

	BUG_TRAP(sk_unhashed(sk));
	if (listen_possible && sk->sk_state == TCP_LISTEN) {
		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
		lock = &tcp_lhash_lock;
		tcp_listen_wlock();
	} else {
		list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->sk_hashent].lock;
		write_lock(lock);
	}
	__sk_add_node(sk, list);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(lock);
	if (listen_possible && sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

static void tcp_v4_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__tcp_v4_hash(sk, 1);
		local_bh_enable();
	}
}
void tcp_unhash(struct sock *sk)
{
	rwlock_t *lock;

	if (sk_unhashed(sk))
		goto ende;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		tcp_listen_wlock();
		lock = &tcp_lhash_lock;
	} else {
		struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
	}

	if (__sk_del_node_init(sk))
		sock_prot_dec_use(sk->sk_prot);
	write_unlock_bh(lock);

 ende:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}
/*
 *	Check if an address is in the list.
 */
static inline int tcp_addr_in_list(
	u32 rcv_saddr,
	u32 daddr,
	struct nx_info *nx_info)
{
	if (rcv_saddr == daddr)
		return 1;
	else if (rcv_saddr == 0) {
		/* Accept any address or check the list */
		if (!nx_info)
			return 1;
		else {
			int n = nx_info->nbipv4;
			int i;

			for (i = 0; i < n; i++)
				if (nx_info->ipv4[i] == daddr)
					return 1;
		}
	}
	return 0;
}
/* Don't inline this cruft. Here are some nice properties to
 * exploit here. The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection. So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
					     unsigned short hnum, int dif)
{
	struct sock *result = NULL, *sk;
	struct hlist_node *node;
	int score, hiscore;

	hiscore = 0;
	sk_for_each(sk, node, head) {
		struct inet_opt *inet = inet_sk(sk);

		if (inet->num == hnum && !ipv6_only_sock(sk)) {
			__u32 rcv_saddr = inet->rcv_saddr;

			score = (sk->sk_family == PF_INET ? 1 : 0);
			if (tcp_addr_in_list(rcv_saddr, daddr, sk->sk_nx_info))
				score += 2;
			else
				continue;
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore	= score;
				result	= sk;
			}
		}
	}
	return result;
}
/* Optimize the common listener case. */
inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
					   int dif)
{
	struct sock *sk = NULL;
	struct hlist_head *head;

	read_lock(&tcp_lhash_lock);
	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		struct inet_opt *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    tcp_addr_in_list(inet->rcv_saddr, daddr, sk->sk_nx_info) &&
		    !sk->sk_bound_dev_if)
			goto sherry_cache;
		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&tcp_lhash_lock);
	return sk;
}
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */
static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
						       u32 daddr, u16 hnum,
						       int dif)
{
	struct tcp_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	int hash = tcp_hashfn(daddr, hnum, saddr, sport);

	head = &tcp_ehash[hash];
	read_lock(&head->lock);
	sk_for_each(sk, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(&head->lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}
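/*
 * Illustration only: what the TCP_COMBINED_PORTS/TCP_V4_ADDR_COOKIE trick
 * buys the lookup above, sketched for userspace.  Packing both 16-bit
 * ports into one 32-bit word (and, on 64-bit builds, both addresses into
 * one 64-bit word) lets the 4-tuple comparison run as one or two word
 * compares instead of four separate field compares.
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>

static inline uint32_t demo_combined_ports(uint16_t sport, uint16_t dport)
{
	return ((uint32_t)sport << 16) | dport;	/* one word, two ports */
}

static inline int demo_match(uint64_t addr_cookie, uint32_t port_cookie,
			     uint64_t sk_addrs, uint32_t sk_ports)
{
	/* Two compares decide the whole 4-tuple. */
	return sk_addrs == addr_cookie && sk_ports == port_cookie;
}
#endif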
static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
{
	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
						      daddr, hnum, dif);

	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
}

inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				  u16 dport, int dif)
{
	struct sock *sk;

	local_bh_disable();
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
	local_bh_enable();

	return sk;
}

static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
{
	struct inet_opt *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
	struct sock *sk2;
	struct hlist_node *node;
	struct tcp_tw_bucket *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
		tw = (struct tcp_tw_bucket *)sk2;

		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_opt *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one,
			   only timestamp cache is held not per host,
			   but per port pair and TW bucket is used
			   as state holder.

			   If TW bucket has been already destroyed we
			   fall back to VJ's scheme and use initial
			   timestamp retrieved from peer table.
			 */
			if (tw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tw->tw_ts_recent_stamp > 1))) {
				if ((tp->write_seq =
						tw->tw_snd_nxt + 65535 + 2) == 0)
					tp->write_seq = 1;
				tp->ts_recent	    = tw->tw_ts_recent;
				tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(TimeWaitRecycled);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(TimeWaitRecycled);

		tcp_tw_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
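/*
 * Illustration only: why the reuse path above sets the new connection's
 * write_seq to tw_snd_nxt + 65535 + 2.  The old TIME-WAIT peer can hold
 * at most one window (at most 65535 bytes without window scaling) of
 * unacknowledged data, so starting that far past snd_nxt keeps the new
 * sequence space clear of stale segments.  A minimal sketch, assuming
 * ordinary 32-bit wraparound arithmetic:
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>

static uint32_t demo_reuse_isn(uint32_t tw_snd_nxt)
{
	uint32_t isn = tw_snd_nxt + 65535 + 2;	/* skip one max window */

	if (isn == 0)		/* zero means "no saved state" elsewhere, */
		isn = 1;	/* so never hand it out as an ISN */
	return isn;
}
#endif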
/*
 * Bind a port for a connect operation and hash it.
 */
static int tcp_v4_hash_connect(struct sock *sk)
{
	unsigned short snum = inet_sk(sk)->num;
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
	int ret;

	if (!snum) {
		int rover;
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw = NULL;

		local_bh_disable();

		/* TODO. Actually it is not so bad idea to remove
		 * tcp_portalloc_lock before next submission to Linus.
		 * As soon as we touch this place at all it is time to think.
		 *
		 * Now it protects single _advisory_ variable tcp_port_rover,
		 * hence it is mostly useless.
		 * Code will work nicely if we just delete it, but
		 * I am afraid in contented case it will work not better or
		 * even worse: another cpu just will hit the same bucket
		 * and spin there.
		 * So some cpu salt could remove both contention and
		 * memory pingpong. Any ideas how to do this in a nice way?
		 */
		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;

		do {
			rover++;
			if ((rover < low) || (rover > high))
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			tb_for_each(tb, node, &head->chain) {
				if (tb->port == rover) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									rover,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = tcp_bucket_create(head, rover);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		/* All locks still held and bhs disabled */
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		tcp_bind_hash(sk, tb, rover);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(rover);
			__tcp_v4_hash(sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			tcp_tw_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_bhash[tcp_bhashfn(snum)];
	tb = tcp_sk(sk)->bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__tcp_v4_hash(sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->ts_recent	    = 0;
		tp->ts_recent_stamp = 0;
		tp->write_seq	    = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state TIME-WAIT
		 * and initialize ts_recent from it, when trying new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->ts_recent_stamp = peer->tcp_ts_stamp;
			tp->ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tp->ext2_header_len = rt->u.dst.header_len;

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}

static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
{
	return (jhash_2words(raddr, (u32)rport, rnd) & (TCP_SYNQ_HSIZE - 1));
}
static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
					      struct open_request ***prevp,
					      __u16 rport,
					      __u32 raddr, __u32 laddr)
{
	struct tcp_listen_opt *lopt = tp->listen_opt;
	struct open_request *req, **prev;

	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		if (req->rmt_port == rport &&
		    req->af.v4_req.rmt_addr == raddr &&
		    req->af.v4_req.loc_addr == laddr &&
		    TCP_INET_FAMILY(req->class->family)) {
			BUG_TRAP(!req->sk);
			*prevp = prev;
			break;
		}
	}

	return req;
}
static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);

	req->expires = jiffies + TCP_TIMEOUT_INIT;
	req->retrans = 0;
	req->sk = NULL;
	req->dl_next = lopt->syn_table[h];

	write_lock(&tp->syn_wait_lock);
	lopt->syn_table[h] = req;
	write_unlock(&tp->syn_wait_lock);

#ifdef CONFIG_ACCEPT_QUEUES
	tcp_synq_added(sk, req);
#else
	tcp_synq_added(sk);
#endif
}
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_pmtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
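/*
 * Illustration only: the relation tcp_sync_mss() maintains between PMTU
 * and MSS, reduced to its core.  A minimal sketch, assuming plain
 * 20-byte IPv4 and TCP headers and ignoring options and offload caps.
 */
#if 0	/* standalone demo, compile separately */
static int demo_mss_from_pmtu(int pmtu)
{
	const int ip_hdr_len  = 20;	/* no IP options */
	const int tcp_hdr_len = 20;	/* no TCP options */

	/* e.g. a 1500-byte Ethernet PMTU yields the familiar 1460 MSS */
	return pmtu - ip_hdr_len - tcp_hdr_len;
}
#endif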
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_opt *tp;
	struct inet_opt *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(IcmpInErrors);
		return;
	}

	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
			   th->source, tcp_v4_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(IcmpInErrors);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		tcp_tw_put((struct tcp_tw_bucket *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LockDroppedIcmps);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(OutOfWindowIcmps);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* This is deprecated, but if someone generated it,
		 * we have no reasons to ignore it.
		 */
		if (!sock_owned_by_user(sk))
			tcp_enter_cwr(tp);
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct open_request *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = tcp_v4_search_req(tp, &prev, th->dest,
					iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != req->snt_isn) {
			NET_INC_STATS_BH(OutOfWindowIcmps);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		tcp_synq_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can f.e. if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TcpAttemptFails);
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	struct inet_opt *inet = inet_sk(sk);

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *	So that we build reply only basing on parameters
 *	arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst    = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len  = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TcpOutSegs);
	TCP_INC_STATS_BH(TcpOutRsts);
}
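/*
 * Illustration only: the RST sequence-number rule from RFC 793 that the
 * if/else above implements, as a standalone sketch.  If the offending
 * segment carried an ACK, the RST borrows its ack_seq as our seq;
 * otherwise we ACK everything the segment occupied (SYN and FIN each
 * count for one sequence number).
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>

struct demo_rst { uint32_t seq, ack_seq; int ack; };

static struct demo_rst demo_rst_numbers(int had_ack, uint32_t seg_seq,
					uint32_t seg_ack, int syn, int fin,
					uint32_t payload_len)
{
	struct demo_rst r = { 0, 0, 0 };

	if (had_ack) {
		r.seq = seg_ack;	/* <SEQ=SEG.ACK><CTL=RST> */
	} else {
		r.ack = 1;		/* <SEQ=0><ACK=SEG.SEQ+SEG.LEN> */
		r.ack_seq = seg_seq + syn + fin + payload_len;
	}
	return r;
}
#endif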
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TcpOutSegs);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;

	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);

	tcp_tw_put(tw);
}

static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
{
	tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}
static struct dst_entry* tcp_v4_route_req(struct sock *sk,
					  struct open_request *req)
{
	struct rtable *rt;
	struct ip_options *opt = req->af.v4_req.opt;
	struct flowi fl = { .oif = sk->sk_bound_dev_if,
			    .nl_u = { .ip4_u =
				      { .daddr = ((opt && opt->srr) ?
						  opt->faddr :
						  req->af.v4_req.rmt_addr),
					.saddr = req->af.v4_req.loc_addr,
					.tos = RT_CONN_FLAGS(sk) } },
			    .proto = IPPROTO_TCP,
			    .uli_u = { .ports =
				       { .sport = inet_sk(sk)->sport,
					 .dport = req->rmt_port } } };

	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
		IP_INC_STATS_BH(OutNoRoutes);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(OutNoRoutes);
		return NULL;
	}
	return &rt->u.dst;
}
/*
 *	Send a SYN-ACK after having received an ACK.
 *	This still operates on an open_request only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
			      struct dst_entry *dst)
{
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 req->af.v4_req.loc_addr,
					 req->af.v4_req.rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
					    req->af.v4_req.rmt_addr,
					    req->af.v4_req.opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}

/*
 *	IPv4 open_request destructor.
 */
static void tcp_v4_or_free(struct open_request *req)
{
	if (req->af.v4_req.opt)
		kfree(req->af.v4_req.opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}
/*
 * Save and compile IPv4 options into the open_request if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
/*
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
 * It would be better to replace it with a global counter for all sockets,
 * but then some measure against one socket starving all other sockets
 * would be needed.
 *
 * It was 128 by default. Experiments with real servers show that
 * it is absolutely not enough even at 100conn/sec. 256 cures most
 * of the problems. This value is adjusted to 128 for very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Increasing it further requires changing the hash table size.
 */
int sysctl_max_syn_backlog = 256;

struct or_calltable or_ipv4 = {
	.family		=	PF_INET,
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_or_send_ack,
	.destructor	=	tcp_v4_or_free,
	.send_reset	=	tcp_v4_send_reset,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt tp;
	struct open_request *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_ACCEPT_QUEUES
	int class = 0;
#endif
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

#ifdef CONFIG_ACCEPT_QUEUES
	class = (skb->nfmark <= 0) ? 0 :
		((skb->nfmark >= NUM_ACCEPT_QUEUES) ? 0 : skb->nfmark);
	/*
	 * Accept only if the class has shares set, or if the default class
	 * (i.e. class 0) has shares.
	 */
	if (!(tcp_sk(sk)->acceptq[class].aq_valid)) {
		if (tcp_sk(sk)->acceptq[0].aq_valid)
			class = 0;
		else
			goto drop;
	}
#endif

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
#ifdef CONFIG_ACCEPT_QUEUES
	if (tcp_acceptq_is_full(sk, class) && tcp_synq_young(sk, class) > 1)
#else
	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
#endif
		goto drop;

	req = tcp_openreq_alloc();
	if (!req)
		goto drop;

	tcp_clear_options(&tp);
	tp.mss_clamp = 536;
	tp.user_mss = tcp_sk(sk)->user_mss;

	tcp_parse_options(skb, &tp, 0);

	if (want_cookie) {
		tcp_clear_options(&tp);
		tp.saw_tstamp = 0;
	}

	if (tp.saw_tstamp && !tp.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web server, which
		 * contains information interesting only for windows'
		 * users) do not send their stamp in SYN. It is easy case.
		 * We simply do not advertise TS support.
		 */
		tp.saw_tstamp = 0;
		tp.tstamp_ok  = 0;
	}
	tp.tstamp_ok = tp.saw_tstamp;

	tcp_openreq_init(req, &tp, skb);
#ifdef CONFIG_ACCEPT_QUEUES
	req->acceptq_class = class;
	req->acceptq_time_stamp = jiffies;
#endif
	req->af.v4_req.loc_addr = daddr;
	req->af.v4_req.rmt_addr = saddr;
	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
	req->class = &or_ipv4;
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tp.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(PAWSPassiveRejected);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			NETDEBUG(if (net_ratelimit()) \
				 printk(KERN_DEBUG "TCP: drop open "
						   "request from %u.%u."
						   "%u.%u/%u\n", \
					NIPQUAD(saddr),
					ntohs(skb->h.th->source)));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	req->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		tcp_openreq_free(req);
	} else {
		tcp_v4_synq_add(sk, req);
	}
	return 0;

drop_and_free:
	tcp_openreq_free(req);
drop:
	TCP_INC_STATS_BH(TcpAttemptFails);
	return 0;
}
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct open_request *req,
				  struct dst_entry *dst)
{
	struct inet_opt *newinet;
	struct tcp_opt *newtp;
	struct sock *newsk;

#ifdef CONFIG_ACCEPT_QUEUES
	if (tcp_acceptq_is_full(sk, req->acceptq_class))
#else
	if (sk_acceptq_is_full(sk))
#endif
		goto exit_overflow;

	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_dst_cache = dst;
	tcp_v4_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	newinet->daddr	      = req->af.v4_req.rmt_addr;
	newinet->rcv_saddr    = req->af.v4_req.loc_addr;
	newinet->saddr	      = req->af.v4_req.loc_addr;
	newinet->opt	      = req->af.v4_req.opt;
	req->af.v4_req.opt    = NULL;
	newinet->mc_index     = tcp_v4_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newtp->ext2_header_len = dst->header_len;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_pmtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__tcp_v4_hash(newsk, 0);
	__tcp_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(ListenOverflows);
exit:
	NET_INC_STATS_BH(ListenDrops);
	dst_release(dst);
	return NULL;
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct tcp_opt *tp = tcp_sk(sk);
	struct sock *nsk;
	struct open_request **prev;
	/* Find possible connection requests. */
	struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
						     iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
					  th->source,
					  skb->nh.iph->daddr,
					  ntohs(th->dest),
					  tcp_v4_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		tcp_tw_put((struct tcp_tw_bucket *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		NETDEBUG(if (net_ratelimit())
				printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}
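/*
 * Illustration only: the TCP checksum over the IPv4 pseudo header that
 * tcp_v4_check()/csum_tcpudp_nofold() compute above, written out as a
 * portable userspace function.  Assumes the segment bytes are already in
 * network order and the addresses are passed as plain 32-bit values.
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>
#include <stddef.h>

static uint16_t demo_tcp_checksum(uint32_t saddr, uint32_t daddr,
				  const uint8_t *seg, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	/* Pseudo header: src, dst, zero+protocol, TCP length. */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += 6;			/* IPPROTO_TCP */
	sum += len;

	/* TCP header + payload, 16 bits at a time. */
	for (i = 0; i + 1 < len; i += 2)
		sum += (seg[i] << 8) | seg[i + 1];
	if (len & 1)
		sum += seg[len - 1] << 8;	/* pad odd byte with zero */

	while (sum >> 16)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
#endif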
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TcpInErrs);
	goto discard;
}
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TcpInSegs);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest),
			     tcp_v4_iif(skb));

	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TcpInErrs);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TcpInErrs);
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
					   skb, th, skb->len)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
							  ntohs(th->dest),
							  tcp_v4_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
			tcp_tw_put((struct tcp_tw_bucket *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
/* With per-bucket locks this operation is not-atomic, so that
 * this version is not worse.
 */
static void __tcp_v4_rehash(struct sock *sk)
{
	sk->sk_prot->unhash(sk);
	sk->sk_prot->hash(sk);
}

static int tcp_v4_reselect_saddr(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	int err;
	struct rtable *rt;
	__u32 old_saddr = inet->saddr;
	__u32 new_saddr;
	__u32 daddr = inet->daddr;

	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	/* Query new route. */
	err = ip_route_connect(&rt, daddr, 0,
			       RT_TOS(inet->tos) | sk->sk_localroute,
			       sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, inet->dport, sk);
	if (err)
		return err;

	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;

	new_saddr = rt->rt_src;

	if (new_saddr == old_saddr)
		return 0;

	if (sysctl_ip_dynaddr > 1) {
		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
		       NIPQUAD(old_saddr),
		       NIPQUAD(new_saddr));
	}

	inet->saddr = new_saddr;
	inet->rcv_saddr = new_saddr;

	/* XXX The only one ugly spot where we need to
	 * XXX really change the sockets identity after
	 * XXX it has entered the hashes. -DaveM
	 *
	 * Besides that, it does not check for connection
	 * uniqueness. Wait for troubles.
	 */
	__tcp_v4_rehash(sk);
	return 0;
}
int tcp_v4_rebuild_header(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
	u32 daddr;
	int err;

	/* Route is OK, nothing to do. */
	if (rt)
		return 0;

	/* Reroute. */
	daddr = inet->daddr;
	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	{
		struct flowi fl = { .oif = sk->sk_bound_dev_if,
				    .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = inet->saddr,
						.tos = RT_CONN_FLAGS(sk) } },
				    .proto = IPPROTO_TCP,
				    .uli_u = { .ports =
					       { .sport = inet->sport,
						 .dport = inet->dport } } };

		err = ip_route_output_flow(&rt, &fl, sk, 0);
	}
	if (!err) {
		__sk_dst_set(sk, &rt->u.dst);
		tcp_v4_setup_caps(sk, &rt->u.dst);
		tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
		return 0;
	}

	/* Routing failed... */
	sk->sk_route_caps = 0;

	if (!sysctl_ip_dynaddr ||
	    sk->sk_state != TCP_SYN_SENT ||
	    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
	    (err = tcp_v4_reselect_saddr(sk)) != 0)
		sk->sk_err_soft = -err;

	return err;
}
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
	struct inet_opt *inet = inet_sk(sk);

	sin->sin_family		= AF_INET;
	sin->sin_addr.s_addr	= inet->daddr;
	sin->sin_port		= inet->dport;
}
/* VJ's idea. Save last timestamp seen from this destination
 * and hold it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter synchronized
 * state.
 */

int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->ts_recent_stamp;
			peer->tcp_ts = tp->ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}
int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
{
	struct inet_peer *peer = NULL;

	peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}
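/*
 * Illustration only: the signed-difference trick used by both functions
 * above.  TCP timestamps wrap, so "newer" is decided with 32-bit serial
 * arithmetic: cast the difference to a signed type and test its sign.
 */
#if 0	/* standalone demo, compile separately */
#include <stdint.h>

static int demo_ts_newer(uint32_t a, uint32_t b)
{
	/* 1 if a is strictly newer than b, wraparound-safe */
	return (int32_t)(a - b) > 0;
}
/* e.g. demo_ts_newer(0x00000002, 0xfffffffe) == 1 across the wrap */
#endif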
struct tcp_func ipv4_specific = {
	.queue_xmit	=	ip_queue_xmit,
	.send_check	=	tcp_v4_send_check,
	.rebuild_header	=	tcp_v4_rebuild_header,
	.conn_request	=	tcp_v4_conn_request,
	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
	.remember_stamp	=	tcp_v4_remember_stamp,
	.net_header_len	=	sizeof(struct iphdr),
	.setsockopt	=	ip_setsockopt,
	.getsockopt	=	ip_getsockopt,
	.addr2sockaddr	=	v4_addr2sockaddr,
	.sockaddr_len	=	sizeof(struct sockaddr_in),
};
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	tp->rto	 = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sk->sk_use_write_queue = 1;

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}
static int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	/* Clean up the write buffer. */
	tcp_writequeue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (tcp_sk(sk)->bind_hash)
		tcp_put_port(sk);

	/* If sendmsg cached page exists, toss it. */
	if (inet_sk(sk)->sndmsg_page)
		__free_page(inet_sk(sk)->sndmsg_page);

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
		list_entry(head->first, struct tcp_tw_bucket, tw_node);
}

static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_opt *tp;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct open_request *req = cur;

		tp = tcp_sk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if ((!req->sk ||
				     vx_check(req->sk->sk_xid, VX_IDENT|VX_WATCH)) &&
				    req->class->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = tp->listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&tp->syn_wait_lock);
	} else {
		tp = tcp_sk(sk);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen)
			goto start_req;
		read_unlock_bh(&tp->syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		tp = tcp_sk(sk);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&tp->syn_wait_lock);
	}
	if (++st->bucket < TCP_LHTABLE_SIZE) {
		sk = sk_head(&tcp_listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state* st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw;

		read_lock(&tcp_ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
			if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
				continue;
			if (sk->sk_family != st->family)
				continue;
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		tw_for_each(tw, node,
			    &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
			if (!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))
				continue;
			if (tw->tw_family != st->family)
				continue;
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct tcp_tw_bucket *tw;
	struct hlist_node *node;
	struct tcp_iter_state* st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family ||
			      !vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (++st->bucket < tcp_ehash_size) {
			read_lock(&tcp_ehash[st->bucket].lock);
			sk = sk_head(&tcp_ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
			continue;
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state* st = seq->private;

	tcp_listen_lock();
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		tcp_listen_unlock();
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state* st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state* st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			tcp_listen_unlock();
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
			read_unlock_bh(&tp->syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			tcp_listen_unlock();
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family		= afinfo->family;
	s->seq_ops.start	= tcp_seq_start;
	s->seq_ops.next		= tcp_seq_next;
	s->seq_ops.show		= afinfo->seq_show;
	s->seq_ops.stop		= tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner		= afinfo->owner;
	afinfo->seq_fops->open		= tcp_seq_open;
	afinfo->seq_fops->read		= seq_read;
	afinfo->seq_fops->llseek	= seq_lseek;
	afinfo->seq_fops->release	= seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
static void get_openreq4(struct sock *sk, struct open_request *req,
			 char *tmpbuf, int i, int uid)
{
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		req->af.v4_req.loc_addr,
		ntohs(inet_sk(sk)->sport),
		req->af.v4_req.rmt_addr,
		ntohs(req->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}
static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_opt *tp = tcp_sk(sp);
	struct inet_opt *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tp->timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		tp->retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}
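/*
 * Illustration only: the first fields of one /proc/net/tcp line emitted
 * by the show functions above, parsed back from userspace.  Addresses
 * and ports are printed as hex (%08X:%04X), so sscanf with %x recovers
 * them; the sketch assumes a buffer that already holds one line.
 */
#if 0	/* standalone demo, compile separately */
#include <stdio.h>

static void demo_parse_tcp_line(const char *line)
{
	unsigned int slot, local, remote, state;
	unsigned int lport, rport;

	if (sscanf(line, "%u: %x:%x %x:%x %x",
		   &slot, &local, &lport, &remote, &rport, &state) == 6)
		printf("slot %u state %02X local port %u\n",
		       slot, state, lport);
}
#endif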
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name		=	"TCP",
	.close		=	tcp_close,
	.connect	=	tcp_v4_connect,
	.disconnect	=	tcp_disconnect,
	.accept		=	tcp_accept,
	.ioctl		=	tcp_ioctl,
	.init		=	tcp_v4_init_sock,
	.destroy	=	tcp_v4_destroy_sock,
	.shutdown	=	tcp_shutdown,
	.setsockopt	=	tcp_setsockopt,
	.getsockopt	=	tcp_getsockopt,
	.sendmsg	=	tcp_sendmsg,
	.recvmsg	=	tcp_recvmsg,
	.backlog_rcv	=	tcp_v4_do_rcv,
	.hash		=	tcp_v4_hash,
	.unhash		=	tcp_unhash,
	.get_port	=	tcp_v4_get_port,
};
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_lookup_listener);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
#endif