/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		See also:
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Changes:
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half of the table is devoted to TIME_WAIT
 *				sockets and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *				ACK bit.
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				open_request handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a
 *					year in a coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option,
 *	Alexey Kuznetsov		which allows both IPv4 and IPv6
 *					sockets to bind to a single port at
 *					the same time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);

struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	.__tcp_lhash_lock	= RW_LOCK_UNLOCKED,
	.__tcp_lhash_users	= ATOMIC_INIT(0),
	.__tcp_lhash_wait
	  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	.__tcp_portalloc_lock	= SPIN_LOCK_UNLOCKED
};

/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000.
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = 1024 - 1;

static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
{
	int h = (laddr ^ lport) ^ (faddr ^ fport);
	h ^= h >> 16;
	h ^= h >> 8;
	return h & (tcp_ehash_size - 1);
}

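/*
 * Illustrative sketch (not part of the original file, kept out of the build
 * with #if 0): the established-hash function above XOR-folds the connection
 * 4-tuple and masks with tcp_ehash_size - 1, which only spreads entries
 * evenly because the table size is a power of two.  A standalone userland
 * version of the same mixing; TABLE_SIZE is an assumed stand-in for
 * tcp_ehash_size.
 */
#if 0	/* example only */
enum { TABLE_SIZE = 1 << 16 };		/* must be a power of two */

static unsigned int demo_ehashfn(unsigned int laddr, unsigned short lport,
				 unsigned int faddr, unsigned short fport)
{
	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);

	h ^= h >> 16;			/* fold high bits down... */
	h ^= h >> 8;			/* ...so the mask sees all of them */
	return h & (TABLE_SIZE - 1);	/* cheap modulo for power-of-two sizes */
}
#endif
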
static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	__u32 laddr = inet->rcv_saddr;
	__u16 lport = inet->num;
	__u32 faddr = inet->daddr;
	__u16 fport = inet->dport;

	return tcp_hashfn(laddr, lport, faddr, fport);
}

/* Allocate and initialize a new TCP local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
					  unsigned short snum)
{
	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
						      SLAB_ATOMIC);
	if (tb) {
		tb->port = snum;
		tb->fastreuse = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/* Caller must hold hashbucket lock for this tb with local BH disabled */
void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(tcp_bucket_cachep, tb);
	}
}

/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct tcp_bind_hashbucket *head =
				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	sk_add_bind_node(child, &tb->owners);
	tcp_sk(child)->bind_hash = tb;
	spin_unlock(&head->lock);
}

inline void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}

void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
		   unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tcp_sk(sk)->bind_hash = tb;
}

static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
{
	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !tcp_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
				    sk2_rcv_saddr == sk_rcv_saddr)
					break;
			}
		}
	}
	return node != NULL;
}

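/*
 * Illustrative sketch (not part of the original file): the conflict rules
 * above reduce to a small decision table.  Two sockets on the same port and
 * device only conflict when SO_REUSEADDR is missing on either side (or the
 * existing owner is listening) and their bound addresses overlap, where a
 * wildcard address overlaps everything.  A minimal userland model;
 * demo_sock is an assumed simplification of struct sock:
 */
#if 0	/* example only */
struct demo_sock {
	unsigned int	rcv_saddr;	/* 0 means INADDR_ANY */
	int		reuse;		/* SO_REUSEADDR set?  */
	int		listening;
};

static int demo_bind_conflict(const struct demo_sock *a,
			      const struct demo_sock *b)
{
	if (a->reuse && b->reuse && !b->listening)
		return 0;		/* both opted in to address reuse */
	if (a->rcv_saddr && b->rcv_saddr && a->rcv_saddr != b->rcv_saddr)
		return 0;		/* distinct, non-wildcard addresses */
	return 1;			/* overlapping bind -> conflict */
}
#endif
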
/* Obtain a reference to a local port for the given sock;
 * if snum is zero it means we select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_hashbucket *head;
	struct hlist_node *node;
	struct tcp_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
		do {
			rover++;
			if (rover < low || rover > high)
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			tb_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search? */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
		tb = NULL;
	} else {
		head = &tcp_bhash[tcp_bhashfn(snum)];
		spin_lock(&head->lock);
		tb_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (tcp_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!tcp_sk(sk)->bind_hash)
		tcp_bind_hash(sk, tb, snum);
	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}

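/*
 * Illustrative sketch (not part of the original file): autobind walks the
 * port range with a global rover so successive sockets do not all probe from
 * the same starting point.  A lock-free userland caricature of the search
 * loop above; port_in_use() is an assumed stand-in for the bind-bucket
 * lookup:
 */
#if 0	/* example only */
static int demo_port_rover = 1023;

static int demo_pick_port(int low, int high, int (*port_in_use)(int))
{
	int remaining = (high - low) + 1;
	int rover = demo_port_rover;

	do {
		rover++;
		if (rover < low || rover > high)
			rover = low;		/* wrap around the range */
		if (!port_in_use(rover)) {
			demo_port_rover = rover;
			return rover;		/* free port found */
		}
	} while (--remaining > 0);
	return -1;				/* range exhausted */
}
#endif
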
/* Get rid of any references to a local port held by the
 * given sock.
 */
static void __tcp_put_port(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	__sk_del_bind_node(sk);
	tcp_sk(sk)->bind_hash = NULL;
	inet->num = 0;
	tcp_bucket_destroy(tb);
	spin_unlock(&head->lock);
}

void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look, when several writers sleep and a reader wakes them up, all but one
 * immediately hit the write lock and grab all the CPUs.  Exclusive sleep
 * solves this, _but_ remember, it adds useless work on UP machines (a wake
 * up on each exclusive lock release).  It should really be ifdefed.
 */
void tcp_listen_wlock(void)
{
	write_lock(&tcp_lhash_lock);

	if (atomic_read(&tcp_lhash_users)) {
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&tcp_lhash_wait,
						&wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&tcp_lhash_users))
				break;
			write_unlock_bh(&tcp_lhash_lock);
			schedule();
			write_lock_bh(&tcp_lhash_lock);
		}

		finish_wait(&tcp_lhash_wait, &wait);
	}
}

static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
	struct hlist_head *list;
	rwlock_t *lock;

	BUG_TRAP(sk_unhashed(sk));
	if (listen_possible && sk->sk_state == TCP_LISTEN) {
		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
		lock = &tcp_lhash_lock;
		tcp_listen_wlock();
	} else {
		list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->sk_hashent].lock;
		write_lock(lock);
	}
	__sk_add_node(sk, list);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(lock);
	if (listen_possible && sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

static void tcp_v4_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__tcp_v4_hash(sk, 1);
		local_bh_enable();
	}
}

void tcp_unhash(struct sock *sk)
{
	rwlock_t *lock;

	if (sk_unhashed(sk))
		goto ende;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		tcp_listen_wlock();
		lock = &tcp_lhash_lock;
	} else {
		struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
	}

	if (__sk_del_node_init(sk))
		sock_prot_dec_use(sk->sk_prot);
	write_unlock_bh(lock);

ende:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

/* Don't inline this cruft.  There are some nice properties to
 * exploit here.  The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection.  So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
					     unsigned short hnum, int dif)
{
	struct sock *result = NULL, *sk;
	struct hlist_node *node;
	int score, hiscore;

	hiscore = -1;
	sk_for_each(sk, node, head) {
		struct inet_opt *inet = inet_sk(sk);

		if (inet->num == hnum && !ipv6_only_sock(sk)) {
			__u32 rcv_saddr = inet->rcv_saddr;

			score = (sk->sk_family == PF_INET ? 1 : 0);
			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score += 2;
			}
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result	= sk;
			}
		}
	}
	return result;
}

/* Optimize the common listener case. */
inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
					   int dif)
{
	struct sock *sk = NULL;
	struct hlist_head *head;

	read_lock(&tcp_lhash_lock);
	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		struct inet_opt *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    !sk->sk_bound_dev_if)
			goto sherry_cache;
		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&tcp_lhash_lock);
	return sk;
}

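/*
 * Illustrative sketch (not part of the original file): the slow-path lookup
 * above scores each candidate listener, preferring an exact local-address
 * and bound-device match over wildcards.  A condensed userland version of
 * that scoring; demo_listener is an assumed simplification:
 */
#if 0	/* example only */
struct demo_listener {
	unsigned int	rcv_saddr;	/* 0 = wildcard address */
	int		bound_dev;	/* 0 = any device       */
};

/* Returns -1 if the listener cannot match, else a specificity score. */
static int demo_listener_score(const struct demo_listener *l,
			       unsigned int daddr, int dif)
{
	int score = 1;

	if (l->rcv_saddr) {
		if (l->rcv_saddr != daddr)
			return -1;
		score += 2;		/* exact address beats wildcard */
	}
	if (l->bound_dev) {
		if (l->bound_dev != dif)
			return -1;
		score += 2;		/* exact device beats wildcard */
	}
	return score;			/* 5 is a perfect match */
}
#endif
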
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */
static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
						       u32 daddr, u16 hnum,
						       int dif)
{
	struct tcp_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	int hash = tcp_hashfn(daddr, hnum, saddr, sport);
	head = &tcp_ehash[hash];
	read_lock(&head->lock);
	sk_for_each(sk, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(&head->lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}

static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
{
	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
						      daddr, hnum, dif);

	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
}

inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				  u16 dport, int dif)
{
	struct sock *sk;

	local_bh_disable();
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
	local_bh_enable();

	return sk;
}

static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}

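/*
 * Illustrative sketch (not part of the original file): the initial sequence
 * number mixes the connection 4-tuple with secret material so ISNs are
 * unpredictable across connections yet still advance with time for a reused
 * 4-tuple (RFC 1948 style).  A toy userland model; demo_keyed_hash() is an
 * assumed stand-in for the real cryptographic hash, not the kernel's code:
 */
#if 0	/* example only */
static unsigned int demo_keyed_hash(unsigned int a, unsigned int b,
				    unsigned int c, unsigned int secret)
{
	unsigned int h = a ^ secret;

	h = h * 2654435761u ^ b;	/* multiplicative mixing, toy only */
	h = h * 2654435761u ^ c;
	return h;
}

static unsigned int demo_init_seq(unsigned int saddr, unsigned int daddr,
				  unsigned short sport, unsigned short dport,
				  unsigned int secret, unsigned int usecs)
{
	unsigned int ports = ((unsigned int)sport << 16) | dport;

	/* per-4-tuple random offset + slowly advancing clock component */
	return demo_keyed_hash(saddr, daddr, ports, secret) + usecs;
}
#endif
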
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
{
	struct inet_opt *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
	struct sock *sk2;
	struct hlist_node *node;
	struct tcp_tw_bucket *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
		tw = (struct tcp_tw_bucket *)sk2;

		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_opt *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity.  Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap, i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one,
			   only the timestamp cache is held not per host,
			   but per port pair, and the TW bucket is used
			   as the state holder.

			   If the TW bucket has already been destroyed we
			   fall back to VJ's scheme and use the initial
			   timestamp retrieved from the peer table.
			 */
			if (tw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tw->tw_ts_recent_stamp > 1))) {
				if ((tp->write_seq =
						tw->tw_snd_nxt + 65535 + 2) == 0)
					tp->write_seq = 1;
				tp->ts_recent	    = tw->tw_ts_recent;
				tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now.  Otherwise we will see
	 * in hash table a socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(TimeWaitRecycled);
	} else if (tw) {
		/* Silly.  Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(TimeWaitRecycled);
		tcp_tw_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}

/*
 * Bind a port for a connect operation and hash it.
 */
static int tcp_v4_hash_connect(struct sock *sk)
{
	unsigned short snum = inet_sk(sk)->num;
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
	int ret;

	if (!snum) {
		int rover;
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw = NULL;

		local_bh_disable();

		/* TODO. Actually it is not such a bad idea to remove
		 * tcp_portalloc_lock before the next submission to Linus.
		 * As soon as we touch this place at all it is time to think.
		 *
		 * Now it protects a single _advisory_ variable,
		 * tcp_port_rover, hence it is mostly useless.
		 * Code will work nicely if we just delete it, but
		 * I am afraid in the contended case it will work no better,
		 * possibly even worse: another cpu will just hit the same
		 * bucket and spin there.
		 * So some cpu salt could remove both the contention and
		 * the memory pingpong.  Any ideas how to do this in a nice way?
		 */
		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;

		do {
			rover++;
			if ((rover < low) || (rover > high))
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			tb_for_each(tb, node, &head->chain) {
				if (tb->port == rover) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									rover,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = tcp_bucket_create(head, rover);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		/* All locks still held and bhs disabled */
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		tcp_bind_hash(sk, tb, rover);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(rover);
			__tcp_v4_hash(sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			tcp_tw_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_bhash[tcp_bhashfn(snum)];
	tb = tcp_sk(sk)->bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__tcp_v4_hash(sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to the established hash table */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->ts_recent	    = 0;
		tp->ts_recent_stamp = 0;
		tp->write_seq	    = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea.  We save the last timestamp seen from
		 * the destination in the peer table, when entering state
		 * TIME-WAIT, and initialize ts_recent from it when trying
		 * a new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->ts_recent_stamp = peer->tcp_ts_stamp;
			tp->ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tp->ext2_header_len = rt->u.dst.header_len;

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}

static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}

static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
{
	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
}

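/*
 * Illustrative sketch (not part of the original file): the SYN queue hashes
 * (remote address, remote port) together with a per-listener random value so
 * an attacker cannot aim all embryonic connections at one chain.  A toy
 * userland mixer of the same shape; demo_mix2() is an assumed stand-in for
 * jhash_2words(), not its real implementation:
 */
#if 0	/* example only */
enum { DEMO_SYNQ_HSIZE = 512 };		/* power of two, like TCP_SYNQ_HSIZE */

static unsigned int demo_mix2(unsigned int a, unsigned int b,
			      unsigned int initval)
{
	unsigned int h = a ^ 0x9e3779b9u;	/* golden-ratio constant */

	h = (h ^ b) * 0x85ebca6bu;
	h ^= h >> 13;
	h = (h ^ initval) * 0xc2b2ae35u;
	h ^= h >> 16;
	return h;
}

static unsigned int demo_synq_hash(unsigned int raddr, unsigned short rport,
				   unsigned int rnd)
{
	return demo_mix2(raddr, rport, rnd) & (DEMO_SYNQ_HSIZE - 1);
}
#endif
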
static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
					      struct open_request ***prevp,
					      __u16 rport,
					      __u32 raddr, __u32 laddr)
{
	struct tcp_listen_opt *lopt = tp->listen_opt;
	struct open_request *req, **prev;

	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		if (req->rmt_port == rport &&
		    req->af.v4_req.rmt_addr == raddr &&
		    req->af.v4_req.loc_addr == laddr &&
		    TCP_INET_FAMILY(req->class->family)) {
			*prevp = prev;
			break;
		}
	}

	return req;
}

static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);

	req->expires = jiffies + TCP_TIMEOUT_INIT;
	req->retrans = 0;
	req->sk = NULL;
	req->dl_next = lopt->syn_table[h];

	write_lock(&tp->syn_wait_lock);
	lopt->syn_table[h] = req;
	write_unlock(&tp->syn_wait_lock);

#ifdef CONFIG_ACCEPT_QUEUES
	tcp_synq_added(sk, req);
#else
	tcp_synq_added(sk);
#endif
}

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go
	 * through unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route.  We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember the soft error
	 * for the case that this connection will not be able to recover.
	 */
	if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_pmtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

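/*
 * Illustrative sketch (not part of the original file): when a smaller path
 * MTU is learned, the usable MSS shrinks by the fixed IPv4 and TCP header
 * sizes (options ignored here for simplicity).  A minimal userland helper
 * showing that arithmetic:
 */
#if 0	/* example only */
enum {
	DEMO_IP_HDR  = 20,	/* IPv4 header without options */
	DEMO_TCP_HDR = 20,	/* TCP header without options  */
};

static int demo_mss_for_mtu(int mtu)
{
	int mss = mtu - DEMO_IP_HDR - DEMO_TCP_HDR;

	return mss > 0 ? mss : 0;	/* e.g. mtu 1500 -> mss 1460 */
}
#endif
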
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic".  When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_opt *tp;
	struct inet_opt *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(IcmpInErrors);
		return;
	}

	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
			   th->source, tcp_v4_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(IcmpInErrors);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		tcp_tw_put((struct tcp_tw_bucket *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LockDroppedIcmps);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(OutOfWindowIcmps);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* This is deprecated, but if someone generated it,
		 * we have no reasons to ignore it.
		 */
		if (!sock_owned_by_user(sk))
			tcp_enter_cwr(tp);
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct open_request *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = tcp_v4_search_req(tp, &prev, th->dest,
					iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		if (seq != req->snt_isn) {
			NET_INC_STATS_BH(OutOfWindowIcmps);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		tcp_synq_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, for example, only if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TcpAttemptFails);
			sk->sk_err = err;
			sk->sk_error_report(sk);
			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	struct inet_opt *inet = inet_sk(sk);

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset.
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP.  So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation.  We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst    = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len  = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TcpOutSegs);
	TCP_INC_STATS_BH(TcpOutRsts);
}

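/*
 * Illustrative sketch (not part of the original file): RFC 793 gives two
 * cases for the fields of a RST.  If the offending segment carried an ACK,
 * the RST's SEQ is taken from that ACK; otherwise SEQ is zero and the RST
 * acknowledges everything the segment occupied (payload, plus one each for
 * SYN and FIN).  A userland restatement of the arithmetic used above:
 */
#if 0	/* example only */
struct demo_rst {
	unsigned int	seq;
	unsigned int	ack_seq;
	int		ack;	/* is the ACK field valid? */
};

static struct demo_rst demo_rst_fields(unsigned int seg_seq,
				       unsigned int seg_ack, int seg_has_ack,
				       unsigned int payload_len,
				       int syn, int fin)
{
	struct demo_rst r = { 0, 0, 0 };

	if (seg_has_ack) {
		r.seq = seg_ack;		/* RST.SEQ = SEG.ACK */
	} else {
		r.ack = 1;			/* RST.ACK covers SYN/FIN too */
		r.ack_seq = seg_seq + syn + fin + payload_len;
	}
	return r;
}
#endif
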
/* The code below, sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly.  What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TcpOutSegs);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;

	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);

	tcp_tw_put(tw);
}

static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
{
	tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}

static struct dst_entry* tcp_v4_route_req(struct sock *sk,
					  struct open_request *req)
{
	struct rtable *rt;
	struct ip_options *opt = req->af.v4_req.opt;
	struct flowi fl = { .oif = sk->sk_bound_dev_if,
			    .nl_u = { .ip4_u =
				      { .daddr = ((opt && opt->srr) ?
						  opt->faddr :
						  req->af.v4_req.rmt_addr),
					.saddr = req->af.v4_req.loc_addr,
					.tos = RT_CONN_FLAGS(sk) } },
			    .proto = IPPROTO_TCP,
			    .uli_u = { .ports =
				       { .sport = inet_sk(sk)->sport,
					 .dport = req->rmt_port } } };

	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
		IP_INC_STATS_BH(IpOutNoRoutes);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(IpOutNoRoutes);
		return NULL;
	}
	return &rt->u.dst;
}

/*
 *	Send a SYN-ACK after having received an ACK.
 *	This still operates on an open_request only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
			      struct dst_entry *dst)
{
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 req->af.v4_req.loc_addr,
					 req->af.v4_req.rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
					    req->af.v4_req.rmt_addr,
					    req->af.v4_req.opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}

/*
 *	IPv4 open_request destructor.
 */
static void tcp_v4_or_free(struct open_request *req)
{
	if (req->af.v4_req.opt)
		kfree(req->af.v4_req.opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}

/*
 * Save and compile IPv4 options into the open_request if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

/*
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80 bytes on a 32bit machine.
 * It would be better to replace it with a global counter for all sockets,
 * but then some measure against one socket starving all other sockets
 * would be needed.
 *
 * It was 128 by default.  Experiments with real servers show that
 * it is absolutely not enough even at 100conn/sec. 256 cures most
 * of the problems.  This value is adjusted to 128 for very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Increasing it further requires changing the hash table size.
 */
int sysctl_max_syn_backlog = 256;

struct or_calltable or_ipv4 = {
	.family		=	PF_INET,
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_or_send_ack,
	.destructor	=	tcp_v4_or_free,
	.send_reset	=	tcp_v4_send_reset,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt tp;
	struct open_request *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_ACCEPT_QUEUES
	int class = 0;
#endif
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

#ifdef CONFIG_ACCEPT_QUEUES
	class = (skb->nfmark <= 0) ? 0 :
		((skb->nfmark >= NUM_ACCEPT_QUEUES) ? 0 : skb->nfmark);
	/*
	 * Accept only if the class has shares set, or if the default class,
	 * i.e. class 0, has shares.
	 */
	if (!(tcp_sk(sk)->acceptq[class].aq_valid)) {
		if (tcp_sk(sk)->acceptq[0].aq_valid)
			class = 0;
		else
			goto drop;
	}
#endif

	/* Accept backlog is full.  If we have already queued enough
	 * warm entries in the syn queue, drop the request.  It is better
	 * than clogging the syn queue with openreqs with exponentially
	 * increasing timeout.
	 */
#ifdef CONFIG_ACCEPT_QUEUES
	if (tcp_acceptq_is_full(sk, class) && tcp_synq_young(sk, class) > 1)
#else
	if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
#endif
		goto drop;

	req = tcp_openreq_alloc();
	if (!req)
		goto drop;

	tcp_clear_options(&tp);
	tp.mss_clamp = 536;
	tp.user_mss  = tcp_sk(sk)->user_mss;

	tcp_parse_options(skb, &tp, 0);

	if (want_cookie) {
		tcp_clear_options(&tp);
		tp.saw_tstamp = 0;
	}

	if (tp.saw_tstamp && !tp.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on a web server,
		 * which contains information interesting only for windows'
		 * users) do not send their stamp in SYN.  It is an easy case.
		 * We simply do not advertise TS support.
		 */
		tp.saw_tstamp = 0;
		tp.tstamp_ok  = 0;
	}
	tp.tstamp_ok = tp.saw_tstamp;

	tcp_openreq_init(req, &tp, skb);
#ifdef CONFIG_ACCEPT_QUEUES
	req->acceptq_class = class;
	req->acceptq_time_stamp = jiffies;
#endif
	req->af.v4_req.loc_addr = daddr;
	req->af.v4_req.rmt_addr = saddr;
	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
	req->class = &or_ipv4;
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea.  We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tp.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(PAWSPassiveRejected);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations
			 * proven to be alive.
			 * It means that we continue to communicate
			 * with destinations already remembered
			 * at the moment of synflood.
			 */
			NETDEBUG(if (net_ratelimit()) \
				 printk(KERN_DEBUG "TCP: drop open "
						   "request from %u.%u."
						   "%u.%u/%u\n", \
					NIPQUAD(saddr),
					ntohs(skb->h.th->source)));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	req->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		tcp_openreq_free(req);
	} else {
		tcp_v4_synq_add(sk, req);
	}
	return 0;

drop_and_free:
	tcp_openreq_free(req);
drop:
	TCP_INC_STATS_BH(TcpAttemptFails);
	return 0;
}

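/*
 * Illustrative sketch (not part of the original file): a SYN cookie encodes
 * enough state in the server's ISN that the open_request can be dropped and
 * later reconstructed from the returning ACK.  This is a deliberately toy
 * model, not the kernel's cookie_v4_init_sequence(); the real scheme uses
 * keyed cryptographic hashes and a slowly moving counter.
 */
#if 0	/* example only */
static unsigned int demo_toy_hash(unsigned int a, unsigned int b,
				  unsigned int secret)
{
	unsigned int h = (a ^ secret) * 2654435761u;

	return (h ^ b) * 2654435761u;
}

/* Pack a hash of the 4-tuple plus a 3-bit index into an MSS table. */
static unsigned int demo_make_cookie(unsigned int saddr, unsigned int daddr,
				     unsigned int ports, unsigned int secret,
				     unsigned int mss_index)
{
	return (demo_toy_hash(saddr ^ ports, daddr, secret) & ~7u) |
	       (mss_index & 7u);
}

/* Validate: recompute and compare everything except the MSS bits. */
static int demo_check_cookie(unsigned int cookie, unsigned int saddr,
			     unsigned int daddr, unsigned int ports,
			     unsigned int secret, unsigned int *mss_index)
{
	if ((cookie & ~7u) !=
	    (demo_toy_hash(saddr ^ ports, daddr, secret) & ~7u))
		return 0;		/* forged or stale cookie */
	*mss_index = cookie & 7u;	/* recovered MSS table index */
	return 1;
}
#endif
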
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct open_request *req,
				  struct dst_entry *dst)
{
	struct inet_opt *newinet;
	struct tcp_opt *newtp;
	struct sock *newsk;

#ifdef CONFIG_ACCEPT_QUEUES
	if (tcp_acceptq_is_full(sk, req->acceptq_class))
#else
	if (tcp_acceptq_is_full(sk))
#endif
		goto exit_overflow;

	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_dst_cache = dst;
	tcp_v4_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	newinet->daddr	      = req->af.v4_req.rmt_addr;
	newinet->rcv_saddr    = req->af.v4_req.loc_addr;
	newinet->saddr	      = req->af.v4_req.loc_addr;
	newinet->opt	      = req->af.v4_req.opt;
	req->af.v4_req.opt    = NULL;
	newinet->mc_index     = tcp_v4_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newtp->ext2_header_len = dst->header_len;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_pmtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__tcp_v4_hash(newsk, 0);
	__tcp_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(ListenOverflows);
exit:
	NET_INC_STATS_BH(ListenDrops);
	dst_release(dst);
	return NULL;
}

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct tcp_opt *tp = tcp_sk(sk);
	struct sock *nsk;
	struct open_request **prev;
	/* Find possible connection requests. */
	struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
						     iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
					  th->source,
					  skb->nh.iph->daddr,
					  ntohs(th->dest),
					  tcp_v4_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		tcp_tw_put((struct tcp_tw_bucket *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		NETDEBUG(if (net_ratelimit())
				printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}

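/*
 * Illustrative sketch (not part of the original file): the TCP checksum is
 * the 16-bit one's-complement sum over the pseudo-header and segment.  A
 * self-contained userland version of the classic fold-and-complement loop;
 * it omits the pseudo-header, which the kernel adds via csum_tcpudp_nofold:
 */
#if 0	/* example only */
static unsigned short demo_inet_csum(const unsigned char *data, int len)
{
	unsigned long sum = 0;

	while (len > 1) {			/* sum 16-bit words */
		sum += (data[0] << 8) | data[1];
		data += 2;
		len -= 2;
	}
	if (len)				/* odd trailing byte */
		sum += data[0] << 8;
	while (sum >> 16)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (unsigned short)~sum;		/* one's complement */
}
#endif
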
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TcpInErrs);
	goto discard;
}

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TcpInSegs);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;
	th = skb->h.th;
	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest),
			     tcp_v4_iif(skb));
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);
	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TcpInErrs);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TcpInErrs);
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
					   skb, th, skb->len)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
							  ntohs(th->dest),
							  tcp_v4_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
			tcp_tw_put((struct tcp_tw_bucket *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

/* With per-bucket locks this operation is not atomic, so this
 * version is no worse.
 */
static void __tcp_v4_rehash(struct sock *sk)
{
	sk->sk_prot->unhash(sk);
	sk->sk_prot->hash(sk);
}

static int tcp_v4_reselect_saddr(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	int err;
	struct rtable *rt;
	__u32 old_saddr = inet->saddr;
	__u32 new_saddr;
	__u32 daddr = inet->daddr;

	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	/* Query new route. */
	err = ip_route_connect(&rt, daddr, 0,
			       RT_TOS(inet->tos) | sk->sk_localroute,
			       sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, inet->dport, sk);
	if (err)
		return err;

	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;

	new_saddr = rt->rt_src;

	if (new_saddr == old_saddr)
		return 0;

	if (sysctl_ip_dynaddr > 1) {
		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
		       NIPQUAD(old_saddr),
		       NIPQUAD(new_saddr));
	}

	inet->saddr = new_saddr;
	inet->rcv_saddr = new_saddr;

	/* XXX The only one ugly spot where we need to
	 * XXX really change the sockets identity after
	 * XXX it has entered the hashes. -DaveM
	 *
	 * Besides that, it does not check for connection
	 * uniqueness. Wait for troubles.
	 */
	__tcp_v4_rehash(sk);
	return 0;
}

int tcp_v4_rebuild_header(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
	u32 daddr;
	int err;

	/* Route is OK, nothing to do. */
	if (rt)
		return 0;

	/* Reroute. */
	daddr = inet->daddr;
	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	{
		struct flowi fl = { .oif = sk->sk_bound_dev_if,
				    .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = inet->saddr,
						.tos = RT_CONN_FLAGS(sk) } },
				    .proto = IPPROTO_TCP,
				    .uli_u = { .ports =
					       { .sport = inet->sport,
						 .dport = inet->dport } } };

		err = ip_route_output_flow(&rt, &fl, sk, 0);
	}
	if (!err) {
		__sk_dst_set(sk, &rt->u.dst);
		tcp_v4_setup_caps(sk, &rt->u.dst);
		tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
		return 0;
	}

	/* Routing failed... */
	sk->sk_route_caps = 0;

	if (!sysctl_ip_dynaddr ||
	    sk->sk_state != TCP_SYN_SENT ||
	    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
	    (err = tcp_v4_reselect_saddr(sk)) != 0)
		sk->sk_err_soft = -err;

	return err;
}

static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
	struct inet_opt *inet = inet_sk(sk);

	sin->sin_family		= AF_INET;
	sin->sin_addr.s_addr	= inet->daddr;
	sin->sin_port		= inet->dport;
}

/* VJ's idea.  Save the last timestamp seen from this destination
 * and hold it at least for the normal timewait interval, to use for
 * duplicate segment detection in subsequent connections before they
 * enter synchronized state.
 */
int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->ts_recent_stamp;
			peer->tcp_ts = tp->ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
{
	struct inet_peer *peer = NULL;

	peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}

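/*
 * Illustrative sketch (not part of the original file): both functions above
 * only overwrite the cached peer timestamp when it is not newer than ours,
 * using the signed-difference idiom that survives 32-bit wraparound.  A
 * self-contained restatement of that update rule:
 */
#if 0	/* example only */
static int demo_ts_newer(unsigned int a, unsigned int b)
{
	/* True if a is strictly newer than b, modulo 2^32 (RFC 1982 style). */
	return (int)(a - b) > 0;
}

static void demo_update_cache(unsigned int *cached_ts, unsigned int seen_ts)
{
	if (!demo_ts_newer(*cached_ts, seen_ts))
		*cached_ts = seen_ts;	/* keep the most recent stamp */
}
#endif
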
struct tcp_func ipv4_specific = {
	.queue_xmit	=	ip_queue_xmit,
	.send_check	=	tcp_v4_send_check,
	.rebuild_header	=	tcp_v4_rebuild_header,
	.conn_request	=	tcp_v4_conn_request,
	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
	.remember_stamp	=	tcp_v4_remember_stamp,
	.net_header_len	=	sizeof(struct iphdr),
	.setsockopt	=	ip_setsockopt,
	.getsockopt	=	ip_getsockopt,
	.addr2sockaddr	=	v4_addr2sockaddr,
	.sockaddr_len	=	sizeof(struct sockaddr_in),
};

/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc() so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	tp->rto  = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = tcp_write_space;
	sk->sk_use_write_queue = 1;

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

static int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	/* Clean up the write buffer. */
	tcp_writequeue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (tcp_sk(sk)->bind_hash)
		tcp_put_port(sk);

	/* If sendmsg cached page exists, toss it. */
	if (inet_sk(sk)->sndmsg_page)
		__free_page(inet_sk(sk)->sndmsg_page);

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
		list_entry(head->first, struct tcp_tw_bucket, tw_node);
}

static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_opt *tp;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_listening_hash[0]);
		goto get_sk;
	}
	++st->num;
	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct open_request *req = cur;

		tp = tcp_sk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->class->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = tp->listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&tp->syn_wait_lock);
	} else
		sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		tp = tcp_sk(sk);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen) {
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&tp->syn_wait_lock);
	}
	if (++st->bucket < TCP_LHTABLE_SIZE) {
		sk = sk_head(&tcp_listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state* st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw;

		read_lock(&tcp_ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
			if (sk->sk_family != st->family)
				continue;
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		tw_for_each(tw, node,
			    &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
			if (tw->tw_family != st->family)
				continue;
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct tcp_tw_bucket *tw;
	struct hlist_node *node;
	struct tcp_iter_state* st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && tw->tw_family != st->family)
			tw = tw_next(tw);
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (++st->bucket < tcp_ehash_size) {
			read_lock(&tcp_ehash[st->bucket].lock);
			sk = sk_head(&tcp_ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state* st = seq->private;

	tcp_listen_lock();
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		tcp_listen_unlock();
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state* st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state* st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			tcp_listen_unlock();
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
			read_unlock_bh(&tp->syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			tcp_listen_unlock();
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}

static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family		= afinfo->family;
	s->seq_ops.start	= tcp_seq_start;
	s->seq_ops.next		= tcp_seq_next;
	s->seq_ops.show		= afinfo->seq_show;
	s->seq_ops.stop		= tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner		= afinfo->owner;
	afinfo->seq_fops->open		= tcp_seq_open;
	afinfo->seq_fops->read		= seq_read;
	afinfo->seq_fops->llseek	= seq_lseek;
	afinfo->seq_fops->release	= seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}

static void get_openreq4(struct sock *sk, struct open_request *req,
			 char *tmpbuf, int i, int uid)
{
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
		i,
		req->af.v4_req.loc_addr,
		ntohs(inet_sk(sk)->sport),
		req->af.v4_req.rmt_addr,
		ntohs(req->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}

static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_opt *tp = tcp_sk(sp);
	struct inet_opt *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tp->timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		tp->retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}

static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}

static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name		=	"TCP",
	.close		=	tcp_close,
	.connect	=	tcp_v4_connect,
	.disconnect	=	tcp_disconnect,
	.accept		=	tcp_accept,
	.ioctl		=	tcp_ioctl,
	.init		=	tcp_v4_init_sock,
	.destroy	=	tcp_v4_destroy_sock,
	.shutdown	=	tcp_shutdown,
	.setsockopt	=	tcp_setsockopt,
	.getsockopt	=	tcp_getsockopt,
	.sendmsg	=	tcp_sendmsg,
	.recvmsg	=	tcp_recvmsg,
	.backlog_rcv	=	tcp_v4_do_rcv,
	.hash		=	tcp_v4_hash,
	.unhash		=	tcp_unhash,
	.get_port	=	tcp_v4_get_port,
};

void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}

EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_lookup_listener);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
#endif