/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *		Juan Jose Ciarlante:	ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year
 *					coma.
 *		Andi Kleen :		Fix new listen.
 *		Andi Kleen :		Fix accept error reporting.
 *		YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option,
 *		Alexey Kuznetsov:		which allows both IPv4 and IPv6
 *						sockets to bind to a single port
 *						at the same time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/vserver/debug.h>
extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);

struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	.__tcp_lhash_lock	= RW_LOCK_UNLOCKED,
	.__tcp_lhash_users	= ATOMIC_INIT(0),
	.__tcp_lhash_wait
		= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	.__tcp_portalloc_lock	= SPIN_LOCK_UNLOCKED
};
/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000.
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = 1024 - 1;
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
{
	int h = (laddr ^ lport) ^ (faddr ^ fport);
	h ^= h >> 16;
	h ^= h >> 8;
	return h & (tcp_ehash_size - 1);
}

static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	__u32 laddr = inet->rcv_saddr;
	__u16 lport = inet->num;
	__u32 faddr = inet->daddr;
	__u16 fport = inet->dport;

	return tcp_hashfn(laddr, lport, faddr, fport);
}
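/*
 * Illustrative sketch (not kernel code): how the XOR fold above maps a
 * connection 4-tuple onto a power-of-two sized bucket array.  A
 * self-contained userspace analogue, assuming a fixed table size; the
 * names demo_ehash()/DEMO_EHASH_SIZE are hypothetical.
 */
#if 0	/* compile standalone to experiment */
#include <stdio.h>
#include <stdint.h>

#define DEMO_EHASH_SIZE 512		/* must be a power of two */

static unsigned int demo_ehash(uint32_t laddr, uint16_t lport,
			       uint32_t faddr, uint16_t fport)
{
	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);

	h ^= h >> 16;			/* fold high bits down ... */
	h ^= h >> 8;			/* ... so the mask sees them */
	return h & (DEMO_EHASH_SIZE - 1);
}

int main(void)
{
	/* 10.0.0.1:12345 -> 10.0.0.2:80 */
	printf("bucket = %u\n",
	       demo_ehash(0x0a000001, 12345, 0x0a000002, 80));
	return 0;
}
#endif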
/* Allocate and initialize a new TCP local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
					  unsigned short snum)
{
	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
						      SLAB_ATOMIC);
	if (tb) {
		tb->port = snum;
		tb->fastreuse = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/* Caller must hold hashbucket lock for this tb with local BH disabled */
void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(tcp_bucket_cachep, tb);
	}
}
/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct tcp_bind_hashbucket *head =
				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	sk_add_bind_node(child, &tb->owners);
	tcp_sk(child)->bind_hash = tb;
	spin_unlock(&head->lock);
}

inline void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}
void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
		   unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tcp_sk(sk)->bind_hash = tb;
}
static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
{
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !tcp_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				if (nx_addr_conflict(sk->sk_nx_info,
						     tcp_v4_rcv_saddr(sk), sk2))
					break;
			}
		}
	}
	return node != NULL;
}
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_hashbucket *head;
	struct hlist_node *node;
	struct tcp_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		if (tcp_port_rover < low)
			rover = low;
		else
			rover = tcp_port_rover;
		do {
			rover++;
			if (rover > high)
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			tb_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search? */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
	} else {
		head = &tcp_bhash[tcp_bhashfn(snum)];
		spin_lock(&head->lock);
		tb_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (tcp_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!tcp_sk(sk)->bind_hash)
		tcp_bind_hash(sk, tb, snum);
	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
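/*
 * Illustrative sketch (not kernel code): the ephemeral-port search above
 * walks a rover through [low, high], probing each candidate's bind-hash
 * chain and stopping at the first free port.  A minimal userspace
 * analogue, assuming a boolean in-use table; demo_pick_port() is
 * hypothetical.
 */
#if 0
#include <stdio.h>
#include <stdbool.h>

static bool port_in_use[65536];
static int port_rover = 1023;		/* mirrors tcp_port_rover = 1024 - 1 */

static int demo_pick_port(int low, int high)
{
	int remaining = (high - low) + 1;
	int rover = (port_rover < low) ? low : port_rover;

	do {
		rover++;
		if (rover > high)
			rover = low;	/* wrap around the range */
		if (!port_in_use[rover]) {
			port_rover = rover;	/* next search resumes here */
			return rover;
		}
	} while (--remaining > 0);
	return -1;			/* local port range exhausted */
}

int main(void)
{
	port_in_use[1024] = port_in_use[1025] = true;
	printf("picked %d\n", demo_pick_port(1024, 4999));	/* prints 1026 */
	return 0;
}
#endif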
/* Get rid of any references to a local port held by the
 * given sock.
 */
static void __tcp_put_port(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	__sk_del_bind_node(sk);
	tcp_sk(sk)->bind_hash = NULL;
	inet->num = 0;
	tcp_bucket_destroy(tb);
	spin_unlock(&head->lock);
}

void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up each
 * exclusive lock release). It should be ifdefed really.
 */
void tcp_listen_wlock(void)
{
	write_lock(&tcp_lhash_lock);

	if (atomic_read(&tcp_lhash_users)) {
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&tcp_lhash_wait,
						  &wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&tcp_lhash_users))
				break;
			write_unlock_bh(&tcp_lhash_lock);
			schedule();
			write_lock_bh(&tcp_lhash_lock);
		}

		finish_wait(&tcp_lhash_wait, &wait);
	}
}
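/*
 * Illustrative sketch (not kernel code): why the writer wait above is
 * exclusive.  In a pthreads analogue, pthread_cond_broadcast() would
 * wake every waiting writer only for all but one to block again on the
 * lock (the thundering herd the comment describes); waking exactly one
 * waiter with pthread_cond_signal() mirrors prepare_to_wait_exclusive().
 */
#if 0
#include <pthread.h>

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int readers;			/* analogue of tcp_lhash_users */

static void writer_wait(void)
{
	pthread_mutex_lock(&m);
	while (readers)			/* sleep until the last reader left */
		pthread_cond_wait(&cv, &m);
	/* ... exclusive section ... */
	pthread_mutex_unlock(&m);
}

static void reader_done(void)
{
	pthread_mutex_lock(&m);
	if (--readers == 0)
		pthread_cond_signal(&cv);	/* wake ONE writer, not all */
	pthread_mutex_unlock(&m);
}
#endif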
static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
	struct hlist_head *list;
	rwlock_t *lock;

	BUG_TRAP(sk_unhashed(sk));
	if (listen_possible && sk->sk_state == TCP_LISTEN) {
		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
		lock = &tcp_lhash_lock;
		tcp_listen_wlock();
	} else {
		list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->sk_hashent].lock;
		write_lock(lock);
	}
	__sk_add_node(sk, list);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(lock);
	if (listen_possible && sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}
static void tcp_v4_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__tcp_v4_hash(sk, 1);
		local_bh_enable();
	}
}

void tcp_unhash(struct sock *sk)
{
	rwlock_t *lock;

	if (sk_unhashed(sk))
		goto ende;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		tcp_listen_wlock();
		lock = &tcp_lhash_lock;
	} else {
		struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
	}

	if (__sk_del_node_init(sk))
		sock_prot_dec_use(sk->sk_prot);
	write_unlock_bh(lock);

ende:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}
/*
 *	Check if a given address matches for a tcp socket
 *
 *	nxi:	the socket's nx_info, if any
 *	addr:	the address to be verified
 *	saddr:	the socket's address
 */
static inline int tcp_addr_match(struct nx_info *nxi, uint32_t addr,
				 uint32_t saddr)
{
	if (addr && (saddr == addr))
		return 1;
	if (!saddr)
		return addr_in_nx_info(nxi, addr);
	return 0;
}
/* Don't inline this cruft.  There are some nice properties to
 * exploit here.  The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection.  So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
					     unsigned short hnum, int dif)
{
	struct sock *result = NULL, *sk;
	struct hlist_node *node;
	int score, hiscore = -1;

	sk_for_each(sk, node, head) {
		struct inet_sock *inet = inet_sk(sk);

		if (inet->num == hnum && !ipv6_only_sock(sk)) {
			__u32 rcv_saddr = inet->rcv_saddr;

			score = (sk->sk_family == PF_INET ? 1 : 0);
			if (tcp_addr_match(sk->sk_nx_info, daddr, rcv_saddr))
				score += 2;
			else
				continue;
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result	= sk;
			}
		}
	}
	return result;
}
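/*
 * Illustrative sketch (not kernel code): the scoring scheme above in
 * miniature.  An exact-address, device-bound PF_INET listener scores
 * highest (1 + 2 + 2 = 5) and ends the walk early; wildcard listeners
 * only win if nothing more specific is found.  demo_score() is a
 * hypothetical condensation without the nx_info context checks.
 */
#if 0
#include <stdint.h>

struct demo_listener {
	int family_is_inet;		/* sk_family == PF_INET */
	uint32_t rcv_saddr;		/* 0 means wildcard */
	int bound_dev_if;		/* 0 means any device */
};

static int demo_score(const struct demo_listener *l, uint32_t daddr, int dif)
{
	int score = l->family_is_inet ? 1 : 0;

	if (l->rcv_saddr) {
		if (l->rcv_saddr != daddr)
			return -1;	/* wrong address: no match at all */
		score += 2;
	}
	if (l->bound_dev_if) {
		if (l->bound_dev_if != dif)
			return -1;	/* wrong device: no match at all */
		score += 2;
	}
	return score;			/* 5 == perfect, stop searching */
}
#endif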
/* Optimize the common listener case. */
static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
						  unsigned short hnum, int dif)
{
	struct sock *sk = NULL;
	struct hlist_head *head;

	read_lock(&tcp_lhash_lock);
	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		struct inet_sock *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    tcp_addr_match(sk->sk_nx_info, daddr, inet->rcv_saddr) &&
		    !sk->sk_bound_dev_if)
			goto sherry_cache;
		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&tcp_lhash_lock);
	return sk;
}
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */
static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
						       u32 daddr, u16 hnum,
						       int dif)
{
	struct tcp_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyway.
	 */
	int hash = tcp_hashfn(daddr, hnum, saddr, sport);
	head = &tcp_ehash[hash];
	read_lock(&head->lock);
	sk_for_each(sk, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(&head->lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}
static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
{
	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
						      daddr, hnum, dif);

	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
}

inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				  u16 dport, int dif)
{
	struct sock *sk;

	local_bh_disable();
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
	local_bh_enable();

	return sk;
}

EXPORT_SYMBOL_GPL(tcp_v4_lookup);
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
{
	struct inet_sock *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
	struct sock *sk2;
	struct hlist_node *node;
	struct tcp_tw_bucket *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
		tw = (struct tcp_tw_bucket *)sk2;

		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_sock *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one,
			   only timestamp cache is held not per host,
			   but per port pair and TW bucket is used
			   as state holder.

			   If TW bucket has been already destroyed we
			   fall back to VJ's scheme and use initial
			   timestamp retrieved from peer table.
			 */
			if (tw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tw->tw_ts_recent_stamp > 1))) {
				if ((tp->write_seq =
						tw->tw_snd_nxt + 65535 + 2) == 0)
					tp->write_seq = 1;
				tp->rx_opt.ts_recent	   = tw->tw_ts_recent;
				tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in the hash table a socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		tcp_tw_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
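/*
 * Illustrative sketch (not kernel code): the TIME_WAIT reuse test above
 * relies on wrap-safe 32-bit sequence arithmetic.  The kernel's after()/
 * before() macros reduce to a signed difference, which is what makes
 * comparisons like "tw_snd_nxt + 65535 + 2" correct across wraparound.
 */
#if 0
#include <stdint.h>
#include <assert.h>

/* 1 if a is later than b in 32-bit sequence space (RFC 1982 style). */
static int seq_after(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

int main(void)
{
	assert(seq_after(10, 5));
	/* 0x00000005 is "after" 0xfffffffb: the space wrapped around. */
	assert(seq_after(0x00000005, 0xfffffffbu));
	return 0;
}
#endif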
static inline u32 connect_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
					 inet->dport);
}
/*
 * Bind a port for a connect operation and hash it.
 */
static inline int tcp_v4_hash_connect(struct sock *sk)
{
	unsigned short snum = inet_sk(sk)->num;
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
	int ret;

	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int range = high - low;
		int i;
		int port;
		static u32 hint;
		u32 offset = hint + connect_port_offset(sk);
		struct hlist_node *node;
		struct tcp_tw_bucket *tw = NULL;

		local_bh_disable();
		for (i = 1; i <= range; i++) {
			port = low + (i + offset) % range;
			head = &tcp_bhash[tcp_bhashfn(port)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			tb_for_each(tb, node, &head->chain) {
				if (tb->port == port) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									port,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = tcp_bucket_create(head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		tcp_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			__tcp_v4_hash(sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			tcp_tw_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_bhash[tcp_bhashfn(snum)];
	tb = tcp_sk(sk)->bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__tcp_v4_hash(sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state TIME-WAIT
		 * and initialize rx_opt.ts_recent from it, when trying new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	return err;
}
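/*
 * Illustrative sketch (not kernel code): what tcp_v4_connect() services
 * from the other side of the syscall boundary.  A minimal userspace
 * client; the kernel path above picks the source port, computes the ISN
 * and transmits the SYN when connect(2) is called.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in sin;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;	/* anything else: -EAFNOSUPPORT above */
	sin.sin_port = htons(80);
	inet_pton(AF_INET, "10.0.0.2", &sin.sin_addr);

	/* Triggers TCP_SYN_SENT, source-port selection and the SYN send. */
	if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
		perror("connect");
	close(fd);
	return 0;
}
#endif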
static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}

static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
{
	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
}
static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
					      struct open_request ***prevp,
					      __u16 rport,
					      __u32 raddr, __u32 laddr)
{
	struct tcp_listen_opt *lopt = tp->listen_opt;
	struct open_request *req, **prev;

	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		if (req->rmt_port == rport &&
		    req->af.v4_req.rmt_addr == raddr &&
		    req->af.v4_req.loc_addr == laddr &&
		    TCP_INET_FAMILY(req->class->family)) {
			BUG_TRAP(!req->sk);
			*prevp = prev;
			break;
		}
	}

	return req;
}
static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);

	req->expires = jiffies + TCP_TIMEOUT_INIT;
	req->retrans = 0;
	req->sk = NULL;
	req->dl_next = lopt->syn_table[h];

	write_lock(&tp->syn_wait_lock);
	lopt->syn_table[h] = req;
	write_unlock(&tp->syn_wait_lock);

	tcp_synq_added(sk);
}
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always <576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
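/*
 * Illustrative sketch (not kernel code): the arithmetic behind
 * tcp_sync_mss().  A new path MTU bounds the MSS at MTU minus the IP
 * and TCP header sizes; option bytes shrink it further.  Fixed 20-byte
 * base headers are assumed here for simplicity.
 */
#if 0
#include <stdio.h>

static int demo_mss_from_mtu(int mtu, int ip_opt_len, int tcp_opt_len)
{
	/* 20-byte IPv4 header + 20-byte TCP header, plus any options. */
	return mtu - (20 + ip_opt_len) - (20 + tcp_opt_len);
}

int main(void)
{
	/* ICMP_FRAG_NEEDED reported a 1400-byte path MTU, timestamps on: */
	printf("mss = %d\n", demo_mss_from_mtu(1400, 0, 12));	/* 1348 */
	return 0;
}
#endif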
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
			   th->source, tcp_v4_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		tcp_tw_put((struct tcp_tw_bucket *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct open_request *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = tcp_v4_search_req(tp, &prev, th->dest,
					iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != req->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		tcp_synq_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, e.g., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}
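/*
 * Illustrative sketch (not kernel code): the ones-complement checksum
 * that tcp_v4_check()/csum_partial() compute, per RFC 1071.  Sum the
 * segment (with the pseudo-header folded in) as 16-bit big-endian words
 * with end-around carry, then invert.
 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint16_t rfc1071_checksum(const uint8_t *data, size_t len,
				 uint32_t pseudo_sum)
{
	uint32_t sum = pseudo_sum;	/* pseudo-header already folded in */
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)(data[i] << 8 | data[i + 1]);
	if (len & 1)			/* odd trailing byte pads with zero */
		sum += (uint32_t)(data[len - 1] << 8);
	while (sum >> 16)		/* end-around carry */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	const uint8_t seg[] = { 0x00, 0x50, 0x30, 0x39 };	/* toy bytes */
	printf("check = 0x%04x\n", rfc1071_checksum(seg, sizeof(seg), 0));
	return 0;
}
#endif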
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on parameters
 *		arriving with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst    = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len  = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;

	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);

	tcp_tw_put(tw);
}

static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
{
	tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}
static struct dst_entry* tcp_v4_route_req(struct sock *sk,
					  struct open_request *req)
{
	struct rtable *rt;
	struct ip_options *opt = req->af.v4_req.opt;
	struct flowi fl = { .oif = sk->sk_bound_dev_if,
			    .nl_u = { .ip4_u =
				      { .daddr = ((opt && opt->srr) ?
						  opt->faddr :
						  req->af.v4_req.rmt_addr),
					.saddr = req->af.v4_req.loc_addr,
					.tos = RT_CONN_FLAGS(sk) } },
			    .proto = IPPROTO_TCP,
			    .uli_u = { .ports =
				       { .sport = inet_sk(sk)->sport,
					 .dport = req->rmt_port } } };

	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	return &rt->u.dst;
}
/*
 *	Send a SYN-ACK after having received an ACK.
 *	This still operates on an open_request only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
			      struct dst_entry *dst)
{
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 req->af.v4_req.loc_addr,
					 req->af.v4_req.rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
					    req->af.v4_req.rmt_addr,
					    req->af.v4_req.opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}
/*
 *	IPv4 open_request destructor.
 */
static void tcp_v4_or_free(struct open_request *req)
{
	if (req->af.v4_req.opt)
		kfree(req->af.v4_req.opt);
}
static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}
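/*
 * Illustrative sketch (not kernel code): the once-a-minute gate above,
 * expressed in userspace C.  The kernel uses time_after() on jiffies,
 * which stays correct across counter wraparound; the plain time_t
 * comparison below is sufficient for a demo.  demo_should_warn() is
 * hypothetical.
 */
#if 0
#include <stdio.h>
#include <time.h>

static int demo_should_warn(void)
{
	static time_t next_warn;	/* 0 on first call: warn at once */
	time_t now = time(NULL);

	if (now >= next_warn) {
		next_warn = now + 60;	/* at most one warning per minute */
		return 1;
	}
	return 0;
}

int main(void)
{
	if (demo_should_warn())
		fprintf(stderr, "possible SYN flooding, sending cookies\n");
	return 0;
}
#endif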
/*
 * Save and compile IPv4 options into the open_request if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
/*
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
 * It would be better to replace it with a global counter for all sockets,
 * but then some measure against one socket starving all other sockets
 * would be required.
 *
 * It was 128 by default. Experiments with real servers show that
 * it is absolutely not enough even at 100 conn/sec. 256 cures most
 * of the problems. This value is adjusted to 128 for very small machines
 * (<=32MB of memory) and to 1024 on normal or better ones (>=256MB).
 * Increasing it further requires changing the hash table size.
 */
int sysctl_max_syn_backlog = 256;
struct or_calltable or_ipv4 = {
	.family		=	PF_INET,
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_or_send_ack,
	.destructor	=	tcp_v4_or_free,
	.send_reset	=	tcp_v4_send_reset,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_options_received tmp_opt;
	struct open_request *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
		goto drop;

	req = tcp_openreq_alloc();
	if (!req)
		goto drop;

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie) {
		tcp_clear_options(&tmp_opt);
		tmp_opt.saw_tstamp = 0;
	}

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web servers,
		 * which contain information interesting only for windows'
		 * users) do not send their stamp in SYN. It is an easy case.
		 * We simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	req->af.v4_req.loc_addr = daddr;
	req->af.v4_req.rmt_addr = saddr;
	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
	req->class = &or_ipv4;
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations proven
			 * to be alive. It means that we continue to
			 * communicate with destinations already
			 * remembered at the moment of synflood.
			 */
			NETDEBUG(if (net_ratelimit()) \
					printk(KERN_DEBUG "TCP: drop open "
							  "request from %u.%u."
							  "%u.%u/%u\n", \
					       NIPQUAD(saddr),
					       ntohs(skb->h.th->source)));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	req->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		tcp_openreq_free(req);
	} else {
		tcp_v4_synq_add(sk, req);
	}
	return 0;

drop_and_free:
	tcp_openreq_free(req);
drop:
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	return 0;
}
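/*
 * Illustrative sketch (not kernel code, and NOT the kernel's syncookie
 * algorithm): the idea behind cookie_v4_init_sequence() in miniature.
 * Under SYN flood the listener encodes the connection 4-tuple and a
 * coarse clock into the ISN with a secret, keeping no per-connection
 * state; the returning ACK proves the peer saw our SYN-ACK.
 * demo_hash() is a toy mixer, not a cryptographic one.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static uint32_t demo_hash(uint32_t a, uint32_t b, uint32_t c)
{
	uint32_t h = a * 2654435761u ^ b * 2246822519u ^ c * 3266489917u;
	h ^= h >> 15;
	return h;
}

static uint32_t demo_cookie(uint32_t saddr, uint32_t daddr,
			    uint32_t ports, uint32_t secret, uint32_t minute)
{
	return demo_hash(saddr ^ secret, daddr ^ minute, ports);
}

int main(void)
{
	uint32_t c = demo_cookie(0x0a000001, 0x0a000002, (12345 << 16) | 80,
				 0xdeadbeef, 42);
	/* Validation recomputes the cookie for recent clock values. */
	printf("cookie (toy ISN) = 0x%08x\n", c);
	return 0;
}
#endif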
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct open_request *req,
				  struct dst_entry *dst)
{
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_dst_cache = dst;
	tcp_v4_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	newinet->daddr	      = req->af.v4_req.rmt_addr;
	newinet->rcv_saddr    = req->af.v4_req.loc_addr;
	newinet->saddr	      = req->af.v4_req.loc_addr;
	newinet->opt	      = req->af.v4_req.opt;
	req->af.v4_req.opt    = NULL;
	newinet->mc_index     = tcp_v4_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__tcp_v4_hash(newsk, 0);
	__tcp_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sock *nsk;
	struct open_request **prev;
	/* Find possible connection requests. */
	struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
						     iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
					  th->source,
					  skb->nh.iph->daddr,
					  ntohs(th->dest),
					  tcp_v4_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		tcp_tw_put((struct tcp_tw_bucket *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		NETDEBUG(if (net_ratelimit())
				printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest),
			     tcp_v4_iif(skb));

	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
					   skb, th, skb->len)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
							  ntohs(th->dest),
							  tcp_v4_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
			tcp_tw_put((struct tcp_tw_bucket *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
/* With per-bucket locks this operation is not atomic, so
 * this version is no worse.
 */
static void __tcp_v4_rehash(struct sock *sk)
{
	sk->sk_prot->unhash(sk);
	sk->sk_prot->hash(sk);
}
static int tcp_v4_reselect_saddr(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;
	struct rtable *rt;
	__u32 old_saddr = inet->saddr;
	__u32 new_saddr;
	__u32 daddr = inet->daddr;

	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	/* Query new route. */
	err = ip_route_connect(&rt, daddr, 0,
			       RT_CONN_FLAGS(sk),
			       sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, inet->dport, sk);
	if (err)
		return err;

	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);

	new_saddr = rt->rt_src;

	if (new_saddr == old_saddr)
		return 0;

	if (sysctl_ip_dynaddr > 1) {
		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
		       NIPQUAD(old_saddr),
		       NIPQUAD(new_saddr));
	}

	inet->saddr = new_saddr;
	inet->rcv_saddr = new_saddr;

	/* XXX The only one ugly spot where we need to
	 * XXX really change the sockets identity after
	 * XXX it has entered the hashes. -DaveM
	 *
	 * Besides that, it does not check for connection
	 * uniqueness. Wait for troubles.
	 */
	__tcp_v4_rehash(sk);
	return 0;
}
int tcp_v4_rebuild_header(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
	u32 daddr;
	int err;

	/* Route is OK, nothing to do. */
	if (rt)
		return 0;

	/* Reroute. */
	daddr = inet->daddr;
	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	{
		struct flowi fl = { .oif = sk->sk_bound_dev_if,
				    .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = inet->saddr,
						.tos = RT_CONN_FLAGS(sk) } },
				    .proto = IPPROTO_TCP,
				    .uli_u = { .ports =
					       { .sport = inet->sport,
						 .dport = inet->dport } } };

		err = ip_route_output_flow(&rt, &fl, sk, 0);
	}
	if (!err) {
		__sk_dst_set(sk, &rt->u.dst);
		tcp_v4_setup_caps(sk, &rt->u.dst);
		return 0;
	}

	/* Routing failed... */
	sk->sk_route_caps = 0;

	if (!sysctl_ip_dynaddr ||
	    sk->sk_state != TCP_SYN_SENT ||
	    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
	    (err = tcp_v4_reselect_saddr(sk)) != 0)
		sk->sk_err_soft = -err;

	return err;
}
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
	struct inet_sock *inet = inet_sk(sk);

	sin->sin_family		= AF_INET;
	sin->sin_addr.s_addr	= inet->daddr;
	sin->sin_port		= inet->dport;
}
/* VJ's idea. Save last timestamp seen from this destination
 * and hold it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter synchronized
 * state.
 */
int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}
int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
{
	struct inet_peer *peer = NULL;

	peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}
struct tcp_func ipv4_specific = {
	.queue_xmit	=	ip_queue_xmit,
	.send_check	=	tcp_v4_send_check,
	.rebuild_header	=	tcp_v4_rebuild_header,
	.conn_request	=	tcp_v4_conn_request,
	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
	.remember_stamp	=	tcp_v4_remember_stamp,
	.net_header_len	=	sizeof(struct iphdr),
	.setsockopt	=	ip_setsockopt,
	.getsockopt	=	ip_getsockopt,
	.addr2sockaddr	=	v4_addr2sockaddr,
	.sockaddr_len	=	sizeof(struct sockaddr_in),
};
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	tp->rto  = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache_std = tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}
int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (tcp_sk(sk)->bind_hash)
		tcp_put_port(sk);

	/*
	 * If sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
		list_entry(head->first, struct tcp_tw_bucket, tw_node);
}

static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_sock *tp;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct open_request *req = cur;

		tp = tcp_sk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				vxdprintk(VXD_CBIT(net, 6),
					"sk,req: %p [#%d] (from %d)", req->sk,
					(req->sk)?req->sk->sk_xid:0, vx_current_xid());
				if (req->sk &&
					!vx_check(req->sk->sk_xid, VX_IDENT|VX_WATCH))
					continue;
				if (req->class->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = tp->listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&tp->syn_wait_lock);
	} else {
		tp = tcp_sk(sk);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen)
			goto start_req;
		read_unlock_bh(&tp->syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		vxdprintk(VXD_CBIT(net, 6), "sk: %p [#%d] (from %d)",
			sk, sk->sk_xid, vx_current_xid());
		if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		tp = tcp_sk(sk);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&tp->syn_wait_lock);
	}
	if (++st->bucket < TCP_LHTABLE_SIZE) {
		sk = sk_head(&tcp_listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state* st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw;

		/* We can reschedule _before_ having picked the target: */
		cond_resched_softirq();

		read_lock(&tcp_ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
			vxdprintk(VXD_CBIT(net, 6),
				"sk,egf: %p [#%d] (from %d)",
				sk, sk->sk_xid, vx_current_xid());
			if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
				continue;
			if (sk->sk_family != st->family)
				continue;
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		tw_for_each(tw, node,
			    &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
			vxdprintk(VXD_CBIT(net, 6),
				"tw: %p [#%d] (from %d)",
				tw, tw->tw_xid, vx_current_xid());
			if (!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))
				continue;
			if (tw->tw_family != st->family)
				continue;
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct tcp_tw_bucket *tw;
	struct hlist_node *node;
	struct tcp_iter_state* st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family ||
			!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* We can reschedule between buckets: */
		cond_resched_softirq();

		if (++st->bucket < tcp_ehash_size) {
			read_lock(&tcp_ehash[st->bucket].lock);
			sk = sk_head(&tcp_ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		vxdprintk(VXD_CBIT(net, 6),
			"sk,egn: %p [#%d] (from %d)",
			sk, sk->sk_xid, vx_current_xid());
		if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
			continue;
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state* st = seq->private;

	tcp_listen_lock();
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		tcp_listen_unlock();
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state* st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state* st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			tcp_listen_unlock();
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
			read_unlock_bh(&tp->syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			tcp_listen_unlock();
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family		= afinfo->family;
	s->seq_ops.start	= tcp_seq_start;
	s->seq_ops.next		= tcp_seq_next;
	s->seq_ops.show		= afinfo->seq_show;
	s->seq_ops.stop		= tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner		= afinfo->owner;
	afinfo->seq_fops->open		= tcp_seq_open;
	afinfo->seq_fops->read		= seq_read;
	afinfo->seq_fops->llseek	= seq_lseek;
	afinfo->seq_fops->release	= seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
static void get_openreq4(struct sock *sk, struct open_request *req,
			 char *tmpbuf, int i, int uid)
{
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		req->af.v4_req.loc_addr,
		ntohs(inet_sk(sk)->sport),
		req->af.v4_req.rmt_addr,
		ntohs(req->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}
static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sp);
	struct inet_sock *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tp->timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		tp->retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}
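/*
 * Illustrative sketch (not kernel code): consuming the rows the
 * functions above emit into /proc/net/tcp.  Each row carries hex
 * address:port pairs and the socket state; a minimal parser for the
 * leading fields, assuming the format printed by get_tcp4_sock().
 */
#if 0
#include <stdio.h>

int main(void)
{
	/* A row shaped like the sprintf() output above. */
	const char *line = "   0: 0100007F:0016 00000000:0000 0A ...";
	unsigned int sl, local, rem, state;
	unsigned int lport, rport;

	if (sscanf(line, "%u: %X:%X %X:%X %X",
		   &sl, &local, &lport, &rem, &rport, &state) == 6)
		printf("local port %u, state 0x%02X\n", lport, state);
	return 0;
}
#endif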
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= tcp_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
};
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);