/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 *		IPv4 specific functions
 *
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *		Juan Jose Ciarlante:	ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *		Vitaly E. Lavrov	:	Transparent proxy revived after a
 *					year-long coma.
 *		Andi Kleen		:	Fix new listen.
 *		Andi Kleen		:	Fix accept error reporting.
 *		YOSHIFUJI Hideaki @USAGI and :	Support the IPV6_V6ONLY socket
 *		Alexey Kuznetsov		option, which allows IPv4 and
 *						IPv6 sockets to bind a single
 *						port at the same time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/inet_common.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/vserver/debug.h>

extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;
/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);

struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	.__tcp_lhash_lock	= RW_LOCK_UNLOCKED,
	.__tcp_lhash_users	= ATOMIC_INIT(0),
	.__tcp_lhash_wait
	  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	.__tcp_portalloc_lock	= SPIN_LOCK_UNLOCKED
};
/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = 1024 - 1;

static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
{
	int h = (laddr ^ lport) ^ (faddr ^ fport);

	return h & (tcp_ehash_size - 1);
}

static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	__u32 laddr = inet->rcv_saddr;
	__u16 lport = inet->num;
	__u32 faddr = inet->daddr;
	__u16 fport = inet->dport;

	return tcp_hashfn(laddr, lport, faddr, fport);
}
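
/*
 * Illustrative userspace sketch (ours, not part of this file): the mask
 * keeps only the low bits of h while most of the address entropy lives
 * in the high bits, which is why the stock kernel folds h down
 * (h ^= h >> 16; h ^= h >> 8;) before masking.  demo_ehash() is a
 * hypothetical stand-in; table_size is assumed to be a power of two.
 */
static unsigned int demo_ehash(unsigned int laddr, unsigned short lport,
			       unsigned int faddr, unsigned short fport,
			       unsigned int table_size)
{
	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);

	h ^= h >> 16;	/* fold high (address) bits into the low half */
	h ^= h >> 8;	/* and again into the low byte */
	return h & (table_size - 1);
}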
/* Allocate and initialize a new TCP local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
					  unsigned short snum)
{
	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
						      SLAB_ATOMIC);
	if (tb) {
		tb->port = snum;
		tb->fastreuse = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/* Caller must hold hashbucket lock for this tb with local BH disabled */
void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(tcp_bucket_cachep, tb);
	}
}

/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct tcp_bind_hashbucket *head =
				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	sk_add_bind_node(child, &tb->owners);
	tcp_sk(child)->bind_hash = tb;
	spin_unlock(&head->lock);
}

inline void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}

void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
		   unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tcp_sk(sk)->bind_hash = tb;
}

static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
{
	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !tcp_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
				    sk2_rcv_saddr == sk_rcv_saddr)
					break;
			}
		}
	}
	return node != NULL;
}
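
/*
 * Sketch (ours, not kernel code) restating the rule above: two sockets
 * conflict on a port iff SO_REUSEADDR does not excuse them both (a
 * listener is never excused) and the bound addresses overlap, i.e.
 * either is a wildcard or they are equal.  The device-scope check is
 * omitted here for brevity.
 */
static int demo_bind_conflict(unsigned int addr1, int reuse1, int listen1,
			      unsigned int addr2, int reuse2, int listen2)
{
	if (reuse1 && reuse2 && !listen1 && !listen2)
		return 0;	/* both set SO_REUSEADDR, neither listens */
	return !addr1 || !addr2 || addr1 == addr2;
}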
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_hashbucket *head;
	struct hlist_node *node;
	struct tcp_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
		do {
			rover++;
			if (rover < low || rover > high)
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			tb_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search? */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
	} else {
		head = &tcp_bhash[tcp_bhashfn(snum)];
		spin_lock(&head->lock);
		tb_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (tcp_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!tcp_sk(sk)->bind_hash)
		tcp_bind_hash(sk, tb, snum);
	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
/* Get rid of any references to a local port held by the
 * given sock.
 */
static void __tcp_put_port(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	__sk_del_bind_node(sk);
	tcp_sk(sk)->bind_hash = NULL;
	inet->num = 0;
	tcp_bucket_destroy(tb);
	spin_unlock(&head->lock);
}

void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up each
 * exclusive lock release). It should be ifdefed really.
 */
void tcp_listen_wlock(void)
{
	write_lock(&tcp_lhash_lock);

	if (atomic_read(&tcp_lhash_users)) {
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&tcp_lhash_wait,
						  &wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&tcp_lhash_users))
				break;
			write_unlock_bh(&tcp_lhash_lock);
			schedule();
			write_lock_bh(&tcp_lhash_lock);
		}

		finish_wait(&tcp_lhash_wait, &wait);
	}
}
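
/*
 * Usage sketch (ours): tcp_listen_wlock() returns with the write lock
 * held and all lockless readers drained; the caller mutates the listen
 * hash and drops the lock the ordinary way, mirroring the pattern used
 * by tcp_unhash() below.  demo_rehash_listener() is hypothetical, not
 * a kernel API.
 */
static void demo_rehash_listener(struct sock *sk)
{
	local_bh_disable();
	tcp_listen_wlock();		/* waits for lockless readers */
	__sk_del_node_init(sk);		/* ... mutate tcp_listening_hash ... */
	write_unlock_bh(&tcp_lhash_lock);	/* also re-enables BHs */
}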
static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
	struct hlist_head *list;
	rwlock_t *lock;

	BUG_TRAP(sk_unhashed(sk));
	if (listen_possible && sk->sk_state == TCP_LISTEN) {
		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
		lock = &tcp_lhash_lock;
		tcp_listen_wlock();
	} else {
		list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->sk_hashent].lock;
		write_lock(lock);
	}
	__sk_add_node(sk, list);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(lock);
	if (listen_possible && sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

static void tcp_v4_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__tcp_v4_hash(sk, 1);
		local_bh_enable();
	}
}

void tcp_unhash(struct sock *sk)
{
	rwlock_t *lock;

	if (sk_unhashed(sk))
		goto ende;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		tcp_listen_wlock();
		lock = &tcp_lhash_lock;
	} else {
		struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
	}

	if (__sk_del_node_init(sk))
		sock_prot_dec_use(sk->sk_prot);
	write_unlock_bh(lock);

ende:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

/* Don't inline this cruft. There are some nice properties to
 * exploit here. The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection. So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
					     unsigned short hnum, int dif)
{
	struct sock *result = NULL, *sk;
	struct hlist_node *node;
	int score, hiscore = -1;

	sk_for_each(sk, node, head) {
		struct inet_opt *inet = inet_sk(sk);

		if (inet->num == hnum && !ipv6_only_sock(sk)) {
			__u32 rcv_saddr = inet->rcv_saddr;

			score = (sk->sk_family == PF_INET ? 1 : 0);
			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score += 2;
			}
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result = sk;
			}
		}
	}
	return result;
}

/* Optimize the common listener case. */
struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
{
	struct sock *sk = NULL;
	struct hlist_head *head;

	read_lock(&tcp_lhash_lock);
	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		struct inet_opt *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    !sk->sk_bound_dev_if)
			goto sherry_cache;
		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&tcp_lhash_lock);
	return sk;
}

EXPORT_SYMBOL_GPL(tcp_v4_lookup_listener);
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */
static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
						       u32 daddr, u16 hnum,
						       int dif)
{
	struct tcp_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	int hash = tcp_hashfn(daddr, hnum, saddr, sport);

	head = &tcp_ehash[hash];
	read_lock(&head->lock);
	sk_for_each(sk, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(&head->lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}

static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
{
	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
						      daddr, hnum, dif);

	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
}

inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				  u16 dport, int dif)
{
	struct sock *sk;

	local_bh_disable();
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
	local_bh_enable();

	return sk;
}

EXPORT_SYMBOL_GPL(tcp_v4_lookup);
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}

/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
{
	struct inet_opt *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
	struct sock *sk2;
	struct hlist_node *node;
	struct tcp_tw_bucket *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
		tw = (struct tcp_tw_bucket *)sk2;

		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_opt *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it is
			   safe provided sequence spaces do not overlap,
			   i.e. at data rates <= 80 Mbit/sec.

			   Actually, the idea is close to VJ's one, only
			   the timestamp cache is held not per host, but
			   per port pair, and the TW bucket is used as
			   state holder.

			   If the TW bucket has already been destroyed we
			   fall back to VJ's scheme and use the initial
			   timestamp retrieved from the peer table.
			 */
			if (tw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tw->tw_ts_recent_stamp > 1))) {
				if ((tp->write_seq =
						tw->tw_snd_nxt + 65535 + 2) == 0)
					tp->write_seq = 1;
				tp->ts_recent = tw->tw_ts_recent;
				tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in the hash table a socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		tcp_tw_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
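
/*
 * Sketch (ours) of the TIME-WAIT recycling rule above: the new
 * incarnation may take over the bucket because its initial write_seq
 * is placed past everything the old connection could legitimately have
 * sent, so old duplicates can never fall inside the new window.
 */
static void demo_recycle_isn(u32 tw_snd_nxt, u32 *write_seq)
{
	*write_seq = tw_snd_nxt + 65535 + 2;	/* old snd_nxt + max window */
	if (!*write_seq)
		*write_seq = 1;			/* zero is reserved */
}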
/*
 * Bind a port for a connect operation and hash it.
 */
static int tcp_v4_hash_connect(struct sock *sk)
{
	unsigned short snum = inet_sk(sk)->num;
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
	int ret;

	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw = NULL;

		local_bh_disable();

		/* TODO. Actually it is not such a bad idea to remove
		 * the tcp_portalloc_lock before the next submission to Linus.
		 * As soon as we touch this place at all it is time to think.
		 *
		 * Now it protects a single _advisory_ variable, tcp_port_rover,
		 * hence it is mostly useless.
		 * Code will work nicely if we just delete it, but
		 * I am afraid in the contended case it will work no better or
		 * even worse: another CPU will just hit the same bucket.
		 *
		 * So some CPU salt could remove both the contention and
		 * the memory pingpong. Any ideas how to do this in a nice way?
		 */
		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;

		do {
			rover++;
			if ((rover < low) || (rover > high))
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			tb_for_each(tb, node, &head->chain) {
				if (tb->port == rover) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									rover,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = tcp_bucket_create(head, rover);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);
		local_bh_enable();

		return -EADDRNOTAVAIL;
ok:
		/* All locks still held and bhs disabled */
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		tcp_bind_hash(sk, tb, rover);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(rover);
			__tcp_v4_hash(sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			tcp_tw_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_bhash[tcp_bhashfn(snum)];
	tb = tcp_sk(sk)->bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__tcp_v4_hash(sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->ts_recent	    = 0;
		tp->ts_recent_stamp = 0;
		tp->write_seq	    = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table, when entering state
		 * TIME-WAIT, and initialize ts_recent from it when trying
		 * a new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->ts_recent_stamp = peer->tcp_ts_stamp;
			tp->ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set the state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the hash
	 * tables and complete the initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tp->ext2_header_len = rt->u.dst.header_len;

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}

static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
{
	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
}
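
/*
 * Note (ours): the per-listener rnd salt above keeps a SYN flooder from
 * precomputing collisions and piling every open_request onto one chain;
 * TCP_SYNQ_HSIZE is a power of two, so the jhash result is simply
 * masked.  A hypothetical restatement:
 */
static u32 demo_synq_slot(u32 peer_addr, u16 peer_port, u32 per_sock_salt)
{
	return jhash_2words(peer_addr, (u32) peer_port, per_sock_salt) &
	       (TCP_SYNQ_HSIZE - 1);
}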
static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
					      struct open_request ***prevp,
					      __u16 rport,
					      __u32 raddr, __u32 laddr)
{
	struct tcp_listen_opt *lopt = tp->listen_opt;
	struct open_request *req, **prev;

	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		if (req->rmt_port == rport &&
		    req->af.v4_req.rmt_addr == raddr &&
		    req->af.v4_req.loc_addr == laddr &&
		    TCP_INET_FAMILY(req->class->family)) {
			BUG_TRAP(!req->sk);
			*prevp = prev;
			break;
		}
	}

	return req;
}

static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);

	req->expires = jiffies + TCP_TIMEOUT_INIT;
	req->retrans = 0;
	req->sk = NULL;
	req->dl_next = lopt->syn_table[h];

	write_lock(&tp->syn_wait_lock);
	lopt->syn_table[h] = req;
	write_unlock(&tp->syn_wait_lock);

	tcp_synq_added(sk);
}
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember the soft error
	 * for the case this connection will not be able to recover.
	 */
	if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_pmtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
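
/*
 * Worked example (ours): RFC 1191 in practice.  An ICMP FRAG_NEEDED
 * quoting mtu = 1400 on a path cached at 1500 shrinks the socket's MSS
 * to the new MTU minus plain v4 header overhead before retransmitting:
 * 1400 - 20 - 20 = 1360 (no IP or TCP options assumed).
 */
static int demo_mss_for_mtu(int mtu)
{
	return mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
}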
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_opt *tp;
	struct inet_opt *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
			   th->source, tcp_v4_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		tcp_tw_put((struct tcp_tw_bucket *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct open_request *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = tcp_v4_search_req(tp, &prev, th->dest,
					iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != req->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		tcp_synq_drop(sk, req, prev);
		goto out;
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, e.g., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	struct inet_opt *inet = inet_sk(sk);

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *	for reset?
 *	Answer: if a packet caused an RST, it is not for a socket
 *	existing in our system; if it is matched to a socket,
 *	it is just a duplicate segment or a bug in the other side's TCP.
 *	So we build the reply based only on the parameters that
 *	arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest = th->source;
	rth.source = th->dest;
	rth.doff = sizeof(struct tcphdr) / 4;
	rth.rst = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
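
/*
 * Sketch (ours) of the RFC 793 reset rules applied above: if the
 * offending segment carried an ACK, the RST's SEQ echoes that ACK and
 * no ACK is sent back; otherwise SEQ is zero and we ACK exactly the
 * sequence space the segment occupied (data plus SYN/FIN flags).
 */
static void demo_rst_numbers(int seg_had_ack, u32 seg_seq, u32 seg_ack,
			     u32 seg_len, int syn, int fin,
			     u32 *rst_seq, u32 *rst_ack)
{
	if (seg_had_ack) {
		*rst_seq = seg_ack;	/* rth.seq = th->ack_seq above */
		*rst_ack = 0;
	} else {
		*rst_seq = 0;
		*rst_ack = seg_seq + syn + fin + seg_len;
	}
}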
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;

	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);

	tcp_tw_put(tw);
}
static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
{
	tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}

static struct dst_entry* tcp_v4_route_req(struct sock *sk,
					  struct open_request *req)
{
	struct rtable *rt;
	struct ip_options *opt = req->af.v4_req.opt;
	struct flowi fl = { .oif = sk->sk_bound_dev_if,
			    .nl_u = { .ip4_u =
				      { .daddr = ((opt && opt->srr) ?
						  opt->faddr :
						  req->af.v4_req.rmt_addr),
					.saddr = req->af.v4_req.loc_addr,
					.tos = RT_CONN_FLAGS(sk) } },
			    .proto = IPPROTO_TCP,
			    .uli_u = { .ports =
				       { .sport = inet_sk(sk)->sport,
					 .dport = req->rmt_port } } };

	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	return &rt->u.dst;
}

/*
 *	Send a SYN-ACK after having received an ACK.
 *	This still operates on an open_request only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
			      struct dst_entry *dst)
{
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 req->af.v4_req.loc_addr,
					 req->af.v4_req.rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
					    req->af.v4_req.rmt_addr,
					    req->af.v4_req.opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}

/*
 *	IPv4 open_request destructor.
 */
static void tcp_v4_or_free(struct open_request *req)
{
	if (req->af.v4_req.opt)
		kfree(req->af.v4_req.opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}
/*
 * Save and compile IPv4 options into the open_request if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

/*
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
 * It would be better to replace it with a global counter for all sockets,
 * but then some measure against one socket starving all other sockets
 * would be required.
 *
 * It was 128 by default. Experiments with real servers show that
 * it is absolutely not enough even at 100 conn/sec. 256 cures most
 * of the problems. This value is adjusted to 128 for very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Increasing it further requires changing the hash table size.
 */
int sysctl_max_syn_backlog = 256;
struct or_calltable or_ipv4 = {
	.family		= PF_INET,
	.rtx_syn_ack	= tcp_v4_send_synack,
	.send_ack	= tcp_v4_or_send_ack,
	.destructor	= tcp_v4_or_free,
	.send_reset	= tcp_v4_send_reset,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt tp;
	struct open_request *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitation: they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop the request. It is better than
	 * clogging the syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
		goto drop;

	req = tcp_openreq_alloc();
	if (!req)
		goto drop;

	tcp_clear_options(&tp);
	tp.mss_clamp = 536;
	tp.user_mss = tcp_sk(sk)->user_mss;

	tcp_parse_options(skb, &tp, 0);

	if (want_cookie) {
		tcp_clear_options(&tp);
		tp.saw_tstamp = 0;
	}

	if (tp.saw_tstamp && !tp.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on a web server,
		 * which contains information interesting only for windows'
		 * users) do not send their stamp in SYN. It is the easy case.
		 * We simply do not advertise TS support.
		 */
		tp.saw_tstamp = 0;
		tp.tstamp_ok  = 0;
	}
	tp.tstamp_ok = tp.saw_tstamp;

	tcp_openreq_init(req, &tp, skb);

	req->af.v4_req.loc_addr = daddr;
	req->af.v4_req.rmt_addr = saddr;
	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
	req->class = &or_ipv4;
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tp.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is reserved for destinations proven
			 * to be alive.
			 * It means that we continue to communicate with
			 * destinations already remembered at the moment
			 * of the synflood.
			 */
			NETDEBUG(if (net_ratelimit()) \
				 printk(KERN_DEBUG "TCP: drop open "
						   "request from %u.%u."
						   "%u.%u/%u\n", \
					NIPQUAD(saddr),
					ntohs(skb->h.th->source)));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	TCP_SKB_CB(skb)->when = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		tcp_openreq_free(req);
	} else {
		tcp_v4_synq_add(sk, req);
	}
	return 0;

drop_and_free:
	tcp_openreq_free(req);
drop:
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	return 0;
}
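
/*
 * Sketch (ours) of the drop heuristic above: with syncookies off, once
 * less than a quarter of the SYN backlog remains free, only peers with
 * a remembered timestamp (i.e. proven alive before the flood) may take
 * one of the remaining slots.
 */
static int demo_synflood_drop(int backlog_max, int backlog_len,
			      int peer_proven_alive)
{
	int free_slots = backlog_max - backlog_len;

	return free_slots < (backlog_max >> 2) && !peer_proven_alive;
}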
/*
 * The three-way handshake has completed - we got a valid ACK -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct open_request *req,
				  struct dst_entry *dst)
{
	struct inet_opt *newinet;
	struct tcp_opt *newtp;
	struct sock *newsk;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_dst_cache = dst;
	tcp_v4_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	newinet->daddr	      = req->af.v4_req.rmt_addr;
	newinet->rcv_saddr    = req->af.v4_req.loc_addr;
	newinet->saddr	      = req->af.v4_req.loc_addr;
	newinet->opt	      = req->af.v4_req.opt;
	req->af.v4_req.opt    = NULL;
	newinet->mc_index     = tcp_v4_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newtp->ext2_header_len = dst->header_len;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_pmtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__tcp_v4_hash(newsk, 0);
	__tcp_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct tcp_opt *tp = tcp_sk(sk);
	struct sock *nsk;
	struct open_request **prev;
	/* Find possible connection requests. */
	struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
						     iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
					  th->source,
					  skb->nh.iph->daddr,
					  ntohs(th->dest),
					  tcp_v4_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		tcp_tw_put((struct tcp_tw_bucket *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		NETDEBUG(if (net_ratelimit())
				printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}
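
/*
 * Sketch (ours): tcp_v4_check() verifies the TCP checksum over the
 * IPv4 pseudo-header plus the segment; a packet is accepted when the
 * one's-complement sum folds to zero.  The classic 16-bit end-around
 * fold looks like this:
 */
static unsigned short demo_csum_fold(unsigned long sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);	/* end-around carry */
	return (unsigned short)~sum;			/* one's complement */
}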
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest),
			     tcp_v4_iif(skb));

	if (!sk)
		goto no_tcp_socket;

process:
#if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
	/* Silently drop if VNET is active and the context is not
	 * entitled to read the packet.
	 */

	/* Transfer ownership of reusable TIME_WAIT buckets to
	 * whoever VNET decided should own the packet.
	 */
	if (sk->sk_state == TCP_TIME_WAIT)
		sk->sk_xid = skb->xid;

	if ((int) sk->sk_xid > 0 && sk->sk_xid != skb->xid)
		goto discard_it;
#endif

	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
#if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
	} else if (vnet_active && skb->sk) {
		/* VNET: Suppress RST if the port was bound to a (presumably raw) socket */
#endif
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
					   skb, th, skb->len)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
							  ntohs(th->dest),
							  tcp_v4_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
			tcp_tw_put((struct tcp_tw_bucket *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
/* With per-bucket locks this operation is not atomic, so this
 * version is no worse.
 */
static void __tcp_v4_rehash(struct sock *sk)
{
	sk->sk_prot->unhash(sk);
	sk->sk_prot->hash(sk);
}

static int tcp_v4_reselect_saddr(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	int err;
	struct rtable *rt;
	__u32 old_saddr = inet->saddr;
	__u32 new_saddr;
	__u32 daddr = inet->daddr;

	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	/* Query new route. */
	err = ip_route_connect(&rt, daddr, 0,
			       RT_TOS(inet->tos) | sk->sk_localroute,
			       sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, inet->dport, sk);
	if (err)
		return err;

	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;

	new_saddr = rt->rt_src;

	if (new_saddr == old_saddr)
		return 0;

	if (sysctl_ip_dynaddr > 1) {
		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
		       NIPQUAD(old_saddr),
		       NIPQUAD(new_saddr));
	}

	inet->saddr = new_saddr;
	inet->rcv_saddr = new_saddr;

	/* XXX The only ugly spot where we need to
	 * XXX really change the sockets identity after
	 * XXX it has entered the hashes. -DaveM
	 *
	 * Besides that, it does not check for connection
	 * uniqueness. Wait for troubles.
	 */
	__tcp_v4_rehash(sk);
	return 0;
}
int tcp_v4_rebuild_header(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
	u32 daddr;
	int err;

	/* Route is OK, nothing to do. */
	if (rt)
		return 0;

	/* Reroute. */
	daddr = inet->daddr;
	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	{
		struct flowi fl = { .oif = sk->sk_bound_dev_if,
				    .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = inet->saddr,
						.tos = RT_CONN_FLAGS(sk) } },
				    .proto = IPPROTO_TCP,
				    .uli_u = { .ports =
					       { .sport = inet->sport,
						 .dport = inet->dport } } };

		err = ip_route_output_flow(&rt, &fl, sk, 0);
	}
	if (!err) {
		__sk_dst_set(sk, &rt->u.dst);
		tcp_v4_setup_caps(sk, &rt->u.dst);
		tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
		return 0;
	}

	/* Routing failed... */
	sk->sk_route_caps = 0;

	if (!sysctl_ip_dynaddr ||
	    sk->sk_state != TCP_SYN_SENT ||
	    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
	    (err = tcp_v4_reselect_saddr(sk)) != 0)
		sk->sk_err_soft = -err;

	return err;
}

static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
	struct inet_opt *inet = inet_sk(sk);

	sin->sin_family = AF_INET;
	sin->sin_addr.s_addr = inet->daddr;
	sin->sin_port = inet->dport;
}
/* VJ's idea. Save the last timestamp seen from this destination
 * and hold it at least for the normal timewait interval, to use for
 * duplicate segment detection in subsequent connections before they
 * enter synchronized state.
 */
int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->ts_recent_stamp;
			peer->tcp_ts = tp->ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
{
	struct inet_peer *peer = NULL;

	peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}
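
/*
 * Simplified sketch (ours) of the freshness test used by both helpers
 * above: the cached per-peer timestamp only ever moves forward, and an
 * older wall-clock stamp may be replaced only once it has aged beyond
 * TCP_PAWS_MSL, so parallel connections cannot wind the cache back.
 */
static int demo_peer_stamp_replaceable(s32 cached_ts, s32 our_ts,
				       long cached_when, long now,
				       long paws_msl)
{
	return (s32)(cached_ts - our_ts) <= 0 ||
	       (cached_when + paws_msl < now);
}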
struct tcp_func ipv4_specific = {
	.queue_xmit	= ip_queue_xmit,
	.send_check	= tcp_v4_send_check,
	.rebuild_header	= tcp_v4_rebuild_header,
	.conn_request	= tcp_v4_conn_request,
	.syn_recv_sock	= tcp_v4_syn_recv_sock,
	.remember_stamp	= tcp_v4_remember_stamp,
	.net_header_len	= sizeof(struct iphdr),
	.setsockopt	= ip_setsockopt,
	.getsockopt	= ip_getsockopt,
	.addr2sockaddr	= v4_addr2sockaddr,
	.sockaddr_len	= sizeof(struct sockaddr_in),
};
/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	tp->rto	 = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache_std = tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sk->sk_use_write_queue = 1;

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean up the prequeue; it really must be empty. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (tcp_sk(sk)->bind_hash)
		tcp_put_port(sk);

	/*
	 * If a sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
		list_entry(head->first, struct tcp_tw_bucket, tw_node);
}

static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_opt *tp;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct open_request *req = cur;

		tp = tcp_sk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				vxdprintk(VXD_CBIT(net, 6),
					  "sk,req: %p [#%d] (from %d)", req->sk,
					  (req->sk)?req->sk->sk_xid:0, current->xid);
				if (req->sk &&
				    !vx_check(req->sk->sk_xid, VX_IDENT|VX_WATCH))
					continue;
				if (req->class->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = tp->listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&tp->syn_wait_lock);
	} else {
		tp = tcp_sk(sk);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen)
			goto start_req;
		read_unlock_bh(&tp->syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		vxdprintk(VXD_CBIT(net, 6), "sk: %p [#%d] (from %d)",
			  sk, sk->sk_xid, current->xid);
		if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		tp = tcp_sk(sk);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&tp->syn_wait_lock);
	}
	if (++st->bucket < TCP_LHTABLE_SIZE) {
		sk = sk_head(&tcp_listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state* st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw;

		read_lock(&tcp_ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
			vxdprintk(VXD_CBIT(net, 6),
				  "sk,egf: %p [#%d] (from %d)",
				  sk, sk->sk_xid, current->xid);
			if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
				continue;
			if (sk->sk_family != st->family)
				continue;
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		tw_for_each(tw, node,
			    &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
			vxdprintk(VXD_CBIT(net, 6),
				  "tw: %p [#%d] (from %d)",
				  tw, tw->tw_xid, current->xid);
			if (!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))
				continue;
			if (tw->tw_family != st->family)
				continue;
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct tcp_tw_bucket *tw;
	struct hlist_node *node;
	struct tcp_iter_state* st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family ||
			      !vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (++st->bucket < tcp_ehash_size) {
			read_lock(&tcp_ehash[st->bucket].lock);
			sk = sk_head(&tcp_ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		vxdprintk(VXD_CBIT(net, 6),
			  "sk,egn: %p [#%d] (from %d)",
			  sk, sk->sk_xid, current->xid);
		if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
			continue;
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state* st = seq->private;

	tcp_listen_lock();
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		tcp_listen_unlock();
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state* st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state* st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			tcp_listen_unlock();
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
			read_unlock_bh(&tp->syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			tcp_listen_unlock();
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}

static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family	 = afinfo->family;
	s->seq_ops.start = tcp_seq_start;
	s->seq_ops.next	 = tcp_seq_next;
	s->seq_ops.show	 = afinfo->seq_show;
	s->seq_ops.stop	 = tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner	  = afinfo->owner;
	afinfo->seq_fops->open	  = tcp_seq_open;
	afinfo->seq_fops->read	  = seq_read;
	afinfo->seq_fops->llseek  = seq_lseek;
	afinfo->seq_fops->release = seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
static void get_openreq4(struct sock *sk, struct open_request *req,
			 char *tmpbuf, int i, int uid)
{
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		req->af.v4_req.loc_addr,
		ntohs(inet_sk(sk)->sport),
		req->af.v4_req.rmt_addr,
		ntohs(req->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}

static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_opt *tp = tcp_sk(sp);
	struct inet_opt *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tp->timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		tp->retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}

static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}
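
/*
 * Userspace sketch (ours): one way to parse the fixed-width lines the
 * functions above emit to /proc/net/tcp.  Fields are hex; addresses are
 * in network byte order.  Guarded out because it is not kernel code.
 */
#if 0
#include <stdio.h>

/* Data rows look like: "   0: 0100007F:0016 00000000:0000 0A ..." */
static int demo_parse_proc_tcp(const char *line)
{
	unsigned int slot, src, dst, state;
	unsigned short sport, dport;

	return sscanf(line, "%u: %8X:%4hX %8X:%4hX %2X",
		      &slot, &src, &sport, &dst, &dport, &state) == 6;
}
#endif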
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= tcp_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.slab_obj_size		= sizeof(struct tcp_sock),
};
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}

EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);