2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * open_request handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
69 #include <net/inet_common.h>
72 #include <linux/inet.h>
73 #include <linux/ipv6.h>
74 #include <linux/stddef.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77 #include <linux/vserver/debug.h>
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
92 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
94 .__tcp_lhash_users = ATOMIC_INIT(0),
96 .__tcp_lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
97 .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this to
105 int sysctl_local_port_range[2] = { 1024, 4999 };
106 int tcp_port_rover = 1024 - 1;
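/* Not from this file, but for context: these two values back the
 * net.ipv4.ip_local_port_range sysctl (/proc/sys/net/ipv4/ip_local_port_range),
 * so a high-usage system can widen the range at run time, for example:
 *
 *	# sysctl -w net.ipv4.ip_local_port_range="32768 61000"
 *
 * (example values only; the compiled-in defaults above are 1024 and 4999).
 */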
108 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
109 __u32 faddr, __u16 fport)
111 int h = (laddr ^ lport) ^ (faddr ^ fport);
114 return h & (tcp_ehash_size - 1);
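/* For reference, the mainline 2.6 version of this hash also mixes the upper
 * bits down before the final mask, roughly:
 *
 *	h ^= h >> 16;
 *	h ^= h >> 8;
 *	return h & (tcp_ehash_size - 1);
 *
 * That is stated here as an assumption about this tree, not verified against
 * it. tcp_ehash_size is a power of two, so the AND simply selects a bucket.
 */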
117 static __inline__ int tcp_sk_hashfn(struct sock *sk)
119 struct inet_opt *inet = inet_sk(sk);
120 __u32 laddr = inet->rcv_saddr;
121 __u16 lport = inet->num;
122 __u32 faddr = inet->daddr;
123 __u16 fport = inet->dport;
125 return tcp_hashfn(laddr, lport, faddr, fport);
128 /* Allocate and initialize a new TCP local port bind bucket.
129 * The bindhash mutex for snum's hash chain must be held here.
131 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
134 struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
139 INIT_HLIST_HEAD(&tb->owners);
140 hlist_add_head(&tb->node, &head->chain);
145 /* Caller must hold hashbucket lock for this tb with local BH disabled */
146 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
148 if (hlist_empty(&tb->owners)) {
149 __hlist_del(&tb->node);
150 kmem_cache_free(tcp_bucket_cachep, tb);
154 /* Caller must disable local BH processing. */
155 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
157 struct tcp_bind_hashbucket *head =
158 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
159 struct tcp_bind_bucket *tb;
161 spin_lock(&head->lock);
162 tb = tcp_sk(sk)->bind_hash;
163 sk_add_bind_node(child, &tb->owners);
164 tcp_sk(child)->bind_hash = tb;
165 spin_unlock(&head->lock);
168 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
171 __tcp_inherit_port(sk, child);
175 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
178 inet_sk(sk)->num = snum;
179 sk_add_bind_node(sk, &tb->owners);
180 tcp_sk(sk)->bind_hash = tb;
183 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
185 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
187 struct hlist_node *node;
188 int reuse = sk->sk_reuse;
190 sk_for_each_bound(sk2, node, &tb->owners) {
192 !tcp_v6_ipv6only(sk2) &&
193 (!sk->sk_bound_dev_if ||
194 !sk2->sk_bound_dev_if ||
195 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
196 if (!reuse || !sk2->sk_reuse ||
197 sk2->sk_state == TCP_LISTEN) {
198 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
199 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
200 sk2_rcv_saddr == sk_rcv_saddr)
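/* In words: another owner of this bind bucket conflicts only if the two
 * sockets could actually receive the same packets, i.e. they are bound to
 * the same device (or at least one of them to no device), port sharing is
 * not permitted (one of the two lacks SO_REUSEADDR, or the other socket is
 * already listening), and their local addresses overlap (either one is a
 * wildcard or they are equal).
 */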
208 /* Obtain a reference to a local port for the given sock,
209 * if snum is zero it means select any available local port.
211 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
213 struct tcp_bind_hashbucket *head;
214 struct hlist_node *node;
215 struct tcp_bind_bucket *tb;
220 int low = sysctl_local_port_range[0];
221 int high = sysctl_local_port_range[1];
222 int remaining = (high - low) + 1;
225 spin_lock(&tcp_portalloc_lock);
226 rover = tcp_port_rover;
229 if (rover < low || rover > high)
231 head = &tcp_bhash[tcp_bhashfn(rover)];
232 spin_lock(&head->lock);
233 tb_for_each(tb, node, &head->chain)
234 if (tb->port == rover)
238 spin_unlock(&head->lock);
239 } while (--remaining > 0);
240 tcp_port_rover = rover;
241 spin_unlock(&tcp_portalloc_lock);
243 /* Exhausted local port range during search? */
248 /* OK, here is the one we will use. HEAD is
249 * non-NULL and we hold its mutex.
253 head = &tcp_bhash[tcp_bhashfn(snum)];
254 spin_lock(&head->lock);
255 tb_for_each(tb, node, &head->chain)
256 if (tb->port == snum)
262 if (!hlist_empty(&tb->owners)) {
263 if (sk->sk_reuse > 1)
265 if (tb->fastreuse > 0 &&
266 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
270 if (tcp_bind_conflict(sk, tb))
276 if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
278 if (hlist_empty(&tb->owners)) {
279 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
283 } else if (tb->fastreuse &&
284 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
287 if (!tcp_sk(sk)->bind_hash)
288 tcp_bind_hash(sk, tb, snum);
289 BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
293 spin_unlock(&head->lock);
299 /* Get rid of any references to a local port held by the
302 static void __tcp_put_port(struct sock *sk)
304 struct inet_opt *inet = inet_sk(sk);
305 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
306 struct tcp_bind_bucket *tb;
308 spin_lock(&head->lock);
309 tb = tcp_sk(sk)->bind_hash;
310 __sk_del_bind_node(sk);
311 tcp_sk(sk)->bind_hash = NULL;
313 tcp_bucket_destroy(tb);
314 spin_unlock(&head->lock);
317 void tcp_put_port(struct sock *sk)
324 /* This lock without WQ_FLAG_EXCLUSIVE is fine on UP but can be very bad on SMP:
325 * when several writers sleep and the reader wakes them up, all but one
326 * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
327 * this, _but_ remember that it adds useless work on UP machines (a wakeup on each
328 * exclusive lock release). It should really be ifdefed.
331 void tcp_listen_wlock(void)
333 write_lock(&tcp_lhash_lock);
335 if (atomic_read(&tcp_lhash_users)) {
339 prepare_to_wait_exclusive(&tcp_lhash_wait,
340 &wait, TASK_UNINTERRUPTIBLE);
341 if (!atomic_read(&tcp_lhash_users))
343 write_unlock_bh(&tcp_lhash_lock);
345 write_lock_bh(&tcp_lhash_lock);
348 finish_wait(&tcp_lhash_wait, &wait);
352 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
354 struct hlist_head *list;
357 BUG_TRAP(sk_unhashed(sk));
358 if (listen_possible && sk->sk_state == TCP_LISTEN) {
359 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
360 lock = &tcp_lhash_lock;
363 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
364 lock = &tcp_ehash[sk->sk_hashent].lock;
367 __sk_add_node(sk, list);
368 sock_prot_inc_use(sk->sk_prot);
370 if (listen_possible && sk->sk_state == TCP_LISTEN)
371 wake_up(&tcp_lhash_wait);
374 static void tcp_v4_hash(struct sock *sk)
376 if (sk->sk_state != TCP_CLOSE) {
378 __tcp_v4_hash(sk, 1);
383 void tcp_unhash(struct sock *sk)
390 if (sk->sk_state == TCP_LISTEN) {
393 lock = &tcp_lhash_lock;
395 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
397 write_lock_bh(&head->lock);
400 if (__sk_del_node_init(sk))
401 sock_prot_dec_use(sk->sk_prot);
402 write_unlock_bh(lock);
405 if (sk->sk_state == TCP_LISTEN)
406 wake_up(&tcp_lhash_wait);
409 /* Don't inline this cruft. There are some nice properties to
410 * exploit here: the BSD API does not allow a listening TCP
411 * to specify the remote port nor the remote address for the
412 * connection. So always assume those are both wildcarded
413 * during the search since they can never be otherwise.
415 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
416 unsigned short hnum, int dif)
418 struct sock *result = NULL, *sk;
419 struct hlist_node *node;
423 sk_for_each(sk, node, head) {
424 struct inet_opt *inet = inet_sk(sk);
426 if (inet->num == hnum && !ipv6_only_sock(sk)) {
427 __u32 rcv_saddr = inet->rcv_saddr;
429 score = (sk->sk_family == PF_INET ? 1 : 0);
431 if (rcv_saddr != daddr)
435 if (sk->sk_bound_dev_if) {
436 if (sk->sk_bound_dev_if != dif)
442 if (score > hiscore) {
451 /* Optimize the common listener case. */
452 struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
454 struct sock *sk = NULL;
455 struct hlist_head *head;
457 read_lock(&tcp_lhash_lock);
458 head = &tcp_listening_hash[tcp_lhashfn(hnum)];
459 if (!hlist_empty(head)) {
460 struct inet_opt *inet = inet_sk((sk = __sk_head(head)));
461 if (inet->num == hnum && !sk->sk_node.next &&
462 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
463 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
464 !sk->sk_bound_dev_if)
466 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
472 read_unlock(&tcp_lhash_lock);
476 EXPORT_SYMBOL_GPL(tcp_v4_lookup_listener);
478 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
479 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
481 * Local BH must be disabled here.
484 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
488 struct tcp_ehash_bucket *head;
489 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
490 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
492 struct hlist_node *node;
493 /* Optimize here for a direct hit; only listening connections can
494 * have wildcards anyway.
496 int hash = tcp_hashfn(daddr, hnum, saddr, sport);
497 head = &tcp_ehash[hash];
498 read_lock(&head->lock);
499 sk_for_each(sk, node, &head->chain) {
500 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
501 goto hit; /* You sunk my battleship! */
504 /* Must check for a TIME_WAIT'er before going to listener hash. */
505 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
506 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
511 read_unlock(&head->lock);
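/* Layout note (matching the changelog at the top of the file): the
 * established hash table is 2 * tcp_ehash_size buckets; slots
 * [0, tcp_ehash_size) hold established sockets and slots
 * [tcp_ehash_size, 2*tcp_ehash_size) hold the TIME_WAIT buckets for the
 * same hash value, hence the "head + tcp_ehash_size" above.
 */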
518 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
519 u32 daddr, u16 hnum, int dif)
521 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
524 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
527 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
533 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
539 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
541 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
543 return secure_tcp_sequence_number(skb->nh.iph->daddr,
549 /* called with local bh disabled */
550 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
551 struct tcp_tw_bucket **twp)
553 struct inet_opt *inet = inet_sk(sk);
554 u32 daddr = inet->rcv_saddr;
555 u32 saddr = inet->daddr;
556 int dif = sk->sk_bound_dev_if;
557 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
558 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
559 int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
560 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
562 struct hlist_node *node;
563 struct tcp_tw_bucket *tw;
565 write_lock(&head->lock);
567 /* Check TIME-WAIT sockets first. */
568 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
569 tw = (struct tcp_tw_bucket *)sk2;
571 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
572 struct tcp_opt *tp = tcp_sk(sk);
574 /* With PAWS, it is safe from the viewpoint
575 of data integrity. Even without PAWS it
576 is safe provided the sequence spaces do not
577 overlap, i.e. at data rates <= 80Mbit/sec.
579 Actually, the idea is close to VJ's one,
580 only the timestamp cache is held not per host
581 but per port pair, and the TW bucket is used
584 If the TW bucket has already been destroyed, we
585 fall back to VJ's scheme and use the initial
586 timestamp retrieved from the peer table.
588 if (tw->tw_ts_recent_stamp &&
589 (!twp || (sysctl_tcp_tw_reuse &&
591 tw->tw_ts_recent_stamp > 1))) {
593 tw->tw_snd_nxt + 65535 + 2) == 0)
595 tp->ts_recent = tw->tw_ts_recent;
596 tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
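/* Why "+ 65535 + 2": when recycling the TIME-WAIT bucket, the new
 * connection starts its sequence numbers a full maximum (unscaled) window
 * plus a little slack beyond the old connection's snd_nxt, so segments of
 * the new incarnation can never fall inside sequence space the old one may
 * still have outstanding. The "== 0" test above only guards against handing
 * out an initial write_seq of zero.
 */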
605 /* And established part... */
606 sk_for_each(sk2, node, &head->chain) {
607 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
612 /* Must record num and sport now. Otherwise we will see
613 * a socket with a funny identity in the hash table. */
615 inet->sport = htons(lport);
616 sk->sk_hashent = hash;
617 BUG_TRAP(sk_unhashed(sk));
618 __sk_add_node(sk, &head->chain);
619 sock_prot_inc_use(sk->sk_prot);
620 write_unlock(&head->lock);
624 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
626 /* Silly. Should hash-dance instead... */
627 tcp_tw_deschedule(tw);
628 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
636 write_unlock(&head->lock);
637 return -EADDRNOTAVAIL;
641 * Bind a port for a connect operation and hash it.
643 static int tcp_v4_hash_connect(struct sock *sk)
645 unsigned short snum = inet_sk(sk)->num;
646 struct tcp_bind_hashbucket *head;
647 struct tcp_bind_bucket *tb;
652 int low = sysctl_local_port_range[0];
653 int high = sysctl_local_port_range[1];
654 int remaining = (high - low) + 1;
655 struct hlist_node *node;
656 struct tcp_tw_bucket *tw = NULL;
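/* Overview of the algorithm below: if the socket has no local port yet
 * (snum == 0), walk the ephemeral range starting from tcp_port_rover and,
 * for each candidate port, either take an unused bind bucket or let
 * __tcp_v4_check_established() prove the full 4-tuple unique (possibly
 * recycling a matching TIME-WAIT bucket). If a port is already bound, only
 * the uniqueness check against the established/TIME-WAIT hash is needed.
 */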
660 /* TODO. Actually it is not such a bad idea to remove
661 * tcp_portalloc_lock before the next submission to Linus.
662 * As soon as we touch this place at all, it is time to think.
664 * Right now it protects a single _advisory_ variable, tcp_port_rover,
665 * hence it is mostly useless.
666 * The code will work nicely if we just delete it, but
667 * I am afraid that in the contended case it will work no better or
668 * even worse: another cpu will just hit the same bucket
670 * So some cpu salt could remove both the contention and the
671 * memory pingpong. Any ideas how to do this in a nice way?
673 spin_lock(&tcp_portalloc_lock);
674 rover = tcp_port_rover;
678 if ((rover < low) || (rover > high))
680 head = &tcp_bhash[tcp_bhashfn(rover)];
681 spin_lock(&head->lock);
683 /* Does not bother with rcv_saddr checks,
684 * because the established check is already
687 tb_for_each(tb, node, &head->chain) {
688 if (tb->port == rover) {
689 BUG_TRAP(!hlist_empty(&tb->owners));
690 if (tb->fastreuse >= 0)
692 if (!__tcp_v4_check_established(sk,
700 tb = tcp_bucket_create(head, rover);
702 spin_unlock(&head->lock);
709 spin_unlock(&head->lock);
710 } while (--remaining > 0);
711 tcp_port_rover = rover;
712 spin_unlock(&tcp_portalloc_lock);
716 return -EADDRNOTAVAIL;
719 /* All locks still held and bhs disabled */
720 tcp_port_rover = rover;
721 spin_unlock(&tcp_portalloc_lock);
723 tcp_bind_hash(sk, tb, rover);
724 if (sk_unhashed(sk)) {
725 inet_sk(sk)->sport = htons(rover);
726 __tcp_v4_hash(sk, 0);
728 spin_unlock(&head->lock);
731 tcp_tw_deschedule(tw);
739 head = &tcp_bhash[tcp_bhashfn(snum)];
740 tb = tcp_sk(sk)->bind_hash;
741 spin_lock_bh(&head->lock);
742 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
743 __tcp_v4_hash(sk, 0);
744 spin_unlock_bh(&head->lock);
747 spin_unlock(&head->lock);
748 /* No definite answer... Walk the established hash table */
749 ret = __tcp_v4_check_established(sk, snum, NULL);
756 /* This will initiate an outgoing connection. */
757 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
759 struct inet_opt *inet = inet_sk(sk);
760 struct tcp_opt *tp = tcp_sk(sk);
761 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
767 if (addr_len < sizeof(struct sockaddr_in))
770 if (usin->sin_family != AF_INET)
771 return -EAFNOSUPPORT;
773 nexthop = daddr = usin->sin_addr.s_addr;
774 if (inet->opt && inet->opt->srr) {
777 nexthop = inet->opt->faddr;
780 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
781 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
783 inet->sport, usin->sin_port, sk);
787 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
792 if (!inet->opt || !inet->opt->srr)
796 inet->saddr = rt->rt_src;
797 inet->rcv_saddr = inet->saddr;
799 if (tp->ts_recent_stamp && inet->daddr != daddr) {
800 /* Reset inherited state */
802 tp->ts_recent_stamp = 0;
806 if (sysctl_tcp_tw_recycle &&
807 !tp->ts_recent_stamp && rt->rt_dst == daddr) {
808 struct inet_peer *peer = rt_get_peer(rt);
810 /* VJ's idea. We save the last timestamp seen from
811 * the destination in the peer table when entering TIME-WAIT state,
812 * and initialize ts_recent from it when trying a new connection.
815 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
816 tp->ts_recent_stamp = peer->tcp_ts_stamp;
817 tp->ts_recent = peer->tcp_ts;
821 inet->dport = usin->sin_port;
824 tp->ext_header_len = 0;
826 tp->ext_header_len = inet->opt->optlen;
830 /* Socket identity is still unknown (sport may be zero).
831 * However we set the state to SYN-SENT and, without releasing the socket
832 * lock, select a source port, enter ourselves into the hash tables and
833 * complete initialization after this.
835 tcp_set_state(sk, TCP_SYN_SENT);
836 err = tcp_v4_hash_connect(sk);
840 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
844 /* OK, now commit destination to socket. */
845 __sk_dst_set(sk, &rt->u.dst);
846 tcp_v4_setup_caps(sk, &rt->u.dst);
847 tp->ext2_header_len = rt->u.dst.header_len;
850 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
855 inet->id = tp->write_seq ^ jiffies;
857 err = tcp_connect(sk);
865 /* This unhashes the socket and releases the local port, if necessary. */
866 tcp_set_state(sk, TCP_CLOSE);
868 sk->sk_route_caps = 0;
873 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
875 return ((struct rtable *)skb->dst)->rt_iif;
878 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
880 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
883 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
884 struct open_request ***prevp,
886 __u32 raddr, __u32 laddr)
888 struct tcp_listen_opt *lopt = tp->listen_opt;
889 struct open_request *req, **prev;
891 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
892 (req = *prev) != NULL;
893 prev = &req->dl_next) {
894 if (req->rmt_port == rport &&
895 req->af.v4_req.rmt_addr == raddr &&
896 req->af.v4_req.loc_addr == laddr &&
897 TCP_INET_FAMILY(req->class->family)) {
907 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
909 struct tcp_opt *tp = tcp_sk(sk);
910 struct tcp_listen_opt *lopt = tp->listen_opt;
911 u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
913 req->expires = jiffies + TCP_TIMEOUT_INIT;
916 req->dl_next = lopt->syn_table[h];
918 write_lock(&tp->syn_wait_lock);
919 lopt->syn_table[h] = req;
920 write_unlock(&tp->syn_wait_lock);
922 #ifdef CONFIG_ACCEPT_QUEUES
923 tcp_synq_added(sk, req);
931 * This routine does path MTU discovery as defined in RFC 1191.
933 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
936 struct dst_entry *dst;
937 struct inet_opt *inet = inet_sk(sk);
938 struct tcp_opt *tp = tcp_sk(sk);
940 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
941 * sent out by Linux are always < 576 bytes, so they should go through
944 if (sk->sk_state == TCP_LISTEN)
947 /* We don't check in the dst entry whether pmtu discovery is forbidden
948 * on this route. We just assume that no packet-too-big packets
949 * are sent back when pmtu discovery is not active.
950 * There is a small race when the user changes this flag in the
951 * route, but I think that's acceptable.
953 if ((dst = __sk_dst_check(sk, 0)) == NULL)
956 dst->ops->update_pmtu(dst, mtu);
958 /* Something is about to go wrong... Remember the soft error
959 * in case this connection will not be able to recover.
961 if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
962 sk->sk_err_soft = EMSGSIZE;
966 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
967 tp->pmtu_cookie > mtu) {
968 tcp_sync_mss(sk, mtu);
970 /* Resend the TCP packet because it's
971 * clear that the old packet has been
972 * dropped. This is the new "fast" path mtu
975 tcp_simple_retransmit(sk);
976 } /* else let the usual retransmit timer handle it */
980 * This routine is called by the ICMP module when it gets some
981 * sort of error condition. If err < 0 then the socket should
982 * be closed and the error returned to the user. If err > 0
983 * it's just the icmp type << 8 | icmp code. After adjustment
984 * header points to the first 8 bytes of the tcp header. We need
985 * to find the appropriate port.
987 * The locking strategy used here is very "optimistic". When
988 * someone else accesses the socket the ICMP is just dropped
989 * and for some paths there is no check at all.
990 * A more general error queue to queue errors for later handling
991 * is probably better.
995 void tcp_v4_err(struct sk_buff *skb, u32 info)
997 struct iphdr *iph = (struct iphdr *)skb->data;
998 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
1000 struct inet_opt *inet;
1001 int type = skb->h.icmph->type;
1002 int code = skb->h.icmph->code;
1007 if (skb->len < (iph->ihl << 2) + 8) {
1008 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1012 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
1013 th->source, tcp_v4_iif(skb));
1015 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1018 if (sk->sk_state == TCP_TIME_WAIT) {
1019 tcp_tw_put((struct tcp_tw_bucket *)sk);
1024 /* If too many ICMPs get dropped on busy
1025 * servers this needs to be solved differently.
1027 if (sock_owned_by_user(sk))
1028 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1030 if (sk->sk_state == TCP_CLOSE)
1034 seq = ntohl(th->seq);
1035 if (sk->sk_state != TCP_LISTEN &&
1036 !between(seq, tp->snd_una, tp->snd_nxt)) {
1037 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1042 case ICMP_SOURCE_QUENCH:
1043 /* Just silently ignore these. */
1045 case ICMP_PARAMETERPROB:
1048 case ICMP_DEST_UNREACH:
1049 if (code > NR_ICMP_UNREACH)
1052 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1053 if (!sock_owned_by_user(sk))
1054 do_pmtu_discovery(sk, iph, info);
1058 err = icmp_err_convert[code].errno;
1060 case ICMP_TIME_EXCEEDED:
1067 switch (sk->sk_state) {
1068 struct open_request *req, **prev;
1070 if (sock_owned_by_user(sk))
1073 req = tcp_v4_search_req(tp, &prev, th->dest,
1074 iph->daddr, iph->saddr);
1078 /* ICMPs are not backlogged, hence we cannot get
1079 an established socket here.
1083 if (seq != req->snt_isn) {
1084 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1089 * Still in SYN_RECV, just remove it silently.
1090 * There is no good way to pass the error to the newly
1091 * created socket, and POSIX does not want network
1092 * errors returned from accept().
1094 tcp_synq_drop(sk, req, prev);
1098 case TCP_SYN_RECV: /* Cannot happen.
1099 It can, e.g., if SYNs crossed.
1101 if (!sock_owned_by_user(sk)) {
1102 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1105 sk->sk_error_report(sk);
1109 sk->sk_err_soft = err;
1114 /* If we've already connected we will keep trying
1115 * until we time out, or the user gives up.
1117 * RFC 1122 4.2.3.9 allows us to consider as hard errors
1118 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1119 * but it is obsoleted by pmtu discovery).
1121 * Note that on the modern internet, where routing is unreliable
1122 * and broken firewalls sit in every dark corner sending random
1123 * errors ordered by their masters, even these two messages finally lose
1124 * their original sense (even Linux sends invalid PORT_UNREACHs).
1126 * Now we are in compliance with the RFCs.
1131 if (!sock_owned_by_user(sk) && inet->recverr) {
1133 sk->sk_error_report(sk);
1134 } else { /* Only an error on timeout */
1135 sk->sk_err_soft = err;
1143 /* This routine computes an IPv4 TCP checksum. */
1144 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1145 struct sk_buff *skb)
1147 struct inet_opt *inet = inet_sk(sk);
1149 if (skb->ip_summed == CHECKSUM_HW) {
1150 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1151 skb->csum = offsetof(struct tcphdr, check);
1153 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1154 csum_partial((char *)th,
1161 * This routine will send an RST to the other tcp.
1163 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
1165 * Answer: if a packet caused an RST, it is not for a socket
1166 * existing in our system; if it is matched to a socket,
1167 * it is just a duplicate segment or a bug in the other side's TCP.
1168 * So we build the reply based only on the parameters
1169 * that arrived with the segment.
1170 * Exception: precedence violation. We do not implement it in any case.
1173 static void tcp_v4_send_reset(struct sk_buff *skb)
1175 struct tcphdr *th = skb->h.th;
1177 struct ip_reply_arg arg;
1179 /* Never send a reset in response to a reset. */
1183 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1186 /* Swap the send and the receive. */
1187 memset(&rth, 0, sizeof(struct tcphdr));
1188 rth.dest = th->source;
1189 rth.source = th->dest;
1190 rth.doff = sizeof(struct tcphdr) / 4;
1194 rth.seq = th->ack_seq;
1197 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1198 skb->len - (th->doff << 2));
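/* This follows the RFC 793 reset generation rules: if the offending segment
 * carried an ACK, the RST reuses that acknowledgment number as its own
 * sequence number (rth.seq above); otherwise the RST has sequence number
 * zero, sets ACK, and acknowledges exactly the sequence space the segment
 * consumed (its payload length plus one for SYN and one for FIN), which is
 * the rth.ack_seq computation above.
 */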
1201 memset(&arg, 0, sizeof arg);
1202 arg.iov[0].iov_base = (unsigned char *)&rth;
1203 arg.iov[0].iov_len = sizeof rth;
1204 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1205 skb->nh.iph->saddr, /*XXX*/
1206 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1207 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1209 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1211 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1212 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1215 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1216 outside socket context, is certainly ugly. What can I do?
1219 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1222 struct tcphdr *th = skb->h.th;
1227 struct ip_reply_arg arg;
1229 memset(&rep.th, 0, sizeof(struct tcphdr));
1230 memset(&arg, 0, sizeof arg);
1232 arg.iov[0].iov_base = (unsigned char *)&rep;
1233 arg.iov[0].iov_len = sizeof(rep.th);
1235 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1236 (TCPOPT_TIMESTAMP << 8) |
1238 rep.tsopt[1] = htonl(tcp_time_stamp);
1239 rep.tsopt[2] = htonl(ts);
1240 arg.iov[0].iov_len = sizeof(rep);
1243 /* Swap the send and the receive. */
1244 rep.th.dest = th->source;
1245 rep.th.source = th->dest;
1246 rep.th.doff = arg.iov[0].iov_len / 4;
1247 rep.th.seq = htonl(seq);
1248 rep.th.ack_seq = htonl(ack);
1250 rep.th.window = htons(win);
1252 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1253 skb->nh.iph->saddr, /*XXX*/
1254 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1255 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1257 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1259 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1262 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1264 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1266 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1267 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1272 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1274 tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1278 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1279 struct open_request *req)
1282 struct ip_options *opt = req->af.v4_req.opt;
1283 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1285 { .daddr = ((opt && opt->srr) ?
1287 req->af.v4_req.rmt_addr),
1288 .saddr = req->af.v4_req.loc_addr,
1289 .tos = RT_CONN_FLAGS(sk) } },
1290 .proto = IPPROTO_TCP,
1292 { .sport = inet_sk(sk)->sport,
1293 .dport = req->rmt_port } } };
1295 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1296 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1299 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1301 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1308 * Send a SYN-ACK after having received an ACK.
1309 * This still operates on an open_request only, not on a big
1312 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1313 struct dst_entry *dst)
1316 struct sk_buff * skb;
1318 /* First, grab a route. */
1319 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1322 skb = tcp_make_synack(sk, dst, req);
1325 struct tcphdr *th = skb->h.th;
1327 th->check = tcp_v4_check(th, skb->len,
1328 req->af.v4_req.loc_addr,
1329 req->af.v4_req.rmt_addr,
1330 csum_partial((char *)th, skb->len,
1333 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1334 req->af.v4_req.rmt_addr,
1335 req->af.v4_req.opt);
1336 if (err == NET_XMIT_CN)
1346 * IPv4 open_request destructor.
1348 static void tcp_v4_or_free(struct open_request *req)
1350 if (req->af.v4_req.opt)
1351 kfree(req->af.v4_req.opt);
1354 static inline void syn_flood_warning(struct sk_buff *skb)
1356 static unsigned long warntime;
1358 if (time_after(jiffies, (warntime + HZ * 60))) {
1361 "possible SYN flooding on port %d. Sending cookies.\n",
1362 ntohs(skb->h.th->dest));
1367 * Save and compile IPv4 options into the open_request if needed.
1369 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1370 struct sk_buff *skb)
1372 struct ip_options *opt = &(IPCB(skb)->opt);
1373 struct ip_options *dopt = NULL;
1375 if (opt && opt->optlen) {
1376 int opt_size = optlength(opt);
1377 dopt = kmalloc(opt_size, GFP_ATOMIC);
1379 if (ip_options_echo(dopt, skb)) {
1389 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1390 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1391 * It would be better to replace it with a global counter for all sockets,
1392 * but then some measure against one socket starving all other sockets
1395 * It was 128 by default. Experiments with real servers show that
1396 * it is absolutely not enough even at 100 conn/sec. 256 cures most
1397 * of the problems. This value is adjusted to 128 for very small machines
1398 * (<= 32MB of memory) and to 1024 on normal or better ones (>= 256MB).
1399 * Further increasing it requires changing the hash table size.
1401 int sysctl_max_syn_backlog = 256;
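/* This is the value behind the net.ipv4.tcp_max_syn_backlog sysctl
 * (/proc/sys/net/ipv4/tcp_max_syn_backlog), so busy servers can raise it at
 * run time, e.g. "sysctl -w net.ipv4.tcp_max_syn_backlog=1024", subject to
 * the hash-table caveat mentioned above.
 */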
1403 struct or_calltable or_ipv4 = {
1405 .rtx_syn_ack = tcp_v4_send_synack,
1406 .send_ack = tcp_v4_or_send_ack,
1407 .destructor = tcp_v4_or_free,
1408 .send_reset = tcp_v4_send_reset,
1411 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1414 struct open_request *req;
1415 __u32 saddr = skb->nh.iph->saddr;
1416 __u32 daddr = skb->nh.iph->daddr;
1417 __u32 isn = TCP_SKB_CB(skb)->when;
1418 struct dst_entry *dst = NULL;
1419 #ifdef CONFIG_ACCEPT_QUEUES
1422 #ifdef CONFIG_SYN_COOKIES
1423 int want_cookie = 0;
1425 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1428 /* Never answer SYNs sent to broadcast or multicast */
1429 if (((struct rtable *)skb->dst)->rt_flags &
1430 (RTCF_BROADCAST | RTCF_MULTICAST))
1433 /* TW buckets are converted to open requests without
1434 * limitation; they conserve resources and the peer is
1435 * evidently a real one.
1437 if (tcp_synq_is_full(sk) && !isn) {
1438 #ifdef CONFIG_SYN_COOKIES
1439 if (sysctl_tcp_syncookies) {
1446 #ifdef CONFIG_ACCEPT_QUEUES
1447 class = (skb->nfmark <= 0) ? 0 :
1448 ((skb->nfmark >= NUM_ACCEPT_QUEUES) ? 0: skb->nfmark);
1450 * Accept only if the class has shares set, or if the default class
1451 * (i.e. class 0) has shares.
1453 if (!(tcp_sk(sk)->acceptq[class].aq_ratio)) {
1454 if (tcp_sk(sk)->acceptq[0].aq_ratio)
1461 /* Accept backlog is full. If we have already queued enough
1462 * warm entries in the syn queue, drop the request. It is better than
1463 * clogging the syn queue with openreqs with exponentially increasing
1466 #ifdef CONFIG_ACCEPT_QUEUES
1467 if (sk_acceptq_is_full(sk, class) && tcp_synq_young(sk, class) > 1)
1469 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1473 req = tcp_openreq_alloc();
1477 tcp_clear_options(&tp);
1479 tp.user_mss = tcp_sk(sk)->user_mss;
1481 tcp_parse_options(skb, &tp, 0);
1484 tcp_clear_options(&tp);
1488 if (tp.saw_tstamp && !tp.rcv_tsval) {
1489 /* Some OSes (unknown ones, but I see them on a web server which
1490 * contains information interesting only for Windows
1491 * users) do not send their timestamp in the SYN. It is the easy case:
1492 * we simply do not advertise TS support.
1497 tp.tstamp_ok = tp.saw_tstamp;
1499 tcp_openreq_init(req, &tp, skb);
1500 #ifdef CONFIG_ACCEPT_QUEUES
1501 req->acceptq_class = class;
1502 req->acceptq_time_stamp = jiffies;
1504 req->af.v4_req.loc_addr = daddr;
1505 req->af.v4_req.rmt_addr = saddr;
1506 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1507 req->class = &or_ipv4;
1509 TCP_ECN_create_request(req, skb->h.th);
1512 #ifdef CONFIG_SYN_COOKIES
1513 syn_flood_warning(skb);
1515 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
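/* Sketch of what the cookie buys us: cookie_v4_init_sequence() folds the
 * connection 4-tuple, a coarse time counter and an encoding of the peer's
 * MSS into the ISN itself, so when the final ACK comes back (echoing
 * ISN + 1) the connection can be validated and rebuilt without having kept
 * an open_request around for it.
 */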
1517 struct inet_peer *peer = NULL;
1519 /* VJ's idea. We save the last timestamp seen
1520 * from the destination in the peer table when entering
1521 * TIME-WAIT state, and check against it before
1522 * accepting a new connection request.
1524 * If "isn" is not zero, this request hit an alive
1525 * timewait bucket, so all the necessary checks
1526 * are made in the function processing the timewait state.
1528 if (tp.saw_tstamp &&
1529 sysctl_tcp_tw_recycle &&
1530 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1531 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1532 peer->v4daddr == saddr) {
1533 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1534 (s32)(peer->tcp_ts - req->ts_recent) >
1536 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1541 /* Kill the following clause, if you dislike this way. */
1542 else if (!sysctl_tcp_syncookies &&
1543 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1544 (sysctl_max_syn_backlog >> 2)) &&
1545 (!peer || !peer->tcp_ts_stamp) &&
1546 (!dst || !dst_metric(dst, RTAX_RTT))) {
1547 /* Without syncookies the last quarter of the
1548 * backlog is filled with destinations
1549 * proven to be alive.
1550 * It means that we continue to communicate
1551 * with destinations already remembered
1552 * at the moment of the synflood.
1554 NETDEBUG(if (net_ratelimit()) \
1555 printk(KERN_DEBUG "TCP: drop open "
1556 "request from %u.%u."
1559 ntohs(skb->h.th->source)));
1564 isn = tcp_v4_init_sequence(sk, skb);
1568 if (tcp_v4_send_synack(sk, req, dst))
1572 tcp_openreq_free(req);
1574 tcp_v4_synq_add(sk, req);
1579 tcp_openreq_free(req);
1581 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1587 * The three way handshake has completed - we got a valid synack -
1588 * now create the new socket.
1590 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1591 struct open_request *req,
1592 struct dst_entry *dst)
1594 struct inet_opt *newinet;
1595 struct tcp_opt *newtp;
1598 #ifdef CONFIG_ACCEPT_QUEUES
1599 if (sk_acceptq_is_full(sk, req->acceptq_class))
1601 if (sk_acceptq_is_full(sk))
1605 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1608 newsk = tcp_create_openreq_child(sk, req, skb);
1612 newsk->sk_dst_cache = dst;
1613 tcp_v4_setup_caps(newsk, dst);
1615 newtp = tcp_sk(newsk);
1616 newinet = inet_sk(newsk);
1617 newinet->daddr = req->af.v4_req.rmt_addr;
1618 newinet->rcv_saddr = req->af.v4_req.loc_addr;
1619 newinet->saddr = req->af.v4_req.loc_addr;
1620 newinet->opt = req->af.v4_req.opt;
1621 req->af.v4_req.opt = NULL;
1622 newinet->mc_index = tcp_v4_iif(skb);
1623 newinet->mc_ttl = skb->nh.iph->ttl;
1624 newtp->ext_header_len = 0;
1626 newtp->ext_header_len = newinet->opt->optlen;
1627 newtp->ext2_header_len = dst->header_len;
1628 newinet->id = newtp->write_seq ^ jiffies;
1630 tcp_sync_mss(newsk, dst_pmtu(dst));
1631 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1632 tcp_initialize_rcv_mss(newsk);
1634 __tcp_v4_hash(newsk, 0);
1635 __tcp_inherit_port(sk, newsk);
1640 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1642 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1647 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1649 struct tcphdr *th = skb->h.th;
1650 struct iphdr *iph = skb->nh.iph;
1651 struct tcp_opt *tp = tcp_sk(sk);
1653 struct open_request **prev;
1654 /* Find possible connection requests. */
1655 struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1656 iph->saddr, iph->daddr);
1658 return tcp_check_req(sk, skb, req, prev);
1660 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1667 if (nsk->sk_state != TCP_TIME_WAIT) {
1671 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1675 #ifdef CONFIG_SYN_COOKIES
1676 if (!th->rst && !th->syn && th->ack)
1677 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1682 static int tcp_v4_checksum_init(struct sk_buff *skb)
1684 if (skb->ip_summed == CHECKSUM_HW) {
1685 skb->ip_summed = CHECKSUM_UNNECESSARY;
1686 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1687 skb->nh.iph->daddr, skb->csum))
1690 NETDEBUG(if (net_ratelimit())
1691 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1692 skb->ip_summed = CHECKSUM_NONE;
1694 if (skb->len <= 76) {
1695 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1697 skb_checksum(skb, 0, skb->len, 0)))
1699 skb->ip_summed = CHECKSUM_UNNECESSARY;
1701 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1703 skb->nh.iph->daddr, 0);
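/* Summary of the three outcomes above: if a hardware-computed sum checks out
 * against the pseudo-header, the packet is marked CHECKSUM_UNNECESSARY; short
 * packets (<= 76 bytes) are cheap enough to verify completely right here;
 * for everything else only the pseudo-header sum is seeded into skb->csum so
 * the final verification can be finished later (e.g. during the copy to user
 * space).
 */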
1709 /* The socket must have its spinlock held when we get
1712 * We have a potential double-lock case here, so even when
1713 * doing backlog processing we use the BH locking scheme.
1714 * This is because we cannot sleep with the original spinlock
1717 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1719 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1720 TCP_CHECK_TIMER(sk);
1721 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1723 TCP_CHECK_TIMER(sk);
1727 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1730 if (sk->sk_state == TCP_LISTEN) {
1731 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1736 if (tcp_child_process(sk, nsk, skb))
1742 TCP_CHECK_TIMER(sk);
1743 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1745 TCP_CHECK_TIMER(sk);
1749 tcp_v4_send_reset(skb);
1752 /* Be careful here. If this function gets more complicated and
1753 * gcc suffers from register pressure on the x86, sk (in %ebx)
1754 * might be destroyed here. This current version compiles correctly,
1755 * but you have been warned.
1760 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1768 int tcp_v4_rcv(struct sk_buff *skb)
1774 if (skb->pkt_type != PACKET_HOST)
1777 /* Count it even if it's bad */
1778 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1780 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1785 if (th->doff < sizeof(struct tcphdr) / 4)
1787 if (!pskb_may_pull(skb, th->doff * 4))
1790 /* An explanation is required here, I think.
1791 * Packet length and doff are validated by header prediction,
1792 * provided the case of th->doff==0 is eliminated.
1793 * So, we defer the checks. */
1794 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1795 tcp_v4_checksum_init(skb) < 0))
1799 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1800 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1801 skb->len - th->doff * 4);
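/* SYN and FIN each consume one unit of sequence space, which is why they are
 * added to the payload length when computing end_seq for this segment.
 */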
1802 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1803 TCP_SKB_CB(skb)->when = 0;
1804 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1805 TCP_SKB_CB(skb)->sacked = 0;
1807 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1808 skb->nh.iph->daddr, ntohs(th->dest),
1815 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
1816 /* Silently drop if VNET is active and the context is not
1817 * entitled to read the packet.
1820 /* Transfer ownership of reusable TIME_WAIT buckets to
1821 * whoever VNET decided should own the packet.
1823 if (sk->sk_state == TCP_TIME_WAIT)
1824 sk->sk_xid = skb->xid;
1826 if ((int) sk->sk_xid > 0 && sk->sk_xid != skb->xid)
1831 if (sk->sk_state == TCP_TIME_WAIT)
1834 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1835 goto discard_and_relse;
1837 if (sk_filter(sk, skb, 0))
1838 goto discard_and_relse;
1844 if (!sock_owned_by_user(sk)) {
1845 if (!tcp_prequeue(sk, skb))
1846 ret = tcp_v4_do_rcv(sk, skb);
1848 sk_add_backlog(sk, skb);
1856 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1859 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1861 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1862 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
1863 } else if (vnet_active && skb->sk) {
1864 /* VNET: Suppress RST if the port was bound to a (presumably raw) socket */
1867 tcp_v4_send_reset(skb);
1871 /* Discard frame. */
1880 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1881 tcp_tw_put((struct tcp_tw_bucket *) sk);
1885 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1886 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1887 tcp_tw_put((struct tcp_tw_bucket *) sk);
1890 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1891 skb, th, skb->len)) {
1893 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1897 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1898 tcp_tw_put((struct tcp_tw_bucket *)sk);
1902 /* Fall through to ACK */
1905 tcp_v4_timewait_ack(sk, skb);
1909 case TCP_TW_SUCCESS:;
1914 /* With per-bucket locks this operation is not atomic, so
1915 * this version is not worse.
1917 static void __tcp_v4_rehash(struct sock *sk)
1919 sk->sk_prot->unhash(sk);
1920 sk->sk_prot->hash(sk);
1923 static int tcp_v4_reselect_saddr(struct sock *sk)
1925 struct inet_opt *inet = inet_sk(sk);
1928 __u32 old_saddr = inet->saddr;
1930 __u32 daddr = inet->daddr;
1932 if (inet->opt && inet->opt->srr)
1933 daddr = inet->opt->faddr;
1935 /* Query new route. */
1936 err = ip_route_connect(&rt, daddr, 0,
1937 RT_TOS(inet->tos) | sk->sk_localroute,
1938 sk->sk_bound_dev_if,
1940 inet->sport, inet->dport, sk);
1944 __sk_dst_set(sk, &rt->u.dst);
1945 tcp_v4_setup_caps(sk, &rt->u.dst);
1946 tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1948 new_saddr = rt->rt_src;
1950 if (new_saddr == old_saddr)
1953 if (sysctl_ip_dynaddr > 1) {
1954 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1955 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1957 NIPQUAD(new_saddr));
1960 inet->saddr = new_saddr;
1961 inet->rcv_saddr = new_saddr;
1963 /* XXX The only ugly spot where we need to
1964 * XXX really change the socket's identity after
1965 * XXX it has entered the hashes. -DaveM
1967 * Besides that, it does not check for connection
1968 * uniqueness. Wait for trouble.
1970 __tcp_v4_rehash(sk);
1974 int tcp_v4_rebuild_header(struct sock *sk)
1976 struct inet_opt *inet = inet_sk(sk);
1977 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1981 /* Route is OK, nothing to do. */
1986 daddr = inet->daddr;
1987 if (inet->opt && inet->opt->srr)
1988 daddr = inet->opt->faddr;
1991 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1994 .saddr = inet->saddr,
1995 .tos = RT_CONN_FLAGS(sk) } },
1996 .proto = IPPROTO_TCP,
1998 { .sport = inet->sport,
1999 .dport = inet->dport } } };
2001 err = ip_route_output_flow(&rt, &fl, sk, 0);
2004 __sk_dst_set(sk, &rt->u.dst);
2005 tcp_v4_setup_caps(sk, &rt->u.dst);
2006 tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
2010 /* Routing failed... */
2011 sk->sk_route_caps = 0;
2013 if (!sysctl_ip_dynaddr ||
2014 sk->sk_state != TCP_SYN_SENT ||
2015 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
2016 (err = tcp_v4_reselect_saddr(sk)) != 0)
2017 sk->sk_err_soft = -err;
2022 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
2024 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
2025 struct inet_opt *inet = inet_sk(sk);
2027 sin->sin_family = AF_INET;
2028 sin->sin_addr.s_addr = inet->daddr;
2029 sin->sin_port = inet->dport;
2032 /* VJ's idea. Save the last timestamp seen from this destination
2033 * and hold it at least for the normal timewait interval, to use for duplicate
2034 * segment detection in subsequent connections before they enter the synchronized
2038 int tcp_v4_remember_stamp(struct sock *sk)
2040 struct inet_opt *inet = inet_sk(sk);
2041 struct tcp_opt *tp = tcp_sk(sk);
2042 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
2043 struct inet_peer *peer = NULL;
2046 if (!rt || rt->rt_dst != inet->daddr) {
2047 peer = inet_getpeer(inet->daddr, 1);
2051 rt_bind_peer(rt, 1);
2056 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
2057 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2058 peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
2059 peer->tcp_ts_stamp = tp->ts_recent_stamp;
2060 peer->tcp_ts = tp->ts_recent;
2070 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2072 struct inet_peer *peer = NULL;
2074 peer = inet_getpeer(tw->tw_daddr, 1);
2077 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2078 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2079 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2080 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2081 peer->tcp_ts = tw->tw_ts_recent;
2090 struct tcp_func ipv4_specific = {
2091 .queue_xmit = ip_queue_xmit,
2092 .send_check = tcp_v4_send_check,
2093 .rebuild_header = tcp_v4_rebuild_header,
2094 .conn_request = tcp_v4_conn_request,
2095 .syn_recv_sock = tcp_v4_syn_recv_sock,
2096 .remember_stamp = tcp_v4_remember_stamp,
2097 .net_header_len = sizeof(struct iphdr),
2098 .setsockopt = ip_setsockopt,
2099 .getsockopt = ip_getsockopt,
2100 .addr2sockaddr = v4_addr2sockaddr,
2101 .sockaddr_len = sizeof(struct sockaddr_in),
2104 /* NOTE: A lot of things are set to zero explicitly by the call to
2105 * sk_alloc(), so they need not be done here.
2107 static int tcp_v4_init_sock(struct sock *sk)
2109 struct tcp_opt *tp = tcp_sk(sk);
2111 skb_queue_head_init(&tp->out_of_order_queue);
2112 tcp_init_xmit_timers(sk);
2113 tcp_prequeue_init(tp);
2115 tp->rto = TCP_TIMEOUT_INIT;
2116 tp->mdev = TCP_TIMEOUT_INIT;
2118 /* So many TCP implementations out there (incorrectly) count the
2119 * initial SYN frame in their delayed-ACK and congestion control
2120 * algorithms that we must have the following bandaid to talk
2121 * efficiently to them. -DaveM
2125 /* See draft-stevens-tcpca-spec-01 for discussion of the
2126 * initialization of these values.
2128 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2129 tp->snd_cwnd_clamp = ~0;
2130 tp->mss_cache_std = tp->mss_cache = 536;
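/* 536 is the RFC 1122 default: every host must accept 576-byte datagrams,
 * and 576 minus 20 bytes of IP header and 20 bytes of TCP header leaves 536,
 * the MSS we assume until the peer tells us otherwise.
 */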
2132 tp->reordering = sysctl_tcp_reordering;
2134 sk->sk_state = TCP_CLOSE;
2136 sk->sk_write_space = sk_stream_write_space;
2137 sk->sk_use_write_queue = 1;
2139 tp->af_specific = &ipv4_specific;
2141 sk->sk_sndbuf = sysctl_tcp_wmem[1];
2142 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2144 atomic_inc(&tcp_sockets_allocated);
2149 int tcp_v4_destroy_sock(struct sock *sk)
2151 struct tcp_opt *tp = tcp_sk(sk);
2153 tcp_clear_xmit_timers(sk);
2155 /* Clean up the write buffer. */
2156 sk_stream_writequeue_purge(sk);
2158 /* Cleans up our, hopefully empty, out_of_order_queue. */
2159 __skb_queue_purge(&tp->out_of_order_queue);
2161 /* Clean the prequeue; it really must be empty */
2162 __skb_queue_purge(&tp->ucopy.prequeue);
2164 /* Clean up a referenced TCP bind bucket. */
2169 * If sendmsg cached page exists, toss it.
2171 if (sk->sk_sndmsg_page) {
2172 __free_page(sk->sk_sndmsg_page);
2173 sk->sk_sndmsg_page = NULL;
2176 atomic_dec(&tcp_sockets_allocated);
2181 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2183 #ifdef CONFIG_PROC_FS
2184 /* Proc filesystem TCP sock list dumping. */
2186 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2188 return hlist_empty(head) ? NULL :
2189 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2192 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2194 return tw->tw_node.next ?
2195 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2198 static void *listening_get_next(struct seq_file *seq, void *cur)
2201 struct hlist_node *node;
2202 struct sock *sk = cur;
2203 struct tcp_iter_state* st = seq->private;
2207 sk = sk_head(&tcp_listening_hash[0]);
2213 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2214 struct open_request *req = cur;
2216 tp = tcp_sk(st->syn_wait_sk);
2220 vxdprintk(VXD_CBIT(net, 6),
2221 "sk,req: %p [#%d] (from %d)", req->sk,
2222 (req->sk)?req->sk->sk_xid:0, current->xid);
2224 !vx_check(req->sk->sk_xid, VX_IDENT|VX_WATCH))
2226 if (req->class->family == st->family) {
2232 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2235 req = tp->listen_opt->syn_table[st->sbucket];
2237 sk = sk_next(st->syn_wait_sk);
2238 st->state = TCP_SEQ_STATE_LISTENING;
2239 read_unlock_bh(&tp->syn_wait_lock);
2242 read_lock_bh(&tp->syn_wait_lock);
2243 if (tp->listen_opt && tp->listen_opt->qlen)
2245 read_unlock_bh(&tp->syn_wait_lock);
2249 sk_for_each_from(sk, node) {
2250 vxdprintk(VXD_CBIT(net, 6), "sk: %p [#%d] (from %d)",
2251 sk, sk->sk_xid, current->xid);
2252 if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2254 if (sk->sk_family == st->family) {
2259 read_lock_bh(&tp->syn_wait_lock);
2260 if (tp->listen_opt && tp->listen_opt->qlen) {
2262 st->uid = sock_i_uid(sk);
2263 st->syn_wait_sk = sk;
2264 st->state = TCP_SEQ_STATE_OPENREQ;
2268 read_unlock_bh(&tp->syn_wait_lock);
2270 if (++st->bucket < TCP_LHTABLE_SIZE) {
2271 sk = sk_head(&tcp_listening_hash[st->bucket]);
2279 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2281 void *rc = listening_get_next(seq, NULL);
2283 while (rc && *pos) {
2284 rc = listening_get_next(seq, rc);
2290 static void *established_get_first(struct seq_file *seq)
2292 struct tcp_iter_state* st = seq->private;
2295 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2297 struct hlist_node *node;
2298 struct tcp_tw_bucket *tw;
2300 read_lock(&tcp_ehash[st->bucket].lock);
2301 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2302 vxdprintk(VXD_CBIT(net, 6),
2303 "sk,egf: %p [#%d] (from %d)",
2304 sk, sk->sk_xid, current->xid);
2305 if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2307 if (sk->sk_family != st->family)
2312 st->state = TCP_SEQ_STATE_TIME_WAIT;
2313 tw_for_each(tw, node,
2314 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2315 vxdprintk(VXD_CBIT(net, 6),
2316 "tw: %p [#%d] (from %d)",
2317 tw, tw->tw_xid, current->xid);
2318 if (!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))
2320 if (tw->tw_family != st->family)
2325 read_unlock(&tcp_ehash[st->bucket].lock);
2326 st->state = TCP_SEQ_STATE_ESTABLISHED;
2332 static void *established_get_next(struct seq_file *seq, void *cur)
2334 struct sock *sk = cur;
2335 struct tcp_tw_bucket *tw;
2336 struct hlist_node *node;
2337 struct tcp_iter_state* st = seq->private;
2341 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2345 while (tw && (tw->tw_family != st->family ||
2346 !vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))) {
2353 read_unlock(&tcp_ehash[st->bucket].lock);
2354 st->state = TCP_SEQ_STATE_ESTABLISHED;
2355 if (++st->bucket < tcp_ehash_size) {
2356 read_lock(&tcp_ehash[st->bucket].lock);
2357 sk = sk_head(&tcp_ehash[st->bucket].chain);
2365 sk_for_each_from(sk, node) {
2366 vxdprintk(VXD_CBIT(net, 6),
2367 "sk,egn: %p [#%d] (from %d)",
2368 sk, sk->sk_xid, current->xid);
2369 if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
2371 if (sk->sk_family == st->family)
2375 st->state = TCP_SEQ_STATE_TIME_WAIT;
2376 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2384 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2386 void *rc = established_get_first(seq);
2389 rc = established_get_next(seq, rc);
2395 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2398 struct tcp_iter_state* st = seq->private;
2401 st->state = TCP_SEQ_STATE_LISTENING;
2402 rc = listening_get_idx(seq, &pos);
2405 tcp_listen_unlock();
2407 st->state = TCP_SEQ_STATE_ESTABLISHED;
2408 rc = established_get_idx(seq, pos);
2414 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2416 struct tcp_iter_state* st = seq->private;
2417 st->state = TCP_SEQ_STATE_LISTENING;
2419 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2422 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2425 struct tcp_iter_state* st;
2427 if (v == SEQ_START_TOKEN) {
2428 rc = tcp_get_idx(seq, 0);
2433 switch (st->state) {
2434 case TCP_SEQ_STATE_OPENREQ:
2435 case TCP_SEQ_STATE_LISTENING:
2436 rc = listening_get_next(seq, v);
2438 tcp_listen_unlock();
2440 st->state = TCP_SEQ_STATE_ESTABLISHED;
2441 rc = established_get_first(seq);
2444 case TCP_SEQ_STATE_ESTABLISHED:
2445 case TCP_SEQ_STATE_TIME_WAIT:
2446 rc = established_get_next(seq, v);
2454 static void tcp_seq_stop(struct seq_file *seq, void *v)
2456 struct tcp_iter_state* st = seq->private;
2458 switch (st->state) {
2459 case TCP_SEQ_STATE_OPENREQ:
2461 struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
2462 read_unlock_bh(&tp->syn_wait_lock);
2464 case TCP_SEQ_STATE_LISTENING:
2465 if (v != SEQ_START_TOKEN)
2466 tcp_listen_unlock();
2468 case TCP_SEQ_STATE_TIME_WAIT:
2469 case TCP_SEQ_STATE_ESTABLISHED:
2471 read_unlock(&tcp_ehash[st->bucket].lock);
2477 static int tcp_seq_open(struct inode *inode, struct file *file)
2479 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2480 struct seq_file *seq;
2481 struct tcp_iter_state *s;
2484 if (unlikely(afinfo == NULL))
2487 s = kmalloc(sizeof(*s), GFP_KERNEL);
2490 memset(s, 0, sizeof(*s));
2491 s->family = afinfo->family;
2492 s->seq_ops.start = tcp_seq_start;
2493 s->seq_ops.next = tcp_seq_next;
2494 s->seq_ops.show = afinfo->seq_show;
2495 s->seq_ops.stop = tcp_seq_stop;
2497 rc = seq_open(file, &s->seq_ops);
2500 seq = file->private_data;
2509 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2512 struct proc_dir_entry *p;
2516 afinfo->seq_fops->owner = afinfo->owner;
2517 afinfo->seq_fops->open = tcp_seq_open;
2518 afinfo->seq_fops->read = seq_read;
2519 afinfo->seq_fops->llseek = seq_lseek;
2520 afinfo->seq_fops->release = seq_release_private;
2522 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2530 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2534 proc_net_remove(afinfo->name);
2535 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2538 static void get_openreq4(struct sock *sk, struct open_request *req,
2539 char *tmpbuf, int i, int uid)
2541 int ttd = req->expires - jiffies;
2543 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2544 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2546 req->af.v4_req.loc_addr,
2547 ntohs(inet_sk(sk)->sport),
2548 req->af.v4_req.rmt_addr,
2549 ntohs(req->rmt_port),
2551 0, 0, /* could print option size, but that is af dependent. */
2552 1, /* timers active (only the expire timer) */
2553 jiffies_to_clock_t(ttd),
2556 0, /* non standard timer */
2557 0, /* open_requests have no inode */
2558 atomic_read(&sk->sk_refcnt),
2562 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2565 unsigned long timer_expires;
2566 struct tcp_opt *tp = tcp_sk(sp);
2567 struct inet_opt *inet = inet_sk(sp);
2568 unsigned int dest = inet->daddr;
2569 unsigned int src = inet->rcv_saddr;
2570 __u16 destp = ntohs(inet->dport);
2571 __u16 srcp = ntohs(inet->sport);
2573 if (tp->pending == TCP_TIME_RETRANS) {
2575 timer_expires = tp->timeout;
2576 } else if (tp->pending == TCP_TIME_PROBE0) {
2578 timer_expires = tp->timeout;
2579 } else if (timer_pending(&sp->sk_timer)) {
2581 timer_expires = sp->sk_timer.expires;
2584 timer_expires = jiffies;
2587 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2588 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2589 i, src, srcp, dest, destp, sp->sk_state,
2590 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2592 jiffies_to_clock_t(timer_expires - jiffies),
2597 atomic_read(&sp->sk_refcnt), sp,
2598 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2600 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2603 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2605 unsigned int dest, src;
2607 int ttd = tw->tw_ttd - jiffies;
2612 dest = tw->tw_daddr;
2613 src = tw->tw_rcv_saddr;
2614 destp = ntohs(tw->tw_dport);
2615 srcp = ntohs(tw->tw_sport);
2617 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2618 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2619 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2620 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2621 atomic_read(&tw->tw_refcnt), tw);
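/* All three printers above emit one fixed-width record per socket in the
 * /proc/net/tcp format: addresses and ports in hexadecimal (addresses in
 * network byte order), then state, tx_queue:rx_queue, timer info, uid and
 * so on, matching the header line printed by tcp4_seq_show() below.
 */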
2626 static int tcp4_seq_show(struct seq_file *seq, void *v)
2628 struct tcp_iter_state* st;
2629 char tmpbuf[TMPSZ + 1];
2631 if (v == SEQ_START_TOKEN) {
2632 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2633 " sl local_address rem_address st tx_queue "
2634 "rx_queue tr tm->when retrnsmt uid timeout "
2640 switch (st->state) {
2641 case TCP_SEQ_STATE_LISTENING:
2642 case TCP_SEQ_STATE_ESTABLISHED:
2643 get_tcp4_sock(v, tmpbuf, st->num);
2645 case TCP_SEQ_STATE_OPENREQ:
2646 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2648 case TCP_SEQ_STATE_TIME_WAIT:
2649 get_timewait4_sock(v, tmpbuf, st->num);
2652 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2657 static struct file_operations tcp4_seq_fops;
2658 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2659 .owner = THIS_MODULE,
2662 .seq_show = tcp4_seq_show,
2663 .seq_fops = &tcp4_seq_fops,
2666 int __init tcp4_proc_init(void)
2668 return tcp_proc_register(&tcp4_seq_afinfo);
2671 void tcp4_proc_exit(void)
2673 tcp_proc_unregister(&tcp4_seq_afinfo);
2675 #endif /* CONFIG_PROC_FS */
2677 struct proto tcp_prot = {
2679 .owner = THIS_MODULE,
2681 .connect = tcp_v4_connect,
2682 .disconnect = tcp_disconnect,
2683 .accept = tcp_accept,
2685 .init = tcp_v4_init_sock,
2686 .destroy = tcp_v4_destroy_sock,
2687 .shutdown = tcp_shutdown,
2688 .setsockopt = tcp_setsockopt,
2689 .getsockopt = tcp_getsockopt,
2690 .sendmsg = tcp_sendmsg,
2691 .recvmsg = tcp_recvmsg,
2692 .backlog_rcv = tcp_v4_do_rcv,
2693 .hash = tcp_v4_hash,
2694 .unhash = tcp_unhash,
2695 .get_port = tcp_v4_get_port,
2696 .enter_memory_pressure = tcp_enter_memory_pressure,
2697 .sockets_allocated = &tcp_sockets_allocated,
2698 .memory_allocated = &tcp_memory_allocated,
2699 .memory_pressure = &tcp_memory_pressure,
2700 .sysctl_mem = sysctl_tcp_mem,
2701 .sysctl_wmem = sysctl_tcp_wmem,
2702 .sysctl_rmem = sysctl_tcp_rmem,
2703 .max_header = MAX_TCP_HEADER,
2704 .slab_obj_size = sizeof(struct tcp_sock),
2709 void __init tcp_v4_init(struct net_proto_family *ops)
2711 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2713 panic("Failed to create the TCP control socket.\n");
2714 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2715 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2717 /* Unhash it so that IP input processing does not even
2718 * see it; we do not wish this socket to see incoming
2721 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2724 EXPORT_SYMBOL(ipv4_specific);
2725 EXPORT_SYMBOL(tcp_bind_hash);
2726 EXPORT_SYMBOL(tcp_bucket_create);
2727 EXPORT_SYMBOL(tcp_hashinfo);
2728 EXPORT_SYMBOL(tcp_inherit_port);
2729 EXPORT_SYMBOL(tcp_listen_wlock);
2730 EXPORT_SYMBOL(tcp_port_rover);
2731 EXPORT_SYMBOL(tcp_prot);
2732 EXPORT_SYMBOL(tcp_put_port);
2733 EXPORT_SYMBOL(tcp_unhash);
2734 EXPORT_SYMBOL(tcp_v4_conn_request);
2735 EXPORT_SYMBOL(tcp_v4_connect);
2736 EXPORT_SYMBOL(tcp_v4_do_rcv);
2737 EXPORT_SYMBOL(tcp_v4_rebuild_header);
2738 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2739 EXPORT_SYMBOL(tcp_v4_send_check);
2740 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2742 #ifdef CONFIG_PROC_FS
2743 EXPORT_SYMBOL(tcp_proc_register);
2744 EXPORT_SYMBOL(tcp_proc_unregister);
2746 EXPORT_SYMBOL(sysctl_local_port_range);
2747 EXPORT_SYMBOL(sysctl_max_syn_backlog);
2748 EXPORT_SYMBOL(sysctl_tcp_low_latency);