 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *		Implementation of the Transmission Control Protocol(TCP).
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *		IPv4 specific functions
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *	See tcp.c for author information
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *		Juan Jose Ciarlante:	ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *		Vitaly E. Lavrov	:	Transparent proxy revived after year
 *		Andi Kleen		:	Fix new listen.
 *		Andi Kleen		:	Fix accept error reporting.
 *		YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *		Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *						a single port at the same time.
#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <net/inet_common.h>
#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/vserver/debug.h>
extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;
/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8
/* Socket used for sending RSTs */
static struct socket *tcp_socket;
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	.__tcp_lhash_lock	= RW_LOCK_UNLOCKED,
	.__tcp_lhash_users	= ATOMIC_INIT(0),
		= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	.__tcp_portalloc_lock	= SPIN_LOCK_UNLOCKED
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = 1024 - 1;
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
	int h = (laddr ^ lport) ^ (faddr ^ fport);
	return h & (tcp_ehash_size - 1);
static __inline__ int tcp_sk_hashfn(struct sock *sk)
	struct inet_opt *inet = inet_sk(sk);
	__u32 laddr = inet->rcv_saddr;
	__u16 lport = inet->num;
	__u32 faddr = inet->daddr;
	__u16 fport = inet->dport;
	return tcp_hashfn(laddr, lport, faddr, fport);
/* Allocate and initialize a new TCP local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
/* Caller must hold hashbucket lock for this tb with local BH disabled */
void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(tcp_bucket_cachep, tb);
/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
	struct tcp_bind_hashbucket *head =
				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
	struct tcp_bind_bucket *tb;
	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	sk_add_bind_node(child, &tb->owners);
	tcp_sk(child)->bind_hash = tb;
	spin_unlock(&head->lock);
inline void tcp_inherit_port(struct sock *sk, struct sock *child)
	__tcp_inherit_port(sk, child);
void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tcp_sk(sk)->bind_hash = tb;
static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
	struct hlist_node *node;
	int reuse = sk->sk_reuse;
	sk_for_each_bound(sk2, node, &tb->owners) {
		    !tcp_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				if (nx_addr_conflict(sk->sk_nx_info,
					tcp_v4_rcv_saddr(sk), sk2))
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
	struct tcp_bind_hashbucket *head;
	struct hlist_node *node;
	struct tcp_bind_bucket *tb;
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
			if (rover < low || rover > high)
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			tb_for_each(tb, node, &head->chain)
				if (tb->port == rover)
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);
		/* Exhausted local port range during search? */
		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		head = &tcp_bhash[tcp_bhashfn(snum)];
		spin_lock(&head->lock);
		tb_for_each(tb, node, &head->chain)
			if (tb->port == snum)
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			if (tcp_bind_conflict(sk, tb))
	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
	if (!tcp_sk(sk)->bind_hash)
		tcp_bind_hash(sk, tb, snum);
	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
	spin_unlock(&head->lock);
/* Get rid of any references to a local port held by the
static void __tcp_put_port(struct sock *sk)
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
	struct tcp_bind_bucket *tb;
	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	__sk_del_bind_node(sk);
	tcp_sk(sk)->bind_hash = NULL;
	tcp_bucket_destroy(tb);
	spin_unlock(&head->lock);
void tcp_put_port(struct sock *sk)
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up each
 * exclusive lock release). It should be ifdefed really.
void tcp_listen_wlock(void)
	write_lock(&tcp_lhash_lock);
	if (atomic_read(&tcp_lhash_users)) {
			prepare_to_wait_exclusive(&tcp_lhash_wait,
						  &wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&tcp_lhash_users))
			write_unlock_bh(&tcp_lhash_lock);
			write_lock_bh(&tcp_lhash_lock);
		finish_wait(&tcp_lhash_wait, &wait);
static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
	struct hlist_head *list;
	BUG_TRAP(sk_unhashed(sk));
	if (listen_possible && sk->sk_state == TCP_LISTEN) {
		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
		lock = &tcp_lhash_lock;
		list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->sk_hashent].lock;
	__sk_add_node(sk, list);
	sock_prot_inc_use(sk->sk_prot);
	if (listen_possible && sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
static void tcp_v4_hash(struct sock *sk)
	if (sk->sk_state != TCP_CLOSE) {
		__tcp_v4_hash(sk, 1);
void tcp_unhash(struct sock *sk)
	if (sk->sk_state == TCP_LISTEN) {
		lock = &tcp_lhash_lock;
		struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
		write_lock_bh(&head->lock);
	if (__sk_del_node_init(sk))
		sock_prot_dec_use(sk->sk_prot);
	write_unlock_bh(lock);
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
 * Check if a given address matches for a tcp socket
 * nxi:   the socket's nx_info if any
 * addr:  to be verified address
 * saddr: socket addresses
static inline int tcp_addr_match (
	if (addr && (saddr == addr))
	return addr_in_nx_info(nxi, addr);
/* Don't inline this cruft. Here are some nice properties to
 * exploit here. The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection. So always assume those are both wildcarded
 * during the search since they can never be otherwise.
static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
					     unsigned short hnum, int dif)
	struct sock *result = NULL, *sk;
	struct hlist_node *node;
	sk_for_each(sk, node, head) {
		struct inet_opt *inet = inet_sk(sk);
		if (inet->num == hnum && !ipv6_only_sock(sk)) {
			__u32 rcv_saddr = inet->rcv_saddr;
			score = (sk->sk_family == PF_INET ? 1 : 0);
			if (tcp_addr_match(sk->sk_nx_info, daddr, rcv_saddr))
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
			if (score > hiscore) {
/* Optimize the common listener case. */
inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
	struct sock *sk = NULL;
	struct hlist_head *head;
	read_lock(&tcp_lhash_lock);
	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		struct inet_opt *inet = inet_sk((sk = __sk_head(head)));
		if (inet->num == hnum && !sk->sk_node.next &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    tcp_addr_match(sk->sk_nx_info, daddr, inet->rcv_saddr) &&
		    !sk->sk_bound_dev_if)
		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
	read_unlock(&tcp_lhash_lock);
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 * Local BH must be disabled here.
static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
	struct tcp_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	int hash = tcp_hashfn(daddr, hnum, saddr, sport);
	head = &tcp_ehash[hash];
	read_lock(&head->lock);
	sk_for_each(sk, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
	read_unlock(&head->lock);
static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
	struct inet_opt *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
	struct hlist_node *node;
	struct tcp_tw_bucket *tw;
	write_lock(&head->lock);
	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
		tw = (struct tcp_tw_bucket *)sk2;
		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_opt *tp = tcp_sk(sk);
			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap i.e. at data rates <= 80Mbit/sec.
			   Actually, the idea is close to VJ's one,
			   only timestamp cache is held not per host,
			   but per port pair and TW bucket is used
			   If TW bucket has been already destroyed we
			   fall back to VJ's scheme and use initial
			   timestamp retrieved from peer table.
			if (tw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      tw->tw_ts_recent_stamp > 1))) {
					     tw->tw_snd_nxt + 65535 + 2) == 0)
				tp->ts_recent = tw->tw_ts_recent;
				tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
	/* Must record num and sport now. Otherwise we will see a socket
	 * with a funny identity in the hash table. */
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);
	NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	/* Silly. Should hash-dance instead... */
	tcp_tw_deschedule(tw);
	NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
 * Bind a port for a connect operation and hash it.
static int tcp_v4_hash_connect(struct sock *sk)
	unsigned short snum = inet_sk(sk)->num;
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw = NULL;
		/* TODO. Actually it is not so bad idea to remove
		 * tcp_portalloc_lock before next submission to Linus.
		 * As soon as we touch this place at all it is time to think.
		 * Now it protects single _advisory_ variable tcp_port_rover,
		 * hence it is mostly useless.
		 * Code will work nicely if we just delete it, but
		 * I am afraid in contended case it will work not better or
		 * even worse: another cpu just will hit the same bucket
		 * So some cpu salt could remove both contention and
		 * memory pingpong. Any ideas how to do this in a nice way?
		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
			if ((rover < low) || (rover > high))
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			tb_for_each(tb, node, &head->chain) {
				if (tb->port == rover) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
					if (!__tcp_v4_check_established(sk,
			tb = tcp_bucket_create(head, rover);
				spin_unlock(&head->lock);
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);
		return -EADDRNOTAVAIL;
		/* All locks still held and bhs disabled */
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);
		tcp_bind_hash(sk, tb, rover);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(rover);
			__tcp_v4_hash(sk, 0);
		spin_unlock(&head->lock);
			tcp_tw_deschedule(tw);
	head = &tcp_bhash[tcp_bhashfn(snum)];
	tb = tcp_sk(sk)->bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__tcp_v4_hash(sk, 0);
		spin_unlock_bh(&head->lock);
	spin_unlock(&head->lock);
	/* No definite answer... Walk to established hash table */
	ret = __tcp_v4_check_established(sk, snum, NULL);
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	if (addr_len < sizeof(struct sockaddr_in))
	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;
	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		nexthop = inet->opt->faddr;
	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       inet->sport, usin->sin_port, sk);
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
	if (!inet->opt || !inet->opt->srr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;
	if (tp->ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->ts_recent_stamp = 0;
	if (sysctl_tcp_tw_recycle &&
	    !tp->ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/* VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state TIME-WAIT
		 * and initialize ts_recent from it, when trying new connection.
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->ts_recent_stamp = peer->tcp_ts_stamp;
			tp->ts_recent = peer->tcp_ts;
	inet->dport = usin->sin_port;
	tp->ext_header_len = 0;
		tp->ext_header_len = inet->opt->optlen;
	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	/* OK, now commit destination to socket. */
	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tp->ext2_header_len = rt->u.dst.header_len;
	tp->write_seq = secure_tcp_sequence_number(inet->saddr,
	inet->id = tp->write_seq ^ jiffies;
	err = tcp_connect(sk);
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	sk->sk_route_caps = 0;
static __inline__ int tcp_v4_iif(struct sk_buff *skb)
	return ((struct rtable *)skb->dst)->rt_iif;
static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
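/* Walk the listener's SYN table bucket for this (remote address, remote port)
 * pair and return the matching open_request, handing back the link that
 * points to it so the caller can later unlink the entry. */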
static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
					      struct open_request ***prevp,
					      __u32 raddr, __u32 laddr)
	struct tcp_listen_opt *lopt = tp->listen_opt;
	struct open_request *req, **prev;
	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		if (req->rmt_port == rport &&
		    req->af.v4_req.rmt_addr == raddr &&
		    req->af.v4_req.loc_addr == laddr &&
		    TCP_INET_FAMILY(req->class->family)) {
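/* Add a new open_request to the listener's SYN table: record its expiry
 * deadline and link it into the hash chain under tp->syn_wait_lock. */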
static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
	struct tcp_opt *tp = tcp_sk(sk);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
	req->expires = jiffies + TCP_TIMEOUT_INIT;
	req->dl_next = lopt->syn_table[h];
	write_lock(&tp->syn_wait_lock);
	lopt->syn_table[h] = req;
	write_unlock(&tp->syn_wait_lock);
 * This routine does path mtu discovery as defined in RFC1191.
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
	struct dst_entry *dst;
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes so they should go through
	if (sk->sk_state == TCP_LISTEN)
	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
	dst->ops->update_pmtu(dst, mtu);
	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;
	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);
		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
void tcp_v4_err(struct sk_buff *skb, u32 info)
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct inet_opt *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
			   th->source, tcp_v4_iif(skb));
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
	if (sk->sk_state == TCP_TIME_WAIT) {
		tcp_tw_put((struct tcp_tw_bucket *)sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
	if (sk->sk_state == TCP_CLOSE)
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
	case ICMP_PARAMETERPROB:
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
		err = icmp_err_convert[code].errno;
	case ICMP_TIME_EXCEEDED:
	switch (sk->sk_state) {
		struct open_request *req, **prev;
		if (sock_owned_by_user(sk))
		req = tcp_v4_search_req(tp, &prev, th->dest,
					iph->daddr, iph->saddr);
		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		if (seq != req->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		tcp_synq_drop(sk, req, prev);
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can f.e. if SYNs crossed.
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
			sk->sk_error_report(sk);
			sk->sk_err_soft = err;
	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 * Now we are in compliance with RFCs.
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
	struct inet_opt *inet = inet_sk(sk);
	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
 * This routine will send an RST to the other tcp.
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 * Answer: if a packet caused RST, it is not for a socket
 * existing in our system, if it is matched to a socket,
 * it is just duplicate segment or bug in other side's TCP.
 * So we build the reply based only on the parameters that
 * arrived with the segment.
 * Exception: precedence violation. We do not implement it in any case.
static void tcp_v4_send_reset(struct sk_buff *skb)
	struct tcphdr *th = skb->h.th;
	struct ip_reply_arg arg;
	/* Never send a reset in response to a reset. */
	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest = th->source;
	rth.source = th->dest;
	rth.doff = sizeof(struct tcphdr) / 4;
		rth.seq = th->ack_seq;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
	struct tcphdr *th = skb->h.th;
	struct ip_reply_arg arg;
	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.window = htons(win);
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
	tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
static struct dst_entry* tcp_v4_route_req(struct sock *sk,
					  struct open_request *req)
	struct ip_options *opt = req->af.v4_req.opt;
	struct flowi fl = { .oif = sk->sk_bound_dev_if,
			      { .daddr = ((opt && opt->srr) ?
					  req->af.v4_req.rmt_addr),
				.saddr = req->af.v4_req.loc_addr,
				.tos = RT_CONN_FLAGS(sk) } },
			    .proto = IPPROTO_TCP,
				 { .sport = inet_sk(sk)->sport,
				   .dport = req->rmt_port } } };
	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
 * Send a SYN-ACK after having received an ACK.
 * This still operates on a open_request only, not on a big
static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
			      struct dst_entry *dst)
	struct sk_buff * skb;
	/* First, grab a route. */
	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
	skb = tcp_make_synack(sk, dst, req);
		struct tcphdr *th = skb->h.th;
		th->check = tcp_v4_check(th, skb->len,
					 req->af.v4_req.loc_addr,
					 req->af.v4_req.rmt_addr,
					 csum_partial((char *)th, skb->len,
		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
					    req->af.v4_req.rmt_addr,
					    req->af.v4_req.opt);
		if (err == NET_XMIT_CN)
 * IPv4 open_request destructor.
static void tcp_v4_or_free(struct open_request *req)
	if (req->af.v4_req.opt)
		kfree(req->af.v4_req.opt);
static inline void syn_flood_warning(struct sk_buff *skb)
	static unsigned long warntime;
	if (time_after(jiffies, (warntime + HZ * 60))) {
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
 * Save and compile IPv4 options into the open_request if needed.
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;
	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
			if (ip_options_echo(dopt, skb)) {
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
 * It would be better to replace it with a global counter for all sockets
 * but then some measure against one socket starving all other sockets
 * It was 128 by default. Experiments with real servers show that
 * it is absolutely not enough even at 100conn/sec. 256 cures most
 * of the problems. This value is adjusted to 128 for very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Further increasing requires to change hash table size.
int sysctl_max_syn_backlog = 256;
struct or_calltable or_ipv4 = {
	.rtx_syn_ack	= tcp_v4_send_synack,
	.send_ack	= tcp_v4_or_send_ack,
	.destructor	= tcp_v4_or_free,
	.send_reset	= tcp_v4_send_reset,
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
	struct open_request *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
	/* Never answer to SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
	req = tcp_openreq_alloc();
	tcp_clear_options(&tp);
	tp.user_mss = tcp_sk(sk)->user_mss;
	tcp_parse_options(skb, &tp, 0);
		tcp_clear_options(&tp);
	if (tp.saw_tstamp && !tp.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web server, which
		 * contains information interesting only for windows'
		 * users) do not send their stamp in SYN. It is easy case.
		 * We simply do not advertise TS support.
	tp.tstamp_ok = tp.saw_tstamp;
	tcp_openreq_init(req, &tp, skb);
	req->af.v4_req.loc_addr = daddr;
	req->af.v4_req.rmt_addr = saddr;
	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
	req->class = &or_ipv4;
	TCP_ECN_create_request(req, skb->h.th);
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		struct inet_peer *peer = NULL;
		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		if (tp.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			NETDEBUG(if (net_ratelimit()) \
					printk(KERN_DEBUG "TCP: drop open "
							  "request from %u.%u."
					       ntohs(skb->h.th->source)));
		isn = tcp_v4_init_sequence(sk, skb);
	if (tcp_v4_send_synack(sk, req, dst))
		tcp_openreq_free(req);
	tcp_v4_synq_add(sk, req);
		tcp_openreq_free(req);
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct open_request *req,
				  struct dst_entry *dst)
	struct inet_opt *newinet;
	struct tcp_opt *newtp;
	if (sk_acceptq_is_full(sk))
	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
	newsk = tcp_create_openreq_child(sk, req, skb);
	newsk->sk_dst_cache = dst;
	tcp_v4_setup_caps(newsk, dst);
	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	newinet->daddr = req->af.v4_req.rmt_addr;
	newinet->rcv_saddr = req->af.v4_req.loc_addr;
	newinet->saddr = req->af.v4_req.loc_addr;
	newinet->opt = req->af.v4_req.opt;
	req->af.v4_req.opt = NULL;
	newinet->mc_index = tcp_v4_iif(skb);
	newinet->mc_ttl = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
		newtp->ext_header_len = newinet->opt->optlen;
	newtp->ext2_header_len = dst->header_len;
	newinet->id = newtp->write_seq ^ jiffies;
	tcp_sync_mss(newsk, dst_pmtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);
	__tcp_v4_hash(newsk, 0);
	__tcp_inherit_port(sk, newsk);
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
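/* For a segment arriving on a listening socket: first look for a matching
 * open_request on the SYN queue, then for an already established (or
 * TIME_WAIT) socket from an earlier handshake, and finally fall back to
 * SYN cookie validation when it is enabled. */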
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct tcp_opt *tp = tcp_sk(sk);
	struct open_request **prev;
	/* Find possible connection requests. */
	struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
						     iph->saddr, iph->daddr);
		return tcp_check_req(sk, skb, req, prev);
	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
		if (nsk->sk_state != TCP_TIME_WAIT) {
		tcp_tw_put((struct tcp_tw_bucket *)nsk);
#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
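/* Validate the TCP checksum of an incoming segment: trust a hardware
 * checksum result when the device supplied one, otherwise compute (or,
 * for larger packets, defer) the software checksum. */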
static int tcp_v4_checksum_init(struct sk_buff *skb)
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
		NETDEBUG(if (net_ratelimit())
				printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb_checksum(skb, 0, skb->len, 0)))
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->daddr, 0);
/* The socket must have its spinlock held when we get
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
		TCP_CHECK_TIMER(sk);
	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (tcp_child_process(sk, nsk, skb))
	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
	TCP_CHECK_TIMER(sk);
	tcp_v4_send_reset(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
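/* Entry point for IPv4 TCP segments delivered by the IP layer: validate the
 * header and checksum, look up the owning socket, and either process the
 * segment directly, queue it on the prequeue, or add it to the socket's
 * backlog when the socket is currently owned by user context. */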
int tcp_v4_rcv(struct sk_buff *skb)
	if (skb->pkt_type != PACKET_HOST)
	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
	if (th->doff < sizeof(struct tcphdr) / 4)
	if (!pskb_may_pull(skb, th->doff * 4))
	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when = 0;
	TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked = 0;
	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest),
	if (sk->sk_state == TCP_TIME_WAIT)
	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
		sk_add_backlog(sk, skb);
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		tcp_v4_send_reset(skb);
	/* Discard frame. */
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		tcp_tw_put((struct tcp_tw_bucket *) sk);
	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		tcp_tw_put((struct tcp_tw_bucket *) sk);
	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
					   skb, th, skb->len)) {
		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
			tcp_tw_put((struct tcp_tw_bucket *)sk);
	/* Fall through to ACK */
		tcp_v4_timewait_ack(sk, skb);
	case TCP_TW_SUCCESS:;
/* With per-bucket locks this operation is not-atomic, so that
 * this version is not worse.
static void __tcp_v4_rehash(struct sock *sk)
	sk->sk_prot->unhash(sk);
	sk->sk_prot->hash(sk);
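/* Used when ip_dynaddr is enabled: query a fresh route, adopt the new
 * source address it yields, and rehash the socket under its new identity. */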
static int tcp_v4_reselect_saddr(struct sock *sk)
	struct inet_opt *inet = inet_sk(sk);
	__u32 old_saddr = inet->saddr;
	__u32 daddr = inet->daddr;
	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;
	/* Query new route. */
	err = ip_route_connect(&rt, daddr, 0,
			       RT_TOS(inet->tos) | sk->sk_localroute,
			       sk->sk_bound_dev_if,
			       inet->sport, inet->dport, sk);
	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
	new_saddr = rt->rt_src;
	if (new_saddr == old_saddr)
	if (sysctl_ip_dynaddr > 1) {
		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
		       NIPQUAD(new_saddr));
	inet->saddr = new_saddr;
	inet->rcv_saddr = new_saddr;
	/* XXX The only one ugly spot where we need to
	 * XXX really change the sockets identity after
	 * XXX it has entered the hashes. -DaveM
	 * Besides that, it does not check for connection
	 * uniqueness. Wait for troubles.
	__tcp_v4_rehash(sk);
int tcp_v4_rebuild_header(struct sock *sk)
	struct inet_opt *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
	/* Route is OK, nothing to do. */
	daddr = inet->daddr;
	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;
		struct flowi fl = { .oif = sk->sk_bound_dev_if,
				      .saddr = inet->saddr,
				      .tos = RT_CONN_FLAGS(sk) } },
				    .proto = IPPROTO_TCP,
					 { .sport = inet->sport,
					   .dport = inet->dport } } };
		err = ip_route_output_flow(&rt, &fl, sk, 0);
			__sk_dst_set(sk, &rt->u.dst);
			tcp_v4_setup_caps(sk, &rt->u.dst);
			tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
	/* Routing failed... */
	sk->sk_route_caps = 0;
	if (!sysctl_ip_dynaddr ||
	    sk->sk_state != TCP_SYN_SENT ||
	    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
	    (err = tcp_v4_reselect_saddr(sk)) != 0)
		sk->sk_err_soft = -err;
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
	struct inet_opt *inet = inet_sk(sk);
	sin->sin_family = AF_INET;
	sin->sin_addr.s_addr = inet->daddr;
	sin->sin_port = inet->dport;
/* VJ's idea. Save last timestamp seen from this destination
 * and hold it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter synchronized
int tcp_v4_remember_stamp(struct sock *sk)
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
			rt_bind_peer(rt, 1);
	if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
	    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
	     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
		peer->tcp_ts_stamp = tp->ts_recent_stamp;
		peer->tcp_ts = tp->ts_recent;
int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
	struct inet_peer *peer = NULL;
	peer = inet_getpeer(tw->tw_daddr, 1);
	if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
	    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
	     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
		peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
		peer->tcp_ts = tw->tw_ts_recent;
struct tcp_func ipv4_specific = {
	.queue_xmit	= ip_queue_xmit,
	.send_check	= tcp_v4_send_check,
	.rebuild_header	= tcp_v4_rebuild_header,
	.conn_request	= tcp_v4_conn_request,
	.syn_recv_sock	= tcp_v4_syn_recv_sock,
	.remember_stamp	= tcp_v4_remember_stamp,
	.net_header_len	= sizeof(struct iphdr),
	.setsockopt	= ip_setsockopt,
	.getsockopt	= ip_getsockopt,
	.addr2sockaddr	= v4_addr2sockaddr,
	.sockaddr_len	= sizeof(struct sockaddr_in),
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
static int tcp_v4_init_sock(struct sock *sk)
	struct tcp_opt *tp = tcp_sk(sk);
	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);
	tp->rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;
	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache_std = tp->mss_cache = 536;
	tp->reordering = sysctl_tcp_reordering;
	sk->sk_state = TCP_CLOSE;
	sk->sk_write_space = sk_stream_write_space;
	sk->sk_use_write_queue = 1;
	tp->af_specific = &ipv4_specific;
	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
	atomic_inc(&tcp_sockets_allocated);
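/* Tear down the protocol-private state of a TCP socket: stop the transmit
 * timers, purge the write, out-of-order and prequeue queues, drop the bind
 * bucket reference and release the cached sendmsg page. */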
int tcp_v4_destroy_sock(struct sock *sk)
	struct tcp_opt *tp = tcp_sk(sk);
	tcp_clear_xmit_timers(sk);
	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);
	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);
	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);
	/* Clean up a referenced TCP bind bucket. */
	 * If sendmsg cached page exists, toss it.
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	atomic_dec(&tcp_sockets_allocated);
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
	return hlist_empty(head) ? NULL :
	       list_entry(head->first, struct tcp_tw_bucket, tw_node);
static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
	return tw->tw_node.next ?
	       hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
static void *listening_get_next(struct seq_file *seq, void *cur)
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;
		sk = sk_head(&tcp_listening_hash[0]);
	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct open_request *req = cur;
		tp = tcp_sk(st->syn_wait_sk);
			vxdprintk(VXD_CBIT(net, 6),
				  "sk,req: %p [#%d] (from %d)", req->sk,
				  (req->sk)?req->sk->sk_xid:0, current->xid);
			    !vx_check(req->sk->sk_xid, VX_IDENT|VX_WATCH))
			if (req->class->family == st->family) {
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
			req = tp->listen_opt->syn_table[st->sbucket];
		sk = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&tp->syn_wait_lock);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen)
		read_unlock_bh(&tp->syn_wait_lock);
	sk_for_each_from(sk, node) {
		vxdprintk(VXD_CBIT(net, 6), "sk: %p [#%d] (from %d)",
			  sk, sk->sk_xid, current->xid);
		if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
		if (sk->sk_family == st->family) {
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen) {
			st->uid = sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state = TCP_SEQ_STATE_OPENREQ;
		read_unlock_bh(&tp->syn_wait_lock);
	if (++st->bucket < TCP_LHTABLE_SIZE) {
		sk = sk_head(&tcp_listening_hash[st->bucket]);
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
	void *rc = listening_get_next(seq, NULL);
	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
static void *established_get_first(struct seq_file *seq)
	struct tcp_iter_state* st = seq->private;
	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
		struct hlist_node *node;
		struct tcp_tw_bucket *tw;
		read_lock(&tcp_ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
			vxdprintk(VXD_CBIT(net, 6),
				  "sk,egf: %p [#%d] (from %d)",
				  sk, sk->sk_xid, current->xid);
			if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
			if (sk->sk_family != st->family)
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		tw_for_each(tw, node,
			    &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
			vxdprintk(VXD_CBIT(net, 6),
				  "tw: %p [#%d] (from %d)",
				  tw, tw->tw_xid, current->xid);
			if (!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))
			if (tw->tw_family != st->family)
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
static void *established_get_next(struct seq_file *seq, void *cur)
	struct sock *sk = cur;
	struct tcp_tw_bucket *tw;
	struct hlist_node *node;
	struct tcp_iter_state* st = seq->private;
	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		while (tw && (tw->tw_family != st->family ||
			      !vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))) {
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (++st->bucket < tcp_ehash_size) {
			read_lock(&tcp_ehash[st->bucket].lock);
			sk = sk_head(&tcp_ehash[st->bucket].chain);
	sk_for_each_from(sk, node) {
		vxdprintk(VXD_CBIT(net, 6),
			  "sk,egn: %p [#%d] (from %d)",
			  sk, sk->sk_xid, current->xid);
		if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
		if (sk->sk_family == st->family)
	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
static void *established_get_idx(struct seq_file *seq, loff_t pos)
	void *rc = established_get_first(seq);
		rc = established_get_next(seq, rc);
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
	struct tcp_iter_state* st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);
		tcp_listen_unlock();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
	struct tcp_iter_state* st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
	struct tcp_iter_state* st;
	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
			tcp_listen_unlock();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc = established_get_first(seq);
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
static void tcp_seq_stop(struct seq_file *seq, void *v)
	struct tcp_iter_state* st = seq->private;
	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
			struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
			read_unlock_bh(&tp->syn_wait_lock);
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			tcp_listen_unlock();
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
			read_unlock(&tcp_ehash[st->bucket].lock);
static int tcp_seq_open(struct inode *inode, struct file *file)
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	if (unlikely(afinfo == NULL))
	s = kmalloc(sizeof(*s), GFP_KERNEL);
	memset(s, 0, sizeof(*s));
	s->family = afinfo->family;
	s->seq_ops.start = tcp_seq_start;
	s->seq_ops.next = tcp_seq_next;
	s->seq_ops.show = afinfo->seq_show;
	s->seq_ops.stop = tcp_seq_stop;
	rc = seq_open(file, &s->seq_ops);
	seq = file->private_data;
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
	struct proc_dir_entry *p;
	afinfo->seq_fops->owner = afinfo->owner;
	afinfo->seq_fops->open = tcp_seq_open;
	afinfo->seq_fops->read = seq_read;
	afinfo->seq_fops->llseek = seq_lseek;
	afinfo->seq_fops->release = seq_release_private;
	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
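/* Format one /proc/net/tcp row for a connection request still in
 * SYN_RECV (an open_request hanging off a listening socket). */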
static void get_openreq4(struct sock *sk, struct open_request *req,
			 char *tmpbuf, int i, int uid)
	int ttd = req->expires - jiffies;
	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		req->af.v4_req.loc_addr,
		ntohs(inet_sk(sk)->sport),
		req->af.v4_req.rmt_addr,
		ntohs(req->rmt_port),
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
	unsigned long timer_expires;
	struct tcp_opt *tp = tcp_sk(sp);
	struct inet_opt *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);
	if (tp->pending == TCP_TIME_RETRANS) {
		timer_expires = tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_expires = tp->timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_expires = sp->sk_timer.expires;
		timer_expires = jiffies;
	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
		"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		jiffies_to_clock_t(timer_expires - jiffies),
		atomic_read(&sp->sk_refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
	unsigned int dest, src;
	int ttd = tw->tw_ttd - jiffies;
	dest = tw->tw_daddr;
	src = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp = ntohs(tw->tw_sport);
	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
static int tcp4_seq_show(struct seq_file *seq, void *v)
	struct tcp_iter_state* st;
	char tmpbuf[TMPSZ + 1];
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
int __init tcp4_proc_init(void)
	return tcp_proc_register(&tcp4_seq_afinfo);
void tcp4_proc_exit(void)
	tcp_proc_unregister(&tcp4_seq_afinfo);
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.connect	= tcp_v4_connect,
	.disconnect	= tcp_disconnect,
	.accept		= tcp_accept,
	.init		= tcp_v4_init_sock,
	.destroy	= tcp_v4_destroy_sock,
	.shutdown	= tcp_shutdown,
	.setsockopt	= tcp_setsockopt,
	.getsockopt	= tcp_getsockopt,
	.sendmsg	= tcp_sendmsg,
	.recvmsg	= tcp_recvmsg,
	.backlog_rcv	= tcp_v4_do_rcv,
	.hash		= tcp_v4_hash,
	.unhash		= tcp_unhash,
	.get_port	= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem	= sysctl_tcp_mem,
	.sysctl_wmem	= sysctl_tcp_wmem,
	.sysctl_rmem	= sysctl_tcp_rmem,
	.max_header	= MAX_TCP_HEADER,
	.slab_obj_size	= sizeof(struct tcp_sock),
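/* Create the kernel-internal control socket used by tcp_v4_send_reset() and
 * tcp_v4_send_ack(), and unhash it so it never matches incoming segments. */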
void __init tcp_v4_init(struct net_proto_family *ops)
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;
	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_lookup_listener);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);