/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *				ACK bit.
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				open_request handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option,
 *	Alexey Kuznetsov		which allows both IPv4 and IPv6 sockets
 *					to bind to a single port at the same
 *					time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);
struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	.__tcp_lhash_lock	= RW_LOCK_UNLOCKED,
	.__tcp_lhash_users	= ATOMIC_INIT(0),
	.__tcp_lhash_wait
		= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	.__tcp_portalloc_lock	= SPIN_LOCK_UNLOCKED
};
/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = 1024 - 1;
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
{
	int h = (laddr ^ lport) ^ (faddr ^ fport);

	h ^= h >> 16;
	h ^= h >> 8;
	return h & (tcp_ehash_size - 1);
}
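
/*
 * Worked example (annotation, not original text): hashing the
 * connection 127.0.0.1:8080 -> 127.0.0.1:50000.  The identical local
 * and foreign addresses cancel under XOR, leaving only the ports:
 *
 *	h  = (0x7f000001 ^ 0x1f90) ^ (0x7f000001 ^ 0xc350) = 0xdcc0
 *	h ^= h >> 16;	h is still 0xdcc0 (high half is zero)
 *	h ^= h >> 8;	h becomes  0xdc1c
 *
 * With tcp_ehash_size == 65536 the bucket is h & 0xffff == 0xdc1c.
 * The shift-and-xor steps fold the high bits down so that small hash
 * tables still see entropy from the upper address bytes.
 */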

static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	__u32 laddr = inet->rcv_saddr;
	__u16 lport = inet->num;
	__u32 faddr = inet->daddr;
	__u16 fport = inet->dport;

	return tcp_hashfn(laddr, lport, faddr, fport);
}

/* Allocate and initialize a new TCP local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
					  unsigned short snum)
{
	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
						      SLAB_ATOMIC);
	if (tb) {
		tb->port = snum;
		tb->fastreuse = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/* Caller must hold hashbucket lock for this tb with local BH disabled */
void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(tcp_bucket_cachep, tb);
	}
}

/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct tcp_bind_hashbucket *head =
				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	sk_add_bind_node(child, &tb->owners);
	tcp_sk(child)->bind_hash = tb;
	spin_unlock(&head->lock);
}

inline void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}

void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
		   unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tcp_sk(sk)->bind_hash = tb;
}

static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
{
	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !tcp_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
				    sk2_rcv_saddr == sk_rcv_saddr)
					break;
			}
		}
	}
	return node != NULL;
}
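
/*
 * Summary of the conflict rules above (annotation, not original text):
 * two sockets sharing a bind bucket conflict only if they could both
 * receive the same segments, i.e. their bound devices do not differ,
 * neither is IPv6-only, SO_REUSEADDR does not exempt the pair (both
 * must set it, and neither may be in TCP_LISTEN), and their local
 * addresses overlap (either one is a wildcard, or they are equal).
 */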

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_hashbucket *head;
	struct hlist_node *node;
	struct tcp_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
		do {
			rover++;
			if (rover < low || rover > high)
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			tb_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search? */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
		tb = NULL;
	} else {
		head = &tcp_bhash[tcp_bhashfn(snum)];
		spin_lock(&head->lock);
		tb_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (tcp_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!tcp_sk(sk)->bind_hash)
		tcp_bind_hash(sk, tb, snum);
	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}

/* Get rid of any references to a local port held by the
 * given sock.
 */
static void __tcp_put_port(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	__sk_del_bind_node(sk);
	tcp_sk(sk)->bind_hash = NULL;
	inet->num = 0;
	tcp_bucket_destroy(tb);
	spin_unlock(&head->lock);
}

void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * When several writers sleep and a reader wakes them up, all but one
 * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up each
 * exclusive lock release). It should be ifdefed really.
 */
void tcp_listen_wlock(void)
{
	write_lock(&tcp_lhash_lock);

	if (atomic_read(&tcp_lhash_users)) {
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&tcp_lhash_wait,
						&wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&tcp_lhash_users))
				break;
			write_unlock_bh(&tcp_lhash_lock);
			schedule();
			write_lock_bh(&tcp_lhash_lock);
		}

		finish_wait(&tcp_lhash_wait, &wait);
	}
}
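
/*
 * For reference (a sketch, not part of this file): the matching reader
 * side lives in include/net/tcp.h of this era and looks roughly like
 * the following -- readers bump tcp_lhash_users under the read lock,
 * and wake tcp_lhash_wait when the count drops to zero, which is what
 * the writer above sleeps on.
 */
#if 0	/* illustrative only */
static inline void tcp_listen_lock(void)
{
	/* read_lock synchronizes with tcp_listen_wlock */
	read_lock(&tcp_lhash_lock);
	atomic_inc(&tcp_lhash_users);
	read_unlock(&tcp_lhash_lock);
}

static inline void tcp_listen_unlock(void)
{
	if (atomic_dec_and_test(&tcp_lhash_users))
		wake_up(&tcp_lhash_wait);
}
#endif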

static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
	struct hlist_head *list;
	rwlock_t *lock;

	BUG_TRAP(sk_unhashed(sk));
	if (listen_possible && sk->sk_state == TCP_LISTEN) {
		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
		lock = &tcp_lhash_lock;
		tcp_listen_wlock();
	} else {
		list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->sk_hashent].lock;
		write_lock(lock);
	}
	__sk_add_node(sk, list);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(lock);
	if (listen_possible && sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

static void tcp_v4_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__tcp_v4_hash(sk, 1);
		local_bh_enable();
	}
}

void tcp_unhash(struct sock *sk)
{
	rwlock_t *lock;

	if (sk_unhashed(sk))
		goto ende;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		tcp_listen_wlock();
		lock = &tcp_lhash_lock;
	} else {
		struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
	}

	if (__sk_del_node_init(sk))
		sock_prot_dec_use(sk->sk_prot);
	write_unlock_bh(lock);

 ende:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

/* Don't inline this cruft. Here are some nice properties to
 * exploit here. The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection. So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
					     unsigned short hnum, int dif)
{
	struct sock *result = NULL, *sk;
	struct hlist_node *node;
	int score, hiscore;

	hiscore = -1;
	sk_for_each(sk, node, head) {
		struct inet_opt *inet = inet_sk(sk);

		if (inet->num == hnum && !ipv6_only_sock(sk)) {
			__u32 rcv_saddr = inet->rcv_saddr;

			score = (sk->sk_family == PF_INET ? 1 : 0);
			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score += 2;
			}
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore	= score;
				result	= sk;
			}
		}
	}
	return result;
}

/* Optimize the common listener case. */
inline struct sock *tcp_v4_lookup_listener(u32 daddr,
					   unsigned short hnum, int dif)
{
	struct sock *sk = NULL;
	struct hlist_head *head;

	read_lock(&tcp_lhash_lock);
	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		struct inet_opt *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    !sk->sk_bound_dev_if)
			goto sherry_cache;
		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&tcp_lhash_lock);
	return sk;
}

/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */
static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
						       u32 daddr, u16 hnum,
						       int dif)
{
	struct tcp_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	int hash = tcp_hashfn(daddr, hnum, saddr, sport);
	head = &tcp_ehash[hash];
	read_lock(&head->lock);
	sk_for_each(sk, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(&head->lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}

static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
{
	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
						      daddr, hnum, dif);

	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
}

inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				  u16 dport, int dif)
{
	struct sock *sk;

	local_bh_disable();
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
	local_bh_enable();

	return sk;
}

EXPORT_SYMBOL_GPL(tcp_v4_lookup);
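
/*
 * Usage sketch (illustrative, not original code): a hypothetical caller
 * resolving the socket for a captured segment.  Wire fields stay in
 * network byte order, as in the helpers above; a successful lookup
 * returns a held socket that must be released with sock_put().
 */
#if 0	/* illustrative only */
static void example_lookup(struct iphdr *iph, struct tcphdr *th, int ifindex)
{
	struct sock *sk = tcp_v4_lookup(iph->saddr, th->source,
					iph->daddr, th->dest, ifindex);
	if (sk) {
		/* ... inspect sk->sk_state etc. ... */
		sock_put(sk);
	}
}
#endif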

static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}

/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
{
	struct inet_opt *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
	struct sock *sk2;
	struct hlist_node *node;
	struct tcp_tw_bucket *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
		tw = (struct tcp_tw_bucket *)sk2;

		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_opt *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap, i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one,
			   only the timestamp cache is held not per host,
			   but per port pair, and the TW bucket is used
			   as the state holder.

			   If the TW bucket has already been destroyed we
			   fall back to VJ's scheme and use the initial
			   timestamp retrieved from the peer table.
			 */
			if (tw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tw->tw_ts_recent_stamp > 1))) {
				if ((tp->write_seq =
						tw->tw_snd_nxt + 65535 + 2) == 0)
					tp->write_seq = 1;
				tp->ts_recent	    = tw->tw_ts_recent;
				tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		tcp_tw_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
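
/*
 * Annotation (not original text): the reuse path above is what the
 * net.ipv4.tcp_tw_reuse sysctl enables.  The new connection's write_seq
 * is seeded past the old TIME-WAIT socket's snd_nxt by 65535 + 2, i.e.
 * beyond any receive window the peer could still honour, so even a
 * confused peer cannot mistake segments of the new incarnation for
 * retransmissions of the old one.
 */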

/*
 * Bind a port for a connect operation and hash it.
 */
static int tcp_v4_hash_connect(struct sock *sk)
{
	unsigned short snum = inet_sk(sk)->num;
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
	int ret;

	if (!snum) {
		int rover;
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw = NULL;

		local_bh_disable();

		/* TODO. Actually it is not such a bad idea to remove
		 * tcp_portalloc_lock before the next submission to Linus.
		 * As soon as we touch this place at all it is time to think.
		 *
		 * Now it protects a single _advisory_ variable tcp_port_rover,
		 * hence it is mostly useless.
		 * Code will work nicely if we just delete it, but
		 * I am afraid that in the contended case it will work no
		 * better or even worse: another cpu will just hit the same
		 * bucket.
		 * So some cpu salt could remove both the contention and
		 * the memory pingpong. Any ideas how to do this in a nice way?
		 */
		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;

		do {
			rover++;
			if ((rover < low) || (rover > high))
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			tb_for_each(tb, node, &head->chain) {
				if (tb->port == rover) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									rover,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = tcp_bucket_create(head, rover);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		/* All locks still held and bhs disabled */
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		tcp_bind_hash(sk, tb, rover);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(rover);
			__tcp_v4_hash(sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			tcp_tw_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_bhash[tcp_bhashfn(snum)];
	tb = tcp_sk(sk)->bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__tcp_v4_hash(sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->ts_recent	    = 0;
		tp->ts_recent_stamp = 0;
		tp->write_seq	    = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table, when entering TIME-WAIT
		 * state, and initialize ts_recent from it when trying a new
		 * connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->ts_recent_stamp = peer->tcp_ts_stamp;
			tp->ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization afterwards.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tp->ext2_header_len = rt->u.dst.header_len;

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}

static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}

static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
{
	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
}
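
/*
 * Annotation (not original text): jhash_2words() mixes the remote
 * address and port with the per-listener random value hash_rnd, so an
 * attacker who cannot guess hash_rnd cannot aim forged SYNs at one
 * bucket of the TCP_SYNQ_HSIZE-entry table and degrade the listener's
 * SYN queue into a single long chain.
 */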

static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
					      struct open_request ***prevp,
					      __u16 rport,
					      __u32 raddr, __u32 laddr)
{
	struct tcp_listen_opt *lopt = tp->listen_opt;
	struct open_request *req, **prev;

	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		if (req->rmt_port == rport &&
		    req->af.v4_req.rmt_addr == raddr &&
		    req->af.v4_req.loc_addr == laddr &&
		    TCP_INET_FAMILY(req->class->family)) {
			BUG_TRAP(!req->sk);
			*prevp = prev;
			break;
		}
	}

	return req;
}

static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);

	req->expires = jiffies + TCP_TIMEOUT_INIT;
	req->retrans = 0;
	req->sk = NULL;
	req->dl_next = lopt->syn_table[h];

	write_lock(&tp->syn_wait_lock);
	lopt->syn_table[h] = req;
	write_unlock(&tp->syn_wait_lock);

#ifdef CONFIG_ACCEPT_QUEUES
	tcp_synq_added(sk, req);
#else
	tcp_synq_added(sk);
#endif
}

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember the soft error
	 * for the case that this connection will not be able to recover.
	 */
	if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_pmtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
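
/*
 * Worked example (annotation, not original text): suppose the socket
 * negotiated an MSS of 1460 over a 1500-byte path and a router on a
 * 1400-byte link returns ICMP_FRAG_NEEDED with mtu == 1400.  The call
 * to tcp_sync_mss(sk, 1400) drops the effective MSS to 1400 minus the
 * 40 bytes of IPv4 + TCP headers, i.e. 1360 (less if options such as
 * timestamps are in use), and tcp_simple_retransmit() resends the
 * now-oversized segments without waiting for the retransmit timer.
 */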

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_opt *tp;
	struct inet_opt *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
			   th->source, tcp_v4_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		tcp_tw_put((struct tcp_tw_bucket *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}
	switch (sk->sk_state) {
		struct open_request *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = tcp_v4_search_req(tp, &prev, th->dest,
					iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != req->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		tcp_synq_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen normally.
			       It can, e.g., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}
	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that on the modern Internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	struct inet_opt *inet = inet_sk(sk);

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system; if it matched a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP.  So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst    = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len  = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
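
/*
 * Worked example (annotation, not original text): an unmatched SYN to
 * a closed port with seq = 1000 and no payload carries no ACK, so the
 * branch above answers with an RST that has ack = 1 and
 * ack_seq = htonl(1000 + 1) -- the SYN counts as one sequence unit --
 * letting the originator tie the reset to its SYN.  Had the offending
 * segment carried an ACK, the RST's seq is simply taken from that
 * segment's ack_seq.
 */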

/* The code following below, sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;

	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);

	tcp_tw_put(tw);
}

static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
{
	tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}

static struct dst_entry* tcp_v4_route_req(struct sock *sk,
					  struct open_request *req)
{
	struct rtable *rt;
	struct ip_options *opt = req->af.v4_req.opt;
	struct flowi fl = { .oif = sk->sk_bound_dev_if,
			    .nl_u = { .ip4_u =
				      { .daddr = ((opt && opt->srr) ?
						  opt->faddr :
						  req->af.v4_req.rmt_addr),
					.saddr = req->af.v4_req.loc_addr,
					.tos = RT_CONN_FLAGS(sk) } },
			    .proto = IPPROTO_TCP,
			    .uli_u = { .ports =
				       { .sport = inet_sk(sk)->sport,
					 .dport = req->rmt_port } } };

	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	return &rt->u.dst;
}

/*
 *	Send a SYN-ACK after having received an ACK.
 *	This still operates on an open_request only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
			      struct dst_entry *dst)
{
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 req->af.v4_req.loc_addr,
					 req->af.v4_req.rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
					    req->af.v4_req.rmt_addr,
					    req->af.v4_req.opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}

/*
 *	IPv4 open_request destructor.
 */
static void tcp_v4_or_free(struct open_request *req)
{
	if (req->af.v4_req.opt)
		kfree(req->af.v4_req.opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}

/*
 * Save and compile IPv4 options into the open_request if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

/*
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
 * It would be better to replace it with a global counter for all sockets,
 * but then some measure against one socket starving all other sockets
 * would be needed.
 *
 * It was 128 by default. Experiments with real servers show that
 * it is absolutely not enough even at 100conn/sec. 256 cures most
 * of the problems. This value is adjusted to 128 for very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Increasing it further requires changing the hash table size.
 */
int sysctl_max_syn_backlog = 256;
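
/*
 * Tuning note (annotation, not original text): this default is exposed
 * at runtime as net.ipv4.tcp_max_syn_backlog, e.g.
 *
 *	sysctl -w net.ipv4.tcp_max_syn_backlog=1024
 *
 * Raising it much further without also growing TCP_SYNQ_HSIZE just
 * lengthens the per-bucket chains walked by tcp_v4_search_req().
 */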

struct or_calltable or_ipv4 = {
	.family		=	PF_INET,
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_or_send_ack,
	.destructor	=	tcp_v4_or_free,
	.send_reset	=	tcp_v4_send_reset,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt tp;
	struct open_request *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_ACCEPT_QUEUES
	int class = 0;
#endif
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations; they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

#ifdef CONFIG_ACCEPT_QUEUES
	class = (skb->nfmark <= 0) ? 0 :
		((skb->nfmark >= NUM_ACCEPT_QUEUES) ? 0 : skb->nfmark);
	/*
	 * Accept only if the class has shares set, or if the default class
	 * (i.e. class 0) has shares.
	 */
	if (!(tcp_sk(sk)->acceptq[class].aq_ratio)) {
		if (tcp_sk(sk)->acceptq[0].aq_ratio)
			class = 0;
		else
			goto drop;
	}
#endif

	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop the request. It is better than
	 * clogging the syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
#ifdef CONFIG_ACCEPT_QUEUES
	if (sk_acceptq_is_full(sk, class) && tcp_synq_young(sk, class) > 1)
#else
	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
#endif
		goto drop;

	req = tcp_openreq_alloc();
	if (!req)
		goto drop;

	tcp_clear_options(&tp);
	tp.mss_clamp = 536;
	tp.user_mss = tcp_sk(sk)->user_mss;

	tcp_parse_options(skb, &tp, 0);

	if (want_cookie) {
		tcp_clear_options(&tp);
		tp.saw_tstamp = 0;
	}

	if (tp.saw_tstamp && !tp.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web servers
		 * which contain information interesting only for windows
		 * users) do not send their stamp in SYN. It is an easy case.
		 * We simply do not advertise TS support.
		 */
		tp.saw_tstamp = 0;
		tp.tstamp_ok  = 0;
	}
	tp.tstamp_ok = tp.saw_tstamp;

	tcp_openreq_init(req, &tp, skb);
#ifdef CONFIG_ACCEPT_QUEUES
	req->acceptq_class = class;
	req->acceptq_time_stamp = jiffies;
#endif
	req->af.v4_req.loc_addr = daddr;
	req->af.v4_req.rmt_addr = saddr;
	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
	req->class = &or_ipv4;
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tp.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations proven
			 * to be alive.
			 * It means that we continue to communicate
			 * with destinations already remembered at
			 * the moment of synflood.
			 */
			NETDEBUG(if (net_ratelimit()) \
					printk(KERN_DEBUG "TCP: drop open "
							  "request from %u.%u."
							  "%u.%u/%u\n", \
					       NIPQUAD(saddr),
					       ntohs(skb->h.th->source)));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	req->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		tcp_openreq_free(req);
	} else {
		tcp_v4_synq_add(sk, req);
	}
	return 0;

drop_and_free:
	tcp_openreq_free(req);
drop:
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	return 0;
}

/*
 * The three-way handshake has completed - the final ACK arrived
 * and was valid - now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct open_request *req,
				  struct dst_entry *dst)
{
	struct inet_opt *newinet;
	struct tcp_opt *newtp;
	struct sock *newsk;

#ifdef CONFIG_ACCEPT_QUEUES
	if (sk_acceptq_is_full(sk, req->acceptq_class))
#else
	if (sk_acceptq_is_full(sk))
#endif
		goto exit_overflow;

	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_dst_cache = dst;
	tcp_v4_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	newinet->daddr	      = req->af.v4_req.rmt_addr;
	newinet->rcv_saddr    = req->af.v4_req.loc_addr;
	newinet->saddr	      = req->af.v4_req.loc_addr;
	newinet->opt	      = req->af.v4_req.opt;
	req->af.v4_req.opt    = NULL;
	newinet->mc_index     = tcp_v4_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newtp->ext2_header_len = dst->header_len;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_pmtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__tcp_v4_hash(newsk, 0);
	__tcp_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct tcp_opt *tp = tcp_sk(sk);
	struct sock *nsk;
	struct open_request **prev;
	/* Find possible connection requests. */
	struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
						     iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
					  th->source,
					  skb->nh.iph->daddr,
					  ntohs(th->dest),
					  tcp_v4_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		tcp_tw_put((struct tcp_tw_bucket *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		NETDEBUG(if (net_ratelimit())
				printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}

/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}

/*
 *	From tcp_input.c
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest),
			     tcp_v4_iif(skb));
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);
	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
					   skb, th, skb->len)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
							  ntohs(th->dest),
							  tcp_v4_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
			tcp_tw_put((struct tcp_tw_bucket *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

/* With per-bucket locks this operation is not atomic, so that
 * this version is not worse.
 */
static void __tcp_v4_rehash(struct sock *sk)
{
	sk->sk_prot->unhash(sk);
	sk->sk_prot->hash(sk);
}

static int tcp_v4_reselect_saddr(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	int err;
	struct rtable *rt;
	__u32 old_saddr = inet->saddr;
	__u32 new_saddr;
	__u32 daddr = inet->daddr;

	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	/* Query new route. */
	err = ip_route_connect(&rt, daddr, 0,
			       RT_TOS(inet->tos) | sk->sk_localroute,
			       sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, inet->dport, sk);
	if (err)
		return err;

	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;

	new_saddr = rt->rt_src;

	if (new_saddr == old_saddr)
		return 0;

	if (sysctl_ip_dynaddr > 1) {
		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
		       NIPQUAD(old_saddr),
		       NIPQUAD(new_saddr));
	}

	inet->saddr = new_saddr;
	inet->rcv_saddr = new_saddr;

	/* XXX The only one ugly spot where we need to
	 * XXX really change the sockets identity after
	 * XXX it has entered the hashes. -DaveM
	 *
	 * Besides that, it does not check for connection
	 * uniqueness. Wait for troubles.
	 */
	__tcp_v4_rehash(sk);
	return 0;
}

int tcp_v4_rebuild_header(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
	u32 daddr;
	int err;

	/* Route is OK, nothing to do. */
	if (rt)
		return 0;

	/* Reroute. */
	daddr = inet->daddr;
	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	{
		struct flowi fl = { .oif = sk->sk_bound_dev_if,
				    .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = inet->saddr,
						.tos = RT_CONN_FLAGS(sk) } },
				    .proto = IPPROTO_TCP,
				    .uli_u = { .ports =
					       { .sport = inet->sport,
						 .dport = inet->dport } } };

		err = ip_route_output_flow(&rt, &fl, sk, 0);
	}
	if (!err) {
		__sk_dst_set(sk, &rt->u.dst);
		tcp_v4_setup_caps(sk, &rt->u.dst);
		tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
		return 0;
	}

	/* Routing failed... */
	sk->sk_route_caps = 0;

	if (!sysctl_ip_dynaddr ||
	    sk->sk_state != TCP_SYN_SENT ||
	    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
	    (err = tcp_v4_reselect_saddr(sk)) != 0)
		sk->sk_err_soft = -err;

	return err;
}

static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
	struct inet_opt *inet = inet_sk(sk);

	sin->sin_family		= AF_INET;
	sin->sin_addr.s_addr	= inet->daddr;
	sin->sin_port		= inet->dport;
}

/* VJ's idea. Save the last timestamp seen from this destination
 * and hold it at least for the normal timewait interval, to use for duplicate
 * segment detection in subsequent connections, before they enter synchronized
 * state.
 */
int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->ts_recent_stamp;
			peer->tcp_ts = tp->ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
{
	struct inet_peer *peer = NULL;

	peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}

struct tcp_func ipv4_specific = {
	.queue_xmit	=	ip_queue_xmit,
	.send_check	=	tcp_v4_send_check,
	.rebuild_header	=	tcp_v4_rebuild_header,
	.conn_request	=	tcp_v4_conn_request,
	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
	.remember_stamp	=	tcp_v4_remember_stamp,
	.net_header_len	=	sizeof(struct iphdr),
	.setsockopt	=	ip_setsockopt,
	.getsockopt	=	ip_getsockopt,
	.addr2sockaddr	=	v4_addr2sockaddr,
	.sockaddr_len	=	sizeof(struct sockaddr_in),
};

/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	tp->rto  = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache_std = tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sk->sk_use_write_queue = 1;

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (tcp_sk(sk)->bind_hash)
		tcp_put_port(sk);

	/*
	 * If a sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
		list_entry(head->first, struct tcp_tw_bucket, tw_node);
}

static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_opt *tp;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct open_request *req = cur;

		tp = tcp_sk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->class->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = tp->listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&tp->syn_wait_lock);
	} else {
		tp = tcp_sk(sk);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen)
			goto start_req;
		read_unlock_bh(&tp->syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		tp = tcp_sk(sk);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&tp->syn_wait_lock);
	}
	if (++st->bucket < TCP_LHTABLE_SIZE) {
		sk = sk_head(&tcp_listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state* st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw;

		read_lock(&tcp_ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
			if (sk->sk_family != st->family)
				continue;
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		tw_for_each(tw, node,
			    &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
			if (tw->tw_family != st->family)
				continue;
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct tcp_tw_bucket *tw;
	struct hlist_node *node;
	struct tcp_iter_state* st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && tw->tw_family != st->family)
			tw = tw_next(tw);
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (++st->bucket < tcp_ehash_size) {
			read_lock(&tcp_ehash[st->bucket].lock);
			sk = sk_head(&tcp_ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state* st = seq->private;

	tcp_listen_lock();
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		tcp_listen_unlock();
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state* st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state* st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			tcp_listen_unlock();
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
			read_unlock_bh(&tp->syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			tcp_listen_unlock();
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}

static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family		= afinfo->family;
	s->seq_ops.start	= tcp_seq_start;
	s->seq_ops.next		= tcp_seq_next;
	s->seq_ops.show		= afinfo->seq_show;
	s->seq_ops.stop		= tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner		= afinfo->owner;
	afinfo->seq_fops->open		= tcp_seq_open;
	afinfo->seq_fops->read		= seq_read;
	afinfo->seq_fops->llseek	= seq_lseek;
	afinfo->seq_fops->release	= seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}

static void get_openreq4(struct sock *sk, struct open_request *req,
			 char *tmpbuf, int i, int uid)
{
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		req->af.v4_req.loc_addr,
		ntohs(inet_sk(sk)->sport),
		req->af.v4_req.rmt_addr,
		ntohs(req->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}

static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_opt *tp = tcp_sk(sp);
	struct inet_opt *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tp->timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		tp->retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}

static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}

static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */
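
/*
 * Usage sketch (illustrative userspace code, not part of this file):
 * the seq_file machinery above is what a command like "cat
 * /proc/net/tcp" exercises; tcp4_seq_show() emits one line per
 * listening, established, open-request and TIME-WAIT entry in the hex
 * format printed by get_tcp4_sock() and friends.
 */
#if 0	/* build as a standalone userspace program */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "0: 0100007F:1F90 ..." */
	fclose(f);
	return 0;
}
#endif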

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= tcp_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.slab_obj_size		= sizeof(struct tcp_sock),
};

void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}

EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_lookup_listener);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);