/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *		Juan Jose Ciarlante:	ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year
 *					coma.
 *		Andi Kleen :		Fix new listen.
 *		Andi Kleen :		Fix accept error reporting.
 *		YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *		Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *						a single port at the same time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/inet_common.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/vserver/debug.h>
extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;
/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8
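/* Eight bytes of the offending segment are all that RFC 792 guarantees
 * an ICMP error will echo back; that is just enough to recover the TCP
 * ports and the sequence number, which is all tcp_v4_err() needs.
 */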
/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);
struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	.__tcp_lhash_lock	= RW_LOCK_UNLOCKED,
	.__tcp_lhash_users	= ATOMIC_INIT(0),
	.__tcp_lhash_wait
	  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	.__tcp_portalloc_lock	= SPIN_LOCK_UNLOCKED
};
/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = 1024 - 1;
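/* The established-hash function folds the full 4-tuple (local and remote
 * address and port) into one word and then mixes the high bits down, so
 * that connections differing only in their upper address bytes still
 * spread across buckets.  The final mask assumes tcp_ehash_size is a
 * power of two.
 */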
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
{
	int h = (laddr ^ lport) ^ (faddr ^ fport);
	h ^= h >> 16;
	h ^= h >> 8;
	return h & (tcp_ehash_size - 1);
}
static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	__u32 laddr = inet->rcv_saddr;
	__u16 lport = inet->num;
	__u32 faddr = inet->daddr;
	__u16 fport = inet->dport;

	return tcp_hashfn(laddr, lport, faddr, fport);
}
/* Allocate and initialize a new TCP local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
					  unsigned short snum)
{
	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
						      SLAB_ATOMIC);
	if (tb) {
		tb->port = snum;
		tb->fastreuse = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}
/* Caller must hold hashbucket lock for this tb with local BH disabled */
void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(tcp_bucket_cachep, tb);
	}
}
/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct tcp_bind_hashbucket *head =
				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	sk_add_bind_node(child, &tb->owners);
	tcp_sk(child)->bind_hash = tb;
	spin_unlock(&head->lock);
}
inline void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}
void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
		   unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tcp_sk(sk)->bind_hash = tb;
}
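/* Two sockets conflict on a local port only if they could actually see
 * the same packets: bound to the same device (or one of them unbound),
 * not separated by IPV6_V6ONLY, and either one of them has SO_REUSEADDR
 * off, one is listening, or their receive addresses collide (including
 * the per-context check nx_addr_conflict() does in this vserver tree).
 */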
static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
{
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !tcp_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				if (nx_addr_conflict(sk->sk_nx_info,
				    tcp_v4_rcv_saddr(sk), sk2))
					break;
			}
		}
	}
	return node != NULL;
}
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_hashbucket *head;
	struct hlist_node *node;
	struct tcp_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
		do {
			rover++;
			if (rover < low || rover > high)
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			tb_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search? */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
	} else {
		head = &tcp_bhash[tcp_bhashfn(snum)];
		spin_lock(&head->lock);
		tb_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (tcp_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!tcp_sk(sk)->bind_hash)
		tcp_bind_hash(sk, tb, snum);
	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
/* Get rid of any references to a local port held by the
 * given sock.
 */
static void __tcp_put_port(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	__sk_del_bind_node(sk);
	tcp_sk(sk)->bind_hash = NULL;
	inet->num = 0;
	tcp_bucket_destroy(tb);
	spin_unlock(&head->lock);
}
void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up on each
 * exclusive lock release). It should be ifdefed really.
 */
void tcp_listen_wlock(void)
{
	write_lock(&tcp_lhash_lock);

	if (atomic_read(&tcp_lhash_users)) {
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&tcp_lhash_wait,
						  &wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&tcp_lhash_users))
				break;
			write_unlock_bh(&tcp_lhash_lock);
			schedule();
			write_lock_bh(&tcp_lhash_lock);
		}

		finish_wait(&tcp_lhash_wait, &wait);
	}
}
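/* TCP keeps listening sockets and established sockets in separate hash
 * tables: the small tcp_listening_hash is keyed by local port only,
 * while tcp_ehash is keyed by the full 4-tuple (with the second half of
 * the ehash table holding TIME_WAIT buckets, per the changelog above).
 */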
static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
	struct hlist_head *list;
	rwlock_t *lock;

	BUG_TRAP(sk_unhashed(sk));
	if (listen_possible && sk->sk_state == TCP_LISTEN) {
		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
		lock = &tcp_lhash_lock;
		tcp_listen_wlock();
	} else {
		list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->sk_hashent].lock;
		write_lock(lock);
	}
	__sk_add_node(sk, list);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(lock);
	if (listen_possible && sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}
static void tcp_v4_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__tcp_v4_hash(sk, 1);
		local_bh_enable();
	}
}
void tcp_unhash(struct sock *sk)
{
	rwlock_t *lock;

	if (sk_unhashed(sk))
		goto ende;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		tcp_listen_wlock();
		lock = &tcp_lhash_lock;
	} else {
		struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
	}

	if (__sk_del_node_init(sk))
		sock_prot_dec_use(sk->sk_prot);
	write_unlock_bh(lock);

ende:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}
/*
 *	Check if a given address matches for a tcp socket
 *
 *	nxi:	the socket's nx_info if any
 *	addr:	to be verified address
 *	saddr:	socket addresses
 */
static inline int tcp_addr_match (
	struct nx_info *nxi,
	uint32_t addr,
	uint32_t saddr)
{
	if (addr && (saddr == addr))
		return 1;
	if (!saddr)
		return addr_in_nx_info(nxi, addr);
	return 0;
}
/* Don't inline this cruft. Here are some nice properties to
 * exploit here. The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection. So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
					     unsigned short hnum, int dif)
{
	struct sock *result = NULL, *sk;
	struct hlist_node *node;
	int score, hiscore = -1;

	sk_for_each(sk, node, head) {
		struct inet_sock *inet = inet_sk(sk);

		if (inet->num == hnum && !ipv6_only_sock(sk)) {
			__u32 rcv_saddr = inet->rcv_saddr;

			score = (sk->sk_family == PF_INET ? 1 : 0);
			if (tcp_addr_match(sk->sk_nx_info, daddr, rcv_saddr))
				score += 2;
			else
				continue;
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result	= sk;
			}
		}
	}
	return result;
}
/* Optimize the common listener case. */
static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
						  unsigned short hnum, int dif)
{
	struct sock *sk = NULL;
	struct hlist_head *head;

	read_lock(&tcp_lhash_lock);
	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		struct inet_sock *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    tcp_addr_match(sk->sk_nx_info, daddr, inet->rcv_saddr) &&
		    !sk->sk_bound_dev_if)
			goto sherry_cache;
		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&tcp_lhash_lock);
	return sk;
}
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */
static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
						       u32 daddr, u16 hnum,
						       int dif)
{
	struct tcp_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	int hash = tcp_hashfn(daddr, hnum, saddr, sport);
	head = &tcp_ehash[hash];
	read_lock(&head->lock);
	sk_for_each(sk, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(&head->lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}
static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
{
	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
						      daddr, hnum, dif);

	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
}

inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				  u16 dport, int dif)
{
	struct sock *sk;

	local_bh_disable();
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
	local_bh_enable();

	return sk;
}

EXPORT_SYMBOL_GPL(tcp_v4_lookup);
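/* Initial sequence numbers are derived from the connection 4-tuple via
 * a keyed hash (see secure_tcp_sequence_number()) rather than from a
 * global counter, so off-path attackers cannot predict them; this is
 * the RFC 1948 scheme.
 */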
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
{
	struct inet_sock *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
	struct sock *sk2;
	struct hlist_node *node;
	struct tcp_tw_bucket *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
		tw = (struct tcp_tw_bucket *)sk2;

		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_sock *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one,
			   only timestamp cache is held not per host,
			   but per port pair and TW bucket is used
			   as state holder.

			   If TW bucket has been already destroyed we
			   fall back to VJ's scheme and use initial
			   timestamp retrieved from peer table.
			 */
			if (tw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tw->tw_ts_recent_stamp > 1))) {
				if ((tp->write_seq =
						tw->tw_snd_nxt + 65535 + 2) == 0)
					tp->write_seq = 1;
				tp->rx_opt.ts_recent	   = tw->tw_ts_recent;
				tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		tcp_tw_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
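/* The ephemeral port search below starts at a per-destination offset,
 * again a keyed hash (secure_tcp_port_ephemeral()), so that concurrent
 * connects to different peers do not all contend for the same ports and
 * the chosen ports are hard to guess from outside.
 */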
static inline u32 connect_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
					 inet->dport);
}
/*
 * Bind a port for a connect operation and hash it.
 */
static inline int tcp_v4_hash_connect(struct sock *sk)
{
	unsigned short snum = inet_sk(sk)->num;
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
	int ret;

	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int range = high - low;
		int i, port;
		static u32 hint;
		u32 offset = hint + connect_port_offset(sk);
		struct hlist_node *node;
		struct tcp_tw_bucket *tw = NULL;

		local_bh_disable();
		for (i = 1; i <= range; i++) {
			port = low + (i + offset) % range;
			head = &tcp_bhash[tcp_bhashfn(port)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			tb_for_each(tb, node, &head->chain) {
				if (tb->port == port) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									port,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = tcp_bucket_create(head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		tcp_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			__tcp_v4_hash(sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			tcp_tw_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_bhash[tcp_bhashfn(snum)];
	tb   = tcp_sk(sk)->bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__tcp_v4_hash(sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state TIME-WAIT
		 * and initialize rx_opt.ts_recent from it, when trying new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tp->ext2_header_len = rt->u.dst.header_len;

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}
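/* SYN-queue entries are hashed by remote address and port, salted with
 * a per-listener random value (hash_rnd) so that remote hosts cannot
 * aim all of their embryonic connections at a single bucket.
 */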
static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
{
	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
}
static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
					      struct open_request ***prevp,
					      __u16 rport,
					      __u32 raddr, __u32 laddr)
{
	struct tcp_listen_opt *lopt = tp->listen_opt;
	struct open_request *req, **prev;

	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		if (req->rmt_port == rport &&
		    req->af.v4_req.rmt_addr == raddr &&
		    req->af.v4_req.loc_addr == laddr &&
		    TCP_INET_FAMILY(req->class->family)) {
			BUG_TRAP(!req->sk);
			*prevp = prev;
			break;
		}
	}

	return req;
}
static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);

	req->expires = jiffies + TCP_TIMEOUT_INIT;
	req->retrans = 0;
	req->sk	     = NULL;
	req->dl_next = lopt->syn_table[h];

	write_lock(&tp->syn_wait_lock);
	lopt->syn_table[h] = req;
	write_unlock(&tp->syn_wait_lock);

	tcp_synq_added(sk);
}
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_pmtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
			   th->source, tcp_v4_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		tcp_tw_put((struct tcp_tw_bucket *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct open_request *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = tcp_v4_search_req(tp, &prev, th->dest,
					iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != req->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		tcp_synq_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can occur, f.e., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in the modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on parameters
 *		arriving with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst	   = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len  = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;

	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);

	tcp_tw_put(tw);
}

static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
{
	tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}
static struct dst_entry* tcp_v4_route_req(struct sock *sk,
					  struct open_request *req)
{
	struct rtable *rt;
	struct ip_options *opt = req->af.v4_req.opt;
	struct flowi fl = { .oif = sk->sk_bound_dev_if,
			    .nl_u = { .ip4_u =
				      { .daddr = ((opt && opt->srr) ?
						  opt->faddr :
						  req->af.v4_req.rmt_addr),
					.saddr = req->af.v4_req.loc_addr,
					.tos = RT_CONN_FLAGS(sk) } },
			    .proto = IPPROTO_TCP,
			    .uli_u = { .ports =
				       { .sport = inet_sk(sk)->sport,
					 .dport = req->rmt_port } } };

	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	return &rt->u.dst;
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on an open_request only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
			      struct dst_entry *dst)
{
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 req->af.v4_req.loc_addr,
					 req->af.v4_req.rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
					    req->af.v4_req.rmt_addr,
					    req->af.v4_req.opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}
/*
 *	IPv4 open_request destructor.
 */
static void tcp_v4_or_free(struct open_request *req)
{
	if (req->af.v4_req.opt)
		kfree(req->af.v4_req.opt);
}
static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}
/*
 * Save and compile IPv4 options into the open_request if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
/*
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80 bytes on a 32bit machine.
 * It would be better to replace it with a global counter for all sockets
 * but then some measure against one socket starving all other sockets
 * would be needed.
 *
 * It was 128 by default. Experiments with real servers show that
 * it is absolutely not enough even at 100conn/sec. 256 cures most
 * of the problems. This value is adjusted to 128 for very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Increasing it further requires changing the hash table size.
 */
int sysctl_max_syn_backlog = 256;
struct or_calltable or_ipv4 = {
	.family		=	PF_INET,
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_or_send_ack,
	.destructor	=	tcp_v4_or_free,
	.send_reset	=	tcp_v4_send_reset,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_options_received tmp_opt;
	struct open_request *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
		goto drop;

	req = tcp_openreq_alloc();
	if (!req)
		goto drop;

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie) {
		tcp_clear_options(&tmp_opt);
		tmp_opt.saw_tstamp = 0;
	}

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web server, which
		 * contains information interesting only for windows'
		 * users) do not send their stamp in SYN. It is easy case.
		 * We simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	req->af.v4_req.loc_addr = daddr;
	req->af.v4_req.rmt_addr = saddr;
	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
	req->class = &or_ipv4;
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			NETDEBUG(if (net_ratelimit()) \
					printk(KERN_DEBUG "TCP: drop open "
							  "request from %u.%u."
							  "%u.%u/%u\n", \
					       NIPQUAD(saddr),
					       ntohs(skb->h.th->source)));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	req->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		tcp_openreq_free(req);
	} else {
		tcp_v4_synq_add(sk, req);
	}
	return 0;

drop_and_free:
	tcp_openreq_free(req);
drop:
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	return 0;
}
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct open_request *req,
				  struct dst_entry *dst)
{
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_dst_cache = dst;
	tcp_v4_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	newinet->daddr	      = req->af.v4_req.rmt_addr;
	newinet->rcv_saddr    = req->af.v4_req.loc_addr;
	newinet->saddr	      = req->af.v4_req.loc_addr;
	newinet->opt	      = req->af.v4_req.opt;
	req->af.v4_req.opt    = NULL;
	newinet->mc_index     = tcp_v4_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newtp->ext2_header_len = dst->header_len;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_pmtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__tcp_v4_hash(newsk, 0);
	__tcp_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sock *nsk;
	struct open_request **prev;
	/* Find possible connection requests. */
	struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
						     iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
					  th->source,
					  skb->nh.iph->daddr,
					  ntohs(th->dest),
					  tcp_v4_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		tcp_tw_put((struct tcp_tw_bucket *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
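/* Incoming checksums are verified in one of three ways below: the NIC
 * has already checked the packet (CHECKSUM_HW), short packets are
 * checked here in full, and for everything else only the pseudo-header
 * sum is folded in so that copy-to-user checksumming can finish the
 * job later.
 */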
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		NETDEBUG(if (net_ratelimit())
				printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}
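/* tcp_v4_rcv() is the protocol entry point called from IP for every
 * inbound TCP segment: it validates the header, looks the segment up in
 * the established, TIME_WAIT and listening tables, and then either
 * processes it directly, prequeues it, or backlogs it when the socket
 * is owned by a user context.
 */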
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest),
			     tcp_v4_iif(skb));
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);
	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
					   skb, th, skb->len)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
							  ntohs(th->dest),
							  tcp_v4_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
			tcp_tw_put((struct tcp_tw_bucket *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
/* With per-bucket locks this operation is not-atomic, so that
 * this version is not worse.
 */
static void __tcp_v4_rehash(struct sock *sk)
{
	sk->sk_prot->unhash(sk);
	sk->sk_prot->hash(sk);
}
static int tcp_v4_reselect_saddr(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;
	struct rtable *rt;
	__u32 old_saddr = inet->saddr;
	__u32 new_saddr;
	__u32 daddr = inet->daddr;

	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	/* Query new route. */
	err = ip_route_connect(&rt, daddr, 0,
			       RT_TOS(inet->tos) | sk->sk_localroute,
			       sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, inet->dport, sk);
	if (err)
		return err;

	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;

	new_saddr = rt->rt_src;

	if (new_saddr == old_saddr)
		return 0;

	if (sysctl_ip_dynaddr > 1) {
		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
		       NIPQUAD(old_saddr),
		       NIPQUAD(new_saddr));
	}

	inet->saddr = new_saddr;
	inet->rcv_saddr = new_saddr;

	/* XXX The only one ugly spot where we need to
	 * XXX really change the sockets identity after
	 * XXX it has entered the hashes. -DaveM
	 *
	 * Besides that, it does not check for connection
	 * uniqueness. Wait for troubles.
	 */
	__tcp_v4_rehash(sk);
	return 0;
}
int tcp_v4_rebuild_header(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
	u32 daddr;
	int err;

	/* Route is OK, nothing to do. */
	if (rt)
		return 0;

	/* Reroute. */
	daddr = inet->daddr;
	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	{
		struct flowi fl = { .oif = sk->sk_bound_dev_if,
				    .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = inet->saddr,
						.tos = RT_CONN_FLAGS(sk) } },
				    .proto = IPPROTO_TCP,
				    .uli_u = { .ports =
					       { .sport = inet->sport,
						 .dport = inet->dport } } };

		err = ip_route_output_flow(&rt, &fl, sk, 0);
	}
	if (!err) {
		__sk_dst_set(sk, &rt->u.dst);
		tcp_v4_setup_caps(sk, &rt->u.dst);
		tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
		return 0;
	}

	/* Routing failed... */
	sk->sk_route_caps = 0;

	if (!sysctl_ip_dynaddr ||
	    sk->sk_state != TCP_SYN_SENT ||
	    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
	    (err = tcp_v4_reselect_saddr(sk)) != 0)
		sk->sk_err_soft = -err;

	return err;
}
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
	struct inet_sock *inet = inet_sk(sk);

	sin->sin_family		= AF_INET;
	sin->sin_addr.s_addr	= inet->daddr;
	sin->sin_port		= inet->dport;
}
/* VJ's idea. Save last timestamp seen from this destination
 * and hold it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter synchronized
 * state.
 */
int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}
int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
{
	struct inet_peer *peer = NULL;

	peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}
struct tcp_func ipv4_specific = {
	.queue_xmit	=	ip_queue_xmit,
	.send_check	=	tcp_v4_send_check,
	.rebuild_header	=	tcp_v4_rebuild_header,
	.conn_request	=	tcp_v4_conn_request,
	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
	.remember_stamp	=	tcp_v4_remember_stamp,
	.net_header_len	=	sizeof(struct iphdr),
	.setsockopt	=	ip_setsockopt,
	.getsockopt	=	ip_getsockopt,
	.addr2sockaddr	=	v4_addr2sockaddr,
	.sockaddr_len	=	sizeof(struct sockaddr_in),
};
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	tp->rto	 = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache_std = tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sk->sk_use_write_queue = 1;

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}
int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (tcp_sk(sk)->bind_hash)
		tcp_put_port(sk);

	/*
	 * If sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
		list_entry(head->first, struct tcp_tw_bucket, tw_node);
}

static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
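/* The /proc/net/tcp iterators below walk three tables in sequence
 * (listening sockets, their pending open_requests, then the established
 * and TIME_WAIT hashes), keeping their position and current table in a
 * tcp_iter_state so a read can resume across seq_file buffer refills.
 */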
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_sock *tp;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct open_request *req = cur;

		tp = tcp_sk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				vxdprintk(VXD_CBIT(net, 6),
					"sk,req: %p [#%d] (from %d)", req->sk,
					(req->sk)?req->sk->sk_xid:0, vx_current_xid());
				if (req->sk &&
					!vx_check(req->sk->sk_xid, VX_IDENT|VX_WATCH)) {
					req = req->dl_next;
					continue;
				}
				if (req->class->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = tp->listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&tp->syn_wait_lock);
	} else {
		tp = tcp_sk(sk);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen)
			goto start_req;
		read_unlock_bh(&tp->syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		vxdprintk(VXD_CBIT(net, 6), "sk: %p [#%d] (from %d)",
			sk, sk->sk_xid, vx_current_xid());
		if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		tp = tcp_sk(sk);
		read_lock_bh(&tp->syn_wait_lock);
		if (tp->listen_opt && tp->listen_opt->qlen) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&tp->syn_wait_lock);
	}
	if (++st->bucket < TCP_LHTABLE_SIZE) {
		sk = sk_head(&tcp_listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state* st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw;

		/* We can reschedule _before_ having picked the target: */
		cond_resched_softirq();

		read_lock(&tcp_ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
			vxdprintk(VXD_CBIT(net, 6),
				"sk,egf: %p [#%d] (from %d)",
				sk, sk->sk_xid, vx_current_xid());
			if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
				continue;
			if (sk->sk_family != st->family)
				continue;
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		tw_for_each(tw, node,
			    &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
			vxdprintk(VXD_CBIT(net, 6),
				"tw: %p [#%d] (from %d)",
				tw, tw->tw_xid, vx_current_xid());
			if (!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))
				continue;
			if (tw->tw_family != st->family)
				continue;
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct tcp_tw_bucket *tw;
	struct hlist_node *node;
	struct tcp_iter_state* st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family ||
			!vx_check(tw->tw_xid, VX_IDENT|VX_WATCH))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* We can reschedule between buckets: */
		cond_resched_softirq();

		if (++st->bucket < tcp_ehash_size) {
			read_lock(&tcp_ehash[st->bucket].lock);
			sk = sk_head(&tcp_ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		vxdprintk(VXD_CBIT(net, 6),
			"sk,egn: %p [#%d] (from %d)",
			sk, sk->sk_xid, vx_current_xid());
		if (!vx_check(sk->sk_xid, VX_IDENT|VX_WATCH))
			continue;
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state* st = seq->private;

	tcp_listen_lock();
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		tcp_listen_unlock();
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state* st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num	  = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state* st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			tcp_listen_unlock();
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
			read_unlock_bh(&tp->syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			tcp_listen_unlock();
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family		= afinfo->family;
	s->seq_ops.start	= tcp_seq_start;
	s->seq_ops.next		= tcp_seq_next;
	s->seq_ops.show		= afinfo->seq_show;
	s->seq_ops.stop		= tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner		= afinfo->owner;
	afinfo->seq_fops->open		= tcp_seq_open;
	afinfo->seq_fops->read		= seq_read;
	afinfo->seq_fops->llseek	= seq_lseek;
	afinfo->seq_fops->release	= seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}
void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
static void get_openreq4(struct sock *sk, struct open_request *req,
			 char *tmpbuf, int i, int uid)
{
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		req->af.v4_req.loc_addr,
		ntohs(inet_sk(sk)->sport),
		req->af.v4_req.rmt_addr,
		ntohs(req->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}
static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sp);
	struct inet_sock *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tp->timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		tp->retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};
int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= tcp_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.slab_obj_size		= sizeof(struct tcp_sock),
};
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);