2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol (TCP).
8 * Version: $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
23 #include <linux/config.h>
25 #include <linux/module.h>
26 #include <linux/sysctl.h>
27 #include <linux/workqueue.h>
29 #include <net/inet_common.h>
32 #include <linux/vs_limit.h>
33 #include <linux/vs_socket.h>
36 #define SYNC_INIT 0 /* let the user enable it */
41 int sysctl_tcp_tw_recycle;
42 int sysctl_tcp_max_tw_buckets = NR_FILE*2;
44 int sysctl_tcp_syncookies = SYNC_INIT;
45 int sysctl_tcp_abort_on_overflow;
47 static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
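/* tcp_in_window() checks whether any part of the segment [seq, end_seq]
 * falls inside the advertised receive window [s_win, e_win): either the
 * segment genuinely overlaps the window (after(end_seq, s_win) &&
 * before(seq, e_win)), or a zero-length segment sits exactly on the
 * right edge (seq == end_seq == e_win).
 */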
49 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
53 if (after(end_seq, s_win) && before(seq, e_win))
55 return (seq == e_win && seq == end_seq);
58 /* New-style handling of TIME_WAIT sockets. */
63 /* Must be called with locally disabled BHs. */
64 static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
66 struct tcp_ehash_bucket *ehead;
67 struct tcp_bind_hashbucket *bhead;
68 struct tcp_bind_bucket *tb;
70 /* Unlink from established hashes. */
71 ehead = &tcp_ehash[tw->tw_hashent];
72 write_lock(&ehead->lock);
73 if (hlist_unhashed(&tw->tw_node)) {
74 write_unlock(&ehead->lock);
77 __hlist_del(&tw->tw_node);
78 sk_node_init(&tw->tw_node);
79 write_unlock(&ehead->lock);
81 /* Disassociate with bind bucket. */
82 bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
83 spin_lock(&bhead->lock);
85 __hlist_del(&tw->tw_bind_node);
87 tcp_bucket_destroy(tb);
88 spin_unlock(&bhead->lock);
90 #ifdef INET_REFCNT_DEBUG
91 if (atomic_read(&tw->tw_refcnt) != 1) {
92 printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
93 atomic_read(&tw->tw_refcnt));
100 * * The main purpose of the TIME-WAIT state is to close the connection gracefully,
101 * when one of the ends sits in LAST-ACK or CLOSING retransmitting its FIN
102 * (and, probably, a tail of data) and one or more of our ACKs are lost.
103 * * What is the TIME-WAIT timeout? It is associated with the maximal packet
104 * lifetime in the internet, which leads to the wrong conclusion that
105 * it is set to catch "old duplicate segments" wandering out of their path.
106 * That is not quite correct. This timeout is calculated so that it exceeds the
107 * maximal retransmission timeout by enough to allow the loss of one (or more)
108 * segments sent by the peer and of our ACKs. This time may be calculated from the RTO.
109 * * When a TIME-WAIT socket receives an RST, it means that the other end
110 * has finally closed and we are allowed to kill the TIME-WAIT too.
111 * * The second purpose of TIME-WAIT is catching old duplicate segments.
112 * Well, certainly it is pure paranoia, but if we load TIME-WAIT
113 * with this semantics, we MUST NOT kill the TIME-WAIT state with RSTs.
114 * * If we invented some more clever way to catch duplicates
115 * (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
117 * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
118 * When you compare it to the RFCs, please read the section SEGMENT ARRIVES
119 * from the very beginning.
121 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
122 * is _not_ stateless. It means that, strictly speaking, we ought to
123 * spinlock it. I do not want to! Well, the probability of misbehaviour
124 * is ridiculously low and, it seems, we could use some mb() tricks
125 * to avoid misreading sequence numbers, states etc. --ANK
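/* The return value tells the caller (tcp_v4_rcv()/tcp_v6_rcv()), roughly,
 * what to do with the segment: TCP_TW_SUCCESS means it has been fully
 * consumed, TCP_TW_ACK asks the caller to send an ACK, TCP_TW_RST asks
 * for a reset, and TCP_TW_SYN means the SYN may be passed on to a
 * matching listening socket (see the isn hint set further down).
 */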
128 tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
129 struct tcphdr *th, unsigned len)
131 struct tcp_options_received tmp_opt;
134 tmp_opt.saw_tstamp = 0;
135 if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
136 tcp_parse_options(skb, &tmp_opt, 0);
138 if (tmp_opt.saw_tstamp) {
139 tmp_opt.ts_recent = tw->tw_ts_recent;
140 tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
141 paws_reject = tcp_paws_check(&tmp_opt, th->rst);
145 if (tw->tw_substate == TCP_FIN_WAIT2) {
146 /* Just repeat all the checks of tcp_rcv_state_process() */
148 /* Out of window, send ACK */
150 !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
152 tw->tw_rcv_nxt + tw->tw_rcv_wnd))
158 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
162 if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
163 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
165 return TCP_TW_SUCCESS;
168 /* New data or FIN. If new data arrive after half-duplex close,
172 TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
174 tcp_tw_deschedule(tw);
179 /* FIN arrived, enter true time-wait state. */
180 tw->tw_substate = TCP_TIME_WAIT;
181 tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
182 if (tmp_opt.saw_tstamp) {
183 tw->tw_ts_recent_stamp = xtime.tv_sec;
184 tw->tw_ts_recent = tmp_opt.rcv_tsval;
187 /* I am ashamed, but I failed to make it more elegant.
188 * Yes, it is a direct reference to IP, which is impossible
189 * to generalize to IPv6. Taking into account that IPv6
190 * does not understand recycling in any case, it is not
191 * a big problem in practice. --ANK */
192 if (tw->tw_family == AF_INET &&
193 sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
194 tcp_v4_tw_remember_stamp(tw))
195 tcp_tw_schedule(tw, tw->tw_timeout);
197 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
202 * Now real TIME-WAIT state.
205 * "When a connection is [...] in TIME-WAIT state [...]
206 * [a TCP] MAY accept a new SYN from the remote TCP to
207 * reopen the connection directly, if it:
209 * (1) assigns its initial sequence number for the new
210 * connection to be larger than the largest sequence
211 * number it used on the previous connection incarnation,
214 * (2) returns to TIME-WAIT state if the SYN turns out
215 * to be an old duplicate".
219 (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
220 (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
221 /* In-window segment; it may only be a reset or a bare ack. */
224 /* This is TIME_WAIT assassination, in two flavors.
225 * Oh well... nobody has a sufficient solution to this
228 if (sysctl_tcp_rfc1337 == 0) {
230 tcp_tw_deschedule(tw);
232 return TCP_TW_SUCCESS;
235 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
237 if (tmp_opt.saw_tstamp) {
238 tw->tw_ts_recent = tmp_opt.rcv_tsval;
239 tw->tw_ts_recent_stamp = xtime.tv_sec;
243 return TCP_TW_SUCCESS;
246 /* Out of window segment.
248 All the segments are ACKed immediately.
250 The only exception is a new SYN. We accept it only if it is
251 not an old duplicate and we are not in danger of being killed
252 by delayed old duplicates. The RFC check (that it carries a
253 newer sequence number) works at rates <40Mbit/sec.
254 However, if PAWS works, it is reliable, and what is more,
255 we may even relax the silly seq space cutoff.
257 RED-PEN: we violate the main RFC requirement: if this SYN turns out to be an
258 old duplicate (i.e. we receive an RST in reply to our SYN-ACK),
259 we must return the socket to time-wait state. It is not good,
263 if (th->syn && !th->rst && !th->ack && !paws_reject &&
264 (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
265 (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
266 u32 isn = tw->tw_snd_nxt + 65535 + 2;
269 TCP_SKB_CB(skb)->when = isn;
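/* Per the RFC 1122 rule quoted above, the new connection's ISN must lie
 * above anything used by the old incarnation; tw_snd_nxt + 65535 + 2
 * clears the old sequence space by a full maximal window plus a little
 * slack.  The value is stashed in TCP_SKB_CB(skb)->when so that the
 * listening socket's connection-request path can use it as the initial
 * sequence number for the new connection.
 */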
274 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
277 /* In this case we must reset the TIMEWAIT timer.
279 * If it is an ACKless SYN it may be both an old duplicate
280 * and a new good SYN with a random sequence number < rcv_nxt.
281 * Do not reschedule in the latter case.
283 if (paws_reject || th->ack)
284 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
286 /* Send ACK. Note, we do not put the bucket,
287 * it will be released by caller.
292 return TCP_TW_SUCCESS;
295 /* Enter the time wait state. This is called with locally disabled BH.
296 * Essentially we whip up a timewait bucket, copy the
297 * relevant info into it from the SK, and mess with hash chains
300 static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
302 struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
303 struct tcp_bind_hashbucket *bhead;
305 /* Step 1: Put TW into bind hash. Original socket stays there too.
306 Note that any socket with inet_sk(sk)->num != 0 MUST be bound in the
307 binding cache, even if it is closed.
309 bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
310 spin_lock(&bhead->lock);
311 tw->tw_tb = tcp_sk(sk)->bind_hash;
312 BUG_TRAP(tcp_sk(sk)->bind_hash);
313 tw_add_bind_node(tw, &tw->tw_tb->owners);
314 spin_unlock(&bhead->lock);
316 write_lock(&ehead->lock);
318 /* Step 2: Remove SK from established hash. */
319 if (__sk_del_node_init(sk))
320 sock_prot_dec_use(sk->sk_prot);
322 /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
323 tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
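/* The established hash table is sized for twice tcp_ehash_size buckets:
 * the lower half holds live sockets and the upper half holds TIME-WAIT
 * buckets, which is why tcp_ehash_size is added to the bucket pointer.
 */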
324 atomic_inc(&tw->tw_refcnt);
326 write_unlock(&ehead->lock);
330 * Move a socket to time-wait or dead fin-wait-2 state.
332 void tcp_time_wait(struct sock *sk, int state, int timeo)
334 struct tcp_tw_bucket *tw = NULL;
335 struct tcp_sock *tp = tcp_sk(sk);
338 if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
339 recycle_ok = tp->af_specific->remember_stamp(sk);
341 if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
342 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
345 struct inet_sock *inet = inet_sk(sk);
346 int rto = (tp->rto<<2) - (tp->rto>>1);
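/* (rto << 2) - (rto >> 1) is 4*RTO - RTO/2, i.e. the 3.5*RTO interval
 * discussed in tcp_tw_schedule(): long enough to cover the ACK of the
 * peer's FIN plus two retransmissions of it.
 */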
348 /* Give us an identity. */
349 tw->tw_daddr = inet->daddr;
350 tw->tw_rcv_saddr = inet->rcv_saddr;
351 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
352 tw->tw_num = inet->num;
353 tw->tw_state = TCP_TIME_WAIT;
354 tw->tw_substate = state;
355 tw->tw_sport = inet->sport;
356 tw->tw_dport = inet->dport;
357 tw->tw_family = sk->sk_family;
358 tw->tw_reuse = sk->sk_reuse;
359 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
360 atomic_set(&tw->tw_refcnt, 1);
362 tw->tw_hashent = sk->sk_hashent;
363 tw->tw_rcv_nxt = tp->rcv_nxt;
364 tw->tw_snd_nxt = tp->snd_nxt;
365 tw->tw_rcv_wnd = tcp_receive_window(tp);
366 tw->tw_ts_recent = tp->rx_opt.ts_recent;
367 tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
368 tw_dead_node_init(tw);
370 tw->tw_xid = sk->sk_xid;
371 tw->tw_vx_info = NULL;
372 tw->tw_nid = sk->sk_nid;
373 tw->tw_nx_info = NULL;
375 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
376 if (tw->tw_family == PF_INET6) {
377 struct ipv6_pinfo *np = inet6_sk(sk);
379 ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
380 ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
381 tw->tw_v6_ipv6only = np->ipv6only;
383 memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
384 memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
385 tw->tw_v6_ipv6only = 0;
388 /* Linkage updates. */
389 __tcp_tw_hashdance(sk, tw);
391 /* Get the TIME_WAIT timeout firing. */
396 tw->tw_timeout = rto;
398 tw->tw_timeout = TCP_TIMEWAIT_LEN;
399 if (state == TCP_TIME_WAIT)
400 timeo = TCP_TIMEWAIT_LEN;
403 tcp_tw_schedule(tw, timeo);
406 /* Sorry, if we're out of memory, just CLOSE this
407 * socket up. We've got bigger problems than
408 * non-graceful socket closings.
411 printk(KERN_INFO "TCP: time wait bucket table overflow\n");
414 tcp_update_metrics(sk);
418 /* Kill off TIME_WAIT sockets once their lifetime has expired. */
419 static int tcp_tw_death_row_slot;
421 static void tcp_twkill(unsigned long);
423 /* TIME_WAIT reaping mechanism. */
424 #define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
425 #define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
427 #define TCP_TWKILL_QUOTA 100
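/* With TCP_TIMEWAIT_LEN at 60 seconds this gives a slot period of 7.5
 * seconds: every period the timer advances one slot and reaps at most
 * TCP_TWKILL_QUOTA buckets inline, deferring the rest of the slot to the
 * twkill_work workqueue so the timer handler stays short.
 */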
429 static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
430 static DEFINE_SPINLOCK(tw_death_lock);
431 static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
432 static void twkill_work(void *);
433 static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
434 static u32 twkill_thread_slots;
436 /* Returns non-zero if quota exceeded. */
437 static int tcp_do_twkill_work(int slot, unsigned int quota)
439 struct tcp_tw_bucket *tw;
440 struct hlist_node *node;
444 /* NOTE: compare this to the previous version where the lock
445 * was released after detaching the chain. It was racy,
446 * because tw buckets are scheduled in a non-serialized context
447 * in 2.3 (with netfilter), and with softnet it is common, because
448 * soft irqs are not sequenced.
453 tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
454 __tw_del_dead_node(tw);
455 spin_unlock(&tw_death_lock);
456 tcp_timewait_kill(tw);
459 spin_lock(&tw_death_lock);
460 if (killed > quota) {
465 /* While we dropped tw_death_lock, another cpu may have
466 * killed off the next TW bucket in the list, therefore
467 * do a fresh re-read of the hlist head node with the
468 * lock reacquired. We still use the hlist traversal
469 * macro in order to get the prefetches.
474 tcp_tw_count -= killed;
475 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
480 static void tcp_twkill(unsigned long dummy)
484 spin_lock(&tw_death_lock);
486 if (tcp_tw_count == 0)
490 ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
492 twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
494 schedule_work(&tcp_twkill_work);
497 /* We purged the entire slot, anything left? */
501 tcp_tw_death_row_slot =
502 ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
504 mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
506 spin_unlock(&tw_death_lock);
509 extern void twkill_slots_invalid(void);
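/* twkill_slots_invalid() intentionally has no definition: the check below
 * is a compile-time constant, so if TCP_TWKILL_SLOTS ever grew past the
 * bit width of twkill_thread_slots the call would survive constant
 * folding and the kernel would fail to link, flagging the bug at build time.
 */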
511 static void twkill_work(void *dummy)
515 if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
516 twkill_slots_invalid();
518 while (twkill_thread_slots) {
519 spin_lock_bh(&tw_death_lock);
520 for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
521 if (!(twkill_thread_slots & (1 << i)))
524 while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
525 if (need_resched()) {
526 spin_unlock_bh(&tw_death_lock);
528 spin_lock_bh(&tw_death_lock);
532 twkill_thread_slots &= ~(1 << i);
534 spin_unlock_bh(&tw_death_lock);
538 /* These are always called from BH context. See callers in
539 * tcp_input.c to verify this.
542 /* This is for handling early-kills of TIME_WAIT sockets. */
543 void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
545 spin_lock(&tw_death_lock);
546 if (tw_del_dead_node(tw)) {
548 if (--tcp_tw_count == 0)
549 del_timer(&tcp_tw_timer);
551 spin_unlock(&tw_death_lock);
552 tcp_timewait_kill(tw);
555 /* Short-time timewait calendar */
557 static int tcp_twcal_hand = -1;
558 static int tcp_twcal_jiffie;
559 static void tcp_twcal_tick(unsigned long);
560 static struct timer_list tcp_twcal_timer =
561 TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
562 static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
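/* Two scheduling regimes coexist: short recycle-style timeouts go onto the
 * fine-grained tcp_twcal_row calendar driven by tcp_twcal_timer, while the
 * plain 60-second TIME-WAIT (and anything too long for the calendar) goes
 * onto the coarse 8-slot tcp_tw_death_row driven by tcp_tw_timer.
 * tcp_tw_schedule() below picks the wheel based on the requested timeout.
 */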
564 static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
566 struct hlist_head *list;
569 /* timeout := RTO * 3.5
571 * 3.5 = 1+2+0.5 to wait for two retransmits.
573 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
574 * our ACK acking that FIN can be lost. If N subsequent retransmitted
575 * FINs (or previous segments) are lost, the probability of such an event
576 * is p^(N+1), where p is the probability of losing a single packet, and
577 * the time to detect the loss is about RTO*(2^N - 1) with exponential
578 * backoff. The normal timewait length is calculated so that we have
579 * waited at least for one retransmitted FIN (the maximal RTO is 120sec).
580 * [ BTW Linux, following BSD, violates this requirement, waiting
581 * only for 60sec; we should wait at least 240 secs.
582 * Well, 240 consumes too many resources 8)
584 * This interval is not reduced to catch old duplicates and
585 * responses to our wandering segments living for two MSLs.
586 * However, if we use PAWS to detect
587 * old duplicates, we can reduce the interval to the bounds required
588 * by the RTO, rather than the MSL. So, if the peer understands PAWS, we
589 * kill the tw bucket after 3.5*RTO (it is important that this number
590 * is greater than the TS tick!) and detect old duplicates with the help
593 slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
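/* Round the timeout up to whole recycle ticks (a ceiling division by
 * 2^TCP_TW_RECYCLE_TICK jiffies); slots beyond TCP_TW_RECYCLE_SLOTS fall
 * through to the slow death-row timer below.
 */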
595 spin_lock(&tw_death_lock);
597 /* Unlink it, if it was scheduled */
598 if (tw_del_dead_node(tw))
601 atomic_inc(&tw->tw_refcnt);
603 if (slot >= TCP_TW_RECYCLE_SLOTS) {
604 /* Schedule to slow timer */
605 if (timeo >= TCP_TIMEWAIT_LEN) {
606 slot = TCP_TWKILL_SLOTS-1;
608 slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
609 if (slot >= TCP_TWKILL_SLOTS)
610 slot = TCP_TWKILL_SLOTS-1;
612 tw->tw_ttd = jiffies + timeo;
613 slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
614 list = &tcp_tw_death_row[slot];
616 tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
618 if (tcp_twcal_hand < 0) {
620 tcp_twcal_jiffie = jiffies;
621 tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
622 add_timer(&tcp_twcal_timer);
624 if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
625 mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
626 slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
628 list = &tcp_twcal_row[slot];
631 hlist_add_head(&tw->tw_death_node, list);
633 if (tcp_tw_count++ == 0)
634 mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
635 spin_unlock(&tw_death_lock);
638 void tcp_twcal_tick(unsigned long dummy)
642 unsigned long now = jiffies;
646 spin_lock(&tw_death_lock);
647 if (tcp_twcal_hand < 0)
650 slot = tcp_twcal_hand;
651 j = tcp_twcal_jiffie;
653 for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
654 if (time_before_eq(j, now)) {
655 struct hlist_node *node, *safe;
656 struct tcp_tw_bucket *tw;
658 tw_for_each_inmate_safe(tw, node, safe,
659 &tcp_twcal_row[slot]) {
660 __tw_del_dead_node(tw);
661 tcp_timewait_kill(tw);
668 tcp_twcal_jiffie = j;
669 tcp_twcal_hand = slot;
672 if (!hlist_empty(&tcp_twcal_row[slot])) {
673 mod_timer(&tcp_twcal_timer, j);
677 j += (1<<TCP_TW_RECYCLE_TICK);
678 slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
683 if ((tcp_tw_count -= killed) == 0)
684 del_timer(&tcp_tw_timer);
685 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
686 spin_unlock(&tw_death_lock);
689 /* This is not only more efficient than what we used to do, it eliminates
690 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
692 * Actually, we could avoid lots of memory writes here. tp of the listening
693 * socket contains all the necessary default parameters.
695 struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
697 /* allocate the newsk from the same slab as the master sock;
698 * if not, at sk_free time we'll try to free it from the wrong
699 * slabcache (i.e. is it TCPv4 or v6?) -acme */
700 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0, sk->sk_prot->slab);
703 struct tcp_sock *newtp;
704 struct sk_filter *filter;
706 memcpy(newsk, sk, sizeof(struct tcp_sock));
707 newsk->sk_state = TCP_SYN_RECV;
712 sk_node_init(&newsk->sk_node);
713 tcp_sk(newsk)->bind_hash = NULL;
715 /* Clone the TCP header template */
716 inet_sk(newsk)->dport = req->rmt_port;
718 sock_lock_init(newsk);
721 rwlock_init(&newsk->sk_dst_lock);
722 atomic_set(&newsk->sk_rmem_alloc, 0);
723 skb_queue_head_init(&newsk->sk_receive_queue);
724 atomic_set(&newsk->sk_wmem_alloc, 0);
725 skb_queue_head_init(&newsk->sk_write_queue);
726 atomic_set(&newsk->sk_omem_alloc, 0);
727 newsk->sk_wmem_queued = 0;
728 newsk->sk_forward_alloc = 0;
730 sock_reset_flag(newsk, SOCK_DONE);
731 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
732 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
733 newsk->sk_send_head = NULL;
734 rwlock_init(&newsk->sk_callback_lock);
735 skb_queue_head_init(&newsk->sk_error_queue);
736 newsk->sk_write_space = sk_stream_write_space;
738 if ((filter = newsk->sk_filter) != NULL)
739 sk_filter_charge(newsk, filter);
741 if (unlikely(xfrm_sk_clone_policy(newsk))) {
742 /* It is still a raw copy of the parent, so invalidate the
743 * destructor and do a plain sk_free() */
744 newsk->sk_destruct = NULL;
749 /* Now setup tcp_sock */
750 newtp = tcp_sk(newsk);
751 newtp->pred_flags = 0;
752 newtp->rcv_nxt = req->rcv_isn + 1;
753 newtp->snd_nxt = req->snt_isn + 1;
754 newtp->snd_una = req->snt_isn + 1;
755 newtp->snd_sml = req->snt_isn + 1;
757 tcp_prequeue_init(newtp);
759 tcp_init_wl(newtp, req->snt_isn, req->rcv_isn);
761 newtp->retransmits = 0;
764 newtp->mdev = TCP_TIMEOUT_INIT;
765 newtp->rto = TCP_TIMEOUT_INIT;
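/* No RTT samples exist for the new socket yet, so start from the
 * conservative TCP_TIMEOUT_INIT (3 seconds) and let the first
 * measurements on the established connection replace it.
 */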
767 newtp->packets_out = 0;
769 newtp->retrans_out = 0;
770 newtp->sacked_out = 0;
771 newtp->fackets_out = 0;
772 newtp->snd_ssthresh = 0x7fffffff;
774 /* So many TCP implementations out there (incorrectly) count the
775 * initial SYN frame in their delayed-ACK and congestion control
776 * algorithms that we must have the following bandaid to talk
777 * efficiently to them. -DaveM
780 newtp->snd_cwnd_cnt = 0;
782 newtp->frto_counter = 0;
783 newtp->frto_highmark = 0;
785 tcp_set_ca_state(newtp, TCP_CA_Open);
786 tcp_init_xmit_timers(newsk);
787 skb_queue_head_init(&newtp->out_of_order_queue);
788 newtp->rcv_wup = req->rcv_isn + 1;
789 newtp->write_seq = req->snt_isn + 1;
790 newtp->pushed_seq = newtp->write_seq;
791 newtp->copied_seq = req->rcv_isn + 1;
793 newtp->rx_opt.saw_tstamp = 0;
795 newtp->rx_opt.dsack = 0;
796 newtp->rx_opt.eff_sacks = 0;
798 newtp->probes_out = 0;
799 newtp->rx_opt.num_sacks = 0;
801 newtp->listen_opt = NULL;
802 newtp->accept_queue = newtp->accept_queue_tail = NULL;
803 /* Deinitialize syn_wait_lock to trap illegal accesses. */
804 memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
806 /* Back to base struct sock members. */
808 newsk->sk_priority = 0;
809 atomic_set(&newsk->sk_refcnt, 2);
811 set_vx_info(&newsk->sk_vx_info, sk->sk_vx_info);
812 newsk->sk_xid = sk->sk_xid;
814 set_nx_info(&newsk->sk_nx_info, sk->sk_nx_info);
815 newsk->sk_nid = sk->sk_nid;
816 #ifdef INET_REFCNT_DEBUG
817 atomic_inc(&inet_sock_nr);
819 atomic_inc(&tcp_sockets_allocated);
821 if (sock_flag(newsk, SOCK_KEEPOPEN))
822 tcp_reset_keepalive_timer(newsk,
823 keepalive_time_when(newtp));
824 newsk->sk_socket = NULL;
825 newsk->sk_sleep = NULL;
826 newsk->sk_owner = NULL;
828 newtp->rx_opt.tstamp_ok = req->tstamp_ok;
829 if((newtp->rx_opt.sack_ok = req->sack_ok) != 0) {
831 newtp->rx_opt.sack_ok |= 2;
833 newtp->window_clamp = req->window_clamp;
834 newtp->rcv_ssthresh = req->rcv_wnd;
835 newtp->rcv_wnd = req->rcv_wnd;
836 newtp->rx_opt.wscale_ok = req->wscale_ok;
837 if (newtp->rx_opt.wscale_ok) {
838 newtp->rx_opt.snd_wscale = req->snd_wscale;
839 newtp->rx_opt.rcv_wscale = req->rcv_wscale;
841 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
842 newtp->window_clamp = min(newtp->window_clamp, 65535U);
844 newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
845 newtp->max_window = newtp->snd_wnd;
847 if (newtp->rx_opt.tstamp_ok) {
848 newtp->rx_opt.ts_recent = req->ts_recent;
849 newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
850 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
852 newtp->rx_opt.ts_recent_stamp = 0;
853 newtp->tcp_header_len = sizeof(struct tcphdr);
855 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
856 newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
857 newtp->rx_opt.mss_clamp = req->mss;
858 TCP_ECN_openreq_child(newtp, req);
859 if (newtp->ecn_flags&TCP_ECN_OK)
860 newsk->sk_no_largesend = 1;
864 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
870 * Process an incoming packet for SYN_RECV sockets represented
871 * as an open_request.
874 struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
875 struct open_request *req,
876 struct open_request **prev)
878 struct tcphdr *th = skb->h.th;
879 struct tcp_sock *tp = tcp_sk(sk);
880 u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
882 struct tcp_options_received tmp_opt;
885 tmp_opt.saw_tstamp = 0;
886 if (th->doff > (sizeof(struct tcphdr)>>2)) {
887 tcp_parse_options(skb, &tmp_opt, 0);
889 if (tmp_opt.saw_tstamp) {
890 tmp_opt.ts_recent = req->ts_recent;
891 /* We do not store true stamp, but it is not required,
892 * it can be estimated (approximately)
895 tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
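/* Each SYN-ACK retransmission roughly doubles the wait, so the timestamp
 * carried by the original SYN was seen approximately
 * (TCP_TIMEOUT_INIT/HZ) << req->retrans seconds ago; that estimate is
 * good enough for the PAWS check that follows.
 */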
896 paws_reject = tcp_paws_check(&tmp_opt, th->rst);
900 /* Check for pure retransmitted SYN. */
901 if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
902 flg == TCP_FLAG_SYN &&
905 * RFC793 draws this case (incorrectly! It was fixed in RFC1122)
906 * in figure 6 and figure 8, but the formal
907 * protocol description says NOTHING.
908 * To be more exact, it says that we should send an ACK,
909 * because this segment (at least, if it has no data)
912 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
913 * describe the SYN-RECV state. All the description
914 * is wrong, we cannot believe it and should
915 * rely only on common sense and implementation
918 * Enforce "SYN-ACK" according to figure 8, figure 6
919 * of RFC793, fixed by RFC1122.
921 req->class->rtx_syn_ack(sk, req, NULL);
925 /* Further reproduces section "SEGMENT ARRIVES"
926 for state SYN-RECEIVED of RFC793.
927 It is broken, however; it fails only
928 when SYNs are crossed.
930 You would think that SYN crossing is impossible here, since
931 we should have a SYN_SENT socket (from connect()) on our end,
932 but this is not true if the crossed SYNs were sent to both
933 ends by a malicious third party. We must defend against this,
934 and to do that we first verify the ACK (as per RFC793, page
935 36) and reset if it is invalid. Is this a true full defense?
936 To convince ourselves, let us consider a way in which the ACK
937 test can still pass in this 'malicious crossed SYNs' case.
938 Malicious sender sends identical SYNs (and thus identical sequence
939 numbers) to both A and B:
944 By our good fortune, both A and B select the same initial
945 send sequence number of seven :-)
947 A: sends SYN|ACK, seq=7, ack_seq=8
948 B: sends SYN|ACK, seq=7, ack_seq=8
950 So we are now A, eating this SYN|ACK; the ACK test passes. So
951 does the sequence test; the SYN is truncated, and thus we consider
954 If tp->defer_accept, we silently drop this bare ACK. Otherwise,
955 we create an established connection. Both ends (listening sockets)
956 accept the new incoming connection and try to talk to each other. 8-)
958 Note: This case is both harmless and rare. The probability is about the
959 same as us discovering intelligent life on another planet tomorrow.
961 But generally, we should (the RFC lies!) accept an ACK
962 from a SYNACK both here and in tcp_rcv_state_process().
963 tcp_rcv_state_process() does not, hence we do not either.
965 Note that the case is absolutely generic:
966 we cannot optimize anything here without
967 violating the protocol. All the checks must be made
968 before attempting to create the socket.
971 /* RFC793 page 36: "If the connection is in any non-synchronized state ...
972 * and the incoming segment acknowledges something not yet
973 * sent (the segment carries an unacceptable ACK) ...
976 * Invalid ACK: reset will be sent by listening socket
978 if ((flg & TCP_FLAG_ACK) &&
979 (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1))
982 /* Also, it would not be such a bad idea to check rcv_tsecr, which
983 * is essentially an ACK extension, and too-early or too-late values
984 * should cause a reset in unsynchronized states.
987 /* RFC793: "first check sequence number". */
989 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
990 req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
991 /* Out of window: send ACK and drop. */
992 if (!(flg & TCP_FLAG_RST))
993 req->class->send_ack(skb, req);
995 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
999 /* In sequence, PAWS is OK. */
1001 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
1002 req->ts_recent = tmp_opt.rcv_tsval;
1004 if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
1005 /* Truncate SYN, it is out of window starting
1006 at req->rcv_isn+1. */
1007 flg &= ~TCP_FLAG_SYN;
1010 /* RFC793: "second check the RST bit" and
1011 * "fourth, check the SYN bit"
1013 if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
1014 goto embryonic_reset;
1016 /* ACK sequence verified above, just make sure ACK is
1017 * set. If ACK not set, just silently drop the packet.
1019 if (!(flg & TCP_FLAG_ACK))
1022 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
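/* end_seq == rcv_isn+1 means the ACK carries no payload beyond the SYN.
 * With TCP_DEFER_ACCEPT the child socket is only created once real data
 * arrives, so such a bare ACK is dropped here and the open_request
 * simply stays queued.
 */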
1023 if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) {
1028 /* OK, ACK is valid, create big socket and
1029 * feed this segment to it. It will repeat all
1030 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
1031 * ESTABLISHED STATE. If it is dropped after the
1032 * socket is created, expect trouble.
1034 child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
1036 goto listen_overflow;
1038 sk_set_owner(child, sk->sk_owner);
1039 tcp_synq_unlink(tp, req, prev);
1040 tcp_synq_removed(sk, req);
1042 tcp_acceptq_queue(sk, req, child);
1046 if (!sysctl_tcp_abort_on_overflow) {
1052 NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
1053 if (!(flg & TCP_FLAG_RST))
1054 req->class->send_reset(skb);
1056 tcp_synq_drop(sk, req, prev);
1061 * Queue segment on the new socket if the new socket is active,
1062 * otherwise we just shortcircuit this and continue with
1066 int tcp_child_process(struct sock *parent, struct sock *child,
1067 struct sk_buff *skb)
1070 int state = child->sk_state;
1072 if (!sock_owned_by_user(child)) {
1073 ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
1075 /* Wakeup parent, send SIGIO */
1076 if (state == TCP_SYN_RECV && child->sk_state != state)
1077 parent->sk_data_ready(parent, 0);
1079 /* Alas, it is possible again, because we do the lookup
1080 * in the main socket hash table and the lock on the listening
1081 * socket does not protect us any more.
1083 sk_add_backlog(child, skb);
1086 bh_unlock_sock(child);
1091 EXPORT_SYMBOL(tcp_check_req);
1092 EXPORT_SYMBOL(tcp_child_process);
1093 EXPORT_SYMBOL(tcp_create_openreq_child);
1094 EXPORT_SYMBOL(tcp_timewait_state_process);
1095 EXPORT_SYMBOL(tcp_tw_deschedule);