/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul :	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 */
#include <net/tcp.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/smp_lock.h>
/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;

/* This limits the percentage of the congestion window which we
 * will allow a single TSO frame to consume.  Building TSO frames
 * which are too large can cause TCP streams to be bursty.
 */
int sysctl_tcp_tso_win_divisor = 8;
void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
{
	sk->sk_send_head = skb->next;
	if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
		sk->sk_send_head = NULL;
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
	tcp_packets_out_inc(sk, tp, skb);
}
/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
{
	if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
		return tp->snd_nxt;
	else
		return tp->snd_una+tp->snd_wnd;
}
/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MTUs.
 * 4. We do not make 3, we advertise MSS, calculated from first
 *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
 *    This may be overridden via information stored in routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	int mss = tp->advmss;

	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
		mss = dst_metric(dst, RTAX_ADVMSS);
		tp->advmss = mss;
	}

	return (__u16)mss;
}
/* RFC2861. Reset CWND after idle period longer than RTO to "restart window".
 * This is the first part of cwnd validation mechanism.
 */
static void tcp_cwnd_restart(struct tcp_opt *tp, struct dst_entry *dst)
{
	s32 delta = tcp_time_stamp - tp->lsndtime;
	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
	u32 cwnd = tp->snd_cwnd;

	if (tcp_is_vegas(tp))
		tcp_vegas_enable(tp);

	tp->snd_ssthresh = tcp_current_ssthresh(tp);
	restart_cwnd = min(restart_cwnd, cwnd);

	while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->snd_cwnd_used = 0;
}
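/*
 * Illustrative note (not from the original source): the loop above halves
 * cwnd once for every full RTO of idle time, but never below the restart
 * window.  For example, with snd_cwnd = 16, restart_cwnd = 2 and an idle
 * period a little over 3*RTO, the successive halvings give 16 -> 8 -> 4 -> 2,
 * so the connection resumes sending with snd_cwnd = 2.
 */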
static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb, struct sock *sk)
{
	u32 now = tcp_time_stamp;

	if (!tcp_get_pcount(&tp->packets_out) &&
	    (s32)(now - tp->lsndtime) > tp->rto)
		tcp_cwnd_restart(tp, __sk_dst_get(sk));

	tp->lsndtime = now;

	/* If it is a reply for ato after last received
	 * packet, enter pingpong mode.
	 */
	if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
		tp->ack.pingpong = 1;
}
static __inline__ void tcp_event_ack_sent(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	tcp_dec_quickack_mode(tp);
	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
}
/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered. Store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible. We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 */
void tcp_select_initial_window(int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale)
{
	unsigned int space = (__space < 0 ? 0 : __space);

	/* If no clamp set the clamp to the max possible scaled window */
	if (*window_clamp == 0)
		(*window_clamp) = (65535 << 14);
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = (space / mss) * mss;

	/* NOTE: offering an initial window larger than 32767
	 * will break some buggy TCP stacks. We try to be nice.
	 * If we are not window scaling, then this truncates
	 * our initial window offering to 32k. There should also
	 * be a sysctl option to stop being nice.
	 */
	(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	(*rcv_wscale) = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window
		 * See RFC1323 for an explanation of the limit to 14
		 */
		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
		while (space > 65535 && (*rcv_wscale) < 14) {
			space >>= 1;
			(*rcv_wscale)++;
		}
	}

	/* Set initial window to value enough for senders,
	 * following RFC2414. Senders, not following this RFC,
	 * will be satisfied with 2.
	 */
	if (mss > (1<<*rcv_wscale)) {
		int init_cwnd = 4;
		if (mss > 1460*3)
			init_cwnd = 2;
		else if (mss > 1460)
			init_cwnd = 3;
		if (*rcv_wnd > init_cwnd*mss)
			*rcv_wnd = init_cwnd*mss;
	}

	/* Set the clamp no higher than max representable value */
	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
}
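/*
 * Illustrative note (not from the original source): the scaling loop above
 * picks the smallest rcv_wscale that makes the receive space representable
 * in the 16-bit window field.  E.g. with 4 MB of receive space the loop runs
 * until (4 MB >> wscale) <= 65535, giving rcv_wscale = 7, and the advertised
 * window is then carried on the wire as window >> 7.
 */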
/* Choose a new window to advertise, update state in tcp_opt for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static __inline__ u16 tcp_select_window(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if (new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 *
		 * Relax Will Robinson.
		 */
		new_win = cur_win;
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible
	 * scaled window.
	 */
	if (!tp->rcv_wscale)
		new_win = min(new_win, MAX_TCP_WINDOW);
	else
		new_win = min(new_win, (65535U << tp->rcv_wscale));

	/* RFC1323 scaling applied */
	new_win >>= tp->rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0)
		tp->pred_flags = 0;

	return new_win;
}
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if (skb != NULL) {
		struct inet_opt *inet = inet_sk(sk);
		struct tcp_opt *tp = tcp_sk(sk);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
		int tcp_header_size = tp->tcp_header_len;
		struct tcphdr *th;
		int sysctl_flags;
		int err;

		BUG_ON(!tcp_skb_pcount(skb));

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

		sysctl_flags = 0;
		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
			if(sysctl_tcp_timestamps) {
				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
			}
			if(sysctl_tcp_window_scaling) {
				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_WSCALE;
			}
			if(sysctl_tcp_sack) {
				sysctl_flags |= SYSCTL_FLAG_SACK;
				if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
			}
		} else if (tp->eff_sacks) {
			/* A SACK is 2 pad bytes, a 2 byte header, plus
			 * 2 32-bit sequence numbers for each SACK block.
			 */
			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
					    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
		}

		/*
		 * If the connection is idle and we are restarting,
		 * then we don't want to do any Vegas calculations
		 * until we get fresh RTT samples.  So when we
		 * restart, we reset our Vegas state to a clean
		 * slate. After we get acks for this flight of
		 * packets, _then_ we can make Vegas calculations
		 * again.
		 */
		if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
			tcp_vegas_enable(tp);

		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
		skb->h.th = th;
		skb_set_owner_w(skb, sk);

		/* Build TCP header and checksum it. */
		th->source		= inet->sport;
		th->dest		= inet->dport;
		th->seq			= htonl(tcb->seq);
		th->ack_seq		= htonl(tp->rcv_nxt);
		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);
		if (tcb->flags & TCPCB_FLAG_SYN) {
			/* RFC1323: The window in SYN & SYN/ACK segments
			 * is never scaled.
			 */
			th->window	= htons(tp->rcv_wnd);
		} else {
			th->window	= htons(tcp_select_window(sk));
		}
		th->check		= 0;
		th->urg_ptr		= 0;

		if (tp->urg_mode &&
		    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
			th->urg_ptr		= htons(tp->snd_up-tcb->seq);
			th->urg			= 1;
		}

		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_syn_build_options((__u32 *)(th + 1),
					      tcp_advertise_mss(sk),
					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
					      (sysctl_flags & SYSCTL_FLAG_SACK),
					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
					      tp->rcv_wscale,
					      tcb->when,
					      tp->ts_recent);
		} else {
			tcp_build_and_update_options((__u32 *)(th + 1),
						     tp, tcb->when);

			TCP_ECN_send(sk, tp, skb, tcp_header_size);
		}
		tp->af_specific->send_check(sk, th, skb->len, skb);

		if (tcb->flags & TCPCB_FLAG_ACK)
			tcp_event_ack_sent(sk);

		if (skb->len != tcp_header_size)
			tcp_event_data_sent(tp, skb, sk);

		TCP_INC_STATS(TCP_MIB_OUTSEGS);

		err = tp->af_specific->queue_xmit(skb, 0);
		if (err <= 0)
			return err;

		tcp_enter_cwr(tp);

		/* NET_XMIT_CN is special. It does not guarantee,
		 * that this packet is lost. It tells that device
		 * is about to start to drop packets or already
		 * drops some packets of the same priority and
		 * invokes us to send less aggressively.
		 */
		return err == NET_XMIT_CN ? 0 : err;
	}
	return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}
/* This routine just queues the buffer for sending.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = tcp_sk(sk);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	__skb_queue_tail(&sk->sk_write_queue, skb);
	sk_charge_skb(sk, skb);

	/* Queue it, remembering where we must start sending. */
	if (sk->sk_send_head == NULL)
		sk->sk_send_head = skb;
}
/* Send _single_ skb sitting at the send head. This function requires
 * true push pending frames to setup probe timer etc.
 */
void tcp_push_one(struct sock *sk, unsigned cur_mss)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *skb = sk->sk_send_head;

	if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
			sk->sk_send_head = NULL;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tcp_packets_out_inc(sk, tp, skb);
			return;
		}
	}
}
void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std)
{
	if (skb->len <= mss_std) {
		/* Avoid the costly divide in the normal
		 * non-TSO case.
		 */
		skb_shinfo(skb)->tso_segs = 1;
		skb_shinfo(skb)->tso_size = 0;
	} else {
		unsigned int factor;

		factor = skb->len + (mss_std - 1);
		factor /= mss_std;
		skb_shinfo(skb)->tso_segs = factor;
		skb_shinfo(skb)->tso_size = mss_std;
	}
}
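/*
 * Illustrative note (not from the original source): the arithmetic above is
 * a ceiling division.  For a 4000 byte skb and mss_std = 1448,
 * factor = (4000 + 1447) / 1448 = 3, so the skb is accounted as three
 * MSS-sized packets for congestion control purposes.
 */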
/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int nsize = skb->len - len;
	u16 flags;

	if (skb_cloned(skb) &&
	    skb_is_nonlinear(skb) &&
	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	/* Get a new skb... force flag on. */
	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
	if (buff == NULL)
		return -ENOMEM; /* We'll just try again later. */
	sk_charge_skb(sk, buff);

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
	TCP_SKB_CB(buff)->flags = flags;
	TCP_SKB_CB(buff)->sacked =
		(TCP_SKB_CB(skb)->sacked &
		 (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
	TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;

	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
		/* Copy and checksum data tail into the new buffer. */
		buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
						       nsize, 0);

		skb_trim(skb, len);

		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
	} else {
		skb->ip_summed = CHECKSUM_HW;
		skb_split(skb, buff, len);
	}

	buff->ip_summed = skb->ip_summed;

	/* Looks stupid, but our code really uses when of
	 * skbs, which it never sent before. --ANK
	 */
	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
		tcp_dec_pcount(&tp->lost_out, skb);
		tcp_dec_pcount(&tp->left_out, skb);
	}

	/* Fix up tso_factor for both original and new SKB. */
	tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
	tcp_set_skb_tso_segs(buff, tp->mss_cache_std);

	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
		tcp_inc_pcount(&tp->lost_out, skb);
		tcp_inc_pcount(&tp->left_out, skb);
	}

	if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
		tcp_inc_pcount(&tp->lost_out, buff);
		tcp_inc_pcount(&tp->left_out, buff);
	}

	/* Link BUFF into the send queue. */
	__skb_append(skb, buff);

	return 0;
}
/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
 * eventually). The difference is that pulled data is not copied, but
 * immediately discarded.
 */
static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
{
	int i, k, eat;
	eat = len;
	k = 0;
	for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
		if (skb_shinfo(skb)->frags[i].size <= eat) {
			put_page(skb_shinfo(skb)->frags[i].page);
			eat -= skb_shinfo(skb)->frags[i].size;
		} else {
			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
			if (eat) {
				skb_shinfo(skb)->frags[k].page_offset += eat;
				skb_shinfo(skb)->frags[k].size -= eat;
				eat = 0;
			}
			k++;
		}
	}
	skb_shinfo(skb)->nr_frags = k;

	skb->tail = skb->data;
	skb->data_len -= len;
	skb->len = skb->data_len;
	return skb->tail;
}
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
	struct tcp_opt *tp = tcp_sk(sk);

	if (skb_cloned(skb) &&
	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	if (len <= skb_headlen(skb)) {
		__skb_pull(skb, len);
	} else {
		if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
			return -ENOMEM;
	}

	TCP_SKB_CB(skb)->seq += len;
	skb->ip_summed = CHECKSUM_HW;

	skb->truesize	     -= len;
	sk->sk_queue_shrunk   = 1;
	sk->sk_wmem_queued   -= len;
	sk->sk_forward_alloc += len;

	/* Any change of skb->len requires recalculation of tso
	 * factor and mss.
	 */
	tcp_set_skb_tso_segs(skb, tp->mss_cache_std);

	return 0;
}
/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT account
   for TCP options, but includes only bare TCP header.

   tp->mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.			--ANK (980731)
 */
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	int mss_now;

	if (dst && dst->ops->get_mss)
		pmtu = dst->ops->get_mss(dst, pmtu);

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122
	 */
	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->mss_clamp)
		mss_now = tp->mss_clamp;

	/* Now subtract optional transport overhead */
	mss_now -= tp->ext_header_len + tp->ext2_header_len;

	/* Then reserve room for full set of TCP options and 8 bytes of data */
	if (mss_now < 48)
		mss_now = 48;

	/* Now subtract TCP options size, not including SACKs */
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

	/* Bound mss with half of window */
	if (tp->max_window && mss_now > (tp->max_window>>1))
		mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);

	/* And store cached results */
	tp->pmtu_cookie = pmtu;
	tp->mss_cache = tp->mss_cache_std = mss_now;

	return mss_now;
}
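/*
 * Illustrative note (not from the original source), assuming a plain IPv4
 * path with a 1500 byte MTU and timestamps enabled: the base mss is
 * 1500 - 20 (IP) - 20 (TCP) = 1460; subtracting the 12 bytes of aligned
 * timestamp option (tcp_header_len - sizeof(struct tcphdr)) leaves
 * mss_cache = mss_cache_std = 1448 for each regular data segment.
 */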
/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 *
 * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
 * cannot be large. However, taking into account rare use of URG, this
 * is not a big flaw.
 */
unsigned int tcp_current_mss(struct sock *sk, int large)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	unsigned int do_large, mss_now;

	mss_now = tp->mss_cache_std;
	if (dst) {
		u32 mtu = dst_pmtu(dst);
		if (mtu != tp->pmtu_cookie ||
		    tp->ext2_header_len != dst->header_len)
			mss_now = tcp_sync_mss(sk, mtu);
	}

	do_large = (large &&
		    (sk->sk_route_caps & NETIF_F_TSO) &&
		    !tp->urg_mode);

	if (do_large) {
		unsigned int large_mss, factor, limit;

		large_mss = 65535 - tp->af_specific->net_header_len -
			tp->ext_header_len - tp->ext2_header_len -
			tp->tcp_header_len;

		if (tp->max_window && large_mss > (tp->max_window>>1))
			large_mss = max((tp->max_window>>1),
					68U - tp->tcp_header_len);

		factor = large_mss / mss_now;

		/* Always keep large mss multiple of real mss, but
		 * do not exceed 1/tso_win_divisor of the congestion window
		 * so we can keep the ACK clock ticking and minimize
		 * bursting.
		 */
		limit = tp->snd_cwnd;
		if (sysctl_tcp_tso_win_divisor)
			limit /= sysctl_tcp_tso_win_divisor;
		limit = max(1U, limit);
		if (factor > limit)
			factor = limit;

		tp->mss_cache = mss_now * factor;

		mss_now = tp->mss_cache;
	}

	if (tp->eff_sacks)
		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
			    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
	return mss_now;
}
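/*
 * Illustrative note (not from the original source): with mss_now = 1448,
 * snd_cwnd = 40 and the default tcp_tso_win_divisor of 8, the TSO factor is
 * capped at 40 / 8 = 5 segments, so tp->mss_cache becomes 5 * 1448 = 7240
 * bytes even though large_mss alone would allow a much bigger super-packet.
 */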
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
int tcp_write_xmit(struct sock *sk, int nonagle)
{
	struct tcp_opt *tp = tcp_sk(sk);
	unsigned int mss_now;

	/* If we are closed, the bytes will have to remain here.
	 * In time closedown will finish, we empty the write queue and all
	 * will be happy.
	 */
	if (sk->sk_state != TCP_CLOSE) {
		struct sk_buff *skb;
		int sent_pkts = 0;

		/* Account for SACKS, we may need to fragment due to this.
		 * It is just like the real MSS changing on us midstream.
		 * We also handle things correctly when the user adds some
		 * IP options mid-stream.  Silly to do, but cover it.
		 */
		mss_now = tcp_current_mss(sk, 1);

		while ((skb = sk->sk_send_head) &&
		       tcp_snd_test(tp, skb, mss_now,
				    tcp_skb_is_last(sk, skb) ? nonagle :
							       TCP_NAGLE_PUSH)) {
			if (skb->len > mss_now) {
				if (tcp_fragment(sk, skb, mss_now))
					break;
			}
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
				break;
			/* Advance the send_head.  This one is sent out.
			 * This call will increment packets_out.
			 */
			update_send_head(sk, tp, skb);

			tcp_minshall_update(tp, mss_now, skb);
			sent_pkts = 1;
		}

		if (sent_pkts) {
			tcp_cwnd_validate(sk, tp);
			return 0;
		}
		return !tcp_get_pcount(&tp->packets_out) && sk->sk_send_head;
	}
	return 0;
}
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria. The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *	If the free space is less than the 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	/* MSS for the peer's data.  Previous versions used mss_clamp
	 * here.  I don't know if the value based on our guesses
	 * of peer's MSS is better for the performance.  It's more correct
	 * but may be worse for the performance because of rcv_mss
	 * fluctuations.  --SAW  1998/11/1
	 */
	int mss = tp->ack.rcv_mss;
	int free_space = tcp_space(sk);
	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
	int window;

	if (mss > full_space)
		mss = full_space;

	if (free_space < full_space/2) {
		tp->ack.quick = 0;

		if (tcp_memory_pressure)
			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);

		if (free_space < mss)
			return 0;
	}

	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Don't do rounding if we are using window scaling, since the
	 * scaled window will not line up with the MSS boundary anyway.
	 */
	window = tp->rcv_wnd;
	if (tp->rcv_wscale) {
		window = free_space;

		/* Advertise enough space so that it won't get scaled away.
		 * Important case: prevent zero window announcement if
		 * 1<<rcv_wscale > mss.
		 */
		if (((window >> tp->rcv_wscale) << tp->rcv_wscale) != window)
			window = (((window >> tp->rcv_wscale) + 1)
				  << tp->rcv_wscale);
	} else {
		/* Get the largest window that is a nice multiple of mss.
		 * Window clamp already applied above.
		 * If our current window offering is within 1 mss of the
		 * free space we just keep it. This prevents the divide
		 * and multiply from happening most of the time.
		 * We also don't do any window rounding when the free space
		 * is too small.
		 */
		if (window <= free_space - mss || window > free_space)
			window = (free_space/mss)*mss;
	}

	return window;
}
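/*
 * Illustrative note (not from the original source): without window scaling,
 * the rounding above keeps the advertised window on an MSS boundary.  With
 * mss = 1448 and free_space = 10000, a stale window of 5000 lies outside the
 * [free_space - mss, free_space] band, so it is recomputed as
 * (10000 / 1448) * 1448 = 8688, i.e. six full segments.
 */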
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *next_skb = skb->next;

	/* The first test we must make is that neither of these two
	 * SKB's are still referenced by someone else.
	 */
	if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
		int skb_size = skb->len, next_skb_size = next_skb->len;
		u16 flags = TCP_SKB_CB(skb)->flags;

		/* Also punt if next skb has been SACK'd. */
		if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
			return;

		/* Next skb is out of window. */
		if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
			return;

		/* Punt if not enough space exists in the first SKB for
		 * the data in the second, or the total combined payload
		 * would exceed the MSS.
		 */
		if ((next_skb_size > skb_tailroom(skb)) ||
		    ((skb_size + next_skb_size) > mss_now))
			return;

		BUG_ON(tcp_skb_pcount(skb) != 1 ||
		       tcp_skb_pcount(next_skb) != 1);

		/* Ok.  We will be able to collapse the packet. */
		__skb_unlink(next_skb, next_skb->list);

		memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);

		if (next_skb->ip_summed == CHECKSUM_HW)
			skb->ip_summed = CHECKSUM_HW;

		if (skb->ip_summed != CHECKSUM_HW)
			skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);

		/* Update sequence range on original skb. */
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

		/* Merge over control information. */
		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
		TCP_SKB_CB(skb)->flags = flags;

		/* All done, get rid of second SKB and account for it so
		 * packet counting does not break.
		 */
		TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
			tcp_dec_pcount(&tp->retrans_out, next_skb);
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
			tcp_dec_pcount(&tp->lost_out, next_skb);
			tcp_dec_pcount(&tp->left_out, next_skb);
		}
		/* Reno case is special. Sigh... */
		if (!tp->sack_ok && tcp_get_pcount(&tp->sacked_out)) {
			tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
			tcp_dec_pcount(&tp->left_out, next_skb);
		}

		/* Not quite right: it can be > snd.fack, but
		 * it is better to underestimate fackets.
		 */
		tcp_dec_pcount_approx(&tp->fackets_out, next_skb);
		tcp_packets_out_dec(tp, next_skb);
		sk_stream_free_skb(sk, next_skb);
	}
}
/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer. This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int mss = tcp_current_mss(sk, 0);
	int lost = 0;

	sk_stream_for_retrans_queue(skb, sk) {
		if (skb->len > mss &&
		    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
			if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tcp_dec_pcount(&tp->retrans_out, skb);
			}
			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
				tcp_inc_pcount(&tp->lost_out, skb);
				lost = 1;
			}
		}
	}

	if (!lost)
		return;

	tcp_sync_left_out(tp);

	/* Don't muck with the congestion window here.
	 * Reason is that we do not increase amount of _data_
	 * in network, but units changed and effective
	 * cwnd/ssthresh really reduced now.
	 */
	if (tp->ca_state != TCP_CA_Loss) {
		tp->high_seq = tp->snd_nxt;
		tp->snd_ssthresh = tcp_current_ssthresh(tp);
		tp->prior_ssthresh = 0;
		tp->undo_marker = 0;
		tcp_set_ca_state(tp, TCP_CA_Loss);
	}
	tcp_xmit_retransmit_queue(sk);
}
/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = tcp_sk(sk);
	unsigned int cur_mss = tcp_current_mss(sk, 0);
	int err;

	/* Do not send more than we queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	if (atomic_read(&sk->sk_wmem_alloc) >
	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
		return -EAGAIN;

	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
			BUG();

		if (sk->sk_route_caps & NETIF_F_TSO) {
			sk->sk_route_caps &= ~NETIF_F_TSO;
			sk->sk_no_largesend = 1;
			tp->mss_cache = tp->mss_cache_std;
		}

		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
			return -ENOMEM;
	}

	/* If receiver has shrunk his window, and skb is out of
	 * new window, do not retransmit it. The exception is the
	 * case, when window is shrunk to zero. In this case
	 * our retransmit serves as a zero window probe.
	 */
	if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
	    && TCP_SKB_CB(skb)->seq != tp->snd_una)
		return -EAGAIN;

	if (skb->len > cur_mss) {
		int old_factor = tcp_skb_pcount(skb);
		int new_factor;

		if (tcp_fragment(sk, skb, cur_mss))
			return -ENOMEM; /* We'll try again later. */

		/* New SKB created, account for it. */
		new_factor = tcp_skb_pcount(skb);
		tcp_dec_pcount_explicit(&tp->packets_out,
					old_factor - new_factor);
		tcp_inc_pcount(&tp->packets_out, skb->next);
	}

	/* Collapse two adjacent packets if worthwhile and we can. */
	if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
	   (skb->len < (cur_mss >> 1)) &&
	   (skb->next != sk->sk_send_head) &&
	   (skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
	   (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
	   (sysctl_tcp_retrans_collapse != 0))
		tcp_retrans_try_collapse(sk, skb, cur_mss);

	if(tp->af_specific->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	/* Some Solaris stacks overoptimize and ignore the FIN on a
	 * retransmit when old data is attached.  So strip it off
	 * since it is cheap to do so and saves bytes on the network.
	 */
	if(skb->len > 0 &&
	   (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	   tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
		if (!pskb_trim(skb, 0)) {
			TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
			skb_shinfo(skb)->tso_segs = 1;
			skb_shinfo(skb)->tso_size = 0;
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
		}
	}

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;

	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
				    pskb_copy(skb, GFP_ATOMIC):
				    skb_clone(skb, GFP_ATOMIC)));

	if (err == 0) {
		/* Update global TCP statistics. */
		TCP_INC_STATS(TCP_MIB_RETRANSSEGS);

#if FASTRETRANS_DEBUG > 0
		if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
			if (net_ratelimit())
				printk(KERN_DEBUG "retrans_out leaked.\n");
		}
#endif
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tcp_inc_pcount(&tp->retrans_out, skb);

		/* Save stamp of the first retransmit. */
		if (!tp->retrans_stamp)
			tp->retrans_stamp = TCP_SKB_CB(skb)->when;

		tp->undo_retrans++;

		/* snd_nxt is stored to detect loss of retransmitted segment,
		 * see tcp_input.c tcp_sacktag_write_queue().
		 */
		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
	}
	return err;
}
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int packet_cnt = tcp_get_pcount(&tp->lost_out);

	/* First pass: retransmit lost packets. */
	if (packet_cnt) {
		sk_stream_for_retrans_queue(skb, sk) {
			__u8 sacked = TCP_SKB_CB(skb)->sacked;

			/* Assume this retransmit will generate
			 * only one packet for congestion window
			 * calculation purposes.  This works because
			 * tcp_retransmit_skb() will chop up the
			 * packet to be MSS sized and all the
			 * packet counting works out.
			 */
			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
				return;

			if (sacked&TCPCB_LOST) {
				if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
					if (tcp_retransmit_skb(sk, skb))
						return;
					if (tp->ca_state != TCP_CA_Loss)
						NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
					else
						NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);

					if (skb ==
					    skb_peek(&sk->sk_write_queue))
						tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
				}

				packet_cnt -= tcp_skb_pcount(skb);
				if (packet_cnt <= 0)
					break;
			}
		}
	}

	/* OK, demanded retransmission is finished. */

	/* Forward retransmissions are possible only during Recovery. */
	if (tp->ca_state != TCP_CA_Recovery)
		return;

	/* No forward retransmissions in Reno are possible. */
	if (!tp->sack_ok)
		return;

	/* Yeah, we have to make difficult choice between forward transmission
	 * and retransmission... Both ways have their merits...
	 *
	 * For now we do not retransmit anything, while we have some new
	 * segments to send.
	 */
	if (tcp_may_send_now(sk, tp))
		return;

	packet_cnt = 0;

	sk_stream_for_retrans_queue(skb, sk) {
		/* Similar to the retransmit loop above we
		 * can pretend that the retransmitted SKB
		 * we send out here will be composed of one
		 * real MSS sized packet because tcp_retransmit_skb()
		 * will fragment it if necessary.
		 */
		if (++packet_cnt > tcp_get_pcount(&tp->fackets_out))
			break;

		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
			break;

		if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
			continue;

		/* Ok, retransmit it. */
		if (tcp_retransmit_skb(sk, skb))
			break;

		if (skb == skb_peek(&sk->sk_write_queue))
			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);

		NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
	}
}
/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
	int mss_now;

	/* Optimization, tack on the FIN if we have a queue of
	 * unsent frames.  But be careful about outgoing SACKS
	 * and IP options.
	 */
	mss_now = tcp_current_mss(sk, 1);

	if (sk->sk_send_head != NULL) {
		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
		TCP_SKB_CB(skb)->end_seq++;
		tp->write_seq++;
	} else {
		/* Socket is locked, keep trying until memory is available. */
		for (;;) {
			skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
			if (skb)
				break;
			yield();
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(skb, MAX_TCP_HEADER);
		skb->csum = 0;
		TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
		TCP_SKB_CB(skb)->sacked = 0;
		skb_shinfo(skb)->tso_segs = 1;
		skb_shinfo(skb)->tso_size = 0;

		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
		TCP_SKB_CB(skb)->seq = tp->write_seq;
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
		tcp_queue_skb(sk, skb);
	}
	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF);
}
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk, int priority)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *skb;

	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_TCP_HEADER, priority);
	if (!skb) {
		NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
	TCP_SKB_CB(skb)->sacked = 0;
	skb_shinfo(skb)->tso_segs = 1;
	skb_shinfo(skb)->tso_size = 0;

	/* Send it off. */
	TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	if (tcp_transmit_skb(sk, skb))
		NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
}
/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
	struct sk_buff* skb;

	skb = skb_peek(&sk->sk_write_queue);
	if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
		return -EFAULT;
	}
	if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
		if (skb_cloned(skb)) {
			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
			if (nskb == NULL)
				return -ENOMEM;
			__skb_unlink(skb, &sk->sk_write_queue);
			__skb_queue_head(&sk->sk_write_queue, nskb);
			sk_stream_free_skb(sk, skb);
			sk_charge_skb(sk, nskb);
			skb = nskb;
		}

		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
		TCP_ECN_send_synack(tcp_sk(sk), skb);
	}
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
}
/*
 * Prepare a SYN-ACK.
 */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
				 struct open_request *req)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct tcphdr *th;
	int tcp_header_size;
	struct sk_buff *skb;

	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_TCP_HEADER);

	skb->dst = dst_clone(dst);

	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			   (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			   (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			   /* SACK_PERM is in the place of NOP NOP of TS */
			   ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
	skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	if (dst->dev->features&NETIF_F_TSO)
		req->ecn_ok = 0;
	TCP_ECN_make_synack(req, th);
	th->source = inet_sk(sk)->sport;
	th->dest = req->rmt_port;
	TCP_SKB_CB(skb)->seq = req->snt_isn;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
	TCP_SKB_CB(skb)->sacked = 0;
	skb_shinfo(skb)->tso_segs = 1;
	skb_shinfo(skb)->tso_size = 0;
	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	th->ack_seq = htonl(req->rcv_isn + 1);
	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;
		/* Set this up on the first call only */
		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
		/* tcp_full_space because it is guaranteed to be the first packet */
		tcp_select_initial_window(tcp_full_space(sk),
			dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
			&req->rcv_wnd,
			&req->window_clamp,
			req->wscale_ok,
			&rcv_wscale);
		req->rcv_wscale = rcv_wscale;
	}

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(req->rcv_wnd);

	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
			      req->sack_ok, req->wscale_ok, req->rcv_wscale,
			      TCP_SKB_CB(skb)->when,
			      req->ts_recent);

	skb->csum = 0;
	th->doff = (tcp_header_size >> 2);
	TCP_INC_STATS(TCP_MIB_OUTSEGS);
	return skb;
}
/*
 * Do all connect socket setups that can be done AF independent.
 */
static inline void tcp_connect_init(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	__u8 rcv_wscale;

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr) +
		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->user_mss)
		tp->mss_clamp = tp->user_mss;
	tp->max_window = 0;
	tcp_sync_mss(sk, dst_pmtu(dst));

	if (!tp->window_clamp)
		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(sk);

	tcp_select_initial_window(tcp_full_space(sk),
				  tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sysctl_tcp_window_scaling,
				  &rcv_wscale);

	tp->rcv_ssthresh = tp->rcv_wnd;
	tp->rcv_wscale = rcv_wscale;

	sk->sk_err = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->snd_wnd = 0;
	tcp_init_wl(tp, tp->write_seq, 0);
	tp->snd_una = tp->write_seq;
	tp->snd_sml = tp->write_seq;
	tp->rcv_nxt = 0;
	tp->rcv_wup = 0;
	tp->copied_seq = 0;

	tp->rto = TCP_TIMEOUT_INIT;
	tp->retransmits = 0;
	tcp_clear_retrans(tp);
}
/*
 * Build a SYN and send it off.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *buff;

	tcp_connect_init(sk);

	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
	if (unlikely(buff == NULL))
		return -ENOBUFS;

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_TCP_HEADER);

	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
	TCP_ECN_send_syn(sk, tp, buff);
	TCP_SKB_CB(buff)->sacked = 0;
	skb_shinfo(buff)->tso_segs = 1;
	skb_shinfo(buff)->tso_size = 0;
	buff->csum = 0;
	TCP_SKB_CB(buff)->seq = tp->write_seq++;
	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;

	/* Send it off. */
	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
	__skb_queue_tail(&sk->sk_write_queue, buff);
	sk_charge_skb(sk, buff);
	tcp_inc_pcount(&tp->packets_out, buff);
	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
	TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
	return 0;
}
/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	int ato = tp->ack.ato;
	unsigned long timeout;

	if (ato > TCP_DELACK_MIN) {
		int max_ato = HZ/2;

		if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
			max_ato = TCP_DELACK_MAX;

		/* Slow path, intersegment interval is "high". */

		/* If some rtt estimate is known, use it to bound delayed ack.
		 * Do not use tp->rto here, use results of rtt measurements
		 * directly.
		 */
		if (tp->srtt) {
			int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);

			if (rtt < max_ato)
				max_ato = rtt;
		}

		ato = min(ato, max_ato);
	}

	/* Stay within the limit we were given */
	timeout = jiffies + ato;

	/* Use new timeout only if there wasn't a older one earlier. */
	if (tp->ack.pending&TCP_ACK_TIMER) {
		/* If delack timer was blocked or is about to expire,
		 * send ACK now.
		 */
		if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
			tcp_send_ack(sk);
			return;
		}

		if (!time_before(timeout, tp->ack.timeout))
			timeout = tp->ack.timeout;
	}
	tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
	tp->ack.timeout = timeout;
	sk_reset_timer(sk, &tp->delack_timer, timeout);
}
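/*
 * Illustrative note (not from the original source): in pingpong mode with a
 * smoothed RTT of 40 ms (tp->srtt >> 3), max_ato is first lowered to
 * TCP_DELACK_MAX and then bounded by the measured RTT, so an ato of 200 ms
 * is clamped to 40 ms and the delayed ACK timer is armed for roughly
 * jiffies + 40 ms unless an earlier timeout is already pending.
 */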
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
	/* If we have been reset, we may not send again. */
	if (sk->sk_state != TCP_CLOSE) {
		struct tcp_opt *tp = tcp_sk(sk);
		struct sk_buff *buff;

		/* We are not putting this on the write queue, so
		 * tcp_transmit_skb() will set the ownership to this
		 * sock.
		 */
		buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
		if (buff == NULL) {
			tcp_schedule_ack(tp);
			tp->ack.ato = TCP_ATO_MIN;
			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
			return;
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(buff, MAX_TCP_HEADER);
		buff->csum = 0;
		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
		TCP_SKB_CB(buff)->sacked = 0;
		skb_shinfo(buff)->tso_segs = 1;
		skb_shinfo(buff)->tso_size = 0;

		/* Send it off, this clears delayed acks for us. */
		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
		TCP_SKB_CB(buff)->when = tcp_time_stamp;
		tcp_transmit_skb(sk, buff);
	}
}
/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 *
 * Question: what should we send while in urgent mode?
 * 4.4BSD forces sending single byte of data. We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: to send TWO zero-length segments in urgent mode:
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
 * out-of-date with SND.UNA-1 to probe window.
 */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *skb;

	/* We don't queue it, tcp_transmit_skb() sets ownership. */
	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	if (skb == NULL)
		return -1;

	/* Reserve space for headers and set control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
	TCP_SKB_CB(skb)->sacked = urgent;
	skb_shinfo(skb)->tso_segs = 1;
	skb_shinfo(skb)->tso_size = 0;

	/* Use a previous sequence.  This should cause the other
	 * end to send an ack.  Don't queue or clone SKB, just
	 * send it.
	 */
	TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb);
}
int tcp_write_wakeup(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		struct tcp_opt *tp = tcp_sk(sk);
		struct sk_buff *skb;

		if ((skb = sk->sk_send_head) != NULL &&
		    before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
			int err;
			unsigned int mss = tcp_current_mss(sk, 0);
			unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;

			if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
				tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

			/* We are probing the opening of a window
			 * but the window size is != 0
			 * must have been a result SWS avoidance ( sender )
			 */
			if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
			    skb->len > mss) {
				seg_size = min(seg_size, mss);
				TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
				if (tcp_fragment(sk, skb, seg_size))
					return -1;
				/* SWS override triggered forced fragmentation.
				 * Disable TSO, the connection is too sick. */
				if (sk->sk_route_caps & NETIF_F_TSO) {
					sk->sk_no_largesend = 1;
					sk->sk_route_caps &= ~NETIF_F_TSO;
					tp->mss_cache = tp->mss_cache_std;
				}
			} else if (!tcp_skb_pcount(skb))
				tcp_set_skb_tso_segs(skb, tp->mss_cache_std);

			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			if (!err) {
				update_send_head(sk, tp, skb);
			}
			return err;
		} else {
			if (tp->urg_mode &&
			    between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
				tcp_xmit_probe_skb(sk, TCPCB_URG);
			return tcp_xmit_probe_skb(sk, 0);
		}
	}
	return -1;
}
/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	int err;

	err = tcp_write_wakeup(sk);

	if (tcp_get_pcount(&tp->packets_out) || !sk->sk_send_head) {
		/* Cancel probe timer, if it is not required. */
		tp->probes_out = 0;
		tp->backoff = 0;
		return;
	}

	if (err <= 0) {
		if (tp->backoff < sysctl_tcp_retries2)
			tp->backoff++;
		tp->probes_out++;
		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
				      min(tp->rto << tp->backoff, TCP_RTO_MAX));
	} else {
		/* If packet was not sent due to local congestion,
		 * do not backoff and do not remember probes_out.
		 * Let local senders fight for local resources.
		 *
		 * Still use the accumulated backoff, though.
		 */
		if (!tp->probes_out)
			tp->probes_out=1;
		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
				      min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
	}
}
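/*
 * Illustrative note (not from the original source): each failed probe
 * doubles the probe interval.  With an RTO of 200 ms and backoff = 3, the
 * next zero-window probe is scheduled after min(200 ms << 3, TCP_RTO_MAX),
 * i.e. 1.6 seconds, and the interval keeps growing until the cap is reached.
 */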
EXPORT_SYMBOL(tcp_acceptable_seq);
EXPORT_SYMBOL(tcp_connect);
EXPORT_SYMBOL(tcp_connect_init);
EXPORT_SYMBOL(tcp_make_synack);
EXPORT_SYMBOL(tcp_send_synack);
EXPORT_SYMBOL(tcp_simple_retransmit);
EXPORT_SYMBOL(tcp_sync_mss);
EXPORT_SYMBOL(tcp_write_wakeup);
EXPORT_SYMBOL(tcp_write_xmit);