2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
24 * Changes: Pedro Roque : Retransmit queue handled by TCP.
25 * : Fragmentation on mtu decrease
26 * : Segment collapse on retransmit
29 * Linus Torvalds : send_delayed_ack
30 * David S. Miller : Charge memory using the right skb
31 * during syn/ack processing.
32 * David S. Miller : Output engine completely rewritten.
33 * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
34 * Cacophonix Gaul : draft-minshall-nagle-01
35 * J Hadi Salim : ECN support
41 #include <linux/compiler.h>
42 #include <linux/module.h>
43 #include <linux/smp_lock.h>
45 /* People can turn this off for buggy TCPs found in printers etc. */
46 int sysctl_tcp_retrans_collapse = 1;
48 /* This limits the percentage of the congestion window which we
49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty.
52 int sysctl_tcp_tso_win_divisor = 8;
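/* Editorial illustration, not part of the original source: a minimal
 * sketch of how the divisor above bounds a single TSO frame (the real
 * check lives in tcp_current_mss() further down). With the default
 * divisor of 8 and snd_cwnd = 40 packets, one TSO frame may carry at
 * most 40 / 8 = 5 MSS-sized segments; a divisor of 0 disables the bound.
 * The helper name is hypothetical.
 */
static inline unsigned int tcp_tso_win_limit_sketch(unsigned int snd_cwnd)
{
	unsigned int limit = snd_cwnd;

	if (sysctl_tcp_tso_win_divisor)
		limit /= sysctl_tcp_tso_win_divisor;
	/* Never allow the limit to fall to zero segments. */
	return limit ? limit : 1U;
}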
55 void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
57 sk->sk_send_head = skb->next;
58 if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
59 sk->sk_send_head = NULL;
60 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
61 tcp_packets_out_inc(sk, tp, skb);
64 /* SND.NXT, if window was not shrunk.
65 * If the window has been shrunk, what should we do? It is not clear at all.
66 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
67 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
68 * invalid. OK, let's make this for now:
70 static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
72 if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
75 return tp->snd_una+tp->snd_wnd;
78 /* Calculate mss to advertise in SYN segment.
79 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
81 * 1. It is independent of path mtu.
82 * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
83 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
84 * attached devices, because some buggy hosts are confused by
86 * 4. We do not do 3; we advertise the MSS calculated from the first
87 * hop device mtu, but allow it to be raised to ip_rt_min_advmss.
88 * This may be overridden via information stored in routing table.
89 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
90 * probably even Jumbo".
92 static __u16 tcp_advertise_mss(struct sock *sk)
94 struct tcp_opt *tp = tcp_sk(sk);
95 struct dst_entry *dst = __sk_dst_get(sk);
98 if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
99 mss = dst_metric(dst, RTAX_ADVMSS);
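/* Editorial example, not in the original source: on a plain Ethernet
 * path the route metric RTAX_ADVMSS is typically 1500 - 40 = 1460, so
 * the SYN built from this value advertises MSS 1460; when no smaller
 * route metric exists, the precomputed tp->advmss is advertised as is.
 */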
106 /* RFC2861. Reset CWND after an idle period longer than RTO to the "restart window".
107 * This is the first part of cwnd validation mechanism. */
108 static void tcp_cwnd_restart(struct tcp_opt *tp, struct dst_entry *dst)
110 s32 delta = tcp_time_stamp - tp->lsndtime;
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd;
114 if (tcp_is_vegas(tp))
115 tcp_vegas_enable(tp);
117 tp->snd_ssthresh = tcp_current_ssthresh(tp);
118 restart_cwnd = min(restart_cwnd, cwnd);
120 while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
122 tp->snd_cwnd = max(cwnd, restart_cwnd);
123 tp->snd_cwnd_stamp = tcp_time_stamp;
124 tp->snd_cwnd_used = 0;
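/* Editorial worked example, not in the original source: per RFC 2861 the
 * congestion window is halved for every RTO the connection stayed idle,
 * but never below the restart window. With snd_cwnd = 32, a restart
 * window of 4 and a bit more than three RTOs of idle time this gives
 * 32 -> 16 -> 8 -> 4, and snd_cwnd restarts at max(4, 4) = 4.
 */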
127 static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb, struct sock *sk)
129 u32 now = tcp_time_stamp;
131 if (!tcp_get_pcount(&tp->packets_out) &&
132 (s32)(now - tp->lsndtime) > tp->rto)
133 tcp_cwnd_restart(tp, __sk_dst_get(sk));
137 /* If it is a reply sent within ato of the last received
138 * packet, enter pingpong mode.
140 if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
141 tp->ack.pingpong = 1;
144 static __inline__ void tcp_event_ack_sent(struct sock *sk)
146 struct tcp_opt *tp = tcp_sk(sk);
148 tcp_dec_quickack_mode(tp);
149 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
152 /* Determine a window scaling and initial window to offer.
153 * Based on the assumption that the given amount of space
154 * will be offered. Store the results in the tp structure.
155 * NOTE: for smooth operation initial space offering should
156 * be a multiple of mss if possible. We assume here that mss >= 1.
157 * This MUST be enforced by all callers.
159 void tcp_select_initial_window(int __space, __u32 mss,
160 __u32 *rcv_wnd, __u32 *window_clamp,
161 int wscale_ok, __u8 *rcv_wscale)
163 unsigned int space = (__space < 0 ? 0 : __space);
165 /* If no clamp set the clamp to the max possible scaled window */
166 if (*window_clamp == 0)
167 (*window_clamp) = (65535 << 14);
168 space = min(*window_clamp, space);
170 /* Quantize space offering to a multiple of mss if possible. */
172 space = (space / mss) * mss;
174 /* NOTE: offering an initial window larger than 32767
175 * will break some buggy TCP stacks. We try to be nice.
176 * If we are not window scaling, then this truncates
177 * our initial window offering to 32k. There should also
178 * be a sysctl option to stop being nice.
180 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
183 /* Set window scaling on max possible window
184 * See RFC1323 for an explanation of the limit to 14
186 space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
187 while (space > 65535 && (*rcv_wscale) < 14) {
193 /* Set the initial window to a value large enough for senders
194 * following RFC2414. Senders not following this RFC
195 * will be satisfied with 2.
197 if (mss > (1<<*rcv_wscale)) {
203 if (*rcv_wnd > init_cwnd*mss)
204 *rcv_wnd = init_cwnd*mss;
207 /* Set the clamp no higher than max representable value */
208 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
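/* Editorial worked example, not in the original source: if the larger of
 * sysctl_tcp_rmem[2] and sysctl_rmem_max is 256 KB, the scaling loop
 * above needs three right shifts before the value fits in 16 bits
 * (262144 -> 131072 -> 65536 -> 32768), so rcv_wscale becomes 3 and the
 * clamp is limited to 65535 << 3.
 */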
211 /* Choose a new window to advertise, update state in tcp_opt for the
212 * socket, and return result with RFC1323 scaling applied. The return
213 * value can be stuffed directly into th->window for an outgoing frame.
216 static __inline__ u16 tcp_select_window(struct sock *sk)
218 struct tcp_opt *tp = tcp_sk(sk);
219 u32 cur_win = tcp_receive_window(tp);
220 u32 new_win = __tcp_select_window(sk);
222 /* Never shrink the offered window */
223 if(new_win < cur_win) {
224 /* Danger Will Robinson!
225 * Don't update rcv_wup/rcv_wnd here or else
226 * we will not be able to advertise a zero
227 * window in time. --DaveM
229 * Relax Will Robinson.
233 tp->rcv_wnd = new_win;
234 tp->rcv_wup = tp->rcv_nxt;
236 /* Make sure we do not exceed the maximum possible
240 new_win = min(new_win, MAX_TCP_WINDOW);
242 new_win = min(new_win, (65535U << tp->rcv_wscale));
244 /* RFC1323 scaling applied */
245 new_win >>= tp->rcv_wscale;
247 /* If we advertise zero window, disable fast path. */
255 /* This routine actually transmits TCP packets queued in by
256 * tcp_do_sendmsg(). This is used by both the initial
257 * transmission and possible later retransmissions.
258 * All SKB's seen here are completely headerless. It is our
259 * job to build the TCP header, and pass the packet down to
260 * IP so it can do the same plus pass the packet off to the
263 * We are working here with either a clone of the original
264 * SKB, or a fresh unique copy made by the retransmit engine.
266 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
269 struct inet_opt *inet = inet_sk(sk);
270 struct tcp_opt *tp = tcp_sk(sk);
271 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
272 int tcp_header_size = tp->tcp_header_len;
277 BUG_ON(!tcp_skb_pcount(skb));
279 #define SYSCTL_FLAG_TSTAMPS 0x1
280 #define SYSCTL_FLAG_WSCALE 0x2
281 #define SYSCTL_FLAG_SACK 0x4
284 if (tcb->flags & TCPCB_FLAG_SYN) {
285 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
286 if(sysctl_tcp_timestamps) {
287 tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
288 sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
290 if(sysctl_tcp_window_scaling) {
291 tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
292 sysctl_flags |= SYSCTL_FLAG_WSCALE;
294 if(sysctl_tcp_sack) {
295 sysctl_flags |= SYSCTL_FLAG_SACK;
296 if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
297 tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
299 } else if (tp->eff_sacks) {
300 /* A SACK is 2 pad bytes, a 2 byte header, plus
301 * 2 32-bit sequence numbers for each SACK block.
303 tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
304 (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
308 * If the connection is idle and we are restarting,
309 * then we don't want to do any Vegas calculations
310 * until we get fresh RTT samples. So when we
311 * restart, we reset our Vegas state to a clean
312 * slate. After we get acks for this flight of
313 * packets, _then_ we can make Vegas calculations
316 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
317 tcp_vegas_enable(tp);
319 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
321 skb_set_owner_w(skb, sk);
323 /* Build TCP header and checksum it. */
324 th->source = inet->sport;
325 th->dest = inet->dport;
326 th->seq = htonl(tcb->seq);
327 th->ack_seq = htonl(tp->rcv_nxt);
328 *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
329 if (tcb->flags & TCPCB_FLAG_SYN) {
330 /* RFC1323: The window in SYN & SYN/ACK segments
333 th->window = htons(tp->rcv_wnd);
335 th->window = htons(tcp_select_window(sk));
341 between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
342 th->urg_ptr = htons(tp->snd_up-tcb->seq);
346 if (tcb->flags & TCPCB_FLAG_SYN) {
347 tcp_syn_build_options((__u32 *)(th + 1),
348 tcp_advertise_mss(sk),
349 (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
350 (sysctl_flags & SYSCTL_FLAG_SACK),
351 (sysctl_flags & SYSCTL_FLAG_WSCALE),
356 tcp_build_and_update_options((__u32 *)(th + 1),
359 TCP_ECN_send(sk, tp, skb, tcp_header_size);
361 tp->af_specific->send_check(sk, th, skb->len, skb);
363 if (tcb->flags & TCPCB_FLAG_ACK)
364 tcp_event_ack_sent(sk);
366 if (skb->len != tcp_header_size)
367 tcp_event_data_sent(tp, skb, sk);
369 TCP_INC_STATS(TCP_MIB_OUTSEGS);
371 err = tp->af_specific->queue_xmit(skb, 0);
377 /* NET_XMIT_CN is special. It does not guarantee,
378 * that this packet is lost. It tells that device
379 * is about to start to drop packets or already
380 * drops some packets of the same priority and
381 * invokes us to send less aggressively.
383 return err == NET_XMIT_CN ? 0 : err;
386 #undef SYSCTL_FLAG_TSTAMPS
387 #undef SYSCTL_FLAG_WSCALE
388 #undef SYSCTL_FLAG_SACK
392 /* This routine just queues the buffer.
394 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
395 * otherwise the socket can stall.
397 static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
399 struct tcp_opt *tp = tcp_sk(sk);
401 /* Advance write_seq and place onto the write_queue. */
402 tp->write_seq = TCP_SKB_CB(skb)->end_seq;
403 __skb_queue_tail(&sk->sk_write_queue, skb);
404 sk_charge_skb(sk, skb);
406 /* Queue it, remembering where we must start sending. */
407 if (sk->sk_send_head == NULL)
408 sk->sk_send_head = skb;
411 /* Send _single_ skb sitting at the send head. This function requires
412 * true push pending frames to setup probe timer etc.
414 void tcp_push_one(struct sock *sk, unsigned cur_mss)
416 struct tcp_opt *tp = tcp_sk(sk);
417 struct sk_buff *skb = sk->sk_send_head;
419 if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) {
420 /* Send it out now. */
421 TCP_SKB_CB(skb)->when = tcp_time_stamp;
422 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
423 sk->sk_send_head = NULL;
424 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
425 tcp_packets_out_inc(sk, tp, skb);
431 void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std)
433 if (skb->len <= mss_std) {
434 /* Avoid the costly divide in the normal
437 skb_shinfo(skb)->tso_segs = 1;
438 skb_shinfo(skb)->tso_size = 0;
442 factor = skb->len + (mss_std - 1);
444 skb_shinfo(skb)->tso_segs = factor;
445 skb_shinfo(skb)->tso_size = mss_std;
449 EXPORT_SYMBOL_GPL(tcp_set_skb_tso_segs);
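/* Editorial example, not in the original source: the factor computed
 * above amounts to a round-up division of skb->len by mss_std. A 4000
 * byte skb with mss_std = 1460 is therefore accounted as
 * (4000 + 1459) / 1460 = 3 TSO segments, with tso_size set to 1460.
 */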
451 /* Function to create two new TCP segments. Shrinks the given segment
452 * to the specified size and appends a new segment with the rest of the
453 * packet to the list. This won't be called frequently, I hope.
454 * Remember, these are still headerless SKBs at this point.
456 static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
458 struct tcp_opt *tp = tcp_sk(sk);
459 struct sk_buff *buff;
463 nsize = skb_headlen(skb) - len;
467 if (skb_cloned(skb) &&
468 skb_is_nonlinear(skb) &&
469 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
472 /* Get a new skb... force flag on. */
473 buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
475 return -ENOMEM; /* We'll just try again later. */
476 sk_charge_skb(sk, buff);
478 /* Correct the sequence numbers. */
479 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
480 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
481 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
483 /* PSH and FIN should only be set in the second packet. */
484 flags = TCP_SKB_CB(skb)->flags;
485 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
486 TCP_SKB_CB(buff)->flags = flags;
487 TCP_SKB_CB(buff)->sacked =
488 (TCP_SKB_CB(skb)->sacked &
489 (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
490 TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
492 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
493 /* Copy and checksum data tail into the new buffer. */
494 buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
499 skb->csum = csum_block_sub(skb->csum, buff->csum, len);
501 skb->ip_summed = CHECKSUM_HW;
502 skb_split(skb, buff, len);
505 buff->ip_summed = skb->ip_summed;
507 /* Looks odd, but our code really uses the "when" field of
508 * skbs which it has never sent before. --ANK
510 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
512 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
513 tcp_dec_pcount(&tp->lost_out, skb);
514 tcp_dec_pcount(&tp->left_out, skb);
517 /* Fix up tso_factor for both original and new SKB. */
518 tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
519 tcp_set_skb_tso_segs(buff, tp->mss_cache_std);
521 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
522 tcp_inc_pcount(&tp->lost_out, skb);
523 tcp_inc_pcount(&tp->left_out, skb);
526 if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
527 tcp_inc_pcount(&tp->lost_out, buff);
528 tcp_inc_pcount(&tp->left_out, buff);
531 /* Link BUFF into the send queue. */
532 __skb_append(skb, buff);
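/* Editorial worked example, not in the original source: fragmenting an
 * skb that covers sequence range 1000..4000 at len = 1460 leaves the
 * original skb covering 1000..2460 and the new buff covering 2460..4000,
 * with PSH and FIN carried only by buff, as set up above.
 */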
537 /* This is similar to __pskb_pull_tail() (it will go to core/skbuff.c
538 * eventually). The difference is that the pulled data is not copied, but
539 * immediately discarded.
541 static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
547 for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
548 if (skb_shinfo(skb)->frags[i].size <= eat) {
549 put_page(skb_shinfo(skb)->frags[i].page);
550 eat -= skb_shinfo(skb)->frags[i].size;
552 skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
554 skb_shinfo(skb)->frags[k].page_offset += eat;
555 skb_shinfo(skb)->frags[k].size -= eat;
561 skb_shinfo(skb)->nr_frags = k;
563 skb->tail = skb->data;
564 skb->data_len -= len;
565 skb->len = skb->data_len;
569 int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
571 if (skb_cloned(skb) &&
572 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
575 if (len <= skb_headlen(skb)) {
576 __skb_pull(skb, len);
578 if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
582 TCP_SKB_CB(skb)->seq += len;
583 skb->ip_summed = CHECKSUM_HW;
585 skb->truesize -= len;
586 sk->sk_queue_shrunk = 1;
587 sk->sk_wmem_queued -= len;
588 sk->sk_forward_alloc += len;
590 /* Any change of skb->len requires recalculation of tso
593 if (tcp_skb_pcount(skb) > 1)
594 tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
599 /* This function synchronizes snd mss to the current pmtu/exthdr set.
601 tp->user_mss is the mss set by the user via TCP_MAXSEG. It does NOT account
602 for TCP options, but includes only the bare TCP header.
604 tp->mss_clamp is mss negotiated at connection setup.
605 It is the minimum of user_mss and the mss received with the SYN.
606 It also does not include TCP options.
608 tp->pmtu_cookie is last pmtu, seen by this function.
610 tp->mss_cache is current effective sending mss, including
611 all tcp options except for SACKs. It is evaluated,
612 taking into account current pmtu, but never exceeds
615 NOTE1. rfc1122 clearly states that advertised MSS
616 DOES NOT include either tcp or ip options.
618 NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
619 this function. --ANK (980731)
622 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
624 struct tcp_opt *tp = tcp_sk(sk);
625 struct dst_entry *dst = __sk_dst_get(sk);
628 if (dst && dst->ops->get_mss)
629 pmtu = dst->ops->get_mss(dst, pmtu);
631 /* Calculate base mss without TCP options:
632 It is MMS_S - sizeof(tcphdr) of rfc1122
634 mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
636 /* Clamp it (mss_clamp does not include tcp options) */
637 if (mss_now > tp->mss_clamp)
638 mss_now = tp->mss_clamp;
640 /* Now subtract optional transport overhead */
641 mss_now -= tp->ext_header_len + tp->ext2_header_len;
643 /* Then reserve room for full set of TCP options and 8 bytes of data */
647 /* Now subtract TCP options size, not including SACKs */
648 mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
650 /* Bound mss with half of window */
651 if (tp->max_window && mss_now > (tp->max_window>>1))
652 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
654 /* And store cached results */
655 tp->pmtu_cookie = pmtu;
656 tp->mss_cache = tp->mss_cache_std = mss_now;
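/* Editorial worked example, not in the original source: for an IPv4 path
 * with pmtu = 1500 and no extension headers, mss_now starts at
 * 1500 - 20 - 20 = 1460; with timestamps in use tcp_header_len is
 * sizeof(struct tcphdr) + 12, so the cached sending mss ends up as 1448.
 */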
661 /* Compute the current effective MSS, taking SACKs and IP options,
662 * and even PMTU discovery events into account.
664 * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
665 * cannot be large. However, taking into account rare use of URG, this
669 unsigned int tcp_current_mss(struct sock *sk, int large)
671 struct tcp_opt *tp = tcp_sk(sk);
672 struct dst_entry *dst = __sk_dst_get(sk);
673 unsigned int do_large, mss_now;
675 mss_now = tp->mss_cache_std;
677 u32 mtu = dst_pmtu(dst);
678 if (mtu != tp->pmtu_cookie ||
679 tp->ext2_header_len != dst->header_len)
680 mss_now = tcp_sync_mss(sk, mtu);
684 (sk->sk_route_caps & NETIF_F_TSO) &&
688 unsigned int large_mss, factor, limit;
690 large_mss = 65535 - tp->af_specific->net_header_len -
691 tp->ext_header_len - tp->ext2_header_len -
694 if (tp->max_window && large_mss > (tp->max_window>>1))
695 large_mss = max((tp->max_window>>1),
696 68U - tp->tcp_header_len);
698 factor = large_mss / mss_now;
700 /* Always keep large mss multiple of real mss, but
701 * do not exceed 1/tso_win_divisor of the congestion window
702 * so we can keep the ACK clock ticking and minimize
705 limit = tp->snd_cwnd;
706 if (sysctl_tcp_tso_win_divisor)
707 limit /= sysctl_tcp_tso_win_divisor;
708 limit = max(1U, limit);
712 tp->mss_cache = mss_now * factor;
714 mss_now = tp->mss_cache;
718 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
719 (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
722 EXPORT_SYMBOL_GPL(tcp_current_mss);
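/* Editorial example, not in the original source (option sizes assume the
 * usual TCPOLEN_SACK_BASE_ALIGNED = 4 and TCPOLEN_SACK_PERBLOCK = 8):
 * with two SACK blocks outstanding, eff_sacks = 2 costs 4 + 2*8 = 20
 * option bytes, so an mss_now of 1448 shrinks to 1428 for the next
 * segment sent.
 */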
724 /* This routine writes packets to the network. It advances the
725 * send_head. This happens as incoming acks open up the remote
728 * Returns 1, if no segments are in flight and we have queued segments, but
729 * cannot send anything now because of SWS or another problem.
731 int tcp_write_xmit(struct sock *sk, int nonagle)
733 struct tcp_opt *tp = tcp_sk(sk);
734 unsigned int mss_now;
736 /* If we are closed, the bytes will have to remain here.
737 * In time closedown will finish, we empty the write queue and all
740 if (sk->sk_state != TCP_CLOSE) {
744 /* Account for SACKs; we may need to fragment due to this.
745 * It is just like the real MSS changing on us midstream.
746 * We also handle things correctly when the user adds some
747 * IP options mid-stream. Silly to do, but cover it.
749 mss_now = tcp_current_mss(sk, 1);
751 while ((skb = sk->sk_send_head) &&
752 tcp_snd_test(tp, skb, mss_now,
753 tcp_skb_is_last(sk, skb) ? nonagle :
755 if (skb->len > mss_now) {
756 if (tcp_fragment(sk, skb, mss_now))
760 TCP_SKB_CB(skb)->when = tcp_time_stamp;
761 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
764 /* Advance the send_head. This one is sent out.
765 * This call will increment packets_out.
767 update_send_head(sk, tp, skb);
769 tcp_minshall_update(tp, mss_now, skb);
774 tcp_cwnd_validate(sk, tp);
778 return !tcp_get_pcount(&tp->packets_out) && sk->sk_send_head;
783 EXPORT_SYMBOL_GPL(tcp_write_xmit);
785 /* This function returns the amount that we can raise the
786 * usable window based on the following constraints
788 * 1. The window can never be shrunk once it is offered (RFC 793)
789 * 2. We limit memory per socket
792 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
793 * RECV.NEXT + RCV.WIN fixed until:
794 * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
796 * i.e. don't raise the right edge of the window until you can raise
797 * it at least MSS bytes.
799 * Unfortunately, the recommended algorithm breaks header prediction,
800 * since header prediction assumes th->window stays fixed.
802 * Strictly speaking, keeping th->window fixed violates the receiver
803 * side SWS prevention criteria. The problem is that under this rule
804 * a stream of single byte packets will cause the right side of the
805 * window to always advance by a single byte.
807 * Of course, if the sender implements sender side SWS prevention
808 * then this will not be a problem.
810 * BSD seems to make the following compromise:
812 * If the free space is less than the 1/4 of the maximum
813 * space available and the free space is less than 1/2 mss,
814 * then set the window to 0.
815 * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
816 * Otherwise, just prevent the window from shrinking
817 * and from being larger than the largest representable value.
819 * This prevents incremental opening of the window in the regime
820 * where TCP is limited by the speed of the reader side taking
821 * data out of the TCP receive queue. It does nothing about
822 * those cases where the window is constrained on the sender side
823 * because the pipeline is full.
825 * BSD also seems to "accidentally" limit itself to windows that are a
826 * multiple of MSS, at least until the free space gets quite small.
827 * This would appear to be a side effect of the mbuf implementation.
828 * Combining these two algorithms results in the observed behavior
829 * of having a fixed window size at almost all times.
831 * Below we obtain similar behavior by forcing the offered window to
832 * a multiple of the mss when it is feasible to do so.
834 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
835 * Regular options like TIMESTAMP are taken into account.
837 u32 __tcp_select_window(struct sock *sk)
839 struct tcp_opt *tp = tcp_sk(sk);
840 /* MSS for the peer's data. Previous versions used mss_clamp
841 * here. I don't know if the value based on our guesses
842 * of peer's MSS is better for the performance. It's more correct
843 * but may be worse for the performance because of rcv_mss
844 * fluctuations. --SAW 1998/11/1
846 int mss = tp->ack.rcv_mss;
847 int free_space = tcp_space(sk);
848 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
851 if (mss > full_space)
854 if (free_space < full_space/2) {
857 if (tcp_memory_pressure)
858 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
860 if (free_space < mss)
864 if (free_space > tp->rcv_ssthresh)
865 free_space = tp->rcv_ssthresh;
867 /* Don't do rounding if we are using window scaling, since the
868 * scaled window will not line up with the MSS boundary anyway.
870 window = tp->rcv_wnd;
871 if (tp->rcv_wscale) {
874 /* Advertise enough space so that it won't get scaled away.
875 * Important case: prevent a zero window announcement if
876 * 1<<rcv_wscale > mss.
878 if (((window >> tp->rcv_wscale) << tp->rcv_wscale) != window)
879 window = (((window >> tp->rcv_wscale) + 1)
882 /* Get the largest window that is a nice multiple of mss.
883 * Window clamp already applied above.
884 * If our current window offering is within 1 mss of the
885 * free space we just keep it. This prevents the divide
886 * and multiply from happening most of the time.
887 * We also don't do any window rounding when the free space
890 if (window <= free_space - mss || window > free_space)
891 window = (free_space/mss)*mss;
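/* Editorial worked example, not in the original source: without window
 * scaling, free_space = 10000 and mss = 1460 round down to
 * (10000 / 1460) * 1460 = 8760, so the advertised window moves in whole
 * MSS steps, matching the BSD behaviour described above.
 */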
897 /* Attempt to collapse two adjacent SKB's during retransmission. */
898 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
900 struct tcp_opt *tp = tcp_sk(sk);
901 struct sk_buff *next_skb = skb->next;
903 /* The first test we must make is that neither of these two
904 * SKB's are still referenced by someone else.
906 if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
907 int skb_size = skb->len, next_skb_size = next_skb->len;
908 u16 flags = TCP_SKB_CB(skb)->flags;
910 /* Also punt if next skb has been SACK'd. */
911 if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
914 /* Next skb is out of window. */
915 if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
918 /* Punt if not enough space exists in the first SKB for
919 * the data in the second, or the total combined payload
920 * would exceed the MSS.
922 if ((next_skb_size > skb_tailroom(skb)) ||
923 ((skb_size + next_skb_size) > mss_now))
926 BUG_ON(tcp_skb_pcount(skb) != 1 ||
927 tcp_skb_pcount(next_skb) != 1);
929 /* Ok. We will be able to collapse the packet. */
930 __skb_unlink(next_skb, next_skb->list);
932 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
934 if (next_skb->ip_summed == CHECKSUM_HW)
935 skb->ip_summed = CHECKSUM_HW;
937 if (skb->ip_summed != CHECKSUM_HW)
938 skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
940 /* Update sequence range on original skb. */
941 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
943 /* Merge over control information. */
944 flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
945 TCP_SKB_CB(skb)->flags = flags;
947 /* All done, get rid of second SKB and account for it so
948 * packet counting does not break.
950 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
951 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
952 tcp_dec_pcount(&tp->retrans_out, next_skb);
953 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
954 tcp_dec_pcount(&tp->lost_out, next_skb);
955 tcp_dec_pcount(&tp->left_out, next_skb);
957 /* Reno case is special. Sigh... */
958 if (!tp->sack_ok && tcp_get_pcount(&tp->sacked_out)) {
959 tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
960 tcp_dec_pcount(&tp->left_out, next_skb);
963 /* Not quite right: it can be > snd.fack, but
964 * it is better to underestimate fackets.
966 tcp_dec_pcount_approx(&tp->fackets_out, next_skb);
967 tcp_packets_out_dec(tp, next_skb);
968 sk_stream_free_skb(sk, next_skb);
972 /* Do a simple retransmit without using the backoff mechanisms in
973 * tcp_timer. This is used for path mtu discovery.
974 * The socket is already locked here.
976 void tcp_simple_retransmit(struct sock *sk)
978 struct tcp_opt *tp = tcp_sk(sk);
980 unsigned int mss = tcp_current_mss(sk, 0);
983 sk_stream_for_retrans_queue(skb, sk) {
984 if (skb->len > mss &&
985 !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
986 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
987 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
988 tcp_dec_pcount(&tp->retrans_out, skb);
990 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
991 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
992 tcp_inc_pcount(&tp->lost_out, skb);
1001 tcp_sync_left_out(tp);
1003 /* Don't muck with the congestion window here.
1004 * Reason is that we do not increase amount of _data_
1005 * in network, but units changed and effective
1006 * cwnd/ssthresh really reduced now.
1008 if (tp->ca_state != TCP_CA_Loss) {
1009 tp->high_seq = tp->snd_nxt;
1010 tp->snd_ssthresh = tcp_current_ssthresh(tp);
1011 tp->prior_ssthresh = 0;
1012 tp->undo_marker = 0;
1013 tcp_set_ca_state(tp, TCP_CA_Loss);
1015 tcp_xmit_retransmit_queue(sk);
1018 /* This retransmits one SKB. Policy decisions and retransmit queue
1019 * state updates are done by the caller. Returns non-zero if an
1020 * error occurred which prevented the send.
1022 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1024 struct tcp_opt *tp = tcp_sk(sk);
1025 unsigned int cur_mss = tcp_current_mss(sk, 0);
1028 /* Do not send more than we queued. 1/4 is reserved for possible
1029 * copying overhead: fragmentation, tunneling, mangling etc.
1031 if (atomic_read(&sk->sk_wmem_alloc) >
1032 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
1035 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
1036 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1039 if (sk->sk_route_caps & NETIF_F_TSO) {
1040 sk->sk_route_caps &= ~NETIF_F_TSO;
1041 sk->sk_no_largesend = 1;
1042 tp->mss_cache = tp->mss_cache_std;
1045 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
1049 /* If receiver has shrunk his window, and skb is out of
1050 * new window, do not retransmit it. The exception is the
1051 * case, when window is shrunk to zero. In this case
1052 * our retransmit serves as a zero window probe.
1054 if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
1055 && TCP_SKB_CB(skb)->seq != tp->snd_una)
1058 if (skb->len > cur_mss) {
1059 int old_factor = tcp_skb_pcount(skb);
1062 if (tcp_fragment(sk, skb, cur_mss))
1063 return -ENOMEM; /* We'll try again later. */
1065 /* New SKB created, account for it. */
1066 new_factor = tcp_skb_pcount(skb);
1067 tcp_dec_pcount_explicit(&tp->packets_out,
1068 old_factor - new_factor);
1069 tcp_inc_pcount(&tp->packets_out, skb->next);
1072 /* Collapse two adjacent packets if worthwhile and we can. */
1073 if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
1074 (skb->len < (cur_mss >> 1)) &&
1075 (skb->next != sk->sk_send_head) &&
1076 (skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
1077 (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
1078 (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(skb->next) == 1) &&
1079 (sysctl_tcp_retrans_collapse != 0))
1080 tcp_retrans_try_collapse(sk, skb, cur_mss);
1082 if(tp->af_specific->rebuild_header(sk))
1083 return -EHOSTUNREACH; /* Routing failure or similar. */
1085 /* Some Solaris stacks overoptimize and ignore the FIN on a
1086 * retransmit when old data is attached. So strip it off
1087 * since it is cheap to do so and saves bytes on the network.
1090 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
1091 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
1092 if (!pskb_trim(skb, 0)) {
1093 TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
1094 skb_shinfo(skb)->tso_segs = 1;
1095 skb_shinfo(skb)->tso_size = 0;
1096 skb->ip_summed = CHECKSUM_NONE;
1101 /* Make a copy, if the first transmission SKB clone we made
1102 * is still in somebody's hands, else make a clone.
1104 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1106 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1107 pskb_copy(skb, GFP_ATOMIC):
1108 skb_clone(skb, GFP_ATOMIC)));
1111 /* Update global TCP statistics. */
1112 TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
1114 tp->total_retrans++;
1116 #if FASTRETRANS_DEBUG > 0
1117 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
1118 if (net_ratelimit())
1119 printk(KERN_DEBUG "retrans_out leaked.\n");
1122 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
1123 tcp_inc_pcount(&tp->retrans_out, skb);
1125 /* Save stamp of the first retransmit. */
1126 if (!tp->retrans_stamp)
1127 tp->retrans_stamp = TCP_SKB_CB(skb)->when;
1131 /* snd_nxt is stored to detect loss of retransmitted segment,
1132 * see tcp_input.c tcp_sacktag_write_queue().
1134 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
1139 /* This gets called after a retransmit timeout, and the initially
1140 * retransmitted data is acknowledged. It tries to continue
1141 * resending the rest of the retransmit queue, until either
1142 * we've sent it all or the congestion window limit is reached.
1143 * If doing SACK, the first ACK which comes back for a timeout
1144 * based retransmit packet might feed us FACK information again.
1145 * If so, we use it to avoid unnecessary retransmissions.
1147 void tcp_xmit_retransmit_queue(struct sock *sk)
1149 struct tcp_opt *tp = tcp_sk(sk);
1150 struct sk_buff *skb;
1151 int packet_cnt = tcp_get_pcount(&tp->lost_out);
1153 /* First pass: retransmit lost packets. */
1155 sk_stream_for_retrans_queue(skb, sk) {
1156 __u8 sacked = TCP_SKB_CB(skb)->sacked;
1158 /* Assume this retransmit will generate
1159 * only one packet for congestion window
1160 * calculation purposes. This works because
1161 * tcp_retransmit_skb() will chop up the
1162 * packet to be MSS sized and all the
1163 * packet counting works out.
1165 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1168 if (sacked&TCPCB_LOST) {
1169 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
1170 if (tcp_retransmit_skb(sk, skb))
1172 if (tp->ca_state != TCP_CA_Loss)
1173 NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
1175 NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
1178 skb_peek(&sk->sk_write_queue))
1179 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1182 packet_cnt -= tcp_skb_pcount(skb);
1183 if (packet_cnt <= 0)
1189 /* OK, demanded retransmission is finished. */
1191 /* Forward retransmissions are possible only during Recovery. */
1192 if (tp->ca_state != TCP_CA_Recovery)
1195 /* No forward retransmissions in Reno are possible. */
1199 /* Yeah, we have to make a difficult choice between forward transmission
1200 * and retransmission... Both ways have their merits...
1202 * For now we do not retransmit anything, while we have some new
1206 if (tcp_may_send_now(sk, tp))
1211 sk_stream_for_retrans_queue(skb, sk) {
1212 /* Similar to the retransmit loop above we
1213 * can pretend that the retransmitted SKB
1214 * we send out here will be composed of one
1215 * real MSS sized packet because tcp_retransmit_skb()
1216 * will fragment it if necessary.
1218 if (++packet_cnt > tcp_get_pcount(&tp->fackets_out))
1221 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1224 if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
1227 /* Ok, retransmit it. */
1228 if (tcp_retransmit_skb(sk, skb))
1231 if (skb == skb_peek(&sk->sk_write_queue))
1232 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1234 NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
1239 /* Send a FIN. The caller locks the socket for us. This cannot be
1240 * allowed to fail queueing a FIN frame under any circumstances.
1242 void tcp_send_fin(struct sock *sk)
1244 struct tcp_opt *tp = tcp_sk(sk);
1245 struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
1248 /* Optimization, tack on the FIN if we have a queue of
1249 * unsent frames. But be careful about outgoing SACKS
1252 mss_now = tcp_current_mss(sk, 1);
1254 if (sk->sk_send_head != NULL) {
1255 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
1256 TCP_SKB_CB(skb)->end_seq++;
1259 /* Socket is locked, keep trying until memory is available. */
1261 skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
1267 /* Reserve space for headers and prepare control bits. */
1268 skb_reserve(skb, MAX_TCP_HEADER);
1270 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
1271 TCP_SKB_CB(skb)->sacked = 0;
1272 skb_shinfo(skb)->tso_segs = 1;
1273 skb_shinfo(skb)->tso_size = 0;
1275 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
1276 TCP_SKB_CB(skb)->seq = tp->write_seq;
1277 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1278 tcp_queue_skb(sk, skb);
1280 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF);
1283 /* We get here when a process closes a file descriptor (either due to
1284 * an explicit close() or as a byproduct of exit()'ing) and there
1285 * was unread data in the receive queue. This behavior is recommended
1286 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1288 void tcp_send_active_reset(struct sock *sk, int priority)
1290 struct tcp_opt *tp = tcp_sk(sk);
1291 struct sk_buff *skb;
1293 /* NOTE: No TCP options attached and we never retransmit this. */
1294 skb = alloc_skb(MAX_TCP_HEADER, priority);
1296 NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
1300 /* Reserve space for headers and prepare control bits. */
1301 skb_reserve(skb, MAX_TCP_HEADER);
1303 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
1304 TCP_SKB_CB(skb)->sacked = 0;
1305 skb_shinfo(skb)->tso_segs = 1;
1306 skb_shinfo(skb)->tso_size = 0;
1309 TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
1310 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1311 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1312 if (tcp_transmit_skb(sk, skb))
1313 NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
1316 /* WARNING: This routine must only be called when we have already sent
1317 * a SYN packet that crossed the incoming SYN that caused this routine
1318 * to get called. If this assumption fails then the initial rcv_wnd
1319 * and rcv_wscale values will not be correct.
1321 int tcp_send_synack(struct sock *sk)
1323 struct sk_buff* skb;
1325 skb = skb_peek(&sk->sk_write_queue);
1326 if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
1327 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
1330 if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
1331 if (skb_cloned(skb)) {
1332 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
1335 __skb_unlink(skb, &sk->sk_write_queue);
1336 __skb_queue_head(&sk->sk_write_queue, nskb);
1337 sk_stream_free_skb(sk, skb);
1338 sk_charge_skb(sk, nskb);
1342 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
1343 TCP_ECN_send_synack(tcp_sk(sk), skb);
1345 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1346 return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1350 * Prepare a SYN-ACK.
1352 struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1353 struct open_request *req)
1355 struct tcp_opt *tp = tcp_sk(sk);
1357 int tcp_header_size;
1358 struct sk_buff *skb;
1360 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
1364 /* Reserve space for headers. */
1365 skb_reserve(skb, MAX_TCP_HEADER);
1367 skb->dst = dst_clone(dst);
1369 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
1370 (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
1371 (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
1372 /* SACK_PERM is in the place of NOP NOP of TS */
1373 ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
1374 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
1376 memset(th, 0, sizeof(struct tcphdr));
1379 if (dst->dev->features&NETIF_F_TSO)
1381 TCP_ECN_make_synack(req, th);
1382 th->source = inet_sk(sk)->sport;
1383 th->dest = req->rmt_port;
1384 TCP_SKB_CB(skb)->seq = req->snt_isn;
1385 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1386 TCP_SKB_CB(skb)->sacked = 0;
1387 skb_shinfo(skb)->tso_segs = 1;
1388 skb_shinfo(skb)->tso_size = 0;
1389 th->seq = htonl(TCP_SKB_CB(skb)->seq);
1390 th->ack_seq = htonl(req->rcv_isn + 1);
1391 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1393 /* Set this up on the first call only */
1394 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
1395 /* tcp_full_space because it is guaranteed to be the first packet */
1396 tcp_select_initial_window(tcp_full_space(sk),
1397 dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
1402 req->rcv_wscale = rcv_wscale;
1405 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1406 th->window = htons(req->rcv_wnd);
1408 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1409 tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
1410 req->sack_ok, req->wscale_ok, req->rcv_wscale,
1411 TCP_SKB_CB(skb)->when,
1415 th->doff = (tcp_header_size >> 2);
1416 TCP_INC_STATS(TCP_MIB_OUTSEGS);
1421 * Do all connect socket setups that can be done AF independent.
1423 static inline void tcp_connect_init(struct sock *sk)
1425 struct dst_entry *dst = __sk_dst_get(sk);
1426 struct tcp_opt *tp = tcp_sk(sk);
1428 /* We'll fix this up when we get a response from the other end.
1429 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
1431 tp->tcp_header_len = sizeof(struct tcphdr) +
1432 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
1434 /* If user gave his TCP_MAXSEG, record it to clamp */
1436 tp->mss_clamp = tp->user_mss;
1438 tcp_sync_mss(sk, dst_pmtu(dst));
1440 if (!tp->window_clamp)
1441 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1442 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1443 tcp_initialize_rcv_mss(sk);
1446 tcp_select_initial_window(tcp_full_space(sk),
1447 tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
1450 sysctl_tcp_window_scaling,
1453 tp->rcv_ssthresh = tp->rcv_wnd;
1456 sock_reset_flag(sk, SOCK_DONE);
1458 tcp_init_wl(tp, tp->write_seq, 0);
1459 tp->snd_una = tp->write_seq;
1460 tp->snd_sml = tp->write_seq;
1465 tp->rto = TCP_TIMEOUT_INIT;
1466 tp->retransmits = 0;
1467 tcp_clear_retrans(tp);
1471 * Build a SYN and send it off.
1473 int tcp_connect(struct sock *sk)
1475 struct tcp_opt *tp = tcp_sk(sk);
1476 struct sk_buff *buff;
1478 tcp_connect_init(sk);
1480 buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
1481 if (unlikely(buff == NULL))
1484 /* Reserve space for headers. */
1485 skb_reserve(buff, MAX_TCP_HEADER);
1487 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
1488 TCP_ECN_send_syn(sk, tp, buff);
1489 TCP_SKB_CB(buff)->sacked = 0;
1490 skb_shinfo(buff)->tso_segs = 1;
1491 skb_shinfo(buff)->tso_size = 0;
1493 TCP_SKB_CB(buff)->seq = tp->write_seq++;
1494 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1495 tp->snd_nxt = tp->write_seq;
1496 tp->pushed_seq = tp->write_seq;
1500 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1501 tp->retrans_stamp = TCP_SKB_CB(buff)->when;
1502 __skb_queue_tail(&sk->sk_write_queue, buff);
1503 sk_charge_skb(sk, buff);
1504 tcp_inc_pcount(&tp->packets_out, buff);
1505 tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
1506 TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
1508 /* Timer for repeating the SYN until an answer. */
1509 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1513 /* Send out a delayed ack, the caller does the policy checking
1514 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
1517 void tcp_send_delayed_ack(struct sock *sk)
1519 struct tcp_opt *tp = tcp_sk(sk);
1520 int ato = tp->ack.ato;
1521 unsigned long timeout;
1523 if (ato > TCP_DELACK_MIN) {
1526 if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
1527 max_ato = TCP_DELACK_MAX;
1529 /* Slow path, intersegment interval is "high". */
1531 /* If some rtt estimate is known, use it to bound delayed ack.
1532 * Do not use tp->rto here, use results of rtt measurements
1536 int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
1542 ato = min(ato, max_ato);
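/* Editorial example, not in the original source: tp->srtt stores eight
 * times the smoothed RTT, so srtt>>3 is the RTT itself. With a 120 ms
 * RTT the bound above caps max_ato at 120 ms, and the delayed ACK is
 * scheduled within roughly one round-trip time.
 */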
1545 /* Stay within the limit we were given */
1546 timeout = jiffies + ato;
1548 /* Use the new timeout only if there wasn't an older one earlier.
1549 if (tp->ack.pending&TCP_ACK_TIMER) {
1550 /* If delack timer was blocked or is about to expire,
1553 if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
1558 if (!time_before(timeout, tp->ack.timeout))
1559 timeout = tp->ack.timeout;
1561 tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
1562 tp->ack.timeout = timeout;
1563 sk_reset_timer(sk, &tp->delack_timer, timeout);
1566 /* This routine sends an ack and also updates the window. */
1567 void tcp_send_ack(struct sock *sk)
1569 /* If we have been reset, we may not send again. */
1570 if (sk->sk_state != TCP_CLOSE) {
1571 struct tcp_opt *tp = tcp_sk(sk);
1572 struct sk_buff *buff;
1574 /* We are not putting this on the write queue, so
1575 * tcp_transmit_skb() will set the ownership to this
1578 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1580 tcp_schedule_ack(tp);
1581 tp->ack.ato = TCP_ATO_MIN;
1582 tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
1586 /* Reserve space for headers and prepare control bits. */
1587 skb_reserve(buff, MAX_TCP_HEADER);
1589 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
1590 TCP_SKB_CB(buff)->sacked = 0;
1591 skb_shinfo(buff)->tso_segs = 1;
1592 skb_shinfo(buff)->tso_size = 0;
1594 /* Send it off, this clears delayed acks for us. */
1595 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
1596 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1597 tcp_transmit_skb(sk, buff);
1601 /* This routine sends a packet with an out of date sequence
1602 * number. It assumes the other end will try to ack it.
1604 * Question: what should we do while in urgent mode?
1605 * 4.4BSD forces sending single byte of data. We cannot send
1606 * out of window data, because we have SND.NXT==SND.MAX...
1608 * Current solution: to send TWO zero-length segments in urgent mode:
1609 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
1610 * out-of-date with SND.UNA-1 to probe window.
1612 static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
1614 struct tcp_opt *tp = tcp_sk(sk);
1615 struct sk_buff *skb;
1617 /* We don't queue it, tcp_transmit_skb() sets ownership. */
1618 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1622 /* Reserve space for headers and set control bits. */
1623 skb_reserve(skb, MAX_TCP_HEADER);
1625 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
1626 TCP_SKB_CB(skb)->sacked = urgent;
1627 skb_shinfo(skb)->tso_segs = 1;
1628 skb_shinfo(skb)->tso_size = 0;
1630 /* Use a previous sequence. This should cause the other
1631 * end to send an ack. Don't queue or clone SKB, just
1634 TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
1635 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1636 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1637 return tcp_transmit_skb(sk, skb);
1640 int tcp_write_wakeup(struct sock *sk)
1642 if (sk->sk_state != TCP_CLOSE) {
1643 struct tcp_opt *tp = tcp_sk(sk);
1644 struct sk_buff *skb;
1646 if ((skb = sk->sk_send_head) != NULL &&
1647 before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
1649 unsigned int mss = tcp_current_mss(sk, 0);
1650 unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
1652 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
1653 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
1655 /* We are probing the opening of a window
1656 * but the window size is != 0; this
1657 * must have been a result of sender-side SWS avoidance.
1659 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
1661 seg_size = min(seg_size, mss);
1662 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1663 if (tcp_fragment(sk, skb, seg_size))
1665 /* SWS override triggered forced fragmentation.
1666 * Disable TSO, the connection is too sick. */
1667 if (sk->sk_route_caps & NETIF_F_TSO) {
1668 sk->sk_no_largesend = 1;
1669 sk->sk_route_caps &= ~NETIF_F_TSO;
1670 tp->mss_cache = tp->mss_cache_std;
1672 } else if (!tcp_skb_pcount(skb))
1673 tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
1675 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1676 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1677 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1679 update_send_head(sk, tp, skb);
1684 between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
1685 tcp_xmit_probe_skb(sk, TCPCB_URG);
1686 return tcp_xmit_probe_skb(sk, 0);
1692 /* A window probe timeout has occurred. If the window is not closed, send
1693 * a partial packet, else send a zero probe.
1695 void tcp_send_probe0(struct sock *sk)
1697 struct tcp_opt *tp = tcp_sk(sk);
1700 err = tcp_write_wakeup(sk);
1702 if (tcp_get_pcount(&tp->packets_out) || !sk->sk_send_head) {
1703 /* Cancel probe timer, if it is not required. */
1710 if (tp->backoff < sysctl_tcp_retries2)
1713 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1714 min(tp->rto << tp->backoff, TCP_RTO_MAX));
1716 /* If packet was not sent due to local congestion,
1717 * do not backoff and do not remember probes_out.
1718 * Let local senders fight for local resources.
1720 * Still, use the accumulated backoff.
1722 if (!tp->probes_out)
1724 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1725 min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
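/* Editorial example, not in the original source: with rto = 200 ms the
 * successive zero-window probes are spaced 200 ms, 400 ms, 800 ms, ...
 * apart as tp->backoff grows, capped at TCP_RTO_MAX; a probe suppressed
 * by local congestion is rescheduled after at most
 * TCP_RESOURCE_PROBE_INTERVAL without raising the backoff.
 */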
1729 EXPORT_SYMBOL(tcp_connect);
1730 EXPORT_SYMBOL(tcp_make_synack);
1731 EXPORT_SYMBOL(tcp_simple_retransmit);
1732 EXPORT_SYMBOL(tcp_sync_mss);