2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
24 * Changes: Pedro Roque : Retransmit queue handled by TCP.
25 * : Fragmentation on mtu decrease
26 * : Segment collapse on retransmit
29 * Linus Torvalds : send_delayed_ack
30 * David S. Miller : Charge memory using the right skb
31 * during syn/ack processing.
32 * David S. Miller : Output engine completely rewritten.
33 * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
34 * Cacophonix Gaul : draft-minshall-nagle-01
35 * J Hadi Salim : ECN support
41 #include <linux/compiler.h>
42 #include <linux/module.h>
43 #include <linux/smp_lock.h>
45 /* People can turn this off for buggy TCP's found in printers etc. */
46 int sysctl_tcp_retrans_collapse = 1;
48 /* This limits the percentage of the congestion window which we
49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty.
52 int sysctl_tcp_tso_win_divisor = 8;
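/* A rough worked example of the divisor (values assumed for illustration):
 * with snd_cwnd = 40 segments and sysctl_tcp_tso_win_divisor = 8, a single
 * TSO super-packet is limited to about 40 / 8 = 5 segments, so one burst
 * never consumes more than 1/8 of the congestion window.
 */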
55 void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
57 sk->sk_send_head = skb->next;
58 if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
59 sk->sk_send_head = NULL;
60 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
61 tcp_packets_out_inc(sk, tp, skb);
64 /* SND.NXT, if the window was not shrunk.
65 * If the window has been shrunk, what should we send? It is not clear at all.
66 * Using SND.UNA we will fail to open the window, and SND.NXT is out of window. :-(
67 * Anything in between SND.UNA and SND.UNA+SND.WND may also already be
68 * invalid. OK, let's settle on this for now:
70 static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
72 if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
73 return tp->snd_nxt;
74 else
75 return tp->snd_una+tp->snd_wnd;
78 /* Calculate mss to advertise in SYN segment.
79 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
81 * 1. It is independent of path mtu.
82 * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
83 * 3. For IPv4 it is reasonable to calculate it from the maximal MTU of
84 * attached devices, because some buggy hosts are confused by
85 * large MSS values.
86 * 4. We do not do 3; we advertise the MSS calculated from the first
87 * hop device mtu, but allow it to be raised to ip_rt_min_advmss.
88 * This may be overridden via information stored in the routing table.
89 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
90 * probably even Jumbo".
92 static __u16 tcp_advertise_mss(struct sock *sk)
94 struct tcp_opt *tp = tcp_sk(sk);
95 struct dst_entry *dst = __sk_dst_get(sk);
98 if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
99 mss = dst_metric(dst, RTAX_ADVMSS);
106 /* RFC2861. Reset CWND after an idle period longer than RTO to the "restart window".
107 * This is the first part of the cwnd validation mechanism. */
108 static void tcp_cwnd_restart(struct tcp_opt *tp, struct dst_entry *dst)
110 s32 delta = tcp_time_stamp - tp->lsndtime;
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd;
114 if (tcp_is_vegas(tp))
115 tcp_vegas_enable(tp);
117 tp->snd_ssthresh = tcp_current_ssthresh(tp);
118 restart_cwnd = min(restart_cwnd, cwnd);
120 while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
121 cwnd >>= 1;
122 tp->snd_cwnd = max(cwnd, restart_cwnd);
123 tp->snd_cwnd_stamp = tcp_time_stamp;
124 tp->snd_cwnd_used = 0;
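/* Rough illustration of the restart logic above (values assumed): if the
 * connection sat idle for three RTOs with cwnd = 32 and restart_cwnd = 4,
 * the loop halves cwnd twice (32 -> 16 -> 8), and snd_cwnd is set to
 * max(8, 4) = 8 before the stamp and usage counters are reset.
 */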
127 static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb, struct sock *sk)
129 u32 now = tcp_time_stamp;
131 if (!tcp_get_pcount(&tp->packets_out) &&
132 (s32)(now - tp->lsndtime) > tp->rto)
133 tcp_cwnd_restart(tp, __sk_dst_get(sk));
137 /* If this is a reply sent within ato of the last received
138 * packet, enter pingpong mode.
140 if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
141 tp->ack.pingpong = 1;
144 static __inline__ void tcp_event_ack_sent(struct sock *sk)
146 struct tcp_opt *tp = tcp_sk(sk);
148 tcp_dec_quickack_mode(tp);
149 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
152 /* Determine a window scaling and initial window to offer.
153 * Based on the assumption that the given amount of space
154 * will be offered. Store the results in the tp structure.
155 * NOTE: for smooth operation initial space offering should
156 * be a multiple of mss if possible. We assume here that mss >= 1.
157 * This MUST be enforced by all callers.
159 void tcp_select_initial_window(int __space, __u32 mss,
160 __u32 *rcv_wnd, __u32 *window_clamp,
161 int wscale_ok, __u8 *rcv_wscale)
163 unsigned int space = (__space < 0 ? 0 : __space);
165 /* If no clamp is set, set the clamp to the max possible scaled window */
166 if (*window_clamp == 0)
167 (*window_clamp) = (65535 << 14);
168 space = min(*window_clamp, space);
170 /* Quantize space offering to a multiple of mss if possible. */
172 space = (space / mss) * mss;
174 /* NOTE: offering an initial window larger than 32767
175 * will break some buggy TCP stacks. We try to be nice.
176 * If we are not window scaling, then this truncates
177 * our initial window offering to 32k. There should also
178 * be a sysctl option to stop being nice.
180 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
183 /* Set window scaling on max possible window
184 * See RFC1323 for an explanation of the limit to 14
186 space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
187 while (space > 65535 && (*rcv_wscale) < 14) {
188 space >>= 1;
189 (*rcv_wscale)++;
190 }
193 /* Set the initial window to a value enough for senders
194 * following RFC 2414. Senders not following this RFC
195 * will be satisfied with 2.
197 if (mss > (1<<*rcv_wscale)) {
203 if (*rcv_wnd > init_cwnd*mss)
204 *rcv_wnd = init_cwnd*mss;
207 /* Set the clamp no higher than max representable value */
208 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
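/* Rough illustration of the window selection (values assumed, e.g.
 * sysctl_tcp_rmem[2] = 262144 and mss = 1460): the scaling loop above
 * shifts 262144 -> 131072 -> 65536 -> 32768, yielding rcv_wscale = 3,
 * while the offered space is quantized down to a multiple of the mss
 * and capped at MAX_TCP_WINDOW before the RFC 2414 adjustment.
 */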
211 /* Choose a new window to advertise, update state in tcp_opt for the
212 * socket, and return the result with RFC 1323 scaling applied. The return
213 * value can be stuffed directly into th->window for an outgoing frame.
216 static __inline__ u16 tcp_select_window(struct sock *sk)
218 struct tcp_opt *tp = tcp_sk(sk);
219 u32 cur_win = tcp_receive_window(tp);
220 u32 new_win = __tcp_select_window(sk);
222 /* Never shrink the offered window */
223 if(new_win < cur_win) {
224 /* Danger Will Robinson!
225 * Don't update rcv_wup/rcv_wnd here or else
226 * we will not be able to advertise a zero
227 * window in time. --DaveM
229 * Relax Will Robinson.
233 tp->rcv_wnd = new_win;
234 tp->rcv_wup = tp->rcv_nxt;
236 /* Make sure we do not exceed the maximum possible window. */
240 new_win = min(new_win, MAX_TCP_WINDOW);
242 new_win = min(new_win, (65535U << tp->rcv_wscale));
244 /* RFC1323 scaling applied */
245 new_win >>= tp->rcv_wscale;
247 /* If we advertise zero window, disable fast path. */
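/* For illustration (values assumed): with rcv_wnd = 250000 bytes and
 * rcv_wscale = 3, the field sent on the wire is 250000 >> 3 = 31250,
 * and the peer reconstructs 31250 << 3 = 250000 when it applies the
 * scale factor negotiated at connection setup.
 */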
255 /* This routine actually transmits TCP packets queued in by
256 * tcp_do_sendmsg(). This is used by both the initial
257 * transmission and possible later retransmissions.
258 * All SKB's seen here are completely headerless. It is our
259 * job to build the TCP header, and pass the packet down to
260 * IP so it can do the same plus pass the packet off to the device.
263 * We are working here with either a clone of the original
264 * SKB, or a fresh unique copy made by the retransmit engine.
266 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
269 struct inet_opt *inet = inet_sk(sk);
270 struct tcp_opt *tp = tcp_sk(sk);
271 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
272 int tcp_header_size = tp->tcp_header_len;
277 BUG_ON(!tcp_skb_pcount(skb));
279 #define SYSCTL_FLAG_TSTAMPS 0x1
280 #define SYSCTL_FLAG_WSCALE 0x2
281 #define SYSCTL_FLAG_SACK 0x4
284 if (tcb->flags & TCPCB_FLAG_SYN) {
285 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
286 if(sysctl_tcp_timestamps) {
287 tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
288 sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
290 if(sysctl_tcp_window_scaling) {
291 tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
292 sysctl_flags |= SYSCTL_FLAG_WSCALE;
294 if(sysctl_tcp_sack) {
295 sysctl_flags |= SYSCTL_FLAG_SACK;
296 if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
297 tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
299 } else if (tp->eff_sacks) {
300 /* A SACK is 2 pad bytes, a 2 byte header, plus
301 * 2 32-bit sequence numbers for each SACK block.
303 tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
304 (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
308 * If the connection is idle and we are restarting,
309 * then we don't want to do any Vegas calculations
310 * until we get fresh RTT samples. So when we
311 * restart, we reset our Vegas state to a clean
312 * slate. After we get acks for this flight of
313 * packets, _then_ we can make Vegas calculations
316 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
317 tcp_vegas_enable(tp);
319 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
321 skb_set_owner_w(skb, sk);
323 /* Build TCP header and checksum it. */
324 th->source = inet->sport;
325 th->dest = inet->dport;
326 th->seq = htonl(tcb->seq);
327 th->ack_seq = htonl(tp->rcv_nxt);
328 *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
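/* The store above packs the 4-bit data offset (header length in 32-bit
 * words) and the flag bits into the 16-bit word at byte offset 12 of the
 * header. For illustration (assuming TCPCB_FLAG_ACK mirrors the header
 * ACK bit, 0x10): a 32-byte header on a pure ACK gives
 * ((32 >> 2) << 12) | 0x10 = 0x8010 before the htons().
 */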
329 if (tcb->flags & TCPCB_FLAG_SYN) {
330 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
333 th->window = htons(tp->rcv_wnd);
335 th->window = htons(tcp_select_window(sk));
341 between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
342 th->urg_ptr = htons(tp->snd_up-tcb->seq);
346 if (tcb->flags & TCPCB_FLAG_SYN) {
347 tcp_syn_build_options((__u32 *)(th + 1),
348 tcp_advertise_mss(sk),
349 (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
350 (sysctl_flags & SYSCTL_FLAG_SACK),
351 (sysctl_flags & SYSCTL_FLAG_WSCALE),
356 tcp_build_and_update_options((__u32 *)(th + 1),
359 TCP_ECN_send(sk, tp, skb, tcp_header_size);
361 tp->af_specific->send_check(sk, th, skb->len, skb);
363 if (tcb->flags & TCPCB_FLAG_ACK)
364 tcp_event_ack_sent(sk);
366 if (skb->len != tcp_header_size)
367 tcp_event_data_sent(tp, skb, sk);
369 TCP_INC_STATS(TCP_MIB_OUTSEGS);
371 err = tp->af_specific->queue_xmit(skb, 0);
377 /* NET_XMIT_CN is special. It does not guarantee that
378 * this packet is lost. It tells us that the device
379 * is about to start dropping packets, or already drops
380 * some packets of the same priority, and invites us
381 * to send less aggressively.
383 return err == NET_XMIT_CN ? 0 : err;
386 #undef SYSCTL_FLAG_TSTAMPS
387 #undef SYSCTL_FLAG_WSCALE
388 #undef SYSCTL_FLAG_SACK
392 /* This routine just queues the buffer.
394 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
395 * otherwise socket can stall.
397 static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
399 struct tcp_opt *tp = tcp_sk(sk);
401 /* Advance write_seq and place onto the write_queue. */
402 tp->write_seq = TCP_SKB_CB(skb)->end_seq;
403 __skb_queue_tail(&sk->sk_write_queue, skb);
404 sk_charge_skb(sk, skb);
406 /* Queue it, remembering where we must start sending. */
407 if (sk->sk_send_head == NULL)
408 sk->sk_send_head = skb;
411 /* Send a _single_ skb sitting at the send head. This function requires
412 * a true push of pending frames to set up the probe timer etc.
414 void tcp_push_one(struct sock *sk, unsigned cur_mss)
416 struct tcp_opt *tp = tcp_sk(sk);
417 struct sk_buff *skb = sk->sk_send_head;
419 if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) {
420 /* Send it out now. */
421 TCP_SKB_CB(skb)->when = tcp_time_stamp;
422 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
423 sk->sk_send_head = NULL;
424 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
425 tcp_packets_out_inc(sk, tp, skb);
431 void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std)
433 if (skb->len <= mss_std) {
434 /* Avoid the costly divide in the normal case. */
437 skb_shinfo(skb)->tso_segs = 1;
438 skb_shinfo(skb)->tso_size = 0;
442 factor = skb->len + (mss_std - 1);
443 factor /= mss_std;
444 skb_shinfo(skb)->tso_segs = factor;
445 skb_shinfo(skb)->tso_size = mss_std;
448 EXPORT_SYMBOL_GPL(tcp_set_skb_tso_segs);
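/* Worked example of the segment count above (values assumed): with
 * skb->len = 4000 and mss_std = 1460, factor = (4000 + 1459) / 1460 = 3,
 * so the skb is marked as three MSS-sized segments with tso_size = 1460.
 */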
450 /* Function to create two new TCP segments. Shrinks the given segment
451 * to the specified size and appends a new segment with the rest of the
452 * packet to the list. This won't be called frequently, I hope.
453 * Remember, these are still headerless SKBs at this point.
455 static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
457 struct tcp_opt *tp = tcp_sk(sk);
458 struct sk_buff *buff;
462 nsize = skb_headlen(skb) - len;
466 if (skb_cloned(skb) &&
467 skb_is_nonlinear(skb) &&
468 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
471 /* Get a new skb... force flag on. */
472 buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
474 return -ENOMEM; /* We'll just try again later. */
475 sk_charge_skb(sk, buff);
477 /* Correct the sequence numbers. */
478 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
479 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
480 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
482 /* PSH and FIN should only be set in the second packet. */
483 flags = TCP_SKB_CB(skb)->flags;
484 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
485 TCP_SKB_CB(buff)->flags = flags;
486 TCP_SKB_CB(buff)->sacked =
487 (TCP_SKB_CB(skb)->sacked &
488 (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
489 TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
491 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
492 /* Copy and checksum data tail into the new buffer. */
493 buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
498 skb->csum = csum_block_sub(skb->csum, buff->csum, len);
500 skb->ip_summed = CHECKSUM_HW;
501 skb_split(skb, buff, len);
504 buff->ip_summed = skb->ip_summed;
506 /* Looks stupid, but our code really uses the 'when' field of
507 * skbs which it has never sent before. --ANK
509 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
511 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
512 tcp_dec_pcount(&tp->lost_out, skb);
513 tcp_dec_pcount(&tp->left_out, skb);
516 /* Fix up tso_factor for both original and new SKB. */
517 tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
518 tcp_set_skb_tso_segs(buff, tp->mss_cache_std);
520 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
521 tcp_inc_pcount(&tp->lost_out, skb);
522 tcp_inc_pcount(&tp->left_out, skb);
525 if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
526 tcp_inc_pcount(&tp->lost_out, buff);
527 tcp_inc_pcount(&tp->left_out, buff);
530 /* Link BUFF into the send queue. */
531 __skb_append(skb, buff);
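/* Rough illustration of the split (sequence numbers assumed): fragmenting
 * an skb covering [1000, 4000) at len = 1460 leaves the original skb as
 * [1000, 2460) and links buff as [2460, 4000) right after it; PSH and FIN,
 * if set, travel with buff, the second half.
 */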
536 /* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
537 * eventually). The difference is that the pulled data is not copied, but
538 * immediately discarded.
540 static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
546 for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
547 if (skb_shinfo(skb)->frags[i].size <= eat) {
548 put_page(skb_shinfo(skb)->frags[i].page);
549 eat -= skb_shinfo(skb)->frags[i].size;
551 skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
553 skb_shinfo(skb)->frags[k].page_offset += eat;
554 skb_shinfo(skb)->frags[k].size -= eat;
560 skb_shinfo(skb)->nr_frags = k;
562 skb->tail = skb->data;
563 skb->data_len -= len;
564 skb->len = skb->data_len;
568 int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
570 if (skb_cloned(skb) &&
571 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
574 if (len <= skb_headlen(skb)) {
575 __skb_pull(skb, len);
577 if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
581 TCP_SKB_CB(skb)->seq += len;
582 skb->ip_summed = CHECKSUM_HW;
584 skb->truesize -= len;
585 sk->sk_queue_shrunk = 1;
586 sk->sk_wmem_queued -= len;
587 sk->sk_forward_alloc += len;
589 /* Any change of skb->len requires recalculation of the tso factor. */
592 if (tcp_skb_pcount(skb) > 1)
593 tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
598 /* This function synchronizes snd mss to the current pmtu/exthdr set.
600 tp->user_mss is the mss set by the user via TCP_MAXSEG. It does NOT account
601 for TCP options; it includes only the bare TCP header.
603 tp->mss_clamp is the mss negotiated at connection setup.
604 It is the minimum of user_mss and the mss received with SYN.
605 It also does not include TCP options.
607 tp->pmtu_cookie is the last pmtu seen by this function.
609 tp->mss_cache is the current effective sending mss, including
610 all tcp options except for SACKs. It is evaluated,
611 taking into account the current pmtu, but never exceeds
612 tp->mss_clamp.
614 NOTE1. rfc1122 clearly states that advertised MSS
615 DOES NOT include either tcp or ip options.
617 NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
618 this function. --ANK (980731)
621 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
623 struct tcp_opt *tp = tcp_sk(sk);
624 struct dst_entry *dst = __sk_dst_get(sk);
627 if (dst && dst->ops->get_mss)
628 pmtu = dst->ops->get_mss(dst, pmtu);
630 /* Calculate base mss without TCP options:
631 It is MMS_S - sizeof(tcphdr) of rfc1122
633 mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
635 /* Clamp it (mss_clamp does not include tcp options) */
636 if (mss_now > tp->mss_clamp)
637 mss_now = tp->mss_clamp;
639 /* Now subtract optional transport overhead */
640 mss_now -= tp->ext_header_len + tp->ext2_header_len;
642 /* Then reserve room for full set of TCP options and 8 bytes of data */
646 /* Now subtract TCP options size, not including SACKs */
647 mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
649 /* Bound mss with half of window */
650 if (tp->max_window && mss_now > (tp->max_window>>1))
651 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
653 /* And store cached results */
654 tp->pmtu_cookie = pmtu;
655 tp->mss_cache = tp->mss_cache_std = mss_now;
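/* Rough worked example (IPv4, no IP or extension options assumed):
 * pmtu = 1500 gives 1500 - 20 - 20 = 1460; with timestamps enabled,
 * tcp_header_len is 20 + 12, so the cached sending mss becomes
 * 1460 - 12 = 1448. SACK blocks are deliberately not counted here.
 */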
660 /* Compute the current effective MSS, taking SACKs and IP options,
661 * and even PMTU discovery events into account.
663 * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
664 * cannot be large. However, taking into account the rare use of URG, this is not a big flaw.
668 unsigned int tcp_current_mss(struct sock *sk, int large)
670 struct tcp_opt *tp = tcp_sk(sk);
671 struct dst_entry *dst = __sk_dst_get(sk);
672 unsigned int do_large, mss_now;
674 mss_now = tp->mss_cache_std;
676 u32 mtu = dst_pmtu(dst);
677 if (mtu != tp->pmtu_cookie ||
678 tp->ext2_header_len != dst->header_len)
679 mss_now = tcp_sync_mss(sk, mtu);
683 (sk->sk_route_caps & NETIF_F_TSO) &&
687 unsigned int large_mss, factor, limit;
689 large_mss = 65535 - tp->af_specific->net_header_len -
690 tp->ext_header_len - tp->ext2_header_len -
693 if (tp->max_window && large_mss > (tp->max_window>>1))
694 large_mss = max((tp->max_window>>1),
695 68U - tp->tcp_header_len);
697 factor = large_mss / mss_now;
699 /* Always keep large mss multiple of real mss, but
700 * do not exceed 1/tso_win_divisor of the congestion window
701 * so we can keep the ACK clock ticking and minimize burstiness.
704 limit = tp->snd_cwnd;
705 if (sysctl_tcp_tso_win_divisor)
706 limit /= sysctl_tcp_tso_win_divisor;
707 limit = max(1U, limit);
711 tp->mss_cache = mss_now * factor;
713 mss_now = tp->mss_cache;
717 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
718 (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
721 EXPORT_SYMBOL_GPL(tcp_current_mss);
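/* Rough illustration of the TSO sizing above (values assumed, and assuming
 * the elided lines cap factor at limit): with mss_now = 1448, snd_cwnd = 32
 * and sysctl_tcp_tso_win_divisor = 8, limit = max(1, 32 / 8) = 4, so the
 * cached super-packet size becomes mss_cache = 4 * 1448 = 5792 bytes.
 */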
723 /* This routine writes packets to the network. It advances the
724 * send_head. This happens as incoming acks open up the remote window for us.
727 * Returns 1, if no segments are in flight and we have queued segments, but
728 * cannot send anything now because of SWS or another problem.
730 int tcp_write_xmit(struct sock *sk, int nonagle)
732 struct tcp_opt *tp = tcp_sk(sk);
733 unsigned int mss_now;
735 /* If we are closed, the bytes will have to remain here.
736 * In time closedown will finish, we empty the write queue and all will be happy.
739 if (sk->sk_state != TCP_CLOSE) {
743 /* Account for SACKS, we may need to fragment due to this.
744 * It is just like the real MSS changing on us midstream.
745 * We also handle things correctly when the user adds some
746 * IP options mid-stream. Silly to do, but cover it.
748 mss_now = tcp_current_mss(sk, 1);
750 while ((skb = sk->sk_send_head) &&
751 tcp_snd_test(tp, skb, mss_now,
752 tcp_skb_is_last(sk, skb) ? nonagle :
754 if (skb->len > mss_now) {
755 if (tcp_fragment(sk, skb, mss_now))
756 break;
757 }
759 TCP_SKB_CB(skb)->when = tcp_time_stamp;
760 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
761 break;
763 /* Advance the send_head. This one is sent out.
764 * This call will increment packets_out.
766 update_send_head(sk, tp, skb);
768 tcp_minshall_update(tp, mss_now, skb);
773 tcp_cwnd_validate(sk, tp);
777 return !tcp_get_pcount(&tp->packets_out) && sk->sk_send_head;
782 /* This function returns the amount that we can raise the
783 * usable window based on the following constraints
785 * 1. The window can never be shrunk once it is offered (RFC 793)
786 * 2. We limit memory per socket
789 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
790 * RECV.NEXT + RCV.WIN fixed until:
791 * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
793 * i.e. don't raise the right edge of the window until you can raise
794 * it at least MSS bytes.
796 * Unfortunately, the recommended algorithm breaks header prediction,
797 * since header prediction assumes th->window stays fixed.
799 * Strictly speaking, keeping th->window fixed violates the receiver
800 * side SWS prevention criteria. The problem is that under this rule
801 * a stream of single byte packets will cause the right side of the
802 * window to always advance by a single byte.
804 * Of course, if the sender implements sender side SWS prevention
805 * then this will not be a problem.
807 * BSD seems to make the following compromise:
809 * If the free space is less than 1/4 of the maximum
810 * space available and the free space is less than 1/2 mss,
811 * then set the window to 0.
812 * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
813 * Otherwise, just prevent the window from shrinking
814 * and from being larger than the largest representable value.
816 * This prevents incremental opening of the window in the regime
817 * where TCP is limited by the speed of the reader side taking
818 * data out of the TCP receive queue. It does nothing about
819 * those cases where the window is constrained on the sender side
820 * because the pipeline is full.
822 * BSD also seems to "accidentally" limit itself to windows that are a
823 * multiple of MSS, at least until the free space gets quite small.
824 * This would appear to be a side effect of the mbuf implementation.
825 * Combining these two algorithms results in the observed behavior
826 * of having a fixed window size at almost all times.
828 * Below we obtain similar behavior by forcing the offered window to
829 * a multiple of the mss when it is feasible to do so.
831 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
832 * Regular options like TIMESTAMP are taken into account.
834 u32 __tcp_select_window(struct sock *sk)
836 struct tcp_opt *tp = tcp_sk(sk);
837 /* MSS for the peer's data. Previous versions used mss_clamp
838 * here. I don't know if the value based on our guesses
839 * of peer's MSS is better for the performance. It's more correct
840 * but may be worse for the performance because of rcv_mss
841 * fluctuations. --SAW 1998/11/1
843 int mss = tp->ack.rcv_mss;
844 int free_space = tcp_space(sk);
845 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
848 if (mss > full_space)
849 mss = full_space;
851 if (free_space < full_space/2) {
854 if (tcp_memory_pressure)
855 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
857 if (free_space < mss)
861 if (free_space > tp->rcv_ssthresh)
862 free_space = tp->rcv_ssthresh;
864 /* Don't do rounding if we are using window scaling, since the
865 * scaled window will not line up with the MSS boundary anyway.
867 window = tp->rcv_wnd;
868 if (tp->rcv_wscale) {
871 /* Advertise enough space so that it won't get scaled away.
872 * Important case: prevent zero window announcement if
873 * 1<<rcv_wscale > mss.
875 if (((window >> tp->rcv_wscale) << tp->rcv_wscale) != window)
876 window = (((window >> tp->rcv_wscale) + 1)
879 /* Get the largest window that is a nice multiple of mss.
880 * Window clamp already applied above.
881 * If our current window offering is within 1 mss of the
882 * free space we just keep it. This prevents the divide
883 * and multiply from happening most of the time.
884 * We also don't do any window rounding when the free space is too small.
887 if (window <= free_space - mss || window > free_space)
888 window = (free_space/mss)*mss;
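/* Worked example of the rounding above (values assumed): free_space = 10000
 * and mss = 1460 give window = (10000 / 1460) * 1460 = 6 * 1460 = 8760,
 * i.e. the offer is trimmed down to a clean multiple of the mss.
 */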
894 /* Attempt to collapse two adjacent SKB's during retransmission. */
895 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
897 struct tcp_opt *tp = tcp_sk(sk);
898 struct sk_buff *next_skb = skb->next;
900 /* The first test we must make is that neither of these two
901 * SKBs is still referenced by someone else.
903 if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
904 int skb_size = skb->len, next_skb_size = next_skb->len;
905 u16 flags = TCP_SKB_CB(skb)->flags;
907 /* Also punt if next skb has been SACK'd. */
908 if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
911 /* Next skb is out of window. */
912 if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
915 /* Punt if not enough space exists in the first SKB for
916 * the data in the second, or the total combined payload
917 * would exceed the MSS.
919 if ((next_skb_size > skb_tailroom(skb)) ||
920 ((skb_size + next_skb_size) > mss_now))
923 BUG_ON(tcp_skb_pcount(skb) != 1 ||
924 tcp_skb_pcount(next_skb) != 1);
926 /* Ok. We will be able to collapse the packet. */
927 __skb_unlink(next_skb, next_skb->list);
929 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
931 if (next_skb->ip_summed == CHECKSUM_HW)
932 skb->ip_summed = CHECKSUM_HW;
934 if (skb->ip_summed != CHECKSUM_HW)
935 skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
937 /* Update sequence range on original skb. */
938 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
940 /* Merge over control information. */
941 flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
942 TCP_SKB_CB(skb)->flags = flags;
944 /* All done, get rid of second SKB and account for it so
945 * packet counting does not break.
947 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
948 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
949 tcp_dec_pcount(&tp->retrans_out, next_skb);
950 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
951 tcp_dec_pcount(&tp->lost_out, next_skb);
952 tcp_dec_pcount(&tp->left_out, next_skb);
954 /* Reno case is special. Sigh... */
955 if (!tp->sack_ok && tcp_get_pcount(&tp->sacked_out)) {
956 tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
957 tcp_dec_pcount(&tp->left_out, next_skb);
960 /* Not quite right: it can be > snd.fack, but
961 * it is better to underestimate fackets.
963 tcp_dec_pcount_approx(&tp->fackets_out, next_skb);
964 tcp_packets_out_dec(tp, next_skb);
965 sk_stream_free_skb(sk, next_skb);
969 /* Do a simple retransmit without using the backoff mechanisms in
970 * tcp_timer. This is used for path mtu discovery.
971 * The socket is already locked here.
973 void tcp_simple_retransmit(struct sock *sk)
975 struct tcp_opt *tp = tcp_sk(sk);
977 unsigned int mss = tcp_current_mss(sk, 0);
980 sk_stream_for_retrans_queue(skb, sk) {
981 if (skb->len > mss &&
982 !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
983 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
984 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
985 tcp_dec_pcount(&tp->retrans_out, skb);
987 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
988 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
989 tcp_inc_pcount(&tp->lost_out, skb);
998 tcp_sync_left_out(tp);
1000 /* Don't muck with the congestion window here.
1001 * The reason is that we do not increase the amount of _data_
1002 * in the network, but the units have changed and the
1003 * effective cwnd/ssthresh really are reduced now.
1005 if (tp->ca_state != TCP_CA_Loss) {
1006 tp->high_seq = tp->snd_nxt;
1007 tp->snd_ssthresh = tcp_current_ssthresh(tp);
1008 tp->prior_ssthresh = 0;
1009 tp->undo_marker = 0;
1010 tcp_set_ca_state(tp, TCP_CA_Loss);
1012 tcp_xmit_retransmit_queue(sk);
1015 /* This retransmits one SKB. Policy decisions and retransmit queue
1016 * state updates are done by the caller. Returns non-zero if an
1017 * error occurred which prevented the send.
1019 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1021 struct tcp_opt *tp = tcp_sk(sk);
1022 unsigned int cur_mss = tcp_current_mss(sk, 0);
1025 /* Do not send more than we queued. 1/4 is reserved for possible
1026 * copying overhead: fragmentation, tunneling, mangling etc.
1028 if (atomic_read(&sk->sk_wmem_alloc) >
1029 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
1032 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
1033 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1036 if (sk->sk_route_caps & NETIF_F_TSO) {
1037 sk->sk_route_caps &= ~NETIF_F_TSO;
1038 sk->sk_no_largesend = 1;
1039 tp->mss_cache = tp->mss_cache_std;
1042 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
1046 /* If the receiver has shrunk its window, and skb is out of
1047 * the new window, do not retransmit it. The exception is the
1048 * case when the window is shrunk to zero; in that case
1049 * our retransmit serves as a zero window probe.
1051 if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
1052 && TCP_SKB_CB(skb)->seq != tp->snd_una)
1055 if (skb->len > cur_mss) {
1056 int old_factor = tcp_skb_pcount(skb);
1059 if (tcp_fragment(sk, skb, cur_mss))
1060 return -ENOMEM; /* We'll try again later. */
1062 /* New SKB created, account for it. */
1063 new_factor = tcp_skb_pcount(skb);
1064 tcp_dec_pcount_explicit(&tp->packets_out,
1065 old_factor - new_factor);
1066 tcp_inc_pcount(&tp->packets_out, skb->next);
1069 /* Collapse two adjacent packets if worthwhile and we can. */
1070 if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
1071 (skb->len < (cur_mss >> 1)) &&
1072 (skb->next != sk->sk_send_head) &&
1073 (skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
1074 (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
1075 (sysctl_tcp_retrans_collapse != 0))
1076 tcp_retrans_try_collapse(sk, skb, cur_mss);
1078 if(tp->af_specific->rebuild_header(sk))
1079 return -EHOSTUNREACH; /* Routing failure or similar. */
1081 /* Some Solaris stacks overoptimize and ignore the FIN on a
1082 * retransmit when old data is attached. So strip it off
1083 * since it is cheap to do so and saves bytes on the network.
1086 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
1087 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
1088 if (!pskb_trim(skb, 0)) {
1089 TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
1090 skb_shinfo(skb)->tso_segs = 1;
1091 skb_shinfo(skb)->tso_size = 0;
1092 skb->ip_summed = CHECKSUM_NONE;
1097 /* Make a copy, if the first transmission SKB clone we made
1098 * is still in somebody's hands, else make a clone.
1100 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1102 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1103 pskb_copy(skb, GFP_ATOMIC):
1104 skb_clone(skb, GFP_ATOMIC)));
1107 /* Update global TCP statistics. */
1108 TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
1110 #if FASTRETRANS_DEBUG > 0
1111 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
1112 if (net_ratelimit())
1113 printk(KERN_DEBUG "retrans_out leaked.\n");
1116 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
1117 tcp_inc_pcount(&tp->retrans_out, skb);
1119 /* Save stamp of the first retransmit. */
1120 if (!tp->retrans_stamp)
1121 tp->retrans_stamp = TCP_SKB_CB(skb)->when;
1125 /* snd_nxt is stored to detect loss of retransmitted segment,
1126 * see tcp_input.c tcp_sacktag_write_queue().
1128 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
1133 /* This gets called after a retransmit timeout, and the initially
1134 * retransmitted data is acknowledged. It tries to continue
1135 * resending the rest of the retransmit queue, until either
1136 * we've sent it all or the congestion window limit is reached.
1137 * If doing SACK, the first ACK which comes back for a timeout
1138 * based retransmit packet might feed us FACK information again.
1139 * If so, we use it to avoid unnecessary retransmissions.
1141 void tcp_xmit_retransmit_queue(struct sock *sk)
1143 struct tcp_opt *tp = tcp_sk(sk);
1144 struct sk_buff *skb;
1145 int packet_cnt = tcp_get_pcount(&tp->lost_out);
1147 /* First pass: retransmit lost packets. */
1149 sk_stream_for_retrans_queue(skb, sk) {
1150 __u8 sacked = TCP_SKB_CB(skb)->sacked;
1152 /* Assume this retransmit will generate
1153 * only one packet for congestion window
1154 * calculation purposes. This works because
1155 * tcp_retransmit_skb() will chop up the
1156 * packet to be MSS sized and all the
1157 * packet counting works out.
1159 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1162 if (sacked&TCPCB_LOST) {
1163 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
1164 if (tcp_retransmit_skb(sk, skb))
1166 if (tp->ca_state != TCP_CA_Loss)
1167 NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
1169 NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
1172 skb_peek(&sk->sk_write_queue))
1173 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1176 packet_cnt -= tcp_skb_pcount(skb);
1177 if (packet_cnt <= 0)
1183 /* OK, demanded retransmission is finished. */
1185 /* Forward retransmissions are possible only during Recovery. */
1186 if (tp->ca_state != TCP_CA_Recovery)
1189 /* No forward retransmissions in Reno are possible. */
1193 /* Yeah, we have to make a difficult choice between forward transmission
1194 * and retransmission... Both ways have their merits...
1196 * For now we do not retransmit anything while we have some new segments to send.
1200 if (tcp_may_send_now(sk, tp))
1205 sk_stream_for_retrans_queue(skb, sk) {
1206 /* Similar to the retransmit loop above we
1207 * can pretend that the retransmitted SKB
1208 * we send out here will be composed of one
1209 * real MSS sized packet because tcp_retransmit_skb()
1210 * will fragment it if necessary.
1212 if (++packet_cnt > tcp_get_pcount(&tp->fackets_out))
1215 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1218 if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
1221 /* Ok, retransmit it. */
1222 if (tcp_retransmit_skb(sk, skb))
1225 if (skb == skb_peek(&sk->sk_write_queue))
1226 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1228 NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
1233 /* Send a FIN. The caller locks the socket for us. This cannot be
1234 * allowed to fail queueing a FIN frame under any circumstances.
1236 void tcp_send_fin(struct sock *sk)
1238 struct tcp_opt *tp = tcp_sk(sk);
1239 struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
1242 /* Optimization, tack on the FIN if we have a queue of
1243 * unsent frames. But be careful about outgoing SACKS and IP options.
1246 mss_now = tcp_current_mss(sk, 1);
1248 if (sk->sk_send_head != NULL) {
1249 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
1250 TCP_SKB_CB(skb)->end_seq++;
1253 /* Socket is locked, keep trying until memory is available. */
1255 skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
1261 /* Reserve space for headers and prepare control bits. */
1262 skb_reserve(skb, MAX_TCP_HEADER);
1264 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
1265 TCP_SKB_CB(skb)->sacked = 0;
1266 skb_shinfo(skb)->tso_segs = 1;
1267 skb_shinfo(skb)->tso_size = 0;
1269 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
1270 TCP_SKB_CB(skb)->seq = tp->write_seq;
1271 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1272 tcp_queue_skb(sk, skb);
1274 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF);
1277 /* We get here when a process closes a file descriptor (either due to
1278 * an explicit close() or as a byproduct of exit()'ing) and there
1279 * was unread data in the receive queue. This behavior is recommended
1280 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1282 void tcp_send_active_reset(struct sock *sk, int priority)
1284 struct tcp_opt *tp = tcp_sk(sk);
1285 struct sk_buff *skb;
1287 /* NOTE: No TCP options attached and we never retransmit this. */
1288 skb = alloc_skb(MAX_TCP_HEADER, priority);
1290 NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
1294 /* Reserve space for headers and prepare control bits. */
1295 skb_reserve(skb, MAX_TCP_HEADER);
1297 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
1298 TCP_SKB_CB(skb)->sacked = 0;
1299 skb_shinfo(skb)->tso_segs = 1;
1300 skb_shinfo(skb)->tso_size = 0;
1303 TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
1304 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1305 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1306 if (tcp_transmit_skb(sk, skb))
1307 NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
1310 /* WARNING: This routine must only be called when we have already sent
1311 * a SYN packet that crossed the incoming SYN that caused this routine
1312 * to get called. If this assumption fails then the initial rcv_wnd
1313 * and rcv_wscale values will not be correct.
1315 int tcp_send_synack(struct sock *sk)
1317 struct sk_buff* skb;
1319 skb = skb_peek(&sk->sk_write_queue);
1320 if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
1321 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
1324 if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
1325 if (skb_cloned(skb)) {
1326 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
1329 __skb_unlink(skb, &sk->sk_write_queue);
1330 __skb_queue_head(&sk->sk_write_queue, nskb);
1331 sk_stream_free_skb(sk, skb);
1332 sk_charge_skb(sk, nskb);
1336 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
1337 TCP_ECN_send_synack(tcp_sk(sk), skb);
1339 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1340 return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1344 * Prepare a SYN-ACK.
1346 struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1347 struct open_request *req)
1349 struct tcp_opt *tp = tcp_sk(sk);
1351 int tcp_header_size;
1352 struct sk_buff *skb;
1354 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
1358 /* Reserve space for headers. */
1359 skb_reserve(skb, MAX_TCP_HEADER);
1361 skb->dst = dst_clone(dst);
1363 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
1364 (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
1365 (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
1366 /* SACK_PERM is in the place of NOP NOP of TS */
1367 ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
1368 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
1370 memset(th, 0, sizeof(struct tcphdr));
1373 if (dst->dev->features&NETIF_F_TSO)
1375 TCP_ECN_make_synack(req, th);
1376 th->source = inet_sk(sk)->sport;
1377 th->dest = req->rmt_port;
1378 TCP_SKB_CB(skb)->seq = req->snt_isn;
1379 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1380 TCP_SKB_CB(skb)->sacked = 0;
1381 skb_shinfo(skb)->tso_segs = 1;
1382 skb_shinfo(skb)->tso_size = 0;
1383 th->seq = htonl(TCP_SKB_CB(skb)->seq);
1384 th->ack_seq = htonl(req->rcv_isn + 1);
1385 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1387 /* Set this up on the first call only */
1388 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
1389 /* tcp_full_space because it is guaranteed to be the first packet */
1390 tcp_select_initial_window(tcp_full_space(sk),
1391 dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
1396 req->rcv_wscale = rcv_wscale;
1399 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1400 th->window = htons(req->rcv_wnd);
1402 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1403 tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
1404 req->sack_ok, req->wscale_ok, req->rcv_wscale,
1405 TCP_SKB_CB(skb)->when,
1409 th->doff = (tcp_header_size >> 2);
1410 TCP_INC_STATS(TCP_MIB_OUTSEGS);
1415 * Do all connect socket setups that can be done AF independent.
1417 static inline void tcp_connect_init(struct sock *sk)
1419 struct dst_entry *dst = __sk_dst_get(sk);
1420 struct tcp_opt *tp = tcp_sk(sk);
1422 /* We'll fix this up when we get a response from the other end.
1423 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
1425 tp->tcp_header_len = sizeof(struct tcphdr) +
1426 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
1428 /* If user gave his TCP_MAXSEG, record it to clamp */
1430 tp->mss_clamp = tp->user_mss;
1432 tcp_sync_mss(sk, dst_pmtu(dst));
1434 if (!tp->window_clamp)
1435 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1436 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1437 tcp_initialize_rcv_mss(sk);
1440 tcp_select_initial_window(tcp_full_space(sk),
1441 tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
1444 sysctl_tcp_window_scaling,
1447 tp->rcv_ssthresh = tp->rcv_wnd;
1450 sock_reset_flag(sk, SOCK_DONE);
1452 tcp_init_wl(tp, tp->write_seq, 0);
1453 tp->snd_una = tp->write_seq;
1454 tp->snd_sml = tp->write_seq;
1459 tp->rto = TCP_TIMEOUT_INIT;
1460 tp->retransmits = 0;
1461 tcp_clear_retrans(tp);
1465 * Build a SYN and send it off.
1467 int tcp_connect(struct sock *sk)
1469 struct tcp_opt *tp = tcp_sk(sk);
1470 struct sk_buff *buff;
1472 tcp_connect_init(sk);
1474 buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
1475 if (unlikely(buff == NULL))
1478 /* Reserve space for headers. */
1479 skb_reserve(buff, MAX_TCP_HEADER);
1481 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
1482 TCP_ECN_send_syn(sk, tp, buff);
1483 TCP_SKB_CB(buff)->sacked = 0;
1484 skb_shinfo(buff)->tso_segs = 1;
1485 skb_shinfo(buff)->tso_size = 0;
1487 TCP_SKB_CB(buff)->seq = tp->write_seq++;
1488 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1489 tp->snd_nxt = tp->write_seq;
1490 tp->pushed_seq = tp->write_seq;
1494 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1495 tp->retrans_stamp = TCP_SKB_CB(buff)->when;
1496 __skb_queue_tail(&sk->sk_write_queue, buff);
1497 sk_charge_skb(sk, buff);
1498 tcp_inc_pcount(&tp->packets_out, buff);
1499 tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
1500 TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
1502 /* Timer for repeating the SYN until an answer. */
1503 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1507 /* Send out a delayed ack, the caller does the policy checking
1508 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
1511 void tcp_send_delayed_ack(struct sock *sk)
1513 struct tcp_opt *tp = tcp_sk(sk);
1514 int ato = tp->ack.ato;
1515 unsigned long timeout;
1517 if (ato > TCP_DELACK_MIN) {
1520 if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
1521 max_ato = TCP_DELACK_MAX;
1523 /* Slow path, intersegment interval is "high". */
1525 /* If some rtt estimate is known, use it to bound delayed ack.
1526 * Do not use tp->rto here; use the results of rtt measurements directly.
1530 int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
1536 ato = min(ato, max_ato);
1539 /* Stay within the limit we were given */
1540 timeout = jiffies + ato;
1542 /* Use new timeout only if there wasn't an older one earlier.
1543 if (tp->ack.pending&TCP_ACK_TIMER) {
1544 /* If delack timer was blocked or is about to expire, send ACK now.
1547 if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
1552 if (!time_before(timeout, tp->ack.timeout))
1553 timeout = tp->ack.timeout;
1555 tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
1556 tp->ack.timeout = timeout;
1557 sk_reset_timer(sk, &tp->delack_timer, timeout);
1560 /* This routine sends an ack and also updates the window. */
1561 void tcp_send_ack(struct sock *sk)
1563 /* If we have been reset, we may not send again. */
1564 if (sk->sk_state != TCP_CLOSE) {
1565 struct tcp_opt *tp = tcp_sk(sk);
1566 struct sk_buff *buff;
1568 /* We are not putting this on the write queue, so
1569 * tcp_transmit_skb() will set the ownership to this sock.
1572 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1574 tcp_schedule_ack(tp);
1575 tp->ack.ato = TCP_ATO_MIN;
1576 tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
1580 /* Reserve space for headers and prepare control bits. */
1581 skb_reserve(buff, MAX_TCP_HEADER);
1583 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
1584 TCP_SKB_CB(buff)->sacked = 0;
1585 skb_shinfo(buff)->tso_segs = 1;
1586 skb_shinfo(buff)->tso_size = 0;
1588 /* Send it off, this clears delayed acks for us. */
1589 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
1590 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1591 tcp_transmit_skb(sk, buff);
1595 /* This routine sends a packet with an out of date sequence
1596 * number. It assumes the other end will try to ack it.
1598 * Question: what should we send while in urgent mode?
1599 * 4.4BSD forces sending single byte of data. We cannot send
1600 * out of window data, because we have SND.NXT==SND.MAX...
1602 * Current solution: to send TWO zero-length segments in urgent mode:
1603 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
1604 * out-of-date with SND.UNA-1 to probe window.
1606 static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
1608 struct tcp_opt *tp = tcp_sk(sk);
1609 struct sk_buff *skb;
1611 /* We don't queue it, tcp_transmit_skb() sets ownership. */
1612 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1616 /* Reserve space for headers and set control bits. */
1617 skb_reserve(skb, MAX_TCP_HEADER);
1619 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
1620 TCP_SKB_CB(skb)->sacked = urgent;
1621 skb_shinfo(skb)->tso_segs = 1;
1622 skb_shinfo(skb)->tso_size = 0;
1624 /* Use a previous sequence. This should cause the other
1625 * end to send an ack. Don't queue or clone the SKB, just send it.
1628 TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
1629 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1630 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1631 return tcp_transmit_skb(sk, skb);
1634 int tcp_write_wakeup(struct sock *sk)
1636 if (sk->sk_state != TCP_CLOSE) {
1637 struct tcp_opt *tp = tcp_sk(sk);
1638 struct sk_buff *skb;
1640 if ((skb = sk->sk_send_head) != NULL &&
1641 before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
1643 unsigned int mss = tcp_current_mss(sk, 0);
1644 unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
1646 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
1647 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
1649 /* We are probing the opening of a window
1650 * but the window size is != 0,
1651 * so this must have been the result of SWS avoidance (sender side).
1653 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
1655 seg_size = min(seg_size, mss);
1656 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1657 if (tcp_fragment(sk, skb, seg_size))
1659 /* SWS override triggered forced fragmentation.
1660 * Disable TSO, the connection is too sick. */
1661 if (sk->sk_route_caps & NETIF_F_TSO) {
1662 sk->sk_no_largesend = 1;
1663 sk->sk_route_caps &= ~NETIF_F_TSO;
1664 tp->mss_cache = tp->mss_cache_std;
1666 } else if (!tcp_skb_pcount(skb))
1667 tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
1669 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1670 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1671 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1673 update_send_head(sk, tp, skb);
1678 between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
1679 tcp_xmit_probe_skb(sk, TCPCB_URG);
1680 return tcp_xmit_probe_skb(sk, 0);
1686 /* A window probe timeout has occurred. If the window is not closed,
1687 * send a partial packet, else send a zero window probe.
1689 void tcp_send_probe0(struct sock *sk)
1691 struct tcp_opt *tp = tcp_sk(sk);
1694 err = tcp_write_wakeup(sk);
1696 if (tcp_get_pcount(&tp->packets_out) || !sk->sk_send_head) {
1697 /* Cancel probe timer, if it is not required. */
1704 if (tp->backoff < sysctl_tcp_retries2)
1707 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1708 min(tp->rto << tp->backoff, TCP_RTO_MAX));
1710 /* If the packet was not sent due to local congestion,
1711 * do not back off and do not remember probes_out.
1712 * Let local senders fight for local resources.
1714 * But use the accumulated backoff.
1716 if (!tp->probes_out)
1718 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1719 min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
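/* Rough illustration of the probe backoff (values assumed, with TCP_RTO_MAX
 * nominally 120 seconds): with rto = 500 ms and backoff = 3, the next zero
 * window probe is scheduled after min(500 ms << 3, TCP_RTO_MAX) = 4 seconds.
 */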
1723 EXPORT_SYMBOL(tcp_connect);
1724 EXPORT_SYMBOL(tcp_make_synack);
1725 EXPORT_SYMBOL(tcp_simple_retransmit);
1726 EXPORT_SYMBOL(tcp_sync_mss);
1727 EXPORT_SYMBOL_GPL(tcp_write_xmit);