2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
24 * Changes: Pedro Roque : Retransmit queue handled by TCP.
25 * : Fragmentation on mtu decrease
26 * : Segment collapse on retransmit
29 * Linus Torvalds : send_delayed_ack
30 * David S. Miller : Charge memory using the right skb
31 * during syn/ack processing.
32 * David S. Miller : Output engine completely rewritten.
33 * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
34 * Cacophonix Gaul : draft-minshall-nagle-01
35 * J Hadi Salim : ECN support
41 #include <linux/compiler.h>
42 #include <linux/module.h>
43 #include <linux/smp_lock.h>
45 /* People can turn this off for buggy TCP's found in printers etc. */
46 int sysctl_tcp_retrans_collapse = 1;
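/* Advance the send head after skb has been transmitted: the next queued
 * skb (if any) becomes the new head, snd_nxt moves past the data just
 * sent, and the retransmit timer is armed when this is the first packet
 * put in flight.
 */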
49 void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
51 sk->sk_send_head = skb->next;
52 if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
53 sk->sk_send_head = NULL;
54 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
55 if (tp->packets_out++ == 0)
56 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
59 /* SND.NXT, if window was not shrunk.
60 * If the window has been shrunk, what should we do? It is not clear at all.
61 * Using SND.UNA we would fail to open the window; SND.NXT is out of window. :-(
62 * Anything in between SND.UNA...SND.UNA+SND.WND may also already be
63 * invalid. OK, let's do this for now:
65 static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
67 if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
70 return tp->snd_una+tp->snd_wnd;
73 /* Calculate mss to advertise in SYN segment.
74 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
76 * 1. It is independent of path mtu.
77 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
78 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
79 * attached devices, because some buggy hosts are confused by large MSSes.
81 * 4. We do not do 3; we advertise the MSS calculated from the first
82 * hop device mtu, but allow it to be raised to ip_rt_min_advmss.
83 * This may be overridden via information stored in routing table.
84 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
85 * probably even Jumbo".
87 static __u16 tcp_advertise_mss(struct sock *sk)
89 struct tcp_opt *tp = tcp_sk(sk);
90 struct dst_entry *dst = __sk_dst_get(sk);
93 if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
94 mss = dst_metric(dst, RTAX_ADVMSS);
101 /* RFC2861. Reset CWND after an idle period longer than RTO to the "restart window".
102 * This is the first part of the cwnd validation mechanism. */
103 static void tcp_cwnd_restart(struct tcp_opt *tp, struct dst_entry *dst)
105 s32 delta = tcp_time_stamp - tp->lsndtime;
106 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
107 u32 cwnd = tp->snd_cwnd;
109 if (tcp_is_vegas(tp))
110 tcp_vegas_enable(tp);
112 tp->snd_ssthresh = tcp_current_ssthresh(tp);
113 restart_cwnd = min(restart_cwnd, cwnd);
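/* Decay cwnd once for each RTO that elapsed while the connection was
 * idle, but never below the restart window (RFC2861 cwnd validation).
 */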
115 while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
117 tp->snd_cwnd = max(cwnd, restart_cwnd);
118 tp->snd_cwnd_stamp = tcp_time_stamp;
119 tp->snd_cwnd_used = 0;
122 static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb, struct sock *sk)
124 u32 now = tcp_time_stamp;
126 if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
127 tcp_cwnd_restart(tp, __sk_dst_get(sk));
131 /* If this packet is a reply sent within ato of the last received
132 * packet, enter pingpong mode.
134 if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
135 tp->ack.pingpong = 1;
138 static __inline__ void tcp_event_ack_sent(struct sock *sk)
140 struct tcp_opt *tp = tcp_sk(sk);
142 tcp_dec_quickack_mode(tp);
143 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
146 /* Choose a new window to advertise, update state in tcp_opt for the
147 * socket, and return the result with RFC1323 scaling applied. The return
148 * value can be stuffed directly into th->window for an outgoing frame.
151 static __inline__ u16 tcp_select_window(struct sock *sk)
153 struct tcp_opt *tp = tcp_sk(sk);
154 u32 cur_win = tcp_receive_window(tp);
155 u32 new_win = __tcp_select_window(sk);
157 /* Never shrink the offered window */
158 if(new_win < cur_win) {
159 /* Danger Will Robinson!
160 * Don't update rcv_wup/rcv_wnd here or else
161 * we will not be able to advertise a zero
162 * window in time. --DaveM
164 * Relax Will Robinson.
168 tp->rcv_wnd = new_win;
169 tp->rcv_wup = tp->rcv_nxt;
171 /* Make sure we do not exceed the maximum possible scaled window. */
175 new_win = min(new_win, MAX_TCP_WINDOW);
177 new_win = min(new_win, (65535U << tp->rcv_wscale));
179 /* RFC1323 scaling applied */
180 new_win >>= tp->rcv_wscale;
182 /* If we advertise zero window, disable fast path. */
190 /* This routine actually transmits TCP packets queued in by
191 * tcp_do_sendmsg(). This is used by both the initial
192 * transmission and possible later retransmissions.
193 * All SKB's seen here are completely headerless. It is our
194 * job to build the TCP header, and pass the packet down to
195 * IP so it can do the same plus pass the packet off to the device.
198 * We are working here with either a clone of the original
199 * SKB, or a fresh unique copy made by the retransmit engine.
201 int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
204 struct inet_opt *inet = inet_sk(sk);
205 struct tcp_opt *tp = tcp_sk(sk);
206 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
207 int tcp_header_size = tp->tcp_header_len;
212 #define SYSCTL_FLAG_TSTAMPS 0x1
213 #define SYSCTL_FLAG_WSCALE 0x2
214 #define SYSCTL_FLAG_SACK 0x4
217 if (tcb->flags & TCPCB_FLAG_SYN) {
218 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
219 if(sysctl_tcp_timestamps) {
220 tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
221 sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
223 if(sysctl_tcp_window_scaling) {
224 tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
225 sysctl_flags |= SYSCTL_FLAG_WSCALE;
227 if(sysctl_tcp_sack) {
228 sysctl_flags |= SYSCTL_FLAG_SACK;
229 if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
230 tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
232 } else if (tp->eff_sacks) {
233 /* A SACK is 2 pad bytes, a 2 byte header, plus
234 * 2 32-bit sequence numbers for each SACK block.
236 tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
237 (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
241 * If the connection is idle and we are restarting,
242 * then we don't want to do any Vegas calculations
243 * until we get fresh RTT samples. So when we
244 * restart, we reset our Vegas state to a clean
245 * slate. After we get acks for this flight of
246 * packets, _then_ we can make Vegas calculations again.
249 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
250 tcp_vegas_enable(tp);
252 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
254 skb_set_owner_w(skb, sk);
256 /* Build TCP header and checksum it. */
257 th->source = inet->sport;
258 th->dest = inet->dport;
259 th->seq = htonl(tcb->seq);
260 th->ack_seq = htonl(tp->rcv_nxt);
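/* Write the 16-bit word that holds the 4-bit data offset (header length
 * in 32-bit words) together with the TCP flag bits in a single store.
 */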
261 *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
262 if (tcb->flags & TCPCB_FLAG_SYN) {
263 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
266 th->window = htons(tp->rcv_wnd);
268 th->window = htons(tcp_select_window(sk));
274 between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
275 th->urg_ptr = htons(tp->snd_up-tcb->seq);
279 if (tcb->flags & TCPCB_FLAG_SYN) {
280 tcp_syn_build_options((__u32 *)(th + 1),
281 tcp_advertise_mss(sk),
282 (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
283 (sysctl_flags & SYSCTL_FLAG_SACK),
284 (sysctl_flags & SYSCTL_FLAG_WSCALE),
289 tcp_build_and_update_options((__u32 *)(th + 1),
292 TCP_ECN_send(sk, tp, skb, tcp_header_size);
294 tp->af_specific->send_check(sk, th, skb->len, skb);
296 if (tcb->flags & TCPCB_FLAG_ACK)
297 tcp_event_ack_sent(sk);
299 if (skb->len != tcp_header_size)
300 tcp_event_data_sent(tp, skb, sk);
302 TCP_INC_STATS(TCP_MIB_OUTSEGS);
304 err = tp->af_specific->queue_xmit(skb, 0);
310 /* NET_XMIT_CN is special. It does not guarantee,
311 * that this packet is lost. It tells that device
312 * is about to start to drop packets or already
313 * drops some packets of the same priority and
314 * invokes us to send less aggressively.
316 return err == NET_XMIT_CN ? 0 : err;
319 #undef SYSCTL_FLAG_TSTAMPS
320 #undef SYSCTL_FLAG_WSCALE
321 #undef SYSCTL_FLAG_SACK
325 /* This routine just queues the buffer.
327 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
328 * otherwise socket can stall.
330 static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
332 struct tcp_opt *tp = tcp_sk(sk);
334 /* Advance write_seq and place onto the write_queue. */
335 tp->write_seq = TCP_SKB_CB(skb)->end_seq;
336 __skb_queue_tail(&sk->sk_write_queue, skb);
337 sk_charge_skb(sk, skb);
339 /* Queue it, remembering where we must start sending. */
340 if (sk->sk_send_head == NULL)
341 sk->sk_send_head = skb;
344 /* Send the _single_ skb sitting at the send head. This function still requires
345 * a true push of pending frames to set up the probe timer etc.
347 void tcp_push_one(struct sock *sk, unsigned cur_mss)
349 struct tcp_opt *tp = tcp_sk(sk);
350 struct sk_buff *skb = sk->sk_send_head;
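/* Transmit only if the segment passes the congestion window, receive
 * window and Nagle checks with an explicit push.
 */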
352 if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) {
353 /* Send it out now. */
354 TCP_SKB_CB(skb)->when = tcp_time_stamp;
355 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
356 sk->sk_send_head = NULL;
357 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
358 if (tp->packets_out++ == 0)
359 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
365 /* Function to create two new TCP segments. Shrinks the given segment
366 * to the specified size and appends a new segment with the rest of the
367 * packet to the list. This won't be called frequently, I hope.
368 * Remember, these are still headerless SKBs at this point.
370 static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
372 struct tcp_opt *tp = tcp_sk(sk);
373 struct sk_buff *buff;
374 int nsize = skb->len - len;
377 if (skb_cloned(skb) &&
378 skb_is_nonlinear(skb) &&
379 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
382 /* Get a new skb... force flag on. */
383 buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
385 return -ENOMEM; /* We'll just try again later. */
386 sk_charge_skb(sk, buff);
388 /* Correct the sequence numbers. */
389 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
390 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
391 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
393 /* PSH and FIN should only be set in the second packet. */
394 flags = TCP_SKB_CB(skb)->flags;
395 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
396 TCP_SKB_CB(buff)->flags = flags;
397 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
398 if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
402 TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
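/* If the data being moved is entirely in the linear area and we are doing
 * software checksums, copy it into the new skb and checksum on the fly;
 * otherwise split the paged data between the two skbs.
 */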
404 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
405 /* Copy and checksum data tail into the new buffer. */
406 buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
411 skb->csum = csum_block_sub(skb->csum, buff->csum, len);
413 skb->ip_summed = CHECKSUM_HW;
414 skb_split(skb, buff, len);
417 buff->ip_summed = skb->ip_summed;
419 /* Looks stupid, but our code really uses the 'when' field of
420 * skbs which it has never sent before. --ANK
422 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
424 /* Link BUFF into the send queue. */
425 __skb_append(skb, buff);
430 /* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
431 * eventually). The difference is that pulled data is not copied, but
432 * immediately discarded.
434 unsigned char * __pskb_trim_head(struct sk_buff *skb, int len)
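/* Walk the paged frags: release pages that fall entirely within the
 * trimmed region and shift what remains towards the head of the list.
 */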
440 for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
441 if (skb_shinfo(skb)->frags[i].size <= eat) {
442 put_page(skb_shinfo(skb)->frags[i].page);
443 eat -= skb_shinfo(skb)->frags[i].size;
445 skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
447 skb_shinfo(skb)->frags[k].page_offset += eat;
448 skb_shinfo(skb)->frags[k].size -= eat;
454 skb_shinfo(skb)->nr_frags = k;
456 skb->tail = skb->data;
457 skb->data_len -= len;
458 skb->len = skb->data_len;
462 static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
464 if (skb_cloned(skb) &&
465 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
468 if (len <= skb_headlen(skb)) {
469 __skb_pull(skb, len);
471 if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
475 TCP_SKB_CB(skb)->seq += len;
476 skb->ip_summed = CHECKSUM_HW;
480 /* This function synchronize snd mss to current pmtu/exthdr set.
482 tp->user_mss is the mss set by the user via TCP_MAXSEG. It does NOT account
483 for TCP options; it includes only the bare TCP header.
485 tp->mss_clamp is the mss negotiated at connection setup.
486 It is the minimum of user_mss and the mss received with the SYN.
487 It also does not include TCP options.
489 tp->pmtu_cookie is the last pmtu seen by this function.
491 tp->mss_cache is the current effective sending mss, including
492 all tcp options except for SACKs. It is evaluated
493 taking into account the current pmtu, but never exceeds tp->mss_clamp.
496 NOTE1. rfc1122 clearly states that advertised MSS
497 DOES NOT include either tcp or ip options.
499 NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
500 this function. --ANK (980731)
503 int tcp_sync_mss(struct sock *sk, u32 pmtu)
505 struct tcp_opt *tp = tcp_sk(sk);
506 struct dst_entry *dst = __sk_dst_get(sk);
509 if (dst && dst->ops->get_mss)
510 pmtu = dst->ops->get_mss(dst, pmtu);
512 /* Calculate base mss without TCP options:
513 It is MMS_S - sizeof(tcphdr) of rfc1122
515 mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
517 /* Clamp it (mss_clamp does not include tcp options) */
518 if (mss_now > tp->mss_clamp)
519 mss_now = tp->mss_clamp;
521 /* Now subtract optional transport overhead */
522 mss_now -= tp->ext_header_len + tp->ext2_header_len;
524 /* Then reserve room for full set of TCP options and 8 bytes of data */
528 /* Now subtract TCP options size, not including SACKs */
529 mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
531 /* Bound mss with half of window */
532 if (tp->max_window && mss_now > (tp->max_window>>1))
533 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
535 /* And store cached results */
536 tp->pmtu_cookie = pmtu;
537 tp->mss_cache = tp->mss_cache_std = mss_now;
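/* With TSO the device can send almost 64KB at once; cache a "large" mss
 * that fits the 64KB limit and is kept an exact multiple of the real mss.
 */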
539 if (sk->sk_route_caps & NETIF_F_TSO) {
542 large_mss = 65535 - tp->af_specific->net_header_len -
543 tp->ext_header_len - tp->ext2_header_len - tp->tcp_header_len;
545 if (tp->max_window && large_mss > (tp->max_window>>1))
546 large_mss = max((tp->max_window>>1), 68U - tp->tcp_header_len);
548 /* Always keep large mss multiple of real mss. */
549 tp->mss_cache = mss_now*(large_mss/mss_now);
556 /* This routine writes packets to the network. It advances the
557 * send_head. This happens as incoming acks open up the remote window for us.
560 * Returns 1, if no segments are in flight and we have queued segments, but
561 * cannot send anything now because of SWS or another problem.
563 int tcp_write_xmit(struct sock *sk, int nonagle)
565 struct tcp_opt *tp = tcp_sk(sk);
566 unsigned int mss_now;
568 /* If we are closed, the bytes will have to remain here.
569 * In time closedown will finish, we empty the write queue and all will be happy.
572 if (sk->sk_state != TCP_CLOSE) {
576 /* Account for SACKs; we may need to fragment due to this.
577 * It is just like the real MSS changing on us midstream.
578 * We also handle things correctly when the user adds some
579 * IP options mid-stream. Silly to do, but cover it.
581 mss_now = tcp_current_mss(sk, 1);
583 while ((skb = sk->sk_send_head) &&
584 tcp_snd_test(tp, skb, mss_now,
585 tcp_skb_is_last(sk, skb) ? nonagle :
587 if (skb->len > mss_now) {
588 if (tcp_fragment(sk, skb, mss_now))
592 TCP_SKB_CB(skb)->when = tcp_time_stamp;
593 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
595 /* Advance the send_head. This one is sent out. */
596 update_send_head(sk, tp, skb);
597 tcp_minshall_update(tp, mss_now, skb);
602 tcp_cwnd_validate(sk, tp);
606 return !tp->packets_out && sk->sk_send_head;
611 /* This function returns the amount that we can raise the
612 * usable window based on the following constraints
614 * 1. The window can never be shrunk once it is offered (RFC 793)
615 * 2. We limit memory per socket
618 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
619 * RECV.NEXT + RCV.WIN fixed until:
620 * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
622 * i.e. don't raise the right edge of the window until you can raise
623 * it at least MSS bytes.
625 * Unfortunately, the recommended algorithm breaks header prediction,
626 * since header prediction assumes th->window stays fixed.
628 * Strictly speaking, keeping th->window fixed violates the receiver
629 * side SWS prevention criteria. The problem is that under this rule
630 * a stream of single byte packets will cause the right side of the
631 * window to always advance by a single byte.
633 * Of course, if the sender implements sender side SWS prevention
634 * then this will not be a problem.
636 * BSD seems to make the following compromise:
638 * If the free space is less than 1/4 of the maximum
639 * space available and the free space is less than 1/2 mss,
640 * then set the window to 0.
641 * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
642 * Otherwise, just prevent the window from shrinking
643 * and from being larger than the largest representable value.
645 * This prevents incremental opening of the window in the regime
646 * where TCP is limited by the speed of the reader side taking
647 * data out of the TCP receive queue. It does nothing about
648 * those cases where the window is constrained on the sender side
649 * because the pipeline is full.
651 * BSD also seems to "accidentally" limit itself to windows that are a
652 * multiple of MSS, at least until the free space gets quite small.
653 * This would appear to be a side effect of the mbuf implementation.
654 * Combining these two algorithms results in the observed behavior
655 * of having a fixed window size at almost all times.
657 * Below we obtain similar behavior by forcing the offered window to
658 * a multiple of the mss when it is feasible to do so.
660 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
661 * Regular options like TIMESTAMP are taken into account.
663 u32 __tcp_select_window(struct sock *sk)
665 struct tcp_opt *tp = tcp_sk(sk);
666 /* MSS for the peer's data. Previous versions used mss_clamp
667 * here. I don't know if the value based on our guesses
668 * of the peer's MSS is better for performance. It's more correct
669 * but may be worse for performance because of rcv_mss
670 * fluctuations. --SAW 1998/11/1
672 int mss = tp->ack.rcv_mss;
673 int free_space = tcp_space(sk);
674 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
677 if (mss > full_space)
680 if (free_space < full_space/2) {
683 if (tcp_memory_pressure)
684 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
686 if (free_space < mss)
690 if (free_space > tp->rcv_ssthresh)
691 free_space = tp->rcv_ssthresh;
693 /* Don't do rounding if we are using window scaling, since the
694 * scaled window will not line up with the MSS boundary anyway.
696 window = tp->rcv_wnd;
697 if (tp->rcv_wscale) {
700 /* Advertise enough space so that it won't get scaled away.
701 * Important case: prevent a zero window announcement if
702 * 1<<rcv_wscale > mss.
704 if (((window >> tp->rcv_wscale) << tp->rcv_wscale) != window)
705 window = (((window >> tp->rcv_wscale) + 1)
708 /* Get the largest window that is a nice multiple of mss.
709 * Window clamp already applied above.
710 * If our current window offering is within 1 mss of the
711 * free space we just keep it. This prevents the divide
712 * and multiply from happening most of the time.
713 * We also don't do any window rounding when the free space is too small.
716 if (window <= free_space - mss || window > free_space)
717 window = (free_space/mss)*mss;
723 /* Attempt to collapse two adjacent SKB's during retransmission. */
724 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
726 struct tcp_opt *tp = tcp_sk(sk);
727 struct sk_buff *next_skb = skb->next;
729 /* The first test we must make is that neither of these two
730 * SKB's are still referenced by someone else.
732 if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
733 int skb_size = skb->len, next_skb_size = next_skb->len;
734 u16 flags = TCP_SKB_CB(skb)->flags;
736 /* Also punt if next skb has been SACK'd. */
737 if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
740 /* Next skb is out of window. */
741 if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
744 /* Punt if not enough space exists in the first SKB for
745 * the data in the second, or the total combined payload
746 * would exceed the MSS.
748 if ((next_skb_size > skb_tailroom(skb)) ||
749 ((skb_size + next_skb_size) > mss_now))
752 /* Ok. We will be able to collapse the packet. */
753 __skb_unlink(next_skb, next_skb->list);
755 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
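/* Merge checksum state: if either skb relied on hardware checksumming the
 * combined skb does too, otherwise fold the software checksums together.
 */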
757 if (next_skb->ip_summed == CHECKSUM_HW)
758 skb->ip_summed = CHECKSUM_HW;
760 if (skb->ip_summed != CHECKSUM_HW)
761 skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
763 /* Update sequence range on original skb. */
764 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
766 /* Merge over control information. */
767 flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
768 TCP_SKB_CB(skb)->flags = flags;
770 /* All done, get rid of second SKB and account for it so
771 * packet counting does not break.
773 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
774 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
776 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
780 /* Reno case is special. Sigh... */
781 if (!tp->sack_ok && tp->sacked_out) {
786 /* Not quite right: it can be > snd.fack, but
787 * it is better to underestimate fackets.
791 sk_stream_free_skb(sk, next_skb);
796 /* Do a simple retransmit without using the backoff mechanisms in
797 * tcp_timer. This is used for path mtu discovery.
798 * The socket is already locked here.
800 void tcp_simple_retransmit(struct sock *sk)
802 struct tcp_opt *tp = tcp_sk(sk);
804 unsigned int mss = tcp_current_mss(sk, 0);
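/* Mark every queued segment that no longer fits the new mss (and has not
 * been SACKed) as lost, so it will be retransmitted in smaller pieces.
 */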
807 sk_stream_for_retrans_queue(skb, sk) {
808 if (skb->len > mss &&
809 !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
810 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
811 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
814 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
815 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
825 tcp_sync_left_out(tp);
827 /* Don't muck with the congestion window here.
828 * The reason is that we do not increase the amount of _data_
829 * in the network, but the units changed and the effective
830 * cwnd/ssthresh are really reduced now.
832 if (tp->ca_state != TCP_CA_Loss) {
833 tp->high_seq = tp->snd_nxt;
834 tp->snd_ssthresh = tcp_current_ssthresh(tp);
835 tp->prior_ssthresh = 0;
837 tcp_set_ca_state(tp, TCP_CA_Loss);
839 tcp_xmit_retransmit_queue(sk);
842 /* This retransmits one SKB. Policy decisions and retransmit queue
843 * state updates are done by the caller. Returns non-zero if an
844 * error occurred which prevented the send.
846 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
848 struct tcp_opt *tp = tcp_sk(sk);
849 unsigned int cur_mss = tcp_current_mss(sk, 0);
852 /* Do not send more than we have queued. 1/4 is reserved for possible
853 * copying overhead: fragmentation, tunneling, mangling etc.
855 if (atomic_read(&sk->sk_wmem_alloc) >
856 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
859 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
860 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
863 if (sk->sk_route_caps & NETIF_F_TSO) {
864 sk->sk_route_caps &= ~NETIF_F_TSO;
865 sk->sk_no_largesend = 1;
866 tp->mss_cache = tp->mss_cache_std;
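/* Trim off data that has already been acknowledged so that only the
 * unacked bytes are retransmitted.
 */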
869 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
873 /* If the receiver has shrunk its window, and the skb is outside the
874 * new window, do not retransmit it. The exception is the
875 * case when the window is shrunk to zero, in which case
876 * our retransmit serves as a zero window probe.
878 if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
879 && TCP_SKB_CB(skb)->seq != tp->snd_una)
882 if(skb->len > cur_mss) {
883 if(tcp_fragment(sk, skb, cur_mss))
884 return -ENOMEM; /* We'll try again later. */
886 /* New SKB created, account for it. */
890 /* Collapse two adjacent packets if worthwhile and we can. */
891 if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
892 (skb->len < (cur_mss >> 1)) &&
893 (skb->next != sk->sk_send_head) &&
894 (skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
895 (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
896 (sysctl_tcp_retrans_collapse != 0))
897 tcp_retrans_try_collapse(sk, skb, cur_mss);
899 if(tp->af_specific->rebuild_header(sk))
900 return -EHOSTUNREACH; /* Routing failure or similar. */
902 /* Some Solaris stacks overoptimize and ignore the FIN on a
903 * retransmit when old data is attached. So strip it off
904 * since it is cheap to do so and saves bytes on the network.
907 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
908 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
909 if (!pskb_trim(skb, 0)) {
910 TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
911 skb->ip_summed = CHECKSUM_NONE;
916 /* Make a copy, if the first transmission SKB clone we made
917 * is still in somebody's hands, else make a clone.
919 TCP_SKB_CB(skb)->when = tcp_time_stamp;
921 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
922 pskb_copy(skb, GFP_ATOMIC):
923 skb_clone(skb, GFP_ATOMIC)));
926 /* Update global TCP statistics. */
927 TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
929 #if FASTRETRANS_DEBUG > 0
930 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
932 printk(KERN_DEBUG "retrans_out leaked.\n");
935 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
938 /* Save stamp of the first retransmit. */
939 if (!tp->retrans_stamp)
940 tp->retrans_stamp = TCP_SKB_CB(skb)->when;
944 /* snd_nxt is stored to detect loss of retransmitted segment,
945 * see tcp_input.c tcp_sacktag_write_queue().
947 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
952 /* This gets called after a retransmit timeout, and the initially
953 * retransmitted data is acknowledged. It tries to continue
954 * resending the rest of the retransmit queue, until either
955 * we've sent it all or the congestion window limit is reached.
956 * If doing SACK, the first ACK which comes back for a timeout
957 * based retransmit packet might feed us FACK information again.
958 * If so, we use it to avoid unnecessary retransmissions.
960 void tcp_xmit_retransmit_queue(struct sock *sk)
962 struct tcp_opt *tp = tcp_sk(sk);
964 int packet_cnt = tp->lost_out;
966 /* First pass: retransmit lost packets. */
968 sk_stream_for_retrans_queue(skb, sk) {
969 __u8 sacked = TCP_SKB_CB(skb)->sacked;
971 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
974 if (sacked&TCPCB_LOST) {
975 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
976 if (tcp_retransmit_skb(sk, skb))
978 if (tp->ca_state != TCP_CA_Loss)
979 NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
981 NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
984 skb_peek(&sk->sk_write_queue))
985 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
988 if (--packet_cnt <= 0)
994 /* OK, demanded retransmission is finished. */
996 /* Forward retransmissions are possible only during Recovery. */
997 if (tp->ca_state != TCP_CA_Recovery)
1000 /* No forward retransmissions in Reno are possible. */
1004 /* Yeah, we have to make a difficult choice between forward transmission
1005 * and retransmission... Both ways have their merits...
1007 * For now we do not retransmit anything while we have some new segments to send.
1011 if (tcp_may_send_now(sk, tp))
1016 sk_stream_for_retrans_queue(skb, sk) {
1017 if(++packet_cnt > tp->fackets_out)
1020 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1023 if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
1026 /* Ok, retransmit it. */
1027 if(tcp_retransmit_skb(sk, skb))
1030 if (skb == skb_peek(&sk->sk_write_queue))
1031 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1033 NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
1038 /* Send a fin. The caller locks the socket for us. This cannot be
1039 * allowed to fail queueing a FIN frame under any circumstances.
1041 void tcp_send_fin(struct sock *sk)
1043 struct tcp_opt *tp = tcp_sk(sk);
1044 struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
1045 unsigned int mss_now;
1047 /* Optimization, tack on the FIN if we have a queue of
1048 * unsent frames. But be careful about outgoing SACKs and IP options.
1051 mss_now = tcp_current_mss(sk, 1);
1053 if (sk->sk_send_head != NULL) {
1054 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
1055 TCP_SKB_CB(skb)->end_seq++;
1058 /* Socket is locked, keep trying until memory is available. */
1060 skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
1066 /* Reserve space for headers and prepare control bits. */
1067 skb_reserve(skb, MAX_TCP_HEADER);
1069 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
1070 TCP_SKB_CB(skb)->sacked = 0;
1072 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
1073 TCP_SKB_CB(skb)->seq = tp->write_seq;
1074 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1075 tcp_queue_skb(sk, skb);
1077 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF);
1080 /* We get here when a process closes a file descriptor (either due to
1081 * an explicit close() or as a byproduct of exit()'ing) and there
1082 * was unread data in the receive queue. This behavior is recommended
1083 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1085 void tcp_send_active_reset(struct sock *sk, int priority)
1087 struct tcp_opt *tp = tcp_sk(sk);
1088 struct sk_buff *skb;
1090 /* NOTE: No TCP options attached and we never retransmit this. */
1091 skb = alloc_skb(MAX_TCP_HEADER, priority);
1093 NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
1097 /* Reserve space for headers and prepare control bits. */
1098 skb_reserve(skb, MAX_TCP_HEADER);
1100 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
1101 TCP_SKB_CB(skb)->sacked = 0;
1104 TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
1105 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1106 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1107 if (tcp_transmit_skb(sk, skb))
1108 NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
1111 /* WARNING: This routine must only be called when we have already sent
1112 * a SYN packet that crossed the incoming SYN that caused this routine
1113 * to get called. If this assumption fails then the initial rcv_wnd
1114 * and rcv_wscale values will not be correct.
1116 int tcp_send_synack(struct sock *sk)
1118 struct sk_buff* skb;
1120 skb = skb_peek(&sk->sk_write_queue);
1121 if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
1122 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
1125 if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
1126 if (skb_cloned(skb)) {
1127 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
1130 __skb_unlink(skb, &sk->sk_write_queue);
1131 __skb_queue_head(&sk->sk_write_queue, nskb);
1132 sk_stream_free_skb(sk, skb);
1133 sk_charge_skb(sk, nskb);
1137 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
1138 TCP_ECN_send_synack(tcp_sk(sk), skb);
1140 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1141 return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1145 * Prepare a SYN-ACK.
1147 struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1148 struct open_request *req)
1150 struct tcp_opt *tp = tcp_sk(sk);
1152 int tcp_header_size;
1153 struct sk_buff *skb;
1155 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
1159 /* Reserve space for headers. */
1160 skb_reserve(skb, MAX_TCP_HEADER);
1162 skb->dst = dst_clone(dst);
1164 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
1165 (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
1166 (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
1167 /* SACK_PERM is in the place of NOP NOP of TS */
1168 ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
1169 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
1171 memset(th, 0, sizeof(struct tcphdr));
1174 if (dst->dev->features&NETIF_F_TSO)
1176 TCP_ECN_make_synack(req, th);
1177 th->source = inet_sk(sk)->sport;
1178 th->dest = req->rmt_port;
1179 TCP_SKB_CB(skb)->seq = req->snt_isn;
1180 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1181 th->seq = htonl(TCP_SKB_CB(skb)->seq);
1182 th->ack_seq = htonl(req->rcv_isn + 1);
1183 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1185 /* Set this up on the first call only */
1186 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
1187 /* tcp_full_space because it is guaranteed to be the first packet */
1188 tcp_select_initial_window(tcp_full_space(sk),
1189 dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
1194 req->rcv_wscale = rcv_wscale;
1197 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1198 th->window = htons(req->rcv_wnd);
1200 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1201 tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
1202 req->sack_ok, req->wscale_ok, req->rcv_wscale,
1203 TCP_SKB_CB(skb)->when,
1207 th->doff = (tcp_header_size >> 2);
1208 TCP_INC_STATS(TCP_MIB_OUTSEGS);
1213 * Do all connect socket setups that can be done AF independent.
1215 static inline void tcp_connect_init(struct sock *sk)
1217 struct dst_entry *dst = __sk_dst_get(sk);
1218 struct tcp_opt *tp = tcp_sk(sk);
1220 /* We'll fix this up when we get a response from the other end.
1221 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
1223 tp->tcp_header_len = sizeof(struct tcphdr) +
1224 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
1226 /* If user gave his TCP_MAXSEG, record it to clamp */
1228 tp->mss_clamp = tp->user_mss;
1230 tcp_sync_mss(sk, dst_pmtu(dst));
1232 if (!tp->window_clamp)
1233 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1234 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1235 tcp_initialize_rcv_mss(sk);
1238 tcp_select_initial_window(tcp_full_space(sk),
1239 tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
1242 sysctl_tcp_window_scaling,
1245 tp->rcv_ssthresh = tp->rcv_wnd;
1248 sock_reset_flag(sk, SOCK_DONE);
1250 tcp_init_wl(tp, tp->write_seq, 0);
1251 tp->snd_una = tp->write_seq;
1252 tp->snd_sml = tp->write_seq;
1257 tp->rto = TCP_TIMEOUT_INIT;
1258 tp->retransmits = 0;
1259 tcp_clear_retrans(tp);
1263 * Build a SYN and send it off.
1265 int tcp_connect(struct sock *sk)
1267 struct tcp_opt *tp = tcp_sk(sk);
1268 struct sk_buff *buff;
1270 tcp_connect_init(sk);
1272 buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
1273 if (unlikely(buff == NULL))
1276 /* Reserve space for headers. */
1277 skb_reserve(buff, MAX_TCP_HEADER);
1279 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
1280 TCP_ECN_send_syn(sk, tp, buff);
1281 TCP_SKB_CB(buff)->sacked = 0;
1283 TCP_SKB_CB(buff)->seq = tp->write_seq++;
1284 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1285 tp->snd_nxt = tp->write_seq;
1286 tp->pushed_seq = tp->write_seq;
1290 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1291 tp->retrans_stamp = TCP_SKB_CB(buff)->when;
1292 __skb_queue_tail(&sk->sk_write_queue, buff);
1293 sk_charge_skb(sk, buff);
1295 tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
1296 TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
1298 /* Timer for repeating the SYN until an answer. */
1299 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1303 /* Send out a delayed ack, the caller does the policy checking
1304 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
1307 void tcp_send_delayed_ack(struct sock *sk)
1309 struct tcp_opt *tp = tcp_sk(sk);
1310 int ato = tp->ack.ato;
1311 unsigned long timeout;
1313 if (ato > TCP_DELACK_MIN) {
1316 if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
1317 max_ato = TCP_DELACK_MAX;
1319 /* Slow path, intersegment interval is "high". */
1321 /* If some rtt estimate is known, use it to bound delayed ack.
1322 * Do not use tp->rto here; use the results of rtt measurements directly.
1326 int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
1332 ato = min(ato, max_ato);
1335 /* Stay within the limit we were given */
1336 timeout = jiffies + ato;
1338 /* Use the new timeout only if there wasn't an older one earlier. */
1339 if (tp->ack.pending&TCP_ACK_TIMER) {
1340 /* If the delack timer was blocked or is about to expire, send the ACK now. */
1343 if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
1348 if (!time_before(timeout, tp->ack.timeout))
1349 timeout = tp->ack.timeout;
1351 tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
1352 tp->ack.timeout = timeout;
1353 sk_reset_timer(sk, &tp->delack_timer, timeout);
1356 /* This routine sends an ack and also updates the window. */
1357 void tcp_send_ack(struct sock *sk)
1359 /* If we have been reset, we may not send again. */
1360 if (sk->sk_state != TCP_CLOSE) {
1361 struct tcp_opt *tp = tcp_sk(sk);
1362 struct sk_buff *buff;
1364 /* We are not putting this on the write queue, so
1365 * tcp_transmit_skb() will set the ownership to this sock.
1368 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1370 tcp_schedule_ack(tp);
1371 tp->ack.ato = TCP_ATO_MIN;
1372 tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
1376 /* Reserve space for headers and prepare control bits. */
1377 skb_reserve(buff, MAX_TCP_HEADER);
1379 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
1380 TCP_SKB_CB(buff)->sacked = 0;
1382 /* Send it off, this clears delayed acks for us. */
1383 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
1384 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1385 tcp_transmit_skb(sk, buff);
1389 /* This routine sends a packet with an out of date sequence
1390 * number. It assumes the other end will try to ack it.
1392 * Question: what should we do while in urgent mode?
1393 * 4.4BSD forces sending a single byte of data. We cannot send
1394 * out of window data, because we have SND.NXT==SND.MAX...
1396 * Current solution: to send TWO zero-length segments in urgent mode:
1397 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
1398 * out-of-date with SND.UNA-1 to probe window.
1400 static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
1402 struct tcp_opt *tp = tcp_sk(sk);
1403 struct sk_buff *skb;
1405 /* We don't queue it, tcp_transmit_skb() sets ownership. */
1406 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1410 /* Reserve space for headers and set control bits. */
1411 skb_reserve(skb, MAX_TCP_HEADER);
1413 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
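/* This probe skb never enters the write queue, so 'sacked' carries no
 * SACK state here; it simply records whether this is the urgent-mode probe.
 */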
1414 TCP_SKB_CB(skb)->sacked = urgent;
1416 /* Use a previous sequence. This should cause the other
1417 * end to send an ack. Don't queue or clone the SKB, just send it.
1420 TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
1421 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1422 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1423 return tcp_transmit_skb(sk, skb);
1426 int tcp_write_wakeup(struct sock *sk)
1428 if (sk->sk_state != TCP_CLOSE) {
1429 struct tcp_opt *tp = tcp_sk(sk);
1430 struct sk_buff *skb;
1432 if ((skb = sk->sk_send_head) != NULL &&
1433 before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
1435 int mss = tcp_current_mss(sk, 0);
1436 int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
1438 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
1439 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
1441 /* We are probing the opening of a window
1442 * but the window size is != 0; it
1443 * must have been a result of sender-side SWS avoidance.
1445 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
1447 seg_size = min(seg_size, mss);
1448 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1449 if (tcp_fragment(sk, skb, seg_size))
1451 /* SWS override triggered forced fragmentation.
1452 * Disable TSO, the connection is too sick. */
1453 if (sk->sk_route_caps & NETIF_F_TSO) {
1454 sk->sk_no_largesend = 1;
1455 sk->sk_route_caps &= ~NETIF_F_TSO;
1456 tp->mss_cache = tp->mss_cache_std;
1459 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1460 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1461 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1463 update_send_head(sk, tp, skb);
1468 between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
1469 tcp_xmit_probe_skb(sk, TCPCB_URG);
1470 return tcp_xmit_probe_skb(sk, 0);
1476 /* A window probe timeout has occurred. If the window is not closed, send
1477 * a partial packet, else send a zero window probe.
1479 void tcp_send_probe0(struct sock *sk)
1481 struct tcp_opt *tp = tcp_sk(sk);
1484 err = tcp_write_wakeup(sk);
1486 if (tp->packets_out || !sk->sk_send_head) {
1487 /* Cancel probe timer, if it is not required. */
1494 if (tp->backoff < sysctl_tcp_retries2)
1497 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1498 min(tp->rto << tp->backoff, TCP_RTO_MAX));
1500 /* If the packet was not sent due to local congestion,
1501 * do not back off and do not remember probes_out.
1502 * Let local senders fight for local resources.
1504 * Use the accumulated backoff, though.
1506 if (!tp->probes_out)
1508 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1509 min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
1513 EXPORT_SYMBOL(tcp_acceptable_seq);
1514 EXPORT_SYMBOL(tcp_connect);
1515 EXPORT_SYMBOL(tcp_connect_init);
1516 EXPORT_SYMBOL(tcp_make_synack);
1517 EXPORT_SYMBOL(tcp_send_synack);
1518 EXPORT_SYMBOL(tcp_simple_retransmit);
1519 EXPORT_SYMBOL(tcp_sync_mss);
1520 EXPORT_SYMBOL(tcp_transmit_skb);
1521 EXPORT_SYMBOL(tcp_write_wakeup);
1522 EXPORT_SYMBOL(tcp_write_xmit);