/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */
/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul :	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 */
#include <net/tcp.h>

#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/smp_lock.h>
/* People can turn this off for buggy TCP stacks found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
static __inline__
void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
{
	sk->sk_send_head = skb->next;
	if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
		sk->sk_send_head = NULL;
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
	if (tp->packets_out++ == 0)
		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}
/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we send? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
{
	if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
		return tp->snd_nxt;
	else
		return tp->snd_una+tp->snd_wnd;
}
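
/* Worked example (illustrative values, not from the original source):
 * with snd_una = 100, snd_wnd = 50 and snd_nxt = 130, SND.NXT lies
 * inside the offered window (130 <= 150), so 130 is acceptable. Had the
 * peer shrunk the window so that snd_una+snd_wnd = 120 < snd_nxt, we
 * fall back to the right window edge, 120.
 */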
/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not follow 3; we advertise an MSS calculated from the first
 *    hop device mtu, but allow it to be raised to ip_rt_min_advmss.
 *    This may be overridden via information stored in routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	int mss = tp->advmss;

	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
		mss = dst_metric(dst, RTAX_ADVMSS);
		tp->advmss = mss;
	}

	return (__u16)mss;
}
/* RFC2861. Reset CWND after idle period longer than RTO to "restart window".
 * This is the first part of cwnd validation mechanism. */
static void tcp_cwnd_restart(struct tcp_opt *tp, struct dst_entry *dst)
{
	s32 delta = tcp_time_stamp - tp->lsndtime;
	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
	u32 cwnd = tp->snd_cwnd;

	if (tcp_is_vegas(tp))
		tcp_vegas_enable(tp);

	tp->snd_ssthresh = tcp_current_ssthresh(tp);
	restart_cwnd = min(restart_cwnd, cwnd);

	while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->snd_cwnd_used = 0;
}
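
/* Worked example (illustrative, not from the original source): with
 * snd_cwnd = 10, restart_cwnd = 2 and an idle time of 3 * rto, the loop
 * halves cwnd twice (10 -> 5 -> 2) and then stops because cwnd is no
 * longer above restart_cwnd; snd_cwnd becomes max(2, 2) = 2. The window
 * decays by one half per RTO of idle time, but never below the restart
 * window.
 */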
static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb, struct sock *sk)
{
	u32 now = tcp_time_stamp;

	if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
		tcp_cwnd_restart(tp, __sk_dst_get(sk));

	tp->lsndtime = now;

	/* If it is a reply for ato after last received
	 * packet, enter pingpong mode.
	 */
	if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
		tp->ack.pingpong = 1;
}
static __inline__ void tcp_event_ack_sent(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	tcp_dec_quickack_mode(tp);
	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
}
/* Choose a new window to advertise, update state in tcp_opt for the
 * socket, and return result with RFC1323 scaling applied. The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static __inline__ u16 tcp_select_window(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if (new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time. --DaveM
		 *
		 * Relax Will Robinson.
		 */
		new_win = cur_win;
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* RFC1323 scaling applied */
	new_win >>= tp->rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0)
		tp->pred_flags = 0;

	return new_win;
}
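
/* Illustrative example (assumed values, not from the original source):
 * with rcv_wscale = 2, a computed window of 10000 bytes is put on the
 * wire as htons(10000 >> 2) = 2500; the peer shifts it back left by the
 * scale factor agreed in the SYN exchange to recover 10000.
 */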
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg(). This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless. It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
	int tcp_header_size = tp->tcp_header_len;
	struct tcphdr *th;
	int sysctl_flags;
	int err;

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

	sysctl_flags = 0;
	if (tcb->flags & TCPCB_FLAG_SYN) {
		tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
		if (sysctl_tcp_timestamps) {
			tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
		}
		if (sysctl_tcp_window_scaling) {
			tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_WSCALE;
		}
		if (sysctl_tcp_sack) {
			sysctl_flags |= SYSCTL_FLAG_SACK;
			if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
		}
	} else if (tp->eff_sacks) {
		/* A SACK is 2 pad bytes, a 2 byte header, plus
		 * 2 32-bit sequence numbers for each SACK block.
		 */
		tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
				    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
	}
	/*
	 * If the connection is idle and we are restarting,
	 * then we don't want to do any Vegas calculations
	 * until we get fresh RTT samples. So when we
	 * restart, we reset our Vegas state to a clean
	 * slate. After we get acks for this flight of
	 * packets, _then_ we can make Vegas calculations
	 * again.
	 */
	if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
		tcp_vegas_enable(tp);
	th = (struct tcphdr *) skb_push(skb, tcp_header_size);
	skb->h.th = th;
	skb_set_owner_w(skb, sk);

	/* Build TCP header and checksum it. */
	th->source	= inet->sport;
	th->dest	= inet->dport;
	th->seq		= htonl(tcb->seq);
	th->ack_seq	= htonl(tp->rcv_nxt);
	*(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
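	/* The 16-bit word at offset 12 of the TCP header packs the 4-bit
	 * data offset (header length in 32-bit words) together with the
	 * reserved and flag bits, which is what the raw halfword store
	 * above fills in one go.
	 */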
	if (tcb->flags & TCPCB_FLAG_SYN) {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window	= htons(tp->rcv_wnd);
	} else {
		th->window	= htons(tcp_select_window(sk));
	}
	th->check	= 0;
	th->urg_ptr	= 0;

	if (tp->urg_mode &&
	    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
		th->urg_ptr = htons(tp->snd_up-tcb->seq);
		th->urg = 1;
	}
	if (tcb->flags & TCPCB_FLAG_SYN) {
		tcp_syn_build_options((__u32 *)(th + 1),
				      tcp_advertise_mss(sk),
				      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
				      (sysctl_flags & SYSCTL_FLAG_SACK),
				      (sysctl_flags & SYSCTL_FLAG_WSCALE),
				      tp->rcv_wscale,
				      tcb->when,
				      tp->ts_recent);
	} else {
		tcp_build_and_update_options((__u32 *)(th + 1),
					     tp, tcb->when);

		TCP_ECN_send(sk, tp, skb, tcp_header_size);
	}
	tp->af_specific->send_check(sk, th, skb->len, skb);

	if (tcb->flags & TCPCB_FLAG_ACK)
		tcp_event_ack_sent(sk);

	if (skb->len != tcp_header_size)
		tcp_event_data_sent(tp, skb, sk);

	TCP_INC_STATS(TcpOutSegs);

	err = tp->af_specific->queue_xmit(skb, 0);
	if (err <= 0)
		return err;

	tcp_enter_cwr(tp);

	/* NET_XMIT_CN is special. It does not guarantee that this packet
	 * is lost. It tells us that the device is about to start dropping
	 * packets, or already drops some packets of the same priority,
	 * and invites us to send less aggressively.
	 */
	return err == NET_XMIT_CN ? 0 : err;

#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}
/* This routine just queues the buffer.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = tcp_sk(sk);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	__skb_queue_tail(&sk->sk_write_queue, skb);
	sk_charge_skb(sk, skb);

	/* Queue it, remembering where we must start sending. */
	if (sk->sk_send_head == NULL)
		sk->sk_send_head = skb;
}
/* Send _single_ skb sitting at the send head. This function requires
 * true push pending frames to set up the probe timer etc.
 */
void tcp_push_one(struct sock *sk, unsigned cur_mss)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *skb = sk->sk_send_head;

	if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
			sk->sk_send_head = NULL;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			if (tp->packets_out++ == 0)
				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
			return;
		}
	}
}
/* Function to create two new TCP segments. Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list. This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int nsize = skb->len - len;
	u16 flags;

	if (skb_cloned(skb) &&
	    skb_is_nonlinear(skb) &&
	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	/* Get a new skb... force flag on. */
	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
	if (buff == NULL)
		return -ENOMEM; /* We'll just try again later. */
	sk_charge_skb(sk, buff);

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
	TCP_SKB_CB(buff)->flags = flags;
	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
	if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
		tp->lost_out++;
		tp->left_out++;
	}
	TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;

	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
		/* Copy and checksum data tail into the new buffer. */
		buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
						       nsize, 0);

		skb_trim(skb, len);

		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
	} else {
		skb->ip_summed = CHECKSUM_HW;
		skb_split(skb, buff, len);
	}

	buff->ip_summed = skb->ip_summed;

	/* Looks odd, but our code really does use the 'when' field of
	 * skbs that it has never sent before. --ANK
	 */
	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

	/* Link BUFF into the send queue. */
	__skb_append(skb, buff);

	return 0;
}
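
/* Worked example (illustrative values): fragmenting an skb that covers
 * sequence range [1000, 2000) at len = 600 leaves the original skb as
 * [1000, 1600) and queues a new buff covering [1600, 2000); PSH/FIN, if
 * set, travel only on the second (later) segment.
 */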
/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
 * eventually). The difference is that pulled data is not copied, but
 * immediately discarded.
 */
unsigned char * __pskb_trim_head(struct sk_buff *skb, int len)
{
	int i, k, eat;

	eat = len;
	k = 0;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		if (skb_shinfo(skb)->frags[i].size <= eat) {
			put_page(skb_shinfo(skb)->frags[i].page);
			eat -= skb_shinfo(skb)->frags[i].size;
		} else {
			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
			if (eat) {
				skb_shinfo(skb)->frags[k].page_offset += eat;
				skb_shinfo(skb)->frags[k].size -= eat;
				eat = 0;
			}
			k++;
		}
	}
	skb_shinfo(skb)->nr_frags = k;

	skb->tail = skb->data;
	skb->data_len -= len;
	skb->len = skb->data_len;
	return skb->tail;
}
static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
	if (skb_cloned(skb) &&
	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	if (len <= skb_headlen(skb)) {
		__skb_pull(skb, len);
	} else {
		if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
			return -ENOMEM;
	}

	TCP_SKB_CB(skb)->seq += len;
	skb->ip_summed = CHECKSUM_HW;
	return 0;
}
/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT account
   for TCP options, but includes only bare TCP header.

   tp->mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.			--ANK (980731)
 */
int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	int mss_now;

	if (dst && dst->ops->get_mss)
		pmtu = dst->ops->get_mss(dst, pmtu);

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122
	 */
	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->mss_clamp)
		mss_now = tp->mss_clamp;

	/* Now subtract optional transport overhead */
	mss_now -= tp->ext_header_len + tp->ext2_header_len;

	/* Then reserve room for full set of TCP options and 8 bytes of data */
	if (mss_now < 48)
		mss_now = 48;

	/* Now subtract TCP options size, not including SACKs */
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

	/* Bound mss with half of window */
	if (tp->max_window && mss_now > (tp->max_window>>1))
		mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);

	/* And store cached results */
	tp->pmtu_cookie = pmtu;
	tp->mss_cache = tp->mss_cache_std = mss_now;

	if (sk->sk_route_caps & NETIF_F_TSO) {
		int large_mss;

		large_mss = 65535 - tp->af_specific->net_header_len -
			tp->ext_header_len - tp->ext2_header_len - tp->tcp_header_len;

		if (tp->max_window && large_mss > (tp->max_window>>1))
			large_mss = max((tp->max_window>>1), 68U - tp->tcp_header_len);

		/* Always keep large mss multiple of real mss. */
		tp->mss_cache = mss_now*(large_mss/mss_now);
	}

	return mss_now;
}
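
/* Worked example (illustrative, assuming IPv4, no extension headers and
 * timestamps enabled): pmtu = 1500 gives a base mss of
 * 1500 - 20 - 20 = 1460; with tcp_header_len = 20 + 12 (aligned
 * timestamp option), the final mss_now is 1460 - 12 = 1448 bytes of
 * payload per segment.
 */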
/* This routine writes packets to the network. It advances the
 * send_head. This happens as incoming acks open up the remote
 * window for us.
 *
 * Returns 1 if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
int tcp_write_xmit(struct sock *sk, int nonagle)
{
	struct tcp_opt *tp = tcp_sk(sk);
	unsigned int mss_now;

	/* If we are closed, the bytes will have to remain here.
	 * In time closedown will finish, we empty the write queue and all
	 * will be happy.
	 */
	if (sk->sk_state != TCP_CLOSE) {
		struct sk_buff *skb;
		int sent_pkts = 0;

		/* Account for SACKS, we may need to fragment due to this.
		 * It is just like the real MSS changing on us midstream.
		 * We also handle things correctly when the user adds some
		 * IP options mid-stream. Silly to do, but cover it.
		 */
		mss_now = tcp_current_mss(sk, 1);

		while ((skb = sk->sk_send_head) &&
		       tcp_snd_test(tp, skb, mss_now,
				    tcp_skb_is_last(sk, skb) ? nonagle :
							       TCP_NAGLE_PUSH)) {
			if (skb->len > mss_now) {
				if (tcp_fragment(sk, skb, mss_now))
					break;
			}

			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
				break;
			/* Advance the send_head. This one is sent out. */
			update_send_head(sk, tp, skb);
			tcp_minshall_update(tp, mss_now, skb);
			sent_pkts = 1;
		}

		if (sent_pkts) {
			tcp_cwnd_validate(sk, tp);
			return 0;
		}

		return !tp->packets_out && sk->sk_send_head;
	}
	return 0;
}
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RCV.NXT + RCV.WND fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WND >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria. The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *	If the free space is less than the 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	/* MSS for the peer's data. Previous versions used mss_clamp
	 * here. I don't know if the value based on our guesses
	 * of peer's MSS is better for the performance. It's more correct
	 * but may be worse for the performance because of rcv_mss
	 * fluctuations. --SAW 1998/11/1
	 */
	int mss = tp->ack.rcv_mss;
	int free_space = tcp_space(sk);
	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
	int window;

	if (mss > full_space)
		mss = full_space;

	if (free_space < full_space/2) {
		tp->ack.quick = 0;

		if (tcp_memory_pressure)
			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);

		if (free_space < mss)
			return 0;
	}

	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Don't do rounding if we are using window scaling, since the
	 * scaled window will not line up with the MSS boundary anyway.
	 */
	window = tp->rcv_wnd;
	if (tp->rcv_wscale) {
		window = free_space;

		/* Advertise enough space so that it won't get scaled away.
		 * Important case: prevent zero window announcement if
		 * 1<<rcv_wscale > mss.
		 */
		if (((window >> tp->rcv_wscale) << tp->rcv_wscale) != window)
			window = (((window >> tp->rcv_wscale) + 1)
				  << tp->rcv_wscale);
	} else {
		/* Get the largest window that is a nice multiple of mss.
		 * Window clamp already applied above.
		 * If our current window offering is within 1 mss of the
		 * free space we just keep it. This prevents the divide
		 * and multiply from happening most of the time.
		 * We also don't do any window rounding when the free space
		 * is too small.
		 */
		if (window <= free_space - mss || window > free_space)
			window = (free_space/mss)*mss;
	}

	return window;
}
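
/* Worked examples (illustrative values): without scaling, free_space =
 * 10000 and mss = 1460 round the offer down to 6*1460 = 8760. With
 * rcv_wscale = 7, window = 10000 is not exactly representable
 * ((10000 >> 7) << 7 = 9984), so it is rounded *up* to (78+1) << 7 =
 * 10112 so the low-order bits are not scaled away.
 */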
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *next_skb = skb->next;

	/* The first test we must make is that neither of these two
	 * SKB's are still referenced by someone else.
	 */
	if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
		int skb_size = skb->len, next_skb_size = next_skb->len;
		u16 flags = TCP_SKB_CB(skb)->flags;

		/* Also punt if next skb has been SACK'd. */
		if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
			return;

		/* Next skb is out of window. */
		if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
			return;

		/* Punt if not enough space exists in the first SKB for
		 * the data in the second, or the total combined payload
		 * would exceed the MSS.
		 */
		if ((next_skb_size > skb_tailroom(skb)) ||
		    ((skb_size + next_skb_size) > mss_now))
			return;

		/* Ok. We will be able to collapse the packet. */
		__skb_unlink(next_skb, next_skb->list);

		memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);

		if (next_skb->ip_summed == CHECKSUM_HW)
			skb->ip_summed = CHECKSUM_HW;

		if (skb->ip_summed != CHECKSUM_HW)
			skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);

		/* Update sequence range on original skb. */
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

		/* Merge over control information. */
		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
		TCP_SKB_CB(skb)->flags = flags;

		/* All done, get rid of second SKB and account for it so
		 * packet counting does not break.
		 */
		TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
			tp->retrans_out--;
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
			tp->lost_out--;
			tp->left_out--;
		}
		/* Reno case is special. Sigh... */
		if (!tp->sack_ok && tp->sacked_out) {
			tp->sacked_out--;
			tp->left_out--;
		}

		/* Not quite right: it can be > snd.fack, but
		 * it is better to underestimate fackets.
		 */
		if (tp->fackets_out)
			tp->fackets_out--;
		sk_stream_free_skb(sk, next_skb);
		tp->packets_out--;
	}
}
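
/* Illustrative: two queued 700-byte segments being retransmitted with
 * cur_mss = 1460 would each waste most of an MSS; collapsing them into
 * a single 1400-byte skb (700 + 700 <= 1460) halves the packet count,
 * provided neither skb is cloned or SACKed and enough tail room exists.
 */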
/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer. This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int mss = tcp_current_mss(sk, 0);
	int lost = 0;

	sk_stream_for_retrans_queue(skb, sk) {
		if (skb->len > mss &&
		    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
			if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tp->retrans_out--;
			}
			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
				tp->lost_out++;
				lost = 1;
			}
		}
	}

	if (!lost)
		return;

	tcp_sync_left_out(tp);

	/* Don't muck with the congestion window here.
	 * Reason is that we do not increase amount of _data_
	 * in network, but units changed and effective
	 * cwnd/ssthresh really reduced now.
	 */
	if (tp->ca_state != TCP_CA_Loss) {
		tp->high_seq = tp->snd_nxt;
		tp->snd_ssthresh = tcp_current_ssthresh(tp);
		tp->prior_ssthresh = 0;
		tp->undo_marker = 0;
		tcp_set_ca_state(tp, TCP_CA_Loss);
	}
	tcp_xmit_retransmit_queue(sk);
}
/* This retransmits one SKB. Policy decisions and retransmit queue
 * state updates are done by the caller. Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = tcp_sk(sk);
	unsigned int cur_mss = tcp_current_mss(sk, 0);
	int err;

	/* Do not send more than we queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	if (atomic_read(&sk->sk_wmem_alloc) >
	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
		return -EAGAIN;

	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
			BUG();

		if (sk->sk_route_caps & NETIF_F_TSO) {
			sk->sk_route_caps &= ~NETIF_F_TSO;
			sk->sk_no_largesend = 1;
			tp->mss_cache = tp->mss_cache_std;
		}

		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
			return -ENOMEM;
	}

	/* If receiver has shrunk his window, and skb is out of
	 * new window, do not retransmit it. The exception is the
	 * case, when window is shrunk to zero. In this case
	 * our retransmit serves as a zero window probe.
	 */
	if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
	    && TCP_SKB_CB(skb)->seq != tp->snd_una)
		return -EAGAIN;

	if (skb->len > cur_mss) {
		if (tcp_fragment(sk, skb, cur_mss))
			return -ENOMEM; /* We'll try again later. */

		/* New SKB created, account for it. */
		tp->packets_out++;
	}

	/* Collapse two adjacent packets if worthwhile and we can. */
	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
	    (skb->len < (cur_mss >> 1)) &&
	    (skb->next != sk->sk_send_head) &&
	    (skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
	    (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
	    (sysctl_tcp_retrans_collapse != 0))
		tcp_retrans_try_collapse(sk, skb, cur_mss);

	if (tp->af_specific->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	/* Some Solaris stacks overoptimize and ignore the FIN on a
	 * retransmit when old data is attached. So strip it off
	 * since it is cheap to do so and saves bytes on the network.
	 */
	if (skb->len > 0 &&
	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
		if (!pskb_trim(skb, 0)) {
			TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
		}
	}

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;

	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
				    pskb_copy(skb, GFP_ATOMIC) :
				    skb_clone(skb, GFP_ATOMIC)));

	if (err == 0) {
		/* Update global TCP statistics. */
		TCP_INC_STATS(TcpRetransSegs);

#if FASTRETRANS_DEBUG > 0
		if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
			if (net_ratelimit())
				printk(KERN_DEBUG "retrans_out leaked.\n");
		}
#endif
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tp->retrans_out++;

		/* Save stamp of the first retransmit. */
		if (!tp->retrans_stamp)
			tp->retrans_stamp = TCP_SKB_CB(skb)->when;

		tp->undo_retrans++;

		/* snd_nxt is stored to detect loss of retransmitted segment,
		 * see tcp_input.c tcp_sacktag_write_queue().
		 */
		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
	}
	return err;
}
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged. It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int packet_cnt = tp->lost_out;

	/* First pass: retransmit lost packets. */
	if (packet_cnt) {
		sk_stream_for_retrans_queue(skb, sk) {
			__u8 sacked = TCP_SKB_CB(skb)->sacked;

			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
				return;

			if (sacked&TCPCB_LOST) {
				if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
					if (tcp_retransmit_skb(sk, skb))
						return;
					if (tp->ca_state != TCP_CA_Loss)
						NET_INC_STATS_BH(TCPFastRetrans);
					else
						NET_INC_STATS_BH(TCPSlowStartRetrans);

					if (skb ==
					    skb_peek(&sk->sk_write_queue))
						tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
				}

				if (--packet_cnt <= 0)
					break;
			}
		}
	}

	/* OK, demanded retransmission is finished. */

	/* Forward retransmissions are possible only during Recovery. */
	if (tp->ca_state != TCP_CA_Recovery)
		return;

	/* No forward retransmissions in Reno are possible. */
	if (!tp->sack_ok)
		return;

	/* Yeah, we have to make difficult choice between forward transmission
	 * and retransmission... Both ways have their merits...
	 *
	 * For now we do not retransmit anything, while we have some new
	 * segments to send.
	 */

	if (tcp_may_send_now(sk, tp))
		return;

	packet_cnt = 0;

	sk_stream_for_retrans_queue(skb, sk) {
		if (++packet_cnt > tp->fackets_out)
			break;

		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
			break;

		if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
			continue;

		/* Ok, retransmit it. */
		if (tcp_retransmit_skb(sk, skb))
			break;

		if (skb == skb_peek(&sk->sk_write_queue))
			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);

		NET_INC_STATS_BH(TCPForwardRetrans);
	}
}
/* Send a fin. The caller locks the socket for us. This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
	unsigned int mss_now;

	/* Optimization, tack on the FIN if we have a queue of
	 * unsent frames. But be careful about outgoing SACKS
	 * and IP options.
	 */
	mss_now = tcp_current_mss(sk, 1);

	if (sk->sk_send_head != NULL) {
		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
		TCP_SKB_CB(skb)->end_seq++;
		tp->write_seq++;
	} else {
		/* Socket is locked, keep trying until memory is available. */
		for (;;) {
			skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
			if (skb)
				break;
			yield();
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(skb, MAX_TCP_HEADER);
		skb->csum = 0;
		TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
		TCP_SKB_CB(skb)->sacked = 0;

		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
		TCP_SKB_CB(skb)->seq = tp->write_seq;
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
		tcp_queue_skb(sk, skb);
	}
	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF);
}
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue. This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
 */
void tcp_send_active_reset(struct sock *sk, int priority)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *skb;

	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_TCP_HEADER, priority);
	if (!skb) {
		NET_INC_STATS(TCPAbortFailed);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
	TCP_SKB_CB(skb)->sacked = 0;

	/* Send it off. */
	TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	if (tcp_transmit_skb(sk, skb))
		NET_INC_STATS(TCPAbortFailed);
}
/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
	struct sk_buff *skb;

	skb = skb_peek(&sk->sk_write_queue);
	if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
		return -EFAULT;
	}
	if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
		if (skb_cloned(skb)) {
			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
			if (nskb == NULL)
				return -ENOMEM;
			__skb_unlink(skb, &sk->sk_write_queue);
			__skb_queue_head(&sk->sk_write_queue, nskb);
			sk_stream_free_skb(sk, skb);
			sk_charge_skb(sk, nskb);
			skb = nskb;
		}

		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
		TCP_ECN_send_synack(tcp_sk(sk), skb);
	}
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
}
/*
 * Prepare a SYN-ACK.
 */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
				 struct open_request *req)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct tcphdr *th;
	int tcp_header_size;
	struct sk_buff *skb;

	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_TCP_HEADER);

	skb->dst = dst_clone(dst);

	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			   (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			   (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			   /* SACK_PERM is in the place of NOP NOP of TS */
			   ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
	skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	if (dst->dev->features&NETIF_F_TSO)
		req->ecn_ok = 0;
	TCP_ECN_make_synack(req, th);
	th->source = inet_sk(sk)->sport;
	th->dest = req->rmt_port;
	TCP_SKB_CB(skb)->seq = req->snt_isn;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	th->ack_seq = htonl(req->rcv_isn + 1);
	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;
		/* Set this up on the first call only */
		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
		/* tcp_full_space because it is guaranteed to be the first packet */
		tcp_select_initial_window(tcp_full_space(sk),
			dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
			&req->rcv_wnd,
			&req->window_clamp,
			req->wscale_ok,
			&rcv_wscale);
		req->rcv_wscale = rcv_wscale;
	}

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(req->rcv_wnd);

	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
			      req->sack_ok, req->wscale_ok, req->rcv_wscale,
			      TCP_SKB_CB(skb)->when,
			      req->ts_recent);

	skb->csum = 0;
	th->doff = (tcp_header_size >> 2);
	TCP_INC_STATS(TcpOutSegs);
	return skb;
}
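
/* Illustrative header-size arithmetic (assuming all options enabled):
 * 20 (base header) + 4 (MSS) + 12 (aligned timestamps) + 4 (aligned
 * window scale) = 40 bytes, i.e. th->doff = 10. SACK-permitted costs
 * nothing extra here because it rides in the timestamp option's NOP
 * slots.
 */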
/*
 * Do all connect socket setups that can be done AF independent.
 */
static inline void tcp_connect_init(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_opt *tp = tcp_sk(sk);

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr) +
		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->user_mss)
		tp->mss_clamp = tp->user_mss;
	tp->max_window = 0;
	tcp_sync_mss(sk, dst_pmtu(dst));

	if (!tp->window_clamp)
		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(sk);

	tcp_select_initial_window(tcp_full_space(sk),
				  tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sysctl_tcp_window_scaling,
				  &tp->rcv_wscale);

	tp->rcv_ssthresh = tp->rcv_wnd;

	sk->sk_err = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->snd_wnd = 0;
	tcp_init_wl(tp, tp->write_seq, 0);
	tp->snd_una = tp->write_seq;
	tp->snd_sml = tp->write_seq;
	tp->rcv_nxt = 0;
	tp->rcv_wup = 0;
	tp->copied_seq = 0;

	tp->rto = TCP_TIMEOUT_INIT;
	tp->retransmits = 0;
	tcp_clear_retrans(tp);
}
/*
 * Build a SYN and send it off.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *buff;

	tcp_connect_init(sk);

	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
	if (unlikely(buff == NULL))
		return -ENOBUFS;

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_TCP_HEADER);

	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
	TCP_ECN_send_syn(sk, tp, buff);
	TCP_SKB_CB(buff)->sacked = 0;
	buff->csum = 0;
	TCP_SKB_CB(buff)->seq = tp->write_seq++;
	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;

	/* Send it off. */
	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
	__skb_queue_tail(&sk->sk_write_queue, buff);
	sk_charge_skb(sk, buff);
	tp->packets_out++;
	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
	TCP_INC_STATS(TcpActiveOpens);

	/* Timer for repeating the SYN until an answer. */
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
	return 0;
}
/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	int ato = tp->ack.ato;
	unsigned long timeout;

	if (ato > TCP_DELACK_MIN) {
		int max_ato = HZ/2;

		if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
			max_ato = TCP_DELACK_MAX;

		/* Slow path, intersegment interval is "high". */

		/* If some rtt estimate is known, use it to bound delayed ack.
		 * Do not use tp->rto here, use results of rtt measurements
		 * directly.
		 */
		if (tp->srtt) {
			int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);

			if (rtt < max_ato)
				max_ato = rtt;
		}

		ato = min(ato, max_ato);
	}

	/* Stay within the limit we were given */
	timeout = jiffies + ato;

	/* Use new timeout only if there wasn't an older one earlier. */
	if (tp->ack.pending&TCP_ACK_TIMER) {
		/* If delack timer was blocked or is about to expire,
		 * send ACK now.
		 */
		if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
			tcp_send_ack(sk);
			return;
		}

		if (!time_before(timeout, tp->ack.timeout))
			timeout = tp->ack.timeout;
	}
	tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
	tp->ack.timeout = timeout;
	sk_reset_timer(sk, &tp->delack_timer, timeout);
}
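
/* Illustrative (assuming HZ = 1000, so TCP_DELACK_MIN = HZ/25 = 40 ms):
 * with ato = 200 ms, pingpong off and srtt>>3 = 40 ms, max_ato starts
 * at HZ/2 = 500 ms but is lowered to 40 ms by the RTT bound, so the ACK
 * is delayed by min(200, 40) = 40 ms rather than a full fifth of a
 * second.
 */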
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
	/* If we have been reset, we may not send again. */
	if (sk->sk_state != TCP_CLOSE) {
		struct tcp_opt *tp = tcp_sk(sk);
		struct sk_buff *buff;

		/* We are not putting this on the write queue, so
		 * tcp_transmit_skb() will set the ownership to this
		 * sock.
		 */
		buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
		if (buff == NULL) {
			tcp_schedule_ack(tp);
			tp->ack.ato = TCP_ATO_MIN;
			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
			return;
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(buff, MAX_TCP_HEADER);
		buff->csum = 0;
		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
		TCP_SKB_CB(buff)->sacked = 0;

		/* Send it off, this clears delayed acks for us. */
		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
		TCP_SKB_CB(buff)->when = tcp_time_stamp;
		tcp_transmit_skb(sk, buff);
	}
}
/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 *
 * Question: what should we do while in urgent mode?
 * 4.4BSD forces sending single byte of data. We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: to send TWO zero-length segments in urgent mode:
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
 * out-of-date with SND.UNA-1 to probe window.
 */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *skb;

	/* We don't queue it, tcp_transmit_skb() sets ownership. */
	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	if (skb == NULL)
		return -1;

	/* Reserve space for headers and set control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
	TCP_SKB_CB(skb)->sacked = urgent;

	/* Use a previous sequence. This should cause the other
	 * end to send an ack. Don't queue or clone SKB, just
	 * send it.
	 */
	TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb);
}
int tcp_write_wakeup(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		struct tcp_opt *tp = tcp_sk(sk);
		struct sk_buff *skb;

		if ((skb = sk->sk_send_head) != NULL &&
		    before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
			int err;
			int mss = tcp_current_mss(sk, 0);
			int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;

			if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
				tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

			/* We are probing the opening of a window
			 * but the window size is != 0;
			 * this must have been a result of SWS avoidance (sender side).
			 */
			if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
			    skb->len > mss) {
				seg_size = min(seg_size, mss);
				TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
				if (tcp_fragment(sk, skb, seg_size))
					return -1;
				/* SWS override triggered forced fragmentation.
				 * Disable TSO, the connection is too sick. */
				if (sk->sk_route_caps & NETIF_F_TSO) {
					sk->sk_no_largesend = 1;
					sk->sk_route_caps &= ~NETIF_F_TSO;
					tp->mss_cache = tp->mss_cache_std;
				}
			}
			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			if (!err)
				update_send_head(sk, tp, skb);
			return err;
		} else {
			if (tp->urg_mode &&
			    between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
				tcp_xmit_probe_skb(sk, TCPCB_URG);
			return tcp_xmit_probe_skb(sk, 0);
		}
	}
	return -1;
}
/* A window probe timeout has occurred. If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	int err;

	err = tcp_write_wakeup(sk);

	if (tp->packets_out || !sk->sk_send_head) {
		/* Cancel probe timer, if it is not required. */
		tp->probes_out = 0;
		tp->backoff = 0;
		return;
	}

	if (err <= 0) {
		if (tp->backoff < sysctl_tcp_retries2)
			tp->backoff++;
		tp->probes_out++;
		tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
				     min(tp->rto << tp->backoff, TCP_RTO_MAX));
	} else {
		/* If packet was not sent due to local congestion,
		 * do not backoff and do not remember probes_out.
		 * Let local senders fight for local resources.
		 *
		 * Still use the accumulated backoff, though.
		 */
		if (!tp->probes_out)
			tp->probes_out = 1;
		tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
				     min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
	}
}
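
/* Illustrative backoff (assuming HZ = 1000 and rto = 200 ms): successive
 * unanswered probes fire after 200, 400, 800, ... ms as tp->backoff
 * grows, clamped at TCP_RTO_MAX; a probe that was not sent because of
 * local congestion keeps the accumulated backoff but is retried no
 * later than TCP_RESOURCE_PROBE_INTERVAL.
 */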
EXPORT_SYMBOL(tcp_acceptable_seq);
EXPORT_SYMBOL(tcp_connect);
EXPORT_SYMBOL(tcp_connect_init);
EXPORT_SYMBOL(tcp_make_synack);
EXPORT_SYMBOL(tcp_send_synack);
EXPORT_SYMBOL(tcp_simple_retransmit);
EXPORT_SYMBOL(tcp_sync_mss);
EXPORT_SYMBOL(tcp_transmit_skb);
EXPORT_SYMBOL(tcp_write_wakeup);
EXPORT_SYMBOL(tcp_write_xmit);