/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:     Pedro Roque     :       Retransmit queue handled by TCP.
 *                              :       Fragmentation on mtu decrease
 *                              :       Segment collapse on retransmit
 *
 *              Linus Torvalds  :       send_delayed_ack
 *              David S. Miller :       Charge memory using the right skb
 *                                      during syn/ack processing.
 *              David S. Miller :       Output engine completely rewritten.
 *              Andrea Arcangeli:       SYNACK carry ts_recent in tsecr.
 *              Cacophonix Gaul :       draft-minshall-nagle-01
 *              J Hadi Salim    :       ECN support
 */
#include <net/tcp.h>

#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/smp_lock.h>
/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
{
        tp->send_head = skb->next;
        if (tp->send_head == (struct sk_buff *)&sk->sk_write_queue)
                tp->send_head = NULL;
        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
        if (tp->packets_out++ == 0)
                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}
/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we send?  It is not clear at all.
 * Using SND.UNA we will fail to open the window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid.  OK, let's use this for now:
 */
__u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
{
        if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
                return tp->snd_nxt;
        else
                return tp->snd_una+tp->snd_wnd;
}
/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from the maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not implement 3; we advertise an MSS calculated from the
 *    first hop device mtu, but allow it to be raised to ip_rt_min_advmss.
 *    This may be overridden via information stored in the routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
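/* Worked example (editor's note, not in the original source): for a
 * standard Ethernet first hop with MTU 1500, rule 4 yields an advertised
 * MSS of 1500 - 20 (IPv4 header) - 20 (TCP header) = 1460 bytes; the
 * route metric below can only lower that value.
 */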
static __u16 tcp_advertise_mss(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);
        int mss = tp->advmss;

        if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
                mss = dst_metric(dst, RTAX_ADVMSS);
                tp->advmss = mss;
        }

        return (__u16)mss;
}
/* RFC2861.  Reset CWND after an idle period longer than RTO to the
 * "restart window".  This is the first part of the cwnd validation
 * mechanism.
 */
static void tcp_cwnd_restart(struct tcp_opt *tp, struct dst_entry *dst)
{
        s32 delta = tcp_time_stamp - tp->lsndtime;
        u32 restart_cwnd = tcp_init_cwnd(tp, dst);
        u32 cwnd = tp->snd_cwnd;

        if (tcp_is_vegas(tp))
                tcp_vegas_enable(tp);

        tp->snd_ssthresh = tcp_current_ssthresh(tp);
        restart_cwnd = min(restart_cwnd, cwnd);

        while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
                cwnd >>= 1;
        tp->snd_cwnd = max(cwnd, restart_cwnd);
        tp->snd_cwnd_stamp = tcp_time_stamp;
        tp->snd_cwnd_used = 0;
}
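/* Illustration (editor's note, not in the original source): each full
 * rto of idle time beyond the first halves cwnd once, so an idle period
 * of roughly 3*rto halves a cwnd of 40 twice, down to 10 -- but never
 * below restart_cwnd.
 */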
static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb, struct sock *sk)
{
        u32 now = tcp_time_stamp;

        if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
                tcp_cwnd_restart(tp, __sk_dst_get(sk));

        tp->lsndtime = now;

        /* If it is a reply for ato after last received
         * packet, enter pingpong mode.
         */
        if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
                tp->ack.pingpong = 1;
}
static __inline__ void tcp_event_ack_sent(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);

        tcp_dec_quickack_mode(tp);
        tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
}
/* Choose a new window to advertise, update state in tcp_opt for the
 * socket, and return the result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static __inline__ u16 tcp_select_window(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        u32 cur_win = tcp_receive_window(tp);
        u32 new_win = __tcp_select_window(sk);

        /* Never shrink the offered window */
        if (new_win < cur_win) {
                /* Danger Will Robinson!
                 * Don't update rcv_wup/rcv_wnd here or else
                 * we will not be able to advertise a zero
                 * window in time.  --DaveM
                 *
                 * Relax Will Robinson.
                 */
                new_win = cur_win;
        }
        tp->rcv_wnd = new_win;
        tp->rcv_wup = tp->rcv_nxt;

        /* RFC1323 scaling applied */
        new_win >>= tp->rcv_wscale;

        /* If we advertise a zero window, disable the fast path. */
        if (new_win == 0)
                tp->pred_flags = 0;

        return new_win;
}
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
        if (skb != NULL) {
                struct inet_opt *inet = inet_sk(sk);
                struct tcp_opt *tp = tcp_sk(sk);
                struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
                int tcp_header_size = tp->tcp_header_len;
                struct tcphdr *th;
                int sysctl_flags;
                int err;
#define SYSCTL_FLAG_TSTAMPS     0x1
#define SYSCTL_FLAG_WSCALE      0x2
#define SYSCTL_FLAG_SACK        0x4

                sysctl_flags = 0;
                if (tcb->flags & TCPCB_FLAG_SYN) {
                        tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
                        if (sysctl_tcp_timestamps) {
                                tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
                        }
                        if (sysctl_tcp_window_scaling) {
                                tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_WSCALE;
                        }
                        if (sysctl_tcp_sack) {
                                sysctl_flags |= SYSCTL_FLAG_SACK;
                                if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
                                        tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
                        }
                } else if (tp->eff_sacks) {
                        /* A SACK is 2 pad bytes, a 2 byte header, plus
                         * 2 32-bit sequence numbers for each SACK block.
                         */
                        tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
                                            (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
                }
                /*
                 * If the connection is idle and we are restarting,
                 * then we don't want to do any Vegas calculations
                 * until we get fresh RTT samples.  So when we
                 * restart, we reset our Vegas state to a clean
                 * slate.  After we get acks for this flight of
                 * packets, _then_ we can make Vegas calculations
                 * again.
                 */
                if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
                        tcp_vegas_enable(tp);
                th = (struct tcphdr *) skb_push(skb, tcp_header_size);
                skb->h.th = th;
                skb_set_owner_w(skb, sk);

                /* Build TCP header and checksum it. */
                th->source = inet->sport;
                th->dest = inet->dport;
                th->seq = htonl(tcb->seq);
                th->ack_seq = htonl(tp->rcv_nxt);
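                /* Editor's note: the next line packs the 4-bit data offset
                 * (header length in 32-bit words) and the flag bits into the
                 * 16-bit word at byte offset 12 of the TCP header, filling
                 * th->doff and th->fin...th->cwr in a single store.
                 */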
                *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
                if (tcb->flags & TCPCB_FLAG_SYN) {
                        /* RFC1323: The window in SYN & SYN/ACK segments
                         * is never scaled.
                         */
                        th->window = htons(tp->rcv_wnd);
                } else {
                        th->window = htons(tcp_select_window(sk));
                }
                th->check = 0;
                th->urg_ptr = 0;

                if (tp->urg_mode &&
                    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
                        th->urg_ptr = htons(tp->snd_up-tcb->seq);
                        th->urg = 1;
                }
                if (tcb->flags & TCPCB_FLAG_SYN) {
                        tcp_syn_build_options((__u32 *)(th + 1),
                                              tcp_advertise_mss(sk),
                                              (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
                                              (sysctl_flags & SYSCTL_FLAG_SACK),
                                              (sysctl_flags & SYSCTL_FLAG_WSCALE),
                                              tp->rcv_wscale,
                                              tcb->when,
                                              tp->ts_recent);
                } else {
                        tcp_build_and_update_options((__u32 *)(th + 1),
                                                     tp, tcb->when);

                        TCP_ECN_send(sk, tp, skb, tcp_header_size);
                }
                tp->af_specific->send_check(sk, th, skb->len, skb);

                if (tcb->flags & TCPCB_FLAG_ACK)
                        tcp_event_ack_sent(sk);

                if (skb->len != tcp_header_size)
                        tcp_event_data_sent(tp, skb, sk);

                TCP_INC_STATS(TcpOutSegs);
                err = tp->af_specific->queue_xmit(skb, 0);
                if (err <= 0)
                        return err;

                tcp_enter_cwr(tp);

                /* NET_XMIT_CN is special.  It does not guarantee
                 * that this packet is lost.  It tells us that the
                 * device is about to start dropping packets, or
                 * already drops some packets of the same priority,
                 * and invokes us to send less aggressively.
                 */
                return err == NET_XMIT_CN ? 0 : err;
        }
        return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}
/* This routine just queues the buffer for sending.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise the socket can stall.
 */
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_opt *tp = tcp_sk(sk);

        /* Advance write_seq and place onto the write_queue. */
        tp->write_seq = TCP_SKB_CB(skb)->end_seq;
        __skb_queue_tail(&sk->sk_write_queue, skb);
        tcp_charge_skb(sk, skb);

        /* Queue it, remembering where we must start sending. */
        if (tp->send_head == NULL)
                tp->send_head = skb;
}
/* Send _single_ skb sitting at the send head.  This function still
 * requires a true __tcp_push_pending_frames() to set up the probe
 * timer etc.
 */
void tcp_push_one(struct sock *sk, unsigned cur_mss)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct sk_buff *skb = tp->send_head;

        if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) {
                /* Send it out now. */
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
                if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
                        tp->send_head = NULL;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        if (tp->packets_out++ == 0)
                                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
                        return;
                }
        }
}
/* Split fragmented skb into two parts at length len. */
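/* Semantics sketch (editor's note, not in the original source): after
 * skb_split(skb, skb1, len), skb holds bytes [0, len) and skb1 holds
 * bytes [len, original length); a page frag straddling the split point
 * is shared via get_page() and trimmed on each side.
 */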
static void skb_split(struct sk_buff *skb, struct sk_buff *skb1, u32 len)
{
        int i;
        int pos = skb_headlen(skb);

        if (len < pos) {
                /* Split line is inside header. */
                memcpy(skb_put(skb1, pos-len), skb->data + len, pos-len);

                /* And move data appendix as is. */
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                        skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];

                skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
                skb_shinfo(skb)->nr_frags = 0;

                skb1->data_len = skb->data_len;
                skb1->len += skb1->data_len;
                skb->data_len = 0;
                skb->len = len;
                skb->tail = skb->data+len;
        } else {
                int k = 0;
                int nfrags = skb_shinfo(skb)->nr_frags;

                /* Second chunk has no header, nothing to copy. */

                skb_shinfo(skb)->nr_frags = 0;
                skb1->len = skb1->data_len = skb->len - len;
                skb->len = len;
                skb->data_len = len - pos;

                for (i = 0; i < nfrags; i++) {
                        int size = skb_shinfo(skb)->frags[i].size;
                        if (pos + size > len) {
                                skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];

                                if (pos < len) {
                                        /* Split frag.
                                         * We have two variants in this case:
                                         * 1. Move all the frag to the second
                                         *    part, if it is possible.  F.e.
                                         *    this approach is mandatory for TUX,
                                         *    where splitting is expensive.
                                         * 2. Split it accurately.  We make this.
                                         */
                                        get_page(skb_shinfo(skb)->frags[i].page);
                                        skb_shinfo(skb1)->frags[0].page_offset += (len-pos);
                                        skb_shinfo(skb1)->frags[0].size -= (len-pos);
                                        skb_shinfo(skb)->frags[i].size = len-pos;
                                        skb_shinfo(skb)->nr_frags++;
                                }
                                k++;
                        } else
                                skb_shinfo(skb)->nr_frags++;
                        pos += size;
                }
                skb_shinfo(skb1)->nr_frags = k;
        }
}
/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct sk_buff *buff;
        int nsize = skb->len - len;
        u16 flags;

        if (skb_cloned(skb) &&
            skb_is_nonlinear(skb) &&
            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
                return -ENOMEM;

        /* Get a new skb... force flag on. */
        buff = tcp_alloc_skb(sk, nsize, GFP_ATOMIC);
        if (buff == NULL)
                return -ENOMEM; /* We'll just try again later. */
        tcp_charge_skb(sk, buff);

        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

        /* PSH and FIN should only be set in the second packet. */
        flags = TCP_SKB_CB(skb)->flags;
        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
        TCP_SKB_CB(buff)->flags = flags;
        TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
        if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
                tp->lost_out++;
                tp->left_out++;
        }
        TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;

        if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
                /* Copy and checksum data tail into the new buffer. */
                buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
                                                       nsize, 0);

                skb_trim(skb, len);

                skb->csum = csum_block_sub(skb->csum, buff->csum, len);
        } else {
                skb->ip_summed = CHECKSUM_HW;
                skb_split(skb, buff, len);
        }

        buff->ip_summed = skb->ip_summed;

        /* Looks stupid, but our code really uses the when field of
         * skbs which it has never sent before. --ANK
         */
        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

        /* Link BUFF into the send queue. */
        __skb_append(skb, buff);

        return 0;
}
/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
 * eventually).  The difference is that pulled data are not copied, but
 * immediately discarded.
 */
unsigned char * __pskb_trim_head(struct sk_buff *skb, int len)
{
        int i, k, eat;

        eat = len;
        k = 0;
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                if (skb_shinfo(skb)->frags[i].size <= eat) {
                        put_page(skb_shinfo(skb)->frags[i].page);
                        eat -= skb_shinfo(skb)->frags[i].size;
                } else {
                        skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
                        if (eat) {
                                skb_shinfo(skb)->frags[k].page_offset += eat;
                                skb_shinfo(skb)->frags[k].size -= eat;
                                eat = 0;
                        }
                        k++;
                }
        }
        skb_shinfo(skb)->nr_frags = k;

        skb->tail = skb->data;
        skb->data_len -= len;
        skb->len = skb->data_len;
        return skb->tail;
}
static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
        if (skb_cloned(skb) &&
            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
                return -ENOMEM;

        if (len <= skb_headlen(skb)) {
                __skb_pull(skb, len);
        } else {
                if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
                        return -ENOMEM;
        }

        TCP_SKB_CB(skb)->seq += len;
        skb->ip_summed = CHECKSUM_HW;
        return 0;
}
/* This function synchronizes snd mss to the current pmtu/exthdr set.

   tp->user_mss is the mss set by the user via TCP_MAXSEG.  It does NOT
   account for TCP options, but covers only the bare TCP header.

   tp->mss_clamp is the mss negotiated at connection setup.
   It is the minimum of user_mss and the mss received with SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is the last pmtu seen by this function.

   tp->mss_cache is the current effective sending mss, including
   all tcp options except for SACKs.  It is evaluated,
   taking into account the current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that the advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.			--ANK (980731)
 */
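/* Worked example (editor's note, not in the original source): for an
 * IPv4 path with pmtu 1500 and timestamps enabled (tcp_header_len =
 * 20 + 12), mss_now = 1500 - 20 - 20 = 1460, then minus 12 bytes of
 * options gives an effective sending mss of 1448 bytes.
 */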
int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);
        int mss_now;

        if (dst && dst->ops->get_mss)
                pmtu = dst->ops->get_mss(dst, pmtu);

        /* Calculate base mss without TCP options:
           It is MMS_S - sizeof(tcphdr) of rfc1122
         */
        mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

        /* Clamp it (mss_clamp does not include tcp options) */
        if (mss_now > tp->mss_clamp)
                mss_now = tp->mss_clamp;

        /* Now subtract optional transport overhead */
        mss_now -= tp->ext_header_len + tp->ext2_header_len;

        /* Then reserve room for full set of TCP options and 8 bytes of data */
        if (mss_now < 48)
                mss_now = 48;

        /* Now subtract TCP options size, not including SACKs */
        mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

        /* Bound mss with half of window */
        if (tp->max_window && mss_now > (tp->max_window>>1))
                mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
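        /* Editor's note: 68 above is the minimum MTU every IPv4 host must
         * accept (RFC 791); the max() keeps a floor of 68 - tcp_header_len
         * so a tiny peer window cannot drive the mss toward zero.
         */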
        /* And store cached results */
        tp->pmtu_cookie = pmtu;
        tp->mss_cache = tp->mss_cache_std = mss_now;

        if (sk->sk_route_caps & NETIF_F_TSO) {
                int large_mss;

                large_mss = 65535 - tp->af_specific->net_header_len -
                        tp->ext_header_len - tp->ext2_header_len - tp->tcp_header_len;

                if (tp->max_window && large_mss > (tp->max_window>>1))
                        large_mss = max((tp->max_window>>1), 68U - tp->tcp_header_len);

                /* Always keep the large mss a multiple of the real mss. */
                tp->mss_cache = mss_now*(large_mss/mss_now);
        }

        return mss_now;
}
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
int tcp_write_xmit(struct sock *sk, int nonagle)
{
        struct tcp_opt *tp = tcp_sk(sk);
        unsigned int mss_now;

        /* If we are closed, the bytes will have to remain here.
         * In time closedown will finish, we empty the write queue and all
         * will be happy.
         */
        if (sk->sk_state != TCP_CLOSE) {
                struct sk_buff *skb;
                int sent_pkts = 0;

                /* Account for SACKS, we may need to fragment due to this.
                 * It is just like the real MSS changing on us midstream.
                 * We also handle things correctly when the user adds some
                 * IP options mid-stream.  Silly to do, but cover it.
                 */
                mss_now = tcp_current_mss(sk, 1);

                while ((skb = tp->send_head) &&
                       tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)) {
                        if (skb->len > mss_now) {
                                if (tcp_fragment(sk, skb, mss_now))
                                        break;
                        }

                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
                                break;
                        /* Advance the send_head.  This one is sent out. */
                        update_send_head(sk, tp, skb);
                        tcp_minshall_update(tp, mss_now, skb);
                        sent_pkts = 1;
                }

                if (sent_pkts) {
                        tcp_cwnd_validate(sk, tp);
                        return 0;
                }

                return !tp->packets_out && tp->send_head;
        }
        return 0;
}
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria.  The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 * If the free space is less than 1/4 of the maximum
 * space available and the free space is less than 1/2 mss,
 * then set the window to 0.
 * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 * Otherwise, just prevent the window from shrinking
 * and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue.  It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
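/* Numeric sketch (editor's note, not in the original source): with
 * mss = 1460 and free_space = 10000, the rounding below offers
 * (10000/1460)*1460 = 8760 bytes; the offer is left untouched while
 * the current window stays within one mss of free_space.
 */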
u32 __tcp_select_window(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        /* MSS for the peer's data.  Previous versions used mss_clamp
         * here.  I don't know if the value based on our guesses
         * of the peer's MSS is better for the performance.  It's more correct
         * but may be worse for the performance because of rcv_mss
         * fluctuations.  --SAW  1998/11/1
         */
        int mss = tp->ack.rcv_mss;
        int free_space = tcp_space(sk);
        int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
        int window;

        if (mss > full_space)
                mss = full_space;

        if (free_space < full_space/2) {
                tp->ack.quick = 0;

                if (tcp_memory_pressure)
                        tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);

                if (free_space < mss)
                        return 0;
        }

        if (free_space > tp->rcv_ssthresh)
                free_space = tp->rcv_ssthresh;

        /* Get the largest window that is a nice multiple of mss.
         * Window clamp already applied above.
         * If our current window offering is within 1 mss of the
         * free space we just keep it.  This prevents the divide
         * and multiply from happening most of the time.
         * We also don't do any window rounding when the free space
         * is too small.
         */
        window = tp->rcv_wnd;
        if (window <= free_space - mss || window > free_space)
                window = (free_space/mss)*mss;

        return window;
}
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct sk_buff *next_skb = skb->next;

        /* The first test we must make is that neither of these two
         * SKB's are still referenced by someone else.
         */
        if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
                int skb_size = skb->len, next_skb_size = next_skb->len;
                u16 flags = TCP_SKB_CB(skb)->flags;

                /* Also punt if next skb has been SACK'd. */
                if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
                        return;

                /* Next skb is out of window. */
                if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
                        return;

                /* Punt if not enough space exists in the first SKB for
                 * the data in the second, or the total combined payload
                 * would exceed the MSS.
                 */
                if ((next_skb_size > skb_tailroom(skb)) ||
                    ((skb_size + next_skb_size) > mss_now))
                        return;

                /* Ok.  We will be able to collapse the packet. */
                __skb_unlink(next_skb, next_skb->list);

                memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);

                if (next_skb->ip_summed == CHECKSUM_HW)
                        skb->ip_summed = CHECKSUM_HW;

                if (skb->ip_summed != CHECKSUM_HW)
                        skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);

                /* Update sequence range on original skb. */
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

                /* Merge over control information. */
                flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
                TCP_SKB_CB(skb)->flags = flags;

                /* All done, get rid of second SKB and account for it so
                 * packet counting does not break.
                 */
                TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
                if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
                        tp->retrans_out--;
                if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
                        tp->lost_out--;
                        tp->left_out--;
                }
                /* Reno case is special.  Sigh... */
                if (!tp->sack_ok && tp->sacked_out) {
                        tp->sacked_out--;
                        tp->left_out--;
                }

                /* Not quite right: it can be > snd.fack, but
                 * it is better to underestimate fackets.
                 */
                if (tp->fackets_out)
                        tp->fackets_out--;
                tcp_free_skb(sk, next_skb);
                tp->packets_out--;
        }
}
/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer.  This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct sk_buff *skb;
        unsigned int mss = tcp_current_mss(sk, 0);
        int lost = 0;

        for_retrans_queue(skb, sk, tp) {
                if (skb->len > mss &&
                    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
                        if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
                                TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
                                tp->retrans_out--;
                        }
                        if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
                                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                                tp->lost_out++;
                                lost = 1;
                        }
                }
        }

        if (!lost)
                return;

        tcp_sync_left_out(tp);

        /* Don't muck with the congestion window here.
         * The reason is that we do not increase the amount of _data_
         * in the network, but the units have changed and the effective
         * cwnd/ssthresh really are reduced now.
         */
        if (tp->ca_state != TCP_CA_Loss) {
                tp->high_seq = tp->snd_nxt;
                tp->snd_ssthresh = tcp_current_ssthresh(tp);
                tp->prior_ssthresh = 0;
                tp->undo_marker = 0;
                tcp_set_ca_state(tp, TCP_CA_Loss);
        }
        tcp_xmit_retransmit_queue(sk);
}
/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_opt *tp = tcp_sk(sk);
        unsigned int cur_mss = tcp_current_mss(sk, 0);
        int err;

        /* Do not send more than we queued.  1/4 is reserved for possible
         * copying overhead: fragmentation, tunneling, mangling etc.
         */
        if (atomic_read(&sk->sk_wmem_alloc) >
            min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
                return -EAGAIN;

        if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
                if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
                        BUG();

                if (sk->sk_route_caps & NETIF_F_TSO) {
                        sk->sk_route_caps &= ~NETIF_F_TSO;
                        sk->sk_no_largesend = 1;
                        tp->mss_cache = tp->mss_cache_std;
                }

                if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
                        return -ENOMEM;
        }

        /* If receiver has shrunk his window, and skb is out of
         * the new window, do not retransmit it.  The exception is the
         * case when the window is shrunk to zero; then
         * our retransmit serves as a zero window probe.
         */
        if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
            && TCP_SKB_CB(skb)->seq != tp->snd_una)
                return -EAGAIN;

        if (skb->len > cur_mss) {
                if (tcp_fragment(sk, skb, cur_mss))
                        return -ENOMEM; /* We'll try again later. */

                /* New SKB created, account for it. */
                tp->packets_out++;
        }
        /* Collapse two adjacent packets if worthwhile and we can. */
        if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
            (skb->len < (cur_mss >> 1)) &&
            (skb->next != tp->send_head) &&
            (skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
            (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
            (sysctl_tcp_retrans_collapse != 0))
                tcp_retrans_try_collapse(sk, skb, cur_mss);

        if (tp->af_specific->rebuild_header(sk))
                return -EHOSTUNREACH; /* Routing failure or similar. */

        /* Some Solaris stacks overoptimize and ignore the FIN on a
         * retransmit when old data is attached.  So strip it off
         * since it is cheap to do so and saves bytes on the network.
         */
        if (skb->len > 0 &&
            (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
            tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
                if (!pskb_trim(skb, 0)) {
                        TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                }
        }

        /* Make a copy, if the first transmission SKB clone we made
         * is still in somebody's hands, else make a clone.
         */
        TCP_SKB_CB(skb)->when = tcp_time_stamp;

        err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
                                    pskb_copy(skb, GFP_ATOMIC):
                                    skb_clone(skb, GFP_ATOMIC)));

        if (err == 0) {
                /* Update global TCP statistics. */
                TCP_INC_STATS(TcpRetransSegs);

#if FASTRETRANS_DEBUG > 0
                if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
                        if (net_ratelimit())
                                printk(KERN_DEBUG "retrans_out leaked.\n");
                }
#endif
                TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
                tp->retrans_out++;

                /* Save stamp of the first retransmit. */
                if (!tp->retrans_stamp)
                        tp->retrans_stamp = TCP_SKB_CB(skb)->when;

                tp->undo_retrans++;

                /* snd_nxt is stored to detect loss of retransmitted segment,
                 * see tcp_input.c tcp_sacktag_write_queue().
                 */
                TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
        }
        return err;
}
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct sk_buff *skb;
        int packet_cnt = tp->lost_out;

        /* First pass: retransmit lost packets. */
        if (packet_cnt) {
                for_retrans_queue(skb, sk, tp) {
                        __u8 sacked = TCP_SKB_CB(skb)->sacked;

                        if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                                return;

                        if (sacked&TCPCB_LOST) {
                                if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
                                        if (tcp_retransmit_skb(sk, skb))
                                                return;
                                        if (tp->ca_state != TCP_CA_Loss)
                                                NET_INC_STATS_BH(TCPFastRetrans);
                                        else
                                                NET_INC_STATS_BH(TCPSlowStartRetrans);

                                        if (skb ==
                                            skb_peek(&sk->sk_write_queue))
                                                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
                                }

                                if (--packet_cnt <= 0)
                                        break;
                        }
                }
        }

        /* OK, demanded retransmission is finished. */

        /* Forward retransmissions are possible only during Recovery. */
        if (tp->ca_state != TCP_CA_Recovery)
                return;

        /* No forward retransmissions in Reno are possible. */
        if (!tp->sack_ok)
                return;

        /* Yeah, we have to make a difficult choice between forward
         * transmission and retransmission... Both ways have their merits...
         *
         * For now we do not retransmit anything, while we have some new
         * segments to send.
         */
        if (tcp_may_send_now(sk, tp))
                return;

        packet_cnt = 0;

        for_retrans_queue(skb, sk, tp) {
                if (++packet_cnt > tp->fackets_out)
                        break;

                if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                        break;

                if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
                        continue;

                /* Ok, retransmit it. */
                if (tcp_retransmit_skb(sk, skb))
                        break;

                if (skb == skb_peek(&sk->sk_write_queue))
                        tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);

                NET_INC_STATS_BH(TCPForwardRetrans);
        }
}
/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
        unsigned int mss_now;

        /* Optimization, tack on the FIN if we have a queue of
         * unsent frames.  But be careful about outgoing SACKS
         * and IP options.
         */
        mss_now = tcp_current_mss(sk, 1);

        if (tp->send_head != NULL) {
                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
                TCP_SKB_CB(skb)->end_seq++;
                tp->write_seq++;
        } else {
                /* Socket is locked, keep trying until memory is available. */
                for (;;) {
                        skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
                        if (skb)
                                break;
                        yield();
                }

                /* Reserve space for headers and prepare control bits. */
                skb_reserve(skb, MAX_TCP_HEADER);
                skb->csum = 0;
                TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
                TCP_SKB_CB(skb)->sacked = 0;

                /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
                TCP_SKB_CB(skb)->seq = tp->write_seq;
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
                tcp_queue_skb(sk, skb);
        }
        __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF);
}
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk, int priority)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct sk_buff *skb;

        /* NOTE: No TCP options attached and we never retransmit this. */
        skb = alloc_skb(MAX_TCP_HEADER, priority);
        if (!skb) {
                NET_INC_STATS(TCPAbortFailed);
                return;
        }

        /* Reserve space for headers and prepare control bits. */
        skb_reserve(skb, MAX_TCP_HEADER);
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
        TCP_SKB_CB(skb)->sacked = 0;

        /* Send it off. */
        TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        if (tcp_transmit_skb(sk, skb))
                NET_INC_STATS(TCPAbortFailed);
}
/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called.  If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
        struct sk_buff *skb;

        skb = skb_peek(&sk->sk_write_queue);
        if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
                printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
                return -EFAULT;
        }
        if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
                if (skb_cloned(skb)) {
                        struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
                        if (nskb == NULL)
                                return -ENOMEM;
                        __skb_unlink(skb, &sk->sk_write_queue);
                        __skb_queue_head(&sk->sk_write_queue, nskb);
                        tcp_free_skb(sk, skb);
                        tcp_charge_skb(sk, nskb);
                        skb = nskb;
                }

                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
                TCP_ECN_send_synack(tcp_sk(sk), skb);
        }
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
}
/*
 * Prepare a SYN-ACK.
 */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
                                 struct open_request *req)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct tcphdr *th;
        int tcp_header_size;
        struct sk_buff *skb;

        skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
        if (skb == NULL)
                return NULL;

        /* Reserve space for headers. */
        skb_reserve(skb, MAX_TCP_HEADER);

        skb->dst = dst_clone(dst);

        tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
                           (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
                           (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
                           /* SACK_PERM is in the place of NOP NOP of TS */
                           ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
        skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

        memset(th, 0, sizeof(struct tcphdr));
        th->syn = 1;
        th->ack = 1;
        if (dst->dev->features&NETIF_F_TSO)
                req->ecn_ok = 0;
        TCP_ECN_make_synack(req, th);
        th->source = inet_sk(sk)->sport;
        th->dest = req->rmt_port;
        TCP_SKB_CB(skb)->seq = req->snt_isn;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
        th->seq = htonl(TCP_SKB_CB(skb)->seq);
        th->ack_seq = htonl(req->rcv_isn + 1);
        if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
                __u8 rcv_wscale;
                /* Set this up on the first call only */
                req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
                /* tcp_full_space because it is guaranteed to be the first packet */
                tcp_select_initial_window(tcp_full_space(sk),
                        dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
                        &req->rcv_wnd,
                        &req->window_clamp,
                        req->wscale_ok,
                        &rcv_wscale);
                req->rcv_wscale = rcv_wscale;
        }

        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
        th->window = htons(req->rcv_wnd);

        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
                              req->sack_ok, req->wscale_ok, req->rcv_wscale,
                              TCP_SKB_CB(skb)->when,
                              req->ts_recent);

        skb->csum = 0;
        th->doff = (tcp_header_size >> 2);
        TCP_INC_STATS(TcpOutSegs);
        return skb;
}
/*
 * Do all connect socket setups that can be done AF independent.
 */
void tcp_connect_init(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_opt *tp = tcp_sk(sk);

        /* We'll fix this up when we get a response from the other end.
         * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
         */
        tp->tcp_header_len = sizeof(struct tcphdr) +
                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

        /* If user gave his TCP_MAXSEG, record it to clamp */
        if (tp->user_mss)
                tp->mss_clamp = tp->user_mss;
        tp->max_window = 0;
        tcp_sync_mss(sk, dst_pmtu(dst));

        if (!tp->window_clamp)
                tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
        tp->advmss = dst_metric(dst, RTAX_ADVMSS);
        tcp_initialize_rcv_mss(sk);

        tcp_select_initial_window(tcp_full_space(sk),
                                  tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
                                  &tp->rcv_wnd,
                                  &tp->window_clamp,
                                  sysctl_tcp_window_scaling,
                                  &tp->rcv_wscale);

        tp->rcv_ssthresh = tp->rcv_wnd;

        sk->sk_err = 0;
        sock_reset_flag(sk, SOCK_DONE);
        tp->snd_wnd = 0;
        tcp_init_wl(tp, tp->write_seq, 0);
        tp->snd_una = tp->write_seq;
        tp->snd_sml = tp->write_seq;
        tp->rcv_nxt = 0;
        tp->rcv_wup = 0;
        tp->copied_seq = 0;

        tp->rto = TCP_TIMEOUT_INIT;
        tp->retransmits = 0;
        tcp_clear_retrans(tp);
}
/*
 * Build a SYN and send it off.
 */
int tcp_connect(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct sk_buff *buff;

        tcp_connect_init(sk);

        buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
        if (unlikely(buff == NULL))
                return -ENOBUFS;

        /* Reserve space for headers. */
        skb_reserve(buff, MAX_TCP_HEADER);

        TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
        TCP_ECN_send_syn(sk, tp, buff);
        TCP_SKB_CB(buff)->sacked = 0;
        buff->csum = 0;
        TCP_SKB_CB(buff)->seq = tp->write_seq++;
        TCP_SKB_CB(buff)->end_seq = tp->write_seq;
        tp->snd_nxt = tp->write_seq;
        tp->pushed_seq = tp->write_seq;

        /* Send it off. */
        TCP_SKB_CB(buff)->when = tcp_time_stamp;
        tp->retrans_stamp = TCP_SKB_CB(buff)->when;
        __skb_queue_tail(&sk->sk_write_queue, buff);
        tcp_charge_skb(sk, buff);
        tp->packets_out++;
        tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
        TCP_INC_STATS(TcpActiveOpens);

        /* Timer for repeating the SYN until an answer. */
        tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
        return 0;
}
/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        int ato = tp->ack.ato;
        unsigned long timeout;

        if (ato > TCP_DELACK_MIN) {
                int max_ato = HZ/2;

                if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
                        max_ato = TCP_DELACK_MAX;

                /* Slow path, intersegment interval is "high". */

                /* If some rtt estimate is known, use it to bound delayed ack.
                 * Do not use tp->rto here, use results of rtt measurements
                 * directly.
                 */
                if (tp->srtt) {
                        int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);

                        if (rtt < max_ato)
                                max_ato = rtt;
                }

                ato = min(ato, max_ato);
        }

        /* Stay within the limit we were given */
        timeout = jiffies + ato;

        /* Use the new timeout only if there wasn't an older one earlier. */
        if (tp->ack.pending&TCP_ACK_TIMER) {
                /* If the delack timer was blocked or is about to expire,
                 * send the ACK now.
                 */
                if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
                        tcp_send_ack(sk);
                        return;
                }

                if (!time_before(timeout, tp->ack.timeout))
                        timeout = tp->ack.timeout;
        }
        tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
        tp->ack.timeout = timeout;
        if (!mod_timer(&tp->delack_timer, timeout))
                sock_hold(sk);
}
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
        /* If we have been reset, we may not send again. */
        if (sk->sk_state != TCP_CLOSE) {
                struct tcp_opt *tp = tcp_sk(sk);
                struct sk_buff *buff;

                /* We are not putting this on the write queue, so
                 * tcp_transmit_skb() will set the ownership to this
                 * sock.
                 */
                buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
                if (buff == NULL) {
                        tcp_schedule_ack(tp);
                        tp->ack.ato = TCP_ATO_MIN;
                        tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
                        return;
                }

                /* Reserve space for headers and prepare control bits. */
                skb_reserve(buff, MAX_TCP_HEADER);
                buff->csum = 0;
                TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
                TCP_SKB_CB(buff)->sacked = 0;

                /* Send it off, this clears delayed acks for us. */
                TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
                TCP_SKB_CB(buff)->when = tcp_time_stamp;
                tcp_transmit_skb(sk, buff);
        }
}
/* This routine sends a packet with an out of date sequence
 * number.  It assumes the other end will try to ack it.
 *
 * Question: what should we do in urgent mode?
 * 4.4BSD forces sending a single byte of data.  We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: send TWO zero-length segments in urgent mode:
 * one with SEG.SEQ=SND.UNA to deliver the urgent pointer, another
 * out-of-date with SND.UNA-1 to probe the window.
 */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct sk_buff *skb;

        /* We don't queue it, tcp_transmit_skb() sets ownership. */
        skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
        if (skb == NULL)
                return -1;

        /* Reserve space for headers and set control bits. */
        skb_reserve(skb, MAX_TCP_HEADER);
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
        TCP_SKB_CB(skb)->sacked = urgent;

        /* Use a previous sequence.  This should cause the other
         * end to send an ack.  Don't queue or clone SKB, just
         * send it.
         */
        TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        return tcp_transmit_skb(sk, skb);
}
int tcp_write_wakeup(struct sock *sk)
{
        if (sk->sk_state != TCP_CLOSE) {
                struct tcp_opt *tp = tcp_sk(sk);
                struct sk_buff *skb;

                if ((skb = tp->send_head) != NULL &&
                    before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
                        int err;
                        int mss = tcp_current_mss(sk, 0);
                        int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;

                        if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
                                tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

                        /* We are probing the opening of a window
                         * but the window size is != 0:
                         * this must be a result of sender-side SWS avoidance.
                         */
                        if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
                            skb->len > mss) {
                                seg_size = min(seg_size, mss);
                                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
                                if (tcp_fragment(sk, skb, seg_size))
                                        return -1;
                                /* SWS override triggered forced fragmentation.
                                 * Disable TSO, the connection is too sick.
                                 */
                                if (sk->sk_route_caps & NETIF_F_TSO) {
                                        sk->sk_no_largesend = 1;
                                        sk->sk_route_caps &= ~NETIF_F_TSO;
                                        tp->mss_cache = tp->mss_cache_std;
                                }
                        }
                        TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
                        if (!err)
                                update_send_head(sk, tp, skb);
                        return err;
                } else {
                        if (tp->urg_mode &&
                            between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
                                tcp_xmit_probe_skb(sk, TCPCB_URG);
                        return tcp_xmit_probe_skb(sk, 0);
                }
        }
        return -1;
}
/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        int err;

        err = tcp_write_wakeup(sk);

        if (tp->packets_out || !tp->send_head) {
                /* Cancel probe timer, if it is not required. */
                tp->probes_out = 0;
                tp->backoff = 0;
                return;
        }

        if (err <= 0) {
                if (tp->backoff < sysctl_tcp_retries2)
                        tp->backoff++;
                tp->probes_out++;
                tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
                                     min(tp->rto << tp->backoff, TCP_RTO_MAX));
        } else {
                /* If packet was not sent due to local congestion,
                 * do not backoff and do not remember probes_out.
                 * Let local senders fight for local resources.
                 *
                 * Use the accumulated backoff nevertheless.
                 */
                if (!tp->probes_out)
                        tp->probes_out = 1;
                tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
                                     min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
        }
}
EXPORT_SYMBOL(tcp_acceptable_seq);
EXPORT_SYMBOL(tcp_connect);
EXPORT_SYMBOL(tcp_connect_init);
EXPORT_SYMBOL(tcp_make_synack);
EXPORT_SYMBOL(tcp_send_synack);
EXPORT_SYMBOL(tcp_simple_retransmit);
EXPORT_SYMBOL(tcp_sync_mss);
EXPORT_SYMBOL(tcp_transmit_skb);
EXPORT_SYMBOL(tcp_write_wakeup);
EXPORT_SYMBOL(tcp_write_xmit);