#include <net/checksum.h>
#include <net/sock.h>
#include <net/snmp.h>
+#include <net/ip.h>
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
#include <linux/ipv6.h>
#endif
#define tcp_lhash_wait (tcp_hashinfo.__tcp_lhash_wait)
#define tcp_portalloc_lock (tcp_hashinfo.__tcp_portalloc_lock)
-/* SLAB cache for TCP socks */
-extern kmem_cache_t *tcp_sk_cachep;
-
extern kmem_cache_t *tcp_bucket_cachep;
extern struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
unsigned short snum);
extern void tcp_bucket_destroy(struct tcp_bind_bucket *tb);
extern void tcp_bucket_unlock(struct sock *sk);
extern int tcp_port_rover;
-extern struct sock *tcp_v4_lookup_listener(u32 addr, unsigned short hnum, int dif);
/* These are AF independent. */
static __inline__ int tcp_bhashfn(__u16 lport)
#define tw_node __tw_common.skc_node
#define tw_bind_node __tw_common.skc_bind_node
#define tw_refcnt __tw_common.skc_refcnt
+#define tw_xid __tw_common.skc_xid
+#define tw_vx_info __tw_common.skc_vx_info
+#define tw_nid __tw_common.skc_nid
+#define tw_nx_info __tw_common.skc_nx_info
volatile unsigned char tw_substate;
unsigned char tw_rcv_wscale;
__u16 tw_sport;
/* Socket demultiplex comparisons on incoming packets. */
- /* these five are in inet_opt */
+ /* these five are in inet_sock */
__u32 tw_daddr
__attribute__((aligned(TCP_ADDRCMP_ALIGN_BYTES)));
__u32 tw_rcv_saddr;
#define tcptw_sk(__sk) ((struct tcp_tw_bucket *)(__sk))
-static inline const u32 tcp_v4_rcv_saddr(const struct sock *sk)
+static inline u32 tcp_v4_rcv_saddr(const struct sock *sk)
{
return likely(sk->sk_state != TCP_TIME_WAIT) ?
inet_sk(sk)->rcv_saddr : tcptw_sk(sk)->tw_rcv_saddr;
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static inline const struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk)
+static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk)
{
return likely(sk->sk_state != TCP_TIME_WAIT) ?
&inet6_sk(sk)->rcv_saddr : &tcptw_sk(sk)->tw_v6_rcv_saddr;
}
-static inline const struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk)
+static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk)
{
return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL;
}
extern atomic_t tcp_orphan_count;
extern int tcp_tw_count;
extern void tcp_time_wait(struct sock *sk, int state, int timeo);
-extern void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
#define TCP_IPV6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \
(((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \
((__sk)->sk_family == AF_INET6) && \
- !ipv6_addr_cmp(&inet6_sk(__sk)->daddr, (__saddr)) && \
- !ipv6_addr_cmp(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \
+ ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \
+ ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
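/* A reading of the first test above (illustrative, not a new definition):
 * loading inet_sk(sk)->dport as a 32-bit word also covers the adjacent
 * 16-bit local-port field, so the source/destination port pair is checked
 * with a single word compare. The __ports value is expected to be packed
 * the same way, e.g. by the endian-dependent TCP_COMBINED_PORTS() helper.
 */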
/* These can have wildcards, don't try too hard. */
so that we select the tick to get a range of about 4 seconds.
*/
-#if HZ <= 16 || HZ > 4096
-# error Unsupported: HZ <= 16 or HZ > 4096
+#if HZ <= 16 || HZ > 32768
+# error Unsupported: HZ <= 16 or HZ > 32768
#elif HZ <= 32
# define TCP_TW_RECYCLE_TICK (5+2-TCP_TW_RECYCLE_SLOTS_LOG)
#elif HZ <= 64
# define TCP_TW_RECYCLE_TICK (10+2-TCP_TW_RECYCLE_SLOTS_LOG)
#elif HZ <= 2048
# define TCP_TW_RECYCLE_TICK (11+2-TCP_TW_RECYCLE_SLOTS_LOG)
-#else
+#elif HZ <= 4096
# define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
+#elif HZ <= 8192
+# define TCP_TW_RECYCLE_TICK (13+2-TCP_TW_RECYCLE_SLOTS_LOG)
+#elif HZ <= 16384
+# define TCP_TW_RECYCLE_TICK (14+2-TCP_TW_RECYCLE_SLOTS_LOG)
+#else
+# define TCP_TW_RECYCLE_TICK (15+2-TCP_TW_RECYCLE_SLOTS_LOG)
#endif
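/* Worked example (illustrative; assumes TCP_TW_RECYCLE_SLOTS_LOG = 5, i.e.
 * 32 slots, as defined elsewhere in this header): at HZ = 2048 the branch
 * above yields TCP_TW_RECYCLE_TICK = 11 + 2 - 5 = 8, so one slot spans
 * 2^8 = 256 jiffies = 0.125 s and the whole wheel covers 32 * 0.125 s = 4 s,
 * matching the "about 4 seconds" target in the comment above.
 */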
-#define BICTCP_1_OVER_BETA 8 /*
- * Fast recovery
- * multiplicative decrease factor
+#define BICTCP_BETA_SCALE 1024 /* Scale factor for beta calculation
+ * max_cwnd = snd_cwnd * beta
*/
#define BICTCP_MAX_INCREMENT 32 /*
* Limit on the amount of
extern int sysctl_tcp_bic;
extern int sysctl_tcp_bic_fast_convergence;
extern int sysctl_tcp_bic_low_window;
+extern int sysctl_tcp_bic_beta;
+extern int sysctl_tcp_moderate_rcvbuf;
+extern int sysctl_tcp_tso_win_divisor;
extern atomic_t tcp_memory_allocated;
extern atomic_t tcp_sockets_allocated;
int (*setsockopt) (struct sock *sk,
int level,
int optname,
- char *optval,
+ char __user *optval,
int optlen);
int (*getsockopt) (struct sock *sk,
int level,
int optname,
- char *optval,
- int *optlen);
+ char __user *optval,
+ int __user *optlen);
void (*addr2sockaddr) (struct sock *sk,
struct tcphdr *th,
unsigned len);
+extern void tcp_rcv_space_adjust(struct sock *sk);
+
enum tcp_ack_state_t
{
TCP_ACK_SCHED = 1,
TCP_ACK_PUSHED= 4
};
-static inline void tcp_schedule_ack(struct tcp_opt *tp)
+static inline void tcp_schedule_ack(struct tcp_sock *tp)
{
tp->ack.pending |= TCP_ACK_SCHED;
}
-static inline int tcp_ack_scheduled(struct tcp_opt *tp)
+static inline int tcp_ack_scheduled(struct tcp_sock *tp)
{
return tp->ack.pending&TCP_ACK_SCHED;
}
-static __inline__ void tcp_dec_quickack_mode(struct tcp_opt *tp)
+static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp)
{
if (tp->ack.quick && --tp->ack.quick == 0) {
/* Leaving quickack mode we deflate ATO. */
}
}
-extern void tcp_enter_quickack_mode(struct tcp_opt *tp);
+extern void tcp_enter_quickack_mode(struct tcp_sock *tp);
-static __inline__ void tcp_delack_init(struct tcp_opt *tp)
+static __inline__ void tcp_delack_init(struct tcp_sock *tp)
{
memset(&tp->ack, 0, sizeof(tp->ack));
}
-static inline void tcp_clear_options(struct tcp_opt *tp)
+static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
{
- tp->tstamp_ok = tp->sack_ok = tp->wscale_ok = tp->snd_wscale = 0;
+ rx_opt->tstamp_ok = rx_opt->sack_ok = rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
}
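/* Usage sketch (illustrative): with the received options split out into
 * struct tcp_options_received, callers pass the embedded block rather than
 * the whole socket state, e.g.:
 *
 *	struct tcp_sock *tp = tcp_sk(sk);
 *	tcp_clear_options(&tp->rx_opt);
 */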
enum tcp_tw_status
struct sk_buff *skb);
extern void tcp_enter_frto(struct sock *sk);
extern void tcp_enter_loss(struct sock *sk, int how);
-extern void tcp_clear_retrans(struct tcp_opt *tp);
+extern void tcp_clear_retrans(struct tcp_sock *tp);
extern void tcp_update_metrics(struct sock *sk);
extern void tcp_close(struct sock *sk,
long timeout);
extern struct sock * tcp_accept(struct sock *sk, int flags, int *err);
extern unsigned int tcp_poll(struct file * file, struct socket *sock, struct poll_table_struct *wait);
-extern void tcp_write_space(struct sock *sk);
extern int tcp_getsockopt(struct sock *sk, int level,
- int optname, char *optval,
- int *optlen);
+ int optname,
+ char __user *optval,
+ int __user *optlen);
extern int tcp_setsockopt(struct sock *sk, int level,
- int optname, char *optval,
+ int optname, char __user *optval,
int optlen);
extern void tcp_set_keepalive(struct sock *sk, int val);
extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
extern int tcp_listen_start(struct sock *sk);
extern void tcp_parse_options(struct sk_buff *skb,
- struct tcp_opt *tp,
+ struct tcp_options_received *opt_rx,
int estab);
/*
extern int tcp_retransmit_skb(struct sock *, struct sk_buff *);
extern void tcp_xmit_retransmit_queue(struct sock *);
extern void tcp_simple_retransmit(struct sock *);
+extern int tcp_trim_head(struct sock *, struct sk_buff *, u32);
extern void tcp_send_probe0(struct sock *);
extern void tcp_send_partial(struct sock *);
extern void tcp_send_fin(struct sock *sk);
extern void tcp_send_active_reset(struct sock *sk, int priority);
extern int tcp_send_synack(struct sock *);
-extern int tcp_transmit_skb(struct sock *, struct sk_buff *);
extern void tcp_push_one(struct sock *, unsigned mss_now);
extern void tcp_send_ack(struct sock *sk);
extern void tcp_send_delayed_ack(struct sock *sk);
extern void tcp_init_xmit_timers(struct sock *);
extern void tcp_clear_xmit_timers(struct sock *);
-extern void tcp_delete_keepalive_timer (struct sock *);
-extern void tcp_reset_keepalive_timer (struct sock *, unsigned long);
-extern int tcp_sync_mss(struct sock *sk, u32 pmtu);
+extern void tcp_delete_keepalive_timer(struct sock *);
+extern void tcp_reset_keepalive_timer(struct sock *, unsigned long);
+extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
+extern unsigned int tcp_current_mss(struct sock *sk, int large);
+
+#ifdef TCP_DEBUG
+extern const char tcp_timer_bug_msg[];
+#endif
-extern const char timer_bug_msg[];
+/* tcp_diag.c */
+extern void tcp_get_info(struct sock *, struct tcp_info *);
/* Read 'sendfile()'-style from a TCP socket */
typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
{
- struct tcp_opt *tp = tcp_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
switch (what) {
case TCP_TIME_RETRANS:
tp->pending = 0;
#ifdef TCP_CLEAR_TIMERS
- if (timer_pending(&tp->retransmit_timer) &&
- del_timer(&tp->retransmit_timer))
- __sock_put(sk);
+ sk_stop_timer(sk, &tp->retransmit_timer);
#endif
break;
case TCP_TIME_DACK:
tp->ack.pending = 0;
#ifdef TCP_CLEAR_TIMERS
- if (timer_pending(&tp->delack_timer) &&
- del_timer(&tp->delack_timer))
- __sock_put(sk);
+ sk_stop_timer(sk, &tp->delack_timer);
#endif
break;
default:
- printk(timer_bug_msg);
+#ifdef TCP_DEBUG
+ printk(tcp_timer_bug_msg);
+#endif
return;
};
*/
static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
{
- struct tcp_opt *tp = tcp_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
if (when > TCP_RTO_MAX) {
#ifdef TCP_DEBUG
case TCP_TIME_PROBE0:
tp->pending = what;
tp->timeout = jiffies+when;
- if (!mod_timer(&tp->retransmit_timer, tp->timeout))
- sock_hold(sk);
+ sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
break;
case TCP_TIME_DACK:
tp->ack.pending |= TCP_ACK_TIMER;
tp->ack.timeout = jiffies+when;
- if (!mod_timer(&tp->delack_timer, tp->ack.timeout))
- sock_hold(sk);
+ sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
break;
default:
- printk(KERN_DEBUG "bug: unknown timer value\n");
+#ifdef TCP_DEBUG
+ printk(tcp_timer_bug_msg);
+#endif
};
}
-/* Compute the current effective MSS, taking SACKs and IP options,
- * and even PMTU discovery events into account.
- *
- * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
- * cannot be large. However, taking into account rare use of URG, this
- * is not a big flaw.
- */
-
-static __inline__ unsigned int tcp_current_mss(struct sock *sk, int large)
-{
- struct tcp_opt *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
- int mss_now = large && (sk->sk_route_caps & NETIF_F_TSO) &&
- !tp->urg_mode ?
- tp->mss_cache : tp->mss_cache_std;
-
- if (dst) {
- u32 mtu = dst_pmtu(dst);
- if (mtu != tp->pmtu_cookie ||
- tp->ext2_header_len != dst->header_len)
- mss_now = tcp_sync_mss(sk, mtu);
- }
- if (tp->eff_sacks)
- mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
- return mss_now;
-}
-
/* Initialize RCV_MSS value.
* RCV_MSS is our guess about the MSS used by the peer.
* We haven't any direct information about the MSS.
static inline void tcp_initialize_rcv_mss(struct sock *sk)
{
- struct tcp_opt *tp = tcp_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
unsigned int hint = min(tp->advmss, tp->mss_cache_std);
hint = min(hint, tp->rcv_wnd/2);
tp->ack.rcv_mss = hint;
}
-static __inline__ void __tcp_fast_path_on(struct tcp_opt *tp, u32 snd_wnd)
+static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
ntohl(TCP_FLAG_ACK) |
snd_wnd);
}
-static __inline__ void tcp_fast_path_on(struct tcp_opt *tp)
+static __inline__ void tcp_fast_path_on(struct tcp_sock *tp)
{
- __tcp_fast_path_on(tp, tp->snd_wnd>>tp->snd_wscale);
+ __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
}
-static inline void tcp_fast_path_check(struct sock *sk, struct tcp_opt *tp)
+static inline void tcp_fast_path_check(struct sock *sk, struct tcp_sock *tp)
{
if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
tp->rcv_wnd &&
* Rcv_nxt can be after the window if our peer pushes more data
* than the offered window.
*/
-static __inline__ u32 tcp_receive_window(struct tcp_opt *tp)
+static __inline__ u32 tcp_receive_window(const struct tcp_sock *tp)
{
s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
-#define for_retrans_queue(skb, sk, tp) \
- for (skb = (sk)->sk_write_queue.next; \
- (skb != (tp)->send_head) && \
- (skb != (struct sk_buff *)&(sk)->sk_write_queue); \
- skb=skb->next)
-
-
#include <net/tcp_ecn.h>
-
-/*
- * Compute minimal free write space needed to queue new packets.
+/* Due to TSO, an SKB can be composed of multiple actual
+ * packets. To keep these tracked properly, we use this.
*/
-static inline int tcp_min_write_space(struct sock *sk)
+static inline int tcp_skb_pcount(const struct sk_buff *skb)
{
- return sk->sk_wmem_queued / 2;
+ return skb_shinfo(skb)->tso_segs;
}
-
-static inline int tcp_wspace(struct sock *sk)
+
+/* This is valid iff tcp_skb_pcount() > 1. */
+static inline int tcp_skb_mss(const struct sk_buff *skb)
{
- return sk->sk_sndbuf - sk->sk_wmem_queued;
+ return skb_shinfo(skb)->tso_size;
}
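/* Worked example (illustrative): a TSO skb carrying 4380 bytes of payload
 * built with tso_size = 1460 has tso_segs = 3, so tcp_skb_pcount() reports
 * three packets for accounting while tcp_skb_mss() reports the 1460-byte
 * segment size the hardware will use when splitting it on the wire.
 */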
+static inline void tcp_dec_pcount_approx(__u32 *count,
+ const struct sk_buff *skb)
+{
+ if (*count) {
+ *count -= tcp_skb_pcount(skb);
+ if ((int)*count < 0)
+ *count = 0;
+ }
+}
+
+static inline void tcp_packets_out_inc(struct sock *sk,
+ struct tcp_sock *tp,
+ const struct sk_buff *skb)
+{
+ int orig = tp->packets_out;
+
+ tp->packets_out += tcp_skb_pcount(skb);
+ if (!orig)
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+}
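/* Note (a reading of the helper above): the retransmit timer is armed only
 * on the 0 -> nonzero transition of packets_out, so queueing further
 * segments while data is already in flight does not re-arm it here.
 */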
+
+static inline void tcp_packets_out_dec(struct tcp_sock *tp,
+ const struct sk_buff *skb)
+{
+ tp->packets_out -= tcp_skb_pcount(skb);
+}
/* This determines how many packets are "in the network" to the best
* of our knowledge. In many cases it is conservative, but where
* "Packets left network, but not honestly ACKed yet" PLUS
* "Packets fast retransmitted"
*/
-static __inline__ unsigned int tcp_packets_in_flight(struct tcp_opt *tp)
+static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
{
- return tp->packets_out - tp->left_out + tp->retrans_out;
+ return (tp->packets_out - tp->left_out + tp->retrans_out);
}
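/* Worked example (illustrative): with packets_out = 10, sacked_out = 2,
 * lost_out = 1 and retrans_out = 1, left_out (= sacked_out + lost_out) is 3,
 * so the estimate is 10 - 3 + 1 = 8 segments still in the network.
 */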
+/*
+ * Which congestion algorithm is in use on the connection.
+ */
+#define tcp_is_vegas(__tp) ((__tp)->adv_cong == TCP_VEGAS)
+#define tcp_is_westwood(__tp) ((__tp)->adv_cong == TCP_WESTWOOD)
+#define tcp_is_bic(__tp) ((__tp)->adv_cong == TCP_BIC)
+
/* Recalculate snd_ssthresh, we want to set it to:
*
* Reno:
* behave like Reno until low_window is reached,
* then increase congestion window slowly
*/
-static inline __u32 tcp_recalc_ssthresh(struct tcp_opt *tp)
+static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp)
{
- if (sysctl_tcp_bic) {
+ if (tcp_is_bic(tp)) {
if (sysctl_tcp_bic_fast_convergence &&
tp->snd_cwnd < tp->bictcp.last_max_cwnd)
- tp->bictcp.last_max_cwnd
- = (tp->snd_cwnd * (2*BICTCP_1_OVER_BETA-1))
- / (BICTCP_1_OVER_BETA/2);
+ tp->bictcp.last_max_cwnd = (tp->snd_cwnd *
+ (BICTCP_BETA_SCALE
+ + sysctl_tcp_bic_beta))
+ / (2 * BICTCP_BETA_SCALE);
else
tp->bictcp.last_max_cwnd = tp->snd_cwnd;
if (tp->snd_cwnd > sysctl_tcp_bic_low_window)
- return max(tp->snd_cwnd - (tp->snd_cwnd/BICTCP_1_OVER_BETA),
- 2U);
+ return max((tp->snd_cwnd * sysctl_tcp_bic_beta)
+ / BICTCP_BETA_SCALE, 2U);
}
return max(tp->snd_cwnd >> 1U, 2U);
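/* Worked example (illustrative; assumes sysctl_tcp_bic_beta defaults to 819,
 * i.e. beta ~= 0.8): on loss with snd_cwnd = 100 above the low-window
 * threshold, the BIC branch returns max(100 * 819 / 1024, 2) = 79, while the
 * generic return just above would halve it: max(100 >> 1, 2) = 50. With fast
 * convergence and snd_cwnd below the previous maximum, last_max_cwnd is set
 * to 100 * (1024 + 819) / 2048 = 89 instead of 100.
 */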
/* Stop taking Vegas samples for now. */
#define tcp_vegas_disable(__tp) ((__tp)->vegas.doing_vegas_now = 0)
-
-/* Is this TCP connection using Vegas (regardless of whether it is taking
- * Vegas measurements at the current time)?
- */
-#define tcp_is_vegas(__tp) ((__tp)->vegas.do_vegas)
-static inline void tcp_vegas_enable(struct tcp_opt *tp)
+static inline void tcp_vegas_enable(struct tcp_sock *tp)
{
/* There are several situations when we must "re-start" Vegas:
*
/* Should we be taking Vegas samples right now? */
#define tcp_vegas_enabled(__tp) ((__tp)->vegas.doing_vegas_now)
-extern void tcp_vegas_init(struct tcp_opt *tp);
+extern void tcp_ca_init(struct tcp_sock *tp);
-static inline void tcp_set_ca_state(struct tcp_opt *tp, u8 ca_state)
+static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
{
if (tcp_is_vegas(tp)) {
if (ca_state == TCP_CA_Open)
* The exception is rate halving phase, when cwnd is decreasing towards
* ssthresh.
*/
-static inline __u32 tcp_current_ssthresh(struct tcp_opt *tp)
+static inline __u32 tcp_current_ssthresh(struct tcp_sock *tp)
{
if ((1<<tp->ca_state)&(TCPF_CA_CWR|TCPF_CA_Recovery))
return tp->snd_ssthresh;
(tp->snd_cwnd >> 2)));
}
-static inline void tcp_sync_left_out(struct tcp_opt *tp)
+static inline void tcp_sync_left_out(struct tcp_sock *tp)
{
- if (tp->sack_ok && tp->sacked_out >= tp->packets_out - tp->lost_out)
+ if (tp->rx_opt.sack_ok &&
+ (tp->sacked_out >= tp->packets_out - tp->lost_out))
tp->sacked_out = tp->packets_out - tp->lost_out;
tp->left_out = tp->sacked_out + tp->lost_out;
}
/* Congestion window validation. (RFC2861) */
-static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_opt *tp)
+static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
{
- if (tp->packets_out >= tp->snd_cwnd) {
+ __u32 packets_out = tp->packets_out;
+
+ if (packets_out >= tp->snd_cwnd) {
/* Network is fed fully. */
tp->snd_cwnd_used = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
}
/* Set slow start threshold and cwnd without falling back into slow start */
-static inline void __tcp_enter_cwr(struct tcp_opt *tp)
+static inline void __tcp_enter_cwr(struct tcp_sock *tp)
{
tp->undo_marker = 0;
tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
TCP_ECN_queue_cwr(tp);
}
-static inline void tcp_enter_cwr(struct tcp_opt *tp)
+static inline void tcp_enter_cwr(struct tcp_sock *tp)
{
tp->prior_ssthresh = 0;
if (tp->ca_state < TCP_CA_CWR) {
}
}
-extern __u32 tcp_init_cwnd(struct tcp_opt *tp, struct dst_entry *dst);
+extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
/* Slow start with delack produces 3 packets of burst, so that
* it is safe "de facto".
*/
-static __inline__ __u32 tcp_max_burst(struct tcp_opt *tp)
+static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp)
{
return 3;
}
-static __inline__ int tcp_minshall_check(struct tcp_opt *tp)
+static __inline__ int tcp_minshall_check(const struct tcp_sock *tp)
{
return after(tp->snd_sml,tp->snd_una) &&
!after(tp->snd_sml, tp->snd_nxt);
}
-static __inline__ void tcp_minshall_update(struct tcp_opt *tp, int mss, struct sk_buff *skb)
+static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss,
+ const struct sk_buff *skb)
{
if (skb->len < mss)
tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
*/
static __inline__ int
-tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, unsigned mss_now, int nonagle)
+tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb,
+ unsigned mss_now, int nonagle)
{
return (skb->len < mss_now &&
!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
tcp_minshall_check(tp))));
}
-/* This checks if the data bearing packet SKB (usually tp->send_head)
+extern void tcp_set_skb_tso_segs(struct sk_buff *, unsigned int);
+
+/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
* should be put on the wire right now.
*/
-static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
+static __inline__ int tcp_snd_test(const struct tcp_sock *tp,
+ struct sk_buff *skb,
unsigned cur_mss, int nonagle)
{
+ int pkts = tcp_skb_pcount(skb);
+
+ if (!pkts) {
+ tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
+ pkts = tcp_skb_pcount(skb);
+ }
+
/* RFC 1122 - section 4.2.3.4
*
* We must queue if
*/
return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode
|| !tcp_nagle_check(tp, skb, cur_mss, nonagle)) &&
- ((tcp_packets_in_flight(tp) < tp->snd_cwnd) ||
+ (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) ||
(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) &&
!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd));
}
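/* Worked example (illustrative): for a three-segment TSO frame (pkts = 3)
 * the congestion-window clause above requires in_flight + 2 < snd_cwnd, so
 * the window must have room for every segment the frame will turn into, not
 * just for a single packet as with the pre-TSO test.
 */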
-static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_opt *tp)
+static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp)
{
if (!tp->packets_out && !tp->pending)
tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto);
}
-static __inline__ int tcp_skb_is_last(struct sock *sk, struct sk_buff *skb)
+static __inline__ int tcp_skb_is_last(const struct sock *sk,
+ const struct sk_buff *skb)
{
return skb->next == (struct sk_buff *)&sk->sk_write_queue;
}
* The socket must be locked by the caller.
*/
static __inline__ void __tcp_push_pending_frames(struct sock *sk,
- struct tcp_opt *tp,
+ struct tcp_sock *tp,
unsigned cur_mss,
int nonagle)
{
- struct sk_buff *skb = tp->send_head;
+ struct sk_buff *skb = sk->sk_send_head;
if (skb) {
if (!tcp_skb_is_last(sk, skb))
}
static __inline__ void tcp_push_pending_frames(struct sock *sk,
- struct tcp_opt *tp)
+ struct tcp_sock *tp)
{
__tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle);
}
-static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp)
+static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
{
- struct sk_buff *skb = tp->send_head;
+ struct sk_buff *skb = sk->sk_send_head;
return (skb &&
tcp_snd_test(tp, skb, tcp_current_mss(sk, 1),
tcp_skb_is_last(sk, skb) ? TCP_NAGLE_PUSH : tp->nonagle));
}
-static __inline__ void tcp_init_wl(struct tcp_opt *tp, u32 ack, u32 seq)
+static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq)
{
tp->snd_wl1 = seq;
}
-static __inline__ void tcp_update_wl(struct tcp_opt *tp, u32 ack, u32 seq)
+static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq)
{
tp->snd_wl1 = seq;
}
-extern void tcp_destroy_sock(struct sock *sk);
+extern void tcp_destroy_sock(struct sock *sk);
/*
/* Prequeue for VJ style copy to user, combined with checksumming. */
-static __inline__ void tcp_prequeue_init(struct tcp_opt *tp)
+static __inline__ void tcp_prequeue_init(struct tcp_sock *tp)
{
tp->ucopy.task = NULL;
tp->ucopy.len = 0;
*/
static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_opt *tp = tcp_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
if (!sysctl_tcp_low_latency && tp->ucopy.task) {
__skb_queue_tail(&tp->ucopy.prequeue, skb);
while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
sk->sk_backlog_rcv(sk, skb1);
- NET_INC_STATS_BH(TCPPrequeueDropped);
+ NET_INC_STATS_BH(LINUX_MIB_TCPPREQUEUEDROPPED);
}
tp->ucopy.memory = 0;
#undef STATE_TRACE
#ifdef STATE_TRACE
-static char *statename[]={
+static const char *statename[]={
"Unused","Established","Syn Sent","Syn Recv",
"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
"Close Wait","Last ACK","Listen","Closing"
switch (state) {
case TCP_ESTABLISHED:
if (oldstate != TCP_ESTABLISHED)
- TCP_INC_STATS(TcpCurrEstab);
+ TCP_INC_STATS(TCP_MIB_CURRESTAB);
break;
case TCP_CLOSE:
if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
- TCP_INC_STATS(TcpEstabResets);
+ TCP_INC_STATS(TCP_MIB_ESTABRESETS);
sk->sk_prot->unhash(sk);
if (tcp_sk(sk)->bind_hash &&
/* fall through */
default:
if (oldstate==TCP_ESTABLISHED)
- TCP_DEC_STATS(TcpCurrEstab);
+ TCP_DEC_STATS(TCP_MIB_CURRESTAB);
}
/* Change state AFTER socket is unhashed to avoid closed
tcp_destroy_sock(sk);
}
-static __inline__ void tcp_sack_reset(struct tcp_opt *tp)
+static __inline__ void tcp_sack_reset(struct tcp_options_received *rx_opt)
{
- tp->dsack = 0;
- tp->eff_sacks = 0;
- tp->num_sacks = 0;
+ rx_opt->dsack = 0;
+ rx_opt->eff_sacks = 0;
+ rx_opt->num_sacks = 0;
}
-static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_opt *tp, __u32 tstamp)
+static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, __u32 tstamp)
{
- if (tp->tstamp_ok) {
+ if (tp->rx_opt.tstamp_ok) {
*ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
*ptr++ = htonl(tstamp);
- *ptr++ = htonl(tp->ts_recent);
+ *ptr++ = htonl(tp->rx_opt.ts_recent);
}
- if (tp->eff_sacks) {
- struct tcp_sack_block *sp = tp->dsack ? tp->duplicate_sack : tp->selective_acks;
+ if (tp->rx_opt.eff_sacks) {
+ struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
int this_sack;
*ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK << 8) |
(TCPOLEN_SACK_BASE +
- (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK)));
- for(this_sack = 0; this_sack < tp->eff_sacks; this_sack++) {
+ (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)));
+ for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
*ptr++ = htonl(sp[this_sack].start_seq);
*ptr++ = htonl(sp[this_sack].end_seq);
}
- if (tp->dsack) {
- tp->dsack = 0;
- tp->eff_sacks--;
+ if (tp->rx_opt.dsack) {
+ tp->rx_opt.dsack = 0;
+ tp->rx_opt.eff_sacks--;
}
}
}
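/* Layout example (illustrative, using the TCPOLEN_* values defined elsewhere
 * in this header): with tstamp_ok set the first word above encodes
 * NOP|NOP|TIMESTAMP|TCPOLEN_TIMESTAMP and is followed by tsval and
 * ts_recent, 12 bytes in all; with rx_opt.eff_sacks = 2 the SACK header word
 * advertises TCPOLEN_SACK_BASE + 2 * TCPOLEN_SACK_PERBLOCK = 2 + 16 = 18
 * bytes, followed by the two start/end sequence pairs.
 */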
*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
}
-/* Determine a window scaling and initial window to offer.
- * Based on the assumption that the given amount of space
- * will be offered. Store the results in the tp structure.
- * NOTE: for smooth operation initial space offering should
- * be a multiple of mss if possible. We assume here that mss >= 1.
- * This MUST be enforced by all callers.
- */
-static inline void tcp_select_initial_window(int __space, __u32 mss,
- __u32 *rcv_wnd,
- __u32 *window_clamp,
- int wscale_ok,
- __u8 *rcv_wscale)
-{
- unsigned int space = (__space < 0 ? 0 : __space);
-
- /* If no clamp set the clamp to the max possible scaled window */
- if (*window_clamp == 0)
- (*window_clamp) = (65535 << 14);
- space = min(*window_clamp, space);
-
- /* Quantize space offering to a multiple of mss if possible. */
- if (space > mss)
- space = (space / mss) * mss;
-
- /* NOTE: offering an initial window larger than 32767
- * will break some buggy TCP stacks. We try to be nice.
- * If we are not window scaling, then this truncates
- * our initial window offering to 32k. There should also
- * be a sysctl option to stop being nice.
- */
- (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
- (*rcv_wscale) = 0;
- if (wscale_ok) {
- /* See RFC1323 for an explanation of the limit to 14 */
- while (space > 65535 && (*rcv_wscale) < 14) {
- space >>= 1;
- (*rcv_wscale)++;
- }
- if (*rcv_wscale && sysctl_tcp_app_win && space>=mss &&
- space - max((space>>sysctl_tcp_app_win), mss>>*rcv_wscale) < 65536/2)
- (*rcv_wscale)--;
- }
-
- /* Set initial window to value enough for senders,
- * following RFC1414. Senders, not following this RFC,
- * will be satisfied with 2.
- */
- if (mss > (1<<*rcv_wscale)) {
- int init_cwnd = 4;
- if (mss > 1460*3)
- init_cwnd = 2;
- else if (mss > 1460)
- init_cwnd = 3;
- if (*rcv_wnd > init_cwnd*mss)
- *rcv_wnd = init_cwnd*mss;
- }
- /* Set the clamp no higher than max representable value */
- (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
-}
+/* Determine a window scaling and initial window to offer. */
+extern void tcp_select_initial_window(int __space, __u32 mss,
+ __u32 *rcv_wnd, __u32 *window_clamp,
+ int wscale_ok, __u8 *rcv_wscale);
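/* Usage sketch (illustrative only, not the exact call sites): the
 * out-of-line helper is typically handed the full receive buffer and the
 * advertised MSS, e.g. when building a SYN-ACK:
 *
 *	tcp_select_initial_window(tcp_full_space(sk), mss,
 *				  &req->rcv_wnd, &req->window_clamp,
 *				  req->wscale_ok, &rcv_wscale);
 */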
static inline int tcp_win_from_space(int space)
{
}
/* Note: caller must be prepared to deal with negative returns */
-static inline int tcp_space(struct sock *sk)
+static inline int tcp_space(const struct sock *sk)
{
return tcp_win_from_space(sk->sk_rcvbuf -
atomic_read(&sk->sk_rmem_alloc));
}
-static inline int tcp_full_space( struct sock *sk)
+static inline int tcp_full_space(const struct sock *sk)
{
return tcp_win_from_space(sk->sk_rcvbuf);
}
-static inline void tcp_acceptq_removed(struct sock *sk)
-{
- sk->sk_ack_backlog--;
-}
-
-static inline void tcp_acceptq_added(struct sock *sk)
-{
- sk->sk_ack_backlog++;
-}
-
-static inline int tcp_acceptq_is_full(struct sock *sk)
-{
- return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
-}
-
static inline void tcp_acceptq_queue(struct sock *sk, struct open_request *req,
struct sock *child)
{
- struct tcp_opt *tp = tcp_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
req->sk = child;
- tcp_acceptq_added(sk);
+ sk_acceptq_added(sk);
if (!tp->accept_queue_tail) {
tp->accept_queue = req;
return tcp_synq_len(sk) >> tcp_sk(sk)->listen_opt->max_qlen_log;
}
-static inline void tcp_synq_unlink(struct tcp_opt *tp, struct open_request *req,
+static inline void tcp_synq_unlink(struct tcp_sock *tp, struct open_request *req,
struct open_request **prev)
{
write_lock(&tp->syn_wait_lock);
}
static __inline__ void tcp_openreq_init(struct open_request *req,
- struct tcp_opt *tp,
+ struct tcp_options_received *rx_opt,
struct sk_buff *skb)
{
req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->rcv_isn = TCP_SKB_CB(skb)->seq;
- req->mss = tp->mss_clamp;
- req->ts_recent = tp->saw_tstamp ? tp->rcv_tsval : 0;
- req->tstamp_ok = tp->tstamp_ok;
- req->sack_ok = tp->sack_ok;
- req->snd_wscale = tp->snd_wscale;
- req->wscale_ok = tp->wscale_ok;
+ req->mss = rx_opt->mss_clamp;
+ req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
+ req->tstamp_ok = rx_opt->tstamp_ok;
+ req->sack_ok = rx_opt->sack_ok;
+ req->snd_wscale = rx_opt->snd_wscale;
+ req->wscale_ok = rx_opt->wscale_ok;
req->acked = 0;
req->ecn_ok = 0;
req->rmt_port = skb->h.th->source;
}
-#define TCP_MEM_QUANTUM ((int)PAGE_SIZE)
-
-static inline void tcp_free_skb(struct sock *sk, struct sk_buff *skb)
-{
- tcp_sk(sk)->queue_shrunk = 1;
- sk->sk_wmem_queued -= skb->truesize;
- sk->sk_forward_alloc += skb->truesize;
- __kfree_skb(skb);
-}
-
-static inline void tcp_charge_skb(struct sock *sk, struct sk_buff *skb)
-{
- sk->sk_wmem_queued += skb->truesize;
- sk->sk_forward_alloc -= skb->truesize;
-}
-
-extern void __tcp_mem_reclaim(struct sock *sk);
-extern int tcp_mem_schedule(struct sock *sk, int size, int kind);
-
-static inline void tcp_mem_reclaim(struct sock *sk)
-{
- if (sk->sk_forward_alloc >= TCP_MEM_QUANTUM)
- __tcp_mem_reclaim(sk);
-}
-
-static inline void tcp_enter_memory_pressure(void)
-{
- if (!tcp_memory_pressure) {
- NET_INC_STATS(TCPMemoryPressures);
- tcp_memory_pressure = 1;
- }
-}
-
-static inline void tcp_moderate_sndbuf(struct sock *sk)
-{
- if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) {
- sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued / 2);
- sk->sk_sndbuf = max(sk->sk_sndbuf, SOCK_MIN_SNDBUF);
- }
-}
-
-static inline struct sk_buff *tcp_alloc_pskb(struct sock *sk, int size, int mem, int gfp)
-{
- struct sk_buff *skb = alloc_skb(size+MAX_TCP_HEADER, gfp);
-
- if (skb) {
- skb->truesize += mem;
- if (sk->sk_forward_alloc >= (int)skb->truesize ||
- tcp_mem_schedule(sk, skb->truesize, 0)) {
- skb_reserve(skb, MAX_TCP_HEADER);
- return skb;
- }
- __kfree_skb(skb);
- } else {
- tcp_enter_memory_pressure();
- tcp_moderate_sndbuf(sk);
- }
- return NULL;
-}
-
-static inline struct sk_buff *tcp_alloc_skb(struct sock *sk, int size, int gfp)
-{
- return tcp_alloc_pskb(sk, size, 0, gfp);
-}
-
-static inline struct page * tcp_alloc_page(struct sock *sk)
-{
- if (sk->sk_forward_alloc >= (int)PAGE_SIZE ||
- tcp_mem_schedule(sk, PAGE_SIZE, 0)) {
- struct page *page = alloc_pages(sk->sk_allocation, 0);
- if (page)
- return page;
- }
- tcp_enter_memory_pressure();
- tcp_moderate_sndbuf(sk);
- return NULL;
-}
-
-static inline void tcp_writequeue_purge(struct sock *sk)
-{
- struct sk_buff *skb;
-
- while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
- tcp_free_skb(sk, skb);
- tcp_mem_reclaim(sk);
-}
-
-extern void tcp_rfree(struct sk_buff *skb);
-
-static inline void tcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
-{
- skb->sk = sk;
- skb->destructor = tcp_rfree;
- atomic_add(skb->truesize, &sk->sk_rmem_alloc);
- sk->sk_forward_alloc -= skb->truesize;
-}
+extern void tcp_enter_memory_pressure(void);
extern void tcp_listen_wlock(void);
wake_up(&tcp_lhash_wait);
}
-static inline int keepalive_intvl_when(struct tcp_opt *tp)
+static inline int keepalive_intvl_when(const struct tcp_sock *tp)
{
return tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl;
}
-static inline int keepalive_time_when(struct tcp_opt *tp)
+static inline int keepalive_time_when(const struct tcp_sock *tp)
{
return tp->keepalive_time ? : sysctl_tcp_keepalive_time;
}
-static inline int tcp_fin_time(struct tcp_opt *tp)
+static inline int tcp_fin_time(const struct tcp_sock *tp)
{
int fin_timeout = tp->linger2 ? : sysctl_tcp_fin_timeout;
return fin_timeout;
}
-static inline int tcp_paws_check(struct tcp_opt *tp, int rst)
+static inline int tcp_paws_check(const struct tcp_options_received *rx_opt, int rst)
{
- if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0)
+ if ((s32)(rx_opt->rcv_tsval - rx_opt->ts_recent) >= 0)
return 0;
- if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS)
+ if (xtime.tv_sec >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)
return 0;
/* RST segments are not recommended to carry timestamp,
However, we can relax time bounds for RST segments to MSL.
*/
- if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL)
+ if (rst && xtime.tv_sec >= rx_opt->ts_recent_stamp + TCP_PAWS_MSL)
return 0;
return 1;
}
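/* Worked example (illustrative): the signed 32-bit subtraction handles
 * timestamp wrap-around, e.g. rcv_tsval = 5 against ts_recent = 0xfffffff0
 * gives (s32)21 >= 0, so the segment passes even though the raw value is
 * smaller. The function returns 1 (PAWS failure) only when the incoming
 * timestamp is genuinely older and ts_recent was refreshed within the last
 * 24 days (or within MSL for an RST).
 */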
static inline int tcp_use_frto(const struct sock *sk)
{
- const struct tcp_opt *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
/* F-RTO must be activated in sysctl and there must be some
* unsent new data, and the advertised window should allow
* sending it.
*/
- return (sysctl_tcp_frto && tp->send_head &&
- !after(TCP_SKB_CB(tp->send_head)->end_seq,
+ return (sysctl_tcp_frto && sk->sk_send_head &&
+ !after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
tp->snd_una + tp->snd_wnd));
}
static inline void tcp_mib_init(void)
{
/* See RFC 2012 */
- TCP_ADD_STATS_USER(TcpRtoAlgorithm, 1);
- TCP_ADD_STATS_USER(TcpRtoMin, TCP_RTO_MIN*1000/HZ);
- TCP_ADD_STATS_USER(TcpRtoMax, TCP_RTO_MAX*1000/HZ);
- TCP_ADD_STATS_USER(TcpMaxConn, -1);
+ TCP_ADD_STATS_USER(TCP_MIB_RTOALGORITHM, 1);
+ TCP_ADD_STATS_USER(TCP_MIB_RTOMIN, TCP_RTO_MIN*1000/HZ);
+ TCP_ADD_STATS_USER(TCP_MIB_RTOMAX, TCP_RTO_MAX*1000/HZ);
+ TCP_ADD_STATS_USER(TCP_MIB_MAXCONN, -1);
}
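/* Worked example (illustrative; assumes the usual TCP_RTO_MIN = HZ/5 and
 * TCP_RTO_MAX = 120*HZ definitions elsewhere in this header): the *1000/HZ
 * scaling reports the bounds in milliseconds independently of HZ, i.e.
 * 200 ms and 120000 ms.
 */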
/* /proc */
#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
-static inline void tcp_westwood_update_rtt(struct tcp_opt *tp, __u32 rtt_seq)
+static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq)
{
- if (sysctl_tcp_westwood)
+ if (tcp_is_westwood(tp))
tp->westwood.rtt = rtt_seq;
}
-void __tcp_westwood_fast_bw(struct sock *, struct sk_buff *);
-void __tcp_westwood_slow_bw(struct sock *, struct sk_buff *);
-
-static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
-{
- if (sysctl_tcp_westwood)
- __tcp_westwood_fast_bw(sk, skb);
-}
-
-static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
-{
- if (sysctl_tcp_westwood)
- __tcp_westwood_slow_bw(sk, skb);
-}
-
-static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_opt *tp)
+static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
{
return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) /
- (__u32) (tp->mss_cache),
+ (__u32) (tp->mss_cache_std),
2U);
}
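/* A reading of the formula above (illustrative; the units follow the
 * Westwood estimator elsewhere, which is an assumption here): bw_est in
 * bytes per jiffy times rtt_min in jiffies gives the bandwidth-delay product
 * in bytes, and dividing by mss_cache_std turns that into a congestion
 * window in segments, floored at 2.
 */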
-static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_opt *tp)
+static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
{
- return sysctl_tcp_westwood ? __tcp_westwood_bw_rttmin(tp) : 0;
+ return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0;
}
-static inline int tcp_westwood_ssthresh(struct tcp_opt *tp)
+static inline int tcp_westwood_ssthresh(struct tcp_sock *tp)
{
__u32 ssthresh = 0;
- if (sysctl_tcp_westwood) {
+ if (tcp_is_westwood(tp)) {
ssthresh = __tcp_westwood_bw_rttmin(tp);
if (ssthresh)
tp->snd_ssthresh = ssthresh;
return (ssthresh != 0);
}
-static inline int tcp_westwood_cwnd(struct tcp_opt *tp)
+static inline int tcp_westwood_cwnd(struct tcp_sock *tp)
{
__u32 cwnd = 0;
- if (sysctl_tcp_westwood) {
+ if (tcp_is_westwood(tp)) {
cwnd = __tcp_westwood_bw_rttmin(tp);
if (cwnd)
tp->snd_cwnd = cwnd;