X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fdccp%2Fccids%2Fccid3.c;h=40402c59506a32b5b5c12d3bc7ca53730d087740;hb=refs%2Fheads%2Fvserver;hp=35d1d347541c025cbddd421cabcf6f4cefe36baa;hpb=64ba3f394c830ec48a1c31b53dcae312c56f1604;p=linux-2.6.git diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 35d1d3475..40402c595 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -2,7 +2,7 @@ * net/dccp/ccids/ccid3.c * * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-6 Ian McDonald + * Copyright (c) 2005-6 Ian McDonald * * An implementation of the DCCP protocol * @@ -34,7 +34,6 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include #include "../ccid.h" #include "../dccp.h" #include "lib/packet_history.h" @@ -42,32 +41,9 @@ #include "lib/tfrc.h" #include "ccid3.h" -/* - * Reason for maths here is to avoid 32 bit overflow when a is big. - * With this we get close to the limit. - */ -static inline u32 usecs_div(const u32 a, const u32 b) -{ - const u32 div = a < (UINT_MAX / (USEC_PER_SEC / 10)) ? 10 : - a < (UINT_MAX / (USEC_PER_SEC / 50)) ? 50 : - a < (UINT_MAX / (USEC_PER_SEC / 100)) ? 100 : - a < (UINT_MAX / (USEC_PER_SEC / 500)) ? 500 : - a < (UINT_MAX / (USEC_PER_SEC / 1000)) ? 1000 : - a < (UINT_MAX / (USEC_PER_SEC / 5000)) ? 5000 : - a < (UINT_MAX / (USEC_PER_SEC / 10000)) ? 10000 : - a < (UINT_MAX / (USEC_PER_SEC / 50000)) ? 50000 : - 100000; - const u32 tmp = a * (USEC_PER_SEC / div); - return (b >= 2 * div) ? tmp / (b / div) : tmp; -} - +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG static int ccid3_debug; - -#ifdef CCID3_DEBUG -#define ccid3_pr_debug(format, a...) \ - do { if (ccid3_debug) \ - printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \ - } while (0) +#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a) #else #define ccid3_pr_debug(format, a...) #endif @@ -76,24 +52,7 @@ static struct dccp_tx_hist *ccid3_tx_hist; static struct dccp_rx_hist *ccid3_rx_hist; static struct dccp_li_hist *ccid3_li_hist; -static int ccid3_init(struct sock *sk) -{ - return 0; -} - -static void ccid3_exit(struct sock *sk) -{ -} - -/* TFRC sender states */ -enum ccid3_hc_tx_states { - TFRC_SSTATE_NO_SENT = 1, - TFRC_SSTATE_NO_FBACK, - TFRC_SSTATE_FBACK, - TFRC_SSTATE_TERM, -}; - -#ifdef CCID3_DEBUG +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) { static char *ccid3_state_names[] = { @@ -107,8 +66,8 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) } #endif -static inline void ccid3_hc_tx_set_state(struct sock *sk, - enum ccid3_hc_tx_states state) +static void ccid3_hc_tx_set_state(struct sock *sk, + enum ccid3_hc_tx_states state) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; @@ -120,324 +79,319 @@ static inline void ccid3_hc_tx_set_state(struct sock *sk, hctx->ccid3hctx_state = state; } -/* Calculate new t_ipi (inter packet interval) by t_ipi = s / X_inst */ -static inline void ccid3_calc_new_t_ipi(struct ccid3_hc_tx_sock *hctx) +/* + * Recalculate scheduled nominal send time t_nom, inter-packet interval + * t_ipi, and delta value. Should be called after each change to X. + */ +static inline void ccid3_update_send_time(struct ccid3_hc_tx_sock *hctx) { - /* - * If no feedback spec says t_ipi is 1 second (set elsewhere and then - * doubles after every no feedback timer (separate function) - */ - if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) - hctx->ccid3hctx_t_ipi = usecs_div(hctx->ccid3hctx_s, - hctx->ccid3hctx_x); -} + timeval_sub_usecs(&hctx->ccid3hctx_t_nom, hctx->ccid3hctx_t_ipi); -/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ -static inline void ccid3_calc_new_delta(struct ccid3_hc_tx_sock *hctx) -{ + /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ + hctx->ccid3hctx_t_ipi = scaled_div(hctx->ccid3hctx_s, + hctx->ccid3hctx_x >> 6); + + /* Update nominal send time with regard to the new t_ipi */ + timeval_add_usecs(&hctx->ccid3hctx_t_nom, hctx->ccid3hctx_t_ipi); + + /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN); } - /* * Update X by * If (p > 0) - * x_calc = calcX(s, R, p); + * X_calc = calcX(s, R, p); * X = max(min(X_calc, 2 * X_recv), s / t_mbi); * Else * If (now - tld >= R) * X = max(min(2 * X, 2 * X_recv), s / R); * tld = now; - */ -static void ccid3_hc_tx_update_x(struct sock *sk) + * + * Note: X and X_recv are both stored in units of 64 * bytes/second, to support + * fine-grained resolution of sending rates. This requires scaling by 2^6 + * throughout the code. Only X_calc is unscaled (in bytes/second). + * + * If X has changed, we also update the scheduled send time t_now, + * the inter-packet interval t_ipi, and the delta value. + */ +static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now) + { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + const __u64 old_x = hctx->ccid3hctx_x; - /* To avoid large error in calcX */ - if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) { - hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s, - hctx->ccid3hctx_rtt, - hctx->ccid3hctx_p); - hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_calc, - 2 * hctx->ccid3hctx_x_recv), - (hctx->ccid3hctx_s / - TFRC_MAX_BACK_OFF_TIME)); - } else { - struct timeval now; + if (hctx->ccid3hctx_p > 0) { - dccp_timestamp(sk, &now); - if (timeval_delta(&now, &hctx->ccid3hctx_t_ld) >= - hctx->ccid3hctx_rtt) { - hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_recv, - hctx->ccid3hctx_x) * 2, - usecs_div(hctx->ccid3hctx_s, - hctx->ccid3hctx_rtt)); - hctx->ccid3hctx_t_ld = now; - } + hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6, + hctx->ccid3hctx_x_recv * 2); + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, + (((__u64)hctx->ccid3hctx_s) << 6) / + TFRC_T_MBI); + + } else if (timeval_delta(now, &hctx->ccid3hctx_t_ld) - + (suseconds_t)hctx->ccid3hctx_rtt >= 0) { + + hctx->ccid3hctx_x = + max(2 * min(hctx->ccid3hctx_x, hctx->ccid3hctx_x_recv), + scaled_div(((__u64)hctx->ccid3hctx_s) << 6, + hctx->ccid3hctx_rtt)); + hctx->ccid3hctx_t_ld = *now; + } + + if (hctx->ccid3hctx_x != old_x) + ccid3_update_send_time(hctx); +} + +/* + * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1) + * @len: DCCP packet payload size in bytes + */ +static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) +{ + if (unlikely(len == 0)) + ccid3_pr_debug("Packet payload length is 0 - not updating\n"); + else + hctx->ccid3hctx_s = hctx->ccid3hctx_s == 0 ? len : + (9 * hctx->ccid3hctx_s + len) / 10; + /* + * Note: We could do a potential optimisation here - when `s' changes, + * recalculate sending rate and consequently t_ipi, t_delta, and + * t_now. This is however non-standard, and the benefits are not + * clear, so it is currently left out. + */ +} + +/* + * Update Window Counter using the algorithm from [RFC 4342, 8.1]. + * The algorithm is not applicable if RTT < 4 microseconds. + */ +static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, + struct timeval *now) +{ + suseconds_t delta; + u32 quarter_rtts; + + if (unlikely(hctx->ccid3hctx_rtt < 4)) /* avoid divide-by-zero */ + return; + + delta = timeval_delta(now, &hctx->ccid3hctx_t_last_win_count); + DCCP_BUG_ON(delta < 0); + + quarter_rtts = (u32)delta / (hctx->ccid3hctx_rtt / 4); + + if (quarter_rtts > 0) { + hctx->ccid3hctx_t_last_win_count = *now; + hctx->ccid3hctx_last_win_count += min_t(u32, quarter_rtts, 5); + hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */ + + ccid3_pr_debug("now at %#X\n", hctx->ccid3hctx_last_win_count); } } static void ccid3_hc_tx_no_feedback_timer(unsigned long data) { struct sock *sk = (struct sock *)data; - unsigned long next_tmout = 0; struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + unsigned long t_nfb = USEC_PER_SEC / 5; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ /* XXX: set some sensible MIB */ - sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, - jiffies + HZ / 5); - goto out; + goto restart_timer; } - ccid3_pr_debug("%s, sk=%p, state=%s\n", dccp_role(sk), sk, + ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, ccid3_tx_state_name(hctx->ccid3hctx_state)); - + switch (hctx->ccid3hctx_state) { - case TFRC_SSTATE_TERM: - goto out; case TFRC_SSTATE_NO_FBACK: - /* Halve send rate */ - hctx->ccid3hctx_x /= 2; - if (hctx->ccid3hctx_x < (hctx->ccid3hctx_s / - TFRC_MAX_BACK_OFF_TIME)) - hctx->ccid3hctx_x = (hctx->ccid3hctx_s / - TFRC_MAX_BACK_OFF_TIME); - - ccid3_pr_debug("%s, sk=%p, state=%s, updated tx rate to %d " - "bytes/s\n", - dccp_role(sk), sk, + /* RFC 3448, 4.4: Halve send rate directly */ + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2, + (((__u64)hctx->ccid3hctx_s) << 6) / + TFRC_T_MBI); + + ccid3_pr_debug("%s(%p, state=%s), updated tx rate to %u " + "bytes/s\n", dccp_role(sk), sk, ccid3_tx_state_name(hctx->ccid3hctx_state), - hctx->ccid3hctx_x); - next_tmout = max_t(u32, 2 * usecs_div(hctx->ccid3hctx_s, - hctx->ccid3hctx_x), - TFRC_INITIAL_TIMEOUT); - /* - * FIXME - not sure above calculation is correct. See section - * 5 of CCID3 11 should adjust tx_t_ipi and double that to - * achieve it really - */ + (unsigned)(hctx->ccid3hctx_x >> 6)); + /* The value of R is still undefined and so we can not recompute + * the timout value. Keep initial value as per [RFC 4342, 5]. */ + t_nfb = TFRC_INITIAL_TIMEOUT; + ccid3_update_send_time(hctx); break; case TFRC_SSTATE_FBACK: /* * Check if IDLE since last timeout and recv rate is less than - * 4 packets per RTT + * 4 packets (in units of 64*bytes/sec) per RTT */ if (!hctx->ccid3hctx_idle || - (hctx->ccid3hctx_x_recv >= - 4 * usecs_div(hctx->ccid3hctx_s, hctx->ccid3hctx_rtt))) { - ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n", + (hctx->ccid3hctx_x_recv >= 4 * + scaled_div(((__u64)hctx->ccid3hctx_s) << 6, + hctx->ccid3hctx_rtt))) { + struct timeval now; + + ccid3_pr_debug("%s(%p, state=%s), not idle\n", dccp_role(sk), sk, - ccid3_tx_state_name(hctx->ccid3hctx_state)); - /* Halve sending rate */ + ccid3_tx_state_name(hctx->ccid3hctx_state)); - /* If (X_calc > 2 * X_recv) + /* + * Modify the cached value of X_recv [RFC 3448, 4.4] + * + * If (p == 0 || X_calc > 2 * X_recv) * X_recv = max(X_recv / 2, s / (2 * t_mbi)); * Else * X_recv = X_calc / 4; + * + * Note that X_recv is scaled by 2^6 while X_calc is not */ - BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P && - hctx->ccid3hctx_x_calc == 0); - - /* check also if p is zero -> x_calc is infinity? */ - if (hctx->ccid3hctx_p < TFRC_SMALLEST_P || - hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv) - hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2, - hctx->ccid3hctx_s / (2 * TFRC_MAX_BACK_OFF_TIME)); - else - hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc / 4; - - /* Update sending rate */ - ccid3_hc_tx_update_x(sk); + BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); + + if (hctx->ccid3hctx_p == 0 || + (hctx->ccid3hctx_x_calc > + (hctx->ccid3hctx_x_recv >> 5))) { + + hctx->ccid3hctx_x_recv = + max(hctx->ccid3hctx_x_recv / 2, + (((__u64)hctx->ccid3hctx_s) << 6) / + (2 * TFRC_T_MBI)); + + if (hctx->ccid3hctx_p == 0) + dccp_timestamp(sk, &now); + } else { + hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; + hctx->ccid3hctx_x_recv <<= 4; + } + /* Now recalculate X [RFC 3448, 4.3, step (4)] */ + ccid3_hc_tx_update_x(sk, &now); } /* * Schedule no feedback timer to expire in - * max(4 * R, 2 * s / X) + * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) + * See comments in packet_recv() regarding the value of t_RTO. */ - next_tmout = max_t(u32, hctx->ccid3hctx_t_rto, - 2 * usecs_div(hctx->ccid3hctx_s, - hctx->ccid3hctx_x)); + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); break; - default: - printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", - __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); - dump_stack(); + case TFRC_SSTATE_NO_SENT: + DCCP_BUG("%s(%p) - Illegal state NO_SENT", dccp_role(sk), sk); + /* fall through */ + case TFRC_SSTATE_TERM: goto out; } - sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, - jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout))); hctx->ccid3hctx_idle = 1; + +restart_timer: + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + jiffies + usecs_to_jiffies(t_nfb)); out: bh_unlock_sock(sk); sock_put(sk); } -static int ccid3_hc_tx_send_packet(struct sock *sk, - struct sk_buff *skb, int len) +/* + * returns + * > 0: delay (in msecs) that should pass before actually sending + * = 0: can send immediately + * < 0: error condition; do not send packet + */ +static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct dccp_tx_hist_entry *new_packet; struct timeval now; - long delay; - int rc = -ENOTCONN; + suseconds_t delay; - BUG_ON(hctx == NULL || hctx->ccid3hctx_state == TFRC_SSTATE_TERM); + BUG_ON(hctx == NULL); - /* Check if pure ACK or Terminating*/ /* - * XXX: We only call this function for DATA and DATAACK, on, these - * packets can have zero length, but why the comment about "pure ACK"? + * This function is called only for Data and DataAck packets. Sending + * zero-sized Data(Ack)s is theoretically possible, but for congestion + * control this case is pathological - ignore it. */ - if (unlikely(len == 0)) - goto out; - - /* See if last packet allocated was not sent */ - new_packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist); - if (new_packet == NULL || new_packet->dccphtx_sent) { - new_packet = dccp_tx_hist_entry_new(ccid3_tx_hist, - SLAB_ATOMIC); - - rc = -ENOBUFS; - if (unlikely(new_packet == NULL)) { - LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, not enough " - "mem to add to history, send refused\n", - __FUNCTION__, dccp_role(sk), sk); - goto out; - } - - dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, new_packet); - } + if (unlikely(skb->len == 0)) + return -EBADMSG; dccp_timestamp(sk, &now); switch (hctx->ccid3hctx_state) { case TFRC_SSTATE_NO_SENT: - hctx->ccid3hctx_no_feedback_timer.function = ccid3_hc_tx_no_feedback_timer; - hctx->ccid3hctx_no_feedback_timer.data = (unsigned long)sk; sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, - jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)); + (jiffies + + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); hctx->ccid3hctx_last_win_count = 0; hctx->ccid3hctx_t_last_win_count = now; ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - hctx->ccid3hctx_t_ipi = TFRC_INITIAL_IPI; - /* Set nominal send time for initial packet */ + /* Set initial sending rate X/s to 1pps (X is scaled by 2^6) */ + ccid3_hc_tx_update_s(hctx, skb->len); + hctx->ccid3hctx_x = hctx->ccid3hctx_s; + hctx->ccid3hctx_x <<= 6; + + /* First timeout, according to [RFC 3448, 4.2], is 1 second */ + hctx->ccid3hctx_t_ipi = USEC_PER_SEC; + /* Initial delta: minimum of 0.5 sec and t_gran/2 */ + hctx->ccid3hctx_delta = TFRC_OPSYS_HALF_TIME_GRAN; + + /* Set t_0 for initial packet */ hctx->ccid3hctx_t_nom = now; - timeval_add_usecs(&hctx->ccid3hctx_t_nom, - hctx->ccid3hctx_t_ipi); - ccid3_calc_new_delta(hctx); - rc = 0; break; case TFRC_SSTATE_NO_FBACK: case TFRC_SSTATE_FBACK: - delay = (timeval_delta(&now, &hctx->ccid3hctx_t_nom) - - hctx->ccid3hctx_delta); - delay /= -1000; - /* divide by -1000 is to convert to ms and get sign right */ - rc = delay > 0 ? delay : 0; - break; - default: - printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", - __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); - dump_stack(); - rc = -EINVAL; + delay = timeval_delta(&hctx->ccid3hctx_t_nom, &now); + /* + * Scheduling of packet transmissions [RFC 3448, 4.6] + * + * if (t_now > t_nom - delta) + * // send the packet now + * else + * // send the packet in (t_nom - t_now) milliseconds. + */ + if (delay - (suseconds_t)hctx->ccid3hctx_delta >= 0) + return delay / 1000L; + + ccid3_hc_tx_update_win_count(hctx, &now); break; + case TFRC_SSTATE_TERM: + DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); + return -EINVAL; } - /* Can we send? if so add options and add to packet history */ - if (rc == 0) { - dp->dccps_hc_tx_insert_options = 1; - new_packet->dccphtx_ccval = - DCCP_SKB_CB(skb)->dccpd_ccval = - hctx->ccid3hctx_last_win_count; - } -out: - return rc; + /* prepare to send now (add options etc.) */ + dp->dccps_hc_tx_insert_options = 1; + DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; + + /* set the nominal send time for the next following packet */ + timeval_add_usecs(&hctx->ccid3hctx_t_nom, hctx->ccid3hctx_t_ipi); + + return 0; } -static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) +static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, + unsigned int len) { - const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); struct timeval now; + struct dccp_tx_hist_entry *packet; - BUG_ON(hctx == NULL || hctx->ccid3hctx_state == TFRC_SSTATE_TERM); - - dccp_timestamp(sk, &now); - - /* check if we have sent a data packet */ - if (len > 0) { - unsigned long quarter_rtt; - struct dccp_tx_hist_entry *packet; - - packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist); - if (unlikely(packet == NULL)) { - LIMIT_NETDEBUG(KERN_WARNING "%s: packet doesn't " - "exists in history!\n", __FUNCTION__); - return; - } - if (unlikely(packet->dccphtx_sent)) { - LIMIT_NETDEBUG(KERN_WARNING "%s: no unsent packet in " - "history!\n", __FUNCTION__); - return; - } - packet->dccphtx_tstamp = now; - packet->dccphtx_seqno = dp->dccps_gss; - /* - * Check if win_count have changed - * Algorithm in "8.1. Window Counter Valuer" in - * draft-ietf-dccp-ccid3-11.txt - */ - quarter_rtt = timeval_delta(&now, &hctx->ccid3hctx_t_last_win_count); - if (likely(hctx->ccid3hctx_rtt > 8)) - quarter_rtt /= hctx->ccid3hctx_rtt / 4; - - if (quarter_rtt > 0) { - hctx->ccid3hctx_t_last_win_count = now; - hctx->ccid3hctx_last_win_count = (hctx->ccid3hctx_last_win_count + - min_t(unsigned long, quarter_rtt, 5)) % 16; - ccid3_pr_debug("%s, sk=%p, window changed from " - "%u to %u!\n", - dccp_role(sk), sk, - packet->dccphtx_ccval, - hctx->ccid3hctx_last_win_count); - } + BUG_ON(hctx == NULL); - hctx->ccid3hctx_idle = 0; - packet->dccphtx_rtt = hctx->ccid3hctx_rtt; - packet->dccphtx_sent = 1; - } else - ccid3_pr_debug("%s, sk=%p, seqno=%llu NOT inserted!\n", - dccp_role(sk), sk, dp->dccps_gss); + ccid3_hc_tx_update_s(hctx, len); - switch (hctx->ccid3hctx_state) { - case TFRC_SSTATE_NO_SENT: - /* if first wasn't pure ack */ - if (len != 0) - printk(KERN_CRIT "%s: %s, First packet sent is noted " - "as a data packet\n", - __FUNCTION__, dccp_role(sk)); + packet = dccp_tx_hist_entry_new(ccid3_tx_hist, GFP_ATOMIC); + if (unlikely(packet == NULL)) { + DCCP_CRIT("packet history - out of memory!"); return; - case TFRC_SSTATE_NO_FBACK: - case TFRC_SSTATE_FBACK: - if (len > 0) { - hctx->ccid3hctx_t_nom = now; - ccid3_calc_new_t_ipi(hctx); - ccid3_calc_new_delta(hctx); - timeval_add_usecs(&hctx->ccid3hctx_t_nom, - hctx->ccid3hctx_t_ipi); - } - break; - default: - printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", - __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); - dump_stack(); - break; } + dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, packet); + + dccp_timestamp(sk, &now); + packet->dccphtx_tstamp = now; + packet->dccphtx_seqno = dccp_sk(sk)->dccps_gss; + packet->dccphtx_rtt = hctx->ccid3hctx_rtt; + packet->dccphtx_sent = 1; + hctx->ccid3hctx_idle = 0; } static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) @@ -447,13 +401,11 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) struct ccid3_options_received *opt_recv; struct dccp_tx_hist_entry *packet; struct timeval now; - unsigned long next_tmout; - u32 t_elapsed; + unsigned long t_nfb; u32 pinv; - u32 x_recv; - u32 r_sample; + suseconds_t r_sample, t_elapsed; - BUG_ON(hctx == NULL || hctx->ccid3hctx_state == TFRC_SSTATE_TERM); + BUG_ON(hctx == NULL); /* we are only interested in ACKs */ if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || @@ -462,41 +414,49 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) opt_recv = &hctx->ccid3hctx_options_received; - t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10; - x_recv = opt_recv->ccid3or_receive_rate; - pinv = opt_recv->ccid3or_loss_event_rate; - switch (hctx->ccid3hctx_state) { - case TFRC_SSTATE_NO_SENT: - /* FIXME: what to do here? */ - return; case TFRC_SSTATE_NO_FBACK: case TFRC_SSTATE_FBACK: - /* Calculate new round trip sample by - * R_sample = (now - t_recvdata) - t_delay */ - /* get t_recvdata from history */ + /* get packet from history to look up t_recvdata */ packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist, - DCCP_SKB_CB(skb)->dccpd_ack_seq); + DCCP_SKB_CB(skb)->dccpd_ack_seq); if (unlikely(packet == NULL)) { - LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, seqno " - "%llu(%s) does't exist in history!\n", - __FUNCTION__, dccp_role(sk), sk, + DCCP_WARN("%s(%p), seqno %llu(%s) doesn't exist " + "in history!\n", dccp_role(sk), sk, (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); return; } - /* Update RTT */ + /* Update receive rate in units of 64 * bytes/second */ + hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; + hctx->ccid3hctx_x_recv <<= 6; + + /* Update loss event rate */ + pinv = opt_recv->ccid3or_loss_event_rate; + if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ + hctx->ccid3hctx_p = 0; + else /* can not exceed 100% */ + hctx->ccid3hctx_p = 1000000 / pinv; + dccp_timestamp(sk, &now); - r_sample = timeval_delta(&now, &packet->dccphtx_tstamp); + + /* + * Calculate new round trip sample as per [RFC 3448, 4.3] by + * R_sample = (now - t_recvdata) - t_elapsed + */ + r_sample = timeval_delta(&now, &packet->dccphtx_tstamp); + t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10; + + DCCP_BUG_ON(r_sample < 0); if (unlikely(r_sample <= t_elapsed)) - LIMIT_NETDEBUG(KERN_WARNING "%s: r_sample=%uus, " - "t_elapsed=%uus\n", - __FUNCTION__, r_sample, t_elapsed); + DCCP_WARN("WARNING: r_sample=%dus <= t_elapsed=%dus\n", + (int)r_sample, (int)t_elapsed); else r_sample -= t_elapsed; + CCID3_RTT_SANITY_CHECK(r_sample); - /* Update RTT estimate by + /* Update RTT estimate by * If (No feedback recv) * R = R_sample; * Else @@ -505,98 +465,96 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * q is a constant, RFC 3448 recomments 0.9 */ if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); - hctx->ccid3hctx_rtt = r_sample; - } else - hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 + - r_sample / 10; - - ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, " - "r_sample=%us\n", dccp_role(sk), sk, - hctx->ccid3hctx_rtt, r_sample); - - /* Update timeout interval */ - hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, - USEC_PER_SEC); + /* + * Larger Initial Windows [RFC 4342, sec. 5] + * We deviate in that we use `s' instead of `MSS'. + */ + __u64 w_init = min(4 * hctx->ccid3hctx_s, + max(2 * hctx->ccid3hctx_s, 4380)); + hctx->ccid3hctx_rtt = r_sample; + hctx->ccid3hctx_x = scaled_div(w_init << 6, r_sample); + hctx->ccid3hctx_t_ld = now; - /* Update receive rate */ - hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */ + ccid3_update_send_time(hctx); - /* Update loss event rate */ - if (pinv == ~0 || pinv == 0) - hctx->ccid3hctx_p = 0; - else { - hctx->ccid3hctx_p = 1000000 / pinv; + ccid3_pr_debug("%s(%p), s=%u, w_init=%llu, " + "R_sample=%dus, X=%u\n", dccp_role(sk), + sk, hctx->ccid3hctx_s, w_init, + (int)r_sample, + (unsigned)(hctx->ccid3hctx_x >> 6)); - if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) { - hctx->ccid3hctx_p = TFRC_SMALLEST_P; - ccid3_pr_debug("%s, sk=%p, Smallest p used!\n", - dccp_role(sk), sk); - } + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); + } else { + hctx->ccid3hctx_rtt = (9 * hctx->ccid3hctx_rtt + + (u32)r_sample) / 10; + + /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ + if (hctx->ccid3hctx_p > 0) + hctx->ccid3hctx_x_calc = + tfrc_calc_x(hctx->ccid3hctx_s, + hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p); + ccid3_hc_tx_update_x(sk, &now); + + ccid3_pr_debug("%s(%p), RTT=%uus (sample=%dus), s=%u, " + "p=%u, X_calc=%u, X_recv=%u, X=%u\n", + dccp_role(sk), + sk, hctx->ccid3hctx_rtt, (int)r_sample, + hctx->ccid3hctx_s, hctx->ccid3hctx_p, + hctx->ccid3hctx_x_calc, + (unsigned)(hctx->ccid3hctx_x_recv >> 6), + (unsigned)(hctx->ccid3hctx_x >> 6)); } /* unschedule no feedback timer */ sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); - /* Update sending rate */ - ccid3_hc_tx_update_x(sk); - - /* Update next send time */ - timeval_sub_usecs(&hctx->ccid3hctx_t_nom, - hctx->ccid3hctx_t_ipi); - ccid3_calc_new_t_ipi(hctx); - timeval_add_usecs(&hctx->ccid3hctx_t_nom, - hctx->ccid3hctx_t_ipi); - ccid3_calc_new_delta(hctx); - /* remove all packets older than the one acked from history */ dccp_tx_hist_purge_older(ccid3_tx_hist, &hctx->ccid3hctx_hist, packet); /* - * As we have calculated new ipi, delta, t_nom it is possible that - * we now can send a packet, so wake up dccp_wait_for_ccids. + * As we have calculated new ipi, delta, t_nom it is possible + * that we now can send a packet, so wake up dccp_wait_for_ccid */ sk->sk_write_space(sk); + /* + * Update timeout interval for the nofeedback timer. + * We use a configuration option to increase the lower bound. + * This can help avoid triggering the nofeedback timer too + * often ('spinning') on LANs with small RTTs. + */ + hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, + CONFIG_IP_DCCP_CCID3_RTO * + (USEC_PER_SEC/1000)); /* * Schedule no feedback timer to expire in - * max(4 * R, 2 * s / X) + * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) */ - next_tmout = max(hctx->ccid3hctx_t_rto, - 2 * usecs_div(hctx->ccid3hctx_s, - hctx->ccid3hctx_x)); - - ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to " + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); + + ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " "expire in %lu jiffies (%luus)\n", - dccp_role(sk), sk, - usecs_to_jiffies(next_tmout), next_tmout); + dccp_role(sk), + sk, usecs_to_jiffies(t_nfb), t_nfb); - sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, - jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout))); + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + jiffies + usecs_to_jiffies(t_nfb)); /* set idle flag */ - hctx->ccid3hctx_idle = 1; + hctx->ccid3hctx_idle = 1; break; - default: - printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", - __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); - dump_stack(); + case TFRC_SSTATE_NO_SENT: + /* + * XXX when implementing bidirectional rx/tx check this again + */ + DCCP_WARN("Illegal ACK received - no packet sent\n"); + /* fall through */ + case TFRC_SSTATE_TERM: /* ignore feedback when closing */ break; } } -static void ccid3_hc_tx_insert_options(struct sock *sk, struct sk_buff *skb) -{ - const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - - BUG_ON(hctx == NULL); - - if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) - return; - - DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; -} - static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, unsigned char len, u16 idx, unsigned char *value) @@ -621,13 +579,14 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, switch (option) { case TFRC_OPT_LOSS_EVENT_RATE: if (unlikely(len != 4)) { - LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, invalid " - "len for TFRC_OPT_LOSS_EVENT_RATE\n", - __FUNCTION__, dccp_role(sk), sk); + DCCP_WARN("%s(%p), invalid len %d " + "for TFRC_OPT_LOSS_EVENT_RATE\n", + dccp_role(sk), sk, len); rc = -EINVAL; } else { - opt_recv->ccid3or_loss_event_rate = ntohl(*(u32 *)value); - ccid3_pr_debug("%s, sk=%p, LOSS_EVENT_RATE=%u\n", + opt_recv->ccid3or_loss_event_rate = + ntohl(*(__be32 *)value); + ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", dccp_role(sk), sk, opt_recv->ccid3or_loss_event_rate); } @@ -635,20 +594,21 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, case TFRC_OPT_LOSS_INTERVALS: opt_recv->ccid3or_loss_intervals_idx = idx; opt_recv->ccid3or_loss_intervals_len = len; - ccid3_pr_debug("%s, sk=%p, LOSS_INTERVALS=(%u, %u)\n", + ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", dccp_role(sk), sk, opt_recv->ccid3or_loss_intervals_idx, opt_recv->ccid3or_loss_intervals_len); break; case TFRC_OPT_RECEIVE_RATE: if (unlikely(len != 4)) { - LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, invalid " - "len for TFRC_OPT_RECEIVE_RATE\n", - __FUNCTION__, dccp_role(sk), sk); + DCCP_WARN("%s(%p), invalid len %d " + "for TFRC_OPT_RECEIVE_RATE\n", + dccp_role(sk), sk, len); rc = -EINVAL; } else { - opt_recv->ccid3or_receive_rate = ntohl(*(u32 *)value); - ccid3_pr_debug("%s, sk=%p, RECEIVE_RATE=%u\n", + opt_recv->ccid3or_receive_rate = + ntohl(*(__be32 *)value); + ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", dccp_role(sk), sk, opt_recv->ccid3or_receive_rate); } @@ -658,29 +618,18 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, return rc; } -static int ccid3_hc_tx_init(struct sock *sk) +static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) { - struct dccp_sock *dp = dccp_sk(sk); - struct ccid3_hc_tx_sock *hctx; - - dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx), gfp_any()); - if (dp->dccps_hc_tx_ccid_private == NULL) - return -ENOMEM; - - hctx = ccid3_hc_tx_sk(sk); - memset(hctx, 0, sizeof(*hctx)); - - if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE && - dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE) - hctx->ccid3hctx_s = dp->dccps_packet_size; - else - hctx->ccid3hctx_s = TFRC_STD_PACKET_SIZE; + struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); - /* Set transmission rate to 1 packet per second */ - hctx->ccid3hctx_x = hctx->ccid3hctx_s; - hctx->ccid3hctx_t_rto = USEC_PER_SEC; + hctx->ccid3hctx_s = 0; + hctx->ccid3hctx_rtt = 0; hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; INIT_LIST_HEAD(&hctx->ccid3hctx_hist); + + hctx->ccid3hctx_no_feedback_timer.function = + ccid3_hc_tx_no_feedback_timer; + hctx->ccid3hctx_no_feedback_timer.data = (unsigned long)sk; init_timer(&hctx->ccid3hctx_no_feedback_timer); return 0; @@ -688,7 +637,6 @@ static int ccid3_hc_tx_init(struct sock *sk) static void ccid3_hc_tx_exit(struct sock *sk) { - struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); BUG_ON(hctx == NULL); @@ -698,23 +646,13 @@ static void ccid3_hc_tx_exit(struct sock *sk) /* Empty packet history */ dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist); - - kfree(dp->dccps_hc_tx_ccid_private); - dp->dccps_hc_tx_ccid_private = NULL; } /* * RX Half Connection methods */ -/* TFRC receiver states */ -enum ccid3_hc_rx_states { - TFRC_RSTATE_NO_DATA = 1, - TFRC_RSTATE_DATA, - TFRC_RSTATE_TERM = 127, -}; - -#ifdef CCID3_DEBUG +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) { static char *ccid3_rx_state_names[] = { @@ -727,8 +665,8 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) } #endif -static inline void ccid3_hc_rx_set_state(struct sock *sk, - enum ccid3_hc_rx_states state) +static void ccid3_hc_rx_set_state(struct sock *sk, + enum ccid3_hc_rx_states state) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; @@ -740,14 +678,24 @@ static inline void ccid3_hc_rx_set_state(struct sock *sk, hcrx->ccid3hcrx_state = state; } +static inline void ccid3_hc_rx_update_s(struct ccid3_hc_rx_sock *hcrx, int len) +{ + if (unlikely(len == 0)) /* don't update on empty packets (e.g. ACKs) */ + ccid3_pr_debug("Packet payload length is 0 - not updating\n"); + else + hcrx->ccid3hcrx_s = hcrx->ccid3hcrx_s == 0 ? len : + (9 * hcrx->ccid3hcrx_s + len) / 10; +} + static void ccid3_hc_rx_send_feedback(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); struct dccp_sock *dp = dccp_sk(sk); struct dccp_rx_hist_entry *packet; struct timeval now; + suseconds_t delta; - ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + ccid3_pr_debug("%s(%p) - entry \n", dccp_role(sk), sk); dccp_timestamp(sk, &now); @@ -755,69 +703,75 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk) case TFRC_RSTATE_NO_DATA: hcrx->ccid3hcrx_x_recv = 0; break; - case TFRC_RSTATE_DATA: { - const u32 delta = timeval_delta(&now, - &hcrx->ccid3hcrx_tstamp_last_feedback); - hcrx->ccid3hcrx_x_recv = usecs_div(hcrx->ccid3hcrx_bytes_recv, - delta); - } + case TFRC_RSTATE_DATA: + delta = timeval_delta(&now, + &hcrx->ccid3hcrx_tstamp_last_feedback); + DCCP_BUG_ON(delta < 0); + hcrx->ccid3hcrx_x_recv = + scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); break; - default: - printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", - __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state); - dump_stack(); + case TFRC_RSTATE_TERM: + DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); return; } packet = dccp_rx_hist_find_data_packet(&hcrx->ccid3hcrx_hist); if (unlikely(packet == NULL)) { - LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, no data packet " - "in history!\n", - __FUNCTION__, dccp_role(sk), sk); + DCCP_WARN("%s(%p), no data packet in history!\n", + dccp_role(sk), sk); return; } hcrx->ccid3hcrx_tstamp_last_feedback = now; - hcrx->ccid3hcrx_last_counter = packet->dccphrx_ccval; - hcrx->ccid3hcrx_seqno_last_counter = packet->dccphrx_seqno; + hcrx->ccid3hcrx_ccval_last_counter = packet->dccphrx_ccval; hcrx->ccid3hcrx_bytes_recv = 0; - /* Convert to multiples of 10us */ - hcrx->ccid3hcrx_elapsed_time = - timeval_delta(&now, &packet->dccphrx_tstamp) / 10; + /* Elapsed time information [RFC 4340, 13.2] in units of 10 * usecs */ + delta = timeval_delta(&now, &packet->dccphrx_tstamp); + DCCP_BUG_ON(delta < 0); + hcrx->ccid3hcrx_elapsed_time = delta / 10; + if (hcrx->ccid3hcrx_p == 0) - hcrx->ccid3hcrx_pinv = ~0; - else + hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ + else if (hcrx->ccid3hcrx_p > 1000000) { + DCCP_WARN("p (%u) > 100%%\n", hcrx->ccid3hcrx_p); + hcrx->ccid3hcrx_pinv = 1; /* use 100% in this case */ + } else hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p; + dp->dccps_hc_rx_insert_options = 1; dccp_send_ack(sk); } -static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) +static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) { const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - u32 x_recv, pinv; + __be32 x_recv, pinv; BUG_ON(hcrx == NULL); if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) - return; + return 0; - DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter; + DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_ccval_last_counter; if (dccp_packet_without_ack(skb)) - return; - - if (hcrx->ccid3hcrx_elapsed_time != 0) - dccp_insert_option_elapsed_time(sk, skb, - hcrx->ccid3hcrx_elapsed_time); - dccp_insert_option_timestamp(sk, skb); + return 0; + x_recv = htonl(hcrx->ccid3hcrx_x_recv); pinv = htonl(hcrx->ccid3hcrx_pinv); - dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, - &pinv, sizeof(pinv)); - dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE, - &x_recv, sizeof(x_recv)); + + if ((hcrx->ccid3hcrx_elapsed_time != 0 && + dccp_insert_option_elapsed_time(sk, skb, + hcrx->ccid3hcrx_elapsed_time)) || + dccp_insert_option_timestamp(sk, skb) || + dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, + &pinv, sizeof(pinv)) || + dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE, + &x_recv, sizeof(x_recv))) + return -1; + + return 0; } /* calculate first loss interval @@ -828,12 +782,13 @@ static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); struct dccp_rx_hist_entry *entry, *next, *tail = NULL; - u32 rtt, delta, x_recv, fval, p, tmp2; + u32 x_recv, p; + suseconds_t rtt, delta; struct timeval tstamp = { 0, }; int interval = 0; int win_count = 0; int step = 0; - u64 tmp1; + u64 fval; list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, dccphrx_node) { @@ -858,72 +813,166 @@ static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) } if (unlikely(step == 0)) { - LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, packet history " - "contains no data packets!\n", - __FUNCTION__, dccp_role(sk), sk); + DCCP_WARN("%s(%p), packet history has no data packets!\n", + dccp_role(sk), sk); return ~0; } if (unlikely(interval == 0)) { - LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, Could not find a " - "win_count interval > 0. Defaulting to 1\n", - __FUNCTION__, dccp_role(sk), sk); + DCCP_WARN("%s(%p), Could not find a win_count interval > 0." + "Defaulting to 1\n", dccp_role(sk), sk); interval = 1; } found: - rtt = timeval_delta(&tstamp, &tail->dccphrx_tstamp) * 4 / interval; - ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n", - dccp_role(sk), sk, rtt); - if (rtt == 0) - rtt = 1; + if (!tail) { + DCCP_CRIT("tail is null\n"); + return ~0; + } + + delta = timeval_delta(&tstamp, &tail->dccphrx_tstamp); + DCCP_BUG_ON(delta < 0); + + rtt = delta * 4 / interval; + ccid3_pr_debug("%s(%p), approximated RTT to %dus\n", + dccp_role(sk), sk, (int)rtt); + + /* + * Determine the length of the first loss interval via inverse lookup. + * Assume that X_recv can be computed by the throughput equation + * s + * X_recv = -------- + * R * fval + * Find some p such that f(p) = fval; return 1/p [RFC 3448, 6.3.1]. + */ + if (rtt == 0) { /* would result in divide-by-zero */ + DCCP_WARN("RTT==0\n"); + return ~0; + } dccp_timestamp(sk, &tstamp); delta = timeval_delta(&tstamp, &hcrx->ccid3hcrx_tstamp_last_feedback); - x_recv = usecs_div(hcrx->ccid3hcrx_bytes_recv, delta); + DCCP_BUG_ON(delta <= 0); + + x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); + if (x_recv == 0) { /* would also trigger divide-by-zero */ + DCCP_WARN("X_recv==0\n"); + if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) { + DCCP_BUG("stored value of X_recv is zero"); + return ~0; + } + } - tmp1 = (u64)x_recv * (u64)rtt; - do_div(tmp1,10000000); - tmp2 = (u32)tmp1; - fval = (hcrx->ccid3hcrx_s * 100000) / tmp2; - /* do not alter order above or you will get overflow on 32 bit */ + fval = scaled_div(hcrx->ccid3hcrx_s, rtt); + fval = scaled_div32(fval, x_recv); p = tfrc_calc_x_reverse_lookup(fval); - ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied " + + ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); if (p == 0) return ~0; else - return 1000000 / p; + return 1000000 / p; } static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + struct dccp_li_hist_entry *head; + u64 seq_temp; - if (seq_loss != DCCP_MAX_SEQNO + 1 && - list_empty(&hcrx->ccid3hcrx_li_hist)) { - struct dccp_li_hist_entry *li_tail; + if (list_empty(&hcrx->ccid3hcrx_li_hist)) { + if (!dccp_li_hist_interval_new(ccid3_li_hist, + &hcrx->ccid3hcrx_li_hist, seq_loss, win_loss)) + return; - li_tail = dccp_li_hist_interval_new(ccid3_li_hist, - &hcrx->ccid3hcrx_li_hist, - seq_loss, win_loss); - if (li_tail == NULL) + head = list_entry(hcrx->ccid3hcrx_li_hist.next, + struct dccp_li_hist_entry, dccplih_node); + head->dccplih_interval = ccid3_hc_rx_calc_first_li(sk); + } else { + struct dccp_li_hist_entry *entry; + struct list_head *tail; + + head = list_entry(hcrx->ccid3hcrx_li_hist.next, + struct dccp_li_hist_entry, dccplih_node); + /* FIXME win count check removed as was wrong */ + /* should make this check with receive history */ + /* and compare there as per section 10.2 of RFC4342 */ + + /* new loss event detected */ + /* calculate last interval length */ + seq_temp = dccp_delta_seqno(head->dccplih_seqno, seq_loss); + entry = dccp_li_hist_entry_new(ccid3_li_hist, GFP_ATOMIC); + + if (entry == NULL) { + DCCP_BUG("out of memory - can not allocate entry"); return; - li_tail->dccplih_interval = ccid3_hc_rx_calc_first_li(sk); - } else - LIMIT_NETDEBUG(KERN_WARNING "%s: FIXME: find end of " - "interval\n", __FUNCTION__); + } + + list_add(&entry->dccplih_node, &hcrx->ccid3hcrx_li_hist); + + tail = hcrx->ccid3hcrx_li_hist.prev; + list_del(tail); + kmem_cache_free(ccid3_li_hist->dccplih_slab, tail); + + /* Create the newest interval */ + entry->dccplih_seqno = seq_loss; + entry->dccplih_interval = seq_temp; + entry->dccplih_win_count = win_loss; + } } -static void ccid3_hc_rx_detect_loss(struct sock *sk) +static int ccid3_hc_rx_detect_loss(struct sock *sk, + struct dccp_rx_hist_entry *packet) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - u8 win_loss; - const u64 seq_loss = dccp_rx_hist_detect_loss(&hcrx->ccid3hcrx_hist, - &hcrx->ccid3hcrx_li_hist, - &win_loss); + struct dccp_rx_hist_entry *rx_hist = + dccp_rx_hist_head(&hcrx->ccid3hcrx_hist); + u64 seqno = packet->dccphrx_seqno; + u64 tmp_seqno; + int loss = 0; + u8 ccval; + + + tmp_seqno = hcrx->ccid3hcrx_seqno_nonloss; + + if (!rx_hist || + follows48(packet->dccphrx_seqno, hcrx->ccid3hcrx_seqno_nonloss)) { + hcrx->ccid3hcrx_seqno_nonloss = seqno; + hcrx->ccid3hcrx_ccval_nonloss = packet->dccphrx_ccval; + goto detect_out; + } + + + while (dccp_delta_seqno(hcrx->ccid3hcrx_seqno_nonloss, seqno) + > TFRC_RECV_NUM_LATE_LOSS) { + loss = 1; + ccid3_hc_rx_update_li(sk, hcrx->ccid3hcrx_seqno_nonloss, + hcrx->ccid3hcrx_ccval_nonloss); + tmp_seqno = hcrx->ccid3hcrx_seqno_nonloss; + dccp_inc_seqno(&tmp_seqno); + hcrx->ccid3hcrx_seqno_nonloss = tmp_seqno; + dccp_inc_seqno(&tmp_seqno); + while (dccp_rx_hist_find_entry(&hcrx->ccid3hcrx_hist, + tmp_seqno, &ccval)) { + hcrx->ccid3hcrx_seqno_nonloss = tmp_seqno; + hcrx->ccid3hcrx_ccval_nonloss = ccval; + dccp_inc_seqno(&tmp_seqno); + } + } + + /* FIXME - this code could be simplified with above while */ + /* but works at moment */ + if (follows48(packet->dccphrx_seqno, hcrx->ccid3hcrx_seqno_nonloss)) { + hcrx->ccid3hcrx_seqno_nonloss = seqno; + hcrx->ccid3hcrx_ccval_nonloss = packet->dccphrx_ccval; + } - ccid3_hc_rx_update_li(sk, seq_loss, win_loss); +detect_out: + dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist, + &hcrx->ccid3hcrx_li_hist, packet, + hcrx->ccid3hcrx_seqno_nonloss); + return loss; } static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) @@ -932,13 +981,11 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) const struct dccp_options_received *opt_recv; struct dccp_rx_hist_entry *packet; struct timeval now; - u8 win_count; - u32 p_prev, r_sample, t_elapsed; - int ins; + u32 p_prev, rtt_prev; + suseconds_t r_sample, t_elapsed; + int loss, payload_size; - BUG_ON(hcrx == NULL || - !(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA || - hcrx->ccid3hcrx_state == TFRC_RSTATE_DATA)); + BUG_ON(hcrx == NULL); opt_recv = &dccp_sk(sk)->dccps_options_received; @@ -949,18 +996,19 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) case DCCP_PKT_DATAACK: if (opt_recv->dccpor_timestamp_echo == 0) break; - p_prev = hcrx->ccid3hcrx_rtt; + rtt_prev = hcrx->ccid3hcrx_rtt; dccp_timestamp(sk, &now); timeval_sub_usecs(&now, opt_recv->dccpor_timestamp_echo * 10); r_sample = timeval_usecs(&now); t_elapsed = opt_recv->dccpor_elapsed_time * 10; + DCCP_BUG_ON(r_sample < 0); if (unlikely(r_sample <= t_elapsed)) - LIMIT_NETDEBUG(KERN_WARNING "%s: r_sample=%uus, " - "t_elapsed=%uus\n", - __FUNCTION__, r_sample, t_elapsed); + DCCP_WARN("r_sample=%ldus, t_elapsed=%ldus\n", + r_sample, t_elapsed); else r_sample -= t_elapsed; + CCID3_RTT_SANITY_CHECK(r_sample); if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA) hcrx->ccid3hcrx_rtt = r_sample; @@ -968,9 +1016,9 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) hcrx->ccid3hcrx_rtt = (hcrx->ccid3hcrx_rtt * 9) / 10 + r_sample / 10; - if (p_prev != hcrx->ccid3hcrx_rtt) - ccid3_pr_debug("%s, New RTT=%luus, elapsed time=%u\n", - dccp_role(sk), hcrx->ccid3hcrx_rtt, + if (rtt_prev != hcrx->ccid3hcrx_rtt) + ccid3_pr_debug("%s(%p), New RTT=%uus, elapsed time=%u\n", + dccp_role(sk), sk, hcrx->ccid3hcrx_rtt, opt_recv->dccpor_elapsed_time); break; case DCCP_PKT_DATA: @@ -980,56 +1028,50 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) } packet = dccp_rx_hist_entry_new(ccid3_rx_hist, sk, opt_recv->dccpor_ndp, - skb, SLAB_ATOMIC); + skb, GFP_ATOMIC); if (unlikely(packet == NULL)) { - LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, Not enough mem to " - "add rx packet to history, consider it lost!\n", - __FUNCTION__, dccp_role(sk), sk); + DCCP_WARN("%s(%p), Not enough mem to add rx packet " + "to history, consider it lost!\n", dccp_role(sk), sk); return; } - win_count = packet->dccphrx_ccval; - - ins = dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist, - &hcrx->ccid3hcrx_li_hist, packet); + loss = ccid3_hc_rx_detect_loss(sk, packet); if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK) return; + payload_size = skb->len - dccp_hdr(skb)->dccph_doff * 4; + ccid3_hc_rx_update_s(hcrx, payload_size); + switch (hcrx->ccid3hcrx_state) { case TFRC_RSTATE_NO_DATA: - ccid3_pr_debug("%s, sk=%p(%s), skb=%p, sending initial " - "feedback\n", - dccp_role(sk), sk, + ccid3_pr_debug("%s(%p, state=%s), skb=%p, sending initial " + "feedback\n", dccp_role(sk), sk, dccp_state_name(sk->sk_state), skb); ccid3_hc_rx_send_feedback(sk); ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); return; case TFRC_RSTATE_DATA: - hcrx->ccid3hcrx_bytes_recv += skb->len - - dccp_hdr(skb)->dccph_doff * 4; - if (ins != 0) + hcrx->ccid3hcrx_bytes_recv += payload_size; + if (loss) break; dccp_timestamp(sk, &now); - if (timeval_delta(&now, &hcrx->ccid3hcrx_tstamp_last_ack) >= - hcrx->ccid3hcrx_rtt) { + if ((timeval_delta(&now, &hcrx->ccid3hcrx_tstamp_last_ack) - + (suseconds_t)hcrx->ccid3hcrx_rtt) >= 0) { hcrx->ccid3hcrx_tstamp_last_ack = now; ccid3_hc_rx_send_feedback(sk); } return; - default: - printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", - __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state); - dump_stack(); + case TFRC_RSTATE_TERM: + DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); return; } /* Dealing with packet loss */ - ccid3_pr_debug("%s, sk=%p(%s), data loss! Reacting...\n", + ccid3_pr_debug("%s(%p, state=%s), data loss! Reacting...\n", dccp_role(sk), sk, dccp_state_name(sk->sk_state)); - ccid3_hc_rx_detect_loss(sk); p_prev = hcrx->ccid3hcrx_p; /* Calculate loss event rate */ @@ -1039,7 +1081,8 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) /* Scaling up by 1000000 as fixed decimal */ if (i_mean != 0) hcrx->ccid3hcrx_p = 1000000 / i_mean; - } + } else + DCCP_BUG("empty loss history"); if (hcrx->ccid3hcrx_p > p_prev) { ccid3_hc_rx_send_feedback(sk); @@ -1047,39 +1090,25 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) } } -static int ccid3_hc_rx_init(struct sock *sk) +static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) { - struct dccp_sock *dp = dccp_sk(sk); - struct ccid3_hc_rx_sock *hcrx; - - ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); - dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx), gfp_any()); - if (dp->dccps_hc_rx_ccid_private == NULL) - return -ENOMEM; - - hcrx = ccid3_hc_rx_sk(sk); - memset(hcrx, 0, sizeof(*hcrx)); - - if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE && - dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE) - hcrx->ccid3hcrx_s = dp->dccps_packet_size; - else - hcrx->ccid3hcrx_s = TFRC_STD_PACKET_SIZE; + ccid3_pr_debug("entry\n"); hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist); INIT_LIST_HEAD(&hcrx->ccid3hcrx_li_hist); dccp_timestamp(sk, &hcrx->ccid3hcrx_tstamp_last_ack); hcrx->ccid3hcrx_tstamp_last_feedback = hcrx->ccid3hcrx_tstamp_last_ack; - hcrx->ccid3hcrx_rtt = 5000; /* XXX 5ms for now... */ + hcrx->ccid3hcrx_s = 0; + hcrx->ccid3hcrx_rtt = 0; return 0; } static void ccid3_hc_rx_exit(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); BUG_ON(hcrx == NULL); @@ -1090,9 +1119,6 @@ static void ccid3_hc_rx_exit(struct sock *sk) /* Empty loss interval history */ dccp_li_hist_purge(ccid3_li_hist, &hcrx->ccid3hcrx_li_hist); - - kfree(dp->dccps_hc_rx_ccid_private); - dp->dccps_hc_rx_ccid_private = NULL; } static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) @@ -1105,9 +1131,9 @@ static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) BUG_ON(hcrx == NULL); - info->tcpi_ca_state = hcrx->ccid3hcrx_state; - info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; + info->tcpi_ca_state = hcrx->ccid3hcrx_state; + info->tcpi_options |= TCPI_OPT_TIMESTAMPS; + info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; } static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) @@ -1178,19 +1204,18 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, return 0; } -static struct ccid ccid3 = { - .ccid_id = 3, +static struct ccid_operations ccid3 = { + .ccid_id = DCCPC_CCID3, .ccid_name = "ccid3", .ccid_owner = THIS_MODULE, - .ccid_init = ccid3_init, - .ccid_exit = ccid3_exit, + .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock), .ccid_hc_tx_init = ccid3_hc_tx_init, .ccid_hc_tx_exit = ccid3_hc_tx_exit, .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet, .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent, .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv, - .ccid_hc_tx_insert_options = ccid3_hc_tx_insert_options, .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options, + .ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock), .ccid_hc_rx_init = ccid3_hc_rx_init, .ccid_hc_rx_exit = ccid3_hc_rx_exit, .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options, @@ -1200,9 +1225,11 @@ static struct ccid ccid3 = { .ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt, .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, }; - + +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG module_param(ccid3_debug, int, 0444); MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); +#endif static __init int ccid3_module_init(void) { @@ -1221,7 +1248,7 @@ static __init int ccid3_module_init(void) goto out_free_tx; rc = ccid_register(&ccid3); - if (rc != 0) + if (rc != 0) goto out_free_loss_interval_history; out: return rc; @@ -1241,15 +1268,6 @@ module_init(ccid3_module_init); static __exit void ccid3_module_exit(void) { -#ifdef CONFIG_IP_DCCP_UNLOAD_HACK - /* - * Hack to use while developing, so that we get rid of the control - * sock, that is what keeps a refcount on dccp.ko -acme - */ - extern void dccp_ctl_sock_exit(void); - - dccp_ctl_sock_exit(); -#endif ccid_unregister(&ccid3); if (ccid3_tx_hist != NULL) { @@ -1267,7 +1285,7 @@ static __exit void ccid3_module_exit(void) } module_exit(ccid3_module_exit); -MODULE_AUTHOR("Ian McDonald , " +MODULE_AUTHOR("Ian McDonald , " "Arnaldo Carvalho de Melo "); MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID"); MODULE_LICENSE("GPL");