diff options
Diffstat (limited to 'net/dccp/ccids/ccid2.c')
| -rw-r--r-- | net/dccp/ccids/ccid2.c | 807 |
1 files changed, 393 insertions, 414 deletions
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index b5b52ebb269..f053198e730 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -1,6 +1,4 @@ /* - * net/dccp/ccids/ccid2.c - * * Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk> * * Changes to meet Linux coding standards, and DCCP infrastructure fixes. @@ -25,70 +23,26 @@ /* * This implementation should follow RFC 4341 */ - -#include "../ccid.h" -#include "../dccp.h" +#include <linux/slab.h> +#include "../feat.h" #include "ccid2.h" #ifdef CONFIG_IP_DCCP_CCID2_DEBUG -static int ccid2_debug; +static bool ccid2_debug; #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) - -static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx) -{ - int len = 0; - int pipe = 0; - struct ccid2_seq *seqp = hctx->ccid2hctx_seqh; - - /* there is data in the chain */ - if (seqp != hctx->ccid2hctx_seqt) { - seqp = seqp->ccid2s_prev; - len++; - if (!seqp->ccid2s_acked) - pipe++; - - while (seqp != hctx->ccid2hctx_seqt) { - struct ccid2_seq *prev = seqp->ccid2s_prev; - - len++; - if (!prev->ccid2s_acked) - pipe++; - - /* packets are sent sequentially */ - BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, - prev->ccid2s_seq ) >= 0); - BUG_ON(time_before(seqp->ccid2s_sent, - prev->ccid2s_sent)); - - seqp = prev; - } - } - - BUG_ON(pipe != hctx->ccid2hctx_pipe); - ccid2_pr_debug("len of chain=%d\n", len); - - do { - seqp = seqp->ccid2s_prev; - len++; - } while (seqp != hctx->ccid2hctx_seqh); - - ccid2_pr_debug("total len=%d\n", len); - BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN); -} #else #define ccid2_pr_debug(format, a...) -#define ccid2_hc_tx_check_sanity(hctx) #endif -static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) +static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) { struct ccid2_seq *seqp; int i; /* check if we have space to preserve the pointer to the buffer */ - if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) / - sizeof(struct ccid2_seq*))) + if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) / + sizeof(struct ccid2_seq *))) return -ENOMEM; /* allocate buffer and initialize linked list */ @@ -104,38 +58,34 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; /* This is the first allocation. Initiate the head and tail. */ - if (hctx->ccid2hctx_seqbufc == 0) - hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp; + if (hc->tx_seqbufc == 0) + hc->tx_seqh = hc->tx_seqt = seqp; else { /* link the existing list with the one we just created */ - hctx->ccid2hctx_seqh->ccid2s_next = seqp; - seqp->ccid2s_prev = hctx->ccid2hctx_seqh; + hc->tx_seqh->ccid2s_next = seqp; + seqp->ccid2s_prev = hc->tx_seqh; - hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt; + hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; + seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt; } /* store the original pointer to the buffer so we can free it */ - hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp; - hctx->ccid2hctx_seqbufc++; + hc->tx_seqbuf[hc->tx_seqbufc] = seqp; + hc->tx_seqbufc++; return 0; } static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - - if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) - return 0; - - return 1; /* XXX CCID should dequeue when ready instead of polling */ + if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) + return CCID_PACKET_WILL_DEQUEUE_LATER; + return CCID_PACKET_SEND_AT_ONCE; } static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) { - struct dccp_sock *dp = dccp_sk(sk); - u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2); + u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2); /* * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from @@ -147,109 +97,186 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); val = max_ratio; } - if (val > 0xFFFF) /* RFC 4340, 11.3 */ - val = 0xFFFF; + dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO, + min_t(u32, val, DCCPF_ACK_RATIO_MAX)); +} - if (val == dp->dccps_l_ack_ratio) - return; +static void ccid2_check_l_ack_ratio(struct sock *sk) +{ + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - ccid2_pr_debug("changing local ack ratio to %u\n", val); - dp->dccps_l_ack_ratio = val; + /* + * After a loss, idle period, application limited period, or RTO we + * need to check that the ack ratio is still less than the congestion + * window. Otherwise, we will send an entire congestion window of + * packets and got no response because we haven't sent ack ratio + * packets yet. + * If the ack ratio does need to be reduced, we reduce it to half of + * the congestion window (or 1 if that's zero) instead of to the + * congestion window. This prevents problems if one ack is lost. + */ + if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd) + ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? : 1U); } -static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) +static void ccid2_change_l_seq_window(struct sock *sk, u64 val) { - ccid2_pr_debug("change SRTT to %ld\n", val); - hctx->ccid2hctx_srtt = val; + dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW, + clamp_val(val, DCCPF_SEQ_WMIN, + DCCPF_SEQ_WMAX)); } -static void ccid2_start_rto_timer(struct sock *sk); - static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - long s; + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { - sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, - jiffies + HZ / 5); + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5); goto out; } ccid2_pr_debug("RTO_EXPIRE\n"); - ccid2_hc_tx_check_sanity(hctx); - /* back-off timer */ - hctx->ccid2hctx_rto <<= 1; - - s = hctx->ccid2hctx_rto / HZ; - if (s > 60) - hctx->ccid2hctx_rto = 60 * HZ; - - ccid2_start_rto_timer(sk); + hc->tx_rto <<= 1; + if (hc->tx_rto > DCCP_RTO_MAX) + hc->tx_rto = DCCP_RTO_MAX; /* adjust pipe, cwnd etc */ - hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2; - if (hctx->ccid2hctx_ssthresh < 2) - hctx->ccid2hctx_ssthresh = 2; - hctx->ccid2hctx_cwnd = 1; - hctx->ccid2hctx_pipe = 0; + hc->tx_ssthresh = hc->tx_cwnd / 2; + if (hc->tx_ssthresh < 2) + hc->tx_ssthresh = 2; + hc->tx_cwnd = 1; + hc->tx_pipe = 0; /* clear state about stuff we sent */ - hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; - hctx->ccid2hctx_packets_acked = 0; + hc->tx_seqt = hc->tx_seqh; + hc->tx_packets_acked = 0; /* clear ack ratio state. */ - hctx->ccid2hctx_rpseq = 0; - hctx->ccid2hctx_rpdupack = -1; + hc->tx_rpseq = 0; + hc->tx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); - ccid2_hc_tx_check_sanity(hctx); + + /* if we were blocked before, we may now send cwnd=1 packet */ + if (sender_was_blocked) + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); + /* restart backed-off timer */ + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); out: bh_unlock_sock(sk); sock_put(sk); } -static void ccid2_start_rto_timer(struct sock *sk) +/* + * Congestion window validation (RFC 2861). + */ +static bool ccid2_do_cwv = true; +module_param(ccid2_do_cwv, bool, 0644); +MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation"); + +/** + * ccid2_update_used_window - Track how much of cwnd is actually used + * This is done in addition to CWV. The sender needs to have an idea of how many + * packets may be in flight, to set the local Sequence Window value accordingly + * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the + * maximum-used window. We use an EWMA low-pass filter to filter out noise. + */ +static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd) { - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4; +} - ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto); +/* This borrows the code of tcp_cwnd_application_limited() */ +static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now) +{ + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + /* don't reduce cwnd below the initial window (IW) */ + u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache), + win_used = max(hc->tx_cwnd_used, init_win); + + if (win_used < hc->tx_cwnd) { + hc->tx_ssthresh = max(hc->tx_ssthresh, + (hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2)); + hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1; + } + hc->tx_cwnd_used = 0; + hc->tx_cwnd_stamp = now; - BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer)); - sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, - jiffies + hctx->ccid2hctx_rto); + ccid2_check_l_ack_ratio(sk); } -static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) +/* This borrows the code of tcp_cwnd_restart() */ +static void ccid2_cwnd_restart(struct sock *sk, const u32 now) +{ + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + u32 cwnd = hc->tx_cwnd, restart_cwnd, + iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache); + + hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2)); + + /* don't reduce cwnd below the initial window (IW) */ + restart_cwnd = min(cwnd, iwnd); + cwnd >>= (now - hc->tx_lsndtime) / hc->tx_rto; + hc->tx_cwnd = max(cwnd, restart_cwnd); + + hc->tx_cwnd_stamp = now; + hc->tx_cwnd_used = 0; + + ccid2_check_l_ack_ratio(sk); +} + +static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + const u32 now = ccid2_time_stamp; struct ccid2_seq *next; - hctx->ccid2hctx_pipe++; + /* slow-start after idle periods (RFC 2581, RFC 2861) */ + if (ccid2_do_cwv && !hc->tx_pipe && + (s32)(now - hc->tx_lsndtime) >= hc->tx_rto) + ccid2_cwnd_restart(sk, now); - hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss; - hctx->ccid2hctx_seqh->ccid2s_acked = 0; - hctx->ccid2hctx_seqh->ccid2s_sent = jiffies; + hc->tx_lsndtime = now; + hc->tx_pipe += 1; - next = hctx->ccid2hctx_seqh->ccid2s_next; + /* see whether cwnd was fully used (RFC 2861), update expected window */ + if (ccid2_cwnd_network_limited(hc)) { + ccid2_update_used_window(hc, hc->tx_cwnd); + hc->tx_cwnd_used = 0; + hc->tx_cwnd_stamp = now; + } else { + if (hc->tx_pipe > hc->tx_cwnd_used) + hc->tx_cwnd_used = hc->tx_pipe; + + ccid2_update_used_window(hc, hc->tx_cwnd_used); + + if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto) + ccid2_cwnd_application_limited(sk, now); + } + + hc->tx_seqh->ccid2s_seq = dp->dccps_gss; + hc->tx_seqh->ccid2s_acked = 0; + hc->tx_seqh->ccid2s_sent = now; + + next = hc->tx_seqh->ccid2s_next; /* check if we need to alloc more space */ - if (next == hctx->ccid2hctx_seqt) { - if (ccid2_hc_tx_alloc_seq(hctx)) { + if (next == hc->tx_seqt) { + if (ccid2_hc_tx_alloc_seq(hc)) { DCCP_CRIT("packet history - out of memory!"); /* FIXME: find a more graceful way to bail out */ return; } - next = hctx->ccid2hctx_seqh->ccid2s_next; - BUG_ON(next == hctx->ccid2hctx_seqt); + next = hc->tx_seqh->ccid2s_next; + BUG_ON(next == hc->tx_seqt); } - hctx->ccid2hctx_seqh = next; + hc->tx_seqh = next; - ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd, - hctx->ccid2hctx_pipe); + ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe); /* * FIXME: The code below is broken and the variables have been removed @@ -272,12 +299,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) */ #if 0 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ - hctx->ccid2hctx_arsent++; + hc->tx_arsent++; /* We had an ack loss in this window... */ - if (hctx->ccid2hctx_ackloss) { - if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) { - hctx->ccid2hctx_arsent = 0; - hctx->ccid2hctx_ackloss = 0; + if (hc->tx_ackloss) { + if (hc->tx_arsent >= hc->tx_cwnd) { + hc->tx_arsent = 0; + hc->tx_ackloss = 0; } } else { /* No acks lost up to now... */ @@ -287,234 +314,203 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - dp->dccps_l_ack_ratio; - denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom; + denom = hc->tx_cwnd * hc->tx_cwnd / denom; - if (hctx->ccid2hctx_arsent >= denom) { + if (hc->tx_arsent >= denom) { ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); - hctx->ccid2hctx_arsent = 0; + hc->tx_arsent = 0; } } else { /* we can't increase ack ratio further [1] */ - hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/ + hc->tx_arsent = 0; /* or maybe set it to cwnd*/ } } #endif - /* setup RTO timer */ - if (!timer_pending(&hctx->ccid2hctx_rtotimer)) - ccid2_start_rto_timer(sk); + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); #ifdef CONFIG_IP_DCCP_CCID2_DEBUG do { - struct ccid2_seq *seqp = hctx->ccid2hctx_seqt; + struct ccid2_seq *seqp = hc->tx_seqt; - while (seqp != hctx->ccid2hctx_seqh) { - ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", + while (seqp != hc->tx_seqh) { + ccid2_pr_debug("out seq=%llu acked=%d time=%u\n", (unsigned long long)seqp->ccid2s_seq, seqp->ccid2s_acked, seqp->ccid2s_sent); seqp = seqp->ccid2s_next; } } while (0); ccid2_pr_debug("=========\n"); - ccid2_hc_tx_check_sanity(hctx); #endif } -/* XXX Lame code duplication! - * returns -1 if none was found. - * else returns the next offset to use in the function call. +/** + * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm + * This code is almost identical with TCP's tcp_rtt_estimator(), since + * - it has a higher sampling frequency (recommended by RFC 1323), + * - the RTO does not collapse into RTT due to RTTVAR going towards zero, + * - it is simple (cf. more complex proposals such as Eifel timer or research + * which suggests that the gain should be set according to window size), + * - in tests it was found to work well with CCID2 [gerrit]. */ -static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset, - unsigned char **vec, unsigned char *veclen) +static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) { - const struct dccp_hdr *dh = dccp_hdr(skb); - unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); - unsigned char *opt_ptr; - const unsigned char *opt_end = (unsigned char *)dh + - (dh->dccph_doff * 4); - unsigned char opt, len; - unsigned char *value; - - BUG_ON(offset < 0); - options += offset; - opt_ptr = options; - if (opt_ptr >= opt_end) - return -1; - - while (opt_ptr != opt_end) { - opt = *opt_ptr++; - len = 0; - value = NULL; - - /* Check if this isn't a single byte option */ - if (opt > DCCPO_MAX_RESERVED) { - if (opt_ptr == opt_end) - goto out_invalid_option; - - len = *opt_ptr++; - if (len < 3) - goto out_invalid_option; + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + long m = mrtt ? : 1; + + if (hc->tx_srtt == 0) { + /* First measurement m */ + hc->tx_srtt = m << 3; + hc->tx_mdev = m << 1; + + hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk)); + hc->tx_rttvar = hc->tx_mdev_max; + + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; + } else { + /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ + m -= (hc->tx_srtt >> 3); + hc->tx_srtt += m; + + /* Similarly, update scaled mdev with regard to |m| */ + if (m < 0) { + m = -m; + m -= (hc->tx_mdev >> 2); /* - * Remove the type and len fields, leaving - * just the value size + * This neutralises RTO increase when RTT < SRTT - mdev + * (see P. Sarolahti, A. Kuznetsov,"Congestion Control + * in Linux TCP", USENIX 2002, pp. 49-62). */ - len -= 2; - value = opt_ptr; - opt_ptr += len; + if (m > 0) + m >>= 3; + } else { + m -= (hc->tx_mdev >> 2); + } + hc->tx_mdev += m; - if (opt_ptr > opt_end) - goto out_invalid_option; + if (hc->tx_mdev > hc->tx_mdev_max) { + hc->tx_mdev_max = hc->tx_mdev; + if (hc->tx_mdev_max > hc->tx_rttvar) + hc->tx_rttvar = hc->tx_mdev_max; } - switch (opt) { - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - *vec = value; - *veclen = len; - return offset + (opt_ptr - options); + /* + * Decay RTTVAR at most once per flight, exploiting that + * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) + * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) + * GAR is a useful bound for FlightSize = pipe. + * AWL is probably too low here, as it over-estimates pipe. + */ + if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) { + if (hc->tx_mdev_max < hc->tx_rttvar) + hc->tx_rttvar -= (hc->tx_rttvar - + hc->tx_mdev_max) >> 2; + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; + hc->tx_mdev_max = tcp_rto_min(sk); } } - return -1; - -out_invalid_option: - DCCP_BUG("Invalid option - this should not happen (previous parsing)!"); - return -1; -} - -static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + /* + * Set RTO from SRTT and RTTVAR + * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms. + * This agrees with RFC 4341, 5: + * "Because DCCP does not retransmit data, DCCP does not require + * TCP's recommended minimum timeout of one second". + */ + hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar; - sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer); - ccid2_pr_debug("deleted RTO timer\n"); + if (hc->tx_rto > DCCP_RTO_MAX) + hc->tx_rto = DCCP_RTO_MAX; } -static inline void ccid2_new_ack(struct sock *sk, - struct ccid2_seq *seqp, - unsigned int *maxincr) +static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, + unsigned int *maxincr) { - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - - if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) { - if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) { - hctx->ccid2hctx_cwnd += 1; - *maxincr -= 1; - hctx->ccid2hctx_packets_acked = 0; - } - } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) { - hctx->ccid2hctx_cwnd += 1; - hctx->ccid2hctx_packets_acked = 0; - } - - /* update RTO */ - if (hctx->ccid2hctx_srtt == -1 || - time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) { - unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; - int s; - - /* first measurement */ - if (hctx->ccid2hctx_srtt == -1) { - ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", - r, jiffies, - (unsigned long long)seqp->ccid2s_seq); - ccid2_change_srtt(hctx, r); - hctx->ccid2hctx_rttvar = r >> 1; - } else { - /* RTTVAR */ - long tmp = hctx->ccid2hctx_srtt - r; - long srtt; - - if (tmp < 0) - tmp *= -1; - - tmp >>= 2; - hctx->ccid2hctx_rttvar *= 3; - hctx->ccid2hctx_rttvar >>= 2; - hctx->ccid2hctx_rttvar += tmp; - - /* SRTT */ - srtt = hctx->ccid2hctx_srtt; - srtt *= 7; - srtt >>= 3; - tmp = r >> 3; - srtt += tmp; - ccid2_change_srtt(hctx, srtt); + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio; + + if (hc->tx_cwnd < dp->dccps_l_seq_win && + r_seq_used < dp->dccps_r_seq_win) { + if (hc->tx_cwnd < hc->tx_ssthresh) { + if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) { + hc->tx_cwnd += 1; + *maxincr -= 1; + hc->tx_packets_acked = 0; + } + } else if (++hc->tx_packets_acked >= hc->tx_cwnd) { + hc->tx_cwnd += 1; + hc->tx_packets_acked = 0; } - s = hctx->ccid2hctx_rttvar << 2; - /* clock granularity is 1 when based on jiffies */ - if (!s) - s = 1; - hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s; - - /* must be at least a second */ - s = hctx->ccid2hctx_rto / HZ; - /* DCCP doesn't require this [but I like it cuz my code sux] */ -#if 1 - if (s < 1) - hctx->ccid2hctx_rto = HZ; -#endif - /* max 60 seconds */ - if (s > 60) - hctx->ccid2hctx_rto = HZ * 60; - - hctx->ccid2hctx_lastrtt = jiffies; - - ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", - hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar, - hctx->ccid2hctx_rto, HZ, r); } - /* we got a new ack, so re-start RTO timer */ - ccid2_hc_tx_kill_rto_timer(sk); - ccid2_start_rto_timer(sk); -} - -static void ccid2_hc_tx_dec_pipe(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + /* + * Adjust the local sequence window and the ack ratio to allow about + * 5 times the number of packets in the network (RFC 4340 7.5.2) + */ + if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win) + ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2); + else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2) + ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U); - if (hctx->ccid2hctx_pipe == 0) - DCCP_BUG("pipe == 0"); - else - hctx->ccid2hctx_pipe--; + if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win) + ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2); + else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2) + ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2); - if (hctx->ccid2hctx_pipe == 0) - ccid2_hc_tx_kill_rto_timer(sk); + /* + * FIXME: RTT is sampled several times per acknowledgment (for each + * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). + * This causes the RTT to be over-estimated, since the older entries + * in the Ack Vector have earlier sending times. + * The cleanest solution is to not use the ccid2s_sent field at all + * and instead use DCCP timestamps: requires changes in other places. + */ + ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) { - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) { + if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) { ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); return; } - hctx->ccid2hctx_last_cong = jiffies; + hc->tx_last_cong = ccid2_time_stamp; + + hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U; + hc->tx_ssthresh = max(hc->tx_cwnd, 2U); - hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U; - hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U); + ccid2_check_l_ack_ratio(sk); +} + +static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, + u8 option, u8 *optval, u8 optlen) +{ + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ - if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd) - ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd); + switch (option) { + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen, + option - DCCPO_ACK_VECTOR_0); + } + return 0; } static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); + struct dccp_ackvec_parsed *avp; u64 ackno, seqno; struct ccid2_seq *seqp; - unsigned char *vector; - unsigned char veclen; - int offset = 0; int done = 0; unsigned int maxincr = 0; - ccid2_hc_tx_check_sanity(hctx); /* check reverse path congestion */ seqno = DCCP_SKB_CB(skb)->dccpd_seq; @@ -523,49 +519,52 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * -sorbo. */ /* need to bootstrap */ - if (hctx->ccid2hctx_rpdupack == -1) { - hctx->ccid2hctx_rpdupack = 0; - hctx->ccid2hctx_rpseq = seqno; + if (hc->tx_rpdupack == -1) { + hc->tx_rpdupack = 0; + hc->tx_rpseq = seqno; } else { /* check if packet is consecutive */ - if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1) - hctx->ccid2hctx_rpseq = seqno; + if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1) + hc->tx_rpseq = seqno; /* it's a later packet */ - else if (after48(seqno, hctx->ccid2hctx_rpseq)) { - hctx->ccid2hctx_rpdupack++; + else if (after48(seqno, hc->tx_rpseq)) { + hc->tx_rpdupack++; /* check if we got enough dupacks */ - if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) { - hctx->ccid2hctx_rpdupack = -1; /* XXX lame */ - hctx->ccid2hctx_rpseq = 0; - + if (hc->tx_rpdupack >= NUMDUPACK) { + hc->tx_rpdupack = -1; /* XXX lame */ + hc->tx_rpseq = 0; +#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__ + /* + * FIXME: Ack Congestion Control is broken; in + * the current state instabilities occurred with + * Ack Ratios greater than 1; causing hang-ups + * and long RTO timeouts. This needs to be fixed + * before opening up dynamic changes. -- gerrit + */ ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); +#endif } } } /* check forward path congestion */ - /* still didn't send out new data packets */ - if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt) + if (dccp_packet_without_ack(skb)) return; - switch (DCCP_SKB_CB(skb)->dccpd_type) { - case DCCP_PKT_ACK: - case DCCP_PKT_DATAACK: - break; - default: - return; - } + /* still didn't send out new data packets */ + if (hc->tx_seqh == hc->tx_seqt) + goto done; ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - if (after48(ackno, hctx->ccid2hctx_high_ack)) - hctx->ccid2hctx_high_ack = ackno; + if (after48(ackno, hc->tx_high_ack)) + hc->tx_high_ack = ackno; - seqp = hctx->ccid2hctx_seqt; + seqp = hc->tx_seqt; while (before48(seqp->ccid2s_seq, ackno)) { seqp = seqp->ccid2s_next; - if (seqp == hctx->ccid2hctx_seqh) { - seqp = hctx->ccid2hctx_seqh->ccid2s_prev; + if (seqp == hc->tx_seqh) { + seqp = hc->tx_seqh->ccid2s_prev; break; } } @@ -575,26 +574,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * packets per acknowledgement. Rounding up avoids that cwnd is not * advanced when Ack Ratio is 1 and gives a slight edge otherwise. */ - if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) + if (hc->tx_cwnd < hc->tx_ssthresh) maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); /* go through all ack vectors */ - while ((offset = ccid2_ackvector(sk, skb, offset, - &vector, &veclen)) != -1) { + list_for_each_entry(avp, &hc->tx_av_chunks, node) { /* go through this ack vector */ - while (veclen--) { - const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; - u64 ackno_end_rl = SUB48(ackno, rl); + for (; avp->len--; avp->vec++) { + u64 ackno_end_rl = SUB48(ackno, + dccp_ackvec_runlen(avp->vec)); - ccid2_pr_debug("ackvec start:%llu end:%llu\n", + ccid2_pr_debug("ackvec %llu |%u,%u|\n", (unsigned long long)ackno, - (unsigned long long)ackno_end_rl); + dccp_ackvec_state(avp->vec) >> 6, + dccp_ackvec_runlen(avp->vec)); /* if the seqno we are analyzing is larger than the * current ackno, then move towards the tail of our * seqnos. */ while (after48(seqp->ccid2s_seq, ackno)) { - if (seqp == hctx->ccid2hctx_seqt) { + if (seqp == hc->tx_seqt) { done = 1; break; } @@ -607,26 +606,24 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * run length */ while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = *vector & - DCCP_ACKVEC_STATE_MASK; + const u8 state = dccp_ackvec_state(avp->vec); /* new packet received or marked */ - if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED && + if (state != DCCPAV_NOT_RECEIVED && !seqp->ccid2s_acked) { - if (state == - DCCP_ACKVEC_STATE_ECN_MARKED) { + if (state == DCCPAV_ECN_MARKED) ccid2_congestion_event(sk, seqp); - } else + else ccid2_new_ack(sk, seqp, &maxincr); seqp->ccid2s_acked = 1; ccid2_pr_debug("Got ack for %llu\n", (unsigned long long)seqp->ccid2s_seq); - ccid2_hc_tx_dec_pipe(sk); + hc->tx_pipe--; } - if (seqp == hctx->ccid2hctx_seqt) { + if (seqp == hc->tx_seqt) { done = 1; break; } @@ -636,7 +633,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) break; ackno = SUB48(ackno_end_rl, 1); - vector++; } if (done) break; @@ -645,11 +641,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* The state about what is acked should be correct now * Check for NUMDUPACK */ - seqp = hctx->ccid2hctx_seqt; - while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) { + seqp = hc->tx_seqt; + while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) { seqp = seqp->ccid2s_next; - if (seqp == hctx->ccid2hctx_seqh) { - seqp = hctx->ccid2hctx_seqh->ccid2s_prev; + if (seqp == hc->tx_seqh) { + seqp = hc->tx_seqh->ccid2s_prev; break; } } @@ -660,7 +656,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (done == NUMDUPACK) break; } - if (seqp == hctx->ccid2hctx_seqt) + if (seqp == hc->tx_seqt) break; seqp = seqp->ccid2s_prev; } @@ -681,125 +677,108 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * one ack vector. */ ccid2_congestion_event(sk, seqp); - ccid2_hc_tx_dec_pipe(sk); + hc->tx_pipe--; } - if (seqp == hctx->ccid2hctx_seqt) + if (seqp == hc->tx_seqt) break; seqp = seqp->ccid2s_prev; } - hctx->ccid2hctx_seqt = last_acked; + hc->tx_seqt = last_acked; } /* trim acked packets in tail */ - while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) { - if (!hctx->ccid2hctx_seqt->ccid2s_acked) + while (hc->tx_seqt != hc->tx_seqh) { + if (!hc->tx_seqt->ccid2s_acked) break; - hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next; + hc->tx_seqt = hc->tx_seqt->ccid2s_next; } - ccid2_hc_tx_check_sanity(hctx); + /* restart RTO timer if not all outstanding data has been acked */ + if (hc->tx_pipe == 0) + sk_stop_timer(sk, &hc->tx_rtotimer); + else + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); +done: + /* check if incoming Acks allow pending packets to be sent */ + if (sender_was_blocked && !ccid2_cwnd_network_limited(hc)) + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); + dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) { - struct ccid2_hc_tx_sock *hctx = ccid_priv(ccid); + struct ccid2_hc_tx_sock *hc = ccid_priv(ccid); struct dccp_sock *dp = dccp_sk(sk); u32 max_ratio; /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ - hctx->ccid2hctx_ssthresh = ~0U; + hc->tx_ssthresh = ~0U; - /* - * RFC 4341, 5: "The cwnd parameter is initialized to at most four - * packets for new connections, following the rules from [RFC3390]". - * We need to convert the bytes of RFC3390 into the packets of RFC 4341. - */ - hctx->ccid2hctx_cwnd = min(4U, max(2U, 4380U / dp->dccps_mss_cache)); + /* Use larger initial windows (RFC 4341, section 5). */ + hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); + hc->tx_expected_wnd = hc->tx_cwnd; /* Make sure that Ack Ratio is enabled and within bounds. */ - max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2); + max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) dp->dccps_l_ack_ratio = max_ratio; /* XXX init ~ to window size... */ - if (ccid2_hc_tx_alloc_seq(hctx)) + if (ccid2_hc_tx_alloc_seq(hc)) return -ENOMEM; - hctx->ccid2hctx_rto = 3 * HZ; - ccid2_change_srtt(hctx, -1); - hctx->ccid2hctx_rttvar = -1; - hctx->ccid2hctx_rpdupack = -1; - hctx->ccid2hctx_last_cong = jiffies; - setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire, + hc->tx_rto = DCCP_TIMEOUT_INIT; + hc->tx_rpdupack = -1; + hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_time_stamp; + hc->tx_cwnd_used = 0; + setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); - - ccid2_hc_tx_check_sanity(hctx); + INIT_LIST_HEAD(&hc->tx_av_chunks); return 0; } static void ccid2_hc_tx_exit(struct sock *sk) { - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); int i; - ccid2_hc_tx_kill_rto_timer(sk); + sk_stop_timer(sk, &hc->tx_rtotimer); - for (i = 0; i < hctx->ccid2hctx_seqbufc; i++) - kfree(hctx->ccid2hctx_seqbuf[i]); - hctx->ccid2hctx_seqbufc = 0; + for (i = 0; i < hc->tx_seqbufc; i++) + kfree(hc->tx_seqbuf[i]); + hc->tx_seqbufc = 0; } static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { - const struct dccp_sock *dp = dccp_sk(sk); - struct ccid2_hc_rx_sock *hcrx = ccid2_hc_rx_sk(sk); - - switch (DCCP_SKB_CB(skb)->dccpd_type) { - case DCCP_PKT_DATA: - case DCCP_PKT_DATAACK: - hcrx->ccid2hcrx_data++; - if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) { - dccp_send_ack(sk); - hcrx->ccid2hcrx_data = 0; - } - break; + struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk); + + if (!dccp_data_packet(skb)) + return; + + if (++hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) { + dccp_send_ack(sk); + hc->rx_num_data_pkts = 0; } } -static struct ccid_operations ccid2 = { - .ccid_id = DCCPC_CCID2, - .ccid_name = "TCP-like", - .ccid_owner = THIS_MODULE, - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), - .ccid_hc_tx_init = ccid2_hc_tx_init, - .ccid_hc_tx_exit = ccid2_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, +struct ccid_operations ccid2_ops = { + .ccid_id = DCCPC_CCID2, + .ccid_name = "TCP-like", + .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), + .ccid_hc_tx_init = ccid2_hc_tx_init, + .ccid_hc_tx_exit = ccid2_hc_tx_exit, + .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, + .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, + .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, + .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, + .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), + .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, }; #ifdef CONFIG_IP_DCCP_CCID2_DEBUG -module_param(ccid2_debug, bool, 0444); -MODULE_PARM_DESC(ccid2_debug, "Enable debug messages"); +module_param(ccid2_debug, bool, 0644); +MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages"); #endif - -static __init int ccid2_module_init(void) -{ - return ccid_register(&ccid2); -} -module_init(ccid2_module_init); - -static __exit void ccid2_module_exit(void) -{ - ccid_unregister(&ccid2); -} -module_exit(ccid2_module_exit); - -MODULE_AUTHOR("Andrea Bittau <a.bittau@cs.ucl.ac.uk>"); -MODULE_DESCRIPTION("DCCP TCP-Like (CCID2) CCID"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("net-dccp-ccid-2"); |
