cc_cubic: sync to the new specification of RFC9438 for TCP CUBIC.

Reviewed by: rscheff
Differential Revision: https://reviews.freebsd.org/D49540
This commit is contained in:
Cheng Cui 2025-03-27 13:16:04 -04:00
parent 3d8f65fbf8
commit a2f579635f
No known key found for this signature in database
GPG key ID: F9BE886D1486EF98
2 changed files with 182 additions and 144 deletions

View file

@ -38,7 +38,7 @@
/*
* An implementation of the CUBIC congestion control algorithm for FreeBSD,
* based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
* based on the Internet RFC9438 by Xu, Ha, Rhee, Goel, and Eggert.
* Originally released as part of the NewTCP research project at Swinburne
* University of Technology's Centre for Advanced Internet Architectures,
* Melbourne, Australia, which was made possible in part by a grant from the
@ -81,7 +81,7 @@ static void cubic_conn_init(struct cc_var *ccv);
static int cubic_mod_init(void);
static void cubic_post_recovery(struct cc_var *ccv);
static void cubic_record_rtt(struct cc_var *ccv);
static void cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg);
static uint32_t cubic_get_ssthresh(struct cc_var *ccv, uint32_t maxseg);
static void cubic_after_idle(struct cc_var *ccv);
static size_t cubic_data_sz(void);
static void cubic_newround(struct cc_var *ccv, uint32_t round_cnt);
@ -236,10 +236,11 @@ static void
cubic_ack_received(struct cc_var *ccv, ccsignal_t type)
{
struct cubic *cubic_data;
unsigned long W_est, W_cubic;
uint32_t W_est, W_cubic, cwin, target, incr;
int usecs_since_epoch;
uint32_t mss = tcp_fixed_maxseg(ccv->tp);
cwin = CCV(ccv, snd_cwnd);
cubic_data = ccv->cc_data;
cubic_record_rtt(ccv);
@ -250,7 +251,7 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type)
if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
(ccv->flags & CCF_CWND_LIMITED)) {
/* Use the logic in NewReno ack_received() for slow start. */
if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
if (cwin <= CCV(ccv, snd_ssthresh) ||
cubic_data->min_rtt_usecs == TCPTV_SRTTBASE) {
cubic_does_slow_start(ccv, cubic_data);
} else {
@ -265,20 +266,32 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type)
cubic_data->flags &= ~CUBICFLAG_HYSTART_ENABLED;
cubic_log_hystart_event(ccv, cubic_data, 11, CCV(ccv, snd_ssthresh));
}
if ((cubic_data->flags & CUBICFLAG_RTO_EVENT) &&
(cubic_data->flags & CUBICFLAG_IN_SLOWSTART)) {
/* RFC8312 Section 4.7 */
cubic_data->flags &= ~(CUBICFLAG_RTO_EVENT |
CUBICFLAG_IN_SLOWSTART);
cubic_data->W_max = CCV(ccv, snd_cwnd);
cubic_data->t_epoch = ticks;
cubic_data->K = 0;
} else if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART |
if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART |
CUBICFLAG_CONG_EVENT |
CUBICFLAG_IN_APPLIMIT)) {
cubic_data->flags &= ~(CUBICFLAG_IN_SLOWSTART |
CUBICFLAG_IN_APPLIMIT);
/*
* At the beginning of the current congestion
* avoidance stage, The epoch variables
* (t_epoch, cwnd_epoch, K) are updated in the
* following three cases:
* 1) just exited the slow start
* 2) after a congestion event
* 3) application-limited
*/
cubic_data->t_epoch = ticks;
cubic_data->K = cubic_k(cubic_data->W_max / mss);
cubic_data->cwnd_epoch = cwin;
cubic_data->K = cubic_k(cubic_data->W_max / mss,
cubic_data->cwnd_epoch / mss);
cubic_data->flags &= ~(CUBICFLAG_IN_SLOWSTART |
CUBICFLAG_CONG_EVENT |
CUBICFLAG_IN_APPLIMIT);
if (cubic_data->flags & CUBICFLAG_RTO_EVENT) {
/* RFC9438 Section 4.8: Timeout */
cubic_data->flags &= ~CUBICFLAG_RTO_EVENT;
cubic_data->W_max = cwin;
cubic_data->K = 0;
}
}
usecs_since_epoch = (ticks - cubic_data->t_epoch) * tick;
if (usecs_since_epoch < 0) {
@ -288,12 +301,9 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type)
usecs_since_epoch = INT_MAX;
cubic_data->t_epoch = ticks - INT_MAX;
}
W_est = tf_cwnd(ccv);
/*
* The mean RTT is used to best reflect the equations in
* the I-D.
* The mean RTT is used to best reflect the equations.
*/
W_cubic = cubic_cwnd(usecs_since_epoch +
cubic_data->mean_rtt_usecs,
@ -302,33 +312,24 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type)
cubic_data->K);
if (W_cubic < W_est) {
/*
* TCP-friendly region, follow tf
* cwnd growth.
*/
CCV(ccv, snd_cwnd) = ulmin(W_est, INT_MAX);
/* RFC9438 Section 4.3: Reno-friendly region */
CCV(ccv, snd_cwnd) = W_est;
cubic_data->flags |= CUBICFLAG_IN_TF;
} else if (CCV(ccv, snd_cwnd) < W_cubic) {
} else {
/*
* Concave or convex region, follow CUBIC
* cwnd growth.
* Only update snd_cwnd, if it doesn't shrink.
* RFC9438 Section 4.4 or 4.5:
* Concave or Convex Region
*/
CCV(ccv, snd_cwnd) = ulmin(W_cubic, INT_MAX);
cubic_data->flags &= ~CUBICFLAG_IN_TF;
}
/*
* If we're not in slow start and we're probing for a
* new cwnd limit at the start of a connection
* (happens when hostcache has a relevant entry),
* keep updating our current estimate of the
* W_max.
*/
if (((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) &&
cubic_data->W_max < CCV(ccv, snd_cwnd)) {
cubic_data->W_max = CCV(ccv, snd_cwnd);
cubic_data->K = cubic_k(cubic_data->W_max / mss);
if (W_cubic < cwin) {
target = cwin;
} else if (W_cubic > ((cwin * 3) >> 1)) {
target = (cwin * 3) >> 1;
} else {
target = W_cubic;
}
incr = (((target - cwin) << CUBIC_SHIFT) /
cwin * mss) >> CUBIC_SHIFT;
CCV(ccv, snd_cwnd) = cwin + incr;
}
}
} else if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
@ -345,12 +346,11 @@ cubic_ack_received(struct cc_var *ccv, ccsignal_t type)
static void
cubic_after_idle(struct cc_var *ccv)
{
struct cubic *cubic_data;
cubic_data = ccv->cc_data;
struct cubic *cubic_data = ccv->cc_data;
uint32_t mss = tcp_fixed_maxseg(ccv->tp);
cubic_data->W_max = ulmax(cubic_data->W_max, CCV(ccv, snd_cwnd));
cubic_data->K = cubic_k(cubic_data->W_max / tcp_fixed_maxseg(ccv->tp));
cubic_data->K = cubic_k(cubic_data->W_max / mss, cubic_data->cwnd_epoch / mss);
if ((cubic_data->flags & CUBICFLAG_HYSTART_ENABLED) == 0) {
/*
* Re-enable hystart if we have been idle.
@ -389,7 +389,9 @@ cubic_cb_init(struct cc_var *ccv, void *ptr)
cubic_data = ptr;
/* Init some key variables with sensible defaults. */
cubic_data->t_epoch = ticks;
cubic_data->t_epoch = 0;
cubic_data->cwnd_epoch = 0;
cubic_data->K = 0;
cubic_data->min_rtt_usecs = TCPTV_SRTTBASE;
cubic_data->mean_rtt_usecs = 1;
@ -416,7 +418,7 @@ static void
cubic_cong_signal(struct cc_var *ccv, ccsignal_t type)
{
struct cubic *cubic_data;
uint32_t mss, pipe;
uint32_t mss, pipe, ssthresh;
cubic_data = ccv->cc_data;
mss = tcp_fixed_maxseg(ccv->tp);
@ -431,10 +433,13 @@ cubic_cong_signal(struct cc_var *ccv, ccsignal_t type)
}
if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
cubic_ssthresh_update(ccv, mss);
ssthresh = cubic_get_ssthresh(ccv, mss);
CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * mss);
/*
* The congestion flag will recalculate K at the
* beginning of the congestion avoidance stage.
*/
cubic_data->flags |= CUBICFLAG_CONG_EVENT;
cubic_data->t_epoch = ticks;
cubic_data->K = cubic_k(cubic_data->W_max / mss);
}
ENTER_RECOVERY(CCV(ccv, t_flags));
}
@ -448,17 +453,20 @@ cubic_cong_signal(struct cc_var *ccv, ccsignal_t type)
cubic_log_hystart_event(ccv, cubic_data, 9, CCV(ccv, snd_ssthresh));
}
if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
cubic_ssthresh_update(ccv, mss);
ssthresh = cubic_get_ssthresh(ccv, mss);
CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * mss);
CCV(ccv, snd_cwnd) = max(ssthresh, mss);
/*
* The congestion flag will recalculate K at the
* beginning of the congestion avoidance stage.
*/
cubic_data->flags |= CUBICFLAG_CONG_EVENT;
cubic_data->t_epoch = ticks;
cubic_data->K = cubic_k(cubic_data->W_max / mss);
CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
ENTER_CONGRECOVERY(CCV(ccv, t_flags));
}
break;
case CC_RTO:
/* RFC8312 Section 4.7 */
/* RFC9438 Section 4.8: Timeout */
if (CCV(ccv, t_rxtshift) == 1) {
/*
* Remember the state only for the first RTO event. This
@ -475,12 +483,16 @@ cubic_cong_signal(struct cc_var *ccv, ccsignal_t type)
(((uint64_t)min(CCV(ccv, snd_wnd), pipe) *
CUBIC_BETA) >> CUBIC_SHIFT) / mss) * mss;
}
cubic_data->flags |= CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT;
/*
* The RTO flag will recalculate K at the
* beginning of the congestion avoidance stage.
*/
cubic_data->flags |= CUBICFLAG_RTO_EVENT;
CCV(ccv, snd_cwnd) = mss;
break;
case CC_RTO_ERR:
cubic_data->flags &= ~(CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT);
cubic_data->flags &= ~CUBICFLAG_RTO_EVENT;
cubic_data->K = cubic_data->undo_K;
cubic_data->W_max = cubic_data->undo_W_max;
cubic_data->cwnd_epoch = cubic_data->undo_cwnd_epoch;
@ -503,7 +515,7 @@ cubic_conn_init(struct cc_var *ccv)
* this here bad things happen when entries from the TCP hostcache
* get used.
*/
cubic_data->W_max = CCV(ccv, snd_cwnd);
cubic_data->W_max = UINT_MAX;
}
static int
@ -603,44 +615,36 @@ cubic_record_rtt(struct cc_var *ccv)
}
/*
* Update the ssthresh in the event of congestion.
* Return the new value for ssthresh in the event of congestion.
*/
static void
cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg)
static uint32_t
cubic_get_ssthresh(struct cc_var *ccv, uint32_t maxseg)
{
/*
* Reads snd_cwnd, applies the fast-convergence scaling to it, stores the
* result as the new W_max (after saving the previous W_max in undo_W_max),
* and then derives ssthresh from either the NewReno multiplicative
* decrease or a beta-scaled value.
* W_max and undo_W_max are written on every call, so despite the
* "get" in the name this is not a side-effect-free getter.
*/
struct cubic *cubic_data;
uint32_t ssthresh;
uint32_t cwnd;
uint32_t cwnd, pipe;
cubic_data = ccv->cc_data;
cwnd = CCV(ccv, snd_cwnd);
/* Fast convergence heuristic. */
/*
* RFC9438 Section 4.7: Fast convergence
* When cwnd is below the previous W_max, scale cwnd by CUBIC_FC_FACTOR
* (the product is widened to 64 bits and shifted down by CUBIC_SHIFT)
* before it is recorded as the new W_max below.
*/
if (cwnd < cubic_data->W_max) {
cwnd = ((uint64_t)cwnd * CUBIC_FC_FACTOR) >> CUBIC_SHIFT;
}
/* Keep the previous W_max in undo_W_max before overwriting it. */
cubic_data->undo_W_max = cubic_data->W_max;
cubic_data->W_max = cwnd;
if (cubic_data->flags & CUBICFLAG_IN_TF) {
/* If in the TCP friendly region, follow what newreno does */
ssthresh = newreno_cc_cwnd_on_multiplicative_decrease(ccv, maxseg);
/* If in the TCP friendly region, follow what newreno does. */
return (newreno_cc_cwnd_on_multiplicative_decrease(ccv, maxseg));
} else if ((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) {
/*
* On the first congestion event, set ssthresh to cwnd * 0.5
* and reduce W_max to cwnd * beta. This aligns the cubic
* concave region appropriately.
*/
ssthresh = cwnd >> 1;
cubic_data->W_max = ((uint64_t)cwnd * CUBIC_BETA) >> CUBIC_SHIFT;
} else {
/*
* On subsequent congestion events, set ssthresh to cwnd * beta.
* RFC9438 Section 4.6: Multiplicative Decrease
* Outside the TCP friendly region, set ssthresh to the size of
* inflight_size * beta.
*/
ssthresh = ((uint64_t)cwnd * CUBIC_BETA) >> CUBIC_SHIFT;
pipe = tcp_compute_pipe(ccv->tp);
/*
* NOTE(review): pipe is uint32_t and, unlike the other
* CUBIC_BETA / CUBIC_FC_FACTOR products in this function, this
* multiplication has no (uint64_t) cast. Unless CUBIC_BETA is
* itself a 64-bit constant, the product is evaluated in 32 bits
* and can wrap for large pipe values -- confirm and consider
* widening before the shift.
* The value returned here is not clamped; callers are expected
* to apply their own lower bound.
*/
return ((pipe * CUBIC_BETA) >> CUBIC_SHIFT);
}
CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * maxseg);
}
static void

View file

@ -88,14 +88,23 @@
/* Kernel only bits */
#ifdef _KERNEL
struct cubic {
/* CUBIC K in fixed point form with CUBIC_SHIFT worth of precision. */
/*
* CUBIC K in fixed point form with CUBIC_SHIFT worth of precision.
* Also means the time period in seconds it takes to increase the
* congestion window size at the beginning of the current congestion
* avoidance stage to W_max.
*/
int64_t K;
/* Sum of RTT samples across an epoch in usecs. */
int64_t sum_rtt_usecs;
/* Size of cwnd just before cwnd was reduced in the last congestion event */
uint64_t W_max;
/* The cwnd at the beginning of the current congestion avoidance stage */
uint64_t cwnd_epoch;
/* Size of cwnd (in bytes) just before cwnd was reduced in the last congestion event. */
uint32_t W_max;
/* An estimate (in bytes) for the congestion window in the Reno-friendly region */
uint32_t W_est;
/* An estimate (in bytes) for the congestion window in the CUBIC region */
uint32_t W_cubic;
/* The cwnd (in bytes) at the beginning of the current congestion avoidance stage. */
uint32_t cwnd_epoch;
/* various flags */
uint32_t flags;
/* Minimum observed rtt in usecs. */
@ -110,8 +119,8 @@ struct cubic {
int undo_t_epoch;
/* Few variables to restore the state after RTO_ERR */
int64_t undo_K;
uint64_t undo_W_max;
uint64_t undo_cwnd_epoch;
uint32_t undo_W_max;
uint32_t undo_cwnd_epoch;
uint32_t css_baseline_minrtt;
uint32_t css_current_round_minrtt;
uint32_t css_lastround_minrtt;
@ -130,60 +139,103 @@ struct cubic {
extern int hz;
/*
* Implementation based on the formulae found in the CUBIC Internet Draft
* "draft-ietf-tcpm-cubic-04".
* Implementation based on the formulas in RFC9438.
*
*/
static __inline float
theoretical_cubic_k(double wmax_pkts)
/*
* Returns K, the time period in seconds it takes to increase the congestion
* window size at the beginning of the current congestion avoidance stage to
* W_max.
* The value returned is K multiplied by 2^CUBIC_SHIFT (see the trailing
* pow(2, CUBIC_SHIFT) factor), so callers must divide by 2^CUBIC_SHIFT to
* obtain seconds. Both arguments are in segments. Returns 0 when
* wmax_segs <= cwnd_epoch_segs.
*/
static inline float
theoretical_cubic_k(uint32_t wmax_segs, uint32_t cwnd_epoch_segs)
{
double C;
/* The constant C of the cubic function. */
C = 0.4;
/*
* Guard: the subtraction below is done on unsigned operands, so it must
* only run when wmax_segs is strictly larger than cwnd_epoch_segs.
*/
if (wmax_segs <= cwnd_epoch_segs)
return 0.0;
return (pow((wmax_pkts * 0.3) / C, (1.0 / 3.0)) * pow(2, CUBIC_SHIFT));
/*
* Figure 2: K = ((W_max - cwnd_epoch) / C)^(1/3)
* The cube root is then scaled by 2^CUBIC_SHIFT.
*/
return (pow((wmax_segs - cwnd_epoch_segs) / C, (1.0 / 3.0)) * pow(2, CUBIC_SHIFT));
}
static __inline unsigned long
theoretical_cubic_cwnd(int ticks_since_epoch, unsigned long wmax, uint32_t smss)
/*
* Returns the congestion window in segments at time t in seconds based on the
* cubic increase function, where t is the elapsed time in seconds from the
* beginning of the current congestion avoidance stage, as described in RFC9438
* Section 4.2.
* ticks_elapsed is given in ticks and converted to seconds with hz;
* wmax_segs and cwnd_epoch_segs are in segments. The floating-point result
* is converted to unsigned long on return.
*/
static inline unsigned long
theoretical_cubic_cwnd(int ticks_elapsed, uint32_t wmax_segs, uint32_t cwnd_epoch_segs)
{
double C, wmax_pkts;
double C, t;
float K;
/* The constant C of the cubic function. */
C = 0.4;
wmax_pkts = wmax / (double)smss;
/* Elapsed time in seconds. */
t = ticks_elapsed / (double)hz;
/* K as returned by theoretical_cubic_k(), i.e. scaled by 2^CUBIC_SHIFT. */
K = theoretical_cubic_k(wmax_segs, cwnd_epoch_segs);
return (smss * (wmax_pkts +
(C * pow(ticks_since_epoch / (double)hz -
theoretical_cubic_k(wmax_pkts) / pow(2, CUBIC_SHIFT), 3.0))));
/*
* Figure 1: W_cubic(t) = C * (t - K)^3 + W_max
* K is divided by 2^CUBIC_SHIFT to undo the scaling applied by
* theoretical_cubic_k() before it is subtracted from t.
*/
return (C * pow(t - K / pow(2, CUBIC_SHIFT), 3.0) + wmax_segs);
}
static __inline unsigned long
theoretical_reno_cwnd(int ticks_since_epoch, int rtt_ticks, unsigned long wmax,
uint32_t smss)
/*
* Returns estimated Reno congestion window in segments.
* The estimate is half of wmax_segs plus the number of round trips elapsed
* (ticks_elapsed / rtt_ticks). The floating-point sum is converted to
* unsigned long on return.
*/
static inline unsigned long
theoretical_reno_cwnd(int ticks_elapsed, int rtt_ticks, uint32_t wmax_segs)
{
return ((wmax * 0.5) + ((ticks_since_epoch / (float)rtt_ticks) * smss));
/* 0.5 * wmax_segs, plus one segment per elapsed RTT. */
return (wmax_segs * 0.5 + ticks_elapsed / (float)rtt_ticks);
}
static __inline unsigned long
theoretical_tf_cwnd(int ticks_since_epoch, int rtt_ticks, unsigned long wmax,
uint32_t smss)
/*
* Returns an estimate for the congestion window in segments in the
* Reno-friendly region -- that is, an estimate for the congestion window of
* Reno, as described in RFC9438 Section 4.3, where:
* cwnd: Current congestion window in segments.
* cwnd_prior: Size of cwnd in segments at the time of setting ssthresh most
* recently, either upon exiting the first slow start or just before
* cwnd was reduced in the last congestion event.
* W_est: An estimate for the congestion window in segments in the Reno-friendly
* region -- that is, an estimate for the congestion window of Reno.
* segs_acked: The segments_acked term of the Figure 4 formula below.
*/
static inline unsigned long
theoretical_tf_cwnd(unsigned long W_est, unsigned long segs_acked, unsigned long cwnd,
unsigned long cwnd_prior)
{
float cubic_alpha, cubic_beta;
return ((wmax * 0.7) + ((3 * 0.3) / (2 - 0.3) *
(ticks_since_epoch / (float)rtt_ticks) * smss));
/* RFC9438 Section 4.6: The parameter β_cubic SHOULD be set to 0.7. */
cubic_beta = 0.7;
/*
* α_cubic is 1 once W_est has reached cwnd_prior; below that it is
* 3 * (1 - β_cubic) / (1 + β_cubic).
*/
if (W_est >= cwnd_prior)
cubic_alpha = 1.0;
else
cubic_alpha = (3.0 * (1.0 - cubic_beta)) / (1.0 + cubic_beta);
/*
* Figure 4: W_est = W_est + α_cubic * segments_acked / cwnd
* NOTE(review): the sum is computed in floating point but returned as
* unsigned long, so the fractional part of the increment is discarded.
* If the result is fed back in as W_est, increments smaller than one
* segment are lost -- confirm this is acceptable for the intended use.
*/
return (W_est + cubic_alpha * segs_acked / cwnd);
}
#endif /* !_KERNEL */
/*
* Compute the CUBIC K value used in the cwnd calculation, using an
* implementation of eqn 2 in the I-D. The method used
* here is adapted from Apple Computer Technical Report #KT-32.
* implementation of the formula in Figure 2 of RFC9438.
* The method used here is adapted from Apple Computer Technical Report #KT-32.
*/
static __inline int64_t
cubic_k(unsigned long wmax_pkts)
static inline int64_t
cubic_k(uint32_t wmax_segs, uint32_t cwnd_epoch_segs)
{
int64_t s, K;
uint16_t p;
@ -191,8 +243,13 @@ cubic_k(unsigned long wmax_pkts)
K = s = 0;
p = 0;
/* (wmax * beta)/C with CUBIC_SHIFT worth of precision. */
s = ((wmax_pkts * ONE_SUB_CUBIC_BETA) << CUBIC_SHIFT) / CUBIC_C_FACTOR;
/* Handle the corner case where W_max <= cwnd_epoch */
if (wmax_segs <= cwnd_epoch_segs) {
return 0;
}
/* (wmax - cwnd_epoch) / C with CUBIC_SHIFT worth of precision. */
s = ((wmax_segs - cwnd_epoch_segs) << (2 * CUBIC_SHIFT)) / CUBIC_C_FACTOR;
/* Rebase s to be between 1 and 1/8 with a shift of CUBIC_SHIFT. */
while (s >= 256) {
@ -213,13 +270,14 @@ cubic_k(unsigned long wmax_pkts)
}
/*
* Compute the new cwnd value using an implementation of eqn 1 from the I-D.
* Compute and return the new cwnd value in bytes using an implementation
* of the formula in Figure 1 of RFC9438.
* Thanks to Kip Macy for help debugging this function.
*
* XXXLAS: Characterise bounds for overflow.
*/
static __inline unsigned long
cubic_cwnd(int usecs_since_epoch, unsigned long wmax, uint32_t smss, int64_t K)
static inline uint32_t
cubic_cwnd(int usecs_since_epoch, uint32_t wmax, uint32_t smss, int64_t K)
{
int64_t cwnd;
@ -238,7 +296,7 @@ cubic_cwnd(int usecs_since_epoch, unsigned long wmax, uint32_t smss, int64_t K)
cwnd *= (cwnd * cwnd);
/*
* C(t - K)^3 + wmax
* Figure 1: C * (t - K)^3 + wmax
* The down shift by CUBIC_SHIFT_4 is because cwnd has 4 lots of
* CUBIC_SHIFT included in the value. 3 from the cubing of cwnd above,
* and an extra from multiplying through by CUBIC_C_FACTOR.
@ -252,34 +310,10 @@ cubic_cwnd(int usecs_since_epoch, unsigned long wmax, uint32_t smss, int64_t K)
return (lmax(0,cwnd));
}
/*
* Compute an approximation of the NewReno cwnd some number of usecs after a
* congestion event. RTT should be the average RTT estimate for the path
* measured over the previous congestion epoch and wmax is the value of cwnd at
* the last congestion event. The "TCP friendly" concept in the CUBIC I-D is
* rather tricky to understand and it turns out this function is not required.
* It is left here for reference.
*
* XXX: Not used
*/
static __inline unsigned long
reno_cwnd(int usecs_since_epoch, int rtt_usecs, unsigned long wmax,
uint32_t smss)
{
/*
* For NewReno, beta = 0.5, therefore: W_tcp(t) = wmax*0.5 + t/RTT
* W_tcp(t) deals with cwnd/wmax in pkts, so because our cwnd is in
* bytes, we have to multiply by smss.
*
* The second term is shifted up by CUBIC_SHIFT before the division and
* the whole sum is shifted back down at the end; this presumes RENO_BETA
* is a fixed-point value with CUBIC_SHIFT fractional bits -- confirm
* against its definition.
*/
return (((wmax * RENO_BETA) + (((usecs_since_epoch * smss)
<< CUBIC_SHIFT) / rtt_usecs)) >> CUBIC_SHIFT);
}
/*
* Compute the "TCP friendly" cwnd by newreno in congestion avoidance state.
*/
static __inline unsigned long
static inline uint32_t
tf_cwnd(struct cc_var *ccv)
{
/* newreno is "TCP friendly" */