From 963fb2ad94a5fa2a4156b738b6ecc6113d366b14 Mon Sep 17 00:00:00 2001 From: Randall Stewart Date: Mon, 4 May 2020 20:28:53 +0000 Subject: [PATCH] This commit brings things into sync with the advancements that have been made in rack and adds a few fixes in BBR. This also removes any possibility of incorrectly doing OOB data the stacks do not support it. Should fix the skyzaller crashes seen in the past. Still to fix is the BBR issue just reported this weekend with the SYN and on sending a RST. Note that this version of rack can now do pacing as well. Sponsored by:Netflix Inc Differential Revision:https://reviews.freebsd.org/D24576 --- sys/netinet/tcp_stacks/bbr.c | 269 +- sys/netinet/tcp_stacks/rack.c | 7113 ++++++++++++++++++---- sys/netinet/tcp_stacks/rack_bbr_common.c | 27 +- sys/netinet/tcp_stacks/rack_bbr_common.h | 7 +- sys/netinet/tcp_stacks/tcp_bbr.h | 6 +- sys/netinet/tcp_stacks/tcp_rack.h | 197 +- 6 files changed, 6081 insertions(+), 1538 deletions(-) diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index 710b707443d..2767a52a692 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -1,7 +1,5 @@ /*- - * Copyright (c) 2016-9 - * Netflix Inc. - * All rights reserved. + * Copyright (c) 2016-2020 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -72,6 +70,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -1853,28 +1852,6 @@ bbr_init_sysctls(void) &bbr_clear_lost, 0, sysctl_bbr_clear_lost, "IU", "Clear lost counters"); } -static inline int32_t -bbr_progress_timeout_check(struct tcp_bbr *bbr) -{ - if (bbr->rc_tp->t_maxunacktime && bbr->rc_tp->t_acktime && - TSTMP_GT(ticks, bbr->rc_tp->t_acktime)) { - if ((((uint32_t)ticks - bbr->rc_tp->t_acktime)) >= bbr->rc_tp->t_maxunacktime) { - /* - * There is an assumption here that the caller will - * drop the connection, so we increment the - * statistics. - */ - bbr_log_progress_event(bbr, bbr->rc_tp, ticks, PROGRESS_DROP, __LINE__); - BBR_STAT_INC(bbr_progress_drops); -#ifdef NETFLIX_STATS - KMOD_TCPSTAT_INC(tcps_progdrops); -#endif - return (1); - } - } - return (0); -} - static void bbr_counter_destroy(void) { @@ -1884,6 +1861,8 @@ bbr_counter_destroy(void) COUNTER_ARRAY_FREE(bbr_state_lost, BBR_MAX_STAT); COUNTER_ARRAY_FREE(bbr_state_time, BBR_MAX_STAT); COUNTER_ARRAY_FREE(bbr_state_resend, BBR_MAX_STAT); + counter_u64_free(bbr_nohdwr_pacing_enobuf); + counter_u64_free(bbr_hdwr_pacing_enobuf); counter_u64_free(bbr_flows_whdwr_pacing); counter_u64_free(bbr_flows_nohdwr_pacing); @@ -4643,7 +4622,8 @@ bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) /* Its not time yet */ return (0); } - if (bbr_progress_timeout_check(bbr)) { + if (ctf_progress_timeout_check(tp, true)) { + bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); return (1); } @@ -4815,9 +4795,8 @@ bbr_timeout_delack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) } /* - * Persists timer, here we simply need to setup the - * FORCE-DATA flag the output routine will send - * the one byte send. + * Here we send a KEEP-ALIVE like probe to the + * peer, we do not send data. * * We only return 1, saying don't proceed, if all timers * are stopped (destroyed PCB?). @@ -4845,7 +4824,8 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) /* * Have we exceeded the user specified progress time? */ - if (bbr_progress_timeout_check(bbr)) { + if (ctf_progress_timeout_check(tp, true)) { + bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); goto out; } @@ -4859,6 +4839,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { KMOD_TCPSTAT_INC(tcps_persistdrop); + tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); goto out; } @@ -4875,6 +4856,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { KMOD_TCPSTAT_INC(tcps_persistdrop); + tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); goto out; } @@ -4947,6 +4929,7 @@ bbr_timeout_keepalive(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) return (1); dropit: KMOD_TCPSTAT_INC(tcps_keepdrops); + tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); return (1); } @@ -5058,8 +5041,9 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) * retransmit interval. Back off to a longer retransmit interval * and retransmit one segment. */ - if (bbr_progress_timeout_check(bbr)) { + if (ctf_progress_timeout_check(tp, true)) { retval = 1; + bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); goto out; } @@ -5078,6 +5062,7 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) tp->t_rxtshift = TCP_MAXRXTSHIFT; KMOD_TCPSTAT_INC(tcps_timeoutdrop); retval = 1; + tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); tcp_set_inp_to_drop(bbr->rc_inp, (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); goto out; @@ -8050,6 +8035,9 @@ nothing_left: * to reset him. */ *ret_val = 1; + tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); + /* tcp_close will kill the inp pre-log the Reset */ + tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); BBR_STAT_INC(bbr_dropped_af_data); @@ -8132,7 +8120,6 @@ bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t li idle_time = bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time); bbr->rc_in_persist = 0; bbr->rc_hit_state_1 = 0; - tp->t_flags &= ~TF_FORCEDATA; bbr->r_ctl.rc_del_time = cts; /* * We invalidate the last ack here since we @@ -8390,66 +8377,12 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, return (0); } /* - * Process segments with URG. + * We don't support urgent data but + * drag along the up just to make sure + * if there is a stack switch no one + * is surprised. */ - if ((thflags & TH_URG) && th->th_urp && - TCPS_HAVERCVDFIN(tp->t_state) == 0) { - /* - * This is a kludge, but if we receive and accept random - * urgent pointers, we'll crash in soreceive. It's hard to - * imagine someone actually wanting to send this much urgent - * data. - */ - SOCKBUF_LOCK(&so->so_rcv); - if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { - th->th_urp = 0; /* XXX */ - thflags &= ~TH_URG; /* XXX */ - SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ - goto dodata; /* XXX */ - } - /* - * If this segment advances the known urgent pointer, then - * mark the data stream. This should not happen in - * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a - * FIN has been received from the remote side. In these - * states we ignore the URG. - * - * According to RFC961 (Assigned Protocols), the urgent - * pointer points to the last octet of urgent data. We - * continue, however, to consider it to indicate the first - * octet of data past the urgent section as the original - * spec states (in one of two places). - */ - if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { - tp->rcv_up = th->th_seq + th->th_urp; - so->so_oobmark = sbavail(&so->so_rcv) + - (tp->rcv_up - tp->rcv_nxt) - 1; - if (so->so_oobmark == 0) - so->so_rcv.sb_state |= SBS_RCVATMARK; - sohasoutofband(so); - tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); - } - SOCKBUF_UNLOCK(&so->so_rcv); - /* - * Remove out of band data so doesn't get presented to user. - * This can happen independent of advancing the URG pointer, - * but if two URG's are pending at once, some out-of-band - * data may creep in... ick. - */ - if (th->th_urp <= (uint32_t)tlen && - !(so->so_options & SO_OOBINLINE)) { - /* hdr drop is delayed */ - tcp_pulloutofband(so, th, m, drop_hdrlen); - } - } else { - /* - * If no out of band data is expected, pull receive urgent - * pointer along with the receive window. - */ - if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) - tp->rcv_up = tp->rcv_nxt; - } -dodata: /* XXX */ + tp->rcv_up = tp->rcv_nxt; INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -8792,7 +8725,7 @@ bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, static int bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - uint32_t tiwin, int32_t nxt_pkt) + uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) { int32_t acked; uint16_t nsegs; @@ -8987,7 +8920,7 @@ bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, static int bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t todrop; int32_t ourfinisacked = 0; @@ -9010,6 +8943,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { + tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } @@ -9196,7 +9130,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, static int bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ourfinisacked = 0; int32_t ret_val; @@ -9207,6 +9141,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { + tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } @@ -9218,6 +9153,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, * data), a valid ACK, a FIN, or a RST. */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { + tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } else if (thflags & TH_SYN) { @@ -9253,6 +9189,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, * "LAND" DoS attack. */ if (SEQ_LT(th->th_seq, tp->irs)) { + tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } @@ -9405,7 +9342,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, static int bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { struct tcp_bbr *bbr; int32_t ret_val; @@ -9439,7 +9376,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, __predict_true(th->th_seq == tp->rcv_nxt)) { if (tlen == 0) { if (bbr_fastack(m, th, so, tp, to, drop_hdrlen, tlen, - tiwin, nxt_pkt)) { + tiwin, nxt_pkt, iptos)) { return (0); } } else { @@ -9521,7 +9458,8 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, return (ret_val); } if (sbavail(&so->so_snd)) { - if (bbr_progress_timeout_check(bbr)) { + if (ctf_progress_timeout_check(tp, true)) { + bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } @@ -9539,7 +9477,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, static int bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { struct tcp_bbr *bbr; int32_t ret_val; @@ -9616,7 +9554,8 @@ bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, return (ret_val); } if (sbavail(&so->so_snd)) { - if (bbr_progress_timeout_check(bbr)) { + if (ctf_progress_timeout_check(tp, true)) { + bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } @@ -9632,6 +9571,9 @@ bbr_check_data_after_close(struct mbuf *m, struct tcp_bbr *bbr, if (bbr->rc_allow_data_af_clo == 0) { close_now: + tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); + /* tcp_close will kill the inp pre-log the Reset */ + tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); KMOD_TCPSTAT_INC(tcps_rcvafterclose); ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); @@ -9655,7 +9597,7 @@ close_now: static int bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ourfinisacked = 0; int32_t ret_val; @@ -9764,7 +9706,8 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_state_change(tp, TCPS_FIN_WAIT_2); } if (sbavail(&so->so_snd)) { - if (bbr_progress_timeout_check(bbr)) { + if (ctf_progress_timeout_check(tp, true)) { + bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } @@ -9781,7 +9724,7 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, static int bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ourfinisacked = 0; int32_t ret_val; @@ -9876,7 +9819,8 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, return (1); } if (sbavail(&so->so_snd)) { - if (bbr_progress_timeout_check(bbr)) { + if (ctf_progress_timeout_check(tp, true)) { + bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } @@ -9893,7 +9837,7 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, static int bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ourfinisacked = 0; int32_t ret_val; @@ -9988,7 +9932,8 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, return (1); } if (sbavail(&so->so_snd)) { - if (bbr_progress_timeout_check(bbr)) { + if (ctf_progress_timeout_check(tp, true)) { + bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } @@ -10006,7 +9951,7 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, static int bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ourfinisacked = 0; int32_t ret_val; @@ -10104,7 +10049,8 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, return (ret_val); } if (sbavail(&so->so_snd)) { - if (bbr_progress_timeout_check(bbr)) { + if (ctf_progress_timeout_check(tp, true)) { + bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } @@ -11702,6 +11648,8 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, * always. All other times (timers etc) we must have a rack-state * set (so we assure we have done the checks above for SACK). */ + if (thflags & TH_FIN) + tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); if (bbr->r_state != tp->t_state) bbr_set_state(tp, bbr, tiwin); @@ -11740,6 +11688,7 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { + tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } @@ -11765,7 +11714,7 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, } retval = (*bbr->r_substate) (m, th, so, tp, &to, drop_hdrlen, - tlen, tiwin, thflags, nxt_pkt); + tlen, tiwin, thflags, nxt_pkt, iptos); #ifdef BBR_INVARIANTS if ((retval == 0) && (tp->t_inpcb == NULL)) { @@ -11969,14 +11918,7 @@ bbr_do_send_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap bbr_do_error_accounting(tp, bbr, rsm, len, error); return; } - if ((tp->t_flags & TF_FORCEDATA) && len == 1) { - /* Window probe */ - KMOD_TCPSTAT_INC(tcps_sndprobe); -#ifdef STATS - stats_voi_update_abs_u32(tp->t_stats, - VOI_TCP_RETXPB, len); -#endif - } else if (rsm) { + if (rsm) { if (rsm->r_flags & BBR_TLP) { /* * TLP should not count in retran count, but in its @@ -12241,7 +12183,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) } /* Mark that we have called bbr_output(). */ if ((bbr->r_timer_override) || - (tp->t_flags & TF_FORCEDATA) || (tp->t_state < TCPS_ESTABLISHED)) { /* Timeouts or early states are exempt */ if (inp->inp_in_hpts) @@ -12577,47 +12518,6 @@ recheck_resend: prefetch_rsm = 1; } SOCKBUF_LOCK(sb); - /* - * If in persist timeout with window of 0, send 1 byte. Otherwise, - * if window is small but nonzero and time TF_SENTFIN expired, we - * will send what we can and go to transmit state. - */ - if (tp->t_flags & TF_FORCEDATA) { - if ((sendwin == 0) || (sendwin <= (tp->snd_max - tp->snd_una))) { - /* - * If we still have some data to send, then clear - * the FIN bit. Usually this would happen below - * when it realizes that we aren't sending all the - * data. However, if we have exactly 1 byte of - * unsent data, then it won't clear the FIN bit - * below, and if we are in persist state, we wind up - * sending the packet without recording that we sent - * the FIN bit. - * - * We can't just blindly clear the FIN bit, because - * if we don't have any more data to send then the - * probe will be the FIN itself. - */ - if (sb_offset < sbused(sb)) - flags &= ~TH_FIN; - sendwin = 1; - } else { - if ((bbr->rc_in_persist != 0) && - (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2), - bbr_minseg(bbr)))) { - /* Exit persists if there is space */ - bbr_exit_persist(tp, bbr, cts, __LINE__); - } - if (rsm == NULL) { - /* - * If we are dropping persist mode then we - * need to correct sb_offset if not a - * retransmit. - */ - sb_offset = tp->snd_max - tp->snd_una; - } - } - } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a @@ -12674,7 +12574,7 @@ recheck_resend: */ len = 0; } - if ((tp->t_flags & TF_FORCEDATA) && (bbr->rc_in_persist)) { + if (bbr->rc_in_persist) { /* * We are in persists, figure out if * a retransmit is available (maybe the previous @@ -12970,9 +12870,6 @@ recheck_resend: if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ goto send; } - if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */ - goto send; - } if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { goto send; } @@ -13013,7 +12910,7 @@ recheck_resend: goto send; } /* - * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW + * Send if we owe the peer an ACK, RST, SYN. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) { @@ -13022,9 +12919,6 @@ recheck_resend: if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { goto send; } - if (SEQ_GT(tp->snd_up, tp->snd_una)) { - goto send; - } /* * If our state indicates that FIN should be sent and we have not * yet done so, then we need to send. @@ -13089,7 +12983,6 @@ just_return_nolock: } if (tot_len == 0) counter_u64_add(bbr_out_size[TCP_MSS_ACCT_JUSTRET], 1); - tp->t_flags &= ~TF_FORCEDATA; /* Dont update the time if we did not send */ bbr->r_ctl.rc_last_delay_val = 0; bbr->rc_output_starts_timer = 1; @@ -13586,8 +13479,6 @@ send: KMOD_TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN | TH_FIN | TH_RST)) KMOD_TCPSTAT_INC(tcps_sndctrl); - else if (SEQ_GT(tp->snd_up, tp->snd_una)) - KMOD_TCPSTAT_INC(tcps_sndurg); else KMOD_TCPSTAT_INC(tcps_sndwinup); @@ -13774,17 +13665,11 @@ send: tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; - if (SEQ_GT(tp->snd_up, tp->snd_max)) { - th->th_urp = htons((u_short)(tp->snd_up - tp->snd_max)); - th->th_flags |= TH_URG; - } else - /* - * If no urgent pointer to send, then we pull the urgent - * pointer to the left edge of the send window so that it - * doesn't drift into the send window on sequence number - * wraparound. - */ - tp->snd_up = tp->snd_una; /* drag it along */ + /* + * We don't support urgent data, but drag along + * the pointer in case of a stack switch. + */ + tp->snd_up = tp->snd_una; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { @@ -14125,8 +14010,7 @@ out: */ return (0); } - if (((tp->t_flags & TF_FORCEDATA) == 0) || - (bbr->rc_in_persist == 0)) { + if (bbr->rc_in_persist == 0) { /* * Advance snd_nxt over sequence space of this segment. */ @@ -14254,7 +14138,6 @@ nomore: tp->t_maxseg = old_maxseg - 40; bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, 0, tso, cts); } - tp->t_flags &= ~TF_FORCEDATA; /* * Nuke all other things that can interfere * with slot @@ -14284,7 +14167,6 @@ nomore: } /* FALLTHROUGH */ default: - tp->t_flags &= ~TF_FORCEDATA; slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt; bbr->rc_output_starts_timer = 1; bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0); @@ -14399,7 +14281,6 @@ nomore: ((flags & TH_RST) == 0) && (IN_RECOVERY(tp->t_flags) == 0) && (bbr->rc_in_persist == 0) && - ((tp->t_flags & TF_FORCEDATA) == 0) && (tot_len < bbr->r_ctl.rc_pace_max_segs)) { /* * For non-tso we need to goto again until we have sent out @@ -14416,10 +14297,14 @@ nomore: } rsm = NULL; sack_rxmit = 0; - tp->t_flags &= ~(TF_ACKNOW | TF_DELACK | TF_FORCEDATA); + tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); goto again; } skip_again: + if ((error == 0) && (flags & TH_FIN)) + tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); + if ((error == 0) && (flags & TH_RST)) + tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) { /* * Calculate/Re-Calculate the hptsi slot in usecs based on @@ -14429,7 +14314,7 @@ skip_again: if (bbr->rc_no_pacing) slot = 0; } - tp->t_flags &= ~(TF_ACKNOW | TF_DELACK | TF_FORCEDATA); + tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); enobufs: if (bbr->rc_use_google == 0) bbr_check_bbr_for_state(bbr, cts, __LINE__, 0); @@ -15095,6 +14980,13 @@ out: return (error); } +static int +bbr_pru_options(struct tcpcb *tp, int flags) +{ + if (flags & PRUS_OOB) + return (EOPNOTSUPP); + return (0); +} struct tcp_function_block __tcp_bbr = { .tfb_tcp_block_name = __XSTRING(STACKNAME), @@ -15111,7 +15003,8 @@ struct tcp_function_block __tcp_bbr = { .tfb_tcp_timer_stop = bbr_timer_stop, .tfb_tcp_rexmit_tmr = bbr_remxt_tmr, .tfb_tcp_handoff_ok = bbr_handoff_ok, - .tfb_tcp_mtu_chg = bbr_mtu_chg + .tfb_tcp_mtu_chg = bbr_mtu_chg, + .tfb_pru_options = bbr_pru_options, }; static const char *bbr_stack_names[] = { diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 1e774054fa3..d23fb41baab 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2016-9 Netflix, Inc. + * Copyright (c) 2016-2020 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -57,13 +57,16 @@ __FBSDID("$FreeBSD$"); #include #include #include /* Must come after qmath.h and tree.h */ +#else +#include #endif #include -#include #include +#include #include #include #include +#include #include @@ -91,10 +94,14 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include +#ifdef NETFLIX_SHARED_CWND +#include +#endif #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ @@ -169,6 +176,8 @@ struct sysctl_oid *rack_sysctl_root; * */ static int32_t rack_tlp_thresh = 1; +static int32_t rack_tlp_limit = 2; /* No more than 2 TLPs w-out new data */ +static int32_t rack_tlp_use_greater = 1; static int32_t rack_reorder_thresh = 2; static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000 * - 60 seconds */ @@ -177,24 +186,34 @@ static uint32_t rack_highest_sack_thresh_seen = 0; static uint32_t rack_highest_move_thresh_seen = 0; static int32_t rack_pkt_delay = 1; -static int32_t rack_min_pace_time = 0; static int32_t rack_early_recovery = 1; static int32_t rack_send_a_lot_in_prr = 1; static int32_t rack_min_to = 1; /* Number of ms minimum timeout */ static int32_t rack_verbose_logging = 0; static int32_t rack_ignore_data_after_close = 1; -static int32_t use_rack_cheat = 1; +static int32_t rack_enable_shared_cwnd = 0; +static int32_t rack_limits_scwnd = 1; +static int32_t rack_enable_mqueue_for_nonpaced = 0; +static int32_t rack_disable_prr = 0; +static int32_t use_rack_rr = 1; +static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */ static int32_t rack_persist_min = 250; /* 250ms */ -static int32_t rack_persist_max = 1000; /* 1 Second */ +static int32_t rack_persist_max = 2000; /* 2 Second */ static int32_t rack_sack_not_required = 0; /* set to one to allow non-sack to use rack */ -static int32_t rack_hw_tls_max_seg = 0; /* 0 means use hw-tls single segment */ - +static int32_t rack_hw_tls_max_seg = 3; /* 3 means use hw-tls single segment */ +static int32_t rack_default_init_window = 0; /* Use system default */ +static int32_t rack_limit_time_with_srtt = 0; +static int32_t rack_hw_pace_adjust = 0; /* * Currently regular tcp has a rto_min of 30ms * the backoff goes 12 times so that ends up * being a total of 122.850 seconds before a * connection is killed. */ +static uint32_t rack_def_data_window = 20; +static uint32_t rack_goal_bdp = 2; +static uint32_t rack_min_srtts = 1; +static uint32_t rack_min_measure_usec = 0; static int32_t rack_tlp_min = 10; static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */ static int32_t rack_rto_max = 4000; /* 4 seconds */ @@ -204,16 +223,78 @@ static int32_t rack_rate_sample_method = USE_RTT_LOW; static int32_t rack_pace_every_seg = 0; static int32_t rack_delayed_ack_time = 200; /* 200ms */ static int32_t rack_slot_reduction = 4; +static int32_t rack_wma_divisor = 8; /* For WMA calculation */ +static int32_t rack_cwnd_block_ends_measure = 0; +static int32_t rack_rwnd_block_ends_measure = 0; + static int32_t rack_lower_cwnd_at_tlp = 0; static int32_t rack_use_proportional_reduce = 0; static int32_t rack_proportional_rate = 10; static int32_t rack_tlp_max_resend = 2; static int32_t rack_limited_retran = 0; static int32_t rack_always_send_oldest = 0; -static int32_t rack_use_sack_filter = 1; static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; -static int32_t rack_per_of_gp = 50; +static uint16_t rack_per_of_gp_ss = 250; /* 250 % slow-start */ +static uint16_t rack_per_of_gp_ca = 200; /* 200 % congestion-avoidance */ +static uint16_t rack_per_of_gp_rec = 200; /* 200 % of bw */ + +/* Probertt */ +static uint16_t rack_per_of_gp_probertt = 60; /* 60% of bw */ +static uint16_t rack_per_of_gp_lowthresh = 40; /* 40% is bottom */ +static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */ +static uint16_t rack_atexit_prtt_hbp = 130; /* Clamp to 130% on exit prtt if highly buffered path */ +static uint16_t rack_atexit_prtt = 130; /* Clamp to 100% on exit prtt if non highly buffered path */ + +static uint32_t rack_max_drain_wait = 2; /* How man gp srtt's before we give up draining */ +static uint32_t rack_must_drain = 1; /* How many GP srtt's we *must* wait */ +static uint32_t rack_probertt_use_min_rtt_entry = 1; /* Use the min to calculate the goal else gp_srtt */ +static uint32_t rack_probertt_use_min_rtt_exit = 0; +static uint32_t rack_probe_rtt_sets_cwnd = 0; +static uint32_t rack_probe_rtt_safety_val = 2000000; /* No more than 2 sec in probe-rtt */ +static uint32_t rack_time_between_probertt = 9600000; /* 9.6 sec in us */ +static uint32_t rack_probertt_gpsrtt_cnt_mul = 0; /* How many srtt periods does probe-rtt last top fraction */ +static uint32_t rack_probertt_gpsrtt_cnt_div = 0; /* How many srtt periods does probe-rtt last bottom fraction */ +static uint32_t rack_min_probertt_hold = 200000; /* Equal to delayed ack time */ +static uint32_t rack_probertt_filter_life = 10000000; +static uint32_t rack_probertt_lower_within = 10; +static uint32_t rack_min_rtt_movement = 250; /* Must move at least 250 useconds to count as a lowering */ +static int32_t rack_pace_one_seg = 0; /* Shall we pace for less than 1.4Meg 1MSS at a time */ +static int32_t rack_probertt_clear_is = 1; +static int32_t rack_max_drain_hbp = 1; /* Extra drain times gpsrtt for highly buffered paths */ +static int32_t rack_hbp_thresh = 3; /* what is the divisor max_rtt/min_rtt to decided a hbp */ + + +/* Part of pacing */ +static int32_t rack_max_per_above = 30; /* When we go to increment stop if above 100+this% */ + +/* Timely information */ +/* Combine these two gives the range of 'no change' to bw */ +/* ie the up/down provide the upper and lower bound */ +static int32_t rack_gp_per_bw_mul_up = 2; /* 2% */ +static int32_t rack_gp_per_bw_mul_down = 4; /* 4% */ +static int32_t rack_gp_rtt_maxmul = 3; /* 3 x maxmin */ +static int32_t rack_gp_rtt_minmul = 1; /* minrtt + (minrtt/mindiv) is lower rtt */ +static int32_t rack_gp_rtt_mindiv = 4; /* minrtt + (minrtt * minmul/mindiv) is lower rtt */ +static int32_t rack_gp_decrease_per = 20; /* 20% decrease in multipler */ +static int32_t rack_gp_increase_per = 2; /* 2% increase in multipler */ +static int32_t rack_per_lower_bound = 50; /* Don't allow to drop below this multiplier */ +static int32_t rack_per_upper_bound_ss = 0; /* Don't allow SS to grow above this */ +static int32_t rack_per_upper_bound_ca = 0; /* Don't allow CA to grow above this */ +static int32_t rack_do_dyn_mul = 0; /* Are the rack gp multipliers dynamic */ +static int32_t rack_gp_no_rec_chg = 1; /* Prohibit recovery from reducing it's multiplier */ +static int32_t rack_timely_dec_clear = 6; /* Do we clear decrement count at a value (6)? */ +static int32_t rack_timely_max_push_rise = 3; /* One round of pushing */ +static int32_t rack_timely_max_push_drop = 3; /* Three round of pushing */ +static int32_t rack_timely_min_segs = 4; /* 4 segment minimum */ +static int32_t rack_use_max_for_nobackoff = 0; +static int32_t rack_timely_int_timely_only = 0; /* do interim timely's only use the timely algo (no b/w changes)? */ +static int32_t rack_timely_no_stopping = 0; +static int32_t rack_down_raise_thresh = 100; +static int32_t rack_req_segs = 1; + +/* Weird delayed ack mode */ +static int32_t rack_use_imac_dack = 0; /* Rack specific counters */ counter_u64_t rack_badfr; counter_u64_t rack_badfr_bytes; @@ -266,6 +347,7 @@ counter_u64_t rack_enter_tlp_calc; counter_u64_t rack_input_idle_reduces; counter_u64_t rack_collapsed_win; counter_u64_t rack_tlp_does_nada; +counter_u64_t rack_try_scwnd; /* Counters for HW TLS */ counter_u64_t rack_tls_rwnd; @@ -312,6 +394,8 @@ rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); static void +rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line); +static void rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos); @@ -319,6 +403,15 @@ static void rack_dtor(void *mem, int32_t size, void *arg); static void rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, uint32_t t, uint32_t cts); +static void +rack_log_alt_to_to_cancel(struct tcp_rack *rack, + uint32_t flex1, uint32_t flex2, + uint32_t flex3, uint32_t flex4, + uint32_t flex5, uint32_t flex6, + uint16_t flex7, uint8_t mod); +static void +rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, + uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm); static struct rack_sendmap * rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm); @@ -328,6 +421,11 @@ static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); static int rack_get_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); +static void +rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, + tcp_seq th_ack, int line); +static uint32_t +rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss); static int32_t rack_handoff_ok(struct tcpcb *tp); static int32_t rack_init(struct tcpcb *tp); static void rack_init_sysctls(void); @@ -337,11 +435,11 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, - uint8_t pass, struct rack_sendmap *hintrsm); + uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts); static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm); -static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int num); +static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm); static int32_t rack_output(struct tcpcb *tp); static uint32_t @@ -369,7 +467,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts); static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type); + struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); static int32_t tcp_addrack(module_t mod, int32_t type, void *data); static int rack_do_close_wait(struct mbuf *m, struct tcphdr *th, @@ -410,7 +508,8 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct rack_sendmap * tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused); -static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt); +static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, + uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt); static void tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); @@ -488,6 +587,7 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) counter_u64_zero(rack_enter_tlp_calc); counter_u64_zero(rack_progress_drops); counter_u64_zero(rack_tlp_does_nada); + counter_u64_zero(rack_try_scwnd); counter_u64_zero(rack_collapsed_win); } @@ -502,7 +602,26 @@ rack_init_sysctls(void) { struct sysctl_oid *rack_counters; struct sysctl_oid *rack_attack; + struct sysctl_oid *rack_pacing; + struct sysctl_oid *rack_timely; + struct sysctl_oid *rack_timers; + struct sysctl_oid *rack_tlp; + struct sysctl_oid *rack_misc; + struct sysctl_oid *rack_measure; + struct sysctl_oid *rack_probertt; + rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, + "sack_attack", + CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "Rack Sack Attack Counters and Controls"); + rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, + "stats", + CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "Rack Counters"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rate_sample_method", CTLFLAG_RW, @@ -511,166 +630,586 @@ rack_init_sysctls(void) SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hw_tlsmax", CTLFLAG_RW, - &rack_hw_tls_max_seg , 0, - "Do we have a multplier of TLS records we can send as a max (0=1 TLS record)? "); + &rack_hw_tls_max_seg , 3, + "What is the maximum number of full TLS records that will be sent at once"); + /* Probe rtt related controls */ + rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, + "probertt", + CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "ProbeRTT related Controls"); + SYSCTL_ADD_U16(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "exit_per_hpb", CTLFLAG_RW, + &rack_atexit_prtt_hbp, 130, + "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%"); + SYSCTL_ADD_U16(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW, + &rack_atexit_prtt, 130, + "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%"); + SYSCTL_ADD_U16(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "gp_per_mul", CTLFLAG_RW, + &rack_per_of_gp_probertt, 60, + "What percentage of goodput do we pace at in probertt"); + SYSCTL_ADD_U16(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "gp_per_reduce", CTLFLAG_RW, + &rack_per_of_gp_probertt_reduce, 10, + "What percentage of goodput do we reduce every gp_srtt"); + SYSCTL_ADD_U16(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "gp_per_low", CTLFLAG_RW, + &rack_per_of_gp_lowthresh, 40, + "What percentage of goodput do we allow the multiplier to fall to"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "time_between", CTLFLAG_RW, + & rack_time_between_probertt, 96000000, + "How many useconds between the lowest rtt falling must past before we enter probertt"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "safety", CTLFLAG_RW, + &rack_probe_rtt_safety_val, 2000000, + "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "sets_cwnd", CTLFLAG_RW, + &rack_probe_rtt_sets_cwnd, 0, + "Do we set the cwnd too (if always_lower is on)"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "maxdrainsrtts", CTLFLAG_RW, + &rack_max_drain_wait, 2, + "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "mustdrainsrtts", CTLFLAG_RW, + &rack_must_drain, 1, + "We must drain this many gp_srtt's waiting for flight to reach goal"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "goal_use_min_entry", CTLFLAG_RW, + &rack_probertt_use_min_rtt_entry, 1, + "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "goal_use_min_exit", CTLFLAG_RW, + &rack_probertt_use_min_rtt_exit, 0, + "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "length_div", CTLFLAG_RW, + &rack_probertt_gpsrtt_cnt_div, 0, + "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "length_mul", CTLFLAG_RW, + &rack_probertt_gpsrtt_cnt_mul, 0, + "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "holdtim_at_target", CTLFLAG_RW, + &rack_min_probertt_hold, 200000, + "What is the minimum time we hold probertt at target"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "filter_life", CTLFLAG_RW, + &rack_probertt_filter_life, 10000000, + "What is the time for the filters life in useconds"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "lower_within", CTLFLAG_RW, + &rack_probertt_lower_within, 10, + "If the rtt goes lower within this percentage of the time, go into probe-rtt"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "must_move", CTLFLAG_RW, + &rack_min_rtt_movement, 250, + "How much is the minimum movement in rtt to count as a drop for probertt purposes"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "clear_is_cnts", CTLFLAG_RW, + &rack_probertt_clear_is, 1, + "Do we clear I/S counts on exiting probe-rtt"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "hbp_extra_drain", CTLFLAG_RW, + &rack_max_drain_hbp, 1, + "How many extra drain gpsrtt's do we get in highly buffered paths"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_probertt), + OID_AUTO, "hbp_threshold", CTLFLAG_RW, + &rack_hbp_thresh, 3, + "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold"); + /* Pacing related sysctls */ + rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, + "pacing", + CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "Pacing related Controls"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "max_pace_over", CTLFLAG_RW, + &rack_max_per_above, 30, + "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "pace_to_one", CTLFLAG_RW, + &rack_pace_one_seg, 0, + "Do we allow low b/w pacing of 1MSS instead of two"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "limit_wsrtt", CTLFLAG_RW, + &rack_limit_time_with_srtt, 0, + "Do we limit pacing time based on srtt"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "init_win", CTLFLAG_RW, + &rack_default_init_window, 0, + "Do we have a rack initial window 0 = system default"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "hw_pacing_adjust", CTLFLAG_RW, + &rack_hw_pace_adjust, 0, + "What percentage do we raise the MSS by (11 = 1.1%)"); + SYSCTL_ADD_U16(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "gp_per_ss", CTLFLAG_RW, + &rack_per_of_gp_ss, 250, + "If non zero, what percentage of goodput to pace at in slow start"); + SYSCTL_ADD_U16(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "gp_per_ca", CTLFLAG_RW, + &rack_per_of_gp_ca, 150, + "If non zero, what percentage of goodput to pace at in congestion avoidance"); + SYSCTL_ADD_U16(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "gp_per_rec", CTLFLAG_RW, + &rack_per_of_gp_rec, 200, + "If non zero, what percentage of goodput to pace at in recovery"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "pace_max_seg", CTLFLAG_RW, + &rack_hptsi_segments, 40, + "What size is the max for TSO segments in pacing and burst mitigation"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "burst_reduces", CTLFLAG_RW, + &rack_slot_reduction, 4, + "When doing only burst mitigation what is the reduce divisor"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "data_after_close", CTLFLAG_RW, - &rack_ignore_data_after_close, 0, - "Do we hold off sending a RST until all pending data is ack'd"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "cheat_rxt", CTLFLAG_RW, - &use_rack_cheat, 1, - "Do we use the rxt cheat for rack?"); + OID_AUTO, "use_pacing", CTLFLAG_RW, + &rack_pace_every_seg, 0, + "If set we use pacing, if clear we use only the original burst mitigation"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, + rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "persmin", CTLFLAG_RW, - &rack_persist_min, 250, - "What is the minimum time in milliseconds between persists"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "persmax", CTLFLAG_RW, - &rack_persist_max, 1000, - "What is the largest delay in milliseconds between persists"); + OID_AUTO, + "timely", + CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "Rack Timely RTT Controls"); + /* Timely based GP dynmics */ SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "no_sack_needed", CTLFLAG_RW, - &rack_sack_not_required, 0, - "Do we allow rack to run on connections not supporting SACK?"); + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "upper", CTLFLAG_RW, + &rack_gp_per_bw_mul_up, 2, + "Rack timely upper range for equal b/w (in percentage)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "lower", CTLFLAG_RW, + &rack_gp_per_bw_mul_down, 4, + "Rack timely lower range for equal b/w (in percentage)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "rtt_max_mul", CTLFLAG_RW, + &rack_gp_rtt_maxmul, 3, + "Rack timely multipler of lowest rtt for rtt_max"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "rtt_min_div", CTLFLAG_RW, + &rack_gp_rtt_mindiv, 4, + "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "rtt_min_mul", CTLFLAG_RW, + &rack_gp_rtt_minmul, 1, + "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "decrease", CTLFLAG_RW, + &rack_gp_decrease_per, 20, + "Rack timely decrease percentage of our GP multiplication factor"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "increase", CTLFLAG_RW, + &rack_gp_increase_per, 2, + "Rack timely increase perentage of our GP multiplication factor"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "lowerbound", CTLFLAG_RW, + &rack_per_lower_bound, 50, + "Rack timely lowest percentage we allow GP multiplier to fall to"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "upperboundss", CTLFLAG_RW, + &rack_per_upper_bound_ss, 0, + "Rack timely higest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "upperboundca", CTLFLAG_RW, + &rack_per_upper_bound_ca, 0, + "Rack timely higest percentage we allow GP multiplier to CA raise to (0 is no upperbound)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "dynamicgp", CTLFLAG_RW, + &rack_do_dyn_mul, 0, + "Rack timely do we enable dynmaic timely goodput by default"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "no_rec_red", CTLFLAG_RW, + &rack_gp_no_rec_chg, 1, + "Rack timely do we prohibit the recovery multiplier from being lowered"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "red_clear_cnt", CTLFLAG_RW, + &rack_timely_dec_clear, 6, + "Rack timely what threshold do we count to before another boost during b/w decent"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "max_push_rise", CTLFLAG_RW, + &rack_timely_max_push_rise, 3, + "Rack timely how many times do we push up with b/w increase"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "max_push_drop", CTLFLAG_RW, + &rack_timely_max_push_drop, 3, + "Rack timely how many times do we push back on b/w decent"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "min_segs", CTLFLAG_RW, + &rack_timely_min_segs, 4, + "Rack timely when setting the cwnd what is the min num segments"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "noback_max", CTLFLAG_RW, + &rack_use_max_for_nobackoff, 0, + "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "interim_timely_only", CTLFLAG_RW, + &rack_timely_int_timely_only, 0, + "Rack timely when doing interim timely's do we only do timely (no b/w consideration)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "nonstop", CTLFLAG_RW, + &rack_timely_no_stopping, 0, + "Rack timely don't stop increase"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "dec_raise_thresh", CTLFLAG_RW, + &rack_down_raise_thresh, 100, + "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timely), + OID_AUTO, "bottom_drag_segs", CTLFLAG_RW, + &rack_req_segs, 1, + "Bottom dragging if not these many segments outstanding and room"); + + /* TLP and Rack related parameters */ + rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, + "tlp", + CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "TLP and Rack related Controls"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_tlp), + OID_AUTO, "use_rrr", CTLFLAG_RW, + &use_rack_rr, 1, + "Do we use Rack Rapid Recovery"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_tlp), + OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, + &rack_non_rxt_use_cr, 0, + "Do we use ss/ca rate if in recovery we are transmitting a new data chunk"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "tlpmethod", CTLFLAG_RW, &rack_tlp_threshold_use, TLP_USE_TWO_ONE, "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "gp_percentage", CTLFLAG_RW, - &rack_per_of_gp, 50, - "Do we pace to percentage of goodput (0=old method)?"); + SYSCTL_CHILDREN(rack_tlp), + OID_AUTO, "limit", CTLFLAG_RW, + &rack_tlp_limit, 2, + "How many TLP's can be sent without sending new data"); SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "min_pace_time", CTLFLAG_RW, - &rack_min_pace_time, 0, - "Should we enforce a minimum pace time of 1ms"); + SYSCTL_CHILDREN(rack_tlp), + OID_AUTO, "use_greater", CTLFLAG_RW, + &rack_tlp_use_greater, 1, + "Should we use the rack_rtt time if its greater than srtt"); SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "bb_verbose", CTLFLAG_RW, - &rack_verbose_logging, 0, - "Should RACK black box logging be verbose"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "sackfiltering", CTLFLAG_RW, - &rack_use_sack_filter, 1, - "Do we use sack filtering?"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "delayed_ack", CTLFLAG_RW, - &rack_delayed_ack_time, 200, - "Delayed ack time (200ms)"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "tlpminto", CTLFLAG_RW, &rack_tlp_min, 10, "TLP minimum timeout per the specification (10ms)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "send_oldest", CTLFLAG_RW, - &rack_always_send_oldest, 1, + &rack_always_send_oldest, 0, "Should we always send the oldest TLP and RACK-TLP"); SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "rack_tlimit", CTLFLAG_RW, &rack_limited_retran, 0, "How many times can a rack timeout drive out sends"); SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "minrto", CTLFLAG_RW, - &rack_rto_min, 0, - "Minimum RTO in ms -- set with caution below 1000 due to TLP"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "maxrto", CTLFLAG_RW, - &rack_rto_max, 0, - "Maxiumum RTO in ms -- should be at least as large as min_rto"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "tlp_retry", CTLFLAG_RW, &rack_tlp_max_resend, 2, "How many times does TLP retry a single segment or multiple with no ACK"); SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, - &rack_use_proportional_reduce, 0, - "Should we proportionaly reduce cwnd based on the number of losses "); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "recovery_prop", CTLFLAG_RW, - &rack_proportional_rate, 10, - "What percent reduction per loss"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, &rack_lower_cwnd_at_tlp, 0, - "When a TLP completes a retran should we enter recovery?"); + "When a TLP completes a retran should we enter recovery"); SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "hptsi_reduces", CTLFLAG_RW, - &rack_slot_reduction, 4, - "When setting a slot should we reduce by divisor"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "hptsi_every_seg", CTLFLAG_RW, - &rack_pace_every_seg, 0, - "Should we use the original pacing mechanism that did not pace much?"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "hptsi_seg_max", CTLFLAG_RW, - &rack_hptsi_segments, 40, - "Should we pace out only a limited size of segments"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "prr_sendalot", CTLFLAG_RW, - &rack_send_a_lot_in_prr, 1, - "Send a lot in prr"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "minto", CTLFLAG_RW, - &rack_min_to, 1, - "Minimum rack timeout in milliseconds"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "earlyrecovery", CTLFLAG_RW, - &rack_early_recovery, 1, - "Do we do early recovery with rack"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "reorder_thresh", CTLFLAG_RW, &rack_reorder_thresh, 2, "What factor for rack will be added when seeing reordering (shift right)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, &rack_tlp_thresh, 1, - "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); + "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "reorder_fade", CTLFLAG_RW, &rack_reorder_fade, 0, "Does reorder detection fade, if so how many ms (0 means never)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "pktdelay", CTLFLAG_RW, &rack_pkt_delay, 1, "Extra RACK time (in ms) besides reordering thresh"); - rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx, + /* Timer related controls */ + rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, - "stats", + "timers", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "Rack Counters"); + "Timer related controls"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timers), + OID_AUTO, "persmin", CTLFLAG_RW, + &rack_persist_min, 250, + "What is the minimum time in milliseconds between persists"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timers), + OID_AUTO, "persmax", CTLFLAG_RW, + &rack_persist_max, 2000, + "What is the largest delay in milliseconds between persists"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timers), + OID_AUTO, "delayed_ack", CTLFLAG_RW, + &rack_delayed_ack_time, 200, + "Delayed ack time (200ms)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timers), + OID_AUTO, "minrto", CTLFLAG_RW, + &rack_rto_min, 0, + "Minimum RTO in ms -- set with caution below 1000 due to TLP"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timers), + OID_AUTO, "maxrto", CTLFLAG_RW, + &rack_rto_max, 0, + "Maxiumum RTO in ms -- should be at least as large as min_rto"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timers), + OID_AUTO, "minto", CTLFLAG_RW, + &rack_min_to, 1, + "Minimum rack timeout in milliseconds"); + /* Measure controls */ + rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, + "measure", + CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "Measure related controls"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_measure), + OID_AUTO, "wma_divisor", CTLFLAG_RW, + &rack_wma_divisor, 8, + "When doing b/w calculation what is the divisor for the WMA"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_measure), + OID_AUTO, "end_cwnd", CTLFLAG_RW, + &rack_cwnd_block_ends_measure, 0, + "Does a cwnd just-return end the measurement window (app limited)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_measure), + OID_AUTO, "end_rwnd", CTLFLAG_RW, + &rack_rwnd_block_ends_measure, 0, + "Does an rwnd just-return end the measurement window (app limited -- not persists)"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_measure), + OID_AUTO, "min_target", CTLFLAG_RW, + &rack_def_data_window, 20, + "What is the minimum target window (in mss) for a GP measurements"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_measure), + OID_AUTO, "goal_bdp", CTLFLAG_RW, + &rack_goal_bdp, 2, + "What is the goal BDP to measure"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_measure), + OID_AUTO, "min_srtts", CTLFLAG_RW, + &rack_min_srtts, 1, + "What is the goal BDP to measure"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_measure), + OID_AUTO, "min_measure_tim", CTLFLAG_RW, + &rack_min_measure_usec, 0, + "What is the Minimum time time for a measurement if 0, this is off"); + /* Misc rack controls */ + rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, + "misc", + CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "Misc related controls"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "shared_cwnd", CTLFLAG_RW, + &rack_enable_shared_cwnd, 0, + "Should RACK try to use the shared cwnd on connections where allowed"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, + &rack_limits_scwnd, 1, + "Should RACK place low end time limits on the shared cwnd feature"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW, + &rack_enable_mqueue_for_nonpaced, 0, + "Should RACK use mbuf queuing for non-paced connections"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "iMac_dack", CTLFLAG_RW, + &rack_use_imac_dack, 0, + "Should RACK try to emulate iMac delayed ack"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "no_prr", CTLFLAG_RW, + &rack_disable_prr, 0, + "Should RACK not use prr and only pace (must have pacing on)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "bb_verbose", CTLFLAG_RW, + &rack_verbose_logging, 0, + "Should RACK black box logging be verbose"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "data_after_close", CTLFLAG_RW, + &rack_ignore_data_after_close, 1, + "Do we hold off sending a RST until all pending data is ack'd"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "no_sack_needed", CTLFLAG_RW, + &rack_sack_not_required, 0, + "Do we allow rack to run on connections not supporting SACK"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, + &rack_use_proportional_reduce, 0, + "Should we proportionaly reduce cwnd based on the number of losses "); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "recovery_prop", CTLFLAG_RW, + &rack_proportional_rate, 10, + "What percent reduction per loss"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "prr_sendalot", CTLFLAG_RW, + &rack_send_a_lot_in_prr, 1, + "Send a lot in prr"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "earlyrecovery", CTLFLAG_RW, + &rack_early_recovery, 1, + "Do we do early recovery with rack"); + /* Sack Attacker detection stuff */ + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "detect_highsackratio", CTLFLAG_RW, + &rack_highest_sack_thresh_seen, 0, + "Highest sack to ack ratio seen"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "detect_highmoveratio", CTLFLAG_RW, + &rack_highest_move_thresh_seen, 0, + "Highest move to non-move ratio seen"); + rack_ack_total = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "acktotal", CTLFLAG_RD, + &rack_ack_total, + "Total number of Ack's"); + rack_express_sack = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "exp_sacktotal", CTLFLAG_RD, + &rack_express_sack, + "Total expresss number of Sack's"); + rack_sack_total = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "sacktotal", CTLFLAG_RD, + &rack_sack_total, + "Total number of SACKs"); + rack_move_none = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "move_none", CTLFLAG_RD, + &rack_move_none, + "Total number of SACK index reuse of postions under threshold"); + rack_move_some = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "move_some", CTLFLAG_RD, + &rack_move_some, + "Total number of SACK index reuse of postions over threshold"); + rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "attacks", CTLFLAG_RD, + &rack_sack_attacks_detected, + "Total number of SACK attackers that had sack disabled"); + rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "reversed", CTLFLAG_RD, + &rack_sack_attacks_reversed, + "Total number of SACK attackers that were later determined false positive"); + rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "nextmerge", CTLFLAG_RD, + &rack_sack_used_next_merge, + "Total number of times we used the next merge"); + rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "prevmerge", CTLFLAG_RD, + &rack_sack_used_prev_merge, + "Total number of times we used the prev merge"); + /* Counters */ rack_badfr = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), @@ -723,7 +1262,6 @@ rack_init_sysctls(void) OID_AUTO, "tlp_new", CTLFLAG_RD, &rack_tlp_newdata, "Total number of tail loss probe sending new data"); - rack_tlp_retran = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), @@ -747,32 +1285,31 @@ rack_init_sysctls(void) SYSCTL_CHILDREN(rack_counters), OID_AUTO, "rack_to_tot", CTLFLAG_RD, &rack_to_tot, - "Total number of times the rack to expired?"); + "Total number of times the rack to expired"); rack_to_arm_rack = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "arm_rack", CTLFLAG_RD, &rack_to_arm_rack, - "Total number of times the rack timer armed?"); + "Total number of times the rack timer armed"); rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "arm_tlp", CTLFLAG_RD, &rack_to_arm_tlp, - "Total number of times the tlp timer armed?"); - + "Total number of times the tlp timer armed"); rack_calc_zero = counter_u64_alloc(M_WAITOK); rack_calc_nonzero = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "calc_zero", CTLFLAG_RD, &rack_calc_zero, - "Total number of times pacing time worked out to zero?"); + "Total number of times pacing time worked out to zero"); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "calc_nonzero", CTLFLAG_RD, &rack_calc_nonzero, - "Total number of times pacing time worked out to non-zero?"); + "Total number of times pacing time worked out to non-zero"); rack_paced_segments = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), @@ -839,7 +1376,6 @@ rack_init_sysctls(void) OID_AUTO, "sack_long", CTLFLAG_RD, &rack_sack_proc_all, "Total times we had to walk whole list for sack processing"); - rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), @@ -870,78 +1406,6 @@ rack_init_sysctls(void) OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, &rack_used_tlpmethod2, "Total number of times we hit TLP method 2"); - /* Sack Attacker detection stuff */ - rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, - "sack_attack", - CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "Rack Sack Attack Counters and Controls"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_attack), - OID_AUTO, "detect_highsackratio", CTLFLAG_RW, - &rack_highest_sack_thresh_seen, 0, - "Highest sack to ack ratio seen"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_attack), - OID_AUTO, "detect_highmoveratio", CTLFLAG_RW, - &rack_highest_move_thresh_seen, 0, - "Highest move to non-move ratio seen"); - rack_ack_total = counter_u64_alloc(M_WAITOK); - SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_attack), - OID_AUTO, "acktotal", CTLFLAG_RD, - &rack_ack_total, - "Total number of Ack's"); - - rack_express_sack = counter_u64_alloc(M_WAITOK); - SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_attack), - OID_AUTO, "exp_sacktotal", CTLFLAG_RD, - &rack_express_sack, - "Total expresss number of Sack's"); - rack_sack_total = counter_u64_alloc(M_WAITOK); - SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_attack), - OID_AUTO, "sacktotal", CTLFLAG_RD, - &rack_sack_total, - "Total number of SACK's"); - rack_move_none = counter_u64_alloc(M_WAITOK); - SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_attack), - OID_AUTO, "move_none", CTLFLAG_RD, - &rack_move_none, - "Total number of SACK index reuse of postions under threshold"); - rack_move_some = counter_u64_alloc(M_WAITOK); - SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_attack), - OID_AUTO, "move_some", CTLFLAG_RD, - &rack_move_some, - "Total number of SACK index reuse of postions over threshold"); - rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); - SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_attack), - OID_AUTO, "attacks", CTLFLAG_RD, - &rack_sack_attacks_detected, - "Total number of SACK attackers that had sack disabled"); - rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); - SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_attack), - OID_AUTO, "reversed", CTLFLAG_RD, - &rack_sack_attacks_reversed, - "Total number of SACK attackers that were later determined false positive"); - rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); - SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_attack), - OID_AUTO, "nextmerge", CTLFLAG_RD, - &rack_sack_used_next_merge, - "Total number of times we used the next merge"); - rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); - SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_attack), - OID_AUTO, "prevmerge", CTLFLAG_RD, - &rack_sack_used_prev_merge, - "Total number of times we used the prev merge"); rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), @@ -978,6 +1442,12 @@ rack_init_sysctls(void) OID_AUTO, "tlp_nada", CTLFLAG_RD, &rack_tlp_does_nada, "Total number of nada tlp calls"); + rack_try_scwnd = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tried_scwnd", CTLFLAG_RD, + &rack_try_scwnd, + "Total number of scwnd attempts"); rack_tls_rwnd = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, @@ -985,42 +1455,36 @@ rack_init_sysctls(void) OID_AUTO, "tls_rwnd", CTLFLAG_RD, &rack_tls_rwnd, "Total hdwr tls rwnd limited"); - rack_tls_cwnd = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tls_cwnd", CTLFLAG_RD, &rack_tls_cwnd, "Total hdwr tls cwnd limited"); - rack_tls_app = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tls_app", CTLFLAG_RD, &rack_tls_app, "Total hdwr tls app limited"); - rack_tls_other = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tls_other", CTLFLAG_RD, &rack_tls_other, "Total hdwr tls other limited"); - rack_tls_filled = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tls_filled", CTLFLAG_RD, &rack_tls_filled, "Total hdwr tls filled"); - rack_tls_rxt = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tls_rxt", CTLFLAG_RD, &rack_tls_rxt, "Total hdwr rxt"); - rack_tls_tlp = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), @@ -1033,7 +1497,6 @@ rack_init_sysctls(void) OID_AUTO, "timer_hole", CTLFLAG_RD, &rack_per_timer_hole, "Total persists start in timer hole"); - COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "outsize", CTLFLAG_RD, @@ -1091,30 +1554,151 @@ rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); -static inline int32_t -rack_progress_timeout_check(struct tcpcb *tp) +static uint32_t +rc_init_window(struct tcp_rack *rack) { - if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { - if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { - /* - * There is an assumption that the caller - * will drop the connection so we will - * increment the counters here. - */ - struct tcp_rack *rack; - rack = (struct tcp_rack *)tp->t_fb_ptr; - counter_u64_add(rack_progress_drops, 1); -#ifdef NETFLIX_STATS - KMOD_TCPSTAT_INC(tcps_progdrops); -#endif - rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__); - return (1); - } + uint32_t win; + + if (rack->rc_init_win == 0) { + /* + * Nothing set by the user, use the system stack + * default. + */ + return(tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); } - return (0); + win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; + return(win); } +static uint64_t +rack_get_fixed_pacing_bw(struct tcp_rack *rack) +{ + if (IN_RECOVERY(rack->rc_tp->t_flags)) + return (rack->r_ctl.rc_fixed_pacing_rate_rec); + else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) + return (rack->r_ctl.rc_fixed_pacing_rate_ss); + else + return (rack->r_ctl.rc_fixed_pacing_rate_ca); +} +static uint64_t +rack_get_bw(struct tcp_rack *rack) +{ + if (rack->use_fixed_rate) { + /* Return the fixed pacing rate */ + return (rack_get_fixed_pacing_bw(rack)); + } + if (rack->r_ctl.gp_bw == 0) { + /* + * We have yet no b/w measurement, + * if we have a user set initial bw + * return it. If we don't have that and + * we have an srtt, use the tcp IW (10) to + * calculate a fictional b/w over the SRTT + * which is more or less a guess. Note + * we don't use our IW from rack on purpose + * so if we have like IW=30, we are not + * calculating a "huge" b/w. + */ + uint64_t bw, srtt; + if (rack->r_ctl.init_rate) + return (rack->r_ctl.init_rate); + + /* Has the user set a max peak rate? */ +#ifdef NETFLIX_PEAKRATE + if (rack->rc_tp->t_maxpeakrate) + return (rack->rc_tp->t_maxpeakrate); +#endif + /* Ok lets come up with the IW guess, if we have a srtt */ + if (rack->rc_tp->t_srtt == 0) { + /* + * Go with old pacing method + * i.e. burst mitigation only. + */ + return (0); + } + /* Ok lets get the initial TCP win (not racks) */ + bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); + srtt = ((uint64_t)TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); + bw *= (uint64_t)USECS_IN_SECOND; + bw /= srtt; + return (bw); + } else { + uint64_t bw; + + if(rack->r_ctl.num_avg >= RACK_REQ_AVG) { + /* Averaging is done, we can return the value */ + bw = rack->r_ctl.gp_bw; + } else { + /* Still doing initial average must calculate */ + bw = rack->r_ctl.gp_bw / rack->r_ctl.num_avg; + } +#ifdef NETFLIX_PEAKRATE + if ((rack->rc_tp->t_maxpeakrate) && + (bw > rack->rc_tp->t_maxpeakrate)) { + /* The user has set a peak rate to pace at + * don't allow us to pace faster than that. + */ + return (rack->rc_tp->t_maxpeakrate); + } +#endif + return (bw); + } +} + +static uint16_t +rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) +{ + if (rack->use_fixed_rate) { + return (100); + } else if (rack->in_probe_rtt && (rsm == NULL)) + return(rack->r_ctl.rack_per_of_gp_probertt); + else if ((IN_RECOVERY(rack->rc_tp->t_flags) && + rack->r_ctl.rack_per_of_gp_rec)) { + if (rsm) { + /* a retransmission always use the recovery rate */ + return(rack->r_ctl.rack_per_of_gp_rec); + } else if (rack->rack_rec_nonrxt_use_cr) { + /* Directed to use the configured rate */ + goto configured_rate; + } else if (rack->rack_no_prr && + (rack->r_ctl.rack_per_of_gp_rec > 100)) { + /* No PRR, lets just use the b/w estimate only */ + return(100); + } else { + /* + * Here we may have a non-retransmit but we + * have no overrides, so just use the recovery + * rate (prr is in effect). + */ + return(rack->r_ctl.rack_per_of_gp_rec); + } + } +configured_rate: + /* For the configured rate we look at our cwnd vs the ssthresh */ + if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) + return (rack->r_ctl.rack_per_of_gp_ss); + else + return(rack->r_ctl.rack_per_of_gp_ca); +} + +static uint64_t +rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm) +{ + /* + * We allow rack_per_of_gp_xx to dictate our bw rate we want. + */ + uint64_t bw_est; + uint64_t gain; + + gain = (uint64_t)rack_get_output_gain(rack, rsm); + bw_est = bw * gain; + bw_est /= (uint64_t)100; + /* Never fall below the minimum (def 64kbps) */ + if (bw_est < RACK_MIN_BW) + bw_est = RACK_MIN_BW; + return (bw_est); +} static void rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) @@ -1122,6 +1706,19 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; + + if ((mod != 1) && (rack_verbose_logging == 0)) { + /* + * We get 3 values currently for mod + * 1 - We are retransmitting and this tells the reason. + * 2 - We are clearing a dup-ack count. + * 3 - We are incrementing a dup-ack count. + * + * The clear/increment are only logged + * if you have BBverbose on. + */ + return; + } memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = tsused; log.u_bbr.flex2 = thresh; @@ -1153,14 +1750,17 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); - log.u_bbr.flex2 = to; + log.u_bbr.flex2 = to * 1000; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = slot; log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex7 = rack->rc_in_persist; log.u_bbr.flex8 = which; - log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; + if (rack->rack_no_prr) + log.u_bbr.pkts_out = 0; + else + log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.timeStamp = tcp_get_usecs(&tv); @@ -1174,7 +1774,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot } static void -rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int no) +rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; @@ -1186,8 +1786,14 @@ rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int no) log.u_bbr.flex8 = to_num; log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; log.u_bbr.flex2 = rack->rc_rack_rtt; - log.u_bbr.flex3 = no; - log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; + if (rsm == NULL) + log.u_bbr.flex3 = 0; + else + log.u_bbr.flex3 = rsm->r_end - rsm->r_start; + if (rack->rack_no_prr) + log.u_bbr.flex5 = 0; + else + log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, @@ -1199,27 +1805,64 @@ rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int no) } static void -rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t, - uint32_t o_srtt, uint32_t o_var) +rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, + struct rack_sendmap *rsm, int conf) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = t; - log.u_bbr.flex2 = o_srtt; - log.u_bbr.flex3 = o_var; - log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; - log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; + log.u_bbr.flex2 = len; + log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC; + log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest * HPTS_USEC_IN_MSEC; + log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest * HPTS_USEC_IN_MSEC; log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; - log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot; + log.u_bbr.flex7 = conf; + log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot * (uint64_t)HPTS_USEC_IN_MSEC; log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; - log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; + if (rack->rack_no_prr) + log.u_bbr.pkts_out = 0; + else + log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtt; + log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + if (rsm) { + log.u_bbr.pkt_epoch = rsm->r_start; + log.u_bbr.lost = rsm->r_end; + log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; + } else { + + /* Its a SYN */ + log.u_bbr.pkt_epoch = rack->rc_tp->iss; + log.u_bbr.lost = 0; + log.u_bbr.cwnd_gain = 0; + } + /* Write out general bits of interest rrs here */ + log.u_bbr.use_lt_bw = rack->rc_highly_buffered; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->forced_ack; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->in_probe_rtt; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->rc_gp_filled; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; + log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; + log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; + log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; + log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; + log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; TCP_LOG_EVENTP(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -1285,26 +1928,28 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, } static void -rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts) +rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; - struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; - log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; + if (rack->rack_no_prr) + log.u_bbr.flex2 = 0; + else + log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); log.u_bbr.flex8 = rack->rc_in_persist; - log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.timeStamp = cts; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRSND, 0, - 0, &log, false, &tv); + 0, &log, false, tv); } } @@ -1320,10 +1965,14 @@ rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_ log.u_bbr.flex2 = nxt_pkt; log.u_bbr.flex3 = way_out; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; - log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; + if (rack->rack_no_prr) + log.u_bbr.flex5 = 0; + else + log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex7 = rack->r_wanted_output; log.u_bbr.flex8 = rack->rc_in_persist; + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, @@ -1362,7 +2011,8 @@ rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, } static void -rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling) +rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, + uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; @@ -1373,9 +2023,14 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; - log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.flex4 = reason; + if (rack->rack_no_prr) + log.u_bbr.flex5 = 0; + else + log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex7 = hpts_calling; log.u_bbr.flex8 = rack->rc_in_persist; + log.u_bbr.lt_epoch = cwnd_to_use; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, @@ -1387,24 +2042,62 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui } static void -rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line) +rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, + struct timeval *tv, uint32_t flags_on_entry) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; - struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = line; - log.u_bbr.flex2 = 0; - log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; - log.u_bbr.flex4 = 0; - log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; + log.u_bbr.flex3 = flags_on_entry; + log.u_bbr.flex4 = us_cts; + if (rack->rack_no_prr) + log.u_bbr.flex5 = 0; + else + log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; - log.u_bbr.flex8 = hpts_removed; - log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex7 = hpts_removed; + log.u_bbr.flex8 = 1; + log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; + log.u_bbr.timeStamp = us_cts; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_TIMERCANC, 0, + 0, &log, false, tv); + } +} + +static void +rack_log_alt_to_to_cancel(struct tcp_rack *rack, + uint32_t flex1, uint32_t flex2, + uint32_t flex3, uint32_t flex4, + uint32_t flex5, uint32_t flex6, + uint16_t flex7, uint8_t mod) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + if (mod == 1) { + /* No you can't use 1, its for the real to cancel */ + return; + } + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = flex1; + log.u_bbr.flex2 = flex2; + log.u_bbr.flex3 = flex3; + log.u_bbr.flex4 = flex4; + log.u_bbr.flex5 = flex5; + log.u_bbr.flex6 = flex6; + log.u_bbr.flex7 = flex7; + log.u_bbr.flex8 = mod; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -1426,7 +2119,10 @@ rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex5 = cts; - log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; + if (rack->rack_no_prr) + log.u_bbr.flex6 = 0; + else + log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, @@ -1438,7 +2134,7 @@ rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t } static void -rack_log_to_prr(struct tcp_rack *rack, int frm) +rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; @@ -1447,11 +2143,15 @@ rack_log_to_prr(struct tcp_rack *rack, int frm) memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; - log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; + if (rack->rack_no_prr) + log.u_bbr.flex3 = 0; + else + log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; log.u_bbr.flex5 = rack->r_ctl.rc_sacked; log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; log.u_bbr.flex8 = frm; + log.u_bbr.pkts_out = orig_cwnd; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, @@ -1498,11 +2198,21 @@ rack_log_sad(struct tcp_rack *rack, int event) static void rack_counter_destroy(void) { + counter_u64_free(rack_ack_total); + counter_u64_free(rack_express_sack); + counter_u64_free(rack_sack_total); + counter_u64_free(rack_move_none); + counter_u64_free(rack_move_some); + counter_u64_free(rack_sack_attacks_detected); + counter_u64_free(rack_sack_attacks_reversed); + counter_u64_free(rack_sack_used_next_merge); + counter_u64_free(rack_sack_used_prev_merge); counter_u64_free(rack_badfr); counter_u64_free(rack_badfr_bytes); counter_u64_free(rack_rtm_prr_retran); counter_u64_free(rack_rtm_prr_newdata); counter_u64_free(rack_timestamp_mismatch); + counter_u64_free(rack_find_high); counter_u64_free(rack_reorder_seen); counter_u64_free(rack_tlp_tot); counter_u64_free(rack_tlp_newdata); @@ -1512,27 +2222,39 @@ rack_counter_destroy(void) counter_u64_free(rack_to_tot); counter_u64_free(rack_to_arm_rack); counter_u64_free(rack_to_arm_tlp); + counter_u64_free(rack_calc_zero); + counter_u64_free(rack_calc_nonzero); counter_u64_free(rack_paced_segments); counter_u64_free(rack_unpaced_segments); counter_u64_free(rack_saw_enobuf); counter_u64_free(rack_saw_enetunreach); + counter_u64_free(rack_to_alloc); counter_u64_free(rack_to_alloc_hard); counter_u64_free(rack_to_alloc_emerg); - counter_u64_free(rack_sack_proc_all); - counter_u64_free(rack_sack_proc_short); - counter_u64_free(rack_sack_proc_restart); - counter_u64_free(rack_to_alloc); counter_u64_free(rack_to_alloc_limited); counter_u64_free(rack_alloc_limited_conns); counter_u64_free(rack_split_limited); - counter_u64_free(rack_find_high); + counter_u64_free(rack_sack_proc_all); + counter_u64_free(rack_sack_proc_restart); + counter_u64_free(rack_sack_proc_short); counter_u64_free(rack_enter_tlp_calc); counter_u64_free(rack_used_tlpmethod); counter_u64_free(rack_used_tlpmethod2); + counter_u64_free(rack_sack_skipped_acked); + counter_u64_free(rack_sack_splits); counter_u64_free(rack_progress_drops); counter_u64_free(rack_input_idle_reduces); counter_u64_free(rack_collapsed_win); counter_u64_free(rack_tlp_does_nada); + counter_u64_free(rack_try_scwnd); + counter_u64_free(rack_tls_rwnd); + counter_u64_free(rack_tls_cwnd); + counter_u64_free(rack_tls_app); + counter_u64_free(rack_tls_other); + counter_u64_free(rack_tls_filled); + counter_u64_free(rack_tls_rxt); + counter_u64_free(rack_tls_tlp); + counter_u64_free(rack_per_timer_hole); COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); } @@ -1606,10 +2328,32 @@ rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) { + if (rsm->r_flags & RACK_APP_LIMITED) { + if (rack->r_ctl.rc_app_limited_cnt > 0) { + rack->r_ctl.rc_app_limited_cnt--; + } + } if (rsm->r_limit_type) { /* currently there is only one limit type */ rack->r_ctl.rc_num_split_allocs--; } + if (rsm == rack->r_ctl.rc_first_appl) { + if (rack->r_ctl.rc_app_limited_cnt == 0) + rack->r_ctl.rc_first_appl = NULL; + else { + /* Follow the next one out */ + struct rack_sendmap fe; + + fe.r_start = rsm->r_nseq_appl; + rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); + } + } + if (rsm == rack->r_ctl.rc_resend) + rack->r_ctl.rc_resend = NULL; + if (rsm == rack->r_ctl.rc_rsm_at_retran) + rack->r_ctl.rc_rsm_at_retran = NULL; + if (rsm == rack->r_ctl.rc_end_appl) + rack->r_ctl.rc_end_appl = NULL; if (rack->r_ctl.rc_tlpsend == rsm) rack->r_ctl.rc_tlpsend = NULL; if (rack->r_ctl.rc_sacklast == rsm) @@ -1625,6 +2369,1578 @@ rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) uma_zfree(rack_zone, rsm); } +static uint32_t +rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) +{ + uint64_t srtt, bw, len, tim; + uint32_t segsiz, def_len, minl; + + segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); + def_len = rack_def_data_window * segsiz; + if (rack->rc_gp_filled == 0) { + /* + * We have no measurement (IW is in flight?) so + * we can only guess using our data_window sysctl + * value (usually 100MSS). + */ + return (def_len); + } + /* + * Now we have a number of factors to consider. + * + * 1) We have a desired BDP which is usually + * at least 2. + * 2) We have a minimum number of rtt's usually 1 SRTT + * but we allow it too to be more. + * 3) We want to make sure a measurement last N useconds (if + * we have set rack_min_measure_usec. + * + * We handle the first concern here by trying to create a data + * window of max(rack_def_data_window, DesiredBDP). The + * second concern we handle in not letting the measurement + * window end normally until at least the required SRTT's + * have gone by which is done further below in + * rack_enough_for_measurement(). Finally the third concern + * we also handle here by calculating how long that time + * would take at the current BW and then return the + * max of our first calculation and that length. Note + * that if rack_min_measure_usec is 0, we don't deal + * with concern 3. Also for both Concern 1 and 3 an + * application limited period could end the measurement + * earlier. + * + * So lets calculate the BDP with the "known" b/w using + * the SRTT has our rtt and then multiply it by the + * goal. + */ + bw = rack_get_bw(rack); + srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); + len = bw * srtt; + len /= (uint64_t)HPTS_USEC_IN_SEC; + len *= max(1, rack_goal_bdp); + /* Now we need to round up to the nearest MSS */ + len = roundup(len, segsiz); + if (rack_min_measure_usec) { + /* Now calculate our min length for this b/w */ + tim = rack_min_measure_usec; + minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC; + if (minl == 0) + minl = 1; + minl = roundup(minl, segsiz); + if (len < minl) + len = minl; + } + /* + * Now if we have a very small window we want + * to attempt to get the window that is + * as small as possible. This happens on + * low b/w connections and we don't want to + * span huge numbers of rtt's between measurements. + * + * We basically include 2 over our "MIN window" so + * that the measurement can be shortened (possibly) by + * an ack'ed packet. + */ + if (len < def_len) + return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz))); + else + return (max((uint32_t)len, def_len)); + +} + +static int +rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack) +{ + uint32_t tim, srtts, segsiz; + + /* + * Has enough time passed for the GP measurement to be valid? + */ + if ((tp->snd_max == tp->snd_una) || + (th_ack == tp->snd_max)){ + /* All is acked */ + return (1); + } + if (SEQ_LT(th_ack, tp->gput_seq)) { + /* Not enough bytes yet */ + return (0); + } + segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); + if (SEQ_LT(th_ack, tp->gput_ack) && + ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { + /* Not enough bytes yet */ + return (0); + } + if (rack->r_ctl.rc_first_appl && + (rack->r_ctl.rc_first_appl->r_start == th_ack)) { + /* + * We are up to the app limited point + * we have to measure irrespective of the time.. + */ + return (1); + } + /* Now what about time? */ + srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); + tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; + if (tim >= srtts) { + return (1); + } + /* Nope not even a full SRTT has passed */ + return (0); +} + + +static void +rack_log_timely(struct tcp_rack *rack, + uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, + uint64_t up_bnd, int line, uint8_t method) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = logged; + log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; + log.u_bbr.flex2 <<= 4; + log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; + log.u_bbr.flex2 <<= 4; + log.u_bbr.flex2 |= rack->rc_gp_incr; + log.u_bbr.flex2 <<= 4; + log.u_bbr.flex2 |= rack->rc_gp_bwred; + log.u_bbr.flex3 = rack->rc_gp_incr; + log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; + log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; + log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; + log.u_bbr.flex7 = rack->rc_gp_bwred; + log.u_bbr.flex8 = method; + log.u_bbr.cur_del_rate = cur_bw; + log.u_bbr.delRate = low_bnd; + log.u_bbr.bw_inuse = up_bnd; + log.u_bbr.rttProp = rack_get_bw(rack); + log.u_bbr.pkt_epoch = line; + log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; + log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; + log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; + log.u_bbr.cwnd_gain <<= 1; + log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; + log.u_bbr.cwnd_gain <<= 1; + log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; + log.u_bbr.cwnd_gain <<= 1; + log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; + log.u_bbr.lost = rack->r_ctl.rc_loss_count; + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + TCP_TIMELY_WORK, 0, + 0, &log, false, &tv); + } +} + +static int +rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) +{ + /* + * Before we increase we need to know if + * the estimate just made was less than + * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) + * + * If we already are pacing at a fast enough + * rate to push us faster there is no sense of + * increasing. + * + * We first caculate our actual pacing rate (ss or ca multipler + * times our cur_bw). + * + * Then we take the last measured rate and multipy by our + * maximum pacing overage to give us a max allowable rate. + * + * If our act_rate is smaller than our max_allowable rate + * then we should increase. Else we should hold steady. + * + */ + uint64_t act_rate, max_allow_rate; + + if (rack_timely_no_stopping) + return (1); + + if ((cur_bw == 0) || (last_bw_est == 0)) { + /* + * Initial startup case or + * everything is acked case. + */ + rack_log_timely(rack, mult, cur_bw, 0, 0, + __LINE__, 9); + return (1); + } + if (mult <= 100) { + /* + * We can always pace at or slightly above our rate. + */ + rack_log_timely(rack, mult, cur_bw, 0, 0, + __LINE__, 9); + return (1); + } + act_rate = cur_bw * (uint64_t)mult; + act_rate /= 100; + max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); + max_allow_rate /= 100; + if (act_rate < max_allow_rate) { + /* + * Here the rate we are actually pacing at + * is smaller than 10% above our last measurement. + * This means we are pacing below what we would + * like to try to achieve (plus some wiggle room). + */ + rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, + __LINE__, 9); + return (1); + } else { + /* + * Here we are already pacing at least rack_max_per_above(10%) + * what we are getting back. This indicates most likely + * that we are being limited (cwnd/rwnd/app) and can't + * get any more b/w. There is no sense of trying to + * raise up the pacing rate its not speeding us up + * and we already are pacing faster than we are getting. + */ + rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, + __LINE__, 8); + return (0); + } +} + +static void +rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) +{ + /* + * When we drag bottom, we want to assure + * that no multiplier is below 1.0, if so + * we want to restore it to at least that. + */ + if (rack->r_ctl.rack_per_of_gp_rec < 100) { + /* This is unlikely we usually do not touch recovery */ + rack->r_ctl.rack_per_of_gp_rec = 100; + } + if (rack->r_ctl.rack_per_of_gp_ca < 100) { + rack->r_ctl.rack_per_of_gp_ca = 100; + } + if (rack->r_ctl.rack_per_of_gp_ss < 100) { + rack->r_ctl.rack_per_of_gp_ss = 100; + } +} + +static void +rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) +{ + if (rack->r_ctl.rack_per_of_gp_ca > 100) { + rack->r_ctl.rack_per_of_gp_ca = 100; + } + if (rack->r_ctl.rack_per_of_gp_ss > 100) { + rack->r_ctl.rack_per_of_gp_ss = 100; + } +} + +static void +rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) +{ + int32_t calc, logged, plus; + + logged = 0; + + if (override) { + /* + * override is passed when we are + * loosing b/w and making one last + * gasp at trying to not loose out + * to a new-reno flow. + */ + goto extra_boost; + } + /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ + if (rack->rc_gp_incr && + ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { + /* + * Reset and get 5 strokes more before the boost. Note + * that the count is 0 based so we have to add one. + */ +extra_boost: + plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; + rack->rc_gp_timely_inc_cnt = 0; + } else + plus = (uint32_t)rack_gp_increase_per; + /* Must be at least 1% increase for true timely increases */ + if ((plus < 1) && + ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) + plus = 1; + if (rack->rc_gp_saw_rec && + (rack->rc_gp_no_rec_chg == 0) && + rack_bw_can_be_raised(rack, cur_bw, last_bw_est, + rack->r_ctl.rack_per_of_gp_rec)) { + /* We have been in recovery ding it too */ + calc = rack->r_ctl.rack_per_of_gp_rec + plus; + if (calc > 0xffff) + calc = 0xffff; + logged |= 1; + rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; + if (rack_per_upper_bound_ss && + (rack->rc_dragged_bottom == 0) && + (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss)) + rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss; + } + if (rack->rc_gp_saw_ca && + (rack->rc_gp_saw_ss == 0) && + rack_bw_can_be_raised(rack, cur_bw, last_bw_est, + rack->r_ctl.rack_per_of_gp_ca)) { + /* In CA */ + calc = rack->r_ctl.rack_per_of_gp_ca + plus; + if (calc > 0xffff) + calc = 0xffff; + logged |= 2; + rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; + if (rack_per_upper_bound_ca && + (rack->rc_dragged_bottom == 0) && + (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca)) + rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca; + } + if (rack->rc_gp_saw_ss && + rack_bw_can_be_raised(rack, cur_bw, last_bw_est, + rack->r_ctl.rack_per_of_gp_ss)) { + /* In SS */ + calc = rack->r_ctl.rack_per_of_gp_ss + plus; + if (calc > 0xffff) + calc = 0xffff; + rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; + if (rack_per_upper_bound_ss && + (rack->rc_dragged_bottom == 0) && + (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss)) + rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss; + logged |= 4; + } + if (logged && + (rack->rc_gp_incr == 0)){ + /* Go into increment mode */ + rack->rc_gp_incr = 1; + rack->rc_gp_timely_inc_cnt = 0; + } + if (rack->rc_gp_incr && + logged && + (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { + rack->rc_gp_timely_inc_cnt++; + } + rack_log_timely(rack, logged, plus, 0, 0, + __LINE__, 1); +} + +static uint32_t +rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) +{ + /* + * norm_grad = rtt_diff / minrtt; + * new_per = curper * (1 - B * norm_grad) + * + * B = rack_gp_decrease_per (default 10%) + * rtt_dif = input var current rtt-diff + * curper = input var current percentage + * minrtt = from rack filter + * + */ + uint64_t perf; + + perf = (((uint64_t)curper * ((uint64_t)1000000 - + ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * + (((uint64_t)rtt_diff * (uint64_t)1000000)/ + (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ + (uint64_t)1000000)) / + (uint64_t)1000000); + if (perf > curper) { + /* TSNH */ + perf = curper - 1; + } + return ((uint32_t)perf); +} + +static uint32_t +rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) +{ + /* + * highrttthresh + * result = curper * (1 - (B * ( 1 - ------ )) + * gp_srtt + * + * B = rack_gp_decrease_per (default 10%) + * highrttthresh = filter_min * rack_gp_rtt_maxmul + */ + uint64_t perf; + uint32_t highrttthresh; + + highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; + + perf = (((uint64_t)curper * ((uint64_t)1000000 - + ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - + ((uint64_t)highrttthresh * (uint64_t)1000000) / + (uint64_t)rtt)) / 100)) /(uint64_t)1000000); + return (perf); +} + + +static void +rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) +{ + uint64_t logvar, logvar2, logvar3; + uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; + + if (rack->rc_gp_incr) { + /* Turn off increment counting */ + rack->rc_gp_incr = 0; + rack->rc_gp_timely_inc_cnt = 0; + } + ss_red = ca_red = rec_red = 0; + logged = 0; + /* Calculate the reduction value */ + if (rtt_diff < 0) { + rtt_diff *= -1; + } + /* Must be at least 1% reduction */ + if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { + /* We have been in recovery ding it too */ + if (timely_says == 2) { + new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); + alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); + if (alt < new_per) + val = alt; + else + val = new_per; + } else + val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); + if (rack->r_ctl.rack_per_of_gp_rec > val) { + rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); + rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; + } else { + rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; + rec_red = 0; + } + if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) + rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; + logged |= 1; + } + if (rack->rc_gp_saw_ss) { + /* Sent in SS */ + if (timely_says == 2) { + new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); + alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); + if (alt < new_per) + val = alt; + else + val = new_per; + } else + val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); + if (rack->r_ctl.rack_per_of_gp_ss > new_per) { + ss_red = rack->r_ctl.rack_per_of_gp_ss - val; + rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; + } else { + ss_red = new_per; + rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; + logvar = new_per; + logvar <<= 32; + logvar |= alt; + logvar2 = (uint32_t)rtt; + logvar2 <<= 32; + logvar2 |= (uint32_t)rtt_diff; + logvar3 = rack_gp_rtt_maxmul; + logvar3 <<= 32; + logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); + rack_log_timely(rack, timely_says, + logvar2, logvar3, + logvar, __LINE__, 10); + } + if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) + rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; + logged |= 4; + } else if (rack->rc_gp_saw_ca) { + /* Sent in CA */ + if (timely_says == 2) { + new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); + alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); + if (alt < new_per) + val = alt; + else + val = new_per; + } else + val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); + if (rack->r_ctl.rack_per_of_gp_ca > val) { + ca_red = rack->r_ctl.rack_per_of_gp_ca - val; + rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val; + } else { + rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; + ca_red = 0; + logvar = new_per; + logvar <<= 32; + logvar |= alt; + logvar2 = (uint32_t)rtt; + logvar2 <<= 32; + logvar2 |= (uint32_t)rtt_diff; + logvar3 = rack_gp_rtt_maxmul; + logvar3 <<= 32; + logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); + rack_log_timely(rack, timely_says, + logvar2, logvar3, + logvar, __LINE__, 10); + } + if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) + rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; + logged |= 2; + } + if (rack->rc_gp_timely_dec_cnt < 0x7) { + rack->rc_gp_timely_dec_cnt++; + if (rack_timely_dec_clear && + (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) + rack->rc_gp_timely_dec_cnt = 0; + } + logvar = ss_red; + logvar <<= 32; + logvar |= ca_red; + rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, + __LINE__, 2); +} + +static void +rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, + uint32_t rtt, uint32_t line, uint8_t reas) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex1 = line; + log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; + log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; + log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; + log.u_bbr.flex5 = rtt; + log.u_bbr.flex6 = rack->rc_highly_buffered; + log.u_bbr.flex6 <<= 1; + log.u_bbr.flex6 |= rack->forced_ack; + log.u_bbr.flex6 <<= 1; + log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; + log.u_bbr.flex6 <<= 1; + log.u_bbr.flex6 |= rack->in_probe_rtt; + log.u_bbr.flex6 <<= 1; + log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; + log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; + log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; + log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; + log.u_bbr.flex8 = reas; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.delRate = rack_get_bw(rack); + log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; + log.u_bbr.cur_del_rate <<= 32; + log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; + log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; + log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; + log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; + log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; + log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; + log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); + log.u_bbr.rttProp = us_cts; + log.u_bbr.rttProp <<= 32; + log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_RTT_SHRINKS, 0, + 0, &log, false, &rack->r_ctl.act_rcv_time); + } +} + +static void +rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) +{ + uint64_t bwdp; + + bwdp = rack_get_bw(rack); + bwdp *= (uint64_t)rtt; + bwdp /= (uint64_t)HPTS_USEC_IN_SEC; + rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); + if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) { + /* + * A window protocol must be able to have 4 packets + * outstanding as the floor in order to function + * (especially considering delayed ack :D). + */ + rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); + } +} + +static void +rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) +{ + /** + * ProbeRTT is a bit different in rack_pacing than in + * BBR. It is like BBR in that it uses the lowering of + * the RTT as a signal that we saw something new and + * counts from there for how long between. But it is + * different in that its quite simple. It does not + * play with the cwnd and wait until we get down + * to N segments outstanding and hold that for + * 200ms. Instead it just sets the pacing reduction + * rate to a set percentage (70 by default) and hold + * that for a number of recent GP Srtt's. + */ + uint32_t segsiz; + + if (rack->rc_gp_dyn_mul == 0) + return; + + if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { + /* We are idle */ + return; + } + if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && + SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { + /* + * Stop the goodput now, the idea here is + * that future measurements with in_probe_rtt + * won't register if they are not greater so + * we want to get what info (if any) is available + * now. + */ + rack_do_goodput_measurement(rack->rc_tp, rack, + rack->rc_tp->snd_una, __LINE__); + } + rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; + rack->r_ctl.rc_time_probertt_entered = us_cts; + segsiz = min(ctf_fixed_maxseg(rack->rc_tp), + rack->r_ctl.rc_pace_min_segs); + rack->in_probe_rtt = 1; + rack->measure_saw_probe_rtt = 1; + rack->r_ctl.rc_lower_rtt_us_cts = us_cts; + rack->r_ctl.rc_time_probertt_starts = 0; + rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; + if (rack_probertt_use_min_rtt_entry) + rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); + else + rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); + rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), + __LINE__, RACK_RTTS_ENTERPROBE); +} + +static void +rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) +{ + struct rack_sendmap *rsm; + uint32_t segsiz; + + segsiz = min(ctf_fixed_maxseg(rack->rc_tp), + rack->r_ctl.rc_pace_min_segs); + rack->in_probe_rtt = 0; + if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && + SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { + /* + * Stop the goodput now, the idea here is + * that future measurements with in_probe_rtt + * won't register if they are not greater so + * we want to get what info (if any) is available + * now. + */ + rack_do_goodput_measurement(rack->rc_tp, rack, + rack->rc_tp->snd_una, __LINE__); + } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { + /* + * We don't have enough data to make a measurement. + * So lets just stop and start here after exiting + * probe-rtt. We probably are not interested in + * the results anyway. + */ + rack->rc_tp->t_flags &= ~TF_GPUTINPROG; + } + /* + * Measurements through the current snd_max are going + * to be limited by the slower pacing rate. + * + * We need to mark these as app-limited so we + * don't collapse the b/w. + */ + rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { + if (rack->r_ctl.rc_app_limited_cnt == 0) + rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; + else { + /* + * Go out to the end app limited and mark + * this new one as next and move the end_appl up + * to this guy. + */ + if (rack->r_ctl.rc_end_appl) + rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; + rack->r_ctl.rc_end_appl = rsm; + } + rsm->r_flags |= RACK_APP_LIMITED; + rack->r_ctl.rc_app_limited_cnt++; + } + /* + * Now, we need to examine our pacing rate multipliers. + * If its under 100%, we need to kick it back up to + * 100%. We also don't let it be over our "max" above + * the actual rate i.e. 100% + rack_clamp_atexit_prtt. + * Note setting clamp_atexit_prtt to 0 has the effect + * of setting CA/SS to 100% always at exit (which is + * the default behavior). + */ + if (rack_probertt_clear_is) { + rack->rc_gp_incr = 0; + rack->rc_gp_bwred = 0; + rack->rc_gp_timely_inc_cnt = 0; + rack->rc_gp_timely_dec_cnt = 0; + } + /* Do we do any clamping at exit? */ + if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { + rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; + rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; + } + if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { + rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; + rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; + } + /* + * Lets set rtt_diff to 0, so that we will get a "boost" + * after exiting. + */ + rack->r_ctl.rc_rtt_diff = 0; + + /* Clear all flags so we start fresh */ + rack->rc_tp->t_bytes_acked = 0; + rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND; + /* + * If configured to, set the cwnd and ssthresh to + * our targets. + */ + if (rack_probe_rtt_sets_cwnd) { + uint64_t ebdp; + uint32_t setto; + + /* Set ssthresh so we get into CA once we hit our target */ + if (rack_probertt_use_min_rtt_exit == 1) { + /* Set to min rtt */ + rack_set_prtt_target(rack, segsiz, + get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); + } else if (rack_probertt_use_min_rtt_exit == 2) { + /* Set to current gp rtt */ + rack_set_prtt_target(rack, segsiz, + rack->r_ctl.rc_gp_srtt); + } else if (rack_probertt_use_min_rtt_exit == 3) { + /* Set to entry gp rtt */ + rack_set_prtt_target(rack, segsiz, + rack->r_ctl.rc_entry_gp_rtt); + } else { + uint64_t sum; + uint32_t setval; + + sum = rack->r_ctl.rc_entry_gp_rtt; + sum *= 10; + sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); + if (sum >= 20) { + /* + * A highly buffered path needs + * cwnd space for timely to work. + * Lets set things up as if + * we are heading back here again. + */ + setval = rack->r_ctl.rc_entry_gp_rtt; + } else if (sum >= 15) { + /* + * Lets take the smaller of the + * two since we are just somewhat + * buffered. + */ + setval = rack->r_ctl.rc_gp_srtt; + if (setval > rack->r_ctl.rc_entry_gp_rtt) + setval = rack->r_ctl.rc_entry_gp_rtt; + } else { + /* + * Here we are not highly buffered + * and should pick the min we can to + * keep from causing loss. + */ + setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); + } + rack_set_prtt_target(rack, segsiz, + setval); + } + if (rack_probe_rtt_sets_cwnd > 1) { + /* There is a percentage here to boost */ + ebdp = rack->r_ctl.rc_target_probertt_flight; + ebdp *= rack_probe_rtt_sets_cwnd; + ebdp /= 100; + setto = rack->r_ctl.rc_target_probertt_flight + ebdp; + } else + setto = rack->r_ctl.rc_target_probertt_flight; + rack->rc_tp->snd_cwnd = roundup(setto, segsiz); + if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { + /* Enforce a min */ + rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; + } + /* If we set in the cwnd also set the ssthresh point so we are in CA */ + rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); + } + rack_log_rtt_shrinks(rack, us_cts, + get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), + __LINE__, RACK_RTTS_EXITPROBE); + /* Clear times last so log has all the info */ + rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; + rack->r_ctl.rc_time_probertt_entered = us_cts; + rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; + rack->r_ctl.rc_time_of_last_probertt = us_cts; +} + +static void +rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) +{ + /* Check in on probe-rtt */ + if (rack->rc_gp_filled == 0) { + /* We do not do p-rtt unless we have gp measurements */ + return; + } + if (rack->in_probe_rtt) { + uint64_t no_overflow; + uint32_t endtime, must_stay; + + if (rack->r_ctl.rc_went_idle_time && + ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { + /* + * We went idle during prtt, just exit now. + */ + rack_exit_probertt(rack, us_cts); + } else if (rack_probe_rtt_safety_val && + TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && + ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { + /* + * Probe RTT safety value triggered! + */ + rack_log_rtt_shrinks(rack, us_cts, + get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), + __LINE__, RACK_RTTS_SAFETY); + rack_exit_probertt(rack, us_cts); + } + /* Calculate the max we will wait */ + endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); + if (rack->rc_highly_buffered) + endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); + /* Calculate the min we must wait */ + must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); + if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && + TSTMP_LT(us_cts, endtime)) { + uint32_t calc; + /* Do we lower more? */ +no_exit: + if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) + calc = us_cts - rack->r_ctl.rc_time_probertt_entered; + else + calc = 0; + calc /= max(rack->r_ctl.rc_gp_srtt, 1); + if (calc) { + /* Maybe */ + calc *= rack_per_of_gp_probertt_reduce; + rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; + /* Limit it too */ + if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) + rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; + } + /* We must reach target or the time set */ + return; + } + if (rack->r_ctl.rc_time_probertt_starts == 0) { + if ((TSTMP_LT(us_cts, must_stay) && + rack->rc_highly_buffered) || + (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > + rack->r_ctl.rc_target_probertt_flight)) { + /* We are not past the must_stay time */ + goto no_exit; + } + rack_log_rtt_shrinks(rack, us_cts, + get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), + __LINE__, RACK_RTTS_REACHTARGET); + rack->r_ctl.rc_time_probertt_starts = us_cts; + if (rack->r_ctl.rc_time_probertt_starts == 0) + rack->r_ctl.rc_time_probertt_starts = 1; + /* Restore back to our rate we want to pace at in prtt */ + rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; + } + /* + * Setup our end time, some number of gp_srtts plus 200ms. + */ + no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * + (uint64_t)rack_probertt_gpsrtt_cnt_mul); + if (rack_probertt_gpsrtt_cnt_div) + endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); + else + endtime = 0; + endtime += rack_min_probertt_hold; + endtime += rack->r_ctl.rc_time_probertt_starts; + if (TSTMP_GEQ(us_cts, endtime)) { + /* yes, exit probertt */ + rack_exit_probertt(rack, us_cts); + } + + } else if((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { + /* Go into probertt, its been too long since we went lower */ + rack_enter_probertt(rack, us_cts); + } +} + +static void +rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, + uint32_t rtt, int32_t rtt_diff) +{ + uint64_t cur_bw, up_bnd, low_bnd, subfr; + uint32_t losses; + + if ((rack->rc_gp_dyn_mul == 0) || + (rack->use_fixed_rate) || + (rack->in_probe_rtt) || + (rack->rc_always_pace == 0)) { + /* No dynamic GP multipler in play */ + return; + } + losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; + cur_bw = rack_get_bw(rack); + /* Calculate our up and down range */ + up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; + up_bnd /= 100; + up_bnd += rack->r_ctl.last_gp_comp_bw; + + subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; + subfr /= 100; + low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; + if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { + /* + * This is the case where our RTT is above + * the max target and we have been configured + * to just do timely no bonus up stuff in that case. + * + * There are two configurations, set to 1, and we + * just do timely if we are over our max. If its + * set above 1 then we slam the multipliers down + * to 100 and then decrement per timely. + */ + rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, + __LINE__, 3); + if (rack->r_ctl.rc_no_push_at_mrtt > 1) + rack_validate_multipliers_at_or_below_100(rack); + rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); + } else if ((last_bw_est < low_bnd) && !losses) { + /* + * We are decreasing this is a bit complicated this + * means we are loosing ground. This could be + * because another flow entered and we are competing + * for b/w with it. This will push the RTT up which + * makes timely unusable unless we want to get shoved + * into a corner and just be backed off (the age + * old problem with delay based CC). + * + * On the other hand if it was a route change we + * would like to stay somewhat contained and not + * blow out the buffers. + */ + rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, + __LINE__, 3); + rack->r_ctl.last_gp_comp_bw = cur_bw; + if (rack->rc_gp_bwred == 0) { + /* Go into reduction counting */ + rack->rc_gp_bwred = 1; + rack->rc_gp_timely_dec_cnt = 0; + } + if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) || + (timely_says == 0)) { + /* + * Push another time with a faster pacing + * to try to gain back (we include override to + * get a full raise factor). + */ + if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || + (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || + (timely_says == 0) || + (rack_down_raise_thresh == 0)) { + /* + * Do an override up in b/w if we were + * below the threshold or if the threshold + * is zero we always do the raise. + */ + rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); + } else { + /* Log it stays the same */ + rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, + __LINE__, 11); + + } + rack->rc_gp_timely_dec_cnt++; + /* We are not incrementing really no-count */ + rack->rc_gp_incr = 0; + rack->rc_gp_timely_inc_cnt = 0; + } else { + /* + * Lets just use the RTT + * information and give up + * pushing. + */ + goto use_timely; + } + } else if ((timely_says != 2) && + !losses && + (last_bw_est > up_bnd)) { + /* + * We are increasing b/w lets keep going, updating + * our b/w and ignoring any timely input, unless + * of course we are at our max raise (if there is one). + */ + + rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, + __LINE__, 3); + rack->r_ctl.last_gp_comp_bw = cur_bw; + if (rack->rc_gp_saw_ss && + rack_per_upper_bound_ss && + (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) { + /* + * In cases where we can't go higher + * we should just use timely. + */ + goto use_timely; + } + if (rack->rc_gp_saw_ca && + rack_per_upper_bound_ca && + (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) { + /* + * In cases where we can't go higher + * we should just use timely. + */ + goto use_timely; + } + rack->rc_gp_bwred = 0; + rack->rc_gp_timely_dec_cnt = 0; + /* You get a set number of pushes if timely is trying to reduce */ + if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { + rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); + } else { + /* Log it stays the same */ + rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, + __LINE__, 12); + + } + return; + } else { + /* + * We are staying between the lower and upper range bounds + * so use timely to decide. + */ + rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, + __LINE__, 3); +use_timely: + if (timely_says) { + rack->rc_gp_incr = 0; + rack->rc_gp_timely_inc_cnt = 0; + if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && + !losses && + (last_bw_est < low_bnd)) { + /* We are loosing ground */ + rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); + rack->rc_gp_timely_dec_cnt++; + /* We are not incrementing really no-count */ + rack->rc_gp_incr = 0; + rack->rc_gp_timely_inc_cnt = 0; + } else + rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); + } else { + rack->rc_gp_bwred = 0; + rack->rc_gp_timely_dec_cnt = 0; + rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); + } + } +} + +static int32_t +rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) +{ + int32_t timely_says; + uint64_t log_mult, log_rtt_a_diff; + + log_rtt_a_diff = rtt; + log_rtt_a_diff <<= 32; + log_rtt_a_diff |= (uint32_t)rtt_diff; + if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * + rack_gp_rtt_maxmul)) { + /* Reduce the b/w multipler */ + timely_says = 2; + log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; + log_mult <<= 32; + log_mult |= prev_rtt; + rack_log_timely(rack, timely_says, log_mult, + get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), + log_rtt_a_diff, __LINE__, 4); + } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + + ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / + max(rack_gp_rtt_mindiv , 1)))) { + /* Increase the b/w multipler */ + log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + + ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / + max(rack_gp_rtt_mindiv , 1)); + log_mult <<= 32; + log_mult |= prev_rtt; + timely_says = 0; + rack_log_timely(rack, timely_says, log_mult , + get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), + log_rtt_a_diff, __LINE__, 5); + } else { + /* + * Use a gradient to find it the timely gradient + * is: + * grad = rc_rtt_diff / min_rtt; + * + * anything below or equal to 0 will be + * a increase indication. Anything above + * zero is a decrease. Note we take care + * of the actual gradient calculation + * in the reduction (its not needed for + * increase). + */ + log_mult = prev_rtt; + if (rtt_diff <= 0) { + /* + * Rttdiff is less than zero, increase the + * b/w multipler (its 0 or negative) + */ + timely_says = 0; + rack_log_timely(rack, timely_says, log_mult, + get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); + } else { + /* Reduce the b/w multipler */ + timely_says = 1; + rack_log_timely(rack, timely_says, log_mult, + get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); + } + } + return (timely_says); +} + +static void +rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, + tcp_seq th_ack, int line) +{ + uint64_t tim, bytes_ps, ltim, stim, utim; + uint32_t segsiz, bytes, reqbytes, us_cts; + int32_t gput, new_rtt_diff, timely_says; + + us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); + if (TSTMP_GEQ(us_cts, tp->gput_ts)) + tim = us_cts - tp->gput_ts; + else + tim = 0; + + if (TSTMP_GT(rack->r_ctl.rc_gp_cumack_ts, rack->r_ctl.rc_gp_output_ts)) + stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; + else + stim = 0; + /* + * Use the larger of the send time or ack time. This prevents us + * from being influenced by ack artifacts to come up with too + * high of measurement. Note that since we are spanning over many more + * bytes in most of our measurements hopefully that is less likely to + * occur. + */ + if (tim > stim) + utim = max(tim, 1); + else + utim = max(stim, 1); + /* Lets validate utim */ + ltim = max(1, (utim/HPTS_USEC_IN_MSEC)); + gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim; + reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); + if ((tim == 0) && (stim == 0)) { + /* + * Invalid measurement time, maybe + * all on one ack/one send? + */ + bytes = 0; + bytes_ps = 0; + rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, + 0, 0, 0, 10, __LINE__, NULL); + goto skip_measurement; + } + if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { + /* We never made a us_rtt measurement? */ + bytes = 0; + bytes_ps = 0; + rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, + 0, 0, 0, 10, __LINE__, NULL); + goto skip_measurement; + } + /* + * Calculate the maximum possible b/w this connection + * could have. We base our calculation on the lowest + * rtt we have seen during the measurement and the + * largest rwnd the client has given us in that time. This + * forms a BDP that is the maximum that we could ever + * get to the client. Anything larger is not valid. + * + * I originally had code here that rejected measurements + * where the time was less than 1/2 the latest us_rtt. + * But after thinking on that I realized its wrong since + * say you had a 150Mbps or even 1Gbps link, and you + * were a long way away.. example I am in Europe (100ms rtt) + * talking to my 1Gbps link in S.C. Now measuring say 150,000 + * bytes my time would be 1.2ms, and yet my rtt would say + * the measurement was invalid the time was < 50ms. The + * same thing is true for 150Mb (8ms of time). + * + * A better way I realized is to look at what the maximum + * the connection could possibly do. This is gated on + * the lowest RTT we have seen and the highest rwnd. + * We should in theory never exceed that, if we are + * then something on the path is storing up packets + * and then feeding them all at once to our endpoint + * messing up our measurement. + */ + rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; + rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; + rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; + if (SEQ_LT(th_ack, tp->gput_seq)) { + /* No measurement can be made */ + bytes = 0; + bytes_ps = 0; + rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, + 0, 0, 0, 10, __LINE__, NULL); + goto skip_measurement; + } else + bytes = (th_ack - tp->gput_seq); + bytes_ps = (uint64_t)bytes; + /* + * Don't measure a b/w for pacing unless we have gotten at least + * an initial windows worth of data in this measurement interval. + * + * Small numbers of bytes get badly influenced by delayed ack and + * other artifacts. Note we take the initial window or our + * defined minimum GP (defaulting to 10 which hopefully is the + * IW). + */ + if (rack->rc_gp_filled == 0) { + /* + * The initial estimate is special. We + * have blasted out an IW worth of packets + * without a real valid ack ts results. We + * then setup the app_limited_needs_set flag, + * this should get the first ack in (probably 2 + * MSS worth) to be recorded as the timestamp. + * We thus allow a smaller number of bytes i.e. + * IW - 2MSS. + */ + reqbytes -= (2 * segsiz); + /* Also lets fill previous for our first measurement to be neutral */ + rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; + } + if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { + rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, + rack->r_ctl.rc_app_limited_cnt, + 0, 0, 10, __LINE__, NULL); + goto skip_measurement; + } + /* + * We now need to calculate the Timely like status so + * we can update (possibly) the b/w multipliers. + */ + new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; + if (rack->rc_gp_filled == 0) { + /* No previous reading */ + rack->r_ctl.rc_rtt_diff = new_rtt_diff; + } else { + if (rack->measure_saw_probe_rtt == 0) { + /* + * We don't want a probertt to be counted + * since it will be negative incorrectly. We + * expect to be reducing the RTT when we + * pace at a slower rate. + */ + rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); + rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); + } + } + timely_says = rack_make_timely_judgement(rack, + rack->r_ctl.rc_gp_srtt, + rack->r_ctl.rc_rtt_diff, + rack->r_ctl.rc_prev_gp_srtt + ); + bytes_ps *= HPTS_USEC_IN_SEC; + bytes_ps /= utim; + if (bytes_ps > rack->r_ctl.last_max_bw) { + /* + * Something is on path playing + * since this b/w is not possible based + * on our BDP (highest rwnd and lowest rtt + * we saw in the measurement window). + * + * Another option here would be to + * instead skip the measurement. + */ + rack_log_pacing_delay_calc(rack, bytes, reqbytes, + bytes_ps, rack->r_ctl.last_max_bw, 0, + 11, __LINE__, NULL); + bytes_ps = rack->r_ctl.last_max_bw; + } + /* We store gp for b/w in bytes per second */ + if (rack->rc_gp_filled == 0) { + /* Initial measurment */ + if (bytes_ps) { + rack->r_ctl.gp_bw = bytes_ps; + rack->rc_gp_filled = 1; + rack->r_ctl.num_avg = 1; + rack_set_pace_segments(rack->rc_tp, rack, __LINE__); + } else { + rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, + rack->r_ctl.rc_app_limited_cnt, + 0, 0, 10, __LINE__, NULL); + } + if (rack->rc_inp->inp_in_hpts && + (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { + /* + * Ok we can't trust the pacer in this case + * where we transition from un-paced to paced. + * Or for that matter when the burst mitigation + * was making a wild guess and got it wrong. + * Stop the pacer and clear up all the aggregate + * delays etc. + */ + tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); + rack->r_ctl.rc_hpts_flags = 0; + rack->r_ctl.rc_last_output_to = 0; + } + } else if (rack->r_ctl.num_avg < RACK_REQ_AVG) { + /* Still a small number run an average */ + rack->r_ctl.gp_bw += bytes_ps; + rack->r_ctl.num_avg++; + if (rack->r_ctl.num_avg >= RACK_REQ_AVG) { + /* We have collected enought to move forward */ + rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_avg; + } + } else { + /* + * We want to take 1/wma of the goodput and add in to 7/8th + * of the old value weighted by the srtt. So if your measurement + * period is say 2 SRTT's long you would get 1/4 as the + * value, if it was like 1/2 SRTT then you would get 1/16th. + * + * But we must be careful not to take too much i.e. if the + * srtt is say 20ms and the measurement is taken over + * 400ms our weight would be 400/20 i.e. 20. On the + * other hand if we get a measurement over 1ms with a + * 10ms rtt we only want to take a much smaller portion. + */ + uint64_t resid_bw, subpart, addpart, srtt; + + srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); + if (srtt == 0) { + /* + * Strange why did t_srtt go back to zero? + */ + if (rack->r_ctl.rc_rack_min_rtt) + srtt = (rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC); + else + srtt = HPTS_USEC_IN_MSEC; + } + /* + * XXXrrs: Note for reviewers, in playing with + * dynamic pacing I discovered this GP calculation + * as done originally leads to some undesired results. + * Basically you can get longer measurements contributing + * too much to the WMA. Thus I changed it if you are doing + * dynamic adjustments to only do the aportioned adjustment + * if we have a very small (time wise) measurement. Longer + * measurements just get there weight (defaulting to 1/8) + * add to the WMA. We may want to think about changing + * this to always do that for both sides i.e. dynamic + * and non-dynamic... but considering lots of folks + * were playing with this I did not want to change the + * calculation per.se. without your thoughts.. Lawerence? + * Peter?? + */ + if (rack->rc_gp_dyn_mul == 0) { + subpart = rack->r_ctl.gp_bw * utim; + subpart /= (srtt * 8); + if (subpart < (rack->r_ctl.gp_bw / 2)) { + /* + * The b/w update takes no more + * away then 1/2 our running total + * so factor it in. + */ + addpart = bytes_ps * utim; + addpart /= (srtt * 8); + } else { + /* + * Don't allow a single measurement + * to account for more than 1/2 of the + * WMA. This could happen on a retransmission + * where utim becomes huge compared to + * srtt (multiple retransmissions when using + * the sending rate which factors in all the + * transmissions from the first one). + */ + subpart = rack->r_ctl.gp_bw / 2; + addpart = bytes_ps / 2; + } + resid_bw = rack->r_ctl.gp_bw - subpart; + rack->r_ctl.gp_bw = resid_bw + addpart; + } else { + if ((utim / srtt) <= 1) { + /* + * The b/w update was over a small period + * of time. The idea here is to prevent a small + * measurement time period from counting + * too much. So we scale it based on the + * time so it attributes less than 1/rack_wma_divisor + * of its measurement. + */ + subpart = rack->r_ctl.gp_bw * utim; + subpart /= (srtt * rack_wma_divisor); + addpart = bytes_ps * utim; + addpart /= (srtt * rack_wma_divisor); + } else { + /* + * The scaled measurement was long + * enough so lets just add in the + * portion of the measurment i.e. 1/rack_wma_divisor + */ + subpart = rack->r_ctl.gp_bw / rack_wma_divisor; + addpart = bytes_ps / rack_wma_divisor; + } + if ((rack->measure_saw_probe_rtt == 0) || + (bytes_ps > rack->r_ctl.gp_bw)) { + /* + * For probe-rtt we only add it in + * if its larger, all others we just + * add in. + */ + resid_bw = rack->r_ctl.gp_bw - subpart; + rack->r_ctl.gp_bw = resid_bw + addpart; + } + } + } + /* We do not update any multipliers if we are in or have seen a probe-rtt */ + if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) + rack_update_multiplier(rack, timely_says, bytes_ps, + rack->r_ctl.rc_gp_srtt, + rack->r_ctl.rc_rtt_diff); + rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, + rack_get_bw(rack), 3, line, NULL); + /* reset the gp srtt and setup the new prev */ + rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; + /* Record the lost count for the next measurement */ + rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; + /* + * We restart our diffs based on the gpsrtt in the + * measurement window. + */ + rack->rc_gp_rtt_set = 0; + rack->rc_gp_saw_rec = 0; + rack->rc_gp_saw_ca = 0; + rack->rc_gp_saw_ss = 0; + rack->rc_dragged_bottom = 0; +skip_measurement: + +#ifdef STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, + gput); + /* + * XXXLAS: This is a temporary hack, and should be + * chained off VOI_TCP_GPUT when stats(9) grows an + * API to deal with chained VOIs. + */ + if (tp->t_stats_gput_prev > 0) + stats_voi_update_abs_s32(tp->t_stats, + VOI_TCP_GPUT_ND, + ((gput - tp->t_stats_gput_prev) * 100) / + tp->t_stats_gput_prev); +#endif + tp->t_flags &= ~TF_GPUTINPROG; + tp->t_stats_gput_prev = gput; + /* + * Now are we app limited now and there is space from where we + * were to where we want to go? + * + * We don't do the other case i.e. non-applimited here since + * the next send will trigger us picking up the missing data. + */ + if (rack->r_ctl.rc_first_appl && + rack->r_ctl.rc_app_limited_cnt && + (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && + ((rack->r_ctl.rc_first_appl->r_start - th_ack) > + max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { + /* + * Yep there is enough outstanding to make a measurement here. + */ + struct rack_sendmap *rsm, fe; + + tp->t_flags |= TF_GPUTINPROG; + rack->r_ctl.rc_gp_lowrtt = 0xffffffff; + rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; + tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + rack->app_limited_needs_set = 0; + tp->gput_seq = th_ack; + if (rack->in_probe_rtt) + rack->measure_saw_probe_rtt = 1; + else if ((rack->measure_saw_probe_rtt) && + (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) + rack->measure_saw_probe_rtt = 0; + if ((rack->r_ctl.rc_first_appl->r_start - th_ack) >= rack_get_measure_window(tp, rack)) { + /* There is a full window to gain info from */ + tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); + } else { + /* We can only measure up to the applimited point */ + tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_start - th_ack); + } + /* + * Now we need to find the timestamp of the send at tp->gput_seq + * for the send based measurement. + */ + fe.r_start = tp->gput_seq; + rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); + if (rsm) { + /* Ok send-based limit is set */ + if (SEQ_LT(rsm->r_start, tp->gput_seq)) { + /* + * Move back to include the earlier part + * so our ack time lines up right (this may + * make an overlapping measurement but thats + * ok). + */ + tp->gput_seq = rsm->r_start; + } + if (rsm->r_flags & RACK_ACKED) + tp->gput_ts = rsm->r_ack_arrival; + else + rack->app_limited_needs_set = 1; + rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; + } else { + /* + * If we don't find the rsm due to some + * send-limit set the current time, which + * basically disables the send-limit. + */ + rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); + } + rack_log_pacing_delay_calc(rack, + tp->gput_seq, + tp->gput_ack, + (uint64_t)rsm, + tp->gput_ts, + rack->r_ctl.rc_app_limited_cnt, + 9, + __LINE__, NULL); + } +} + /* * CC wrapper hook functions */ @@ -1632,10 +3948,6 @@ static void rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery) { -#ifdef STATS - int32_t gput; -#endif - INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->nsegs = nsegs; tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); @@ -1647,63 +3959,41 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui tp->ccv->bytes_this_ack = max; } } - if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) || - (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) && - (tp->snd_cwnd < (ctf_flight_size(tp, rack->r_ctl.rc_sacked) * 2)))) + if (rack->r_ctl.cwnd_to_use <= tp->snd_wnd) tp->ccv->flags |= CCF_CWND_LIMITED; else tp->ccv->flags &= ~CCF_CWND_LIMITED; - - if (type == CC_ACK) { #ifdef STATS - stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, - ((int32_t) tp->snd_cwnd) - tp->snd_wnd); - if ((tp->t_flags & TF_GPUTINPROG) && - SEQ_GEQ(th->th_ack, tp->gput_ack)) { - gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) / - max(1, tcp_ts_getticks() - tp->gput_ts); - /* We store it in bytes per ms (or kbytes per sec) */ - rack->r_ctl.rc_gp_history[rack->r_ctl.rc_gp_hist_idx] = gput / 8; - rack->r_ctl.rc_gp_hist_idx++; - if (rack->r_ctl.rc_gp_hist_idx >= RACK_GP_HIST) - rack->r_ctl.rc_gp_hist_filled = 1; - rack->r_ctl.rc_gp_hist_idx %= RACK_GP_HIST; - stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, - gput); - /* - * XXXLAS: This is a temporary hack, and should be - * chained off VOI_TCP_GPUT when stats(9) grows an - * API to deal with chained VOIs. - */ - if (tp->t_stats_gput_prev > 0) - stats_voi_update_abs_s32(tp->t_stats, - VOI_TCP_GPUT_ND, - ((gput - tp->t_stats_gput_prev) * 100) / - tp->t_stats_gput_prev); - tp->t_flags &= ~TF_GPUTINPROG; - tp->t_stats_gput_prev = gput; + stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, + ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); +#endif + if ((tp->t_flags & TF_GPUTINPROG) && + rack_enough_for_measurement(tp, rack, th->th_ack)) { + /* Measure the Goodput */ + rack_do_goodput_measurement(tp, rack, th->th_ack, __LINE__); #ifdef NETFLIX_PEAKRATE - if (tp->t_maxpeakrate) { - /* - * We update t_peakrate_thr. This gives us roughly - * one update per round trip time. - */ - tcp_update_peakrate_thr(tp); - } -#endif + if ((type == CC_ACK) && + (tp->t_maxpeakrate)) { + /* + * We update t_peakrate_thr. This gives us roughly + * one update per round trip time. Note + * it will only be used if pace_always is off i.e + * we don't do this for paced flows. + */ + tcp_update_peakrate_thr(tp); } #endif - if (tp->snd_cwnd > tp->snd_ssthresh) { - tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, - nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); - if (tp->t_bytes_acked >= tp->snd_cwnd) { - tp->t_bytes_acked -= tp->snd_cwnd; - tp->ccv->flags |= CCF_ABC_SENTAWND; - } - } else { - tp->ccv->flags &= ~CCF_ABC_SENTAWND; - tp->t_bytes_acked = 0; + } + if (rack->r_ctl.cwnd_to_use > tp->snd_ssthresh) { + tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, + nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); + if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { + tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; + tp->ccv->flags |= CCF_ABC_SENTAWND; } + } else { + tp->ccv->flags &= ~CCF_ABC_SENTAWND; + tp->t_bytes_acked = 0; } if (CC_ALGO(tp)->ack_received != NULL) { /* XXXLAS: Find a way to live without this */ @@ -1711,15 +4001,19 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui CC_ALGO(tp)->ack_received(tp->ccv, type); } #ifdef STATS - stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); + stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); #endif - if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) { - rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd; + if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { + rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; } - /* we enforce max peak rate if it is set. */ - if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) { +#ifdef NETFLIX_PEAKRATE + /* we enforce max peak rate if it is set and we are not pacing */ + if ((rack->rc_always_pace == 0) && + tp->t_peakrate_thr && + (tp->snd_cwnd > tp->t_peakrate_thr)) { tp->snd_cwnd = tp->t_peakrate_thr; } +#endif } static void @@ -1729,50 +4023,49 @@ tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) rack = (struct tcp_rack *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); - if (rack->r_ctl.rc_prr_sndcnt > 0) - rack->r_wanted_output++; + /* + * If we are doing PRR and have enough + * room to send we are pacing and prr + * is disabled we will want to see if we + * can send data (by setting r_wanted_output to + * true). + */ + if ((rack->r_ctl.rc_prr_sndcnt > 0) || + rack->rack_no_prr) + rack->r_wanted_output = 1; } static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) { struct tcp_rack *rack; + uint32_t orig_cwnd; + + orig_cwnd = tp->snd_cwnd; INP_WLOCK_ASSERT(tp->t_inpcb); rack = (struct tcp_rack *)tp->t_fb_ptr; - if (CC_ALGO(tp)->post_recovery != NULL) { - tp->ccv->curack = th->th_ack; - CC_ALGO(tp)->post_recovery(tp->ccv); - } - /* - * Here we can in theory adjust cwnd to be based on the number of - * losses in the window (rack->r_ctl.rc_loss_count). This is done - * based on the rack_use_proportional flag. - */ - if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) { - int32_t reduce; - - reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate); - if (reduce > 50) { - reduce = 50; + if (rack->rc_not_backing_off == 0) { + /* only alert CC if we alerted when we entered */ + if (CC_ALGO(tp)->post_recovery != NULL) { + tp->ccv->curack = th->th_ack; + CC_ALGO(tp)->post_recovery(tp->ccv); } - tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100); - } else { if (tp->snd_cwnd > tp->snd_ssthresh) { /* Drop us down to the ssthresh (1/2 cwnd at loss) */ tp->snd_cwnd = tp->snd_ssthresh; } } - if (rack->r_ctl.rc_prr_sndcnt > 0) { + if ((rack->rack_no_prr == 0) && + (rack->r_ctl.rc_prr_sndcnt > 0)) { /* Suck the next prr cnt back into cwnd */ tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; rack->r_ctl.rc_prr_sndcnt = 0; - rack_log_to_prr(rack, 1); + rack_log_to_prr(rack, 1, 0); } + rack_log_to_prr(rack, 14, orig_cwnd); tp->snd_recover = tp->snd_una; EXIT_RECOVERY(tp->t_flags); - - } static void @@ -1788,12 +4081,12 @@ rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) tp->t_flags &= ~TF_WASFRECOVERY; tp->t_flags &= ~TF_WASCRECOVERY; if (!IN_FASTRECOVERY(tp->t_flags)) { - rack->r_ctl.rc_tlp_rtx_out = 0; rack->r_ctl.rc_prr_delivered = 0; rack->r_ctl.rc_prr_out = 0; - rack->r_ctl.rc_loss_count = 0; - rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); - rack_log_to_prr(rack, 2); + if (rack->rack_no_prr == 0) { + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 2, 0); + } rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; tp->snd_recover = tp->snd_max; if (tp->t_flags2 & TF2_ECN_PERMIT) @@ -1812,7 +4105,7 @@ rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); - tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / + tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); tp->snd_cwnd = ctf_fixed_maxseg(tp); if (tp->t_flags2 & TF2_ECN_PERMIT) @@ -1836,7 +4129,41 @@ rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) tp->t_badrxtwin = 0; break; } + /* + * If we are below our max rtt, don't + * signal the CC control to change things. + * instead set it up so that we are in + * recovery but not going to back off. + */ + if (rack->rc_highly_buffered) { + /* + * Do we use the higher rtt for + * our threshold to not backoff (like CDG)? + */ + uint32_t rtt_mul, rtt_div; + + if (rack_use_max_for_nobackoff) { + rtt_mul = (rack_gp_rtt_maxmul - 1); + rtt_div = 1; + } else { + rtt_mul = rack_gp_rtt_minmul; + rtt_div = max(rack_gp_rtt_mindiv , 1); + } + if (rack->r_ctl.rc_gp_srtt <= (rack->r_ctl.rc_lowest_us_rtt + + ((rack->r_ctl.rc_lowest_us_rtt * rtt_mul) / + rtt_div))) { + /* below our min threshold */ + rack->rc_not_backing_off = 1; + ENTER_RECOVERY(rack->rc_tp->t_flags); + rack_log_rtt_shrinks(rack, 0, + rtt_mul, + rtt_div, + RACK_RTTS_NOBACKOFF); + return; + } + } + rack->rc_not_backing_off = 0; if (CC_ALGO(tp)->cong_signal != NULL) { if (th != NULL) tp->ccv->curack = th->th_ack; @@ -1847,7 +4174,7 @@ rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) static inline void -rack_cc_after_idle(struct tcpcb *tp) +rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) { uint32_t i_cwnd; @@ -1864,7 +4191,7 @@ rack_cc_after_idle(struct tcpcb *tp) if (tp->snd_cwnd == 1) i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ else - i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp)); + i_cwnd = rc_init_window(rack); /* * Being idle is no differnt than the initial window. If the cc @@ -2014,7 +4341,7 @@ rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, { struct rack_sendmap *prsm; uint32_t thresh, len; - int maxseg; + int segsiz; if (srtt == 0) srtt = 1; @@ -2024,12 +4351,12 @@ rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, thresh = (srtt * 2); /* Get the previous sent packet, if any */ - maxseg = ctf_fixed_maxseg(tp); + segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); counter_u64_add(rack_enter_tlp_calc, 1); len = rsm->r_end - rsm->r_start; if (rack->rack_tlp_threshold_use == TLP_USE_ID) { /* Exactly like the ID */ - if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) { + if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { uint32_t alt_thresh; /* * Compensate for delayed-ack with the d-ack time. @@ -2042,7 +4369,7 @@ rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { /* 2.1 behavior */ prsm = TAILQ_PREV(rsm, rack_head, r_tnext); - if (prsm && (len <= maxseg)) { + if (prsm && (len <= segsiz)) { /* * Two packets outstanding, thresh should be (2*srtt) + * possible inter-packet delay (if any). @@ -2058,7 +4385,7 @@ rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; } thresh += inter_gap; - } else if (len <= maxseg) { + } else if (len <= segsiz) { /* * Possibly compensate for delayed-ack. */ @@ -2071,7 +4398,7 @@ rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, } } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { /* 2.2 behavior */ - if (len <= maxseg) { + if (len <= segsiz) { uint32_t alt_thresh; /* * Compensate for delayed-ack with the d-ack time. @@ -2144,16 +4471,22 @@ rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) idx = rsm->r_rtr_cnt - 1; srtt = rack_grab_rtt(tp, rack); thresh = rack_calc_thresh_rack(rack, srtt, tsused); - if (tsused < rsm->r_tim_lastsent[idx]) { + if (TSTMP_LT(tsused, rsm->r_tim_lastsent[idx])) { return (NULL); } if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { return (NULL); } - /* Ok if we reach here we are over-due */ - rack->r_ctl.rc_rsm_start = rsm->r_start; - rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; - rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; + /* Ok if we reach here we are over-due and this guy can be sent */ + if (IN_RECOVERY(tp->t_flags) == 0) { + /* + * For the one that enters us into recovery record undo + * info. + */ + rack->r_ctl.rc_rsm_start = rsm->r_start; + rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; + rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; + } rack_cong_signal(tp, NULL, CC_NDUPACK); return (rsm); } @@ -2198,6 +4531,7 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_ /* We can't start any timer in persists */ return (rack_get_persists_timer_val(tp, rack)); } + rack->rc_on_min_to = 0; if ((tp->t_state < TCPS_ESTABLISHED) || ((tp->t_flags & TF_SACK_PERMIT) == 0)) goto activate_rxt; @@ -2213,7 +4547,7 @@ activate_rxt: tstmp_touse = rsm->r_tim_lastsent[idx]; else tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; - if (TSTMP_GT(tstmp_touse, cts)) + if (TSTMP_GT(cts, tstmp_touse)) time_since_sent = cts - tstmp_touse; } if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { @@ -2257,8 +4591,9 @@ activate_rxt: */ goto activate_rxt; } - if ((rack->use_rack_cheat == 0) && + if ((rack->use_rack_rr == 0) && (IN_RECOVERY(tp->t_flags)) && + (rack->rack_no_prr == 0) && (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { /* * We are not cheating, in recovery and @@ -2278,18 +4613,22 @@ activate_rxt: to = exp - cts; if (to < rack->r_ctl.rc_min_to) { to = rack->r_ctl.rc_min_to; + if (rack->r_rr_config == 3) + rack->rc_on_min_to = 1; } } else { to = rack->r_ctl.rc_min_to; + if (rack->r_rr_config == 3) + rack->rc_on_min_to = 1; } } else { /* Ok we need to do a TLP not RACK */ activate_tlp: - if ((rack->rc_tlp_in_progress != 0) || - (rack->r_ctl.rc_tlp_rtx_out != 0)) { + if ((rack->rc_tlp_in_progress != 0) && + (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { /* - * The previous send was a TLP or a tlp_rtx is in - * process. + * The previous send was a TLP and we have sent + * N TLP's without sending new data. */ goto activate_rxt; } @@ -2309,7 +4648,7 @@ activate_tlp: tstmp_touse = rsm->r_tim_lastsent[idx]; else tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; - if (TSTMP_GT(tstmp_touse, cts)) + if (TSTMP_GT(cts, tstmp_touse)) time_since_sent = cts - tstmp_touse; is_tlp_timer = 1; if (tp->t_srtt) { @@ -2317,11 +4656,27 @@ activate_tlp: srtt = TICKS_2_MSEC(srtt_cur); } else srtt = RACK_INITIAL_RTO; + /* + * If the SRTT is not keeping up and the + * rack RTT has spiked we want to use + * the last RTT not the smoothed one. + */ + if (rack_tlp_use_greater && (srtt < rack_grab_rtt(tp, rack))) + srtt = rack_grab_rtt(tp, rack); thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); if (thresh > time_since_sent) to = thresh - time_since_sent; - else + else { to = rack->r_ctl.rc_min_to; + rack_log_alt_to_to_cancel(rack, + thresh, /* flex1 */ + time_since_sent, /* flex2 */ + tstmp_touse, /* flex3 */ + rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ + rsm->r_tim_lastsent[idx], + srtt, + idx, 99); + } if (to > TCPTV_REXMTMAX) { /* * If the TLP time works out to larger than the max @@ -2329,28 +4684,11 @@ activate_tlp: */ goto activate_rxt; } - if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) { - /* - * The tail is no longer the last one I did a probe - * on - */ - rack->r_ctl.rc_tlp_seg_send_cnt = 0; - rack->r_ctl.rc_last_tlp_seq = rsm->r_start; - } } if (is_tlp_timer == 0) { rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; } else { - if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) || - (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { - /* - * We have exceeded how many times we can retran the - * current TLP timer, switch to the RTO timer. - */ - goto activate_rxt; - } else { - rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; - } + rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; } if (to == 0) to = 1; @@ -2361,7 +4699,22 @@ static void rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { if (rack->rc_in_persist == 0) { - rack->r_ctl.rc_went_idle_time = cts; + if (tp->t_flags & TF_GPUTINPROG) { + /* + * Stop the goodput now, the calling of the + * measurement function clears the flag. + */ + rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__); + } +#ifdef NETFLIX_SHARED_CWND + if (rack->r_ctl.rc_scw) { + tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); + rack->rack_scwnd_is_idle = 1; + } +#endif + rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); + if (rack->r_ctl.rc_went_idle_time == 0) + rack->r_ctl.rc_went_idle_time = 1; rack_timer_cancel(tp, rack, cts, __LINE__); tp->t_rxtshift = 0; rack->rc_in_persist = 1; @@ -2369,63 +4722,184 @@ rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) } static void -rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack) +rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { if (rack->rc_inp->inp_in_hpts) { tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); rack->r_ctl.rc_hpts_flags = 0; } +#ifdef NETFLIX_SHARED_CWND + if (rack->r_ctl.rc_scw) { + tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); + rack->rack_scwnd_is_idle = 0; + } +#endif + if (rack->rc_gp_dyn_mul && + (rack->use_fixed_rate == 0) && + (rack->rc_always_pace)) { + /* + * Do we count this as if a probe-rtt just + * finished? + */ + uint32_t time_idle, idle_min; + + time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time; + idle_min = rack_min_probertt_hold; + if (rack_probertt_gpsrtt_cnt_div) { + uint64_t extra; + extra = (uint64_t)rack->r_ctl.rc_gp_srtt * + (uint64_t)rack_probertt_gpsrtt_cnt_mul; + extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; + idle_min += (uint32_t)extra; + } + if (time_idle >= idle_min) { + /* Yes, we count it as a probe-rtt. */ + uint32_t us_cts; + + us_cts = tcp_get_usecs(NULL); + if (rack->in_probe_rtt == 0) { + rack->r_ctl.rc_lower_rtt_us_cts = us_cts; + rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; + rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; + rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; + } else { + rack_exit_probertt(rack, us_cts); + } + } + + } rack->rc_in_persist = 0; rack->r_ctl.rc_went_idle_time = 0; - tp->t_flags &= ~TF_FORCEDATA; tp->t_rxtshift = 0; + rack->r_ctl.rc_agg_delayed = 0; + rack->r_early = 0; + rack->r_late = 0; + rack->r_ctl.rc_agg_early = 0; +} + +static void +rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, + struct hpts_diag *diag, struct timeval *tv) +{ + if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex1 = diag->p_nxt_slot; + log.u_bbr.flex2 = diag->p_cur_slot; + log.u_bbr.flex3 = diag->slot_req; + log.u_bbr.flex4 = diag->inp_hptsslot; + log.u_bbr.flex5 = diag->slot_remaining; + log.u_bbr.flex6 = diag->need_new_to; + log.u_bbr.flex7 = diag->p_hpts_active; + log.u_bbr.flex8 = diag->p_on_min_sleep; + /* Hijack other fields as needed */ + log.u_bbr.epoch = diag->have_slept; + log.u_bbr.lt_epoch = diag->yet_to_sleep; + log.u_bbr.pkts_out = diag->co_ret; + log.u_bbr.applimited = diag->hpts_sleep_time; + log.u_bbr.delivered = diag->p_prev_slot; + log.u_bbr.inflight = diag->p_runningtick; + log.u_bbr.bw_inuse = diag->wheel_tick; + log.u_bbr.rttProp = diag->wheel_cts; + log.u_bbr.timeStamp = cts; + log.u_bbr.delRate = diag->maxticks; + log.u_bbr.cur_del_rate = diag->p_curtick; + log.u_bbr.cur_del_rate <<= 32; + log.u_bbr.cur_del_rate |= diag->p_lasttick; + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_HPTSDIAG, 0, + 0, &log, false, tv); + } + } static void rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t slot, uint32_t tot_len_this_send, int sup_rack) { + struct hpts_diag diag; struct inpcb *inp; + struct timeval tv; uint32_t delayed_ack = 0; uint32_t hpts_timeout; uint8_t stopped; uint32_t left = 0; + uint32_t us_cts; inp = tp->t_inpcb; - if (inp->inp_in_hpts) { - /* A previous call is already set up */ - return; - } if ((tp->t_state == TCPS_CLOSED) || (tp->t_state == TCPS_LISTEN)) { return; } + if (inp->inp_in_hpts) { + /* Already on the pacer */ + return; + } stopped = rack->rc_tmr_stopped; if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { left = rack->r_ctl.rc_timer_exp - cts; } - rack->tlp_timer_up = 0; rack->r_ctl.rc_timer_exp = 0; - if (rack->rc_inp->inp_in_hpts == 0) { - rack->r_ctl.rc_hpts_flags = 0; + rack->r_ctl.rc_hpts_flags = 0; + us_cts = tcp_get_usecs(&tv); + /* Now early/late accounting */ + if (rack->r_early) { + /* + * We have a early carry over set, + * we can always add more time so we + * can always make this compensation. + */ + slot += rack->r_ctl.rc_agg_early; + rack->r_early = 0; + rack->r_ctl.rc_agg_early = 0; + } + if (rack->r_late) { + /* + * This is harder, we can + * compensate some but it + * really depends on what + * the current pacing time is. + */ + if (rack->r_ctl.rc_agg_delayed >= slot) { + /* + * We can't compensate for it all. + * And we have to have some time + * on the clock. We always have a min + * 10 slots (10 x 10 i.e. 100 usecs). + */ + if (slot <= HPTS_TICKS_PER_USEC) { + /* We gain delay */ + rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot); + slot = HPTS_TICKS_PER_USEC; + } else { + /* We take off some */ + rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC); + slot = HPTS_TICKS_PER_USEC; + } + } else { + + slot -= rack->r_ctl.rc_agg_delayed; + rack->r_ctl.rc_agg_delayed = 0; + /* Make sure we have 100 useconds at minimum */ + if (slot < HPTS_TICKS_PER_USEC) { + rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot; + slot = HPTS_TICKS_PER_USEC; + } + if (rack->r_ctl.rc_agg_delayed == 0) + rack->r_late = 0; + } } if (slot) { - /* We are hptsi too */ + /* We are pacing too */ rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; - } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { - /* - * We are still left on the hpts when the to goes - * it will be for output. - */ - if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) - slot = rack->r_ctl.rc_last_output_to - cts; - else - slot = 1; } hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); #ifdef NETFLIX_EXP_DETECTION if (rack->sack_attack_disable && - (slot < USEC_TO_MSEC(tcp_sad_pacing_interval))) { + (slot < tcp_sad_pacing_interval)) { /* * We have a potential attacker on * the line. We have possibly some @@ -2436,7 +4910,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * interval is longer). Its stored in * micro-seconds, so lets convert to msecs. */ - slot = USEC_TO_MSEC(tcp_sad_pacing_interval); + slot = tcp_sad_pacing_interval; } #endif if (tp->t_flags & TF_DELACK) { @@ -2469,6 +4943,16 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, hpts_timeout = TP_KEEPINIT(tp); } rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; + if (rack->in_probe_rtt) { + /* + * We want to instead not wake up a long time from + * now but to wake up about the time we would + * exit probe-rtt and initiate a keep-alive ack. + * This will get us out of probe-rtt and update + * our min-rtt. + */ + hpts_timeout = (rack_min_probertt_hold / HPTS_USEC_IN_MSEC); + } } } if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == @@ -2494,38 +4978,70 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, hpts_timeout = 0x7ffffffe; rack->r_ctl.rc_timer_exp = cts + hpts_timeout; } + if ((rack->rc_gp_filled == 0) && + (hpts_timeout < slot) && + (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { + /* + * We have no good estimate yet for the + * old clunky burst mitigation or the + * real pacing. And the tlp or rxt is smaller + * than the pacing calculation. Lets not + * pace that long since we know the calculation + * so far is not accurate. + */ + slot = hpts_timeout; + } + rack->r_ctl.last_pacing_time = slot; if (slot) { - rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY; - if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) - inp->inp_flags2 |= INP_DONT_SACK_QUEUE; - else - inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; - rack->r_ctl.rc_last_output_to = cts + slot; - if ((hpts_timeout == 0) || (hpts_timeout > slot)) { - if (rack->rc_inp->inp_in_hpts == 0) - tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot)); - rack_log_to_start(rack, cts, hpts_timeout, slot, 1); - } else { + rack->r_ctl.rc_last_output_to = us_cts + slot; + if (rack->rc_always_pace || rack->r_mbuf_queue) { + if ((rack->rc_gp_filled == 0) || + rack->pacing_longer_than_rtt) { + inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); + } else { + inp->inp_flags2 |= INP_MBUF_QUEUE_READY; + if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && + (rack->r_rr_config != 3)) + inp->inp_flags2 |= INP_DONT_SACK_QUEUE; + else + inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; + } + } + if ((rack->use_rack_rr) && + (rack->r_rr_config < 2) && + ((hpts_timeout) && ((hpts_timeout * HPTS_USEC_IN_MSEC) < slot))) { /* * Arrange for the hpts to kick back in after the * t-o if the t-o does not cause a send. */ - if (rack->rc_inp->inp_in_hpts == 0) - tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); + (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), + __LINE__, &diag); + rack_log_hpts_diag(rack, us_cts, &diag, &tv); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); + } else { + (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), + __LINE__, &diag); + rack_log_hpts_diag(rack, us_cts, &diag, &tv); + rack_log_to_start(rack, cts, hpts_timeout, slot, 1); } } else if (hpts_timeout) { - if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { - /* For a rack timer, don't wake us */ - rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY; - inp->inp_flags2 |= INP_DONT_SACK_QUEUE; - } else { - /* All other timers wake us up */ - rack->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; - inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; + if (rack->rc_always_pace || rack->r_mbuf_queue) { + if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { + /* For a rack timer, don't wake us */ + inp->inp_flags2 |= INP_MBUF_QUEUE_READY; + if (rack->r_rr_config != 3) + inp->inp_flags2 |= INP_DONT_SACK_QUEUE; + else + inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; + } else { + /* All other timers wake us up */ + inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; + inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; + } } - if (rack->rc_inp->inp_in_hpts == 0) - tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); + (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), + __LINE__, &diag); + rack_log_hpts_diag(rack, us_cts, &diag, &tv); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); } else { /* No timer starting */ @@ -2538,7 +5054,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, } rack->rc_tmr_stopped = 0; if (slot) - rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts); + rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv); } /* @@ -2559,59 +5075,69 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) * settings. */ struct rack_sendmap *rsm; - int32_t recovery, ll; + int32_t recovery; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } - if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { - /* Its not time yet */ - return (0); - } recovery = IN_RECOVERY(tp->t_flags); counter_u64_add(rack_to_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); + rack->rc_on_min_to = 0; rsm = rack_check_recovery_mode(tp, cts); - if (rsm) - ll = rsm->r_end - rsm->r_start; - else - ll = 0; - rack_log_to_event(rack, RACK_TO_FRM_RACK, ll); + rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); if (rsm) { uint32_t rtt; + rack->r_ctl.rc_resend = rsm; + if (rack->use_rack_rr) { + /* + * Don't accumulate extra pacing delay + * we are allowing the rack timer to + * over-ride pacing i.e. rrr takes precedence + * if the pacing interval is longer than the rrr + * time (in other words we get the min pacing + * time versus rrr pacing time). + */ + rack->r_timer_override = 1; + rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; + } rtt = rack->rc_rack_rtt; if (rtt == 0) rtt = 1; - if ((recovery == 0) && - (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { - /* - * The rack-timeout that enter's us into recovery - * will force out one MSS and set us up so that we - * can do one more send in 2*rtt (transitioning the - * rack timeout into a rack-tlp). - */ - rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); - rack_log_to_prr(rack, 3); - } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) && - rack->use_rack_cheat) { - /* - * When a rack timer goes, if the rack cheat is - * on, arrange it so we can send a full segment. - */ - rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); - rack_log_to_prr(rack, 4); + if (rack->rack_no_prr == 0) { + if ((recovery == 0) && + (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { + /* + * The rack-timeout that enter's us into recovery + * will force out one MSS and set us up so that we + * can do one more send in 2*rtt (transitioning the + * rack timeout into a rack-tlp). + */ + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack->r_timer_override = 1; + rack_log_to_prr(rack, 3, 0); + } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) && + rack->use_rack_rr) { + /* + * When a rack timer goes, if the rack rr is + * on, arrange it so we can send a full segment + * overriding prr (though we pay a price for this + * for future new sends). + */ + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 4, 0); + } } - } else { - /* This is a case that should happen rarely if ever */ - counter_u64_add(rack_tlp_does_nada, 1); -#ifdef TCP_BLACKBOX - tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); -#endif - rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; + if (rsm == NULL) { + /* restart a timer and return 1 */ + rack_start_hpts_timer(rack, tp, cts, + 0, 0, 0); + return (1); + } return (0); } @@ -2626,8 +5152,10 @@ rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_dupack = rsm->r_dupack; + nrsm->usec_orig_send = rsm->usec_orig_send; nrsm->r_rtr_bytes = 0; rsm->r_end = nrsm->r_start; + nrsm->r_just_ret = rsm->r_just_ret; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } @@ -2660,6 +5188,7 @@ rack_merge_rsm(struct tcp_rack *rack, TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); r_rsm->r_in_tmap = 0; } + /* Now the flags */ if (r_rsm->r_flags & RACK_HAS_FIN) l_rsm->r_flags |= RACK_HAS_FIN; @@ -2667,6 +5196,18 @@ rack_merge_rsm(struct tcp_rack *rack, l_rsm->r_flags |= RACK_TLP; if (r_rsm->r_flags & RACK_RWND_COLLAPSED) l_rsm->r_flags |= RACK_RWND_COLLAPSED; + if ((r_rsm->r_flags & RACK_APP_LIMITED) && + ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { + /* + * If both are app-limited then let the + * free lower the count. If right is app + * limited and left is not, transfer. + */ + l_rsm->r_flags |= RACK_APP_LIMITED; + r_rsm->r_flags &= ~RACK_APP_LIMITED; + if (r_rsm == rack->r_ctl.rc_first_appl) + rack->r_ctl.rc_first_appl = l_rsm; + } rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); #ifdef INVARIANTS if (rm != r_rsm) { @@ -2711,7 +5252,8 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) /* Its not time yet */ return (0); } - if (rack_progress_timeout_check(tp)) { + if (ctf_progress_timeout_check(tp, true)) { + rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); return (1); } @@ -2719,7 +5261,7 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) * A TLP timer has expired. We have been idle for 2 rtts. So we now * need to figure out how to force a full MSS segment out. */ - rack_log_to_event(rack, RACK_TO_FRM_TLP, 0); + rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); counter_u64_add(rack_tlp_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); @@ -2735,32 +5277,39 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) #endif avail = sbavail(&so->so_snd); out = tp->snd_max - tp->snd_una; - rack->tlp_timer_up = 1; if (out > tp->snd_wnd) { /* special case, we need a retransmission */ collapsed_win = 1; goto need_retran; } /* - * If we are in recovery we can jazz out a segment if new data is - * present simply by setting rc_prr_sndcnt to a segment. + * Check our send oldest always settings, and if + * there is an oldest to send jump to the need_retran. */ - if ((avail > out) && - ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { + if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) + goto need_retran; + + if (avail > out) { /* New data is available */ amm = avail - out; if (amm > ctf_fixed_maxseg(tp)) { amm = ctf_fixed_maxseg(tp); - } else if ((amm < ctf_fixed_maxseg(tp)) && ((tp->t_flags & TF_NODELAY) == 0)) { - /* not enough to fill a MTU and no-delay is off */ + if ((amm + out) > tp->snd_wnd) { + /* We are rwnd limited */ + goto need_retran; + } + } else if (amm < ctf_fixed_maxseg(tp)) { + /* not enough to fill a MTU */ goto need_retran; } if (IN_RECOVERY(tp->t_flags)) { /* Unlikely */ - old_prr_snd = rack->r_ctl.rc_prr_sndcnt; - if (out + amm <= tp->snd_wnd) { - rack->r_ctl.rc_prr_sndcnt = amm; - rack_log_to_prr(rack, 4); + if (rack->rack_no_prr == 0) { + old_prr_snd = rack->r_ctl.rc_prr_sndcnt; + if (out + amm <= tp->snd_wnd) { + rack->r_ctl.rc_prr_sndcnt = amm; + rack_log_to_prr(rack, 4, 0); + } } else goto need_retran; } else { @@ -2770,8 +5319,6 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) else goto need_retran; } - rack->r_ctl.rc_tlp_seg_send_cnt = 0; - rack->r_ctl.rc_last_tlp_seq = tp->snd_max; rack->r_ctl.rc_tlpsend = NULL; counter_u64_add(rack_tlp_newdata, 1); goto send; @@ -2853,48 +5400,11 @@ need_retran: rsm = nrsm; } rack->r_ctl.rc_tlpsend = rsm; - rack->r_ctl.rc_tlp_rtx_out = 1; - if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) { - rack->r_ctl.rc_tlp_seg_send_cnt++; - tp->t_rxtshift++; - } else { - rack->r_ctl.rc_last_tlp_seq = rsm->r_start; - rack->r_ctl.rc_tlp_seg_send_cnt = 1; - } send: - rack->r_ctl.rc_tlp_send_cnt++; - if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) { - /* - * Can't [re]/transmit a segment we have not heard from the - * peer in max times. We need the retransmit timer to take - * over. - */ - restore: - rack->r_ctl.rc_tlpsend = NULL; - if (rsm) - rsm->r_flags &= ~RACK_TLP; - rack->r_ctl.rc_prr_sndcnt = old_prr_snd; - rack_log_to_prr(rack, 5); - counter_u64_add(rack_tlp_retran_fail, 1); - goto out; - } else if (rsm) { - rsm->r_flags |= RACK_TLP; - } - if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) && - (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { - /* - * We don't want to send a single segment more than the max - * either. - */ - goto restore; - } rack->r_timer_override = 1; - rack->r_tlp_running = 1; - rack->rc_tlp_in_progress = 1; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); out: - rack->tlp_timer_up = 0; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); } @@ -2913,7 +5423,7 @@ rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } - rack_log_to_event(rack, RACK_TO_FRM_DELACK, 0); + rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; KMOD_TCPSTAT_INC(tcps_delack); @@ -2922,8 +5432,8 @@ rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) } /* - * Persists timer, here we simply need to setup the - * FORCE-DATA flag the output routine will send + * Persists timer, here we simply send the + * same thing as a keepalive will. * the one byte send. * * We only return 1, saying don't proceed, if all timers @@ -2943,7 +5453,9 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) } if (rack->rc_in_persist == 0) return (0); - if (rack_progress_timeout_check(tp)) { + if (ctf_progress_timeout_check(tp, false)) { + tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); + rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(inp, ETIMEDOUT); return (1); } @@ -2964,12 +5476,13 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { KMOD_TCPSTAT_INC(tcps_persistdrop); retval = 1; + tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); goto out; } if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && tp->snd_una == tp->snd_max) - rack_exit_persist(tp, rack); + rack_exit_persist(tp, rack, cts); rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; /* * If the user has closed the socket then drop a persisting @@ -2979,11 +5492,17 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { retval = 1; KMOD_TCPSTAT_INC(tcps_persistdrop); + tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); goto out; } t_template = tcpip_maketemplate(rack->rc_inp); if (t_template) { + /* only set it if we were answered */ + if (rack->forced_ack == 0) { + rack->forced_ack = 1; + rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); + } tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); @@ -2995,7 +5514,7 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; out: - rack_log_to_event(rack, RACK_TO_FRM_PERSIST, 0); + rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); return (retval); @@ -3018,7 +5537,7 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; inp = tp->t_inpcb; - rack_log_to_event(rack, RACK_TO_FRM_KEEP, 0); + rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); /* * Keep-alive timer went off; send something or drop connection if * idle for too long. @@ -3043,6 +5562,10 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) KMOD_TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { + if (rack->forced_ack == 0) { + rack->forced_ack = 1; + rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); + } tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); @@ -3053,6 +5576,7 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) return (1); dropit: KMOD_TCPSTAT_INC(tcps_keepdrops); + tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); return (1); } @@ -3074,7 +5598,7 @@ rack_remxt_tmr(struct tcpcb *tp) rack = (struct tcp_rack *)tp->t_fb_ptr; rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); - rack_log_to_event(rack, RACK_TO_FRM_TMR, 0); + rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); /* @@ -3101,19 +5625,48 @@ rack_remxt_tmr(struct tcpcb *tp) } } trsm = rsm; + if (rsm->r_flags & RACK_ACKED) + rsm->r_flags |= RACK_WAS_ACKED; rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); } /* Clear the count (we just un-acked them) */ rack->r_ctl.rc_sacked = 0; + rack->r_ctl.rc_agg_delayed = 0; + rack->r_early = 0; + rack->r_ctl.rc_agg_early = 0; + rack->r_late = 0; /* Clear the tlp rtx mark */ - rack->r_ctl.rc_tlp_rtx_out = 0; - rack->r_ctl.rc_tlp_seg_send_cnt = 0; rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); rack->r_ctl.rc_prr_sndcnt = 0; - rack_log_to_prr(rack, 6); + rack_log_to_prr(rack, 6, 0); rack->r_timer_override = 1; } +static void +rack_cc_conn_init(struct tcpcb *tp) +{ + struct tcp_rack *rack; + + + rack = (struct tcp_rack *)tp->t_fb_ptr; + cc_conn_init(tp); + /* + * We want a chance to stay in slowstart as + * we create a connection. TCP spec says that + * initially ssthresh is infinite. For our + * purposes that is the snd_wnd. + */ + if (tp->snd_ssthresh < tp->snd_wnd) { + tp->snd_ssthresh = tp->snd_wnd; + } + /* + * We also want to assure a IW worth of + * data can get inflight. + */ + if (rc_init_window(rack) < tp->snd_cwnd) + tp->snd_cwnd = rc_init_window(rack); +} + /* * Re-transmit timeout! If we drop the PCB we will return 1, otherwise * we will setup to retransmit the lowest seq number outstanding. @@ -3130,7 +5683,9 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } - if (rack_progress_timeout_check(tp)) { + if (ctf_progress_timeout_check(tp, false)) { + tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); + rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(inp, ETIMEDOUT); return (1); } @@ -3160,6 +5715,7 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) tp->t_rxtshift = TCP_MAXRXTSHIFT; KMOD_TCPSTAT_INC(tcps_timeoutdrop); retval = 1; + tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); tcp_set_inp_to_drop(rack->rc_inp, (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); goto out; @@ -3319,8 +5875,7 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } - if (rack_use_sack_filter) - sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); + sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); tp->snd_recover = tp->snd_max; tp->t_flags |= TF_ACKNOW; tp->t_rtttime = 0; @@ -3344,6 +5899,20 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8 return (0); return (1); } + if ((timers & PACE_TMR_RACK) && + rack->rc_on_min_to) { + /* + * For the rack timer when we + * are on a min-timeout (which means rrr_conf = 3) + * we don't want to check the timer. It may + * be going off for a pace and thats ok we + * want to send the retransmit (if its ready). + * + * If its on a normal rack timer (non-min) then + * we will check if its expired. + */ + goto skip_time_check; + } if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { uint32_t left; @@ -3353,6 +5922,12 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8 return (0); } if (hpts_calling == 0) { + /* + * A user send or queued mbuf (sack) has called us? We + * return 0 and let the pacing guards + * deal with it if they should or + * should not cause a send. + */ ret = -2; rack_log_to_processing(rack, cts, ret, 0); return (0); @@ -3365,9 +5940,9 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8 left = rack->r_ctl.rc_timer_exp - cts; tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); rack_log_to_processing(rack, cts, ret, left); - rack->rc_last_pto_set = 0; return (1); } +skip_time_check: rack->rc_tmr_stopped = 0; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; if (timers & PACE_TMR_DELACK) { @@ -3393,12 +5968,22 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) { + struct timeval tv; + uint32_t us_cts, flags_on_entry; uint8_t hpts_removed = 0; + + flags_on_entry = rack->r_ctl.rc_hpts_flags; + us_cts = tcp_get_usecs(&tv); if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && - TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { + ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || + ((tp->snd_max - tp->snd_una) == 0))) { tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); hpts_removed = 1; + /* If we were not delayed cancel out the flag. */ + if ((tp->snd_max - tp->snd_una) == 0) + rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; + rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); } if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; @@ -3412,9 +5997,10 @@ rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int lin tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); hpts_removed = 1; } - rack_log_to_cancel(rack, hpts_removed, line); rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); } + if (hpts_removed == 0) + rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); } static void @@ -3476,7 +6062,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; rsm->r_flags |= RACK_OVERMAX; } - if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) { + if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); } @@ -3577,7 +6163,7 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, - uint8_t pass, struct rack_sendmap *hintrsm) + uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts) { struct tcp_rack *rack; struct rack_sendmap *rsm, *nrsm, *insret, fe; @@ -3684,6 +6270,7 @@ again: rsm->r_tim_lastsent[0] = ts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; + rsm->usec_orig_send = us_cts; if (th_flags & TH_SYN) { /* The data space is one beyond snd_una */ rsm->r_start = seq_out + 1; @@ -3704,6 +6291,20 @@ again: #endif TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; + /* + * Special case detection, is there just a single + * packet outstanding when we are not in recovery? + * + * If this is true mark it so. + */ + if ((IN_RECOVERY(tp->t_flags) == 0) && + (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { + struct rack_sendmap *prsm; + + prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + if (prsm) + prsm->r_one_out_nr = 1; + } return; } /* @@ -3809,8 +6410,10 @@ refind: * Record one of the RTT updates from an ack into * our sample structure. */ + static void -tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt) +tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, + int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) { if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { @@ -3820,6 +6423,55 @@ tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt) (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { rack->r_ctl.rack_rs.rs_rtt_highest = rtt; } + if (rack->rc_tp->t_flags & TF_GPUTINPROG) { + if (us_rtt < rack->r_ctl.rc_gp_lowrtt) + rack->r_ctl.rc_gp_lowrtt = us_rtt; + if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) + rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; + } + if ((confidence == 1) && + ((rsm == NULL) || + (rsm->r_just_ret) || + (rsm->r_one_out_nr && + len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { + /* + * If the rsm had a just return + * hit it then we can't trust the + * rtt measurement for buffer deterimination + * Note that a confidence of 2, indicates + * SACK'd which overrides the r_just_ret or + * the r_one_out_nr. If it was a CUM-ACK and + * we had only two outstanding, but get an + * ack for only 1. Then that also lowers our + * confidence. + */ + confidence = 0; + } + if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || + (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { + if (rack->r_ctl.rack_rs.confidence == 0) { + /* + * We take anything with no current confidence + * saved. + */ + rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; + rack->r_ctl.rack_rs.confidence = confidence; + rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; + } else if (confidence || rack->r_ctl.rack_rs.confidence) { + /* + * Once we have a confident number, + * we can update it with a smaller + * value since this confident number + * may include the DSACK time until + * the next segment (the second one) arrived. + */ + rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; + rack->r_ctl.rack_rs.confidence = confidence; + rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; + } + + } + rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; rack->r_ctl.rack_rs.rs_rtt_tot += rtt; rack->r_ctl.rack_rs.rs_rtt_cnt++; @@ -3834,6 +6486,7 @@ tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) { int32_t delta; uint32_t o_srtt, o_var; + int32_t hrtt_up = 0; int32_t rtt; if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) @@ -3857,6 +6510,56 @@ tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) } if (rtt == 0) rtt = 1; + if (rack->rc_gp_rtt_set == 0) { + /* + * With no RTT we have to accept + * even one we are not confident of. + */ + rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; + rack->rc_gp_rtt_set = 1; + } else if (rack->r_ctl.rack_rs.confidence) { + /* update the running gp srtt */ + rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); + rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; + } + if (rack->r_ctl.rack_rs.confidence) { + /* + * record the low and high for highly buffered path computation, + * we only do this if we are confident (not a retransmission). + */ + if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { + rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; + hrtt_up = 1; + } + if (rack->rc_highly_buffered == 0) { + /* + * Currently once we declare a path has + * highly buffered there is no going + * back, which may be a problem... + */ + if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { + rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, + rack->r_ctl.rc_highest_us_rtt, + rack->r_ctl.rc_lowest_us_rtt, + RACK_RTTS_SEEHBP); + rack->rc_highly_buffered = 1; + } + } + } + if ((rack->r_ctl.rack_rs.confidence) || + (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { + /* + * If we are highly confident of it it was + * never retransmitted we accept it as the last us_rtt. + */ + rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; + /* The lowest rtt can be set if its was not retransmited */ + if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { + rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; + if (rack->r_ctl.rc_lowest_us_rtt == 0) + rack->r_ctl.rc_lowest_us_rtt = 1; + } + } rack_log_rtt_sample(rack, rtt); o_srtt = tp->t_srtt; o_var = tp->t_rttvar; @@ -3905,7 +6608,6 @@ tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } KMOD_TCPSTAT_INC(tcps_rttupdated); - rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var); tp->t_rttupdated++; #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); @@ -3980,30 +6682,78 @@ rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); } +static void +rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) +{ + /* + * Apply to filter the inbound us-rtt at us_cts. + */ + uint32_t old_rtt; + + old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); + apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, + us_rtt, us_cts); + if (rack->r_ctl.last_pacing_time && + rack->rc_gp_dyn_mul && + (rack->r_ctl.last_pacing_time > us_rtt)) + rack->pacing_longer_than_rtt = 1; + else + rack->pacing_longer_than_rtt = 0; + if (old_rtt > us_rtt) { + /* We just hit a new lower rtt time */ + rack_log_rtt_shrinks(rack, us_cts, old_rtt, + __LINE__, RACK_RTTS_NEWRTT); + /* + * Only count it if its lower than what we saw within our + * calculated range. + */ + if ((old_rtt - us_rtt) > rack_min_rtt_movement) { + if (rack_probertt_lower_within && + rack->rc_gp_dyn_mul && + (rack->use_fixed_rate == 0) && + (rack->rc_always_pace)) { + /* + * We are seeing a new lower rtt very close + * to the time that we would have entered probe-rtt. + * This is probably due to the fact that a peer flow + * has entered probe-rtt. Lets go in now too. + */ + uint32_t val; + + val = rack_probertt_lower_within * rack_time_between_probertt; + val /= 100; + if ((rack->in_probe_rtt == 0) && + ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { + rack_enter_probertt(rack, us_cts); + } + } + rack->r_ctl.rc_lower_rtt_us_cts = us_cts; + } + } +} static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type) + struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) { int32_t i; - uint32_t t; + uint32_t t, len_acked; - if (rsm->r_flags & RACK_ACKED) + if ((rsm->r_flags & RACK_ACKED) || + (rsm->r_flags & RACK_WAS_ACKED)) /* Already done */ return (0); + if (ack_type == CUM_ACKED) { + if (SEQ_GT(th_ack, rsm->r_end)) + len_acked = rsm->r_end - rsm->r_start; + else + len_acked = th_ack - rsm->r_start; + } else + len_acked = rsm->r_end - rsm->r_start; + if (rsm->r_rtr_cnt == 1) { + uint32_t us_rtt; - if ((rsm->r_rtr_cnt == 1) || - ((ack_type == CUM_ACKED) && - (to->to_flags & TOF_TS) && - (to->to_tsecr) && - (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr)) - ) { - /* - * We will only find a matching timestamp if its cum-acked. - * But if its only one retransmission its for-sure matching - * :-) - */ t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; if ((int)t <= 0) t = 1; @@ -4016,7 +6766,22 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, rack->r_ctl.rc_rack_min_rtt = 1; } } - tcp_rack_xmit_timer(rack, t + 1); + us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - rsm->usec_orig_send; + if (us_rtt == 0) + us_rtt = 1; + rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); + if (ack_type == SACKED) + tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); + else { + /* + * For cum-ack we are only confident if what + * is being acked is included in a measurement. + * Otherwise it could be an idle period that + * includes Delayed-ack time. + */ + tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, + (rack->app_limited_needs_set ? 0 : 1), rsm, rsm->r_rtr_cnt); + } if ((rsm->r_flags & RACK_TLP) && (!IN_RECOVERY(tp->t_flags))) { /* Segment was a TLP and our retrans matched */ @@ -4029,8 +6794,10 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, * When we enter recovery we need to assure * we send one packet. */ - rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); - rack_log_to_prr(rack, 7); + if (rack->rack_no_prr == 0) { + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 7, 0); + } } } if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { @@ -4072,24 +6839,14 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, rack->r_ctl.rc_rack_min_rtt = 1; } } - /* - * Note the following calls to - * tcp_rack_xmit_timer() are being commented - * out for now. They give us no more accuracy - * and often lead to a wrong choice. We have - * enough samples that have not been - * retransmitted. I leave the commented out - * code in here in case in the future we - * decide to add it back (though I can't forsee - * doing that). That way we will easily see - * where they need to be placed. - */ if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; rack->rc_rack_rtt = t; } + tcp_rack_xmit_timer(rack, t + 1, len_acked, (t * HPTS_USEC_IN_MSEC), 0, rsm, + rsm->r_rtr_cnt); return (1); } } @@ -4176,6 +6933,123 @@ rack_log_sack_passed(struct tcpcb *tp, } } +static void +rack_need_set_test(struct tcpcb *tp, + struct tcp_rack *rack, + struct rack_sendmap *rsm, + tcp_seq th_ack, + int line, + int use_which) +{ + + if ((tp->t_flags & TF_GPUTINPROG) && + SEQ_GEQ(rsm->r_end, tp->gput_seq)) { + /* + * We were app limited, and this ack + * butts up or goes beyond the point where we want + * to start our next measurement. We need + * to record the new gput_ts as here and + * possibly update the start sequence. + */ + uint32_t seq, ts; + + if (rsm->r_rtr_cnt > 1) { + /* + * This is a retransmit, can we + * really make any assessment at this + * point? We are not really sure of + * the timestamp, is it this or the + * previous transmission? + * + * Lets wait for something better that + * is not retransmitted. + */ + return; + } + seq = tp->gput_seq; + ts = tp->gput_ts; + rack->app_limited_needs_set = 0; + tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + /* Do we start at a new end? */ + if ((use_which == RACK_USE_BEG) && + SEQ_GEQ(rsm->r_start, tp->gput_seq)) { + /* + * When we get an ACK that just eats + * up some of the rsm, we set RACK_USE_BEG + * since whats at r_start (i.e. th_ack) + * is left unacked and thats where the + * measurement not starts. + */ + tp->gput_seq = rsm->r_start; + rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; + } + if ((use_which == RACK_USE_END) && + SEQ_GEQ(rsm->r_end, tp->gput_seq)) { + /* + * We use the end when the cumack + * is moving forward and completely + * deleting the rsm passed so basically + * r_end holds th_ack. + * + * For SACK's we also want to use the end + * since this piece just got sacked and + * we want to target anything after that + * in our measurement. + */ + tp->gput_seq = rsm->r_end; + rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; + } + if (use_which == RACK_USE_END_OR_THACK) { + /* + * special case for ack moving forward, + * not a sack, we need to move all the + * way up to where this ack cum-ack moves + * to. + */ + if (SEQ_GT(th_ack, rsm->r_end)) + tp->gput_seq = th_ack; + else + tp->gput_seq = rsm->r_end; + rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; + } + if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { + /* + * We moved beyond this guy's range, re-calculate + * the new end point. + */ + if (rack->rc_gp_filled == 0) { + tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); + } else { + tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); + } + } + /* + * We are moving the goal post, we may be able to clear the + * measure_saw_probe_rtt flag. + */ + if ((rack->in_probe_rtt == 0) && + (rack->measure_saw_probe_rtt) && + (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) + rack->measure_saw_probe_rtt = 0; + rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, + seq, tp->gput_seq, 0, 5, line, NULL); + if (rack->rc_gp_filled && + ((tp->gput_ack - tp->gput_seq) < + max(rc_init_window(rack), (MIN_GP_WIN * + ctf_fixed_maxseg(tp))))) { + /* + * There is no sense of continuing this measurement + * because its too small to gain us anything we + * trust. Skip it and that way we can start a new + * measurement quicker. + */ + rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, + 0, 0, 0, 6, __LINE__, NULL); + tp->t_flags &= ~TF_GPUTINPROG; + } + } +} + static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) @@ -4253,11 +7127,14 @@ do_rest_ofb: next->r_start = start; /* Clear out the dup ack count of the remainder */ rsm->r_dupack = 0; + rsm->r_just_ret = 0; rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); /* Now lets make sure our fudge block is right */ nrsm->r_start = start; /* Now lets update all the stats and such */ - rack_update_rtt(tp, rack, nrsm, to, cts, SACKED); + rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); + if (rack->app_limited_needs_set) + rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); changed += (nrsm->r_end - nrsm->r_start); rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); if (nrsm->r_flags & RACK_SACK_PASSED) { @@ -4323,6 +7200,7 @@ do_rest_ofb: } counter_u64_add(rack_sack_splits, 1); rack_clone_rsm(rack, nrsm, rsm, start); + rsm->r_just_ret = 0; insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); #ifdef INVARIANTS if (insret != NULL) { @@ -4373,10 +7251,8 @@ do_rest_ofb: * * end |---------| */ - if (rsm->r_flags & RACK_TLP) - rack->r_ctl.rc_tlp_rtx_out = 0; if ((rsm->r_flags & RACK_ACKED) == 0) { - rack_update_rtt(tp, rack, rsm, to, cts, SACKED); + rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); changed += (rsm->r_end - rsm->r_start); rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); if (rsm->r_in_tmap) /* should be true */ @@ -4387,6 +7263,9 @@ do_rest_ofb: counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } + if (rack->app_limited_needs_set) + rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); + rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); rsm->r_flags |= RACK_ACKED; rsm->r_flags &= ~RACK_TLP; if (rsm->r_in_tmap) { @@ -4452,7 +7331,9 @@ do_rest_ofb: * to prev). Update the rtt and changed * based on that. Also check for reordering. */ - rack_update_rtt(tp, rack, nrsm, to, cts, SACKED); + rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); + if (rack->app_limited_needs_set) + rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); changed += (nrsm->r_end - nrsm->r_start); rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); if (nrsm->r_flags & RACK_SACK_PASSED) { @@ -4493,6 +7374,7 @@ do_rest_ofb: counter_u64_add(rack_sack_splits, 1); rack_clone_rsm(rack, nrsm, rsm, end); rsm->r_flags &= (~RACK_HAS_FIN); + rsm->r_just_ret = 0; insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); #ifdef INVARIANTS if (insret != NULL) { @@ -4506,9 +7388,7 @@ do_rest_ofb: } nrsm->r_dupack = 0; rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); - if (rsm->r_flags & RACK_TLP) - rack->r_ctl.rc_tlp_rtx_out = 0; - rack_update_rtt(tp, rack, rsm, to, cts, SACKED); + rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); changed += (rsm->r_end - rsm->r_start); rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); if (rsm->r_in_tmap) /* should be true */ @@ -4519,6 +7399,9 @@ do_rest_ofb: counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } + if (rack->app_limited_needs_set) + rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); + rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); rsm->r_flags |= RACK_ACKED; rsm->r_flags &= ~RACK_TLP; if (rsm->r_in_tmap) { @@ -4607,15 +7490,13 @@ rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ac * Now lets possibly clear the sack filter so we start * recognizing sacks that cover this area. */ - if (rack_use_sack_filter) - sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); + sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); } static void rack_do_decay(struct tcp_rack *rack) { -#ifdef NETFLIX_EXP_DETECTION struct timeval res; #define timersub(tvp, uvp, vvp) \ @@ -4628,7 +7509,7 @@ rack_do_decay(struct tcp_rack *rack) } \ } while (0) - timersub(&rack->r_ctl.rc_last_ack, &rack->r_ctl.rc_last_time_decay, &res); + timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); #undef timersub rack->r_ctl.input_pkt++; @@ -4645,8 +7526,9 @@ rack_do_decay(struct tcp_rack *rack) pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; /* Update our saved tracking values */ rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; - rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack; + rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; /* Now do we escape without decay? */ +#ifdef NETFLIX_EXP_DETECTION if (rack->rc_in_persist || (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || (pkt_delta < tcp_sad_low_pps)){ @@ -4665,8 +7547,8 @@ rack_do_decay(struct tcp_rack *rack) tcp_sad_decay_val); rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, tcp_sad_decay_val); - } #endif + } } static void @@ -4680,6 +7562,8 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) int32_t i, j, k, num_sack_blks = 0; uint32_t cts, acked, ack_point, sack_changed = 0; int loop_start = 0, moved_two = 0; + uint32_t tsused; + INP_WLOCK_ASSERT(tp->t_inpcb); if (th->th_flags & TH_RST) { @@ -4725,8 +7609,8 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) * the packets in the rack log and calculate any eligble * RTT's. */ - rack->r_wanted_output++; - more: + rack->r_wanted_output = 1; +more: rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); if (rsm == NULL) { if ((th_ack - 1) == tp->iss) { @@ -4759,16 +7643,17 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) #endif goto proc_sack; } - rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED); + rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); /* Now do we consume the whole thing? */ if (SEQ_GEQ(th_ack, rsm->r_end)) { /* Its all consumed. */ uint32_t left; + uint8_t newly_acked; rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; - if (rsm->r_flags & RACK_TLP) - rack->r_ctl.rc_tlp_rtx_out = 0; + /* Record the time of highest cumack sent */ + rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); #ifdef INVARIANTS if (rm != rsm) { @@ -4780,12 +7665,14 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } + newly_acked = 1; if (rsm->r_flags & RACK_ACKED) { /* * It was acked on the scoreboard -- remove * it from total */ rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); + newly_acked = 0; } else if (rsm->r_flags & RACK_SACK_PASSED) { /* * There are segments ACKED on the @@ -4794,22 +7681,13 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) */ rsm->r_flags &= ~RACK_SACK_PASSED; counter_u64_add(rack_reorder_seen, 1); + rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); rsm->r_flags |= RACK_ACKED; rack->r_ctl.rc_reorder_ts = cts; } left = th_ack - rsm->r_end; - if (rsm->r_rtr_cnt > 1) { - /* - * Technically we should make r_rtr_cnt be - * monotonicly increasing and just mod it to - * the timestamp it is replacing.. that way - * we would have the last 3 retransmits. Now - * rc_loss_count will be wrong if we - * retransmit something more than 2 times in - * recovery :( - */ - rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1); - } + if (rack->app_limited_needs_set && newly_acked) + rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); /* Free back to zone */ rack_free(rack, rsm); if (left) { @@ -4843,8 +7721,15 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) rsm->r_rtr_bytes -= ack_am; } } - /* Update where the piece starts */ + /* + * Update where the piece starts and record + * the time of send of highest cumack sent. + */ + rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; rsm->r_start = th_ack; + if (rack->app_limited_needs_set) + rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); + } proc_sack: /* Check for reneging */ @@ -4902,11 +7787,9 @@ proc_sack: * Sort the SACK blocks so we can update the rack scoreboard with * just one pass. */ - if (rack_use_sack_filter) { - num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, - num_sack_blks, th->th_ack); - ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); - } + num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, + num_sack_blks, th->th_ack); + ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); if (num_sack_blks == 0) { /* Nothing to sack (DSACKs?) */ goto out_with_totals; @@ -4980,7 +7863,7 @@ do_sack_work: */ acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); if (acked) { - rack->r_wanted_output++; + rack->r_wanted_output = 1; changed += acked; sack_changed += acked; } @@ -5044,7 +7927,7 @@ do_sack_work: for (i = loop_start; i < num_sack_blks; i++) { acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); if (acked) { - rack->r_wanted_output++; + rack->r_wanted_output = 1; changed += acked; sack_changed += acked; } @@ -5193,34 +8076,42 @@ out: /* Something changed cancel the rack timer */ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); } - if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { + tsused = tcp_ts_getticks(); + rsm = tcp_rack_output(tp, rack, tsused); + if ((!IN_RECOVERY(tp->t_flags)) && + rsm) { + /* Enter recovery */ + rack->r_ctl.rc_rsm_start = rsm->r_start; + rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; + rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; + entered_recovery = 1; + rack_cong_signal(tp, NULL, CC_NDUPACK); /* - * Ok we have a high probability that we need to go in to - * recovery since we have data sack'd + * When we enter recovery we need to assure we send + * one packet. */ - struct rack_sendmap *rsm; - uint32_t tsused; - - tsused = tcp_ts_getticks(); - rsm = tcp_rack_output(tp, rack, tsused); - if (rsm) { - /* Enter recovery */ - rack->r_ctl.rc_rsm_start = rsm->r_start; - rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; - rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; - entered_recovery = 1; - rack_cong_signal(tp, NULL, CC_NDUPACK); - /* - * When we enter recovery we need to assure we send - * one packet. - */ + if (rack->rack_no_prr == 0) { rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); - rack_log_to_prr(rack, 8); - rack->r_timer_override = 1; + rack_log_to_prr(rack, 8, 0); } + rack->r_timer_override = 1; + rack->r_early = 0; + rack->r_ctl.rc_agg_early = 0; + } else if (IN_RECOVERY(tp->t_flags) && + rsm && + (rack->r_rr_config == 3)) { + /* + * Assure we can output and we get no + * remembered pace time except the retransmit. + */ + rack->r_timer_override = 1; + rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; + rack->r_ctl.rc_resend = rsm; } - if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { - /* Deal with changed and PRR here (in recovery only) */ + if (IN_RECOVERY(tp->t_flags) && + (rack->rack_no_prr == 0) && + (entered_recovery == 0)) { + /* Deal with PRR here (in recovery only) */ uint32_t pipe, snd_una; rack->r_ctl.rc_prr_delivered += changed; @@ -5239,7 +8130,7 @@ out: sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; else { rack->r_ctl.rc_prr_sndcnt = 0; - rack_log_to_prr(rack, 9); + rack_log_to_prr(rack, 9, 0); sndcnt = 0; } sndcnt++; @@ -5248,7 +8139,7 @@ out: else sndcnt = 0; rack->r_ctl.rc_prr_sndcnt = sndcnt; - rack_log_to_prr(rack, 10); + rack_log_to_prr(rack, 10, 0); } else { uint32_t limit; @@ -5261,13 +8152,21 @@ out: limit += ctf_fixed_maxseg(tp); if (tp->snd_ssthresh > pipe) { rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); - rack_log_to_prr(rack, 11); + rack_log_to_prr(rack, 11, 0); } else { rack->r_ctl.rc_prr_sndcnt = min(0, limit); - rack_log_to_prr(rack, 12); + rack_log_to_prr(rack, 12, 0); } } - if (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) { + if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && + ((rack->rc_inp->inp_in_hpts == 0) && + ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { + /* + * If you are pacing output you don't want + * to override. + */ + rack->r_early = 0; + rack->r_ctl.rc_agg_early = 0; rack->r_timer_override = 1; } } @@ -5290,6 +8189,114 @@ rack_strike_dupack(struct tcp_rack *rack) } } +static void +rack_check_bottom_drag(struct tcpcb *tp, + struct tcp_rack *rack, + struct socket *so, int32_t acked) +{ + uint32_t segsiz, minseg; + + segsiz = ctf_fixed_maxseg(tp); + if (so->so_snd.sb_flags & SB_TLS_IFNET) { + minseg = rack->r_ctl.rc_pace_min_segs; + } else { + minseg = segsiz; + } + if (tp->snd_max == tp->snd_una) { + /* + * We are doing dynamic pacing and we are way + * under. Basically everything got acked while + * we were still waiting on the pacer to expire. + * + * This means we need to boost the b/w in + * addition to any earlier boosting of + * the multipler. + */ + rack->rc_dragged_bottom = 1; + rack_validate_multipliers_at_or_above100(rack); + /* + * Lets use the segment bytes acked plus + * the lowest RTT seen as the basis to + * form a b/w estimate. This will be off + * due to the fact that the true estimate + * should be around 1/2 the time of the RTT + * but we can settle for that. + */ + if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && + acked) { + uint64_t bw, calc_bw, rtt; + + rtt = rack->r_ctl.rack_rs.rs_us_rtt; + bw = acked; + calc_bw = bw * 1000000; + calc_bw /= rtt; + if (rack->r_ctl.last_max_bw && + (rack->r_ctl.last_max_bw < calc_bw)) { + /* + * If we have a last calculated max bw + * enforce it. + */ + calc_bw = rack->r_ctl.last_max_bw; + } + /* now plop it in */ + if (rack->rc_gp_filled == 0) { + if (calc_bw > ONE_POINT_TWO_MEG) { + /* + * If we have no measurement + * don't let us set in more than + * 1.2Mbps. If we are still too + * low after pacing with this we + * will hopefully have a max b/w + * available to sanity check things. + */ + calc_bw = ONE_POINT_TWO_MEG; + } + rack->r_ctl.rc_rtt_diff = 0; + rack->r_ctl.gp_bw = calc_bw; + rack->rc_gp_filled = 1; + rack->r_ctl.num_avg = RACK_REQ_AVG; + rack_set_pace_segments(rack->rc_tp, rack, __LINE__); + } else if (calc_bw > rack->r_ctl.gp_bw) { + rack->r_ctl.rc_rtt_diff = 0; + rack->r_ctl.num_avg = RACK_REQ_AVG; + rack->r_ctl.gp_bw = calc_bw; + rack_set_pace_segments(rack->rc_tp, rack, __LINE__); + } else + rack_increase_bw_mul(rack, -1, 0, 0, 1); + /* + * For acks over 1mss we do a extra boost to simulate + * where we would get 2 acks (we want 110 for the mul). + */ + if (acked > segsiz) + rack_increase_bw_mul(rack, -1, 0, 0, 1); + } else { + /* + * Huh, this should not be, settle + * for just an old increase. + */ + rack_increase_bw_mul(rack, -1, 0, 0, 1); + } + } else if ((IN_RECOVERY(tp->t_flags) == 0) && + (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), + minseg)) && + (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && + (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && + (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= + (segsiz * rack_req_segs))) { + /* + * We are doing dynamic GP pacing and + * we have everything except 1MSS or less + * bytes left out. We are still pacing away. + * And there is data that could be sent, This + * means we are inserting delayed ack time in + * our measurements because we are pacing too slow. + */ + rack_validate_multipliers_at_or_above100(rack); + rack->rc_dragged_bottom = 1; + rack_increase_bw_mul(rack, -1, 0, 0, 1); + } +} + /* * Return value of 1, we do not need to call rack_process_data(). * return value of 0, rack_process_data can be called. @@ -5307,14 +8314,19 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, int32_t acked; struct mbuf *mfree; struct tcp_rack *rack; + int32_t under_pacing = 0; int32_t recovery = 0; rack = (struct tcp_rack *)tp->t_fb_ptr; if (SEQ_GT(th->th_ack, tp->snd_max)) { ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); - rack->r_wanted_output++; + rack->r_wanted_output = 1; return (1); } + if (rack->rc_gp_filled && + (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { + under_pacing = 1; + } if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { if (rack->rc_in_persist) tp->t_rxtshift = 0; @@ -5358,7 +8370,6 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, acked = BYTES_THIS_ACK(tp, th); KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); - /* * If we just performed our first retransmit, and the ACK arrives * within our recovery window, then it was a mistake to do the @@ -5371,6 +8382,21 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, (int)(ticks - tp->t_badrxtwin) < 0) rack_cong_signal(tp, th, CC_RTO_ERR); } + if (acked) { + /* assure we are not backed off */ + tp->t_rxtshift = 0; + rack->rc_tlp_in_progress = 0; + rack->r_ctl.rc_tlp_cnt_out = 0; + /* + * If it is the RXT timer we want to + * stop it, so we can restart a TLP. + */ + if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); +#ifdef NETFLIX_HTTP_LOGGING + tcp_http_check_for_comp(rack->rc_tp, th->th_ack); +#endif + } /* * If we have a timestamp reply, update smoothed round trip time. If * no timestamp is present but transmit timer is running and timed @@ -5389,10 +8415,6 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, * data to be acked, restart retransmit timer, using current * (possibly backed-off) value. */ - if (th->th_ack == tp->snd_max) { - rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); - rack->r_wanted_output++; - } if (acked == 0) { if (ofia) *ofia = ourfinisacked; @@ -5421,7 +8443,14 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, mfree = sbcut_locked(&so->so_snd, acked_amount); if ((sbused(&so->so_snd) == 0) && (acked > acked_amount) && - (tp->t_state >= TCPS_FIN_WAIT_1)) { + (tp->t_state >= TCPS_FIN_WAIT_1) && + (tp->t_flags & TF_SENTFIN)) { + /* + * We must be sure our fin + * was sent and acked (we can be + * in FIN_WAIT_1 without having + * sent the fin). + */ ourfinisacked = 1; } /* NB: sowwakeup_locked() does an implicit unlock. */ @@ -5444,16 +8473,26 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { tp->snd_nxt = tp->snd_una; } + if (under_pacing && + (rack->use_fixed_rate == 0) && + (rack->in_probe_rtt == 0) && + rack->rc_gp_dyn_mul && + rack->rc_always_pace) { + /* Check if we are dragging bottom */ + rack_check_bottom_drag(tp, rack, so, acked); + } if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ + rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); + if (rack->r_ctl.rc_went_idle_time == 0) + rack->r_ctl.rc_went_idle_time = 1; rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) tp->t_acktime = 0; rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); /* Set need output so persist might get set */ - rack->r_wanted_output++; - if (rack_use_sack_filter) - sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); + rack->r_wanted_output = 1; + sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); if ((tp->t_state >= TCPS_FIN_WAIT_1) && (sbavail(&so->so_snd) == 0) && (tp->t_flags2 & TF2_DROP_AF_DATA)) { @@ -5463,9 +8502,12 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, * reset him. */ *ret_val = 1; + /* tcp_close will kill the inp pre-log the Reset */ + tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); return (1); + } } if (ofia) @@ -5494,10 +8536,8 @@ rack_collapsed_window(struct tcp_rack *rack) */ struct rack_sendmap *rsm, *nrsm, fe, *insret; tcp_seq max_seq; - uint32_t maxseg; max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; - maxseg = ctf_fixed_maxseg(rack->rc_tp); memset(&fe, 0, sizeof(fe)); fe.r_start = max_seq; /* Find the first seq past or at maxseq */ @@ -5559,6 +8599,31 @@ rack_un_collapse_window(struct tcp_rack *rack) rack->rc_has_collapsed = 0; } +static void +rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, + int32_t tlen, int32_t tfo_syn) +{ + if (DELAY_ACK(tp, tlen) || tfo_syn) { + if (rack->rc_dack_mode && + (tlen > 500) && + (rack->rc_dack_toggle == 1)) { + goto no_delayed_ack; + } + rack_timer_cancel(tp, rack, + rack->r_ctl.rc_rcvtime, __LINE__); + tp->t_flags |= TF_DELACK; + } else { +no_delayed_ack: + rack->r_wanted_output = 1; + tp->t_flags |= TF_ACKNOW; + if (rack->rc_dack_mode) { + if (tp->t_flags & TF_DELACK) + rack->rc_dack_toggle = 1; + else + rack->rc_dack_toggle = 0; + } + } +} /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still @@ -5593,7 +8658,7 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; - rack->r_wanted_output++; + rack->r_wanted_output = 1; } else if (thflags & TH_ACK) { if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { tp->snd_wnd = tiwin; @@ -5610,10 +8675,10 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs))) { - rack_exit_persist(tp, rack); + rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); tp->snd_nxt = tp->snd_max; /* Make sure we output to start the timer */ - rack->r_wanted_output++; + rack->r_wanted_output = 1; } /* Do we enter persists? */ if ((rack->rc_in_persist == 0) && @@ -5636,66 +8701,10 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, return (0); } /* - * Process segments with URG. + * don't process the URG bit, ignore them drag + * along the up. */ - if ((thflags & TH_URG) && th->th_urp && - TCPS_HAVERCVDFIN(tp->t_state) == 0) { - /* - * This is a kludge, but if we receive and accept random - * urgent pointers, we'll crash in soreceive. It's hard to - * imagine someone actually wanting to send this much urgent - * data. - */ - SOCKBUF_LOCK(&so->so_rcv); - if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { - th->th_urp = 0; /* XXX */ - thflags &= ~TH_URG; /* XXX */ - SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ - goto dodata; /* XXX */ - } - /* - * If this segment advances the known urgent pointer, then - * mark the data stream. This should not happen in - * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a - * FIN has been received from the remote side. In these - * states we ignore the URG. - * - * According to RFC961 (Assigned Protocols), the urgent - * pointer points to the last octet of urgent data. We - * continue, however, to consider it to indicate the first - * octet of data past the urgent section as the original - * spec states (in one of two places). - */ - if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { - tp->rcv_up = th->th_seq + th->th_urp; - so->so_oobmark = sbavail(&so->so_rcv) + - (tp->rcv_up - tp->rcv_nxt) - 1; - if (so->so_oobmark == 0) - so->so_rcv.sb_state |= SBS_RCVATMARK; - sohasoutofband(so); - tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); - } - SOCKBUF_UNLOCK(&so->so_rcv); - /* - * Remove out of band data so doesn't get presented to user. - * This can happen independent of advancing the URG pointer, - * but if two URG's are pending at once, some out-of-band - * data may creep in... ick. - */ - if (th->th_urp <= (uint32_t) tlen && - !(so->so_options & SO_OOBINLINE)) { - /* hdr drop is delayed */ - tcp_pulloutofband(so, th, m, drop_hdrlen); - } - } else { - /* - * If no out of band data is expected, pull receive urgent - * pointer along with the receive window. - */ - if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) - tp->rcv_up = tp->rcv_nxt; - } -dodata: /* XXX */ + tp->rcv_up = tp->rcv_nxt; INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -5744,14 +8753,7 @@ dodata: /* XXX */ } } #endif - if (DELAY_ACK(tp, tlen) || tfo_syn) { - rack_timer_cancel(tp, rack, - rack->r_ctl.rc_rcvtime, __LINE__); - tp->t_flags |= TF_DELACK; - } else { - rack->r_wanted_output++; - tp->t_flags |= TF_ACKNOW; - } + rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); @@ -5782,37 +8784,42 @@ dodata: /* XXX */ thflags = tcp_reass(tp, th, &temp, &tlen, m); tp->t_flags |= TF_ACKNOW; } - if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { - if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { - /* - * DSACK actually handled in the fastpath - * above. - */ - tcp_update_sack_list(tp, save_start, - save_start + save_tlen); - } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { - if ((tp->rcv_numsacks >= 1) && - (tp->sackblks[0].end == save_start)) { - /* - * Partial overlap, recorded at todrop - * above. - */ - tcp_update_sack_list(tp, - tp->sackblks[0].start, - tp->sackblks[0].end); - } else { - tcp_update_dsack_list(tp, save_start, - save_start + save_tlen); - } - } else if (tlen >= save_tlen) { - /* Update of sackblks. */ - tcp_update_dsack_list(tp, save_start, - save_start + save_tlen); - } else if (tlen > 0) { - tcp_update_dsack_list(tp, save_start, - save_start + tlen); - } - } + if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { + if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { + /* + * DSACK actually handled in the fastpath + * above. + */ + RACK_OPTS_INC(tcp_sack_path_1); + tcp_update_sack_list(tp, save_start, + save_start + save_tlen); + } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { + if ((tp->rcv_numsacks >= 1) && + (tp->sackblks[0].end == save_start)) { + /* + * Partial overlap, recorded at todrop + * above. + */ + RACK_OPTS_INC(tcp_sack_path_2a); + tcp_update_sack_list(tp, + tp->sackblks[0].start, + tp->sackblks[0].end); + } else { + RACK_OPTS_INC(tcp_sack_path_2b); + tcp_update_dsack_list(tp, save_start, + save_start + save_tlen); + } + } else if (tlen >= save_tlen) { + /* Update of sackblks. */ + RACK_OPTS_INC(tcp_sack_path_3); + tcp_update_dsack_list(tp, save_start, + save_start + save_tlen); + } else if (tlen > 0) { + RACK_OPTS_INC(tcp_sack_path_4); + tcp_update_dsack_list(tp, save_start, + save_start + tlen); + } + } } else { m_freem(m); thflags &= ~TH_FIN; @@ -5883,7 +8890,7 @@ dodata: /* XXX */ */ if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { - rack->r_wanted_output++; + rack->r_wanted_output = 1; } INP_WLOCK_ASSERT(tp->t_inpcb); return (0); @@ -6014,14 +9021,8 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, if (so->so_rcv.sb_shlim && mcnt != appended) counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); #endif - if (DELAY_ACK(tp, tlen)) { - rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); - tp->t_flags |= TF_DELACK; - } else { - tp->t_flags |= TF_ACKNOW; - rack->r_wanted_output++; - } - if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter) + rack_handle_delayed_ack(tp, rack, tlen, 0); + if (tp->snd_una == tp->snd_max) sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); return (1); } @@ -6038,11 +9039,10 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, static int rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - uint32_t tiwin, int32_t nxt_pkt, uint32_t cts, uint8_t iptos) + uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) { int32_t acked; int32_t nsegs; - #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, @@ -6051,8 +9051,8 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; - #endif + int32_t under_pacing = 0; struct tcp_rack *rack; if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { @@ -6089,14 +9089,12 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, return (0); } /* Ok if we reach here, we can process a fast-ack */ + if (rack->rc_gp_filled && + (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { + under_pacing = 1; + } nsegs = max(1, m->m_pkthdr.lro_nsegs); rack_log_ack(tp, to, th); - /* - * We made progress, clear the tlp - * out flag so we could start a TLP - * again. - */ - rack->r_ctl.rc_tlp_rtx_out = 0; /* Did the window get updated? */ if (tiwin != tp->snd_wnd) { tp->snd_wnd = tiwin; @@ -6108,7 +9106,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs))) { - rack_exit_persist(tp, rack); + rack_exit_persist(tp, rack, cts); } /* Do we enter persists? */ if ((rack->rc_in_persist == 0) && @@ -6167,6 +9165,21 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); + if (acked) { + /* assure we are not backed off */ + tp->t_rxtshift = 0; + rack->rc_tlp_in_progress = 0; + rack->r_ctl.rc_tlp_cnt_out = 0; + /* + * If it is the RXT timer we want to + * stop it, so we can restart a TLP. + */ + if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); +#ifdef NETFLIX_HTTP_LOGGING + tcp_http_check_for_comp(rack->rc_tp, th->th_ack); +#endif + } /* * Let the congestion control algorithm update congestion control * related information. This typically means increasing the @@ -6202,7 +9215,18 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif + if (under_pacing && + (rack->use_fixed_rate == 0) && + (rack->in_probe_rtt == 0) && + rack->rc_gp_dyn_mul && + rack->rc_always_pace) { + /* Check if we are dragging bottom */ + rack_check_bottom_drag(tp, rack, so, acked); + } if (tp->snd_una == tp->snd_max) { + rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); + if (rack->r_ctl.rc_went_idle_time == 0) + rack->r_ctl.rc_went_idle_time = 1; rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) tp->t_acktime = 0; @@ -6211,7 +9235,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, /* Wake up the socket if we have room to write more */ sowwakeup(so); if (sbavail(&so->so_snd)) { - rack->r_wanted_output++; + rack->r_wanted_output = 1; } return (1); } @@ -6224,7 +9248,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, static int rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, - uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t tos) + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; int32_t todrop; @@ -6241,11 +9265,12 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, * contains an ECE and ECN support is enabled, the stream is ECN * capable. if SYN has been acked change to ESTABLISHED else * SYN_RCVD state arrange for segment to be acked (eventually) - * continue processing rest of data/controls, beginning with URG + * continue processing rest of data/controls. */ if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { + tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } @@ -6295,15 +9320,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, * If there's data, delay ACK; if there's also a FIN ACKNOW * will be turned on later. */ - if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) { - rack_timer_cancel(tp, rack, - rack->r_ctl.rc_rcvtime, __LINE__); - tp->t_flags |= TF_DELACK; - } else { - rack->r_wanted_output++; - tp->t_flags |= TF_ACKNOW; - } - + rack_handle_delayed_ack(tp, rack, tlen, tfo_partial); if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && (V_tcp_do_ecn == 1)) { tp->t_flags2 |= TF2_ECN_PERMIT; @@ -6337,7 +9354,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, mtod(m, const char *), tp, th); - cc_conn_init(tp); + rack_cc_conn_init(tp); } } else { /* @@ -6381,7 +9398,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, t = tcp_ts_getticks() - to->to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; - tcp_rack_xmit_timer(rack, t + 1); + tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); tcp_rack_xmit_timer_commit(rack, tp); } if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) @@ -6437,6 +9454,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { + tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } @@ -6450,6 +9468,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, * FIN, or a RST. */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { + tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } else if (thflags & TH_SYN) { @@ -6485,6 +9504,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, * "LAND" DoS attack. */ if (SEQ_LT(th->th_seq, tp->irs)) { + tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } @@ -6520,7 +9540,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((thflags & TH_ACK) == 0) { if (IS_FASTOPEN(tp->t_flags)) { - cc_conn_init(tp); + rack_cc_conn_init(tp); } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); @@ -6555,7 +9575,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, * that occurs when a TFO SYN|ACK is retransmitted. */ if (!IS_FASTOPEN(tp->t_flags)) - cc_conn_init(tp); + rack_cc_conn_init(tp); } /* * Account for the ACK of our SYN prior to @@ -6579,7 +9599,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, t = tcp_ts_getticks() - to->to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; - tcp_rack_xmit_timer(rack, t + 1); + tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); tcp_rack_xmit_timer_commit(rack, tp); } if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { @@ -6627,6 +9647,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; + struct tcp_rack *rack; /* * Header prediction: check for the two common cases of a @@ -6642,16 +9663,14 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, * hidden state-flags are also off. Since we check for * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. */ + rack = (struct tcp_rack *)tp->t_fb_ptr; if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && - __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && + __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && __predict_true(SEGQ_EMPTY(tp)) && __predict_true(th->th_seq == tp->rcv_nxt)) { - struct tcp_rack *rack; - - rack = (struct tcp_rack *)tp->t_fb_ptr; if (tlen == 0) { if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, - tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime, iptos)) { + tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { return (0); } } else { @@ -6721,7 +9740,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); - ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; return (ret_val); } else { ctf_do_drop(m, NULL); @@ -6735,7 +9754,8 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, return (ret_val); } if (sbavail(&so->so_snd)) { - if (rack_progress_timeout_check(tp)) { + if (ctf_progress_timeout_check(tp, true)) { + rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); @@ -6815,7 +9835,7 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); - ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); @@ -6829,7 +9849,9 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, return (ret_val); } if (sbavail(&so->so_snd)) { - if (rack_progress_timeout_check(tp)) { + if (ctf_progress_timeout_check(tp, true)) { + rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, + tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); @@ -6848,6 +9870,9 @@ rack_check_data_after_close(struct mbuf *m, rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->rc_allow_data_af_clo == 0) { close_now: + tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); + /* tcp_close will kill the inp pre-log the Reset */ + tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); KMOD_TCPSTAT_INC(tcps_rcvafterclose); ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); @@ -6856,6 +9881,7 @@ rack_check_data_after_close(struct mbuf *m, if (sbavail(&so->so_snd) == 0) goto close_now; /* Ok we allow data that is ignored and a followup reset */ + tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); tp->rcv_nxt = th->th_seq + *tlen; tp->t_flags2 |= TF2_DROP_AF_DATA; rack->r_wanted_output = 1; @@ -6941,7 +9967,7 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); - ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); @@ -6974,7 +10000,9 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_state_change(tp, TCPS_FIN_WAIT_2); } if (sbavail(&so->so_snd)) { - if (rack_progress_timeout_check(tp)) { + if (ctf_progress_timeout_check(tp, true)) { + rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, + tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); @@ -7062,7 +10090,7 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); - ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; return (ret_val); } else { ctf_do_drop(m, NULL); @@ -7081,7 +10109,9 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, return (1); } if (sbavail(&so->so_snd)) { - if (rack_progress_timeout_check(tp)) { + if (ctf_progress_timeout_check(tp, true)) { + rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, + tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); @@ -7169,7 +10199,7 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); - ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); @@ -7188,7 +10218,9 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, return (1); } if (sbavail(&so->so_snd)) { - if (rack_progress_timeout_check(tp)) { + if (ctf_progress_timeout_check(tp, true)) { + rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, + tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); @@ -7279,7 +10311,7 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); - ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); @@ -7293,7 +10325,9 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, return (ret_val); } if (sbavail(&so->so_snd)) { - if (rack_progress_timeout_check(tp)) { + if (ctf_progress_timeout_check(tp, true)) { + rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, + tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); @@ -7303,7 +10337,6 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, tiwin, thflags, nxt_pkt)); } - static void inline rack_clear_rate_sample(struct tcp_rack *rack) { @@ -7313,25 +10346,91 @@ rack_clear_rate_sample(struct tcp_rack *rack) } static void -rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack) +rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line) { + uint64_t bw_est, rate_wanted; uint32_t tls_seg = 0; + int chged = 0; + uint32_t user_max; + user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; #ifdef KERN_TLS if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { tls_seg = ctf_get_opt_tls_size(rack->rc_inp->inp_socket, rack->rc_tp->snd_wnd); + if (tls_seg != rack->r_ctl.rc_pace_min_segs) + chged = 1; rack->r_ctl.rc_pace_min_segs = tls_seg; } else #endif + { + if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) + chged = 1; rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); - rack->r_ctl.rc_pace_max_segs = ctf_fixed_maxseg(tp) * rack->rc_pace_max_segs; - if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) + } + if (rack->use_fixed_rate || rack->rc_force_max_seg) { + if (user_max != rack->r_ctl.rc_pace_max_segs) + chged = 1; + } + if (rack->rc_force_max_seg) { + rack->r_ctl.rc_pace_max_segs = user_max; + } else if (rack->use_fixed_rate) { + bw_est = rack_get_bw(rack); + if ((rack->r_ctl.crte == NULL) || + (bw_est != rack->r_ctl.crte->rate)) { + rack->r_ctl.rc_pace_max_segs = user_max; + } else { + /* We are pacing right at the hardware rate */ + uint32_t segsiz; + + segsiz = min(ctf_fixed_maxseg(tp), + rack->r_ctl.rc_pace_min_segs); + rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size( + bw_est, segsiz, 0, + rack->r_ctl.crte, NULL); + } + } else if (rack->rc_always_pace) { + if (rack->r_ctl.gp_bw || +#ifdef NETFLIX_PEAKRATE + rack->rc_tp->t_maxpeakrate || +#endif + rack->r_ctl.init_rate) { + /* We have a rate of some sort set */ + uint32_t orig; + + bw_est = rack_get_bw(rack); + orig = rack->r_ctl.rc_pace_max_segs; + rate_wanted = rack_get_output_bw(rack, bw_est, NULL); + if (rate_wanted) { + /* We have something */ + rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, + rate_wanted, + ctf_fixed_maxseg(rack->rc_tp)); + } else + rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; + if (orig != rack->r_ctl.rc_pace_max_segs) + chged = 1; + } else if ((rack->r_ctl.gp_bw == 0) && + (rack->r_ctl.rc_pace_max_segs == 0)) { + /* + * If we have nothing limit us to bursting + * out IW sized pieces. + */ + chged = 1; + rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); + } + } + if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { + chged = 1; rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; + } #ifdef KERN_TLS + uint32_t orig; + if (tls_seg != 0) { + orig = rack->r_ctl.rc_pace_max_segs; if (rack_hw_tls_max_seg > 1) { rack->r_ctl.rc_pace_max_segs /= tls_seg; - if (rack_hw_tls_max_seg < rack->r_ctl.rc_pace_max_segs) + if (rack_hw_tls_max_seg > rack->r_ctl.rc_pace_max_segs) rack->r_ctl.rc_pace_max_segs = rack_hw_tls_max_seg; } else { rack->r_ctl.rc_pace_max_segs = 1; @@ -7339,9 +10438,16 @@ rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack) if (rack->r_ctl.rc_pace_max_segs == 0) rack->r_ctl.rc_pace_max_segs = 1; rack->r_ctl.rc_pace_max_segs *= tls_seg; + if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { + /* We can't go over the max bytes (usually 64k) */ + rack->r_ctl.rc_pace_max_segs = ((PACE_MAX_IP_BYTES / tls_seg) * tls_seg); + } + if (orig != rack->r_ctl.rc_pace_max_segs) + chged = 1; } #endif - rack_log_type_hrdwtso(tp, rack, tls_seg, rack->rc_inp->inp_socket->so_snd.sb_flags, 0, 2); + if (chged) + rack_log_type_hrdwtso(tp, rack, tls_seg, rack->rc_inp->inp_socket->so_snd.sb_flags, line, 2); } static int @@ -7349,6 +10455,7 @@ rack_init(struct tcpcb *tp) { struct tcp_rack *rack = NULL; struct rack_sendmap *insret; + uint32_t iwin, snt, us_cts; tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); if (tp->t_fb_ptr == NULL) { @@ -7370,39 +10477,77 @@ rack_init(struct tcpcb *tp) if (tp->t_inpcb) { rack->rc_inp = tp->t_inpcb; } - tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; /* Probably not needed but lets be sure */ rack_clear_rate_sample(rack); - rack->r_cpu = 0; rack->r_ctl.rc_reorder_fade = rack_reorder_fade; rack->rc_allow_data_af_clo = rack_ignore_data_after_close; rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; - rack->rc_pace_reduce = rack_slot_reduction; - if (use_rack_cheat) - rack->use_rack_cheat = 1; + if (use_rack_rr) + rack->use_rack_rr = 1; if (V_tcp_delack_enabled) tp->t_delayed_ack = 1; else tp->t_delayed_ack = 0; - rack->rc_pace_max_segs = rack_hptsi_segments; + if (rack_enable_shared_cwnd) + rack->rack_enable_scwnd = 1; + rack->rc_user_set_max_segs = rack_hptsi_segments; + rack->rc_force_max_seg = 0; + if (rack_use_imac_dack) + rack->rc_dack_mode = 1; rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; rack->r_ctl.rc_pkt_delay = rack_pkt_delay; rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; - rack->r_enforce_min_pace = rack_min_pace_time; rack->r_ctl.rc_prop_rate = rack_proportional_rate; rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; rack->r_ctl.rc_early_recovery = rack_early_recovery; + rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; + rack->r_ctl.rc_highest_us_rtt = 0; + if (rack_disable_prr) + rack->rack_no_prr = 1; + if (rack_gp_no_rec_chg) + rack->rc_gp_no_rec_chg = 1; rack->rc_always_pace = rack_pace_every_seg; - rack_set_pace_segments(tp, rack); + if (rack_enable_mqueue_for_nonpaced) + rack->r_mbuf_queue = 1; + else + rack->r_mbuf_queue = 0; + if (rack->r_mbuf_queue || rack->rc_always_pace) + tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; + else + tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; + rack_set_pace_segments(tp, rack, __LINE__); + if (rack_limits_scwnd) + rack->r_limit_scw = 1; + else + rack->r_limit_scw = 0; rack->r_ctl.rc_high_rwnd = tp->snd_wnd; + rack->r_ctl.cwnd_to_use = tp->snd_cwnd; rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; rack->rack_tlp_threshold_use = rack_tlp_threshold_use; rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; rack->r_ctl.rc_min_to = rack_min_to; - rack->rack_per_of_gp = rack_per_of_gp; - microuptime(&rack->r_ctl.rc_last_ack); - rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack; - rack->r_ctl.rc_tlp_rxt_last_time = tcp_ts_getticks(); + microuptime(&rack->r_ctl.act_rcv_time); + rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; + rack->r_running_late = 0; + rack->r_running_early = 0; + rack->rc_init_win = rack_default_init_window; + rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; + if (rack_do_dyn_mul) { + /* When dynamic adjustment is on CA needs to start at 100% */ + rack->rc_gp_dyn_mul = 1; + if (rack_do_dyn_mul >= 100) + rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; + } else + rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; + rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; + rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; + rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); + setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, + rack_probertt_filter_life); + us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + rack->r_ctl.rc_lower_rtt_us_cts = us_cts; + rack->r_ctl.rc_time_of_last_probertt = us_cts; + rack->r_ctl.rc_time_probertt_starts = 0; /* Do we force on detection? */ #ifdef NETFLIX_EXP_DETECTION if (tcp_force_detection) @@ -7410,6 +10555,8 @@ rack_init(struct tcpcb *tp) else #endif rack->do_detection = 0; + if (rack_non_rxt_use_cr) + rack->rack_rec_nonrxt_use_cr = 1; if (tp->snd_una != tp->snd_max) { /* Create a send map for the current outstanding data */ struct rack_sendmap *rsm; @@ -7426,6 +10573,7 @@ rack_init(struct tcpcb *tp) rsm->r_rtr_bytes = 0; rsm->r_start = tp->snd_una; rsm->r_end = tp->snd_max; + rsm->usec_orig_send = us_cts; rsm->r_dupack = 0; insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); #ifdef INVARIANTS @@ -7437,8 +10585,46 @@ rack_init(struct tcpcb *tp) TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; } + /* Cancel the GP measurement in progress */ + tp->t_flags &= ~TF_GPUTINPROG; + if (SEQ_GT(tp->snd_max, tp->iss)) + snt = tp->snd_max - tp->iss; + else + snt = 0; + iwin = rc_init_window(rack); + if (snt < iwin) { + /* We are not past the initial window + * so we need to make sure cwnd is + * correct. + */ + if (tp->snd_cwnd < iwin) + tp->snd_cwnd = iwin; + /* + * If we are within the initial window + * we want ssthresh to be unlimited. Setting + * it to the rwnd (which the default stack does + * and older racks) is not really a good idea + * since we want to be in SS and grow both the + * cwnd and the rwnd (via dynamic rwnd growth). If + * we set it to the rwnd then as the peer grows its + * rwnd we will be stuck in CA and never hit SS. + * + * Its far better to raise it up high (this takes the + * risk that there as been a loss already, probably + * we should have an indicator in all stacks of loss + * but we don't), but considering the normal use this + * is a risk worth taking. The consequences of not + * hitting SS are far worse than going one more time + * into it early on (before we have sent even a IW). + * It is highly unlikely that we will have had a loss + * before getting the IW out. + */ + tp->snd_ssthresh = 0xffffffff; + } rack_stop_all_timers(tp); rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); + rack_log_rtt_shrinks(rack, us_cts, 0, + __LINE__, RACK_RTTS_INIT); return (0); } @@ -7474,11 +10660,29 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) if (tp->t_fb_ptr) { struct tcp_rack *rack; struct rack_sendmap *rsm, *nrsm, *rm; + + rack = (struct tcp_rack *)tp->t_fb_ptr; +#ifdef NETFLIX_SHARED_CWND + if (rack->r_ctl.rc_scw) { + uint32_t limit; + + if (rack->r_limit_scw) + limit = max(1, rack->r_ctl.rc_lowest_us_rtt); + else + limit = 0; + tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, + rack->r_ctl.rc_scw_index, + limit); + rack->r_ctl.rc_scw = NULL; + } +#endif + /* rack does not use force data but other stacks may clear it */ + tp->t_flags &= ~TF_FORCEDATA; if (tp->t_inpcb) { tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; + tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE; } - rack = (struct tcp_rack *)tp->t_fb_ptr; #ifdef TCP_BLACKBOX tcp_log_flowend(tp); #endif @@ -7502,6 +10706,8 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) uma_zfree(rack_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; } + /* Cancel the GP measurement in progress */ + tp->t_flags &= ~TF_GPUTINPROG; /* Make sure snd_nxt is correctly set */ tp->snd_nxt = tp->snd_max; } @@ -7520,7 +10726,7 @@ rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) rack->r_substate = rack_do_syn_recv; break; case TCPS_ESTABLISHED: - rack_set_pace_segments(tp, rack); + rack_set_pace_segments(tp, rack, __LINE__); rack->r_state = TCPS_ESTABLISHED; rack->r_substate = rack_do_established; break; @@ -7619,6 +10825,19 @@ rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) * We will force the hpts to be stopped if any, and restart * with the slot set to what was in the saved slot. */ + if (rack->rc_inp->inp_in_hpts) { + if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { + uint32_t us_cts; + + us_cts = tcp_get_usecs(NULL); + if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { + rack->r_early = 1; + rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); + } + rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; + } + tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); + } rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); } @@ -7632,18 +10851,31 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, int32_t way_out = 0; uint32_t cts; uint32_t tiwin; + struct timespec ts; struct tcpopt to; struct tcp_rack *rack; struct rack_sendmap *rsm; int32_t prev_state = 0; + uint32_t us_cts; + /* + * tv passed from common code is from either M_TSTMP_LRO or + * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. The + * rack_pacing stack assumes tv always refers to 'now', so we overwrite + * tv here to guarantee that. + */ + if (m->m_flags & M_TSTMP_LRO) + tcp_get_usecs(tv); - if (m->m_flags & M_TSTMP_LRO) { - tv->tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; - tv->tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; - } cts = tcp_tv_to_mssectick(tv); rack = (struct tcp_rack *)tp->t_fb_ptr; + if ((m->m_flags & M_TSTMP) || + (m->m_flags & M_TSTMP_LRO)) { + mbuf_tstmp2timespec(m, &ts); + rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; + rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; + } else + rack->r_ctl.act_rcv_time = *tv; kern_prefetch(rack, &prev_state); prev_state = 0; thflags = th->th_flags; @@ -7656,18 +10888,73 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, __func__)); if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; - struct timeval tv; + struct timeval ltv; +#ifdef NETFLIX_HTTP_LOGGING + struct http_sendfile_track *http_req; + if (SEQ_GT(th->th_ack, tp->snd_una)) { + http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1)); + } else { + http_req = tcp_http_find_req_for_seq(tp, th->th_ack); + } +#endif memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; - log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; + if (rack->rack_no_prr == 0) + log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; + else + log.u_bbr.flex1 = 0; log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; - log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; + log.u_bbr.flex3 = m->m_flags; + log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; + if (m->m_flags & M_TSTMP) { + /* Record the hardware timestamp if present */ + mbuf_tstmp2timespec(m, &ts); + ltv.tv_sec = ts.tv_sec; + ltv.tv_usec = ts.tv_nsec / 1000; + log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); + } else if (m->m_flags & M_TSTMP_LRO) { + /* Record the LRO the arrival timestamp */ + mbuf_tstmp2timespec(m, &ts); + ltv.tv_sec = ts.tv_sec; + ltv.tv_usec = ts.tv_nsec / 1000; + log.u_bbr.flex5 = tcp_tv_to_usectick(<v); + } + log.u_bbr.timeStamp = tcp_get_usecs(<v); + /* Log the rcv time */ + log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; +#ifdef NETFLIX_HTTP_LOGGING + log.u_bbr.applimited = tp->t_http_closed; + log.u_bbr.applimited <<= 8; + log.u_bbr.applimited |= tp->t_http_open; + log.u_bbr.applimited <<= 8; + log.u_bbr.applimited |= tp->t_http_req; + if (http_req) { + /* Copy out any client req info */ + /* seconds */ + log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); + /* useconds */ + log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); + log.u_bbr.rttProp = http_req->timestamp; + log.u_bbr.cur_del_rate = http_req->start; + if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { + log.u_bbr.flex8 |= 1; + } else { + log.u_bbr.flex8 |= 2; + log.u_bbr.bw_inuse = http_req->end; + } + log.u_bbr.flex6 = http_req->start_seq; + if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { + log.u_bbr.flex8 |= 4; + log.u_bbr.epoch = http_req->end_seq; + } + } +#endif TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, - tlen, &log, true, &tv); + tlen, &log, true, <v); } if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { way_out = 4; @@ -7680,6 +10967,7 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { + tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return(1); } @@ -7692,10 +10980,9 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, (tp->snd_max == tp->snd_una) && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { counter_u64_add(rack_input_idle_reduces, 1); - rack_cc_after_idle(tp); + rack_cc_after_idle(rack, tp); } tp->t_rcvtime = ticks; - /* * Unscale the window into a 32-bit value. For the SYN_SENT state * the scale is zero. @@ -7753,6 +11040,7 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, if (TSTMP_GT(to.to_tsecr, cts)) to.to_tsecr = 0; } + /* * If its the first time in we need to take care of options and * verify we can do SACK for rack! @@ -7772,7 +11060,6 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, * or ) segment itself is never scaled. XXX * this is traditional behavior, may need to be cleaned up. */ - rack->r_cpu = inp_to_cpuid(tp->t_inpcb); if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { @@ -7814,10 +11101,14 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, /* * At this point we are at the initial call. Here we decide * if we are doing RACK or not. We do this by seeing if - * TF_SACK_PERMIT is set, if not rack is *not* possible and - * we switch to the default code. + * TF_SACK_PERMIT is set and the sack-not-required is clear. + * The code now does do dup-ack counting so if you don't + * switch back you won't get rack & TLP, but you will still + * get this stack. */ - if ((tp->t_flags & TF_SACK_PERMIT) == 0) { + + if ((rack_sack_not_required == 0) && + ((tp->t_flags & TF_SACK_PERMIT) == 0)) { tcp_switch_back_to_default(tp); (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, tlen, iptos); @@ -7828,12 +11119,38 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_set_hpts(tp->t_inpcb); sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); } + if (thflags & TH_FIN) + tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); + us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + if ((rack->rc_gp_dyn_mul) && + (rack->use_fixed_rate == 0) && + (rack->rc_always_pace)) { + /* Check in on probertt */ + rack_check_probe_rtt(rack, us_cts); + } + if (rack->forced_ack) { + uint32_t us_rtt; + + /* + * A persist or keep-alive was forced out, update our + * min rtt time. Note we do not worry about lost + * retransmissions since KEEP-ALIVES and persists + * are usually way long on times of sending (though + * if we were really paranoid or worried we could + * at least use timestamps if available to validate). + */ + rack->forced_ack = 0; + us_rtt = us_cts - rack->r_ctl.forced_ack_ts; + if (us_rtt == 0) + us_rtt = 1; + rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3); + rack_apply_updated_usrtt(rack, us_rtt, us_cts); + } /* * This is the one exception case where we set the rack state * always. All other times (timers etc) we must have a rack-state * set (so we assure we have done the checks above for SACK). */ - memcpy(&rack->r_ctl.rc_last_ack, tv, sizeof(struct timeval)); rack->r_ctl.rc_rcvtime = cts; if (rack->r_state != tp->t_state) rack_set_state(tp, rack); @@ -7841,7 +11158,6 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) kern_prefetch(rsm, &prev_state); prev_state = rack->r_state; - rack->r_ctl.rc_tlp_send_cnt = 0; rack_clear_rate_sample(rack); retval = (*rack->r_substate) (m, th, so, tp, &to, drop_hdrlen, @@ -7859,17 +11175,29 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, * is gone. */ INP_WLOCK_ASSERT(tp->t_inpcb); + if ((rack->rc_gp_dyn_mul) && + (rack->rc_always_pace) && + (rack->use_fixed_rate == 0) && + rack->in_probe_rtt && + (rack->r_ctl.rc_time_probertt_starts == 0)) { + /* + * If we are going for target, lets recheck before + * we output. + */ + rack_check_probe_rtt(rack, us_cts); + } if (rack->set_pacing_done_a_iw == 0) { /* How much has been acked? */ if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { /* We have enough to set in the pacing segment size */ rack->set_pacing_done_a_iw = 1; - rack_set_pace_segments(tp, rack); + rack_set_pace_segments(tp, rack, __LINE__); } } tcp_rack_xmit_timer_commit(rack, tp); - if ((nxt_pkt == 0) || (IN_RECOVERY(tp->t_flags))) { + if (nxt_pkt == 0) { if (rack->r_wanted_output != 0) { +do_output_now: did_out = 1; (void)tp->t_fb->tfb_tcp_output(tp); } @@ -7889,9 +11217,26 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, /* keep alive not needed if we are hptsi output yet */ ; } else { + int late = 0; if (rack->rc_inp->inp_in_hpts) { - tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); - counter_u64_add(rack_per_timer_hole, 1); + if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { + us_cts = tcp_get_usecs(NULL); + if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { + rack->r_early = 1; + rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); + } else + late = 1; + rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; + } + tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); + } + if (late && (did_out == 0)) { + /* + * We are late in the sending + * and we did not call the output + * (this probably should not happen). + */ + goto do_output_now; } rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); } @@ -8003,13 +11348,169 @@ check_it: return (NULL); } -static int32_t -rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len) +static void +rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, + uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, + int line, struct rack_sendmap *rsm) { - int32_t slot = 0; + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; - if ((rack->rack_per_of_gp == 0) || - (rack->rc_always_pace == 0)) { + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = slot; + log.u_bbr.flex2 = len; + log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; + log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; + log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; + log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; + log.u_bbr.use_lt_bw = rack->app_limited_needs_set; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw = rack->rc_gp_filled; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->in_probe_rtt; + log.u_bbr.pkt_epoch = line; + log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; + log.u_bbr.bw_inuse = bw_est; + log.u_bbr.delRate = bw; + if (rack->r_ctl.gp_bw == 0) + log.u_bbr.cur_del_rate = 0; + else + log.u_bbr.cur_del_rate = rack_get_bw(rack); + log.u_bbr.rttProp = len_time; + log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; + log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; + log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); + if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { + /* We are in slow start */ + log.u_bbr.flex7 = 1; + } else { + /* we are on congestion avoidance */ + log.u_bbr.flex7 = 0; + } + log.u_bbr.flex8 = method; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; + log.u_bbr.cwnd_gain <<= 1; + log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; + log.u_bbr.cwnd_gain <<= 1; + log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_HPTSI_CALC, 0, + 0, &log, false, &tv); + } +} + +static uint32_t +rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) +{ + uint32_t new_tso, user_max; + + user_max = rack->rc_user_set_max_segs * mss; + if (rack->rc_force_max_seg) { + return (user_max); + } + if (rack->use_fixed_rate && + ((rack->r_ctl.crte == NULL) || + (bw != rack->r_ctl.crte->rate))) { + /* Use the user mss since we are not exactly matched */ + return (user_max); + } + new_tso = tcp_get_pacing_burst_size(bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); + if (new_tso > user_max) + new_tso = user_max; + return(new_tso); +} + +static void +rack_log_hdwr_pacing(struct tcp_rack *rack, const struct ifnet *ifp, + uint64_t rate, uint64_t hw_rate, int line, + int error) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); + log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); + log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); + log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.bw_inuse = rate; + log.u_bbr.flex5 = line; + log.u_bbr.flex6 = error; + log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; + log.u_bbr.flex8 = rack->use_fixed_rate; + log.u_bbr.flex8 <<= 1; + log.u_bbr.flex8 |= rack->rack_hdrw_pacing; + log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_HDWR_PACE, 0, + 0, &log, false, &tv); + } +} + +static int32_t +pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz) +{ + uint64_t lentim, fill_bw; + + /* Lets first see if we are full, if so continue with normal rate */ + if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) + return (slot); + if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) + return (slot); + if (rack->r_ctl.rc_last_us_rtt == 0) + return (slot); + if (rack->rc_pace_fill_if_rttin_range && + (rack->r_ctl.rc_last_us_rtt >= + (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { + /* The rtt is huge, N * smallest, lets not fill */ + return (slot); + } + /* + * first lets calculate the b/w based on the last us-rtt + * and the sndwnd. + */ + fill_bw = rack->r_ctl.cwnd_to_use; + /* Take the rwnd if its smaller */ + if (fill_bw > rack->rc_tp->snd_wnd) + fill_bw = rack->rc_tp->snd_wnd; + fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; + fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; + /* We are below the min b/w */ + if (fill_bw < RACK_MIN_BW) + return (slot); + /* + * Ok fill_bw holds our mythical b/w to fill the cwnd + * in a rtt, what does that time wise equate too? + */ + lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; + lentim /= fill_bw; + if (lentim < slot) { + rack_log_pacing_delay_calc(rack, len, slot, fill_bw, + 0, lentim, 12, __LINE__, NULL); + return ((int32_t)lentim); + } else + return (slot); +} + +static int32_t +rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) +{ + struct rack_sendmap *lrsm; + int32_t slot = 0; + int err; + + if (rack->rc_always_pace == 0) { /* * We use the most optimistic possible cwnd/srtt for * sending calculations. This will make our @@ -8018,8 +11519,17 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len) * the peer to have a gap in data sending. */ uint32_t srtt, cwnd, tr_perms = 0; + int32_t reduce = 0; -old_method: + old_method: + /* + * We keep no precise pacing with the old method + * instead we use the pacer to mitigate bursts. + */ + rack->r_ctl.rc_agg_delayed = 0; + rack->r_early = 0; + rack->r_late = 0; + rack->r_ctl.rc_agg_early = 0; if (rack->r_ctl.rc_rack_min_rtt) srtt = rack->r_ctl.rc_rack_min_rtt; else @@ -8027,7 +11537,7 @@ old_method: if (rack->r_ctl.rc_rack_largest_cwnd) cwnd = rack->r_ctl.rc_rack_largest_cwnd; else - cwnd = tp->snd_cwnd; + cwnd = rack->r_ctl.cwnd_to_use; tr_perms = cwnd / srtt; if (tr_perms == 0) { tr_perms = ctf_fixed_maxseg(tp); @@ -8043,55 +11553,171 @@ old_method: */ slot = len / tr_perms; /* Now do we reduce the time so we don't run dry? */ - if (slot && rack->rc_pace_reduce) { - int32_t reduce; - - reduce = (slot / rack->rc_pace_reduce); + if (slot && rack_slot_reduction) { + reduce = (slot / rack_slot_reduction); if (reduce < slot) { slot -= reduce; } else slot = 0; } - } else { - int cnt; - uint64_t bw_est, bw_raise, res, lentim; - - bw_est = 0; - for (cnt=0; cntr_ctl.rc_gp_hist_filled == 0) && - (rack->r_ctl.rc_gp_history[cnt] == 0)) - break; - bw_est += rack->r_ctl.rc_gp_history[cnt]; - } - if (bw_est == 0) { + slot *= HPTS_USEC_IN_MSEC; + if (rsm == NULL) { /* - * No way yet to make a b/w estimate - * (no goodput est yet). + * We always consider ourselves app limited with old style + * that are not retransmits. This could be the initial + * measurement, but thats ok its all setup and specially + * handled. If another send leaks out, then that too will + * be mark app-limited. + */ + lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) { + rack->r_ctl.rc_first_appl = lrsm; + lrsm->r_flags |= RACK_APP_LIMITED; + rack->r_ctl.rc_app_limited_cnt++; + } + } + rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL); + } else { + uint64_t bw_est, res, lentim, rate_wanted; + uint32_t orig_val, srtt, segs, oh; + + if ((rack->r_rr_config == 1) && rsm) { + return (rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC); + } + if (rack->use_fixed_rate) { + rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); + } else if ((rack->r_ctl.init_rate == 0) && +#ifdef NETFLIX_PEAKRATE + (rack->rc_tp->t_maxpeakrate == 0) && +#endif + (rack->r_ctl.gp_bw == 0)) { + /* no way to yet do an estimate */ + bw_est = rate_wanted = 0; + } else { + bw_est = rack_get_bw(rack); + rate_wanted = rack_get_output_bw(rack, bw_est, rsm); + } + if ((bw_est == 0) || (rate_wanted == 0)) { + /* + * No way yet to make a b/w estimate or + * our raise is set incorrectly. */ goto old_method; } - /* Covert to bytes per second */ - bw_est *= MSEC_IN_SECOND; + /* We need to account for all the overheads */ + segs = (len + segsiz - 1) / segsiz; /* - * Now ratchet it up by our percentage. Note - * that the minimum you can do is 1 which would - * get you 101% of the average last N goodput estimates. - * The max you can do is 256 which would yeild you - * 356% of the last N goodput estimates. + * We need the diff between 1514 bytes (e-mtu with e-hdr) + * and how much data we put in each packet. Yes this + * means we may be off if we are larger than 1500 bytes + * or smaller. But this just makes us more conservative. */ - bw_raise = bw_est * (uint64_t)rack->rack_per_of_gp; - bw_est += bw_raise; - /* average by the number we added */ - bw_est /= cnt; - /* Now calculate a rate based on this b/w */ - lentim = (uint64_t) len * (uint64_t)MSEC_IN_SECOND; - res = lentim / bw_est; + if (ETHERNET_SEGMENT_SIZE > segsiz) + oh = ETHERNET_SEGMENT_SIZE - segsiz; + else + oh = 0; + segs *= oh; + lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; + res = lentim / rate_wanted; slot = (uint32_t)res; - } - if (rack->r_enforce_min_pace && - (slot == 0)) { - /* We are enforcing a minimum pace time of 1ms */ - slot = rack->r_enforce_min_pace; + orig_val = rack->r_ctl.rc_pace_max_segs; + rack_set_pace_segments(rack->rc_tp, rack, __LINE__); +#ifdef KERN_TLS + /* For TLS we need to override this, possibly */ + if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { + rack_set_pace_segments(rack->rc_tp, rack, __LINE__); + } +#endif + /* Did we change the TSO size, if so log it */ + if (rack->r_ctl.rc_pace_max_segs != orig_val) + rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL); + if ((rack->rc_pace_to_cwnd) && + (rack->in_probe_rtt == 0) && + (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { + /* + * We want to pace at our rate *or* faster to + * fill the cwnd to the max if its not full. + */ + slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz); + } + if ((rack->rc_inp->inp_route.ro_nh != NULL) && + (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { + if ((rack->rack_hdw_pace_ena) && + (rack->rack_hdrw_pacing == 0) && + (rack->rack_attempt_hdwr_pace == 0)) { + /* + * Lets attempt to turn on hardware pacing + * if we can. + */ + rack->rack_attempt_hdwr_pace = 1; + rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, + rack->rc_inp->inp_route.ro_nh->nh_ifp, + rate_wanted, + RS_PACING_GEQ, + &err); + if (rack->r_ctl.crte) { + rack->rack_hdrw_pacing = 1; + rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted, segsiz, + 0, rack->r_ctl.crte, + NULL); + rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, + rate_wanted, rack->r_ctl.crte->rate, __LINE__, + err); + } + } else if (rack->rack_hdrw_pacing && + (rack->r_ctl.crte->rate != rate_wanted)) { + /* Do we need to adjust our rate? */ + const struct tcp_hwrate_limit_table *nrte; + + nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, + rack->rc_tp, + rack->rc_inp->inp_route.ro_nh->nh_ifp, + rate_wanted, + RS_PACING_GEQ, + &err); + if (nrte == NULL) { + /* Lost the rate */ + rack->rack_hdrw_pacing = 0; + rack_set_pace_segments(rack->rc_tp, rack, __LINE__); + } else if (nrte != rack->r_ctl.crte) { + rack->r_ctl.crte = nrte; + rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted, + segsiz, 0, + rack->r_ctl.crte, + NULL); + rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, + rate_wanted, rack->r_ctl.crte->rate, __LINE__, + err); + } + + } + } + if (rack_limit_time_with_srtt && + (rack->use_fixed_rate == 0) && +#ifdef NETFLIX_PEAKRATE + (rack->rc_tp->t_maxpeakrate == 0) && +#endif + (rack->rack_hdrw_pacing == 0)) { + /* + * Sanity check, we do not allow the pacing delay + * to be longer than the SRTT of the path. If it is + * a slow path, then adding a packet should increase + * the RTT and compensate for this i.e. the srtt will + * be greater so the allowed pacing time will be greater. + * + * Note this restriction is not for where a peak rate + * is set, we are doing fixed pacing or hardware pacing. + */ + if (rack->rc_tp->t_srtt) + srtt = (TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); + else + srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ + if (srtt < slot) { + rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL); + slot = srtt; + } + } + rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm); } if (slot) counter_u64_add(rack_calc_nonzero, 1); @@ -8100,18 +11726,234 @@ old_method: return (slot); } +static void +rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, + tcp_seq startseq, uint32_t sb_offset) +{ + struct rack_sendmap *my_rsm = NULL; + struct rack_sendmap fe; + + tp->t_flags |= TF_GPUTINPROG; + rack->r_ctl.rc_gp_lowrtt = 0xffffffff; + rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; + tp->gput_seq = startseq; + rack->app_limited_needs_set = 0; + if (rack->in_probe_rtt) + rack->measure_saw_probe_rtt = 1; + else if ((rack->measure_saw_probe_rtt) && + (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) + rack->measure_saw_probe_rtt = 0; + if (rack->rc_gp_filled) + tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + else { + /* Special case initial measurement */ + rack->r_ctl.rc_gp_output_ts = tp->gput_ts = tcp_get_usecs(NULL); + } + /* + * We take a guess out into the future, + * if we have no measurement and no + * initial rate, we measure the first + * initial-windows worth of data to + * speed up getting some GP measurement and + * thus start pacing. + */ + if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { + rack->app_limited_needs_set = 1; + tp->gput_ack = startseq + max(rc_init_window(rack), + (MIN_GP_WIN * ctf_fixed_maxseg(tp))); + rack_log_pacing_delay_calc(rack, + tp->gput_seq, + tp->gput_ack, + 0, + tp->gput_ts, + rack->r_ctl.rc_app_limited_cnt, + 9, + __LINE__, NULL); + return; + } + if (sb_offset) { + /* + * We are out somewhere in the sb + * can we use the already outstanding data? + */ + + if (rack->r_ctl.rc_app_limited_cnt == 0) { + /* + * Yes first one is good and in this case + * the tp->gput_ts is correctly set based on + * the last ack that arrived (no need to + * set things up when an ack comes in). + */ + my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if ((my_rsm == NULL) || + (my_rsm->r_rtr_cnt != 1)) { + /* retransmission? */ + goto use_latest; + } + } else { + if (rack->r_ctl.rc_first_appl == NULL) { + /* + * If rc_first_appl is NULL + * then the cnt should be 0. + * This is probably an error, maybe + * a KASSERT would be approprate. + */ + goto use_latest; + } + /* + * If we have a marker pointer to the last one that is + * app limited we can use that, but we need to set + * things up so that when it gets ack'ed we record + * the ack time (if its not already acked). + */ + rack->app_limited_needs_set = 1; + /* + * We want to get to the rsm that is either + * next with space i.e. over 1 MSS or the one + * after that (after the app-limited). + */ + my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, + rack->r_ctl.rc_first_appl); + if (my_rsm) { + if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) + /* Have to use the next one */ + my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, + my_rsm); + else { + /* Use after the first MSS of it is acked */ + tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); + goto start_set; + } + } + if ((my_rsm == NULL) || + (my_rsm->r_rtr_cnt != 1)) { + /* + * Either its a retransmit or + * the last is the app-limited one. + */ + goto use_latest; + } + } + tp->gput_seq = my_rsm->r_start; +start_set: + if (my_rsm->r_flags & RACK_ACKED) { + /* + * This one has been acked use the arrival ack time + */ + tp->gput_ts = my_rsm->r_ack_arrival; + rack->app_limited_needs_set = 0; + } + rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; + tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); + rack_log_pacing_delay_calc(rack, + tp->gput_seq, + tp->gput_ack, + (uint64_t)my_rsm, + tp->gput_ts, + rack->r_ctl.rc_app_limited_cnt, + 9, + __LINE__, NULL); + return; + } + +use_latest: + /* + * We don't know how long we may have been + * idle or if this is the first-send. Lets + * setup the flag so we will trim off + * the first ack'd data so we get a true + * measurement. + */ + rack->app_limited_needs_set = 1; + tp->gput_ack = startseq + rack_get_measure_window(tp, rack); + /* Find this guy so we can pull the send time */ + fe.r_start = startseq; + my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); + if (my_rsm) { + rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; + if (my_rsm->r_flags & RACK_ACKED) { + /* + * Unlikely since its probably what was + * just transmitted (but I am paranoid). + */ + tp->gput_ts = my_rsm->r_ack_arrival; + rack->app_limited_needs_set = 0; + } + if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { + /* This also is unlikely */ + tp->gput_seq = my_rsm->r_start; + } + } else { + /* + * TSNH unless we have some send-map limit, + * and even at that it should not be hitting + * that limit (we should have stopped sending). + */ + rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); + } + rack_log_pacing_delay_calc(rack, + tp->gput_seq, + tp->gput_ack, + (uint64_t)my_rsm, + tp->gput_ts, + rack->r_ctl.rc_app_limited_cnt, + 9, __LINE__, NULL); +} + +static inline uint32_t +rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, + uint32_t avail, int32_t sb_offset) +{ + uint32_t len; + uint32_t sendwin; + + if (tp->snd_wnd > cwnd_to_use) + sendwin = cwnd_to_use; + else + sendwin = tp->snd_wnd; + if (ctf_outstanding(tp) >= tp->snd_wnd) { + /* We never want to go over our peers rcv-window */ + len = 0; + } else { + uint32_t flight; + + flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); + if (flight >= sendwin) { + /* + * We have in flight what we are allowed by cwnd (if + * it was rwnd blocking it would have hit above out + * >= tp->snd_wnd). + */ + return (0); + } + len = sendwin - flight; + if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { + /* We would send too much (beyond the rwnd) */ + len = tp->snd_wnd - ctf_outstanding(tp); + } + if ((len + sb_offset) > avail) { + /* + * We don't have that much in the SB, how much is + * there? + */ + len = avail - sb_offset; + } + } + return (len); +} + static int rack_output(struct tcpcb *tp) { struct socket *so; - uint32_t recwin, sendwin; + uint32_t recwin; uint32_t sb_offset; int32_t len, flags, error = 0; struct mbuf *m; struct mbuf *mb; uint32_t if_hw_tsomaxsegcount = 0; - uint32_t if_hw_tsomaxsegsize = 0; - int32_t maxseg; + uint32_t if_hw_tsomaxsegsize; + int32_t segsiz, minseg; long tot_len_this_send = 0; struct ip *ip = NULL; #ifdef TCPDEBUG @@ -8121,6 +11963,7 @@ rack_output(struct tcpcb *tp) struct tcp_rack *rack; struct tcphdr *th; uint8_t pass = 0; + uint8_t mark = 0; uint8_t wanted_cookie = 0; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen, ulen=0; @@ -8138,12 +11981,14 @@ rack_output(struct tcpcb *tp) struct tcpopt to; int32_t slot = 0; int32_t sup_rack = 0; - uint32_t cts; + uint32_t cts, us_cts, delayed, early; uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0; + uint32_t cwnd_to_use; int32_t do_a_prefetch; int32_t prefetch_rsm = 0; int force_tso = 0; int32_t orig_len; + struct timeval tv; int32_t prefetch_so_done = 0; struct tcp_log_buffer *lgb = NULL; struct inpcb *inp; @@ -8162,19 +12007,16 @@ rack_output(struct tcpcb *tp) sb = &so->so_snd; kern_prefetch(sb, &do_a_prefetch); do_a_prefetch = 1; - + hpts_calling = inp->inp_hpts_calls; #ifdef KERN_TLS hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; #endif - NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); - #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif - maxseg = ctf_fixed_maxseg(tp); /* * For TFO connections in SYN_RECEIVED, only allow the initial * SYN|ACK and those sent by the retransmit timer. @@ -8192,7 +12034,9 @@ rack_output(struct tcpcb *tp) isipv6 = (inp->inp_vflag & INP_IPV6) != 0; } #endif - cts = tcp_ts_getticks(); + early = 0; + us_cts = tcp_get_usecs(&tv); + cts = tcp_tv_to_mssectick(&tv); if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && inp->inp_in_hpts) { /* @@ -8201,9 +12045,23 @@ rack_output(struct tcpcb *tp) */ rack_timer_cancel(tp, rack, cts, __LINE__); } - /* Mark that we have called rack_output(). */ + /* Are we pacing and late? */ + if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && + TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) { + /* We are delayed */ + delayed = us_cts - rack->r_ctl.rc_last_output_to; + } else { + delayed = 0; + } + /* Do the timers, which may override the pacer */ + if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { + if (rack_process_timers(tp, rack, cts, hpts_calling)) { + counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); + return (0); + } + } if ((rack->r_timer_override) || - (tp->t_flags & TF_FORCEDATA) || + (delayed) || (tp->t_state < TCPS_ESTABLISHED)) { if (tp->t_inpcb->inp_in_hpts) tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); @@ -8215,14 +12073,22 @@ rack_output(struct tcpcb *tp) counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); return (0); } - hpts_calling = inp->inp_hpts_calls; inp->inp_hpts_calls = 0; - if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { - if (rack_process_timers(tp, rack, cts, hpts_calling)) { - counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); - return (0); - } + /* Finish out both pacing early and late accounting */ + if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && + TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { + early = rack->r_ctl.rc_last_output_to - us_cts; + } else + early = 0; + if (delayed) { + rack->r_ctl.rc_agg_delayed += delayed; + rack->r_late = 1; + } else if (early) { + rack->r_ctl.rc_agg_early += early; + rack->r_early = 1; } + /* Now that early/late accounting is done turn off the flag */ + rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; rack->r_wanted_output = 0; rack->r_timer_override = 0; /* @@ -8245,7 +12111,7 @@ rack_output(struct tcpcb *tp) idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (tp->t_idle_reduce) { if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) - rack_cc_after_idle(tp); + rack_cc_after_idle(rack, tp); } tp->t_flags &= ~TF_LASTIDLE; if (idle) { @@ -8254,6 +12120,23 @@ rack_output(struct tcpcb *tp) idle = 0; } } + if ((tp->snd_una == tp->snd_max) && + rack->r_ctl.rc_went_idle_time && + TSTMP_GT(us_cts, rack->r_ctl.rc_went_idle_time)) { + idle = us_cts - rack->r_ctl.rc_went_idle_time; + if (idle > rack_min_probertt_hold) { + /* Count as a probe rtt */ + if (rack->in_probe_rtt == 0) { + rack->r_ctl.rc_lower_rtt_us_cts = us_cts; + rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; + rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; + rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; + } else { + rack_exit_probertt(rack, us_cts); + } + } + idle = 0; + } again: /* * If we've recently taken a timeout, snd_max will be greater than @@ -8261,19 +12144,56 @@ again: * resending already delivered data. Adjust snd_nxt accordingly. */ sendalot = 0; - cts = tcp_ts_getticks(); + us_cts = tcp_get_usecs(&tv); + cts = tcp_tv_to_mssectick(&tv); tso = 0; mtu = 0; + segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); + if (so->so_snd.sb_flags & SB_TLS_IFNET) { + minseg = rack->r_ctl.rc_pace_min_segs; + } else { + minseg = segsiz; + } sb_offset = tp->snd_max - tp->snd_una; - sendwin = min(tp->snd_wnd, tp->snd_cwnd); - + cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; +#ifdef NETFLIX_SHARED_CWND + if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && + rack->rack_enable_scwnd) { + /* We are doing cwnd sharing */ + if (rack->rc_gp_filled && + (rack->rack_attempted_scwnd == 0) && + (rack->r_ctl.rc_scw == NULL) && + tp->t_lib) { + /* The pcbid is in, lets make an attempt */ + counter_u64_add(rack_try_scwnd, 1); + rack->rack_attempted_scwnd = 1; + rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, + &rack->r_ctl.rc_scw_index, + segsiz); + } + if (rack->r_ctl.rc_scw && + (rack->rack_scwnd_is_idle == 1) && + (rack->rc_in_persist == 0) && + sbavail(sb)) { + /* we are no longer out of data */ + tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); + rack->rack_scwnd_is_idle = 0; + } + if (rack->r_ctl.rc_scw) { + /* First lets update and get the cwnd */ + rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, + rack->r_ctl.rc_scw_index, + tp->snd_cwnd, tp->snd_wnd, segsiz); + } + } +#endif flags = tcp_outflags[tp->t_state]; while (rack->rc_free_cnt < rack_free_cache) { rsm = rack_alloc(rack); if (rsm == NULL) { if (inp->inp_hpts_calls) /* Retry in a ms */ - slot = 1; + slot = (1 * HPTS_USEC_IN_MSEC); goto just_return_nolock; } TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); @@ -8289,53 +12209,25 @@ again: SOCKBUF_LOCK(sb); goto send; } - if (rack->r_ctl.rc_tlpsend) { - /* Tail loss probe */ - long cwin; - long tlen; - - doing_tlp = 1; - /* - * Check if we can do a TLP with a RACK'd packet - * this can happen if we are not doing the rack - * cheat and we skipped to a TLP and it - * went off. - */ - rsm = tcp_rack_output(tp, rack, cts); - if (rsm == NULL) - rsm = rack->r_ctl.rc_tlpsend; - rack->r_ctl.rc_tlpsend = NULL; - sack_rxmit = 1; - tlen = rsm->r_end - rsm->r_start; - if (tlen > ctf_fixed_maxseg(tp)) - tlen = ctf_fixed_maxseg(tp); - KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), - ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", - __func__, __LINE__, - rsm->r_start, tp->snd_una, tp, rack, rsm)); - sb_offset = rsm->r_start - tp->snd_una; - cwin = min(tp->snd_wnd, tlen); - len = cwin; - } else if (rack->r_ctl.rc_resend) { + if (rack->r_ctl.rc_resend) { /* Retransmit timer */ rsm = rack->r_ctl.rc_resend; rack->r_ctl.rc_resend = NULL; + rsm->r_flags &= ~RACK_TLP; len = rsm->r_end - rsm->r_start; sack_rxmit = 1; sendalot = 0; KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), - ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", - __func__, __LINE__, - rsm->r_start, tp->snd_una, tp, rack, rsm)); + ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", + __func__, __LINE__, + rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; - if (len >= ctf_fixed_maxseg(tp)) { - len = ctf_fixed_maxseg(tp); - } + if (len >= segsiz) + len = segsiz; } else if ((rack->rc_in_persist == 0) && - ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { - int maxseg; - - maxseg = ctf_fixed_maxseg(tp); + ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { + /* We have a retransmit that takes precedence */ + rsm->r_flags &= ~RACK_TLP; if ((!IN_RECOVERY(tp->t_flags)) && ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { /* Enter recovery if not induced by a time-out */ @@ -8347,50 +12239,79 @@ again: * When we enter recovery we need to assure we send * one packet. */ - rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); - rack_log_to_prr(rack, 13); + if (rack->rack_no_prr == 0) { + rack->r_ctl.rc_prr_sndcnt = segsiz; + rack_log_to_prr(rack, 13, 0); + } } #ifdef INVARIANTS if (SEQ_LT(rsm->r_start, tp->snd_una)) { panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", - tp, rack, rsm, rsm->r_start, tp->snd_una); + tp, rack, rsm, rsm->r_start, tp->snd_una); } #endif len = rsm->r_end - rsm->r_start; KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), - ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", - __func__, __LINE__, - rsm->r_start, tp->snd_una, tp, rack, rsm)); + ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", + __func__, __LINE__, + rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; /* Can we send it within the PRR boundary? */ - if ((rack->use_rack_cheat == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) { - /* It does not fit */ - if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) && - (rack->r_ctl.rc_prr_sndcnt < maxseg)) { - /* - * prr is less than a segment, we - * have more acks due in besides - * what we need to resend. Lets not send - * to avoid sending small pieces of - * what we need to retransmit. - */ - len = 0; - goto just_return_nolock; + if (rack->rack_no_prr == 0) { + if ((rack->use_rack_rr == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) { + /* It does not fit */ + if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) && + (rack->r_ctl.rc_prr_sndcnt < segsiz)) { + /* + * prr is less than a segment, we + * have more acks due in besides + * what we need to resend. Lets not send + * to avoid sending small pieces of + * what we need to retransmit. + */ + len = 0; + goto just_return_nolock; + } + len = rack->r_ctl.rc_prr_sndcnt; } - len = rack->r_ctl.rc_prr_sndcnt; } sendalot = 0; - if (len >= maxseg) { - len = maxseg; - } + if (len >= segsiz) + len = segsiz; if (len > 0) { sub_from_prr = 1; sack_rxmit = 1; KMOD_TCPSTAT_INC(tcps_sack_rexmits); KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, - min(len, ctf_fixed_maxseg(tp))); + min(len, segsiz)); counter_u64_add(rack_rtm_prr_retran, 1); } + } else if (rack->r_ctl.rc_tlpsend) { + /* Tail loss probe */ + long cwin; + long tlen; + + doing_tlp = 1; + /* + * Check if we can do a TLP with a RACK'd packet + * this can happen if we are not doing the rack + * cheat and we skipped to a TLP and it + * went off. + */ + rsm = rack->r_ctl.rc_tlpsend; + rsm->r_flags |= RACK_TLP; + rack->r_ctl.rc_tlpsend = NULL; + sack_rxmit = 1; + tlen = rsm->r_end - rsm->r_start; + if (tlen > segsiz) + tlen = segsiz; + KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), + ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", + __func__, __LINE__, + rsm->r_start, tp->snd_una, tp, rack, rsm)); + sb_offset = rsm->r_start - tp->snd_una; + cwin = min(tp->snd_wnd, tlen); + len = cwin; } /* * Enforce a connection sendmap count limit if set @@ -8438,43 +12359,6 @@ again: prefetch_rsm = 1; } SOCKBUF_LOCK(sb); - /* - * If in persist timeout with window of 0, send 1 byte. Otherwise, - * if window is small but nonzero and time TF_SENTFIN expired, we - * will send what we can and go to transmit state. - */ - if (tp->t_flags & TF_FORCEDATA) { - if (sendwin == 0) { - /* - * If we still have some data to send, then clear - * the FIN bit. Usually this would happen below - * when it realizes that we aren't sending all the - * data. However, if we have exactly 1 byte of - * unsent data, then it won't clear the FIN bit - * below, and if we are in persist state, we wind up - * sending the packet without recording that we sent - * the FIN bit. - * - * We can't just blindly clear the FIN bit, because - * if we don't have any more data to send then the - * probe will be the FIN itself. - */ - if (sb_offset < sbused(sb)) - flags &= ~TH_FIN; - sendwin = 1; - } else { - if ((rack->rc_in_persist != 0) && - (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), - rack->r_ctl.rc_pace_min_segs))) - rack_exit_persist(tp, rack); - /* - * If we are dropping persist mode then we need to - * correct snd_nxt/snd_max and off. - */ - tp->snd_nxt = tp->snd_max; - sb_offset = tp->snd_nxt - tp->snd_una; - } - } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a @@ -8490,7 +12374,7 @@ again: * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ - if (sack_rxmit == 0) { + if ((sack_rxmit == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { uint32_t avail; avail = sbavail(sb); @@ -8498,7 +12382,7 @@ again: sb_offset = tp->snd_nxt - tp->snd_una; else sb_offset = 0; - if (IN_RECOVERY(tp->t_flags) == 0) { + if ((IN_RECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { if (rack->r_ctl.rc_tlp_new_data) { /* TLP is forcing out new data */ if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { @@ -8510,21 +12394,17 @@ again: len = rack->r_ctl.rc_tlp_new_data; rack->r_ctl.rc_tlp_new_data = 0; new_data_tlp = doing_tlp = 1; - } else { - if (sendwin > avail) { - /* use the available */ - if (avail > sb_offset) { - len = (int32_t)(avail - sb_offset); - } else { - len = 0; - } - } else { - if (sendwin > sb_offset) { - len = (int32_t)(sendwin - sb_offset); - } else { - len = 0; - } - } + } else + len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); + if (IN_RECOVERY(tp->t_flags) && (len > segsiz)) { + /* + * For prr=off, we need to send only 1 MSS + * at a time. We do this because another sack could + * be arriving that causes us to send retransmits and + * we don't want to be on a long pace due to a larger send + * that keeps us from sending out the retransmit. + */ + len = segsiz; } } else { uint32_t outstanding; @@ -8539,11 +12419,13 @@ again: if (tp->snd_wnd > outstanding) { len = tp->snd_wnd - outstanding; /* Check to see if we have the data */ - if (((sb_offset + len) > avail) && - (avail > sb_offset)) - len = avail - sb_offset; - else - len = 0; + if ((sb_offset + len) > avail) { + /* It does not all fit */ + if (avail > sb_offset) + len = avail - sb_offset; + else + len = 0; + } } else len = 0; } else if (avail > sb_offset) @@ -8558,7 +12440,7 @@ again: counter_u64_add(rack_rtm_prr_newdata, 1); } } - if (len > ctf_fixed_maxseg(tp)) { + if (len > segsiz) { /* * We should never send more than a MSS when * retransmitting or sending new data in prr @@ -8567,8 +12449,8 @@ again: * let us send a lot as well :-) */ if (rack->r_ctl.rc_prr_sendalot == 0) - len = ctf_fixed_maxseg(tp); - } else if (len < ctf_fixed_maxseg(tp)) { + len = segsiz; + } else if (len < segsiz) { /* * Do we send any? The idea here is if the * send empty's the socket buffer we want to @@ -8584,6 +12466,17 @@ again: } } } + } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { + /* + * If you have not established + * and are not doing FAST OPEN + * no data please. + */ + if ((sack_rxmit == 0) && + (!IS_FASTOPEN(tp->t_flags))){ + len = 0; + sb_offset = 0; + } } if (prefetch_so_done == 0) { kern_prefetch(so, &prefetch_so_done); @@ -8596,8 +12489,6 @@ again: */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { - if (tp->t_state != TCPS_SYN_RECEIVED) - flags &= ~TH_SYN; /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. @@ -8605,7 +12496,6 @@ again: if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; - sb_offset--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. This @@ -8668,19 +12558,21 @@ again: ((doing_tlp == 0) || (new_data_tlp == 1)) && (len < rack->r_ctl.rc_pace_max_segs)) { /* - * We are not sending a full segment for + * We are not sending a maximum sized segment for * some reason. Should we not send anything (think * sws or persists)? */ - if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && + if ((tp->snd_wnd < min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), minseg)) && (TCPS_HAVEESTABLISHED(tp->t_state)) && + (len < minseg) && (len < (int)(sbavail(sb) - sb_offset))) { /* * Here the rwnd is less than - * the pacing size, this is not a retransmit, + * the minimum pacing size, this is not a retransmit, * we are established and * the send is not the last in the socket buffer - * we send nothing, and may enter persists. + * we send nothing, and we may enter persists + * if nothing is outstanding. */ len = 0; if (tp->snd_max == tp->snd_una) { @@ -8691,10 +12583,10 @@ again: rack_enter_persist(tp, rack, cts); tp->snd_nxt = tp->snd_una; } - } else if ((tp->snd_cwnd >= max(rack->r_ctl.rc_pace_min_segs, (maxseg * 4))) && - (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) && + } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && + (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && (len < (int)(sbavail(sb) - sb_offset)) && - (len < rack->r_ctl.rc_pace_min_segs)) { + (len < minseg)) { /* * Here we are not retransmitting, and * the cwnd is not so small that we could @@ -8707,8 +12599,8 @@ again: */ len = 0; } else if (((tp->snd_wnd - ctf_outstanding(tp)) < - min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && - (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) && + min((rack->r_ctl.rc_high_rwnd/2), minseg)) && + (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && (len < (int)(sbavail(sb) - sb_offset)) && (TCPS_HAVEESTABLISHED(tp->t_state))) { /* @@ -8724,7 +12616,7 @@ again: } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); - tcp_sndbuf_autoscale(tp, so, sendwin); + tcp_sndbuf_autoscale(tp, so, min(tp->snd_wnd, cwnd_to_use)); /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). @@ -8749,7 +12641,7 @@ again: #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - - offsetof(struct ipoption, ipopt_list); + offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) @@ -8765,15 +12657,15 @@ again: #endif #endif /* INET6 */ #ifdef INET - if (IPSEC_ENABLED(ipv4)) - ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); + if (IPSEC_ENABLED(ipv4)) + ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); #endif /* INET */ #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif - if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > ctf_fixed_maxseg(tp) && + if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && (tp->t_port == 0) && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && @@ -8795,7 +12687,7 @@ again: flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + - sbused(sb))) + sbused(sb))) flags &= ~TH_FIN; } } @@ -8812,8 +12704,7 @@ again: * limited the window size) - we need to retransmit */ if (len) { - if (len >= ctf_fixed_maxseg(tp)) { - pass = 1; + if (len >= segsiz) { goto send; } /* @@ -8824,16 +12715,13 @@ again: */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && - ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && + ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && (tp->t_flags & TF_NOPUSH) == 0) { pass = 2; goto send; } - if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */ - pass = 3; - goto send; - } if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ + pass = 22; goto send; } if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { @@ -8848,6 +12736,18 @@ again: pass = 6; goto send; } + if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && + (ctf_outstanding(tp) < (segsiz * 2))) { + /* + * We have less than two MSS outstanding (delayed ack) + * and our rwnd will not let us send a full sized + * MSS. Lets go ahead and let this small segment + * out because we want to try to have at least two + * packets inflight to not be caught by delayed ack. + */ + pass = 12; + goto send; + } } /* * Sending of standalone window updates. @@ -8902,15 +12802,17 @@ again: if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) goto dontupdate; - if (adv >= (int32_t)(2 * ctf_fixed_maxseg(tp)) && + if (adv >= (int32_t)(2 * segsiz) && (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || - recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || - so->so_rcv.sb_hiwat <= 8 * ctf_fixed_maxseg(tp))) { + recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || + so->so_rcv.sb_hiwat <= 8 * segsiz)) { pass = 7; goto send; } - if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) + if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { + pass = 23; goto send; + } } dontupdate: @@ -8926,10 +12828,6 @@ dontupdate: pass = 9; goto send; } - if (SEQ_GT(tp->snd_up, tp->snd_una)) { - pass = 10; - goto send; - } /* * If our state indicates that FIN should be sent and we have not * yet done so, then we need to send. @@ -8945,32 +12843,189 @@ dontupdate: just_return: SOCKBUF_UNLOCK(sb); just_return_nolock: - if (tot_len_this_send == 0) - counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); - if (slot) { - /* set the rack tcb into the slot N */ - counter_u64_add(rack_paced_segments, 1); - } else if (tot_len_this_send) { - counter_u64_add(rack_unpaced_segments, 1); + { + int app_limited = CTF_JR_SENT_DATA; + + if (tot_len_this_send > 0) { + /* Make sure snd_nxt is up to max */ + if (SEQ_GT(tp->snd_max, tp->snd_nxt)) + tp->snd_nxt = tp->snd_max; + slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); + } else { + int end_window = 0; + uint32_t seq = tp->gput_ack; + + rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if (rsm) { + /* + * Mark the last sent that we just-returned (hinting + * that delayed ack may play a role in any rtt measurement). + */ + rsm->r_just_ret = 1; + } + counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); + rack->r_ctl.rc_agg_delayed = 0; + rack->r_early = 0; + rack->r_late = 0; + rack->r_ctl.rc_agg_early = 0; + if ((ctf_outstanding(tp) + + min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), + minseg)) >= tp->snd_wnd) { + /* We are limited by the rwnd */ + app_limited = CTF_JR_RWND_LIMITED; + } else if (ctf_outstanding(tp) >= sbavail(sb)) { + /* We are limited by whats available -- app limited */ + app_limited = CTF_JR_APP_LIMITED; + } else if ((idle == 0) && + ((tp->t_flags & TF_NODELAY) == 0) && + ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && + (len < segsiz)) { + /* + * No delay is not on and the + * user is sending less than 1MSS. This + * brings out SWS avoidance so we + * don't send. Another app-limited case. + */ + app_limited = CTF_JR_APP_LIMITED; + } else if (tp->t_flags & TF_NOPUSH) { + /* + * The user has requested no push of + * the last segment and we are + * at the last segment. Another app + * limited case. + */ + app_limited = CTF_JR_APP_LIMITED; + } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { + /* Its the cwnd */ + app_limited = CTF_JR_CWND_LIMITED; + } else if (rack->rc_in_persist == 1) { + /* We are in persists */ + app_limited = CTF_JR_PERSISTS; + } else if (IN_RECOVERY(tp->t_flags) && + (rack->rack_no_prr == 0) && + (rack->r_ctl.rc_prr_sndcnt < segsiz)) { + app_limited = CTF_JR_PRR; + } else { + /* Now why here are we not sending? */ +#ifdef NOW +#ifdef INVARIANTS + panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); +#endif +#endif + app_limited = CTF_JR_ASSESSING; + } + /* + * App limited in some fashion, for our pacing GP + * measurements we don't want any gap (even cwnd). + * Close down the measurement window. + */ + if (rack_cwnd_block_ends_measure && + ((app_limited == CTF_JR_CWND_LIMITED) || + (app_limited == CTF_JR_PRR))) { + /* + * The reason we are not sending is + * the cwnd (or prr). We have been configured + * to end the measurement window in + * this case. + */ + end_window = 1; + } else if (app_limited == CTF_JR_PERSISTS) { + /* + * We never end the measurement window + * in persists, though in theory we + * should be only entering after everything + * is acknowledged (so we will probably + * never come here). + */ + end_window = 0; + } else if (rack_rwnd_block_ends_measure && + (app_limited == CTF_JR_RWND_LIMITED)) { + /* + * We are rwnd limited and have been + * configured to end the measurement + * window in this case. + */ + end_window = 1; + } else if (app_limited == CTF_JR_APP_LIMITED) { + /* + * A true application limited period, we have + * ran out of data. + */ + end_window = 1; + } else if (app_limited == CTF_JR_ASSESSING) { + /* + * In the assessing case we hit the end of + * the if/else and had no known reason + * This will panic us under invariants.. + * + * If we get this out in logs we need to + * investagate which reason we missed. + */ + end_window = 1; + } + if (end_window) { + uint8_t log = 0; + + if ((tp->t_flags & TF_GPUTINPROG) && + SEQ_GT(tp->gput_ack, tp->snd_max)) { + /* Mark the last packet has app limited */ + tp->gput_ack = tp->snd_max; + log = 1; + } + rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { + if (rack->r_ctl.rc_app_limited_cnt == 0) + rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; + else { + /* + * Go out to the end app limited and mark + * this new one as next and move the end_appl up + * to this guy. + */ + if (rack->r_ctl.rc_end_appl) + rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; + rack->r_ctl.rc_end_appl = rsm; + } + rsm->r_flags |= RACK_APP_LIMITED; + rack->r_ctl.rc_app_limited_cnt++; + } + if (log) + rack_log_pacing_delay_calc(rack, + rack->r_ctl.rc_app_limited_cnt, seq, + tp->gput_ack, 0, 0, 4, __LINE__, NULL); + } + } + if (slot) { + /* set the rack tcb into the slot N */ + counter_u64_add(rack_paced_segments, 1); + } else if (tot_len_this_send) { + counter_u64_add(rack_unpaced_segments, 1); + } + /* Check if we need to go into persists or not */ + if ((rack->rc_in_persist == 0) && + (tp->snd_max == tp->snd_una) && + TCPS_HAVEESTABLISHED(tp->t_state) && + sbavail(sb) && + (sbavail(sb) > tp->snd_wnd) && + (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { + /* Yes lets make sure to move to persist before timer-start */ + rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); + } + rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); + rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); } - /* Check if we need to go into persists or not */ - if ((rack->rc_in_persist == 0) && - (tp->snd_max == tp->snd_una) && - TCPS_HAVEESTABLISHED(tp->t_state) && - sbavail(&tp->t_inpcb->inp_socket->so_snd) && - (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd) && - (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs))) { - /* Yes lets make sure to move to persist before timer-start */ - rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); +#ifdef NETFLIX_SHARED_CWND + if ((sbavail(sb) == 0) && + rack->r_ctl.rc_scw) { + tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); + rack->rack_scwnd_is_idle = 1; } - rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); - rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); - tp->t_flags &= ~TF_FORCEDATA; +#endif return (0); send: if ((flags & TH_FIN) && - sbavail(&tp->t_inpcb->inp_socket->so_snd)) { + sbavail(sb)) { /* * We do not transmit a FIN * with data outstanding. We @@ -8979,19 +13034,15 @@ send: */ flags &= ~TH_FIN; } - if (doing_tlp == 0) { - /* - * Data not a TLP, and its not the rxt firing. If it is the - * rxt firing, we want to leave the tlp_in_progress flag on - * so we don't send another TLP. It has to be a rack timer - * or normal send (response to acked data) to clear the tlp - * in progress flag. - */ - rack->rc_tlp_in_progress = 0; + /* Enforce stack imposed max seg size if we have one */ + if (rack->r_ctl.rc_pace_max_segs && + (len > rack->r_ctl.rc_pace_max_segs)) { + mark = 1; + len = rack->r_ctl.rc_pace_max_segs; } SOCKBUF_LOCK_ASSERT(sb); if (len > 0) { - if (len >= ctf_fixed_maxseg(tp)) + if (len >= segsiz) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; @@ -9040,14 +13091,14 @@ send: if (tp->t_state == TCPS_SYN_RECEIVED) { to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = - (u_int8_t *)&tp->t_tfo_cookie.server; + (u_int8_t *)&tp->t_tfo_cookie.server; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; } else if (tp->t_state == TCPS_SYN_SENT) { to.to_tfo_len = - tp->t_tfo_client_cookie_len; + tp->t_tfo_client_cookie_len; to.to_tfo_cookie = - tp->t_tfo_cookie.client; + tp->t_tfo_cookie.client; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; /* @@ -9081,7 +13132,7 @@ send: if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && - tp->rcv_numsacks > 0) { + tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; @@ -9117,11 +13168,11 @@ send: ipoptlen = ip6_optlen(tp->t_inpcb); else #endif - if (tp->t_inpcb->inp_options) - ipoptlen = tp->t_inpcb->inp_options->m_len - - offsetof(struct ipoption, ipopt_list); - else - ipoptlen = 0; + if (tp->t_inpcb->inp_options) + ipoptlen = tp->t_inpcb->inp_options->m_len - + offsetof(struct ipoption, ipopt_list); + else + ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif @@ -9148,7 +13199,7 @@ send: if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; KASSERT(ipoptlen == 0, - ("%s: TSO can't do IP options", __func__)); + ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload @@ -9157,12 +13208,13 @@ send: if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = (if_hw_tsomax - hdrlen - - max_linkhdr); + max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; + mark = 2; } } /* @@ -9174,17 +13226,16 @@ send: (hw_tls == 0)) { moff = len % (u_int)max_len; if (moff != 0) { + mark = 3; len -= moff; - sendalot = 1; } } /* * In case there are too many small fragments don't * use TSO: */ - if (len <= maxseg) { - len = max_len; - sendalot = 1; + if (len <= segsiz) { + mark = 4; tso = 0; } /* @@ -9193,10 +13244,11 @@ send: * implementations to clear the FIN flag on all but * the last segment. */ - if (tp->t_flags & TF_NEEDFIN) - sendalot = 1; - + if (tp->t_flags & TF_NEEDFIN) { + sendalot = 4; + } } else { + mark = 5; if (optlen + ipoptlen >= tp->t_maxseg) { /* * Since we don't have enough space to put @@ -9212,19 +13264,21 @@ send: goto out; } len = tp->t_maxseg - optlen - ipoptlen; - sendalot = 1; + sendalot = 5; } - } else + } else { tso = 0; + mark = 6; + } KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, - ("%s: len > IP_MAXPACKET", __func__)); + ("%s: len > IP_MAXPACKET", __func__)); #ifdef DIAGNOSTIC #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else - if (max_linkhdr + hdrlen > MHLEN) + if (max_linkhdr + hdrlen > MHLEN) #endif - panic("tcphdr too big"); + panic("tcphdr too big"); #endif /* @@ -9250,16 +13304,17 @@ send: uint32_t max_val; uint32_t moff; - if (rack->rc_pace_max_segs) - max_val = rack->rc_pace_max_segs * ctf_fixed_maxseg(tp); + if (rack->r_ctl.rc_pace_max_segs) + max_val = rack->r_ctl.rc_pace_max_segs; + else if (rack->rc_user_set_max_segs) + max_val = rack->rc_user_set_max_segs * segsiz; else max_val = len; - if (rack->r_ctl.rc_pace_max_segs < max_val) - max_val = rack->r_ctl.rc_pace_max_segs; /* * We allow a limit on sending with hptsi. */ if (len > max_val) { + mark = 7; len = max_val; } #ifdef INET6 @@ -9285,7 +13340,7 @@ send: mb = sbsndptr_noadv(sb, sb_offset, &moff); if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { m_copydata(mb, moff, (int)len, - mtod(m, caddr_t)+hdrlen); + mtod(m, caddr_t)+hdrlen); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) sbsndptr_adv(sb, mb, len); m->m_len += len; @@ -9301,8 +13356,8 @@ send: tp, #endif mb, moff, &len, - if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, - ((rsm == NULL) ? hw_tls : 0) + if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, + ((rsm == NULL) ? hw_tls : 0) #ifdef NETFLIX_COPY_ARGS , &filled_all #endif @@ -9324,17 +13379,7 @@ send: goto out; } } - if ((tp->t_flags & TF_FORCEDATA) && len == 1) { - KMOD_TCPSTAT_INC(tcps_sndprobe); -#ifdef STATS - if (SEQ_LT(tp->snd_nxt, tp->snd_max)) - stats_voi_update_abs_u32(tp->t_stats, - VOI_TCP_RETXPB, len); - else - stats_voi_update_abs_u64(tp->t_stats, - VOI_TCP_TXPB, len); -#endif - } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { + if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { if (rsm && (rsm->r_flags & RACK_TLP)) { /* * TLP should not count in retran count, but @@ -9349,14 +13394,14 @@ send: } #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, - len); + len); #endif } else { KMOD_TCPSTAT_INC(tcps_sndpack); KMOD_TCPSTAT_ADD(tcps_sndbyte, len); #ifdef STATS stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, - len); + len); #endif } /* @@ -9369,21 +13414,6 @@ send: !(flags & TH_SYN)) flags |= TH_PUSH; - /* - * Are we doing pacing, if so we must calculate the slot. We - * only do hptsi in ESTABLISHED and with no RESET being - * sent where we have data to send. - */ - if (((tp->t_state == TCPS_ESTABLISHED) || - (tp->t_state == TCPS_CLOSE_WAIT) || - ((tp->t_state == TCPS_FIN_WAIT_1) && - ((tp->t_flags & TF_SENTFIN) == 0) && - ((flags & TH_FIN) == 0))) && - ((flags & TH_RST) == 0)) { - /* Get our pacing rate */ - tot_len_this_send += len; - slot = rack_get_pacing_delay(rack, tp, tot_len_this_send); - } SOCKBUF_UNLOCK(sb); } else { SOCKBUF_UNLOCK(sb); @@ -9391,8 +13421,6 @@ send: KMOD_TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN | TH_FIN | TH_RST)) KMOD_TCPSTAT_INC(tcps_sndctrl); - else if (SEQ_GT(tp->snd_up, tp->snd_una)) - KMOD_TCPSTAT_INC(tcps_sndurg); else KMOD_TCPSTAT_INC(tcps_sndwinup); @@ -9484,11 +13512,10 @@ send: /* * If the peer has ECN, mark data packets with ECN capable * transmission (ECT). Ignore pure ack packets, - * retransmissions and window probes. + * retransmissions. */ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && - (sack_rxmit == 0) && - !((tp->t_flags & TF_FORCEDATA) && len == 1)) { + (sack_rxmit == 0)) { #ifdef INET6 if (isipv6) ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); @@ -9555,7 +13582,7 @@ send: recwin = 0; } else { if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && - recwin < (long)ctf_fixed_maxseg(tp)) + recwin < (long)segsiz) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) @@ -9571,7 +13598,7 @@ send: */ if (flags & TH_SYN) th->th_win = htons((u_short) - (min(sbspace(&so->so_rcv), TCP_MAXWIN))); + (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else { /* Avoid shrinking window with window scaling. */ recwin = roundup2(recwin, 1 << tp->rcv_scale); @@ -9590,17 +13617,7 @@ send: tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; - if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { - th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); - th->th_flags |= TH_URG; - } else - /* - * If no urgent pointer to send, then we pull the urgent - * pointer to the left edge of the send window so that it - * doesn't drift into the send window on sequence number - * wraparound. - */ - tp->snd_up = tp->snd_una; /* drag it along */ + tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { @@ -9611,7 +13628,7 @@ send: * mbuf's data, calculate offset and use it. */ if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, - (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { + (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { /* * Do not send segment if the calculation of MD5 * digest has failed. @@ -9642,8 +13659,8 @@ send: m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, - sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, - 0); + sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, + 0); } } #endif @@ -9656,19 +13673,19 @@ send: m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in_pseudo(ip->ip_src.s_addr, - ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); + ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, - ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + - IPPROTO_TCP + len + optlen)); + ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + + IPPROTO_TCP + len + optlen)); } /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, - ("%s: IP version incorrect: %d", __func__, ip->ip_v)); + ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* @@ -9678,13 +13695,13 @@ send: */ if (tso || force_tso) { KASSERT(force_tso || len > tp->t_maxseg - optlen, - ("%s: len <= tso_segsz", __func__)); + ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } KASSERT(len + hdrlen == m_length(m, NULL), - ("%s: mbuf chain different than expected: %d + %u != %u", - __func__, len, hdrlen, m_length(m, NULL))); + ("%s: mbuf chain different than expected: %d + %u != %u", + __func__, len, hdrlen, m_length(m, NULL))); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ @@ -9703,7 +13720,7 @@ send: { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + - * (th->th_off << 2) */ ); + * (th->th_off << 2) */ ); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 @@ -9721,7 +13738,10 @@ send: memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; - log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; + if (rack->rack_no_prr) + log.u_bbr.flex1 = 0; + else + log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; log.u_bbr.flex4 = orig_len; @@ -9729,16 +13749,27 @@ send: log.u_bbr.flex5 = 0x80000000; else log.u_bbr.flex5 = 0; + /* Save off the early/late values */ + log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; + log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; + log.u_bbr.bw_inuse = rack_get_bw(rack); if (rsm || sack_rxmit) { - log.u_bbr.flex8 = 1; + if (doing_tlp) + log.u_bbr.flex8 = 2; + else + log.u_bbr.flex8 = 1; } else { log.u_bbr.flex8 = 0; } + log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); + log.u_bbr.flex7 = mark; log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.lt_epoch = cwnd_to_use; + log.u_bbr.delivered = sendalot; lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, - len, &log, false, NULL, NULL, 0, &tv); + len, &log, false, NULL, NULL, 0, &tv); } else lgb = NULL; @@ -9778,10 +13809,10 @@ send: TCP_PROBE5(send, NULL, tp, ip6, tp, th); /* TODO: IPv6 IP6TOS_ECT bit on */ - error = ip6_output(m, tp->t_inpcb->in6p_outputopts, - &inp->inp_route6, - ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), - NULL, NULL, inp); + error = ip6_output(m, inp->in6p_outputopts, + &inp->inp_route6, + ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), + NULL, NULL, inp); if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) mtu = inp->inp_route6.ro_nh->nh_mtu; @@ -9821,9 +13852,9 @@ send: TCP_PROBE5(send, NULL, tp, ip, tp, th); - error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route, - ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, - inp); + error = ip_output(m, inp->inp_options, &inp->inp_route, + ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, + inp); if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) mtu = inp->inp_route.ro_nh->nh_mtu; } @@ -9839,10 +13870,24 @@ out: * retransmit. In persist state, just set snd_max. */ if (error == 0) { + rack->forced_ack = 0; /* If we send something zap the FA flag */ + if (rsm && (doing_tlp == 0)) { + /* Set we retransmitted */ + rack->rc_gp_saw_rec = 1; + } else { + if (cwnd_to_use > tp->snd_ssthresh) { + /* Set we sent in CA */ + rack->rc_gp_saw_ca = 1; + } else { + /* Set we sent in SS */ + rack->rc_gp_saw_ss = 1; + } + } if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) tcp_clean_dsack_blocks(tp); + tot_len_this_send += len; if (len == 0) counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); else if (len == 1) { @@ -9850,7 +13895,7 @@ out: } else if (len > 1) { int idx; - idx = (len / ctf_fixed_maxseg(tp)) + 3; + idx = (len / segsiz) + 3; if (idx >= TCP_MSS_ACCT_ATIMER) counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); else @@ -9867,13 +13912,13 @@ out: } else if (doing_tlp) { counter_u64_add(rack_tls_tlp, 1); rack_log_type_hrdwtso(tp, rack, len, 3, orig_len, 1); - } else if ( (ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > sbavail(sb)) { + } else if ( (ctf_outstanding(tp) + minseg) > sbavail(sb)) { counter_u64_add(rack_tls_app, 1); rack_log_type_hrdwtso(tp, rack, len, 4, orig_len, 1); - } else if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) + rack->r_ctl.rc_pace_min_segs) > tp->snd_cwnd) { + } else if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) + minseg) > cwnd_to_use) { counter_u64_add(rack_tls_cwnd, 1); rack_log_type_hrdwtso(tp, rack, len, 5, orig_len, 1); - } else if ((ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > tp->snd_wnd) { + } else if ((ctf_outstanding(tp) + minseg) > tp->snd_wnd) { counter_u64_add(rack_tls_rwnd, 1); rack_log_type_hrdwtso(tp, rack, len, 6, orig_len, 1); } else { @@ -9883,30 +13928,53 @@ out: } } } - if (sub_from_prr && (error == 0)) { - if (rack->r_ctl.rc_prr_sndcnt >= len) - rack->r_ctl.rc_prr_sndcnt -= len; - else - rack->r_ctl.rc_prr_sndcnt = 0; - } + if (rack->rack_no_prr == 0) { + if (sub_from_prr && (error == 0)) { + if (rack->r_ctl.rc_prr_sndcnt >= len) + rack->r_ctl.rc_prr_sndcnt -= len; + else + rack->r_ctl.rc_prr_sndcnt = 0; + } + } sub_from_prr = 0; rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, - pass, rsm); + pass, rsm, us_cts); if ((error == 0) && (len > 0) && (tp->snd_una == tp->snd_max)) rack->r_ctl.rc_tlp_rxt_last_time = cts; - if ((tp->t_flags & TF_FORCEDATA) == 0 || - (rack->rc_in_persist == 0)) { + /* Now are we in persists? */ + if (rack->rc_in_persist == 0) { tcp_seq startseq = tp->snd_nxt; + /* Track our lost count */ + if (rsm && (doing_tlp == 0)) + rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; /* * Advance snd_nxt over sequence space of this segment. */ if (error) /* We don't log or do anything with errors */ goto nomore; - + if (doing_tlp == 0) { + if (rsm == NULL) { + /* + * Not a retransmission of some + * sort, new data is going out so + * clear our TLP count and flag. + */ + rack->rc_tlp_in_progress = 0; + rack->r_ctl.rc_tlp_cnt_out = 0; + } + } else { + /* + * We have just sent a TLP, mark that it is true + * and make sure our in progress is set so we + * continue to check the count. + */ + rack->rc_tlp_in_progress = 1; + rack->r_ctl.rc_tlp_cnt_out++; + } if (flags & (TH_SYN | TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; @@ -9941,15 +14009,9 @@ out: tp->t_rtseq = startseq; KMOD_TCPSTAT_INC(tcps_segstimed); } -#ifdef STATS - if (!(tp->t_flags & TF_GPUTINPROG) && len) { - tp->t_flags |= TF_GPUTINPROG; - tp->gput_seq = startseq; - tp->gput_ack = startseq + - ulmin(sbavail(sb) - sb_offset, sendwin); - tp->gput_ts = tcp_ts_getticks(); - } -#endif + if (len && + ((tp->t_flags & TF_GPUTINPROG) == 0)) + rack_start_gp_measurement(tp, rack, startseq, sb_offset); } } else { /* @@ -9982,6 +14044,10 @@ out: } nomore: if (error) { + rack->r_ctl.rc_agg_delayed = 0; + rack->r_early = 0; + rack->r_late = 0; + rack->r_ctl.rc_agg_early = 0; SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ /* * Failures do not advance the seq counter above. For the @@ -9995,7 +14061,6 @@ nomore: sendalot = 0; switch (error) { case EPERM: - tp->t_flags &= ~TF_FORCEDATA; tp->t_softerror = error; return (error); case ENOBUFS: @@ -10004,14 +14069,14 @@ nomore: * Pace us right away to retry in a some * time */ - slot = 1 + rack->rc_enobuf; - if (rack->rc_enobuf < 255) + slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); + if (rack->rc_enobuf < 126) rack->rc_enobuf++; - if (slot > (rack->rc_rack_rtt / 2)) { - slot = rack->rc_rack_rtt / 2; + if (slot > ((rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC)) { + slot = (rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC; } - if (slot < 10) - slot = 10; + if (slot < (10 * HPTS_USEC_IN_MSEC)) + slot = 10 * HPTS_USEC_IN_MSEC; } counter_u64_add(rack_saw_enobuf, 1); error = 0; @@ -10031,9 +14096,8 @@ nomore: tcp_mss_update(tp, -1, mtu, NULL, NULL); goto again; } - slot = 10; + slot = 10 * HPTS_USEC_IN_MSEC; rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); - tp->t_flags &= ~TF_FORCEDATA; return (error); case ENETUNREACH: counter_u64_add(rack_saw_enetunreach, 1); @@ -10045,9 +14109,8 @@ nomore: } /* FALLTHROUGH */ default: - slot = 10; + slot = 10 * HPTS_USEC_IN_MSEC; rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); - tp->t_flags &= ~TF_FORCEDATA; return (error); } } else { @@ -10065,22 +14128,53 @@ nomore: tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); enobufs: - rack->r_tlp_running = 0; + /* Assure when we leave that snd_nxt will point to top */ + if (SEQ_GT(tp->snd_max, tp->snd_nxt)) + tp->snd_nxt = tp->snd_max; + if (sendalot) { + /* Do we need to turn off sendalot? */ + if (rack->r_ctl.rc_pace_max_segs && + (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) { + /* We hit our max. */ + sendalot = 0; + } else if ((rack->rc_user_set_max_segs) && + (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) { + /* We hit the user defined max */ + sendalot = 0; + } + } + if ((error == 0) && (flags & TH_FIN)) + tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); if (flags & TH_RST) { /* * We don't send again after sending a RST. */ slot = 0; sendalot = 0; - } - if (rsm && (slot == 0)) { + if (error == 0) + tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); + } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { /* - * Dup ack retransmission possibly, so - * lets assure we have at least min rack - * time, if its a rack resend then the rack - * to will also be set to this. + * Get our pacing rate, if an error + * occured in sending (ENOBUF) we would + * hit the else if with slot preset. Other + * errors return. */ - slot = rack->r_ctl.rc_min_to; + slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz); + } + if (rsm && + rack->use_rack_rr) { + /* Its a retransmit and we use the rack cheat? */ + if ((slot == 0) || + (rack->rc_always_pace == 0) || + (rack->r_rr_config == 1)) { + /* + * We have no pacing set or we + * are using old-style rack or + * we are overriden to use the old 1ms pacing. + */ + slot = rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC; + } } if (slot) { /* set the rack tcb into the slot N */ @@ -10089,16 +14183,25 @@ enobufs: if (len) counter_u64_add(rack_unpaced_segments, 1); sack_rxmit = 0; - tp->t_flags &= ~TF_FORCEDATA; goto again; } else if (len) { counter_u64_add(rack_unpaced_segments, 1); } - tp->t_flags &= ~TF_FORCEDATA; rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); return (error); } +static void +rack_update_seg(struct tcp_rack *rack) +{ + uint32_t orig_val; + + orig_val = rack->r_ctl.rc_pace_max_segs; + rack_set_pace_segments(rack->rc_tp, rack, __LINE__); + if (orig_val != rack->r_ctl.rc_pace_max_segs) + rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL); +} + /* * rack_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it @@ -10109,34 +14212,56 @@ static int rack_set_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) { - struct epoch_tracker et; + uint64_t val; int32_t error = 0, optval; + uint16_t ca, ss; + switch (sopt->sopt_name) { - case TCP_RACK_PROP_RATE: - case TCP_RACK_PROP: - case TCP_RACK_TLP_REDUCE: - case TCP_RACK_EARLY_RECOV: - case TCP_RACK_PACE_ALWAYS: + case TCP_RACK_PROP_RATE: /* URL:prop_rate */ + case TCP_RACK_PROP : /* URL:prop */ + case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ + case TCP_RACK_EARLY_RECOV: /* URL:early_recov */ + case TCP_RACK_PACE_REDUCE: /* Not used */ + /* Pacing related ones */ + case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ + case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ + case TCP_BBR_IWINTSO: /* URL:tso_iwin */ + case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ + case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ + case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ + case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ + case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ + case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ + case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ + case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ + case TCP_RACK_RR_CONF: /* URL:rrr_conf */ + case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ + /* End pacing related */ case TCP_DELACK: - case TCP_RACK_PACE_REDUCE: - case TCP_RACK_PACE_MAX_SEG: - case TCP_RACK_PRR_SENDALOT: - case TCP_RACK_MIN_TO: - case TCP_RACK_EARLY_SEG: - case TCP_RACK_REORD_THRESH: - case TCP_RACK_REORD_FADE: - case TCP_RACK_TLP_THRESH: - case TCP_RACK_PKT_DELAY: - case TCP_RACK_TLP_USE: - case TCP_RACK_TLP_INC_VAR: - case TCP_RACK_IDLE_REDUCE_HIGH: - case TCP_RACK_MIN_PACE: - case TCP_RACK_GP_INCREASE: - case TCP_BBR_RACK_RTT_USE: - case TCP_BBR_USE_RACK_CHEAT: - case TCP_RACK_DO_DETECTION: + case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ + case TCP_RACK_MIN_TO: /* URL:min_to */ + case TCP_RACK_EARLY_SEG: /* URL:early_seg */ + case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ + case TCP_RACK_REORD_FADE: /* URL:reord_fade */ + case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ + case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ + case TCP_RACK_TLP_USE: /* URL:tlp_use */ + case TCP_RACK_TLP_INC_VAR: /* URL:tlp_inc_var */ + case TCP_RACK_IDLE_REDUCE_HIGH: /* URL:idle_reduce_high */ + case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ + case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ + case TCP_RACK_DO_DETECTION: /* URL:detect */ + case TCP_NO_PRR: /* URL:noprr */ + case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ case TCP_DATA_AFTER_CLOSE: + case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ + case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ + case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ + case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ + case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ + case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ + case TCP_RACK_PROFILE: /* URL:profile */ break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); @@ -10154,6 +14279,132 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt, tp = intotcpcb(inp); rack = (struct tcp_rack *)tp->t_fb_ptr; switch (sopt->sopt_name) { + case TCP_RACK_PROFILE: + RACK_OPTS_INC(tcp_profile); + if (optval == 1) { + /* pace_always=1 */ + rack->rc_always_pace = 1; + tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; + /* scwnd=1 */ + rack->rack_enable_scwnd = 1; + /* dynamic=100 */ + rack->rc_gp_dyn_mul = 1; + rack->r_ctl.rack_per_of_gp_ca = 100; + /* rrr_conf=3 */ + rack->r_rr_config = 3; + /* npush=2 */ + rack->r_ctl.rc_no_push_at_mrtt = 2; + /* fillcw=1 */ + rack->rc_pace_to_cwnd = 1; + rack->rc_pace_fill_if_rttin_range = 0; + rack->rtt_limit_mul = 0; + /* noprr=1 */ + rack->rack_no_prr = 1; + /* lscwnd=1 */ + rack->r_limit_scw = 1; + } else if (optval == 2) { + /* pace_always=1 */ + rack->rc_always_pace = 1; + tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; + /* scwnd=1 */ + rack->rack_enable_scwnd = 1; + /* dynamic=100 */ + rack->rc_gp_dyn_mul = 1; + rack->r_ctl.rack_per_of_gp_ca = 100; + /* rrr_conf=3 */ + rack->r_rr_config = 3; + /* npush=2 */ + rack->r_ctl.rc_no_push_at_mrtt = 2; + /* fillcw=1 */ + rack->rc_pace_to_cwnd = 1; + rack->rc_pace_fill_if_rttin_range = 0; + rack->rtt_limit_mul = 0; + /* noprr=1 */ + rack->rack_no_prr = 1; + /* lscwnd=0 */ + rack->r_limit_scw = 0; + } + break; + case TCP_SHARED_CWND_TIME_LIMIT: + RACK_OPTS_INC(tcp_lscwnd); + if (optval) + rack->r_limit_scw = 1; + else + rack->r_limit_scw = 0; + break; + case TCP_RACK_PACE_TO_FILL: + RACK_OPTS_INC(tcp_fillcw); + if (optval == 0) + rack->rc_pace_to_cwnd = 0; + else + rack->rc_pace_to_cwnd = 1; + if ((optval >= rack_gp_rtt_maxmul) && + rack_gp_rtt_maxmul && + (optval < 0xf)) { + rack->rc_pace_fill_if_rttin_range = 1; + rack->rtt_limit_mul = optval; + } else { + rack->rc_pace_fill_if_rttin_range = 0; + rack->rtt_limit_mul = 0; + } + break; + case TCP_RACK_NO_PUSH_AT_MAX: + RACK_OPTS_INC(tcp_npush); + if (optval == 0) + rack->r_ctl.rc_no_push_at_mrtt = 0; + else if (optval < 0xff) + rack->r_ctl.rc_no_push_at_mrtt = optval; + else + error = EINVAL; + break; + case TCP_SHARED_CWND_ENABLE: + RACK_OPTS_INC(tcp_rack_scwnd); + if (optval == 0) + rack->rack_enable_scwnd = 0; + else + rack->rack_enable_scwnd = 1; + break; + case TCP_RACK_MBUF_QUEUE: + /* Now do we use the LRO mbuf-queue feature */ + RACK_OPTS_INC(tcp_rack_mbufq); + if (optval) + rack->r_mbuf_queue = 1; + else + rack->r_mbuf_queue = 0; + if (rack->r_mbuf_queue || rack->rc_always_pace) + tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; + else + tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; + break; + case TCP_RACK_NONRXT_CFG_RATE: + RACK_OPTS_INC(tcp_rack_cfg_rate); + if (optval == 0) + rack->rack_rec_nonrxt_use_cr = 0; + else + rack->rack_rec_nonrxt_use_cr = 1; + break; + case TCP_NO_PRR: + RACK_OPTS_INC(tcp_rack_noprr); + if (optval == 0) + rack->rack_no_prr = 0; + else + rack->rack_no_prr = 1; + break; + case TCP_TIMELY_DYN_ADJ: + RACK_OPTS_INC(tcp_timely_dyn); + if (optval == 0) + rack->rc_gp_dyn_mul = 0; + else { + rack->rc_gp_dyn_mul = 1; + if (optval >= 100) { + /* + * If the user sets something 100 or more + * its the gp_ca value. + */ + rack->r_ctl.rack_per_of_gp_ca = optval; + } + } + break; case TCP_RACK_DO_DETECTION: RACK_OPTS_INC(tcp_rack_do_detection); if (optval == 0) @@ -10192,29 +14443,219 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt, RACK_OPTS_INC(tcp_rack_early_recov); rack->r_ctl.rc_early_recovery = optval; break; + + /* Pacing related ones */ case TCP_RACK_PACE_ALWAYS: - /* Use the always pace method (bool) */ + /* + * zero is old rack method, 1 is new + * method using a pacing rate. + */ RACK_OPTS_INC(tcp_rack_pace_always); if (optval > 0) rack->rc_always_pace = 1; else rack->rc_always_pace = 0; - break; - case TCP_RACK_PACE_REDUCE: - /* RACK Hptsi reduction factor (divisor) */ - RACK_OPTS_INC(tcp_rack_pace_reduce); - if (optval) - /* Must be non-zero */ - rack->rc_pace_reduce = optval; + if (rack->r_mbuf_queue || rack->rc_always_pace) + tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; else + tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; + /* A rate may be set irate or other, if so set seg size */ + rack_update_seg(rack); + break; + case TCP_BBR_RACK_INIT_RATE: + RACK_OPTS_INC(tcp_initial_rate); + val = optval; + /* Change from kbits per second to bytes per second */ + val *= 1000; + val /= 8; + rack->r_ctl.init_rate = val; + if (rack->rc_init_win != rack_default_init_window) { + uint32_t win, snt; + + /* + * Options don't always get applied + * in the order you think. So in order + * to assure we update a cwnd we need + * to check and see if we are still + * where we should raise the cwnd. + */ + win = rc_init_window(rack); + if (SEQ_GT(tp->snd_max, tp->iss)) + snt = tp->snd_max - tp->iss; + else + snt = 0; + if ((snt < win) && + (tp->snd_cwnd < win)) + tp->snd_cwnd = win; + } + if (rack->rc_always_pace) + rack_update_seg(rack); + break; + case TCP_BBR_IWINTSO: + RACK_OPTS_INC(tcp_initial_win); + if (optval && (optval <= 0xff)) { + uint32_t win, snt; + + rack->rc_init_win = optval; + win = rc_init_window(rack); + if (SEQ_GT(tp->snd_max, tp->iss)) + snt = tp->snd_max - tp->iss; + else + snt = 0; + if ((snt < win) && + (tp->t_srtt | +#ifdef NETFLIX_PEAKRATE + tp->t_maxpeakrate | +#endif + rack->r_ctl.init_rate)) { + /* + * We are not past the initial window + * and we have some bases for pacing, + * so we need to possibly adjust up + * the cwnd. Note even if we don't set + * the cwnd, its still ok to raise the rc_init_win + * which can be used coming out of idle when we + * would have a rate. + */ + if (tp->snd_cwnd < win) + tp->snd_cwnd = win; + } + if (rack->rc_always_pace) + rack_update_seg(rack); + } else error = EINVAL; break; - case TCP_RACK_PACE_MAX_SEG: - /* Max segments in a pace */ - RACK_OPTS_INC(tcp_rack_max_seg); - rack->rc_pace_max_segs = optval; - rack_set_pace_segments(tp, rack); + case TCP_RACK_FORCE_MSEG: + RACK_OPTS_INC(tcp_rack_force_max_seg); + if (optval) + rack->rc_force_max_seg = 1; + else + rack->rc_force_max_seg = 0; break; + case TCP_RACK_PACE_MAX_SEG: + /* Max segments size in a pace in bytes */ + RACK_OPTS_INC(tcp_rack_max_seg); + rack->rc_user_set_max_segs = optval; + rack_set_pace_segments(tp, rack, __LINE__); + break; + case TCP_RACK_PACE_RATE_REC: + /* Set the fixed pacing rate in Bytes per second ca */ + RACK_OPTS_INC(tcp_rack_pace_rate_rec); + rack->r_ctl.rc_fixed_pacing_rate_rec = optval; + if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) + rack->r_ctl.rc_fixed_pacing_rate_ca = optval; + if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) + rack->r_ctl.rc_fixed_pacing_rate_ss = optval; + rack->use_fixed_rate = 1; + rack_log_pacing_delay_calc(rack, + rack->r_ctl.rc_fixed_pacing_rate_ss, + rack->r_ctl.rc_fixed_pacing_rate_ca, + rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, + __LINE__, NULL); + break; + + case TCP_RACK_PACE_RATE_SS: + /* Set the fixed pacing rate in Bytes per second ca */ + RACK_OPTS_INC(tcp_rack_pace_rate_ss); + rack->r_ctl.rc_fixed_pacing_rate_ss = optval; + if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) + rack->r_ctl.rc_fixed_pacing_rate_ca = optval; + if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) + rack->r_ctl.rc_fixed_pacing_rate_rec = optval; + rack->use_fixed_rate = 1; + rack_log_pacing_delay_calc(rack, + rack->r_ctl.rc_fixed_pacing_rate_ss, + rack->r_ctl.rc_fixed_pacing_rate_ca, + rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, + __LINE__, NULL); + break; + + case TCP_RACK_PACE_RATE_CA: + /* Set the fixed pacing rate in Bytes per second ca */ + RACK_OPTS_INC(tcp_rack_pace_rate_ca); + rack->r_ctl.rc_fixed_pacing_rate_ca = optval; + if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) + rack->r_ctl.rc_fixed_pacing_rate_ss = optval; + if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) + rack->r_ctl.rc_fixed_pacing_rate_rec = optval; + rack->use_fixed_rate = 1; + rack_log_pacing_delay_calc(rack, + rack->r_ctl.rc_fixed_pacing_rate_ss, + rack->r_ctl.rc_fixed_pacing_rate_ca, + rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, + __LINE__, NULL); + break; + case TCP_RACK_GP_INCREASE_REC: + RACK_OPTS_INC(tcp_gp_inc_rec); + rack->r_ctl.rack_per_of_gp_rec = optval; + rack_log_pacing_delay_calc(rack, + rack->r_ctl.rack_per_of_gp_ss, + rack->r_ctl.rack_per_of_gp_ca, + rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, + __LINE__, NULL); + break; + case TCP_RACK_GP_INCREASE_CA: + RACK_OPTS_INC(tcp_gp_inc_ca); + ca = optval; + if (ca < 100) { + /* + * We don't allow any reduction + * over the GP b/w. + */ + error = EINVAL; + break; + } + rack->r_ctl.rack_per_of_gp_ca = ca; + rack_log_pacing_delay_calc(rack, + rack->r_ctl.rack_per_of_gp_ss, + rack->r_ctl.rack_per_of_gp_ca, + rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, + __LINE__, NULL); + break; + case TCP_RACK_GP_INCREASE_SS: + RACK_OPTS_INC(tcp_gp_inc_ss); + ss = optval; + if (ss < 100) { + /* + * We don't allow any reduction + * over the GP b/w. + */ + error = EINVAL; + break; + } + rack->r_ctl.rack_per_of_gp_ss = ss; + rack_log_pacing_delay_calc(rack, + rack->r_ctl.rack_per_of_gp_ss, + rack->r_ctl.rack_per_of_gp_ca, + rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, + __LINE__, NULL); + break; + case TCP_RACK_RR_CONF: + RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); + if (optval && optval <= 3) + rack->r_rr_config = optval; + else + rack->r_rr_config = 0; + break; + case TCP_BBR_HDWR_PACE: + RACK_OPTS_INC(tcp_hdwr_pacing); + if (optval){ + if (rack->rack_hdrw_pacing == 0) { + rack->rack_hdw_pace_ena = 1; + rack->rack_attempt_hdwr_pace = 0; + } else + error = EALREADY; + } else { + rack->rack_hdw_pace_ena = 0; +#ifdef RATELIMIT + if (rack->rack_hdrw_pacing) { + rack->rack_hdrw_pacing = 0; + in_pcbdetach_txrtlmt(rack->rc_inp); + } +#endif + } + break; + /* End Pacing related ones */ case TCP_RACK_PRR_SENDALOT: /* Allow PRR to send more than one seg */ RACK_OPTS_INC(tcp_rack_prr_sendalot); @@ -10251,12 +14692,12 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt, else error = EINVAL; break; - case TCP_BBR_USE_RACK_CHEAT: - RACK_OPTS_INC(tcp_rack_cheat); + case TCP_BBR_USE_RACK_RR: + RACK_OPTS_INC(tcp_rack_rr); if (optval) - rack->use_rack_cheat = 1; + rack->use_rack_rr = 1; else - rack->use_rack_cheat = 0; + rack->use_rack_rr = 0; break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. rack-rtt + reord + N */ @@ -10278,26 +14719,10 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt, if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; - NET_EPOCH_ENTER(et); rack_output(tp); - NET_EPOCH_EXIT(et); } break; - case TCP_RACK_MIN_PACE: - RACK_OPTS_INC(tcp_rack_min_pace); - if (optval > 3) - rack->r_enforce_min_pace = 3; - else - rack->r_enforce_min_pace = optval; - break; - case TCP_RACK_GP_INCREASE: - if ((optval >= 0) && - (optval <= 256)) - rack->rack_per_of_gp = optval; - else - error = EINVAL; - break; case TCP_BBR_RACK_RTT_USE: if ((optval != USE_RTT_HIGH) && (optval != USE_RTT_LOW) && @@ -10312,6 +14737,10 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt, else rack->rc_allow_data_af_clo = 0; break; + case TCP_RACK_PACE_REDUCE: + /* sysctl only now */ + error = EINVAL; + break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; @@ -10328,7 +14757,7 @@ rack_get_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) { int32_t error, optval; - + uint64_t val; /* * Because all our options are either boolean or an int, we can just * pull everything into optval and then unlock and copy. If we ever @@ -10337,10 +14766,38 @@ rack_get_sockopt(struct socket *so, struct sockopt *sopt, */ error = 0; switch (sopt->sopt_name) { + case TCP_RACK_PROFILE: + /* You cannot retrieve a profile, its write only */ + error = EINVAL; + break; + case TCP_RACK_PACE_TO_FILL: + optval = rack->rc_pace_to_cwnd; + break; + case TCP_RACK_NO_PUSH_AT_MAX: + optval = rack->r_ctl.rc_no_push_at_mrtt; + break; + case TCP_SHARED_CWND_ENABLE: + optval = rack->rack_enable_scwnd; + break; + case TCP_RACK_NONRXT_CFG_RATE: + optval = rack->rack_rec_nonrxt_use_cr; + break; + case TCP_NO_PRR: + optval = rack->rack_no_prr; + break; case TCP_RACK_DO_DETECTION: optval = rack->do_detection; break; - + case TCP_RACK_MBUF_QUEUE: + /* Now do we use the LRO mbuf-queue feature */ + optval = rack->r_mbuf_queue; + break; + case TCP_TIMELY_DYN_ADJ: + optval = rack->rc_gp_dyn_mul; + break; + case TCP_BBR_IWINTSO: + optval = rack->rc_init_win; + break; case TCP_RACK_PROP_RATE: optval = rack->r_ctl.rc_prop_rate; break; @@ -10358,11 +14815,21 @@ rack_get_sockopt(struct socket *so, struct sockopt *sopt, break; case TCP_RACK_PACE_REDUCE: /* RACK Hptsi reduction factor (divisor) */ - optval = rack->rc_pace_reduce; + error = EINVAL; + break; + case TCP_BBR_RACK_INIT_RATE: + val = rack->r_ctl.init_rate; + /* convert to kbits per sec */ + val *= 8; + val /= 1000; + optval = (uint32_t)val; + break; + case TCP_RACK_FORCE_MSEG: + optval = rack->rc_force_max_seg; break; case TCP_RACK_PACE_MAX_SEG: /* Max segments in a pace */ - optval = rack->rc_pace_max_segs; + optval = rack->rc_user_set_max_segs; break; case TCP_RACK_PACE_ALWAYS: /* Use the always pace method */ @@ -10388,9 +14855,15 @@ rack_get_sockopt(struct socket *so, struct sockopt *sopt, /* Does reordering fade after ms time */ optval = rack->r_ctl.rc_reorder_fade; break; - case TCP_BBR_USE_RACK_CHEAT: + case TCP_BBR_USE_RACK_RR: /* Do we use the rack cheat for rxt */ - optval = rack->use_rack_cheat; + optval = rack->use_rack_rr; + break; + case TCP_RACK_RR_CONF: + optval = rack->r_rr_config; + break; + case TCP_BBR_HDWR_PACE: + optval = rack->rack_hdw_pace_ena; break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. srtt+(srtt/N) */ @@ -10410,11 +14883,20 @@ rack_get_sockopt(struct socket *so, struct sockopt *sopt, case TCP_RACK_IDLE_REDUCE_HIGH: error = EINVAL; break; - case TCP_RACK_MIN_PACE: - optval = rack->r_enforce_min_pace; + case TCP_RACK_PACE_RATE_CA: + optval = rack->r_ctl.rc_fixed_pacing_rate_ca; break; - case TCP_RACK_GP_INCREASE: - optval = rack->rack_per_of_gp; + case TCP_RACK_PACE_RATE_SS: + optval = rack->r_ctl.rc_fixed_pacing_rate_ss; + break; + case TCP_RACK_PACE_RATE_REC: + optval = rack->r_ctl.rc_fixed_pacing_rate_rec; + break; + case TCP_RACK_GP_INCREASE_SS: + optval = rack->r_ctl.rack_per_of_gp_ca; + break; + case TCP_RACK_GP_INCREASE_CA: + optval = rack->r_ctl.rack_per_of_gp_ss; break; case TCP_BBR_RACK_RTT_USE: optval = rack->r_ctl.rc_rate_sample_method; @@ -10425,6 +14907,9 @@ rack_get_sockopt(struct socket *so, struct sockopt *sopt, case TCP_DATA_AFTER_CLOSE: optval = rack->rc_allow_data_af_clo; break; + case TCP_SHARED_CWND_TIME_LIMIT: + optval = rack->r_limit_scw; + break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; @@ -10457,6 +14942,13 @@ out: return (error); } +static int +rack_pru_options(struct tcpcb *tp, int flags) +{ + if (flags & PRUS_OOB) + return (EOPNOTSUPP); + return (0); +} static struct tcp_function_block __tcp_rack = { .tfb_tcp_block_name = __XSTRING(STACKNAME), @@ -10472,7 +14964,8 @@ static struct tcp_function_block __tcp_rack = { .tfb_tcp_timer_active = rack_timer_active, .tfb_tcp_timer_stop = rack_timer_stop, .tfb_tcp_rexmit_tmr = rack_remxt_tmr, - .tfb_tcp_handoff_ok = rack_handoff_ok + .tfb_tcp_handoff_ok = rack_handoff_ok, + .tfb_pru_options = rack_pru_options, }; static const char *rack_stack_names[] = { diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c index a0ae279079d..db4a5e64c6d 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.c +++ b/sys/netinet/tcp_stacks/rack_bbr_common.c @@ -1,7 +1,5 @@ /*- - * Copyright (c) 2016-9 - * Netflix Inc. - * All rights reserved. + * Copyright (c) 2016-2020 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -423,6 +421,7 @@ skip_vnet: nxt_pkt = 1; else nxt_pkt = 0; + KMOD_TCPSTAT_INC(tcps_rcvtotal); retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen, iptos, nxt_pkt, &tv); if (retval) { @@ -694,6 +693,7 @@ ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcp tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ default: + tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_RST); tp = tcp_close(tp); } dropped = 1; @@ -911,3 +911,24 @@ ctf_decay_count(uint32_t count, uint32_t decay) decayed_count = count - (uint32_t)perc_count; return(decayed_count); } + +int32_t +ctf_progress_timeout_check(struct tcpcb *tp, bool log) +{ + if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { + if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { + /* + * There is an assumption that the caller + * will drop the connection so we will + * increment the counters here. + */ + if (log) + tcp_log_end_status(tp, TCP_EI_STATUS_PROGRESS); +#ifdef NETFLIX_STATS + KMOD_TCPSTAT_INC(tcps_progdrops); +#endif + return (1); + } + } + return (0); +} diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.h b/sys/netinet/tcp_stacks/rack_bbr_common.h index 8f866ed731f..d9731c881b4 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.h +++ b/sys/netinet/tcp_stacks/rack_bbr_common.h @@ -1,7 +1,7 @@ #ifndef __rack_bbr_common_h__ #define __rack_bbr_common_h__ /*- - * Copyright (c) 2017-9 Netflix, Inc. + * Copyright (c) 2016-2020 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -80,7 +80,7 @@ /* Bits per second in bytes per second */ #define FORTY_EIGHT_MBPS 6000000 /* 48 megabits in bytes */ #define THIRTY_MBPS 3750000 /* 30 megabits in bytes */ -#define TWENTY_THREE_MBPS 2896000 +#define TWENTY_THREE_MBPS 2896000 /* 23 megabits in bytes */ #define FIVETWELVE_MBPS 64000000 /* 512 megabits in bytes */ #define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */ @@ -138,5 +138,8 @@ ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_bl uint32_t ctf_decay_count(uint32_t count, uint32_t decay_percentage); +int32_t +ctf_progress_timeout_check(struct tcpcb *tp, bool log); + #endif #endif diff --git a/sys/netinet/tcp_stacks/tcp_bbr.h b/sys/netinet/tcp_stacks/tcp_bbr.h index 8c03183c425..0ba22e17e4f 100644 --- a/sys/netinet/tcp_stacks/tcp_bbr.h +++ b/sys/netinet/tcp_stacks/tcp_bbr.h @@ -1,7 +1,5 @@ /*- - * Copyright (c) 2016-9 - * Netflix Inc. All rights reserved. - * Author Randall R. Stewart + * Copyright (c) 2016-2020 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -748,7 +746,7 @@ struct tcp_bbr { /* First cache line 0x00 */ int32_t(*r_substate) (struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, struct tcpopt *, - int32_t, int32_t, uint32_t, int32_t, int32_t); /* Lock(a) */ + int32_t, int32_t, uint32_t, int32_t, int32_t, uint8_t); /* Lock(a) */ struct tcpcb *rc_tp; /* The tcpcb Lock(a) */ struct inpcb *rc_inp; /* The inpcb Lock(a) */ struct timeval rc_tv; diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h index ac194bb0e58..e8f74d65f78 100644 --- a/sys/netinet/tcp_stacks/tcp_rack.h +++ b/sys/netinet/tcp_stacks/tcp_rack.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2016-9 Netflix, Inc. + * Copyright (c) 2016-2020 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,10 +37,14 @@ #define RACK_HAS_FIN 0x0040/* segment is sent with fin */ #define RACK_TLP 0x0080/* segment sent as tail-loss-probe */ #define RACK_RWND_COLLAPSED 0x0100/* The peer collapsed the rwnd on the segment */ +#define RACK_APP_LIMITED 0x0200/* We went app limited after this send */ +#define RACK_WAS_ACKED 0x0400/* a RTO undid the ack, but it already had a rtt calc done */ #define RACK_NUM_OF_RETRANS 3 #define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */ +#define RACK_REQ_AVG 4 /* Must be less than 256 */ + struct rack_sendmap { uint32_t r_start; /* Sequence number of the segment */ uint32_t r_end; /* End seq, this is 1 beyond actually */ @@ -51,10 +55,16 @@ struct rack_sendmap { * sent */ uint16_t r_flags; /* Flags as defined above */ uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS]; + uint32_t usec_orig_send; /* time of orginal send in useconds */ + uint32_t r_nseq_appl; /* If this one is app limited, this is the nxt seq limited */ + uint32_t r_ack_arrival; /* This is the time of ack-arrival (if SACK'd) */ uint8_t r_dupack; /* Dup ack count */ uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */ uint8_t r_limit_type; /* is this entry counted against a limit? */ - uint8_t r_resv[49]; + uint8_t r_just_ret : 1, /* After sending, the next pkt was just returned, i.e. limited */ + r_one_out_nr : 1, /* Special case 1 outstanding and not in recovery */ + r_avail : 6; + uint8_t r_resv[36]; }; RB_HEAD(rack_rb_tree_head, rack_sendmap); @@ -76,7 +86,10 @@ struct rack_rtt_sample { uint32_t rs_rtt_lowest; uint32_t rs_rtt_highest; uint32_t rs_rtt_cnt; + uint32_t rs_us_rtt; + int32_t confidence; uint64_t rs_rtt_tot; + uint16_t rs_us_rtrcnt; }; #define RACK_LOG_TYPE_ACK 0x01 @@ -135,15 +148,57 @@ struct rack_opts_stats { uint64_t tcp_rack_idle_reduce_high; uint64_t rack_no_timer_in_hpts; uint64_t tcp_rack_min_pace_seg; - uint64_t tcp_rack_min_pace; - uint64_t tcp_rack_cheat; + uint64_t tcp_rack_pace_rate_ca; + uint64_t tcp_rack_rr; uint64_t tcp_rack_do_detection; + uint64_t tcp_rack_rrr_no_conf_rate; + uint64_t tcp_initial_rate; + uint64_t tcp_initial_win; + uint64_t tcp_hdwr_pacing; + uint64_t tcp_gp_inc_ss; + uint64_t tcp_gp_inc_ca; + uint64_t tcp_gp_inc_rec; + uint64_t tcp_rack_force_max_seg; + uint64_t tcp_rack_pace_rate_ss; + uint64_t tcp_rack_pace_rate_rec; + /* Temp counters for dsack */ + uint64_t tcp_sack_path_1; + uint64_t tcp_sack_path_2a; + uint64_t tcp_sack_path_2b; + uint64_t tcp_sack_path_3; + uint64_t tcp_sack_path_4; + /* non temp counters */ + uint64_t tcp_rack_scwnd; + uint64_t tcp_rack_noprr; + uint64_t tcp_rack_cfg_rate; + uint64_t tcp_timely_dyn; + uint64_t tcp_rack_mbufq; + uint64_t tcp_fillcw; + uint64_t tcp_npush; + uint64_t tcp_lscwnd; + uint64_t tcp_profile; }; +/* RTT shrink reasons */ +#define RACK_RTTS_INIT 0 +#define RACK_RTTS_NEWRTT 1 +#define RACK_RTTS_EXITPROBE 2 +#define RACK_RTTS_ENTERPROBE 3 +#define RACK_RTTS_REACHTARGET 4 +#define RACK_RTTS_SEEHBP 5 +#define RACK_RTTS_NOBACKOFF 6 +#define RACK_RTTS_SAFETY 7 + +#define RACK_USE_BEG 1 +#define RACK_USE_END 2 +#define RACK_USE_END_OR_THACK 3 + #define TLP_USE_ID 1 /* Internet draft behavior */ #define TLP_USE_TWO_ONE 2 /* Use 2.1 behavior */ #define TLP_USE_TWO_TWO 3 /* Use 2.2 behavior */ +#define RACK_MIN_BW 8000 /* 64kbps in Bps */ +#define MIN_GP_WIN 6 /* We need at least 6 MSS in a GP measurement */ #ifdef _KERNEL #define RACK_OPTS_SIZE (sizeof(struct rack_opts_stats)/sizeof(uint64_t)) extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; @@ -200,10 +255,13 @@ struct rack_control { * tlp_sending Lock(a) */ struct rack_sendmap *rc_resend; /* something we have been asked to * resend */ - struct timeval rc_last_time_decay; /* SAD time decay happened here */ uint32_t input_pkt; uint32_t saved_input_pkt; uint32_t rc_hpts_flags; + uint32_t rc_fixed_pacing_rate_ca; + uint32_t rc_fixed_pacing_rate_rec; + uint32_t rc_fixed_pacing_rate_ss; + uint32_t cwnd_to_use; /* The cwnd in use */ uint32_t rc_timer_exp; /* If a timer ticks of expiry */ uint32_t rc_rack_min_rtt; /* lowest RTT seen Lock(a) */ uint32_t rc_rack_largest_cwnd; /* Largest CWND we have seen Lock(a) */ @@ -223,15 +281,14 @@ struct rack_control { uint32_t rc_prr_sndcnt; /* Prr sndcnt Lock(a) */ uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */ - uint32_t rc_last_tlp_seq; /* Last tlp sequence Lock(a) */ + uint32_t xxx_rc_last_tlp_seq; /* Last tlp sequence Lock(a) */ uint32_t rc_prr_delivered; /* during recovery prr var Lock(a) */ - uint16_t rc_tlp_send_cnt; /* Number of TLP sends we have done - * since peer spoke to us Lock(a) */ - uint16_t rc_tlp_seg_send_cnt; /* Number of times we have TLP sent + uint16_t rc_tlp_cnt_out; /* count of times we have sent a TLP without new data */ + uint16_t xxx_rc_tlp_seg_send_cnt; /* Number of times we have TLP sent * rc_last_tlp_seq Lock(a) */ - uint32_t rc_loss_count; /* During recovery how many segments were lost + uint32_t rc_loss_count; /* How many bytes have been retransmitted * Lock(a) */ uint32_t rc_reorder_fade; /* Socket option value Lock(a) */ @@ -260,39 +317,81 @@ struct rack_control { struct rack_sendmap *rc_rsm_at_retran; /* Debug variable kept for * cache line alignment * Lock(a) */ - struct timeval rc_last_ack; + struct rack_sendmap *rc_first_appl; /* Pointer to first app limited */ + struct rack_sendmap *rc_end_appl; /* Pointer to last app limited */ /* Cache line split 0x100 */ struct sack_filter rack_sf; /* Cache line split 0x140 */ /* Flags for various things */ + uint32_t last_pacing_time; uint32_t rc_pace_max_segs; uint32_t rc_pace_min_segs; + uint32_t rc_app_limited_cnt; + uint16_t rack_per_of_gp_ss; /* 100 = 100%, so from 65536 = 655 x bw */ + uint16_t rack_per_of_gp_ca; /* 100 = 100%, so from 65536 = 655 x bw */ + uint16_t rack_per_of_gp_rec; /* 100 = 100%, so from 65536 = 655 x bw, 0=off */ + uint16_t rack_per_of_gp_probertt; /* 100 = 100%, so from 65536 = 655 x bw, 0=off */ uint32_t rc_high_rwnd; uint32_t ack_count; uint32_t sack_count; uint32_t sack_noextra_move; uint32_t sack_moved_extra; struct rack_rtt_sample rack_rs; + const struct tcp_hwrate_limit_table *crte; + uint32_t rc_agg_early; + uint32_t rc_agg_delayed; uint32_t rc_tlp_rxt_last_time; uint32_t rc_saved_cwnd; - uint32_t rc_gp_history[RACK_GP_HIST]; + uint32_t rc_gp_output_ts; + uint32_t rc_gp_cumack_ts; + struct timeval act_rcv_time; + struct timeval rc_last_time_decay; /* SAD time decay happened here */ + uint64_t gp_bw; + uint64_t init_rate; +#ifdef NETFLIX_SHARED_CWND + struct shared_cwnd *rc_scw; +#endif + uint64_t last_gp_comp_bw; + uint64_t last_max_bw; /* Our calculated max b/w last */ + struct time_filter_small rc_gp_min_rtt; + int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */ + uint32_t rc_gp_srtt; /* Current GP srtt */ + uint32_t rc_prev_gp_srtt; /* Previous RTT */ + uint32_t rc_entry_gp_rtt; /* Entry to PRTT gp-rtt */ + uint32_t rc_loss_at_start; /* At measurement window where was our lost value */ + + uint32_t forced_ack_ts; + uint32_t rc_lower_rtt_us_cts; /* Time our GP rtt was last lowered */ + uint32_t rc_time_probertt_entered; + uint32_t rc_time_probertt_starts; + uint32_t rc_lowest_us_rtt; + uint32_t rc_highest_us_rtt; + uint32_t rc_last_us_rtt; + uint32_t rc_time_of_last_probertt; + uint32_t rc_target_probertt_flight; + uint32_t rc_probertt_sndmax_atexit; /* Highest sent to in probe-rtt */ + uint32_t rc_gp_lowrtt; /* Lowest rtt seen during GPUT measurement */ + uint32_t rc_gp_high_rwnd; /* Highest rwnd seen during GPUT measurement */ + int32_t rc_scw_index; uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */ uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */ uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ uint16_t rc_pkt_delay; /* Socket option value Lock(a) */ + uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */ + uint8_t num_avg; /* average count before we go to normal decay */ uint8_t rc_prop_rate; /* Socket option value Lock(a) */ uint8_t rc_prop_reduce; /* Socket option value Lock(a) */ uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */ uint8_t rc_early_recovery; /* Socket option value Lock(a) */ uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */ uint8_t rc_min_to; /* Socket option value Lock(a) */ - uint8_t rc_tlp_rtx_out; /* This is TLPRtxOut in the draft */ uint8_t rc_rate_sample_method; - uint8_t rc_gp_hist_idx: 7, - rc_gp_hist_filled: 1; - + uint8_t rc_gp_hist_idx; }; +#define RACK_TIMELY_CNT_BOOST 5 /* At 5th increase boost */ +#define RACK_MINRTT_FILTER_TIM 10 /* Seconds */ + #ifdef _KERNEL struct tcp_rack { @@ -306,39 +405,75 @@ struct tcp_rack { uint32_t rc_free_cnt; /* Number of free entries on the rc_free list * Lock(a) */ uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */ - uint16_t r_wanted_output; /* Output routine wanted to be called */ - uint16_t r_cpu; /* CPU that the INP is running on Lock(a) */ - uint16_t rc_pace_max_segs; /* Socket option value Lock(a) */ - uint16_t rc_pace_reduce;/* Socket option value Lock(a) */ + uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */ + rtt_limit_mul : 4, /* muliply this by low rtt */ + r_limit_scw : 1, + r_avail_bits : 10; /* Available */ + uint16_t rc_user_set_max_segs; /* Socket option value Lock(a) */ + uint16_t forced_ack : 1, + rc_gp_incr : 1, + rc_gp_bwred : 1, + rc_gp_timely_inc_cnt : 3, + rc_gp_timely_dec_cnt : 3, + rc_not_backing_off: 1, + rc_highly_buffered: 1, /* The path is highly buffered */ + rc_dragged_bottom: 1, + rc_dack_mode : 1, /* Mac O/S emulation of d-ack */ + rc_dack_toggle : 1, /* For Mac O/S emulation of d-ack */ + pacing_longer_than_rtt : 1, + rc_gp_filled : 1; uint8_t r_state; /* Current rack state Lock(a) */ uint8_t rc_tmr_stopped : 7, t_timers_stopped : 1; - uint8_t rc_enobuf; /* count of enobufs on connection provides - * backoff Lock(a) */ + uint8_t rc_enobuf : 7, /* count of enobufs on connection provides */ + rc_on_min_to : 1; uint8_t r_timer_override : 1, /* hpts override Lock(a) */ - r_tlp_running : 1, /* Running from a TLP timeout Lock(a) */ r_is_v6 : 1, /* V6 pcb Lock(a) */ rc_in_persist : 1, - rc_last_pto_set : 1, /* XXX not used */ rc_tlp_in_progress : 1, rc_always_pace : 1, /* Socket option value Lock(a) */ - tlp_timer_up : 1; /* The tlp timer is up flag Lock(a) */ - uint8_t r_enforce_min_pace : 2, + rc_pace_to_cwnd : 1, + rc_pace_fill_if_rttin_range : 1, + xxx_avail_bits : 1; + uint8_t app_limited_needs_set : 1, + use_fixed_rate : 1, rc_has_collapsed : 1, r_rep_attack : 1, r_rep_reverse : 1, - r_xxx_min_pace_seg_thresh : 3; - uint8_t rack_tlp_threshold_use; + rack_hdrw_pacing : 1, /* We are doing Hardware pacing */ + rack_hdw_pace_ena : 1, /* Is hardware pacing enabled? */ + rack_attempt_hdwr_pace : 1; /* Did we attempt hdwr pacing (if allowed) */ + uint8_t rack_tlp_threshold_use : 3, /* only 1, 2 and 3 used so far */ + rack_rec_nonrxt_use_cr : 1, + rack_enable_scwnd : 1, + rack_attempted_scwnd : 1, + rack_no_prr : 1, + rack_scwnd_is_idle : 1; uint8_t rc_allow_data_af_clo: 1, delayed_ack : 1, set_pacing_done_a_iw : 1, - use_rack_cheat : 1, + use_rack_rr : 1, alloc_limit_reported : 1, sack_attack_disable : 1, do_detection : 1, - rc_avail : 1; - uint16_t rack_per_of_gp; + rc_force_max_seg : 1; + uint8_t rack_cwnd_limited : 1, + r_early : 1, + r_late : 1, + r_running_early : 1, + r_running_late : 1, + r_wanted_output: 1, + r_rr_config : 2; + uint16_t rc_init_win : 8, + rc_gp_rtt_set : 1, + rc_gp_dyn_mul : 1, + rc_gp_saw_rec : 1, + rc_gp_saw_ca : 1, + rc_gp_saw_ss : 1, + rc_gp_no_rec_chg : 1, + in_probe_rtt : 1, + measure_saw_probe_rtt : 1; /* Cache line 2 0x40 */ struct rack_control r_ctl; } __aligned(CACHE_LINE_SIZE);