cxgbe(4): Fixes to tx coalescing.

- The behavior implemented in r362905 resulted in delayed transmission
  of packets in some cases, causing performance issues.  Use a different
  heuristic to predict tx requests.

- Add a tunable/sysctl (hw.cxgbe.tx_coalesce) to disable tx coalescing
  entirely.  It can be changed at any time.  There is no change in
  default behavior.
This commit is contained in:
Navdeep Parhar 2021-02-01 03:00:09 -08:00
parent 5cf6f1c4bc
commit 3447df8bc5
4 changed files with 62 additions and 13 deletions

View file

@ -561,7 +561,7 @@ struct txpkts {
uint8_t wr_type; /* type 0 or type 1 */
uint8_t npkt; /* # of packets in this work request */
uint8_t len16; /* # of 16B pieces used by this work request */
uint8_t score; /* 1-10. coalescing attempted if score > 3 */
uint8_t score;
uint8_t max_npkt; /* maximum number of packets allowed */
uint16_t plen; /* total payload (sum of all packets) */
@ -584,6 +584,7 @@ struct sge_txq {
struct sglist *gl;
__be32 cpl_ctrl0; /* for convenience */
int tc_idx; /* traffic class */
uint64_t last_tx; /* cycle count when eth_tx was last called */
struct txpkts txp;
struct task tx_reclaim_task;
@ -599,6 +600,7 @@ struct sge_txq {
uint64_t txpkts1_wrs; /* # of type1 coalesced tx work requests */
uint64_t txpkts0_pkts; /* # of frames in type0 coalesced tx WRs */
uint64_t txpkts1_pkts; /* # of frames in type1 coalesced tx WRs */
uint64_t txpkts_flush; /* # of times txp had to be sent by tx_update */
uint64_t raw_wrs; /* # of raw work requests (alloc_wr_mbuf) */
uint64_t vxlan_tso_wrs; /* # of VXLAN TSO work requests */
uint64_t vxlan_txcsum;

View file

@ -10718,6 +10718,7 @@ clear_stats(struct adapter *sc, u_int port_id)
txq->txpkts1_wrs = 0;
txq->txpkts0_pkts = 0;
txq->txpkts1_pkts = 0;
txq->txpkts_flush = 0;
txq->raw_wrs = 0;
txq->vxlan_tso_wrs = 0;
txq->vxlan_txcsum = 0;

View file

@ -212,6 +212,22 @@ static counter_u64_t defrags;
SYSCTL_COUNTER_U64(_hw_cxgbe, OID_AUTO, defrags, CTLFLAG_RD, &defrags,
"Number of mbuf defrags performed");
/*
 * Master switch for tx coalescing, settable as a loader tunable or at run
 * time.  When it is 0, eth_tx skips coalescing for a packet unless a coalesced
 * work request is already being built (txp->npkt > 0).
 */
static int t4_tx_coalesce = 1;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce, CTLFLAG_RWTUN, &t4_tx_coalesce, 0,
"tx coalescing allowed");
/*
 * The driver will make aggressive attempts at tx coalescing if it sees this
 * many packets eligible for coalescing in quick succession, with no more than
 * the specified gap in between the eth_tx calls that delivered the packets.
 *
 * eth_tx compares this threshold against txp->score, a uint8_t that saturates
 * at UINT8_MAX, so a setting above 255 can never be reached by the score.
 */
static int t4_tx_coalesce_pkts = 32;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_pkts, CTLFLAG_RWTUN,
&t4_tx_coalesce_pkts, 0,
"# of consecutive packets (1 - 255) that will trigger tx coalescing");
/*
 * Largest interval, in microseconds, between two eth_tx calls for the later
 * one to count as "recent".  record_eth_tx_time converts it to cycles using
 * tsc_freq on i386/amd64; on other architectures its threshold is 0 and no
 * call is ever reported as recent.
 */
static int t4_tx_coalesce_gap = 5;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_gap, CTLFLAG_RWTUN,
&t4_tx_coalesce_gap, 0, "tx gap (in microseconds)");
static int service_iq(struct sge_iq *, int);
static int service_iq_fl(struct sge_iq *, int);
@ -3120,6 +3136,26 @@ set_txupdate_flags(struct sge_txq *txq, u_int avail,
}
}
#if defined(__i386__) || defined(__amd64__)
extern uint64_t tsc_freq;
#endif
/*
 * Record the cycle count of this eth_tx call in txq->last_tx and report
 * whether the previous call on this txq was recent, i.e. fewer than
 * t4_tx_coalesce_gap microseconds' worth of cycles ago.
 *
 * The gap is converted from microseconds to cycles with tsc_freq, which is
 * only declared for i386/amd64.  On other architectures the threshold is 0
 * and the function always returns false.
 */
static inline bool
record_eth_tx_time(struct sge_txq *txq)
{
	const uint64_t cycles = get_cyclecount();
	const uint64_t last_tx = txq->last_tx;
#if defined(__i386__) || defined(__amd64__)
	/*
	 * Read the sysctl once and ignore values <= 0.  A negative int would be
	 * converted to a huge uint64_t by the multiplication below and make
	 * practically every call look recent.
	 */
	const int gap = t4_tx_coalesce_gap;
	const uint64_t itg = gap > 0 ? tsc_freq * (uint64_t)gap / 1000000 : 0;
#else
	const uint64_t itg = 0;
#endif

	MPASS(cycles >= last_tx);
	txq->last_tx = cycles;

	return (cycles - last_tx < itg);
}
/*
* r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
* be consumed. Return the actual number consumed. 0 indicates a stall.
@ -3137,10 +3173,11 @@ eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing)
u_int n, avail, dbdiff; /* # of hardware descriptors */
int i, rc;
struct mbuf *m0;
bool snd;
bool snd, recent_tx;
void *wr; /* start of the last WR written to the ring */
TXQ_LOCK_ASSERT_OWNED(txq);
recent_tx = record_eth_tx_time(txq);
remaining = IDXDIFF(pidx, cidx, r->size);
if (__predict_false(discard_tx(eq))) {
@ -3159,17 +3196,15 @@ eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing)
}
/* How many hardware descriptors do we have readily available. */
if (eq->pidx == eq->cidx) {
if (eq->pidx == eq->cidx)
avail = eq->sidx - 1;
if (txp->score++ >= 5)
txp->score = 5; /* tx is completely idle, reset. */
} else
else
avail = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
total = 0;
if (remaining == 0) {
if (txp->score-- == 1) /* egr_update had to drain txpkts */
txp->score = 1;
txp->score = 0;
txq->txpkts_flush++;
goto send_txpkts;
}
@ -3183,7 +3218,17 @@ eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing)
if (avail < 2 * SGE_MAX_WR_NDESC)
avail += reclaim_tx_descs(txq, 64);
if (txp->npkt > 0 || remaining > 1 || txp->score > 3 ||
if (t4_tx_coalesce == 0 && txp->npkt == 0)
goto skip_coalescing;
if (cannot_use_txpkts(m0))
txp->score = 0;
else if (recent_tx) {
if (++txp->score == 0)
txp->score = UINT8_MAX;
} else
txp->score = 1;
if (txp->npkt > 0 || remaining > 1 ||
txp->score >= t4_tx_coalesce_pkts ||
atomic_load_int(&txq->eq.equiq) != 0) {
if (vi->flags & TX_USES_VM_WR)
rc = add_to_txpkts_vf(sc, txq, m0, avail, &snd);
@ -3198,8 +3243,6 @@ eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing)
for (i = 0; i < txp->npkt; i++)
ETHER_BPF_MTAP(ifp, txp->mb[i]);
if (txp->npkt > 1) {
if (txp->score++ >= 10)
txp->score = 10;
MPASS(avail >= tx_len16_to_desc(txp->len16));
if (vi->flags & TX_USES_VM_WR)
n = write_txpkts_vm_wr(sc, txq);
@ -3239,7 +3282,7 @@ eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing)
MPASS(rc != 0 && rc != EAGAIN);
MPASS(txp->npkt == 0);
skip_coalescing:
n = tx_len16_to_desc(mbuf_len16(m0));
if (__predict_false(avail < n)) {
avail += reclaim_tx_descs(txq, min(n, 32));
@ -4304,7 +4347,6 @@ alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
M_ZERO | M_WAITOK);
txp = &txq->txp;
txp->score = 5;
MPASS(nitems(txp->mb) >= sc->params.max_pkts_per_eth_tx_pkts_wr);
txq->txp.max_npkt = min(nitems(txp->mb),
sc->params.max_pkts_per_eth_tx_pkts_wr);
@ -4363,6 +4405,9 @@ alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts",
CTLFLAG_RD, &txq->txpkts1_pkts,
"# of frames tx'd using type1 txpkts work requests");
SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts_flush",
CTLFLAG_RD, &txq->txpkts_flush,
"# of times txpkts had to be flushed out by an egress-update");
SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD,
&txq->raw_wrs, "# of raw work requests (non-packets)");
SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_tso_wrs",

View file

@ -893,6 +893,7 @@ t4vf_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag,
txq->txpkts1_wrs = 0;
txq->txpkts0_pkts = 0;
txq->txpkts1_pkts = 0;
txq->txpkts_flush = 0;
mp_ring_reset_stats(txq->r);
}
}