ena: Add differentiation for missing TX completions reset

This commit differentiates between the causes of a reset triggered by
missing TX completions, by checking whether the driver failed to receive
the TX completions because of missing interrupts.
The cleanup_running field was added to ena_ring because
cleanup_task.ta_pending is zeroed before ena_cleanup() runs.

Also, the ena_increment_reset_counter() API was added in order to support
incrementing the reset counter only.

Approved by: cperciva (mentor)
Sponsored by: Amazon, Inc.

(cherry picked from commit a33ec635d1)
This commit is contained in:
Osama Abboud 2024-08-07 06:24:19 +00:00 committed by Osama Abboud
parent a20c06c6f1
commit db0c751ed7
3 changed files with 77 additions and 18 deletions

View file

@ -169,6 +169,9 @@ static int ena_copy_eni_metrics(struct ena_adapter *);
static int ena_copy_srd_metrics(struct ena_adapter *);
static int ena_copy_customer_metrics(struct ena_adapter *);
static void ena_timer_service(void *);
static enum ena_regs_reset_reason_types check_cdesc_in_tx_cq(struct ena_adapter *,
struct ena_ring *);
static char ena_version[] = ENA_DEVICE_NAME ENA_DRV_MODULE_NAME
" v" ENA_DRV_MODULE_VERSION;
@ -3088,6 +3091,31 @@ check_for_rx_interrupt_queue(struct ena_adapter *adapter,
return (0);
}
/*
 * Choose the reset reason to report when TX completions are missing on
 * tx_ring, based on whether its completion queue (CQ) holds descriptors.
 *
 * The CQ is probed with ena_com_tx_comp_req_id_get():
 *  - ENA_COM_TRY_AGAIN is treated as an empty CQ, and
 *    ENA_REGS_RESET_MISS_TX_CMPL is returned;
 *  - any other return code is treated as "completion descriptors are
 *    present", and ENA_REGS_RESET_MISS_INTERRUPT is returned.
 *
 * Both outcomes are logged at ERR level together with the ring's qid.
 *
 * The caller is expected to make sure the CQ is not being accessed
 * concurrently by ena_cleanup() while this function runs.
 */
static enum ena_regs_reset_reason_types
check_cdesc_in_tx_cq(struct ena_adapter *adapter,
    struct ena_ring *tx_ring)
{
	device_t pdev = adapter->pdev;
	int rc;
	/* Only needed as the out-parameter of the probe; its value is unused. */
	u16 req_id;

	rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, &req_id);
	/* TX CQ is empty */
	if (rc == ENA_COM_TRY_AGAIN) {
		ena_log(pdev, ERR,
		    "No completion descriptors found in CQ %d\n",
		    tx_ring->qid);
		return (ENA_REGS_RESET_MISS_TX_CMPL);
	}

	/* TX CQ has cdescs */
	ena_log(pdev, ERR,
	    "Completion descriptors found in CQ %d\n",
	    tx_ring->qid);

	return (ENA_REGS_RESET_MISS_INTERRUPT);
}
static int
check_missing_comp_in_tx_queue(struct ena_adapter *adapter,
struct ena_ring *tx_ring)
@ -3100,6 +3128,8 @@ check_missing_comp_in_tx_queue(struct ena_adapter *adapter,
int missing_tx_comp_to;
sbintime_t time_offset;
int i, rc = 0;
enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_MISS_TX_CMPL;
bool cleanup_scheduled, cleanup_running;
getbinuptime(&curtime);
@ -3155,7 +3185,19 @@ check_missing_comp_in_tx_queue(struct ena_adapter *adapter,
"The number of lost tx completion is above the threshold "
"(%d > %d). Reset the device\n",
missed_tx, adapter->missing_tx_threshold);
ena_trigger_reset(adapter, ENA_REGS_RESET_MISS_TX_CMPL);
/* Set the reset flag to prevent ena_cleanup() from running */
ENA_FLAG_SET_ATOMIC(ENA_FLAG_TRIGGER_RESET, adapter);
/* Need to make sure that ENA_FLAG_TRIGGER_RESET is visible to ena_cleanup() and
* that cleanup_running is visible to check_missing_comp_in_tx_queue() to
* prevent the case of accessing CQ concurrently with check_cdesc_in_tx_cq()
*/
mb();
cleanup_scheduled = !!(atomic_load_16(&tx_ring->que->cleanup_task.ta_pending));
cleanup_running = !!(atomic_load_8((&tx_ring->cleanup_running)));
if (!(cleanup_scheduled || cleanup_running))
reset_reason = check_cdesc_in_tx_cq(adapter, tx_ring);
adapter->reset_reason = reset_reason;
rc = EIO;
}
/* Add the newly discovered missing TX completions */
@ -3618,6 +3660,7 @@ ena_reset_task(void *arg, int pending)
ENA_LOCK_LOCK();
if (likely(ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) {
ena_increment_reset_counter(adapter);
ena_destroy_device(adapter, false);
ena_restore_device(adapter);

View file

@ -327,6 +327,7 @@ struct ena_ring {
};
uint8_t first_interrupt;
uint8_t cleanup_running;
uint16_t no_interrupt_event_cnt;
struct ena_com_rx_buf_info ena_bufs[ENA_PKT_MAX_BUFS];
@ -583,22 +584,28 @@ ena_mbuf_count(struct mbuf *mbuf)
return count;
}
/*
 * Account for a reset in the adapter's device statistics.
 *
 * The reason currently stored in adapter->reset_reason selects an entry in
 * resets_to_stats_offset_map. When that entry has a dedicated counter, the
 * counter found at the entry's offset inside adapter->dev_stats is bumped
 * by one. The total_resets counter is always bumped by one.
 */
static inline void
ena_increment_reset_counter(struct ena_adapter *adapter)
{
	const struct ena_reset_stats_offset *entry;
	uint64_t *stat_slots;

	entry = &resets_to_stats_offset_map[adapter->reset_reason];

	if (entry->has_counter) {
		/* View dev_stats as an array of 64-bit slots, indexed by offset. */
		stat_slots = (uint64_t *)&adapter->dev_stats;
		counter_u64_add((counter_u64_t)(stat_slots[entry->stat_offset]),
		    1);
	}

	counter_u64_add(adapter->dev_stats.total_resets, 1);
}
static inline void
ena_trigger_reset(struct ena_adapter *adapter,
enum ena_regs_reset_reason_types reset_reason)
{
if (likely(!ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) {
const struct ena_reset_stats_offset *ena_reset_stats_offset =
&resets_to_stats_offset_map[reset_reason];
if (ena_reset_stats_offset->has_counter) {
uint64_t *stat_ptr = (uint64_t *)&adapter->dev_stats +
ena_reset_stats_offset->stat_offset;
counter_u64_add((counter_u64_t)(*stat_ptr), 1);
}
counter_u64_add(adapter->dev_stats.total_resets, 1);
adapter->reset_reason = reset_reason;
ENA_FLAG_SET_ATOMIC(ENA_FLAG_TRIGGER_RESET, adapter);
}

View file

@ -77,17 +77,24 @@ ena_cleanup(void *arg, int pending)
int qid, ena_qid;
int txc, rxc, i;
if (unlikely((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0))
return;
ena_log_io(adapter->pdev, DBG, "MSI-X TX/RX routine\n");
tx_ring = que->tx_ring;
rx_ring = que->rx_ring;
qid = que->id;
ena_qid = ENA_IO_TXQ_IDX(qid);
io_cq = &adapter->ena_dev->io_cq_queues[ena_qid];
atomic_store_8(&tx_ring->cleanup_running, 1);
/* Need to make sure that ENA_FLAG_TRIGGER_RESET is visible to ena_cleanup() and
* that cleanup_running is visible to check_missing_comp_in_tx_queue() to
* prevent the case of accessing CQ concurrently with check_cdesc_in_tx_cq()
*/
mb();
if (unlikely(((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) ||
(ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))))
return;
ena_log_io(adapter->pdev, DBG, "MSI-X TX/RX routine\n");
atomic_store_8(&tx_ring->first_interrupt, 1);
atomic_store_8(&rx_ring->first_interrupt, 1);
@ -95,7 +102,8 @@ ena_cleanup(void *arg, int pending)
rxc = ena_rx_cleanup(rx_ring);
txc = ena_tx_cleanup(tx_ring);
if (unlikely((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0))
if (unlikely(((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) ||
(ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))))
return;
if ((txc != ENA_TX_BUDGET) && (rxc != ENA_RX_BUDGET))
@ -107,6 +115,7 @@ ena_cleanup(void *arg, int pending)
ENA_TX_IRQ_INTERVAL, true, false);
counter_u64_add(tx_ring->tx_stats.unmask_interrupt_num, 1);
ena_com_unmask_intr(io_cq, &intr_reg);
atomic_store_8(&tx_ring->cleanup_running, 0);
}
void