Since a fatal trap can happen at aribtrary times, don't panic when the

completions are not in a consistent state. Cope with the different places the normal I/O completion polling thread can be interrupted and then re-entered during a kernel panic + dump. Reviewed by: jhb and markj (both prior versions) Differential Revision: https://reviews.freebsd.org/D20478
2026-06-09 00:32:25 -04:00 · 2019-06-01 15:37:44 +00:00 · 2019-06-01 15:37:44 +00:00 · d0aaeffdb4
commit d0aaeffdb4
parent eafaa1bc35
1 changed files with 66 additions and 13 deletions
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@ -31,6 +31,8 @@ __FBSDID("$FreeBSD$");

 #include <sys/param.h>
 #include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/proc.h>

 #include <dev/pci/pcivar.h>

@ -308,7 +310,7 @@ get_status_string(uint16_t sct, uint16_t sc)
 }

 static void
-nvme_qpair_print_completion(struct nvme_qpair *qpair, 
+nvme_qpair_print_completion(struct nvme_qpair *qpair,
    struct nvme_completion *cpl)
 {
 	uint16_t sct, sc;
@ -479,18 +481,51 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
 	struct nvme_tracker	*tr;
 	struct nvme_completion	cpl;
 	int done = 0;
+	bool in_panic = dumping || SCHEDULER_STOPPED();

 	qpair->num_intr_handler_calls++;

+	/*
+	 * qpair is not enabled, likely because a controller reset is is in
+	 * progress.  Ignore the interrupt - any I/O that was associated with
+	 * this interrupt will get retried when the reset is complete.
+	 */
 	if (!qpair->is_enabled)
-		/*
-		 * qpair is not enabled, likely because a controller reset is
-		 *  is in progress.  Ignore the interrupt - any I/O that was
-		 *  associated with this interrupt will get retried when the
-		 *  reset is complete.
-		 */
 		return (false);

+	/*
+	 * A panic can stop the CPU this routine is running on at any point.  If
+	 * we're called during a panic, complete the sq_head wrap protocol for
+	 * the case where we are interrupted just after the increment at 1
+	 * below, but before we can reset cq_head to zero at 2. Also cope with
+	 * the case where we do the zero at 2, but may or may not have done the
+	 * phase adjustment at step 3. The panic machinery flushes all pending
+	 * memory writes, so we can make these strong ordering assumptions
+	 * that would otherwise be unwise if we were racing in real time.
+	 */
+	if (__predict_false(in_panic)) {
+		if (qpair->cq_head == qpair->num_entries) {
+			/*
+			 * Here we know that we need to zero cq_head and then negate
+			 * the phase, which hasn't been assigned if cq_head isn't
+			 * zero due to the atomic_store_rel.
+			 */
+			qpair->cq_head = 0;
+			qpair->phase = !qpair->phase;
+		} else if (qpair->cq_head == 0) {
+			/*
+			 * In this case, we know that the assignment at 2
+			 * happened below, but we don't know if it 3 happened or
+			 * not. To do this, we look at the last completion
+			 * entry and set the phase to the opposite phase
+			 * that it has. This gets us back in sync
+			 */
+			cpl = qpair->cpl[qpair->num_entries - 1];
+			nvme_completion_swapbytes(&cpl);
+			qpair->phase = !NVME_STATUS_GET_P(cpl.status);
+		}
+	}
+
 	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 	while (1) {
@ -508,17 +543,35 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
 			nvme_qpair_complete_tracker(qpair, tr, &cpl, ERROR_PRINT_ALL);
 			qpair->sq_head = cpl.sqhd;
 			done++;
-		} else {
-			nvme_printf(qpair->ctrlr, 
+		} else if (!in_panic) {
+			/*
+			 * A missing tracker is normally an error.  However, a
+			 * panic can stop the CPU this routine is running on
+			 * after completing an I/O but before updating
+			 * qpair->cq_head at 1 below.  Later, we re-enter this
+			 * routine to poll I/O associated with the kernel
+			 * dump. We find that the tr has been set to null before
+			 * calling the completion routine.  If it hasn't
+			 * completed (or it triggers a panic), then '1' below
+			 * won't have updated cq_head. Rather than panic again,
+			 * ignore this condition because it's not unexpected.
+			 */
+			nvme_printf(qpair->ctrlr,
 			    "cpl does not map to outstanding cmd\n");
 			/* nvme_dump_completion expects device endianess */
 			nvme_dump_completion(&qpair->cpl[qpair->cq_head]);
-			KASSERT(0, ("received completion for unknown cmd\n"));
+			KASSERT(0, ("received completion for unknown cmd"));
 		}

-		if (++qpair->cq_head == qpair->num_entries) {
-			qpair->cq_head = 0;
-			qpair->phase = !qpair->phase;
+		/*
+		 * There's a number of races with the following (see above) when
+		 * the system panics. We compensate for each one of them by
+		 * using the atomic store to force strong ordering (at least when
+		 * viewed in the aftermath of a panic).
+		 */
+		if (++qpair->cq_head == qpair->num_entries) {		/* 1 */
+			atomic_store_rel_int(&qpair->cq_head, 0);	/* 2 */
+			qpair->phase = !qpair->phase;			/* 3 */
 		}

 		nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].cq_hdbl,