From 380c25f6402d099310f1cf6f6d1e17e65531f958 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 13 Oct 2023 13:41:11 -0400
Subject: [PATCH 01/11] FreeBSD: Improve taskq wrapper

- Group tqent_task and tqent_timeout_task into a union.  They are
never used same time. This shrinks taskq_ent_t from 192 to 160 bytes.
 - Remove tqent_registered.  Use tqent_id != 0 instead.
 - Remove tqent_cancelled.  Use taskqueue pending counter instead.
 - Change tqent_type into uint_t.  We don't need to pack it any more.
 - Change tqent_rc into uint_t, matching refcount(9).
 - Take shared locks in taskq_lookup().
 - Call proper taskqueue_drain_timeout() for TIMEOUT_TASK in
taskq_cancel_id() and taskq_wait_id().
 - Switch from CK_LIST to regular LIST.

Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15356
---
 include/os/freebsd/spl/sys/taskq.h | 18 ++++----
 module/os/freebsd/spl/spl_taskq.c  | 74 +++++++++++++++---------------
 2 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h
index 30579b39171..b23a939b3aa 100644
--- a/include/os/freebsd/spl/sys/taskq.h
+++ b/include/os/freebsd/spl/sys/taskq.h
@@ -30,9 +30,9 @@
 
 #include <sys/types.h>
 #include <sys/proc.h>
+#include <sys/queue.h>
 #include <sys/taskqueue.h>
 #include <sys/thread.h>
-#include <sys/ck.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -48,16 +48,16 @@ typedef uintptr_t taskqid_t;
 typedef void (task_func_t)(void *);
 
 typedef struct taskq_ent {
-	struct task	 tqent_task;
-	struct timeout_task tqent_timeout_task;
+	union {
+		struct task	 tqent_task;
+		struct timeout_task tqent_timeout_task;
+	};
 	task_func_t	*tqent_func;
 	void		*tqent_arg;
-	taskqid_t tqent_id;
-	CK_LIST_ENTRY(taskq_ent) tqent_hash;
-	uint8_t tqent_type;
-	uint8_t tqent_registered;
-	uint8_t tqent_cancelled;
-	volatile uint32_t tqent_rc;
+	taskqid_t	 tqent_id;
+	LIST_ENTRY(taskq_ent) tqent_hash;
+	uint_t		 tqent_type;
+	volatile uint_t	 tqent_rc;
 } taskq_ent_t;
 
 /*
diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c
index ba22c77b69c..daefe355953 100644
--- a/module/os/freebsd/spl/spl_taskq.c
+++ b/module/os/freebsd/spl/spl_taskq.c
@@ -30,8 +30,6 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
-#include <sys/ck.h>
-#include <sys/epoch.h>
 #include <sys/kernel.h>
 #include <sys/kmem.h>
 #include <sys/lock.h>
@@ -70,7 +68,7 @@ extern int uma_align_cache;
 
 static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures");
 
-static CK_LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl;
+static LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl;
 static unsigned long tqenthash;
 static unsigned long tqenthashlock;
 static struct sx *tqenthashtbl_lock;
@@ -80,8 +78,8 @@ static taskqid_t tqidnext;
 #define	TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash])
 #define	TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)])
 
+#define	NORMAL_TASK 0
 #define	TIMEOUT_TASK 1
-#define	NORMAL_TASK 2
 
 static void
 system_taskq_init(void *arg)
@@ -121,7 +119,7 @@ system_taskq_fini(void *arg)
 	for (i = 0; i < tqenthashlock + 1; i++)
 		sx_destroy(&tqenthashtbl_lock[i]);
 	for (i = 0; i < tqenthash + 1; i++)
-		VERIFY(CK_LIST_EMPTY(&tqenthashtbl[i]));
+		VERIFY(LIST_EMPTY(&tqenthashtbl[i]));
 	free(tqenthashtbl_lock, M_TASKQ);
 	free(tqenthashtbl, M_TASKQ);
 }
@@ -162,27 +160,27 @@ taskq_lookup(taskqid_t tqid)
 {
 	taskq_ent_t *ent = NULL;
 
-	sx_xlock(TQIDHASHLOCK(tqid));
-	CK_LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) {
+	if (tqid == 0)
+		return (NULL);
+	sx_slock(TQIDHASHLOCK(tqid));
+	LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) {
 		if (ent->tqent_id == tqid)
 			break;
 	}
 	if (ent != NULL)
 		refcount_acquire(&ent->tqent_rc);
-	sx_xunlock(TQIDHASHLOCK(tqid));
+	sx_sunlock(TQIDHASHLOCK(tqid));
 	return (ent);
 }
 
 static taskqid_t
 taskq_insert(taskq_ent_t *ent)
 {
-	taskqid_t tqid;
+	taskqid_t tqid = __taskq_genid();
 
-	tqid = __taskq_genid();
 	ent->tqent_id = tqid;
-	ent->tqent_registered = B_TRUE;
 	sx_xlock(TQIDHASHLOCK(tqid));
-	CK_LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash);
+	LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash);
 	sx_xunlock(TQIDHASHLOCK(tqid));
 	return (tqid);
 }
@@ -192,13 +190,14 @@ taskq_remove(taskq_ent_t *ent)
 {
 	taskqid_t tqid = ent->tqent_id;
 
-	if (!ent->tqent_registered)
+	if (tqid == 0)
 		return;
-
 	sx_xlock(TQIDHASHLOCK(tqid));
-	CK_LIST_REMOVE(ent, tqent_hash);
+	if (ent->tqent_id != 0) {
+		LIST_REMOVE(ent, tqent_hash);
+		ent->tqent_id = 0;
+	}
 	sx_xunlock(TQIDHASHLOCK(tqid));
-	ent->tqent_registered = B_FALSE;
 }
 
 static void
@@ -285,21 +284,22 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
 	int rc;
 	taskq_ent_t *ent;
 
-	if (tid == 0)
-		return (0);
-
 	if ((ent = taskq_lookup(tid)) == NULL)
 		return (0);
 
-	ent->tqent_cancelled = B_TRUE;
-	if (ent->tqent_type == TIMEOUT_TASK) {
+	if (ent->tqent_type == NORMAL_TASK) {
+		rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
+		if (rc == EBUSY)
+			taskqueue_drain(tq->tq_queue, &ent->tqent_task);
+	} else {
 		rc = taskqueue_cancel_timeout(tq->tq_queue,
 		    &ent->tqent_timeout_task, &pend);
-	} else
-		rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
-	if (rc == EBUSY) {
-		taskqueue_drain(tq->tq_queue, &ent->tqent_task);
-	} else if (pend) {
+		if (rc == EBUSY) {
+			taskqueue_drain_timeout(tq->tq_queue,
+			    &ent->tqent_timeout_task);
+		}
+	}
+	if (pend) {
 		/*
 		 * Tasks normally free themselves when run, but here the task
 		 * was cancelled so it did not free itself.
@@ -312,12 +312,13 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
 }
 
 static void
-taskq_run(void *arg, int pending __unused)
+taskq_run(void *arg, int pending)
 {
 	taskq_ent_t *task = arg;
 
-	if (!task->tqent_cancelled)
-		task->tqent_func(task->tqent_arg);
+	if (pending == 0)
+		return;
+	task->tqent_func(task->tqent_arg);
 	taskq_free(task);
 }
 
@@ -345,7 +346,6 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
 	task->tqent_func = func;
 	task->tqent_arg = arg;
 	task->tqent_type = TIMEOUT_TASK;
-	task->tqent_cancelled = B_FALSE;
 	refcount_init(&task->tqent_rc, 1);
 	tqid = taskq_insert(task);
 	TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0,
@@ -379,7 +379,6 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 	refcount_init(&task->tqent_rc, 1);
 	task->tqent_func = func;
 	task->tqent_arg = arg;
-	task->tqent_cancelled = B_FALSE;
 	task->tqent_type = NORMAL_TASK;
 	tqid = taskq_insert(task);
 	TASK_INIT(&task->tqent_task, prio, taskq_run, task);
@@ -388,10 +387,12 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 }
 
 static void
-taskq_run_ent(void *arg, int pending __unused)
+taskq_run_ent(void *arg, int pending)
 {
 	taskq_ent_t *task = arg;
 
+	if (pending == 0)
+		return;
 	task->tqent_func(task->tqent_arg);
 }
 
@@ -406,8 +407,6 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags,
 	 * can go at the front of the queue.
 	 */
 	prio = !!(flags & TQ_FRONT);
-	task->tqent_cancelled = B_FALSE;
-	task->tqent_registered = B_FALSE;
 	task->tqent_id = 0;
 	task->tqent_func = func;
 	task->tqent_arg = arg;
@@ -427,12 +426,13 @@ taskq_wait_id(taskq_t *tq, taskqid_t tid)
 {
 	taskq_ent_t *ent;
 
-	if (tid == 0)
-		return;
 	if ((ent = taskq_lookup(tid)) == NULL)
 		return;
 
-	taskqueue_drain(tq->tq_queue, &ent->tqent_task);
+	if (ent->tqent_type == NORMAL_TASK)
+		taskqueue_drain(tq->tq_queue, &ent->tqent_task);
+	else
+		taskqueue_drain_timeout(tq->tq_queue, &ent->tqent_timeout_task);
 	taskq_free(ent);
 }
 

From c0e58995e33479a9c1d97fb2a19f8f507cc954b7 Mon Sep 17 00:00:00 2001
From: John Wren Kennedy <john.kennedy@delphix.com>
Date: Fri, 13 Oct 2023 12:15:09 -0600
Subject: [PATCH 02/11] Large sync writes perform worse with slog

For synchronous write workloads with large IO sizes, a pool configured
with a slog performs worse than one with an embedded zil:

sequential_writes 1m sync ios, 16 threads
  Write IOPS:              1292          438   -66.10%
  Write Bandwidth:      1323570       448910   -66.08%
  Write Latency:       12128400     36330970      3.0x

sequential_writes 1m sync ios, 32 threads
  Write IOPS:              1293          430   -66.74%
  Write Bandwidth:      1324184       441188   -66.68%
  Write Latency:       24486278     74028536      3.0x

The reason is the `zil_slog_bulk` variable. In `zil_lwb_write_open`,
if a zil block is greater than 768K, the priority of the write is
downgraded from sync to async. Increasing the value allows greater
throughput. To select a value for this PR, I ran an fio workload with
the following values for `zil_slog_bulk`:

    zil_slog_bulk    KiB/s
    1048576         422132
    2097152         478935
    4194304         533645
    8388608         623031
    12582912        827158
    16777216       1038359
    25165824       1142210
    33554432       1211472
    50331648       1292847
    67108864       1308506
    100663296      1306821
    134217728      1304998

At 64M, the results with a slog are now improved to parity with an
embedded zil:

sequential_writes 1m sync ios, 16 threads
  Write IOPS:               438         1288      2.9x
  Write Bandwidth:       448910      1319062      2.9x
  Write Latency:       36330970     12163408   -66.52%

sequential_writes 1m sync ios, 32 threads
  Write IOPS:               430         1290      3.0x
  Write Bandwidth:       441188      1321693      3.0x
  Write Latency:       74028536     24519698   -66.88%

None of the other tests in the performance suite (run with a zil or
slog) had a significant change, including the random_write_zil tests,
which use multiple datasets.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Signed-off-by: John Wren Kennedy <john.kennedy@delphix.com>
Closes #14378
---
 man/man4/zfs.4   | 2 +-
 module/zfs/zil.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 71a3e67ee67..5f89f6adf1e 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2172,7 +2172,7 @@ if a volatile out-of-order write cache is enabled.
 Disable intent logging replay.
 Can be disabled for recovery from corrupted ZIL.
 .
-.It Sy zil_slog_bulk Ns = Ns Sy 786432 Ns B Po 768 KiB Pc Pq u64
+.It Sy zil_slog_bulk Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64
 Limit SLOG write size per commit executed with synchronous priority.
 Any writes above that will be executed with lower (asynchronous) priority
 to limit potential SLOG device abuse by single active ZIL writer.
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 18c6cbf028b..a1188613699 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -145,7 +145,7 @@ static int zil_nocacheflush = 0;
  * Any writes above that will be executed with lower (asynchronous) priority
  * to limit potential SLOG device abuse by single active ZIL writer.
  */
-static uint64_t zil_slog_bulk = 768 * 1024;
+static uint64_t zil_slog_bulk = 64 * 1024 * 1024;
 
 static kmem_cache_t *zil_lwb_cache;
 static kmem_cache_t *zil_zcw_cache;

From f0f330e1212ee3924a3eadbdb40e7d71e0fb0a69 Mon Sep 17 00:00:00 2001
From: Don Brady <don.brady@delphix.com>
Date: Fri, 20 Oct 2023 10:29:02 -0600
Subject: [PATCH 03/11] Fix ZED auto-replace for VDEVs using by-id paths

The change is simple -- restore the original code so that the VDEV
path is updated when using by-id paths.  The more challenging part
was to devise a second ZTS test, that would test auto-replace for
'by-id' and help prevent a future regression.

With that new test, we can now do an A|B test with , and without,
the fix to confirm that auto-replace for by-id paths works. The
existing auto-replace test, functional/fault/auto_replace_001_pos,
will confirm that we didn't break auto-replace for 'by-vdev' paths.

In the original functional/fault/auto_replace_001_pos test, the disk
wipe (using dd) was not effective in removing the partitioning since
the kernel was never informed of the wipe.

Added a call to wipefs(8) so that the kernel is informed and ZED will
re-partition the device.

Added a validation step that the re-partitioning occurred by
confirming  that the GPT partition UUID changes.

Sponsored-By: OpenDrives Inc.
Sponsored-By: Klara Inc.
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes #15363
---
 cmd/zed/agents/zfs_mod.c                      |  55 +++--
 include/libzutil.h                            |   2 +-
 lib/libzutil/os/linux/zutil_import_os.c       |   5 +-
 tests/runfiles/linux.run                      |   8 +-
 tests/test-runner/bin/zts-report.py.in        |   1 +
 tests/zfs-tests/include/commands.cfg          |   9 +-
 tests/zfs-tests/tests/Makefile.am             |   1 +
 .../functional/fault/auto_replace_001_pos.ksh |  41 +++-
 .../functional/fault/auto_replace_002_pos.ksh | 192 ++++++++++++++++++
 9 files changed, 279 insertions(+), 35 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh

diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c
index b2c008ad1d0..9636c99fc85 100644
--- a/cmd/zed/agents/zfs_mod.c
+++ b/cmd/zed/agents/zfs_mod.c
@@ -24,6 +24,7 @@
  * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2016, 2017, Intel Corporation.
  * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ * Copyright (c) 2023, Klara Inc.
  */
 
 /*
@@ -204,7 +205,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	uint64_t is_spare = 0;
 	const char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
 	char rawpath[PATH_MAX], fullpath[PATH_MAX];
-	char devpath[PATH_MAX];
+	char pathbuf[PATH_MAX];
 	int ret;
 	int online_flag = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE;
 	boolean_t is_sd = B_FALSE;
@@ -214,6 +215,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	char **lines = NULL;
 	int lines_cnt = 0;
 
+	/*
+	 * Get the persistent path, typically under the '/dev/disk/by-id' or
+	 * '/dev/disk/by-vdev' directories.  Note that this path can change
+	 * when a vdev is replaced with a new disk.
+	 */
 	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
 		return;
 
@@ -370,15 +376,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
 	    is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
 
-	if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) {
+	if (realpath(rawpath, pathbuf) == NULL && !is_mpath_wholedisk) {
 		zed_log_msg(LOG_INFO, "  realpath: %s failed (%s)",
 		    rawpath, strerror(errno));
 
-		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
-		    &newstate);
+		int err = zpool_vdev_online(zhp, fullpath,
+		    ZFS_ONLINE_FORCEFAULT, &newstate);
 
-		zed_log_msg(LOG_INFO, "  zpool_vdev_online: %s FORCEFAULT (%s)",
-		    fullpath, libzfs_error_description(g_zfshdl));
+		zed_log_msg(LOG_INFO, "  zpool_vdev_online: %s FORCEFAULT (%s) "
+		    "err %d, new state %d",
+		    fullpath, libzfs_error_description(g_zfshdl), err,
+		    err ? (int)newstate : 0);
 		return;
 	}
 
@@ -428,7 +436,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 		 * to trigger a ZFS fault for the device (and any hot spare
 		 * replacement).
 		 */
-		leafname = strrchr(devpath, '/') + 1;
+		leafname = strrchr(pathbuf, '/') + 1;
 
 		/*
 		 * If this is a request to label a whole disk, then attempt to
@@ -436,7 +444,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 		 */
 		if (zpool_prepare_and_label_disk(g_zfshdl, zhp, leafname,
 		    vdev, "autoreplace", &lines, &lines_cnt) != 0) {
-			zed_log_msg(LOG_INFO,
+			zed_log_msg(LOG_WARNING,
 			    "  zpool_prepare_and_label_disk: could not "
 			    "label '%s' (%s)", leafname,
 			    libzfs_error_description(g_zfshdl));
@@ -468,7 +476,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 		    sizeof (device->pd_physpath));
 		list_insert_tail(&g_device_list, device);
 
-		zed_log_msg(LOG_INFO, "  zpool_label_disk: async '%s' (%llu)",
+		zed_log_msg(LOG_NOTICE, "  zpool_label_disk: async '%s' (%llu)",
 		    leafname, (u_longlong_t)guid);
 
 		return;	/* resumes at EC_DEV_ADD.ESC_DISK for partition */
@@ -491,8 +499,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 		}
 		if (!found) {
 			/* unexpected partition slice encountered */
-			zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
-			    fullpath);
+			zed_log_msg(LOG_WARNING, "labeled disk %s was "
+			    "unexpected here", fullpath);
 			(void) zpool_vdev_online(zhp, fullpath,
 			    ZFS_ONLINE_FORCEFAULT, &newstate);
 			return;
@@ -501,8 +509,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 		zed_log_msg(LOG_INFO, "  zpool_label_disk: resume '%s' (%llu)",
 		    physpath, (u_longlong_t)guid);
 
-		(void) snprintf(devpath, sizeof (devpath), "%s%s",
-		    DEV_BYID_PATH, new_devid);
+		/*
+		 * Paths that begin with '/dev/disk/by-id/' will change and so
+		 * they must be updated before calling zpool_vdev_attach().
+		 */
+		if (strncmp(path, DEV_BYID_PATH, strlen(DEV_BYID_PATH)) == 0) {
+			(void) snprintf(pathbuf, sizeof (pathbuf), "%s%s",
+			    DEV_BYID_PATH, new_devid);
+			zed_log_msg(LOG_INFO, "  zpool_label_disk: path '%s' "
+			    "replaced by '%s'", path, pathbuf);
+			path = pathbuf;
+		}
 	}
 
 	libzfs_free_str_array(lines, lines_cnt);
@@ -545,9 +562,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	 * Wait for udev to verify the links exist, then auto-replace
 	 * the leaf disk at same physical location.
 	 */
-	if (zpool_label_disk_wait(path, 3000) != 0) {
-		zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
-		    "disk %s is missing", path);
+	if (zpool_label_disk_wait(path, DISK_LABEL_WAIT) != 0) {
+		zed_log_msg(LOG_WARNING, "zfs_mod: pool '%s', after labeling "
+		    "replacement disk, the expected disk partition link '%s' "
+		    "is missing after waiting %u ms",
+		    zpool_get_name(zhp), path, DISK_LABEL_WAIT);
 		nvlist_free(nvroot);
 		return;
 	}
@@ -562,7 +581,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 		    B_TRUE, B_FALSE);
 	}
 
-	zed_log_msg(LOG_INFO, "  zpool_vdev_replace: %s with %s (%s)",
+	zed_log_msg(LOG_WARNING, "  zpool_vdev_replace: %s with %s (%s)",
 	    fullpath, path, (ret == 0) ? "no errors" :
 	    libzfs_error_description(g_zfshdl));
 
@@ -660,7 +679,7 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
 		    dp->dd_prop, path);
 		dp->dd_found = B_TRUE;
 
-		/* pass the new devid for use by replacing code */
+		/* pass the new devid for use by auto-replacing code */
 		if (dp->dd_new_devid != NULL) {
 			(void) nvlist_add_string(nvl, "new_devid",
 			    dp->dd_new_devid);
diff --git a/include/libzutil.h b/include/libzutil.h
index 237ff976ba6..053b1ed4b52 100644
--- a/include/libzutil.h
+++ b/include/libzutil.h
@@ -34,7 +34,7 @@ extern "C" {
 #endif
 
 /*
- * Default wait time for a device name to be created.
+ * Default wait time in milliseconds for a device name to be created.
  */
 #define	DISK_LABEL_WAIT		(30 * 1000)  /* 30 seconds */
 
diff --git a/lib/libzutil/os/linux/zutil_import_os.c b/lib/libzutil/os/linux/zutil_import_os.c
index 8b64369dc29..44ed697dd49 100644
--- a/lib/libzutil/os/linux/zutil_import_os.c
+++ b/lib/libzutil/os/linux/zutil_import_os.c
@@ -582,9 +582,8 @@ zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
  * Wait up to timeout_ms for udev to set up the device node.  The device is
  * considered ready when libudev determines it has been initialized, all of
  * the device links have been verified to exist, and it has been allowed to
- * settle.  At this point the device the device can be accessed reliably.
- * Depending on the complexity of the udev rules this process could take
- * several seconds.
+ * settle.  At this point the device can be accessed reliably. Depending on
+ * the complexity of the udev rules this process could take several seconds.
  */
 int
 zpool_label_disk_wait(const char *path, int timeout_ms)
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 2252e46df3a..8bc55a1b4b4 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -122,10 +122,10 @@ tags = ['functional', 'fallocate']
 
 [tests/functional/fault:Linux]
 tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
-    'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos',
-    'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared',
-    'decrypt_fault', 'decompress_fault', 'scrub_after_resilver',
-    'zpool_status_-s']
+    'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
+    'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
+    'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
+    'scrub_after_resilver', 'zpool_status_-s']
 tags = ['functional', 'fault']
 
 [tests/functional/features/large_dnode:Linux]
diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 5d1360380de..4608e87522a 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -328,6 +328,7 @@ if os.environ.get('CI') == 'true':
         'fault/auto_online_001_pos': ['SKIP', ci_reason],
         'fault/auto_online_002_pos': ['SKIP', ci_reason],
         'fault/auto_replace_001_pos': ['SKIP', ci_reason],
+        'fault/auto_replace_002_pos': ['SKIP', ci_reason],
         'fault/auto_spare_ashift': ['SKIP', ci_reason],
         'fault/auto_spare_shared': ['SKIP', ci_reason],
         'procfs/pool_state': ['SKIP', ci_reason],
diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg
index fa545e06bbf..648f2203dfb 100644
--- a/tests/zfs-tests/include/commands.cfg
+++ b/tests/zfs-tests/include/commands.cfg
@@ -130,12 +130,14 @@ export SYSTEM_FILES_LINUX='attr
     chattr
     exportfs
     fallocate
+    flock
     free
     getfattr
     groupadd
     groupdel
     groupmod
     hostid
+    logger
     losetup
     lsattr
     lsblk
@@ -145,21 +147,20 @@ export SYSTEM_FILES_LINUX='attr
     md5sum
     mkswap
     modprobe
+    mountpoint
     mpstat
     nsenter
     parted
     perf
     setfattr
+    setpriv
     sha256sum
     udevadm
     unshare
     useradd
     userdel
     usermod
-    setpriv
-    mountpoint
-    flock
-    logger'
+    wipefs'
 
 export ZFS_FILES='zdb
     zfs
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 158401e078a..87b50f59ca7 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1431,6 +1431,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/fault/auto_online_001_pos.ksh \
 	functional/fault/auto_online_002_pos.ksh \
 	functional/fault/auto_replace_001_pos.ksh \
+	functional/fault/auto_replace_002_pos.ksh \
 	functional/fault/auto_spare_001_pos.ksh \
 	functional/fault/auto_spare_002_pos.ksh \
 	functional/fault/auto_spare_ashift.ksh \
diff --git a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh
index 081e6c18430..ae56ee9919b 100755
--- a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh
@@ -34,13 +34,14 @@
 # 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path.
 #    This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug".
 # 2. Create a pool and set autoreplace=on (auto-replace is opt-in)
-# 3. Export a pool
+# 3. Export the pool
 # 4. Wipe and offline the scsi_debug disk
-# 5. Import pool with missing disk
+# 5. Import the pool with missing disk
 # 6. Re-online the wiped scsi_debug disk
-# 7. Verify the ZED detects the new unused disk and adds it back to the pool
+# 7. Verify ZED detects the new blank disk and replaces the missing vdev
+# 8. Verify that the scsi_debug disk was re-partitioned
 #
-# Creates a raidz1 zpool using persistent disk path names
+# Creates a raidz1 zpool using persistent /dev/disk/by-vdev path names
 # (ie not /dev/sdc)
 #
 # Auto-replace is opt in, and matches by phys_path.
@@ -83,11 +84,27 @@ log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE $DISK1 $DISK2 $DISK3
 log_must zpool set autoreplace=on $TESTPOOL
 
 # Add some data to the pool
-log_must mkfile $FSIZE /$TESTPOOL/data
+log_must zfs create $TESTPOOL/fs
+log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z
 log_must zpool export $TESTPOOL
 
+# Record the partition UUID for later comparison
+part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
+    --value /dev/disk/by-id/$SD_DEVICE_ID)
+[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid}
+
+#
 # Wipe and offline the disk
+#
+# Note that it is not enough to zero the disk to expunge the partitions.
+# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe').
+#
+# Using partprobe is overkill and hdparm is not as common as wipefs. So
+# we use wipefs which lets the kernel know the partition was removed
+# from the device (i.e., calls BLKRRPART ioctl).
+#
 log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE
+log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID
 remove_disk $SD
 block_device_wait
 
@@ -106,4 +123,18 @@ log_must wait_replacing $TESTPOOL 60
 # Validate auto-replace was successful
 log_must check_state $TESTPOOL "" "ONLINE"
 
+#
+# Confirm the partition UUID changed so we know the new disk was relabeled
+#
+# Note: some older versions of udevadm don't support "--property" option so
+# we'll # skip this test when it is not supported
+#
+if [ ! -z "$part_uuid" ]; then
+	new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
+	    --value /dev/disk/by-id/$SD_DEVICE_ID)
+	log_note new disk GPT uuid ${new_uuid}
+	[[ "$part_uuid" = "$new_uuid" ]] && \
+	    log_fail "The new disk was not relabeled as expected"
+fi
+
 log_pass "Auto-replace test successful"
diff --git a/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh
new file mode 100755
index 00000000000..2259e604317
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh
@@ -0,0 +1,192 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2017 by Intel Corporation. All rights reserved.
+# Copyright (c) 2023 by Klara, Inc. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/fault/fault.cfg
+
+#
+# DESCRIPTION:
+# Testing Fault Management Agent ZED Logic - Automated Auto-Replace Test.
+# Verifys that auto-replace works with by-id paths.
+#
+# STRATEGY:
+# 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path.
+#    This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug".
+# 2. Create a pool and set autoreplace=on (auto-replace is opt-in)
+# 3. Export the pool
+# 4. Wipe and offline the scsi_debug disk
+# 5. Import the pool with missing disk
+# 6. Re-online the wiped scsi_debug disk with a new serial number
+# 7. Verify ZED detects the new blank disk and replaces the missing vdev
+# 8. Verify that the scsi_debug disk was re-partitioned
+#
+# Creates a raidz1 zpool using persistent /dev/disk/by-id path names
+#
+# Auto-replace is opt in, and matches by phys_path.
+#
+
+verify_runnable "both"
+
+if ! is_physical_device $DISKS; then
+	log_unsupported "Unsupported disks for this test."
+fi
+
+function cleanup
+{
+	zpool status $TESTPOOL
+	destroy_pool $TESTPOOL
+	sed -i '/alias scsidebug/d' $VDEVID_CONF
+	unload_scsi_debug
+}
+
+#
+# Wait until a vdev transitions to its replacement vdev
+#
+# Return 0 when vdev reaches expected state, 1 on timeout.
+#
+# Note: index +2 is to skip over root and raidz-0 vdevs
+#
+function wait_vdev_online # pool index oldguid timeout
+{
+	typeset pool=$1
+	typeset -i index=$2+2
+	typeset guid=$3
+	typeset timeout=${4:-60}
+	typeset -i i=0
+
+	while [[ $i -lt $timeout ]]; do
+		vdev_guids=( $(zpool get -H -o value guid $pool all-vdevs) )
+
+		if [ "${vdev_guids[$index]}" != "${guid}" ]; then
+			log_note "new vdev[$((index-2))]: ${vdev_guids[$index]}, replacing ${guid}"
+			return 0
+		fi
+
+		i=$((i+1))
+		sleep 1
+	done
+
+	return 1
+}
+log_assert "automated auto-replace with by-id paths"
+log_onexit cleanup
+
+load_scsi_debug $SDSIZE $SDHOSTS $SDTGTS $SDLUNS '512b'
+SD=$(get_debug_device)
+SD_DEVICE_ID=$(get_persistent_disk_name $SD)
+SD_HOST=$(get_scsi_host $SD)
+
+# Register vdev_id alias for scsi_debug device to create a persistent path
+echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF
+block_device_wait
+
+SD_DEVICE=$(udevadm info -q all -n $DEV_DSKDIR/$SD | \
+    awk -F'=' '/ID_VDEV=/ {print $2; exit}')
+[ -z $SD_DEVICE ] && log_fail "vdev rule was not registered properly"
+
+log_must zpool events -c
+log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE_ID $DISK1 $DISK2 $DISK3
+
+vdev_guid=$(zpool get guid -H -o value $TESTPOOL $SD_DEVICE_ID)
+log_note original vdev guid ${vdev_guid}
+
+# Auto-replace is opt-in so need to set property
+log_must zpool set autoreplace=on $TESTPOOL
+
+# Add some data to the pool
+log_must zfs create $TESTPOOL/fs
+log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z
+log_must zpool export $TESTPOOL
+
+# Record the partition UUID for later comparison
+part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
+    --value /dev/disk/by-id/$SD_DEVICE_ID)
+[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid}
+
+#
+# Wipe and offline the disk
+#
+# Note that it is not enough to zero the disk to expunge the partitions.
+# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe').
+#
+# Using partprobe is overkill and hdparm is not as common as wipefs. So
+# we use wipefs which lets the kernel know the partition was removed
+# from the device (i.e., calls BLKRRPART ioctl).
+#
+log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE
+log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID
+remove_disk $SD
+block_device_wait
+
+# Re-import pool with drive missing
+log_must zpool import $TESTPOOL
+log_must check_state $TESTPOOL "" "DEGRADED"
+block_device_wait
+
+#
+# Online an empty disk in the same physical location, with a different by-id
+# symlink. We use vpd_use_hostno to make sure the underlying serial number
+# changes for the new disk which in turn gives us a different by-id path.
+#
+# The original names were something like:
+# 	/dev/disk/by-id/scsi-SLinux_scsi_debug_16000-part1
+# 	/dev/disk/by-id/wwn-0x33333330000007d0-part1
+#
+# This new inserted disk, will have different links like:
+# 	/dev/disk/by-id/scsi-SLinux_scsi_debug_2000-part1
+# 	/dev/disk/by-id/wwn-0x0x3333333000003e80 -part1
+#
+echo '0' > /sys/bus/pseudo/drivers/scsi_debug/vpd_use_hostno
+
+insert_disk $SD $SD_HOST
+
+# make sure the physical path points to the same scsi-debug device
+SD_DEVICE_ID=$(get_persistent_disk_name $SD)
+echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF
+block_device_wait
+
+# Wait for the new disk to be online and replaced
+log_must wait_vdev_online $TESTPOOL 0 $vdev_guid 45
+log_must wait_replacing $TESTPOOL 45
+
+# Validate auto-replace was successful
+log_must check_state $TESTPOOL "" "ONLINE"
+
+#
+# Confirm the partition UUID changed so we know the new disk was relabeled
+#
+# Note: some older versions of udevadm don't support "--property" option so
+# we'll # skip this test when it is not supported
+#
+if [ ! -z "$part_uuid" ]; then
+	new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
+	    --value /dev/disk/by-id/$SD_DEVICE_ID)
+	log_note new disk GPT uuid ${new_uuid}
+	[[ "$part_uuid" = "$new_uuid" ]] && \
+	    log_fail "The new disk was not relabeled as expected"
+fi
+
+log_pass "automated auto-replace with by-id paths"

From ea30b5a9e0d266baa13398ed8f9435de050f4b25 Mon Sep 17 00:00:00 2001
From: Colin Percival <cperciva@tarsnap.com>
Date: Fri, 20 Oct 2023 10:30:32 -0700
Subject: [PATCH 04/11] Set spa_ccw_fail_time=0 when expanding a vdev.

When a vdev is to be expanded -- either via `zpool online -e` or via
the autoexpand option -- a SPA_ASYNC_CONFIG_UPDATE request is queued
to be handled via an asynchronous worker thread (spa_async_thread).
This normally happens almost immediately; but will be delayed up to
zfs_ccw_retry_interval seconds (default 5 minutes) if an attempt to
write the zpool configuration cache failed.

When FreeBSD boots ZFS-root VM images generated using `makefs -t zfs`,
the zpoolupgrade rc.d script runs `zpool upgrade`, which modifies the
pool configuration and triggers an attempt to write to the cache file.
This attempted write fails because the filesystem is still mounted
read-only at this point in the boot process, triggering a 5-minute
cooldown before SPA_ASYNC_CONFIG_UPDATE requests will be handled by
the asynchronous worker thread.

When expanding a vdev, reset the "when did a configuration cache
write last fail" value so that the SPA_ASYNC_CONFIG_UPDATE request
will be handled promptly.  A cleaner but more intrusive option would
be to use separate SPA_ASYNC_ flags for "configuration changed" and
"try writing the configuration cache again", but with FreeBSD 14.0
coming very soon I'd prefer to leave such refactoring for a later
date.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Colin Percival <cperciva@FreeBSD.org>
Closes #15405
---
 module/zfs/vdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 87c14559323..afb01c0ef7f 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -4215,6 +4215,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 		/* XXX - L2ARC 1.0 does not support expansion */
 		if (vd->vdev_aux)
 			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
+		spa->spa_ccw_fail_time = 0;
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
 

From b9384b94988cbcd697199e3284abd5f151978fe6 Mon Sep 17 00:00:00 2001
From: Olivier Certner <ocert.dev@free.fr>
Date: Fri, 20 Oct 2023 20:49:56 +0200
Subject: [PATCH 05/11] FreeBSD: taskq: Remove unused declaration

Variable 'uma_align_cache' has not been used since commit "FreeBSD: Use
a hash table for taskqid lookups" (3933305ea).  Moreover, it is soon
going to become private to FreeBSD's UMA in 15.0-CURRENT (main),
14.0-STABLE (stable/14) and 13.2-STABLE (stable/13).  Should accessing
this information become necessary again, one will have to use the new
accessors for recent versions.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olivier Certner <olce.freebsd@certner.fr>
Closes #15416
---
 module/os/freebsd/spl/spl_taskq.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c
index daefe355953..842b80ade1f 100644
--- a/module/os/freebsd/spl/spl_taskq.c
+++ b/module/os/freebsd/spl/spl_taskq.c
@@ -64,8 +64,6 @@ taskq_t *dynamic_taskq = NULL;
 
 proc_t *system_proc;
 
-extern int uma_align_cache;
-
 static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures");
 
 static LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl;

From 0d6cec418e10de077d5252e8bc5d1b0b490da54f Mon Sep 17 00:00:00 2001
From: dennisfriedrichsen
 <31087738+dennisfriedrichsen@users.noreply.github.com>
Date: Fri, 20 Oct 2023 13:52:13 -0500
Subject: [PATCH 06/11] Fix typo in 
 tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg

Reviewed-by: Rob N <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Dennis R. Friedrichsen <dennis.r.friedrichsen@gmail.com>
Closes #15417
---
 tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg b/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg
index e98b5e8b221..9c76a8780b4 100644
--- a/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg
+++ b/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg
@@ -29,7 +29,7 @@
 #
 
 if is_linux; then
-	# these are the set of setable ZFS properties
+	# these are the set of settable ZFS properties
 	PROP_NAMES="\
 		acltype		atime		 \
 		checksum	compression			devices \
@@ -81,7 +81,7 @@ elif is_freebsd; then
 		hidden"
 
 else
-	# these are the set of setable ZFS properties
+	# these are the set of settable ZFS properties
 	PROP_NAMES="\
 		aclinherit	aclmode		atime		 \
 		checksum	compression			devices \

From b29e98fa8dfcee3b5362a9a2cb479a2379fb16da Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 20 Oct 2023 14:54:05 -0400
Subject: [PATCH 07/11] Properly pad struct tx_cpu to cache line

We already use ____cacheline_aligned in many places, so add one more
instead of seems arbitrary char tc_pad[8].

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15402
---
 include/sys/txg_impl.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/sys/txg_impl.h b/include/sys/txg_impl.h
index 45fde2e1f35..8ab7969b25b 100644
--- a/include/sys/txg_impl.h
+++ b/include/sys/txg_impl.h
@@ -73,8 +73,7 @@ struct tx_cpu {
 	kcondvar_t	tc_cv[TXG_SIZE];
 	uint64_t	tc_count[TXG_SIZE];	/* tx hold count on each txg */
 	list_t		tc_callbacks[TXG_SIZE]; /* commit cb list */
-	char		tc_pad[8];		/* pad to fill 3 cache lines */
-};
+} ____cacheline_aligned;
 
 /*
  * The tx_state structure maintains the state information about the different

From de7b1ae30ab83c37978902dfd53c1ac783ddbc6e Mon Sep 17 00:00:00 2001
From: VaibhavB <88050553+vaibhav-delphix@users.noreply.github.com>
Date: Sat, 21 Oct 2023 00:27:39 +0530
Subject: [PATCH 08/11] run-zts test procfs/pool_state failed with
 uncorrectable I/O failure

Once we trigger the zpool scrub, all zpool/zfs command gets stuck for
180 seconds. Post 180 seconds zpool/zfs commands gets start executing
however few more seconds(10s) it take to update the status. hence
sleeping for 200 seconds so that we get the correct status.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: vaibhav.bhanawat <vaibhav.bhanawat@delphix.com>
Closes #15364
---
 tests/zfs-tests/tests/functional/procfs/pool_state.ksh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/zfs-tests/tests/functional/procfs/pool_state.ksh b/tests/zfs-tests/tests/functional/procfs/pool_state.ksh
index 7a02eb68abd..bae87637917 100755
--- a/tests/zfs-tests/tests/functional/procfs/pool_state.ksh
+++ b/tests/zfs-tests/tests/functional/procfs/pool_state.ksh
@@ -141,7 +141,11 @@ remove_disk $SDISK
 # background since the command will hang when the pool gets suspended.  The
 # command will resume and exit after we restore the missing disk later on.
 zpool scrub $TESTPOOL2 &
-sleep 3		# Give the scrub some time to run before we check if it fails
+# Once we trigger the zpool scrub, all zpool/zfs command gets stuck for 180 seconds.
+# Post 180 seconds zpool/zfs commands gets start executing however few more seconds(10s)
+# it take to update the status.
+# hence sleeping for 200 seconds so that we get the correct status.
+sleep 200		# Give the scrub some time to run before we check if it fails
 
 log_must check_all $TESTPOOL2 "SUSPENDED"
 

From 4fbc52495552b7e8b3337c94d0b7080db65657b8 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 20 Oct 2023 15:37:16 -0400
Subject: [PATCH 09/11] Remove lock from dsl_pool_need_dirty_delay()

Torn reads/writes of dp_dirty_total are unlikely: on 64-bit systems
due to register size, while on 32-bit due to memory constraints.
And even if we hit some race, the code implementing the delay takes
the lock any way.

Removal of the poll-wide lock acquisition saves ~1% of CPU time on
8-thread 8KB write workload.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15390
---
 module/zfs/dsl_pool.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 9120fef93c7..17b97124828 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -965,18 +965,18 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 	uint64_t delay_min_bytes =
 	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
 
-	mutex_enter(&dp->dp_lock);
-	uint64_t dirty = dp->dp_dirty_total;
-	mutex_exit(&dp->dp_lock);
-
-	return (dirty > delay_min_bytes);
+	/*
+	 * We are not taking the dp_lock here and few other places, since torn
+	 * reads are unlikely: on 64-bit systems due to register size and on
+	 * 32-bit due to memory constraints.  Pool-wide locks in hot path may
+	 * be too expensive, while we do not need a precise result here.
+	 */
+	return (dp->dp_dirty_total > delay_min_bytes);
 }
 
 static boolean_t
 dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg)
 {
-	ASSERT(MUTEX_HELD(&dp->dp_lock));
-
 	uint64_t dirty_min_bytes =
 	    zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
 	uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK];

From 57b409856265a53f93dd2497e5c046506cf0fc70 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 20 Oct 2023 15:38:37 -0400
Subject: [PATCH 10/11] Trust ARC_BUF_SHARED() more

In my understanding ARC_BUF_SHARED() and arc_buf_is_shared() should
return identical results, except the second also asserts it deeper.
The first is much cheaper though, saving few pointer dereferences.
Replace production arc_buf_is_shared() calls with ARC_BUF_SHARED(),
and call arc_buf_is_shared() in random assertions, while making it
even more strict.

On my tests this in half reduces arc_buf_destroy_impl() time, that
noticeably reduces hash_lock congestion under heavy dbuf eviction.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15397
---
 module/zfs/arc.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index b5946e7604c..5d4a52fa069 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -1364,7 +1364,7 @@ arc_buf_is_shared(arc_buf_t *buf)
 	    abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
 	    buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
 	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
-	IMPLY(shared, ARC_BUF_SHARED(buf));
+	EQUIV(shared, ARC_BUF_SHARED(buf));
 	IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
 
 	/*
@@ -1998,7 +1998,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
 	IMPLY(encrypted, HDR_ENCRYPTED(hdr));
 	IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
 	IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
-	IMPLY(encrypted, !ARC_BUF_SHARED(buf));
+	IMPLY(encrypted, !arc_buf_is_shared(buf));
 
 	/*
 	 * If the caller wanted encrypted data we just need to copy it from
@@ -2066,7 +2066,9 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
 	}
 
 	if (hdr_compressed == compressed) {
-		if (!arc_buf_is_shared(buf)) {
+		if (ARC_BUF_SHARED(buf)) {
+			ASSERT(arc_buf_is_shared(buf));
+		} else {
 			abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
 			    arc_buf_size(buf));
 		}
@@ -2078,7 +2080,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
 		 * If the buf is sharing its data with the hdr, unlink it and
 		 * allocate a new data buffer for the buf.
 		 */
-		if (arc_buf_is_shared(buf)) {
+		if (ARC_BUF_SHARED(buf)) {
 			ASSERT(ARC_BUF_COMPRESSED(buf));
 
 			/* We need to give the buf its own b_data */
@@ -2090,6 +2092,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
 			/* Previously overhead was 0; just add new overhead */
 			ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
 		} else if (ARC_BUF_COMPRESSED(buf)) {
+			ASSERT(!arc_buf_is_shared(buf));
+
 			/* We need to reallocate the buf's b_data */
 			arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
 			    buf);
@@ -2217,7 +2221,7 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
 
 	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 	    buf = buf->b_next) {
-		if (arc_buf_is_shared(buf))
+		if (ARC_BUF_SHARED(buf))
 			continue;
 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
 		    arc_buf_size(buf), buf);
@@ -2256,7 +2260,7 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
 
 	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 	    buf = buf->b_next) {
-		if (arc_buf_is_shared(buf))
+		if (ARC_BUF_SHARED(buf))
 			continue;
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    arc_buf_size(buf), buf);
@@ -2481,7 +2485,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
 				 * add to the refcount if the arc_buf_t is
 				 * not shared.
 				 */
-				if (arc_buf_is_shared(buf))
+				if (ARC_BUF_SHARED(buf))
 					continue;
 
 				(void) zfs_refcount_add_many(
@@ -2537,7 +2541,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
 				 * add to the refcount if the arc_buf_t is
 				 * not shared.
 				 */
-				if (arc_buf_is_shared(buf))
+				if (ARC_BUF_SHARED(buf))
 					continue;
 
 				(void) zfs_refcount_remove_many(
@@ -3061,9 +3065,10 @@ arc_buf_destroy_impl(arc_buf_t *buf)
 		arc_cksum_verify(buf);
 		arc_buf_unwatch(buf);
 
-		if (arc_buf_is_shared(buf)) {
+		if (ARC_BUF_SHARED(buf)) {
 			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 		} else {
+			ASSERT(!arc_buf_is_shared(buf));
 			uint64_t size = arc_buf_size(buf);
 			arc_free_data_buf(hdr, buf->b_data, size, buf);
 			ARCSTAT_INCR(arcstat_overhead_size, -size);
@@ -3104,9 +3109,9 @@ arc_buf_destroy_impl(arc_buf_t *buf)
 		 */
 		if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
 			/* Only one buf can be shared at once */
-			VERIFY(!arc_buf_is_shared(lastbuf));
+			ASSERT(!arc_buf_is_shared(lastbuf));
 			/* hdr is uncompressed so can't have compressed buf */
-			VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
+			ASSERT(!ARC_BUF_COMPRESSED(lastbuf));
 
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			arc_hdr_free_abd(hdr, B_FALSE);
@@ -6189,7 +6194,7 @@ arc_release(arc_buf_t *buf, const void *tag)
 		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
 		VERIFY3S(remove_reference(hdr, tag), >, 0);
 
-		if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
+		if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
 			ASSERT(ARC_BUF_LAST(buf));
 		}
@@ -6206,9 +6211,9 @@ arc_release(arc_buf_t *buf, const void *tag)
 		 * If the current arc_buf_t and the hdr are sharing their data
 		 * buffer, then we must stop sharing that block.
 		 */
-		if (arc_buf_is_shared(buf)) {
+		if (ARC_BUF_SHARED(buf)) {
 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
-			VERIFY(!arc_buf_is_shared(lastbuf));
+			ASSERT(!arc_buf_is_shared(lastbuf));
 
 			/*
 			 * First, sever the block sharing relationship between
@@ -6241,7 +6246,7 @@ arc_release(arc_buf_t *buf, const void *tag)
 			 */
 			ASSERT(arc_buf_is_shared(lastbuf) ||
 			    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
-			ASSERT(!ARC_BUF_SHARED(buf));
+			ASSERT(!arc_buf_is_shared(buf));
 		}
 
 		ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
@@ -6335,9 +6340,10 @@ arc_write_ready(zio_t *zio)
 		arc_cksum_free(hdr);
 		arc_buf_unwatch(buf);
 		if (hdr->b_l1hdr.b_pabd != NULL) {
-			if (arc_buf_is_shared(buf)) {
+			if (ARC_BUF_SHARED(buf)) {
 				arc_unshare_buf(hdr, buf);
 			} else {
+				ASSERT(!arc_buf_is_shared(buf));
 				arc_hdr_free_abd(hdr, B_FALSE);
 			}
 		}
@@ -6636,9 +6642,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
 		 * The hdr will remain with a NULL data pointer and the
 		 * buf will take sole ownership of the block.
 		 */
-		if (arc_buf_is_shared(buf)) {
+		if (ARC_BUF_SHARED(buf)) {
 			arc_unshare_buf(hdr, buf);
 		} else {
+			ASSERT(!arc_buf_is_shared(buf));
 			arc_hdr_free_abd(hdr, B_FALSE);
 		}
 		VERIFY3P(buf->b_data, !=, NULL);

From 797f55ef12d752d2a7fb04fae5d24e019adf2a1d Mon Sep 17 00:00:00 2001
From: Sam Atkinson <atkinson.samuel.j@gmail.com>
Date: Fri, 20 Oct 2023 17:22:04 -0400
Subject: [PATCH 11/11] Do not persist user/group/project quota zap objects
 when unneeded

In the zfs_id_over*quota functions, there is a short-circuit to skip
the zap_lookup when the quota zap does not exist. If quotas are never
used in a zpool, then the quota zap will never exist. But if
user/group/project quotas are ever used, the zap objects will be
created and will persist even if the quotas are deleted.

The quota zap_lookup in the write path can become a bottleneck for
write-heavy small I/O workloads. Before this commit, it was not
possible to remove this lookup without creating a new zpool.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Sam Atkinson <samatk@amazon.com>
Closes #14721
---
 module/zfs/zfs_quota.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/module/zfs/zfs_quota.c b/module/zfs/zfs_quota.c
index 9b351eefc04..56f9d22ed0e 100644
--- a/module/zfs/zfs_quota.c
+++ b/module/zfs/zfs_quota.c
@@ -347,18 +347,32 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 	if (*objp == 0) {
 		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
 		    DMU_OT_NONE, 0, tx);
-		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+		VERIFY0(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
 	}
-	mutex_exit(&zfsvfs->z_lock);
 
 	if (quota == 0) {
 		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
 		if (err == ENOENT)
 			err = 0;
+		/*
+		 * If the quota contains no more entries after the entry
+		 * was removed, destroy the quota zap and remove the
+		 * reference from zfsvfs. This will save us unnecessary
+		 * zap_lookups for the quota during writes.
+		 */
+		uint64_t zap_nentries;
+		VERIFY0(zap_count(zfsvfs->z_os, *objp, &zap_nentries));
+		if (zap_nentries == 0) {
+			VERIFY0(zap_remove(zfsvfs->z_os, MASTER_NODE_OBJ,
+			    zfs_userquota_prop_prefixes[type], tx));
+			VERIFY0(zap_destroy(zfsvfs->z_os, *objp, tx));
+			*objp = 0;
+		}
 	} else {
 		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
 	}
+	mutex_exit(&zfsvfs->z_lock);
 	ASSERT(err == 0);
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);