From d54358ff59c640595ce318705c69b5c927a3073b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= <nabijaczleweli@nabijaczleweli.xyz>
Date: Sat, 26 Aug 2023 01:13:43 +0200
Subject: [PATCH 01/13] Make zoned/jailed zfsprops(7) make more sense.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Distribute zfs-[un]jail.8 on FreeBSD and zfs-[un]zone.8 on Linux
- zfsprops.7: mirror zoned/jailed, only available on respective platforms

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
Closes #15161
---
 contrib/debian/openzfs-zfsutils.install |  2 --
 man/Makefile.am                         | 16 ++++++++++++----
 man/man7/zfsprops.7                     | 16 +++++++++-------
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install
index 0f58508f006..301d8f67b3a 100644
--- a/contrib/debian/openzfs-zfsutils.install
+++ b/contrib/debian/openzfs-zfsutils.install
@@ -60,7 +60,6 @@ usr/share/man/man8/zfs-get.8
 usr/share/man/man8/zfs-groupspace.8
 usr/share/man/man8/zfs-hold.8
 usr/share/man/man8/zfs-inherit.8
-usr/share/man/man8/zfs-jail.8
 usr/share/man/man8/zfs-list.8
 usr/share/man/man8/zfs-load-key.8
 usr/share/man/man8/zfs-mount-generator.8
@@ -80,7 +79,6 @@ usr/share/man/man8/zfs-set.8
 usr/share/man/man8/zfs-share.8
 usr/share/man/man8/zfs-snapshot.8
 usr/share/man/man8/zfs-unallow.8
-usr/share/man/man8/zfs-unjail.8
 usr/share/man/man8/zfs-unload-key.8
 usr/share/man/man8/zfs-unmount.8
 usr/share/man/man8/zfs-unzone.8
diff --git a/man/Makefile.am b/man/Makefile.am
index 2973520324f..36c1aede106 100644
--- a/man/Makefile.am
+++ b/man/Makefile.am
@@ -38,7 +38,6 @@ dist_man_MANS = \
 	%D%/man8/zfs-groupspace.8 \
 	%D%/man8/zfs-hold.8 \
 	%D%/man8/zfs-inherit.8 \
-	%D%/man8/zfs-jail.8 \
 	%D%/man8/zfs-list.8 \
 	%D%/man8/zfs-load-key.8 \
 	%D%/man8/zfs-mount.8 \
@@ -57,14 +56,11 @@ dist_man_MANS = \
 	%D%/man8/zfs-share.8 \
 	%D%/man8/zfs-snapshot.8 \
 	%D%/man8/zfs-unallow.8 \
-	%D%/man8/zfs-unjail.8 \
 	%D%/man8/zfs-unload-key.8 \
 	%D%/man8/zfs-unmount.8 \
-	%D%/man8/zfs-unzone.8 \
 	%D%/man8/zfs-upgrade.8 \
 	%D%/man8/zfs-userspace.8 \
 	%D%/man8/zfs-wait.8 \
-	%D%/man8/zfs-zone.8 \
 	%D%/man8/zfs_ids_to_path.8 \
 	%D%/man8/zgenhostid.8 \
 	%D%/man8/zinject.8 \
@@ -104,6 +100,18 @@ dist_man_MANS = \
 	%D%/man8/zstreamdump.8 \
 	%D%/man8/zpool_influxdb.8
 
+if BUILD_FREEBSD
+dist_man_MANS += \
+	%D%/man8/zfs-jail.8 \
+	%D%/man8/zfs-unjail.8
+endif
+
+if BUILD_LINUX
+dist_man_MANS += \
+	%D%/man8/zfs-unzone.8 \
+	%D%/man8/zfs-zone.8
+endif
+
 nodist_man_MANS = \
 	%D%/man8/zed.8 \
 	%D%/man8/zfs-mount-generator.8
diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7
index 8f6b919cfc0..51ddd85eb79 100644
--- a/man/man7/zfsprops.7
+++ b/man/man7/zfsprops.7
@@ -38,7 +38,7 @@
 .\" Copyright (c) 2019, Kjeld Schouten-Lebbing
 .\" Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
 .\"
-.Dd April 18, 2023
+.Dd August 8, 2023
 .Dt ZFSPROPS 7
 .Os
 .
@@ -1916,13 +1916,15 @@ See
 for more information.
 Jails are a
 .Fx
-feature and are not relevant on other platforms.
-The default value is
-.Sy off .
-.It Sy zoned Ns = Ns Sy on Ns | Ns Sy off
+feature and this property is not available on other platforms.
+.It Sy zoned Ns = Ns Sy off Ns | Ns Sy on
 Controls whether the dataset is managed from a non-global zone or namespace.
-The default value is
-.Sy off .
+See
+.Xr zfs-zone 8
+for more information.
+Zoning is a
+Linux
+feature and this property is not available on other platforms.
 .El
 .Pp
 The following three properties cannot be changed after the file system is

From f0e34c88798c21836f3db9da34b6b6d1c6f807c8 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Sat, 26 Aug 2023 11:22:28 -0700
Subject: [PATCH 02/13] zed: update zed.d/statechange-slot_off.sh

The statechange-slot_off.sh zedlet which was added in #15200
needed to be installed so it's included by the packages.

Additional testing has also shown that multiple retries are
often needed for the script to operate reliably.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #15210
---
 cmd/zed/zed.d/Makefile.am             |  2 ++
 cmd/zed/zed.d/statechange-slot_off.sh | 18 ++++++++++--------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am
index c65b43fb027..812558cf6d0 100644
--- a/cmd/zed/zed.d/Makefile.am
+++ b/cmd/zed/zed.d/Makefile.am
@@ -16,6 +16,7 @@ dist_zedexec_SCRIPTS = \
 	%D%/scrub_finish-notify.sh \
 	%D%/statechange-led.sh \
 	%D%/statechange-notify.sh \
+	%D%/statechange-slot_off.sh \
 	%D%/trim_finish-notify.sh \
 	%D%/vdev_attach-led.sh \
 	%D%/vdev_clear-led.sh
@@ -35,6 +36,7 @@ zedconfdefaults = \
 	scrub_finish-notify.sh \
 	statechange-led.sh \
 	statechange-notify.sh \
+	statechange-slot_off.sh \
 	vdev_attach-led.sh \
 	vdev_clear-led.sh
 
diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-slot_off.sh
index d6f3c94a419..9d218ddaa64 100755
--- a/cmd/zed/zed.d/statechange-slot_off.sh
+++ b/cmd/zed/zed.d/statechange-slot_off.sh
@@ -43,15 +43,17 @@ if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then
 	exit 4
 fi
 
-echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status"
-
-# Wait for sysfs for report that the slot is off.  It can take ~400ms on some
-# enclosures.
+# Turn off the slot and wait for sysfs to report that the slot is off.
+# It can take ~400ms on some enclosures and multiple retries may be needed.
 for i in $(seq 1 20) ; do
-	if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then
-		break
-	fi
-	sleep 0.1
+	echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status"
+
+	for j in $(seq 1 5) ; do
+		if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then
+			break 2
+		fi
+		sleep 0.1
+	done
 done
 
 if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" != "off" ] ; then

From 277f2e587b085d1eb8aa48b4ac0768a9ef5745ab Mon Sep 17 00:00:00 2001
From: Rich Ercolani <214141+rincebrain@users.noreply.github.com>
Date: Sat, 26 Aug 2023 14:25:46 -0400
Subject: [PATCH 03/13] Avoid save/restoring AMX registers to avoid a SPR
 erratum

Intel SPR erratum SPR4 says that if you trip into a vmexit while
doing FPU save/restore, your AMX register state might misbehave...
and by misbehave, I mean save all zeroes incorrectly, leading to
explosions if you restore it.

Since we're not using AMX for anything, the simple way to avoid
this is to just not save/restore those when we do anything, since
we're killing preemption of any sort across our save/restores.

If we ever decide to use AMX, it's not clear that we have any
way to mitigate this, on Linux...but I am not an expert.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rich Ercolani <rincebrain@gmail.com>
Closes #14989
Closes #15168
---
 include/os/linux/kernel/linux/simd_x86.h | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/include/os/linux/kernel/linux/simd_x86.h b/include/os/linux/kernel/linux/simd_x86.h
index 1d77f0487a3..699b8a57182 100644
--- a/include/os/linux/kernel/linux/simd_x86.h
+++ b/include/os/linux/kernel/linux/simd_x86.h
@@ -147,6 +147,15 @@
 #error "Toolchain needs to support the XSAVE assembler instruction"
 #endif
 
+#ifndef XFEATURE_MASK_XTILE
+/*
+ * For kernels where this doesn't exist yet, we still don't want to break
+ * by save/restoring this broken nonsense.
+ * See issue #14989 or Intel erratum SPR4 for why.
+ */
+#define	XFEATURE_MASK_XTILE	0x60000
+#endif
+
 #include <linux/mm.h>
 #include <linux/slab.h>
 
@@ -315,18 +324,18 @@ kfpu_begin(void)
 	uint8_t *state = zfs_kfpu_fpregs[smp_processor_id()];
 #if defined(HAVE_XSAVES)
 	if (static_cpu_has(X86_FEATURE_XSAVES)) {
-		kfpu_do_xsave("xsaves", state, ~0);
+		kfpu_do_xsave("xsaves", state, ~XFEATURE_MASK_XTILE);
 		return;
 	}
 #endif
 #if defined(HAVE_XSAVEOPT)
 	if (static_cpu_has(X86_FEATURE_XSAVEOPT)) {
-		kfpu_do_xsave("xsaveopt", state, ~0);
+		kfpu_do_xsave("xsaveopt", state, ~XFEATURE_MASK_XTILE);
 		return;
 	}
 #endif
 	if (static_cpu_has(X86_FEATURE_XSAVE)) {
-		kfpu_do_xsave("xsave", state, ~0);
+		kfpu_do_xsave("xsave", state, ~XFEATURE_MASK_XTILE);
 	} else if (static_cpu_has(X86_FEATURE_FXSR)) {
 		kfpu_save_fxsr(state);
 	} else {
@@ -376,12 +385,12 @@ kfpu_end(void)
 	uint8_t  *state = zfs_kfpu_fpregs[smp_processor_id()];
 #if defined(HAVE_XSAVES)
 	if (static_cpu_has(X86_FEATURE_XSAVES)) {
-		kfpu_do_xrstor("xrstors", state, ~0);
+		kfpu_do_xrstor("xrstors", state, ~XFEATURE_MASK_XTILE);
 		goto out;
 	}
 #endif
 	if (static_cpu_has(X86_FEATURE_XSAVE)) {
-		kfpu_do_xrstor("xrstor", state, ~0);
+		kfpu_do_xrstor("xrstor", state, ~XFEATURE_MASK_XTILE);
 	} else if (static_cpu_has(X86_FEATURE_FXSR)) {
 		kfpu_restore_fxsr(state);
 	} else {

From 11326f8eb1bb0a4c5d8e7ae3b577c8cd56c9009c Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <pcd@delphix.com>
Date: Sat, 26 Aug 2023 11:30:19 -0700
Subject: [PATCH 04/13] Try to clarify wording to reduce zpool add incidents

Try to clarify wording to reduce zpool add incidents.
Add an attach example.

Reviewed-by: Rich Ercolani <Rincebrain@gmail.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #15179
---
 man/man8/zpool.8 | 58 +++++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/man/man8/zpool.8 b/man/man8/zpool.8
index e8eadffa6fc..4e45890f1e2 100644
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@@ -110,9 +110,10 @@ Removes ZFS label information from the specified
 .It Xo
 .Xr zpool-attach 8 Ns / Ns Xr zpool-detach 8
 .Xc
-Increases or decreases redundancy by
-.Cm attach Ns ing or
-.Cm detach Ns ing a device on an existing vdev (virtual device).
+Converts a non-redundant disk into a mirror, or increases the redundancy level of an existing mirror
+.Ns (
+.Cm attach Ns ), or performs the inverse operation (
+.Cm detach Ns ).
 .It Xo
 .Xr zpool-add 8 Ns / Ns Xr zpool-remove 8
 .Xc
@@ -233,16 +234,16 @@ Invalid command line options were specified.
 .El
 .
 .Sh EXAMPLES
-.\" Examples 1, 2, 3, 4, 11, 12 are shared with zpool-create.8.
-.\" Examples 5, 13 are shared with zpool-add.8.
-.\" Examples 6, 15 are shared with zpool-list.8.
-.\" Examples 7 are shared with zpool-destroy.8.
-.\" Examples 8 are shared with zpool-export.8.
-.\" Examples 9 are shared with zpool-import.8.
-.\" Examples 10 are shared with zpool-upgrade.8.
-.\" Examples 14 are shared with zpool-remove.8.
-.\" Examples 16 are shared with zpool-status.8.
-.\" Examples 13, 16 are also shared with zpool-iostat.8.
+.\" Examples 1, 2, 3, 4, 12, 13 are shared with zpool-create.8.
+.\" Examples 6, 14 are shared with zpool-add.8.
+.\" Examples 7, 16 are shared with zpool-list.8.
+.\" Examples 8 are shared with zpool-destroy.8.
+.\" Examples 9 are shared with zpool-export.8.
+.\" Examples 10 are shared with zpool-import.8.
+.\" Examples 11 are shared with zpool-upgrade.8.
+.\" Examples 15 are shared with zpool-remove.8.
+.\" Examples 17 are shared with zpool-status.8.
+.\" Examples 14, 17 are also shared with zpool-iostat.8.
 .\" Make sure to update them omnidirectionally
 .Ss Example 1 : No Creating a RAID-Z Storage Pool
 The following command creates a pool with a single raidz root vdev that
@@ -264,14 +265,21 @@ While not recommended, a pool based on files can be useful for experimental
 purposes.
 .Dl # Nm zpool Cm create Ar tank Pa /path/to/file/a /path/to/file/b
 .
-.Ss Example 5 : No Adding a Mirror to a ZFS Storage Pool
+.Ss Example 5 : No Making a non-mirrored ZFS Storage Pool mirrored
+The following command converts an existing single device
+.Ar sda
+into a mirror by attaching a second device to it,
+.Ar sdb .
+.Dl # Nm zpool Cm attach Ar tank Pa sda sdb
+.
+.Ss Example 6 : No Adding a Mirror to a ZFS Storage Pool
 The following command adds two mirrored disks to the pool
 .Ar tank ,
 assuming the pool is already made up of two-way mirrors.
 The additional space is immediately available to any datasets within the pool.
 .Dl # Nm zpool Cm add Ar tank Sy mirror Pa sda sdb
 .
-.Ss Example 6 : No Listing Available ZFS Storage Pools
+.Ss Example 7 : No Listing Available ZFS Storage Pools
 The following command lists all available pools on the system.
 In this case, the pool
 .Ar zion
@@ -285,19 +293,19 @@ tank   61.5G  20.0G  41.5G         -    48%    32%  1.00x  ONLINE  -
 zion       -      -      -         -      -      -      -  FAULTED -
 .Ed
 .
-.Ss Example 7 : No Destroying a ZFS Storage Pool
+.Ss Example 8 : No Destroying a ZFS Storage Pool
 The following command destroys the pool
 .Ar tank
 and any datasets contained within:
 .Dl # Nm zpool Cm destroy Fl f Ar tank
 .
-.Ss Example 8 : No Exporting a ZFS Storage Pool
+.Ss Example 9 : No Exporting a ZFS Storage Pool
 The following command exports the devices in pool
 .Ar tank
 so that they can be relocated or later imported:
 .Dl # Nm zpool Cm export Ar tank
 .
-.Ss Example 9 : No Importing a ZFS Storage Pool
+.Ss Example 10 : No Importing a ZFS Storage Pool
 The following command displays available pools, and then imports the pool
 .Ar tank
 for use on the system.
@@ -318,7 +326,7 @@ config:
 .No # Nm zpool Cm import Ar tank
 .Ed
 .
-.Ss Example 10 : No Upgrading All ZFS Storage Pools to the Current Version
+.Ss Example 11 : No Upgrading All ZFS Storage Pools to the Current Version
 The following command upgrades all ZFS Storage pools to the current version of
 the software:
 .Bd -literal -compact -offset Ds
@@ -326,7 +334,7 @@ the software:
 This system is currently running ZFS version 2.
 .Ed
 .
-.Ss Example 11 : No Managing Hot Spares
+.Ss Example 12 : No Managing Hot Spares
 The following command creates a new pool with an available hot spare:
 .Dl # Nm zpool Cm create Ar tank Sy mirror Pa sda sdb Sy spare Pa sdc
 .Pp
@@ -341,12 +349,12 @@ The hot spare can be permanently removed from the pool using the following
 command:
 .Dl # Nm zpool Cm remove Ar tank Pa sdc
 .
-.Ss Example 12 : No Creating a ZFS Pool with Mirrored Separate Intent Logs
+.Ss Example 13 : No Creating a ZFS Pool with Mirrored Separate Intent Logs
 The following command creates a ZFS storage pool consisting of two, two-way
 mirrors and mirrored log devices:
 .Dl # Nm zpool Cm create Ar pool Sy mirror Pa sda sdb Sy mirror Pa sdc sdd Sy log mirror Pa sde sdf
 .
-.Ss Example 13 : No Adding Cache Devices to a ZFS Pool
+.Ss Example 14 : No Adding Cache Devices to a ZFS Pool
 The following command adds two disks for use as cache devices to a ZFS storage
 pool:
 .Dl # Nm zpool Cm add Ar pool Sy cache Pa sdc sdd
@@ -359,7 +367,7 @@ Capacity and reads can be monitored using the
 subcommand as follows:
 .Dl # Nm zpool Cm iostat Fl v Ar pool 5
 .
-.Ss Example 14 : No Removing a Mirrored top-level (Log or Data) Device
+.Ss Example 15 : No Removing a Mirrored top-level (Log or Data) Device
 The following commands remove the mirrored log device
 .Sy mirror-2
 and mirrored top-level data device
@@ -394,7 +402,7 @@ The command to remove the mirrored data
 .Ar mirror-1 No is :
 .Dl # Nm zpool Cm remove Ar tank mirror-1
 .
-.Ss Example 15 : No Displaying expanded space on a device
+.Ss Example 16 : No Displaying expanded space on a device
 The following command displays the detailed information for the pool
 .Ar data .
 This pool is comprised of a single raidz vdev where one of its devices
@@ -411,7 +419,7 @@ data        23.9G  14.6G  9.30G         -    48%    61%  1.00x  ONLINE  -
     sdc         -      -      -         -      -
 .Ed
 .
-.Ss Example 16 : No Adding output columns
+.Ss Example 17 : No Adding output columns
 Additional columns can be added to the
 .Nm zpool Cm status No and Nm zpool Cm iostat No output with Fl c .
 .Bd -literal -compact -offset Ds

From bee9cfb813f45f8458a8fd7584e526124d2e2d03 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <pcd@delphix.com>
Date: Sat, 26 Aug 2023 11:34:43 -0700
Subject: [PATCH 05/13] Increase limit of redaction list by using spill block

Currently redaction bookmarks and their associated redaction lists
have a relatively low limit of 36 redaction snapshots. This is imposed
by the number of snapshot GUIDs that fit in the bonus buffer of the
redaction list object. While this is more than enough for most use
cases, there are some limited cases where larger numbers would be
useful to support.

We tweak the redaction list creation code to use a spill block if
the number of redaction snapshots is above the amount that would fit
in the bonus buffer. We also make a small change to allow spill blocks
to be used for types of data besides SA. In order to fully leverage
this logic, we also change the redaction code to use vmem_alloc, to
handle extremely large allocations if needed. Finally, small tweaks
were made to the zfs commands and the test suite.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #15018
---
 cmd/zdb/zdb.c                                 | 15 ++++-
 cmd/zfs/zfs_main.c                            |  4 ++
 include/sys/dsl_bookmark.h                    |  1 +
 include/zfeature_common.h                     |  1 +
 lib/libzfs/libzfs.abi                         |  9 +--
 man/man7/zpool-features.7                     | 12 ++++
 module/zcommon/zfeature_common.c              | 12 ++++
 module/zfs/dmu_redact.c                       | 17 +++--
 module/zfs/dnode.c                            |  1 +
 module/zfs/dsl_bookmark.c                     | 67 ++++++++++++++-----
 module/zfs/dsl_destroy.c                      | 10 +++
 .../cli_root/zpool_get/zpool_get.cfg          |  1 +
 .../redacted_send/redacted_many_clones.ksh    | 12 ++--
 13 files changed, 128 insertions(+), 34 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 4b9921d47b8..87499cdc95c 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -5293,8 +5293,18 @@ dump_one_objset(const char *dsname, void *arg)
 	    avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL;
 	    dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) {
 		mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj);
-		if (dbn->dbn_phys.zbm_redaction_obj != 0)
-			global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++;
+		if (dbn->dbn_phys.zbm_redaction_obj != 0) {
+			global_feature_count[
+			    SPA_FEATURE_REDACTION_BOOKMARKS]++;
+			objset_t *mos = os->os_spa->spa_meta_objset;
+			dnode_t *rl;
+			VERIFY0(dnode_hold(mos,
+			    dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl));
+			if (rl->dn_have_spill) {
+				global_feature_count[
+				    SPA_FEATURE_REDACTION_LIST_SPILL]++;
+			}
+		}
 		if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)
 			global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;
 	}
@@ -8135,6 +8145,7 @@ dump_zpool(spa_t *spa)
 		for (spa_feature_t f = 0; f < SPA_FEATURES; f++)
 			global_feature_count[f] = UINT64_MAX;
 		global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
+		global_feature_count[SPA_FEATURE_REDACTION_LIST_SPILL] = 0;
 		global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
 		global_feature_count[SPA_FEATURE_LIVELIST] = 0;
 
diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index 5ed25d1ea72..ea73bb018a9 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -3978,6 +3978,10 @@ zfs_do_redact(int argc, char **argv)
 		(void) fprintf(stderr, gettext("potentially invalid redaction "
 		    "snapshot; full dataset names required\n"));
 		break;
+	case ESRCH:
+		(void) fprintf(stderr, gettext("attempted to resume redaction "
+		    " with a mismatched redaction list\n"));
+		break;
 	default:
 		(void) fprintf(stderr, gettext("internal error: %s\n"),
 		    strerror(errno));
diff --git a/include/sys/dsl_bookmark.h b/include/sys/dsl_bookmark.h
index 353c5c2d260..d4e559a0903 100644
--- a/include/sys/dsl_bookmark.h
+++ b/include/sys/dsl_bookmark.h
@@ -72,6 +72,7 @@ typedef struct redaction_list_phys {
 typedef struct redaction_list {
 	dmu_buf_user_t		rl_dbu;
 	redaction_list_phys_t	*rl_phys;
+	dmu_buf_t		*rl_bonus;
 	dmu_buf_t		*rl_dbuf;
 	uint64_t		rl_object;
 	zfs_refcount_t		rl_longholds;
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 7066c699e20..1025c44738b 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -80,6 +80,7 @@ typedef enum spa_feature {
 	SPA_FEATURE_BLAKE3,
 	SPA_FEATURE_BLOCK_CLONING,
 	SPA_FEATURE_AVZ_V2,
+	SPA_FEATURE_REDACTION_LIST_SPILL,
 	SPA_FEATURES
 } spa_feature_t;
 
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 6e53bcb41a8..0a8e9bcbd74 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -596,7 +596,7 @@
     <elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='spa_feature_table' size='2184' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='spa_feature_table' size='2240' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -5809,7 +5809,8 @@
       <enumerator name='SPA_FEATURE_BLAKE3' value='36'/>
       <enumerator name='SPA_FEATURE_BLOCK_CLONING' value='37'/>
       <enumerator name='SPA_FEATURE_AVZ_V2' value='38'/>
-      <enumerator name='SPA_FEATURES' value='39'/>
+      <enumerator name='SPA_FEATURE_REDACTION_LIST_SPILL' value='39'/>
+      <enumerator name='SPA_FEATURES' value='40'/>
     </enum-decl>
     <typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
     <qualified-type-def type-id='22cce67b' const='yes' id='d2816df0'/>
@@ -8706,8 +8707,8 @@
     </function-decl>
   </abi-instr>
   <abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
-    <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='17472' id='dd432c71'>
-      <subrange length='39' type-id='7359adad' id='ae4a9561'/>
+    <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='17920' id='dd432c71'>
+      <subrange length='40' type-id='7359adad' id='ae4a9561'/>
     </array-type-def>
     <enum-decl name='zfeature_flags' id='6db816a4'>
       <underlying-type type-id='9cac1fee'/>
diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7
index b901ce6c293..3c7b0b345d9 100644
--- a/man/man7/zpool-features.7
+++ b/man/man7/zpool-features.7
@@ -947,6 +947,18 @@ once all filesystems that have ever had their
 property set to
 .Sy zstd
 are destroyed.
+.
+.feature com.delphix redaction_list_spill no redaction_bookmarks
+This feature enables the redaction list created by zfs redact to store
+many more entries.
+It becomes
+.Sy active
+when a redaction list is created with more than 36 entries,
+and returns to being
+.Sy enabled
+when no long redaction lists remain in the pool.
+For more information about redacted sends, see
+.Xr zfs-send 8 .
 .El
 .
 .Sh SEE ALSO
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c
index 4c9b7ed72a0..2c74d10f43f 100644
--- a/module/zcommon/zfeature_common.c
+++ b/module/zcommon/zfeature_common.c
@@ -737,6 +737,18 @@ zpool_feature_init(void)
 	    ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL,
 	    sfeatures);
 
+	{
+		static const spa_feature_t redact_list_spill_deps[] = {
+			SPA_FEATURE_REDACTION_BOOKMARKS,
+			SPA_FEATURE_NONE
+		};
+		zfeature_register(SPA_FEATURE_REDACTION_LIST_SPILL,
+		    "com.delphix:redaction_list_spill", "redaction_list_spill",
+		    "Support for increased number of redaction_snapshot "
+		    "arguments in zfs redact.", 0, ZFEATURE_TYPE_BOOLEAN,
+		    redact_list_spill_deps, sfeatures);
+	}
+
 	zfs_mod_list_supported_free(sfeatures);
 }
 
diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c
index 6bd35713ff1..5ac14edfca1 100644
--- a/module/zfs/dmu_redact.c
+++ b/module/zfs/dmu_redact.c
@@ -746,7 +746,7 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads,
 		bqueue_enqueue(q, record, sizeof (*record));
 		return (0);
 	}
-	redact_nodes = kmem_zalloc(num_threads *
+	redact_nodes = vmem_zalloc(num_threads *
 	    sizeof (*redact_nodes), KM_SLEEP);
 
 	avl_create(&start_tree, redact_node_compare_start,
@@ -820,7 +820,7 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads,
 
 	avl_destroy(&start_tree);
 	avl_destroy(&end_tree);
-	kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes));
+	vmem_free(redact_nodes, num_threads * sizeof (*redact_nodes));
 	if (current_record != NULL)
 		bqueue_enqueue(q, current_record, sizeof (*current_record));
 	return (err);
@@ -1030,7 +1030,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
 
 	numsnaps = fnvlist_num_pairs(redactnvl);
 	if (numsnaps > 0)
-		args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP);
+		args = vmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP);
 
 	nvpair_t *pair = NULL;
 	for (int i = 0; i < numsnaps; i++) {
@@ -1079,7 +1079,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
 		kmem_free(newredactbook,
 		    sizeof (char) * ZFS_MAX_DATASET_NAME_LEN);
 		if (args != NULL)
-			kmem_free(args, numsnaps * sizeof (*args));
+			vmem_free(args, numsnaps * sizeof (*args));
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 	err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark);
@@ -1119,7 +1119,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
 	} else {
 		uint64_t *guids = NULL;
 		if (numsnaps > 0) {
-			guids = kmem_zalloc(numsnaps * sizeof (uint64_t),
+			guids = vmem_zalloc(numsnaps * sizeof (uint64_t),
 			    KM_SLEEP);
 		}
 		for (int i = 0; i < numsnaps; i++) {
@@ -1131,10 +1131,9 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
 		dp = NULL;
 		err = dsl_bookmark_create_redacted(newredactbook, snapname,
 		    numsnaps, guids, FTAG, &new_rl);
-		kmem_free(guids, numsnaps * sizeof (uint64_t));
-		if (err != 0) {
+		vmem_free(guids, numsnaps * sizeof (uint64_t));
+		if (err != 0)
 			goto out;
-		}
 	}
 
 	for (int i = 0; i < numsnaps; i++) {
@@ -1188,7 +1187,7 @@ out:
 	}
 
 	if (args != NULL)
-		kmem_free(args, numsnaps * sizeof (*args));
+		vmem_free(args, numsnaps * sizeof (*args));
 	if (dp != NULL)
 		dsl_pool_rele(dp, FTAG);
 	if (ds != NULL) {
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 7cf03264dce..79fd02dcb9a 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -720,6 +720,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 	ASSERT(DMU_OT_IS_VALID(ot));
 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
+	    (bonustype == DMU_OTN_UINT64_METADATA && bonuslen == 0) ||
 	    (bonustype != DMU_OT_NONE && bonuslen != 0));
 	ASSERT(DMU_OT_IS_VALID(bonustype));
 	ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c
index e04796a0814..03d9420dbdb 100644
--- a/module/zfs/dsl_bookmark.c
+++ b/module/zfs/dsl_bookmark.c
@@ -34,6 +34,7 @@
 #include <sys/dsl_bookmark.h>
 #include <zfs_namecheck.h>
 #include <sys/dmu_send.h>
+#include <sys/dbuf.h>
 
 static int
 dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname,
@@ -459,25 +460,42 @@ dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot,
 	    SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps);
 	if (redaction_list != NULL || bookmark_redacted) {
 		redaction_list_t *local_rl;
+		boolean_t spill = B_FALSE;
 		if (bookmark_redacted) {
 			redact_snaps = dsredactsnaps;
 			num_redact_snaps = dsnumsnaps;
 		}
+		int bonuslen = sizeof (redaction_list_phys_t) +
+		    num_redact_snaps * sizeof (uint64_t);
+		if (bonuslen > dmu_bonus_max())
+			spill = B_TRUE;
 		dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos,
 		    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
-		    DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) +
-		    num_redact_snaps * sizeof (uint64_t), tx);
+		    DMU_OTN_UINT64_METADATA, spill ? 0 : bonuslen, tx);
 		spa_feature_incr(dp->dp_spa,
 		    SPA_FEATURE_REDACTION_BOOKMARKS, tx);
+		if (spill) {
+			spa_feature_incr(dp->dp_spa,
+			    SPA_FEATURE_REDACTION_LIST_SPILL, tx);
+		}
 
 		VERIFY0(dsl_redaction_list_hold_obj(dp,
 		    dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl));
 		dsl_redaction_list_long_hold(dp, local_rl, tag);
 
-		ASSERT3U((local_rl)->rl_dbuf->db_size, >=,
-		    sizeof (redaction_list_phys_t) + num_redact_snaps *
-		    sizeof (uint64_t));
-		dmu_buf_will_dirty(local_rl->rl_dbuf, tx);
+		if (!spill) {
+			ASSERT3U(local_rl->rl_bonus->db_size, >=, bonuslen);
+			dmu_buf_will_dirty(local_rl->rl_bonus, tx);
+		} else {
+			dmu_buf_t *db;
+			VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus,
+			    DB_RF_MUST_SUCCEED, FTAG, &db));
+			dmu_buf_will_fill(db, tx);
+			VERIFY0(dbuf_spill_set_blksz(db, P2ROUNDUP(bonuslen,
+			    SPA_MINBLOCKSIZE), tx));
+			local_rl->rl_phys = db->db_data;
+			local_rl->rl_dbuf = db;
+		}
 		memcpy(local_rl->rl_phys->rlp_snaps, redact_snaps,
 		    sizeof (uint64_t) * num_redact_snaps);
 		local_rl->rl_phys->rlp_num_snaps = num_redact_snaps;
@@ -636,11 +654,15 @@ dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx)
 	    SPA_FEATURE_REDACTION_BOOKMARKS))
 		return (SET_ERROR(ENOTSUP));
 	/*
-	 * If the list of redact snaps will not fit in the bonus buffer with
-	 * the furthest reached object and offset, fail.
+	 * If the list of redact snaps will not fit in the bonus buffer (or
+	 * spill block, with the REDACTION_LIST_SPILL feature) with the
+	 * furthest reached object and offset, fail.
 	 */
-	if (dbcra->dbcra_numsnaps > (dmu_bonus_max() -
-	    sizeof (redaction_list_phys_t)) / sizeof (uint64_t))
+	uint64_t snaplimit = ((spa_feature_is_enabled(dp->dp_spa,
+	    SPA_FEATURE_REDACTION_LIST_SPILL) ? spa_maxblocksize(dp->dp_spa) :
+	    dmu_bonus_max()) -
+	    sizeof (redaction_list_phys_t)) / sizeof (uint64_t);
+	if (dbcra->dbcra_numsnaps > snaplimit)
 		return (SET_ERROR(E2BIG));
 
 	if (dsl_bookmark_create_nvl_validate_pair(
@@ -1040,6 +1062,14 @@ dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name,
 	}
 
 	if (dbn->dbn_phys.zbm_redaction_obj != 0) {
+		dnode_t *rl;
+		VERIFY0(dnode_hold(mos,
+		    dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl));
+		if (rl->dn_have_spill) {
+			spa_feature_decr(dmu_objset_spa(mos),
+			    SPA_FEATURE_REDACTION_LIST_SPILL, tx);
+		}
+		dnode_rele(rl, FTAG);
 		VERIFY0(dmu_object_free(mos,
 		    dbn->dbn_phys.zbm_redaction_obj, tx));
 		spa_feature_decr(dmu_objset_spa(mos),
@@ -1213,7 +1243,9 @@ redaction_list_evict_sync(void *rlu)
 void
 dsl_redaction_list_rele(redaction_list_t *rl, const void *tag)
 {
-	dmu_buf_rele(rl->rl_dbuf, tag);
+	if (rl->rl_bonus != rl->rl_dbuf)
+		dmu_buf_rele(rl->rl_dbuf, tag);
+	dmu_buf_rele(rl->rl_bonus, tag);
 }
 
 int
@@ -1221,7 +1253,7 @@ dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, const void *tag,
     redaction_list_t **rlp)
 {
 	objset_t *mos = dp->dp_meta_objset;
-	dmu_buf_t *dbuf;
+	dmu_buf_t *dbuf, *spill_dbuf;
 	redaction_list_t *rl;
 	int err;
 
@@ -1236,13 +1268,18 @@ dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, const void *tag,
 		redaction_list_t *winner = NULL;
 
 		rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP);
-		rl->rl_dbuf = dbuf;
+		rl->rl_bonus = dbuf;
+		if (dmu_spill_hold_existing(dbuf, tag, &spill_dbuf) == 0) {
+			rl->rl_dbuf = spill_dbuf;
+		} else {
+			rl->rl_dbuf = dbuf;
+		}
 		rl->rl_object = rlobj;
-		rl->rl_phys = dbuf->db_data;
+		rl->rl_phys = rl->rl_dbuf->db_data;
 		rl->rl_mos = dp->dp_meta_objset;
 		zfs_refcount_create(&rl->rl_longholds);
 		dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL,
-		    &rl->rl_dbuf);
+		    &rl->rl_bonus);
 		if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) {
 			kmem_free(rl, sizeof (*rl));
 			rl = winner;
diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c
index 053f26878cf..d9d88a981e0 100644
--- a/module/zfs/dsl_destroy.c
+++ b/module/zfs/dsl_destroy.c
@@ -1125,6 +1125,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
 		while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) !=
 		    NULL) {
 			if (dbn->dbn_phys.zbm_redaction_obj != 0) {
+				dnode_t *rl;
+				VERIFY0(dnode_hold(mos,
+				    dbn->dbn_phys.zbm_redaction_obj, FTAG,
+				    &rl));
+				if (rl->dn_have_spill) {
+					spa_feature_decr(dmu_objset_spa(mos),
+					    SPA_FEATURE_REDACTION_LIST_SPILL,
+					    tx);
+				}
+				dnode_rele(rl, FTAG);
 				VERIFY0(dmu_object_free(mos,
 				    dbn->dbn_phys.zbm_redaction_obj, tx));
 				spa_feature_decr(dmu_objset_spa(mos),
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index 160a0ca2e6d..4248578cde1 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -86,6 +86,7 @@ typeset -a properties=(
     "feature@log_spacemap"
     "feature@device_rebuild"
     "feature@draid"
+    "feature@redaction_list_spill"
 )
 
 if is_linux || is_freebsd; then
diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh
index 3386643b295..f2150be3bc9 100755
--- a/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh
+++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh
@@ -27,7 +27,6 @@
 #    second (the last block in the file) is common to them all.
 # 2. Verify a redacted stream with a reasonable redaction list length can
 #    be correctly processed.
-# 3. Verify that if the list is too long, the send fails gracefully.
 #
 
 typeset ds_name="many_clones"
@@ -56,13 +55,18 @@ for i in {1..64}; do
 	log_must zfs snapshot ${clone}$i@snap
 done
 
-# The limit isn't necessarily 32 snapshots. The maximum number of snapshots in
+# The limit isn't necessarily 64 snapshots. The maximum number of snapshots in
 # the redacted list is determined in dsl_bookmark_create_redacted_check().
-log_must zfs redact $sendfs@snap book1 $clone{1..32}@snap
+log_must zfs redact $sendfs@snap book1 $clone{1..64}@snap
 log_must eval "zfs send --redact book1 $sendfs@snap >$stream"
 log_must eval "zfs recv $recvfs <$stream"
 compare_files $sendfs $recvfs "f2" "$RANGE8"
 
-log_mustnot zfs redact $sendfs@snap book2 $clone{1..64}@snap
+rls_value="$(zpool get -H -o value feature@redaction_list_spill $POOL)"
+if [ "$rls_value" = "active" ]; then
+	log_note "redaction_list_spill feature active"
+else
+	log_fail "redaction_list_spill feature not active"
+fi
 
 log_pass "Redacted send can deal with a large redaction list."

From cad00d51805ae272cbc2ae514449a72225054ffd Mon Sep 17 00:00:00 2001
From: Serapheim Dimitropoulos <serapheim@delphix.com>
Date: Tue, 29 Aug 2023 09:12:40 -0700
Subject: [PATCH 06/13] checkstyle: fix action failures

Reviewed-by: Don Brady <dev.fs.zfs@gmail.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #15220
---
 cmd/zed/zed.d/statechange-slot_off.sh | 1 +
 man/man8/zpool.8                      | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-slot_off.sh
index 9d218ddaa64..150012abe71 100755
--- a/cmd/zed/zed.d/statechange-slot_off.sh
+++ b/cmd/zed/zed.d/statechange-slot_off.sh
@@ -1,4 +1,5 @@
 #!/bin/sh
+# shellcheck disable=SC3014,SC2154,SC2086,SC2034
 #
 # Turn off disk's enclosure slot if it becomes FAULTED.
 #
diff --git a/man/man8/zpool.8 b/man/man8/zpool.8
index 4e45890f1e2..4c4020bdd81 100644
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@@ -110,9 +110,9 @@ Removes ZFS label information from the specified
 .It Xo
 .Xr zpool-attach 8 Ns / Ns Xr zpool-detach 8
 .Xc
-Converts a non-redundant disk into a mirror, or increases the redundancy level of an existing mirror
-.Ns (
-.Cm attach Ns ), or performs the inverse operation (
+Converts a non-redundant disk into a mirror, or increases
+the redundancy level of an existing mirror
+.Cm ( attach Ns ), or performs the inverse operation (
 .Cm detach Ns ).
 .It Xo
 .Xr zpool-add 8 Ns / Ns Xr zpool-remove 8
@@ -265,7 +265,7 @@ While not recommended, a pool based on files can be useful for experimental
 purposes.
 .Dl # Nm zpool Cm create Ar tank Pa /path/to/file/a /path/to/file/b
 .
-.Ss Example 5 : No Making a non-mirrored ZFS Storage Pool mirrored.
+.Ss Example 5 : No Making a non-mirrored ZFS Storage Pool mirrored
 The following command converts an existing single device
 .Ar sda
 into a mirror by attaching a second device to it,

From 010c003e5f9e345878243528a4ae857ebb4d5e8b Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dimitry@andric.com>
Date: Fri, 1 Sep 2023 03:17:12 +0200
Subject: [PATCH 07/13] dmu_buf_will_clone: change assertion to fix 32-bit
 compiler warning

Building module/zfs/dbuf.c for 32-bit targets can result in a warning:

In file included from
/usr/src/sys/contrib/openzfs/include/sys/zfs_context.h:97,
                 from /usr/src/sys/contrib/openzfs/module/zfs/dbuf.c:32:
/usr/src/sys/contrib/openzfs/module/zfs/dbuf.c: In function
'dmu_buf_will_clone':
/usr/src/sys/contrib/openzfs/lib/libspl/include/assert.h:116:33: error:
cast from pointer to integer of different size
[-Werror=pointer-to-int-cast]
  116 |         const uint64_t __left = (uint64_t)(LEFT);
  \
      |                                 ^
/usr/src/sys/contrib/openzfs/lib/libspl/include/assert.h:148:25: note:
in expansion of macro 'VERIFY0'
  148 | #define ASSERT0         VERIFY0
      |                         ^~~~~~~
/usr/src/sys/contrib/openzfs/module/zfs/dbuf.c:2704:9: note: in
expansion of macro 'ASSERT0'
 2704 |         ASSERT0(dbuf_find_dirty_eq(db, tx->tx_txg));
      |         ^~~~~~~

This is because dbuf_find_dirty_eq() returns a pointer, which if
pointers are 32-bit results in a warning about the cast to uint64_t.

Instead, use the ASSERT3P() macro, with == and NULL as second and third
arguments, which should work regardless of the target's bitness.

Reviewed-by: Kay Pedersen <mail@mkwg.de>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by: Dimitry Andric <dimitry@andric.com>
Closes #15224
---
 module/zfs/dbuf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index b7453578a76..f2831a0e8ab 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -2701,7 +2701,7 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
 	 */
 	mutex_enter(&db->db_mtx);
 	VERIFY(!dbuf_undirty(db, tx));
-	ASSERT0(dbuf_find_dirty_eq(db, tx->tx_txg));
+	ASSERT3P(dbuf_find_dirty_eq(db, tx->tx_txg), ==, NULL);
 	if (db->db_buf != NULL) {
 		arc_buf_destroy(db->db_buf, db);
 		db->db_buf = NULL;

From bbcf18c293655d7771e346202413beccc64a69d6 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 1 Sep 2023 20:13:22 -0400
Subject: [PATCH 08/13] ZIL: Tune some assertions.

In zil_free_lwb() we should first assert lwb_state, or the rest of the
assertions can be misleading if it is false.

Add lwb_state assertions in zil_lwb_add_block() to make sure we are
not trying to add elements to lwb_vdev_tree after it was processed.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15227
---
 module/zfs/zil.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index f2d279e36a9..be3311b031c 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -814,17 +814,17 @@ static void
 zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
-	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
-	VERIFY(list_is_empty(&lwb->lwb_waiters));
-	VERIFY(list_is_empty(&lwb->lwb_itxs));
-	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+	ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
+	    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 	ASSERT3P(lwb->lwb_child_zio, ==, NULL);
 	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
 	ASSERT3P(lwb->lwb_root_zio, ==, NULL);
 	ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa));
 	ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
-	ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
-	    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
+	VERIFY(list_is_empty(&lwb->lwb_itxs));
+	VERIFY(list_is_empty(&lwb->lwb_waiters));
+	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
 
 	/*
 	 * Clear the zilog's field to indicate this lwb is no longer
@@ -1329,6 +1329,9 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
 	int ndvas = BP_GET_NDVAS(bp);
 	int i;
 
+	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+
 	if (zil_nocacheflush)
 		return;
 

From b1b99e10a6ccda811c29b50136df24228c3e1d92 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 1 Sep 2023 20:13:52 -0400
Subject: [PATCH 09/13] ZIL: Revert zl_lock scope reduction.

While I have no reports of it, I suspect a possible use-after-free
scenario when zil_commit_waiter() tries to dereference zcw_lwb
for an lwb already freed by zil_sync(), while zcw_done is not set.
Extending the zl_lock scope back to what it was originally should block
zil_sync() from freeing the lwb, closing this race.

This reverts #14959 and a couple of chunks of #14841.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15228
---
 module/zfs/zil.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index be3311b031c..297c6b65d4f 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1411,15 +1411,9 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 	zilog_t *zilog = lwb->lwb_zilog;
 	zil_commit_waiter_t *zcw;
 	itx_t *itx;
-	uint64_t txg;
-	list_t itxs, waiters;
 
 	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
 
-	list_create(&itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
-	list_create(&waiters, sizeof (zil_commit_waiter_t),
-	    offsetof(zil_commit_waiter_t, zcw_node));
-
 	hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp;
 
 	mutex_enter(&zilog->zl_lock);
@@ -1428,6 +1422,9 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 
 	lwb->lwb_root_zio = NULL;
 
+	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+	lwb->lwb_state = LWB_STATE_FLUSH_DONE;
+
 	if (zilog->zl_last_lwb_opened == lwb) {
 		/*
 		 * Remember the highest committed log sequence number
@@ -1438,22 +1435,13 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
 	}
 
-	list_move_tail(&itxs, &lwb->lwb_itxs);
-	list_move_tail(&waiters, &lwb->lwb_waiters);
-	txg = lwb->lwb_issued_txg;
-
-	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
-	lwb->lwb_state = LWB_STATE_FLUSH_DONE;
-
-	mutex_exit(&zilog->zl_lock);
-
-	while ((itx = list_remove_head(&itxs)) != NULL)
+	while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
 		zil_itx_destroy(itx);
-	list_destroy(&itxs);
 
-	while ((zcw = list_remove_head(&waiters)) != NULL) {
+	while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
 		mutex_enter(&zcw->zcw_lock);
 
+		ASSERT3P(zcw->zcw_lwb, ==, lwb);
 		zcw->zcw_lwb = NULL;
 		/*
 		 * We expect any ZIO errors from child ZIOs to have been
@@ -1478,7 +1466,11 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 
 		mutex_exit(&zcw->zcw_lock);
 	}
-	list_destroy(&waiters);
+
+	uint64_t txg = lwb->lwb_issued_txg;
+
+	/* Once we drop the lock, lwb may be freed by zil_sync(). */
+	mutex_exit(&zilog->zl_lock);
 
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0);

From 9da6b60417e9eebd066b44bab5b1938cab504678 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 1 Sep 2023 20:14:50 -0400
Subject: [PATCH 10/13] ZIL: Change ZIOs issue order.

In zil_lwb_write_issue(), after issuing lwb_root_zio/lwb_write_zio,
we have no right to access lwb->lwb_child_zio. If it was not there,
the first two ZIOs may have already completed and freed the lwb.
Issuing the ZIOs in the opposite order, from children to parent,
should keep the lwb valid till the end, since the lwb can be freed
only after the lwb_root_zio completion callback.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15233
---
 module/zfs/zil.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 297c6b65d4f..b30676b42d8 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1924,10 +1924,10 @@ next_lwb:
 		    BP_GET_LSIZE(&lwb->lwb_blk));
 	}
 	lwb->lwb_issued_timestamp = gethrtime();
-	zio_nowait(lwb->lwb_root_zio);
-	zio_nowait(lwb->lwb_write_zio);
 	if (lwb->lwb_child_zio)
 		zio_nowait(lwb->lwb_child_zio);
+	zio_nowait(lwb->lwb_write_zio);
+	zio_nowait(lwb->lwb_root_zio);
 
 	/*
 	 * If nlwb was ready when we gave it the block pointer,

From bcb1159c095f57564914b59f5e7e82170261afb0 Mon Sep 17 00:00:00 2001
From: Andrea Righi <andrea.righi@canonical.com>
Date: Sat, 2 Sep 2023 02:21:40 +0200
Subject: [PATCH 11/13] Linux 6.5 compat: safe cleanup in spl_proc_fini()

If we fail to create a proc entry in spl_proc_init() we may end up
calling unregister_sysctl_table() twice: once in the failure path of
spl_proc_init() and again during spl_proc_fini().

Avoid the double call to unregister_sysctl_table() and while at it
refactor the code a bit to reduce code duplication.

This was accidentally introduced when the spl code was
updated for Linux 6.5 compatibility.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
Closes #15234
Closes #15235
---
 module/os/linux/spl/spl-proc.c | 36 +++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c
index bcc356ae55b..5cb5a6dadb0 100644
--- a/module/os/linux/spl/spl-proc.c
+++ b/module/os/linux/spl/spl-proc.c
@@ -659,6 +659,21 @@ static struct ctl_table spl_root[] = {
 };
 #endif
 
+static void spl_proc_cleanup(void)
+{
+	remove_proc_entry("kstat", proc_spl);
+	remove_proc_entry("slab", proc_spl_kmem);
+	remove_proc_entry("kmem", proc_spl);
+	remove_proc_entry("taskq-all", proc_spl);
+	remove_proc_entry("taskq", proc_spl);
+	remove_proc_entry("spl", NULL);
+
+	if (spl_header) {
+		unregister_sysctl_table(spl_header);
+		spl_header = NULL;
+	}
+}
+
 int
 spl_proc_init(void)
 {
@@ -723,15 +738,8 @@ spl_proc_init(void)
 		goto out;
 	}
 out:
-	if (rc) {
-		remove_proc_entry("kstat", proc_spl);
-		remove_proc_entry("slab", proc_spl_kmem);
-		remove_proc_entry("kmem", proc_spl);
-		remove_proc_entry("taskq-all", proc_spl);
-		remove_proc_entry("taskq", proc_spl);
-		remove_proc_entry("spl", NULL);
-		unregister_sysctl_table(spl_header);
-	}
+	if (rc)
+		spl_proc_cleanup();
 
 	return (rc);
 }
@@ -739,13 +747,5 @@ out:
 void
 spl_proc_fini(void)
 {
-	remove_proc_entry("kstat", proc_spl);
-	remove_proc_entry("slab", proc_spl_kmem);
-	remove_proc_entry("kmem", proc_spl);
-	remove_proc_entry("taskq-all", proc_spl);
-	remove_proc_entry("taskq", proc_spl);
-	remove_proc_entry("spl", NULL);
-
-	ASSERT(spl_header != NULL);
-	unregister_sysctl_table(spl_header);
+	spl_proc_cleanup();
 }

From 71472bf375deb4fbd42fd66181aa35864b603b3a Mon Sep 17 00:00:00 2001
From: Umer Saleem <usaleem@ixsystems.com>
Date: Sat, 2 Sep 2023 05:25:11 +0500
Subject: [PATCH 12/13] Relax error reporting in zpool import and zpool split

For zpool import and zpool split, zpool_enable_datasets is called
to mount and share all datasets in a pool. If there is an error
while mounting or sharing any dataset in the pool, the status of
import or split is reported as failure. However, the changes do
show up in zpool list.

This commit updates the error reporting in the zpool import and zpool
split paths. More descriptive messages are shown to the user in case
there is an error during mount or share. Errors in mount or share
do not affect the overall status of zpool import and zpool split.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Umer Saleem <usaleem@ixsystems.com>
Closes #15216
---
 cmd/zpool/zpool_main.c    | 34 +++++++++++++++++++++++-----------
 include/libzfs.h          |  1 +
 lib/libzfs/libzfs_mount.c |  4 ++--
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 10a3b5b14fc..6d0dae8d8b0 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -3143,6 +3143,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
     nvlist_t *props, int flags)
 {
 	int ret = 0;
+	int ms_status = 0;
 	zpool_handle_t *zhp;
 	const char *name;
 	uint64_t version;
@@ -3232,10 +3233,15 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
 			ret = 1;
 
 	if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
-	    !(flags & ZFS_IMPORT_ONLY) &&
-	    zpool_enable_datasets(zhp, mntopts, 0) != 0) {
-		zpool_close(zhp);
-		return (1);
+	    !(flags & ZFS_IMPORT_ONLY)) {
+		ms_status = zpool_enable_datasets(zhp, mntopts, 0);
+		if (ms_status == EZFS_SHAREFAILED) {
+			(void) fprintf(stderr, gettext("Import was "
+			    "successful, but unable to share some datasets"));
+		} else if (ms_status == EZFS_MOUNTFAILED) {
+			(void) fprintf(stderr, gettext("Import was "
+			    "successful, but unable to mount some datasets"));
+		}
 	}
 
 	zpool_close(zhp);
@@ -6755,6 +6761,7 @@ zpool_do_split(int argc, char **argv)
 	char *mntopts = NULL;
 	splitflags_t flags;
 	int c, ret = 0;
+	int ms_status = 0;
 	boolean_t loadkeys = B_FALSE;
 	zpool_handle_t *zhp;
 	nvlist_t *config, *props = NULL;
@@ -6891,13 +6898,18 @@ zpool_do_split(int argc, char **argv)
 			ret = 1;
 	}
 
-	if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
-	    zpool_enable_datasets(zhp, mntopts, 0) != 0) {
-		ret = 1;
-		(void) fprintf(stderr, gettext("Split was successful, but "
-		    "the datasets could not all be mounted\n"));
-		(void) fprintf(stderr, gettext("Try doing '%s' with a "
-		    "different altroot\n"), "zpool import");
+	if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) {
+		ms_status = zpool_enable_datasets(zhp, mntopts, 0);
+		if (ms_status == EZFS_SHAREFAILED) {
+			(void) fprintf(stderr, gettext("Split was successful, "
+			    "datasets are mounted but sharing of some datasets "
+			    "has failed\n"));
+		} else if (ms_status == EZFS_MOUNTFAILED) {
+			(void) fprintf(stderr, gettext("Split was successful"
+			    ", but some datasets could not be mounted\n"));
+			(void) fprintf(stderr, gettext("Try doing '%s' with a "
+			    "different altroot\n"), "zpool import");
+		}
 	}
 	zpool_close(zhp);
 	nvlist_free(config);
diff --git a/include/libzfs.h b/include/libzfs.h
index a7037e3e626..fa05b7921bb 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -156,6 +156,7 @@ typedef enum zfs_error {
 	EZFS_NOT_USER_NAMESPACE,	/* a file is not a user namespace */
 	EZFS_CKSUM,		/* insufficient replicas */
 	EZFS_RESUME_EXISTS,	/* Resume on existing dataset without force */
+	EZFS_SHAREFAILED,	/* filesystem share failed */
 	EZFS_UNKNOWN
 } zfs_error_t;
 
diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c
index 5d1fe651c97..b38ad88096b 100644
--- a/lib/libzfs/libzfs_mount.c
+++ b/lib/libzfs/libzfs_mount.c
@@ -1300,7 +1300,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
 	zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
 	    zfs_mount_one, &ms, B_TRUE);
 	if (ms.ms_mntstatus != 0)
-		ret = ms.ms_mntstatus;
+		ret = EZFS_MOUNTFAILED;
 
 	/*
 	 * Share all filesystems that need to be shared. This needs to be
@@ -1311,7 +1311,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
 	zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
 	    zfs_share_one, &ms, B_FALSE);
 	if (ms.ms_mntstatus != 0)
-		ret = ms.ms_mntstatus;
+		ret = EZFS_SHAREFAILED;
 	else
 		zfs_commit_shares(NULL);
 

From 95f71c019d7c3e3b728a9b05e2117ce6b09f1b87 Mon Sep 17 00:00:00 2001
From: ednadolski-ix <137826107+ednadolski-ix@users.noreply.github.com>
Date: Fri, 1 Sep 2023 19:00:30 -0600
Subject: [PATCH 13/13] Selectable block allocators

ZFS historically has had several space allocators that were
dynamically selectable.  While these have been retained in
OpenZFS, only a single allocator has been statically compiled
in. This patch compiles all allocators for OpenZFS and provides
a module parameter to allow for manual selection between them.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Edmund Nadolski <edmund.nadolski@ixsystems.com>
Closes #15218
---
 include/os/freebsd/spl/sys/mod_os.h |  3 +
 include/sys/metaslab.h              |  1 +
 include/sys/spa.h                   |  3 +
 include/sys/spa_impl.h              |  3 +
 module/os/freebsd/zfs/sysctl_os.c   | 18 ++++++
 module/os/linux/zfs/spa_misc_os.c   | 12 ++++
 module/zfs/metaslab.c               | 98 +++++++++++++++++++++--------
 module/zfs/spa.c                    | 14 +++--
 module/zfs/spa_misc.c               |  6 ++
 9 files changed, 127 insertions(+), 31 deletions(-)

diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h
index 77ce75ca3f1..08d983c51f1 100644
--- a/include/os/freebsd/spl/sys/mod_os.h
+++ b/include/os/freebsd/spl/sys/mod_os.h
@@ -73,6 +73,9 @@
 #define	param_set_deadman_failmode_args(var) \
     CTLTYPE_STRING, NULL, 0, param_set_deadman_failmode, "A"
 
+#define	param_set_active_allocator_args(var) \
+    CTLTYPE_STRING, NULL, 0, param_set_active_allocator, "A"
+
 #define	param_set_deadman_synctime_args(var) \
     CTLTYPE_U64, NULL, 0, param_set_deadman_synctime, "QU"
 
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 0df6e5f81fc..815b5d0c9cf 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -39,6 +39,7 @@ extern "C" {
 
 
 typedef struct metaslab_ops {
+	const char *msop_name;
 	uint64_t (*msop_alloc)(metaslab_t *, uint64_t);
 } metaslab_ops_t;
 
diff --git a/include/sys/spa.h b/include/sys/spa.h
index b9085568741..18062d3f2a9 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1056,6 +1056,8 @@ extern uint64_t spa_deadman_synctime(spa_t *spa);
 extern uint64_t spa_deadman_ziotime(spa_t *spa);
 extern uint64_t spa_dirty_data(spa_t *spa);
 extern spa_autotrim_t spa_get_autotrim(spa_t *spa);
+extern int spa_get_allocator(spa_t *spa);
+extern void spa_set_allocator(spa_t *spa, const char *allocator);
 
 /* Miscellaneous support routines */
 extern void spa_load_failed(spa_t *spa, const char *fmt, ...)
@@ -1207,6 +1209,7 @@ int param_set_deadman_ziotime(ZFS_MODULE_PARAM_ARGS);
 int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS);
 int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS);
 int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS);
+int param_set_active_allocator(ZFS_MODULE_PARAM_ARGS);
 
 #ifdef ZFS_DEBUG
 #define	dprintf_bp(bp, fmt, ...) do {				\
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 588c72f6e4f..1a04bedc313 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -263,6 +263,7 @@ struct spa {
 	 */
 	spa_alloc_t	*spa_allocs;
 	int		spa_alloc_count;
+	int		spa_active_allocator;	/* selectable allocator */
 
 	spa_aux_vdev_t	spa_spares;		/* hot spares */
 	spa_aux_vdev_t	spa_l2cache;		/* L2ARC cache devices */
@@ -467,6 +468,8 @@ extern int param_set_deadman_failmode_common(const char *val);
 extern void spa_set_deadman_synctime(hrtime_t ns);
 extern void spa_set_deadman_ziotime(hrtime_t ns);
 extern const char *spa_history_zone(void);
+extern const char *zfs_active_allocator;
+extern int param_set_active_allocator_common(const char *val);
 
 #ifdef	__cplusplus
 }
diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c
index 8ae2f23c3ec..ba9a95e4a66 100644
--- a/module/os/freebsd/zfs/sysctl_os.c
+++ b/module/os/freebsd/zfs/sysctl_os.c
@@ -503,6 +503,24 @@ SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance,
 
 /* metaslab.c */
 
+int
+param_set_active_allocator(SYSCTL_HANDLER_ARGS)
+{
+	char buf[16];
+	int rc;
+
+	if (req->newptr == NULL)
+		strlcpy(buf, zfs_active_allocator, sizeof (buf));
+
+	rc = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+	if (rc || req->newptr == NULL)
+		return (rc);
+	if (strcmp(buf, zfs_active_allocator) == 0)
+		return (0);
+
+	return (param_set_active_allocator_common(buf));
+}
+
 /*
  * In pools where the log space map feature is not enabled we touch
  * multiple metaslabs (and their respective space maps) with each
diff --git a/module/os/linux/zfs/spa_misc_os.c b/module/os/linux/zfs/spa_misc_os.c
index 3efc8b9644f..c8cbedcd515 100644
--- a/module/os/linux/zfs/spa_misc_os.c
+++ b/module/os/linux/zfs/spa_misc_os.c
@@ -103,6 +103,18 @@ param_set_slop_shift(const char *buf, zfs_kernel_param_t *kp)
 	return (0);
 }
 
+int
+param_set_active_allocator(const char *val, zfs_kernel_param_t *kp)
+{
+	int error;
+
+	error = -param_set_active_allocator_common(val);
+	if (error == 0)
+		error = param_set_charp(val, kp);
+
+	return (error);
+}
+
 const char *
 spa_history_zone(void)
 {
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 20dc934593f..dd4ff77e6f5 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -40,8 +40,6 @@
 #include <sys/zap.h>
 #include <sys/btree.h>
 
-#define	WITH_DF_BLOCK_ALLOCATOR
-
 #define	GANG_ALLOCATION(flags) \
 	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
 
@@ -1622,9 +1620,6 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
 	return (rs);
 }
 
-#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
-    defined(WITH_CF_BLOCK_ALLOCATOR)
-
 /*
  * This is a helper function that can be used by the allocator to find a
  * suitable block to allocate. This will search the specified B-tree looking
@@ -1659,9 +1654,74 @@ metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
 	*cursor = 0;
 	return (-1ULL);
 }
-#endif /* WITH_DF/CF_BLOCK_ALLOCATOR */
 
-#if defined(WITH_DF_BLOCK_ALLOCATOR)
+static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size);
+static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size);
+static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size);
+metaslab_ops_t *metaslab_allocator(spa_t *spa);
+
+static metaslab_ops_t metaslab_allocators[] = {
+	{ "dynamic", metaslab_df_alloc },
+	{ "cursor", metaslab_cf_alloc },
+	{ "new-dynamic", metaslab_ndf_alloc },
+};
+
+static int
+spa_find_allocator_byname(const char *val)
+{
+	int a = ARRAY_SIZE(metaslab_allocators) - 1;
+	if (strcmp("new-dynamic", val) == 0)
+		return (-1); /* remove when ndf is working */
+	for (; a >= 0; a--) {
+		if (strcmp(val, metaslab_allocators[a].msop_name) == 0)
+			return (a);
+	}
+	return (-1);
+}
+
+void
+spa_set_allocator(spa_t *spa, const char *allocator)
+{
+	int a = spa_find_allocator_byname(allocator);
+	if (a < 0) a = 0;
+	spa->spa_active_allocator = a;
+	zfs_dbgmsg("spa allocator: %s\n", metaslab_allocators[a].msop_name);
+}
+
+int
+spa_get_allocator(spa_t *spa)
+{
+	return (spa->spa_active_allocator);
+}
+
+#if defined(_KERNEL)
+int
+param_set_active_allocator_common(const char *val)
+{
+	char *p;
+
+	if (val == NULL)
+		return (SET_ERROR(EINVAL));
+
+	if ((p = strchr(val, '\n')) != NULL)
+		*p = '\0';
+
+	int a = spa_find_allocator_byname(val);
+	if (a < 0)
+		return (SET_ERROR(EINVAL));
+
+	zfs_active_allocator = metaslab_allocators[a].msop_name;
+	return (0);
+}
+#endif
+
+metaslab_ops_t *
+metaslab_allocator(spa_t *spa)
+{
+	int allocator = spa_get_allocator(spa);
+	return (&metaslab_allocators[allocator]);
+}
+
 /*
  * ==========================================================================
  * Dynamic Fit (df) block allocator
@@ -1736,12 +1796,6 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 	return (offset);
 }
 
-const metaslab_ops_t zfs_metaslab_ops = {
-	metaslab_df_alloc
-};
-#endif /* WITH_DF_BLOCK_ALLOCATOR */
-
-#if defined(WITH_CF_BLOCK_ALLOCATOR)
 /*
  * ==========================================================================
  * Cursor fit block allocator -
@@ -1784,12 +1838,6 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 	return (offset);
 }
 
-const metaslab_ops_t zfs_metaslab_ops = {
-	metaslab_cf_alloc
-};
-#endif /* WITH_CF_BLOCK_ALLOCATOR */
-
-#if defined(WITH_NDF_BLOCK_ALLOCATOR)
 /*
  * ==========================================================================
  * New dynamic fit allocator -
@@ -1846,12 +1894,6 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 	return (-1ULL);
 }
 
-const metaslab_ops_t zfs_metaslab_ops = {
-	metaslab_ndf_alloc
-};
-#endif /* WITH_NDF_BLOCK_ALLOCATOR */
-
-
 /*
  * ==========================================================================
  * Metaslabs
@@ -6232,3 +6274,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
 	"Normally only consider this many of the best metaslabs in each vdev");
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator,
+	param_set_active_allocator, param_get_charp, ZMOD_RW,
+	"SPA active allocator");
+/* END CSTYLED */
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 88ee4ea9f45..cda62f939c1 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1295,24 +1295,26 @@ spa_thread(void *arg)
 }
 #endif
 
+extern metaslab_ops_t *metaslab_allocator(spa_t *spa);
+
 /*
  * Activate an uninitialized pool.
  */
 static void
 spa_activate(spa_t *spa, spa_mode_t mode)
 {
+	metaslab_ops_t *msp = metaslab_allocator(spa);
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_mode = mode;
 	spa->spa_read_spacemaps = spa_mode_readable_spacemaps;
 
-	spa->spa_normal_class = metaslab_class_create(spa, &zfs_metaslab_ops);
-	spa->spa_log_class = metaslab_class_create(spa, &zfs_metaslab_ops);
-	spa->spa_embedded_log_class =
-	    metaslab_class_create(spa, &zfs_metaslab_ops);
-	spa->spa_special_class = metaslab_class_create(spa, &zfs_metaslab_ops);
-	spa->spa_dedup_class = metaslab_class_create(spa, &zfs_metaslab_ops);
+	spa->spa_normal_class = metaslab_class_create(spa, msp);
+	spa->spa_log_class = metaslab_class_create(spa, msp);
+	spa->spa_embedded_log_class = metaslab_class_create(spa, msp);
+	spa->spa_special_class = metaslab_class_create(spa, msp);
+	spa->spa_dedup_class = metaslab_class_create(spa, msp);
 
 	/* Try to create a covering process */
 	mutex_enter(&spa->spa_proc_lock);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 3b355e0debc..413476196b9 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -389,6 +389,11 @@ static const uint64_t spa_min_slop = 128ULL * 1024 * 1024;
 static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
 static const int spa_allocators = 4;
 
+/*
+ * Spa active allocator.
+ * Valid values are zfs_active_allocator=<dynamic|cursor|new-dynamic>.
+ */
+const char *zfs_active_allocator = "dynamic";
 
 void
 spa_load_failed(spa_t *spa, const char *fmt, ...)
@@ -710,6 +715,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
 	spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
 	spa_set_deadman_failmode(spa, zfs_deadman_failmode);
+	spa_set_allocator(spa, zfs_active_allocator);
 
 	zfs_refcount_create(&spa->spa_refcount);
 	spa_config_lock_init(spa);