From f7de776da2ee4b703529035975fd3216b4bacc7a Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Tue, 13 Jul 2021 11:41:59 -0400
Subject: [PATCH 01/20] Fix ARC ghost states eviction accounting

arc_evict_hdr() returns the number of bytes evicted from a specific
state.  For ghost states that is the logical buffer size, not the amount
of memory actually freed.  This is correct for tracking the eviction
process, but not for waking up threads waiting for an ARC size
reduction, as added in the "Revise ARC shrinker algorithm" commit: it
causes premature wakeups while the ARC is still overflowed, allowing
even bigger overflow, plus processing overhead when the next allocation
also gets blocked, probably again for too short a time.

To fix that, make arc_evict_hdr() also return the amount of memory
actually freed, which for the ghost states is only the header, and use
it to update arc_evict_count instead.  Originally I was thinking of not
returning it at all, since arc_get_data_impl() does not account for
the headers, but decided that some slow allocation progress is better
than long waits, which reached up to 100ms in my tests.

To reduce the negative latency effects of long periods during which the
reclaim thread can free little real memory, start the reclamation
process earlier, before we actually reach the overflow threshold at
which we have to throttle new allocations.  We can also do this without
taking the global arc_evict_lock, reducing contention.

Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12279
---
 include/sys/arc_impl.h         |   1 -
 man/man4/zfs.4                 |  24 ++---
 module/os/freebsd/zfs/arc_os.c |   2 -
 module/zfs/arc.c               | 167 ++++++++++++++++++++-------------
 4 files changed, 113 insertions(+), 81 deletions(-)

diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index 1f341ec94fa..ddfa28c15d1 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -984,7 +984,6 @@ extern unsigned long zfs_arc_max;
 extern void arc_reduce_target_size(int64_t to_free);
 extern boolean_t arc_reclaim_needed(void);
 extern void arc_kmem_reap_soon(void);
-extern boolean_t arc_is_overflowing(void);
 extern void arc_wait_for_eviction(uint64_t);
 
 extern void arc_lowmem_init(void);
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 6da8d42b42b..346e83a9eb8 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -712,20 +712,22 @@ equivalent to the greater of the number of online CPUs and
 The ARC size is considered to be overflowing if it exceeds the current
 ARC target size
 .Pq Sy arc_c
-by a threshold determined by this parameter.
-The threshold is calculated as a fraction of
-.Sy arc_c
-using the formula
-.Sy arc_c >> zfs_arc_overflow_shift .
+by thresholds determined by this parameter.
+Exceeding by
+.Sy ( arc_c >> zfs_arc_overflow_shift ) * 0.5
+starts ARC reclamation process.
+If that appears insufficient, exceeding by
+.Sy ( arc_c >> zfs_arc_overflow_shift ) * 1.5
+blocks new buffer allocation until the reclaim thread catches up.
+Started reclamation process continues till ARC size returns below the
+target size.
 .Pp
 The default value of
 .Sy 8
-causes the ARC to be considered overflowing if it exceeds the target size by
-.Em 1/256th Pq Em 0.3%
-of the target size.
-.Pp
-When the ARC is overflowing, new buffer allocations are stalled until
-the reclaim thread catches up and the overflow condition no longer exists.
+causes the ARC to start reclamation if it exceeds the target size by
+.Em 0.2%
+of the target size, and block allocations by
+.Em 0.6% .
 .
 .It Sy zfs_arc_p_min_shift Ns = Ns Sy 0 Pq int
 If nonzero, this will update
diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c
index 05377bb7ed9..3b8b11cff0c 100644
--- a/module/os/freebsd/zfs/arc_os.c
+++ b/module/os/freebsd/zfs/arc_os.c
@@ -234,8 +234,6 @@ arc_lowmem(void *arg __unused, int howto __unused)
 	 */
 	if (curproc == pageproc)
 		arc_wait_for_eviction(to_free);
-	else
-		arc_wait_for_eviction(0);
 }
 
 void
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 394ca1bfe42..f1cd482e990 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -826,6 +826,12 @@ typedef enum arc_fill_flags {
 	ARC_FILL_IN_PLACE	= 1 << 4  /* fill in place (special case) */
 } arc_fill_flags_t;
 
+typedef enum arc_ovf_level {
+	ARC_OVF_NONE,			/* ARC within target size. */
+	ARC_OVF_SOME,			/* ARC is slightly overflowed. */
+	ARC_OVF_SEVERE			/* ARC is severely overflowed. */
+} arc_ovf_level_t;
+
 static kmutex_t l2arc_feed_thr_lock;
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
@@ -3861,9 +3867,18 @@ arc_buf_destroy(arc_buf_t *buf, void* tag)
  *    - arc_mru_ghost -> deleted
  *    - arc_mfu_ghost -> arc_l2c_only
  *    - arc_mfu_ghost -> deleted
+ *
+ * Return total size of evicted data buffers for eviction progress tracking.
+ * When evicting from ghost states return logical buffer size to make eviction
+ * progress at the same (or at least comparable) rate as from non-ghost states.
+ *
+ * Return *real_evicted for actual ARC size reduction to wake up threads
+ * waiting for it.  For non-ghost states it includes size of evicted data
+ * buffers (the headers are not freed there).  For ghost states it includes
+ * only the evicted headers size.
  */
 static int64_t
-arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted)
 {
 	arc_state_t *evicted_state, *state;
 	int64_t bytes_evicted = 0;
@@ -3873,6 +3888,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 	ASSERT(MUTEX_HELD(hash_lock));
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
+	*real_evicted = 0;
 	state = hdr->b_l1hdr.b_state;
 	if (GHOST_STATE(state)) {
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
@@ -3909,9 +3925,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 			 */
 			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
 			    hdr_l2only_cache);
+			*real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE;
 		} else {
 			arc_change_state(arc_anon, hdr, hash_lock);
 			arc_hdr_destroy(hdr);
+			*real_evicted += HDR_FULL_SIZE;
 		}
 		return (bytes_evicted);
 	}
@@ -3935,8 +3953,10 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 			ARCSTAT_BUMP(arcstat_mutex_miss);
 			break;
 		}
-		if (buf->b_data != NULL)
+		if (buf->b_data != NULL) {
 			bytes_evicted += HDR_GET_LSIZE(hdr);
+			*real_evicted += HDR_GET_LSIZE(hdr);
+		}
 		mutex_exit(&buf->b_evict_lock);
 		arc_buf_destroy_impl(buf);
 	}
@@ -3972,6 +3992,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 		arc_cksum_free(hdr);
 
 		bytes_evicted += arc_hdr_size(hdr);
+		*real_evicted += arc_hdr_size(hdr);
 
 		/*
 		 * If this hdr is being evicted and has a compressed
@@ -4013,7 +4034,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
     uint64_t spa, int64_t bytes)
 {
 	multilist_sublist_t *mls;
-	uint64_t bytes_evicted = 0;
+	uint64_t bytes_evicted = 0, real_evicted = 0;
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 	int evict_count = 0;
@@ -4074,10 +4095,13 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 		ASSERT(!MUTEX_HELD(hash_lock));
 
 		if (mutex_tryenter(hash_lock)) {
-			uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
+			uint64_t revicted;
+			uint64_t evicted = arc_evict_hdr(hdr, hash_lock,
+			    &revicted);
 			mutex_exit(hash_lock);
 
 			bytes_evicted += evicted;
+			real_evicted += revicted;
 
 			/*
 			 * If evicted is zero, arc_evict_hdr() must have
@@ -4107,7 +4131,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 	 * 1/64th of RAM).  See the comments in arc_wait_for_eviction().
 	 */
 	mutex_enter(&arc_evict_lock);
-	arc_evict_count += bytes_evicted;
+	arc_evict_count += real_evicted;
 
 	if (arc_free_memory() > arc_sys_free / 2) {
 		arc_evict_waiter_t *aw;
@@ -5121,7 +5145,7 @@ arc_adapt(int bytes, arc_state_t *state)
  * Check if arc_size has grown past our upper threshold, determined by
  * zfs_arc_overflow_shift.
  */
-boolean_t
+static arc_ovf_level_t
 arc_is_overflowing(void)
 {
 	/* Always allow at least one block of overflow */
@@ -5137,8 +5161,10 @@ arc_is_overflowing(void)
 	 * in the ARC. In practice, that's in the tens of MB, which is low
 	 * enough to be safe.
 	 */
-	return (aggsum_lower_bound(&arc_sums.arcstat_size) >=
-	    (int64_t)arc_c + overflow);
+	int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) -
+	    arc_c - overflow / 2;
+	return (over < 0 ? ARC_OVF_NONE :
+	    over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
 }
 
 static abd_t *
@@ -5180,58 +5206,73 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 void
 arc_wait_for_eviction(uint64_t amount)
 {
-	mutex_enter(&arc_evict_lock);
-	if (arc_is_overflowing()) {
-		arc_evict_needed = B_TRUE;
-		zthr_wakeup(arc_evict_zthr);
-
-		if (amount != 0) {
-			arc_evict_waiter_t aw;
-			list_link_init(&aw.aew_node);
-			cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
-
-			uint64_t last_count = 0;
-			if (!list_is_empty(&arc_evict_waiters)) {
-				arc_evict_waiter_t *last =
-				    list_tail(&arc_evict_waiters);
-				last_count = last->aew_count;
-			}
-			/*
-			 * Note, the last waiter's count may be less than
-			 * arc_evict_count if we are low on memory in which
-			 * case arc_evict_state_impl() may have deferred
-			 * wakeups (but still incremented arc_evict_count).
-			 */
-			aw.aew_count =
-			    MAX(last_count, arc_evict_count) + amount;
-
-			list_insert_tail(&arc_evict_waiters, &aw);
-
-			arc_set_need_free();
-
-			DTRACE_PROBE3(arc__wait__for__eviction,
-			    uint64_t, amount,
-			    uint64_t, arc_evict_count,
-			    uint64_t, aw.aew_count);
-
-			/*
-			 * We will be woken up either when arc_evict_count
-			 * reaches aew_count, or when the ARC is no longer
-			 * overflowing and eviction completes.
-			 */
-			cv_wait(&aw.aew_cv, &arc_evict_lock);
-
-			/*
-			 * In case of "false" wakeup, we will still be on the
-			 * list.
-			 */
-			if (list_link_active(&aw.aew_node))
-				list_remove(&arc_evict_waiters, &aw);
-
-			cv_destroy(&aw.aew_cv);
+	switch (arc_is_overflowing()) {
+	case ARC_OVF_NONE:
+		return;
+	case ARC_OVF_SOME:
+		/*
+		 * This is a bit racy without taking arc_evict_lock, but the
+		 * worst that can happen is we either call zthr_wakeup() extra
+		 * time due to race with other thread here, or the set flag
+		 * get cleared by arc_evict_cb(), which is unlikely due to
+		 * big hysteresis, but also not important since at this level
+		 * of overflow the eviction is purely advisory.  Same time
+		 * taking the global lock here every time without waiting for
+		 * the actual eviction creates a significant lock contention.
+		 */
+		if (!arc_evict_needed) {
+			arc_evict_needed = B_TRUE;
+			zthr_wakeup(arc_evict_zthr);
 		}
+		return;
+	case ARC_OVF_SEVERE:
+	default:
+	{
+		arc_evict_waiter_t aw;
+		list_link_init(&aw.aew_node);
+		cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
+
+		uint64_t last_count = 0;
+		mutex_enter(&arc_evict_lock);
+		if (!list_is_empty(&arc_evict_waiters)) {
+			arc_evict_waiter_t *last =
+			    list_tail(&arc_evict_waiters);
+			last_count = last->aew_count;
+		} else if (!arc_evict_needed) {
+			arc_evict_needed = B_TRUE;
+			zthr_wakeup(arc_evict_zthr);
+		}
+		/*
+		 * Note, the last waiter's count may be less than
+		 * arc_evict_count if we are low on memory in which
+		 * case arc_evict_state_impl() may have deferred
+		 * wakeups (but still incremented arc_evict_count).
+		 */
+		aw.aew_count = MAX(last_count, arc_evict_count) + amount;
+
+		list_insert_tail(&arc_evict_waiters, &aw);
+
+		arc_set_need_free();
+
+		DTRACE_PROBE3(arc__wait__for__eviction,
+		    uint64_t, amount,
+		    uint64_t, arc_evict_count,
+		    uint64_t, aw.aew_count);
+
+		/*
+		 * We will be woken up either when arc_evict_count reaches
+		 * aew_count, or when the ARC is no longer overflowing and
+		 * eviction completes.
+		 * In case of "false" wakeup, we will still be on the list.
+		 */
+		do {
+			cv_wait(&aw.aew_cv, &arc_evict_lock);
+		} while (list_link_active(&aw.aew_node));
+		mutex_exit(&arc_evict_lock);
+
+		cv_destroy(&aw.aew_cv);
+	}
 	}
-	mutex_exit(&arc_evict_lock);
 }
 
 /*
@@ -5262,16 +5303,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
 	 * requested size to be evicted.  This should be more than 100%, to
 	 * ensure that that progress is also made towards getting arc_size
 	 * under arc_c.  See the comment above zfs_arc_eviction_pct.
-	 *
-	 * We do the overflowing check without holding the arc_evict_lock to
-	 * reduce lock contention in this hot path.  Note that
-	 * arc_wait_for_eviction() will acquire the lock and check again to
-	 * ensure we are truly overflowing before blocking.
 	 */
-	if (arc_is_overflowing()) {
-		arc_wait_for_eviction(size *
-		    zfs_arc_eviction_pct / 100);
-	}
+	arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100);
 
 	VERIFY3U(hdr->b_type, ==, type);
 	if (type == ARC_BUFC_METADATA) {

From 1325434b2d758ecd0b07a3a4f883a0c43393ed37 Mon Sep 17 00:00:00 2001
From: Rich Ercolani <214141+rincebrain@users.noreply.github.com>
Date: Tue, 13 Jul 2021 11:47:57 -0400
Subject: [PATCH 02/20] Tinker with slop space accounting with dedup

* Tinker with slop space accounting with dedup

Do not include the deduplicated space usage in the slop space
reservation; doing so leads to surprising outcomes.

* Update spa_dedup_dspace sometimes

Sometimes, we get into spa_get_slop_space() with
spa_dedup_dspace=~0ULL, AKA "unset", while spa_dspace is correctly set.

So call the code to update it before we use it if we hit that case.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Rich Ercolani <rincebrain@gmail.com>
Closes #12271
---
 module/zfs/ddt.c      |  2 +-
 module/zfs/spa_misc.c | 18 ++++++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 7b0b1d89676..479e5a3ad62 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -503,7 +503,7 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 {
 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		ddt_t *ddt = spa->spa_ddt[c];
-		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+		for (enum ddt_type type = 0; type < DDT_TYPES && ddt; type++) {
 			for (enum ddt_class class = 0; class < DDT_CLASSES;
 			    class++) {
 				ddt_histogram_add(ddh,
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 157dede93cf..29a5381e4b4 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1786,8 +1786,22 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
 uint64_t
 spa_get_slop_space(spa_t *spa)
 {
-	uint64_t space = spa_get_dspace(spa);
-	uint64_t slop = MIN(space >> spa_slop_shift, spa_max_slop);
+	uint64_t space = 0;
+	uint64_t slop = 0;
+
+	/*
+	 * Make sure spa_dedup_dspace has been set.
+	 */
+	if (spa->spa_dedup_dspace == ~0ULL)
+		spa_update_dspace(spa);
+
+	/*
+	 * spa_get_dspace() includes the space only logically "used" by
+	 * deduplicated data, so since it's not useful to reserve more
+	 * space with more deduplicated data, we subtract that out here.
+	 */
+	space = spa_get_dspace(spa) - spa->spa_dedup_dspace;
+	slop = MIN(space >> spa_slop_shift, spa_max_slop);
 
 	/*
 	 * Subtract the embedded log space, but no more than half the (3.2%)

From d9f0f1582c4bd73e5beb8be447358c8a7170e553 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= <nabijaczleweli@nabijaczleweli.xyz>
Date: Tue, 13 Jul 2021 22:50:48 +0200
Subject: [PATCH 03/20] config/libatomic: require -latomic iff atomic.c doesn't
 link w/o it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the absence of LTO, and with a dynamic libatomic, libatomic.so ends
up in the NEEDED entries of every toolchain executable; some consider
this an issue.

Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
Closes #12345
Closes #12359
---
 config/user-libatomic.m4 | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/config/user-libatomic.m4 b/config/user-libatomic.m4
index 14a60bbea9d..d15069f9c44 100644
--- a/config/user-libatomic.m4
+++ b/config/user-libatomic.m4
@@ -1,34 +1,28 @@
 dnl #
-dnl # If -latomic exists, it's needed for __atomic intrinsics.
-dnl #
-dnl # Some systems (like FreeBSD 13) don't have a libatomic at all because
-dnl # their toolchain doesn't ship it – they obviously don't need it.
-dnl #
-dnl # Others (like sufficiently ancient CentOS) have one,
-dnl # but terminally broken or unlinkable (e.g. it's a dangling symlink,
-dnl # or a linker script that points to a nonexistent file) –
-dnl # most arches affected by this don't actually need -latomic (and if they do,
-dnl # then they should have libatomic that actually exists and links,
-dnl # so don't fall into this category).
-dnl #
-dnl # Technically, we could check if the platform *actually* needs -latomic,
-dnl # or if it has native support for all the intrinsics we use,
-dnl # but it /really/ doesn't matter, and C11 recommends to always link it.
+dnl # If -latomic exists and atomic.c doesn't link without it,
+dnl # it's needed for __atomic intrinsics.
 dnl #
 AC_DEFUN([ZFS_AC_CONFIG_USER_LIBATOMIC], [
-	AC_MSG_CHECKING([whether -latomic is present])
+	AC_MSG_CHECKING([whether -latomic is required])
 
 	saved_libs="$LIBS"
 	LIBS="$LIBS -latomic"
+	LIBATOMIC_LIBS=""
 
 	AC_LINK_IFELSE([AC_LANG_PROGRAM([], [])], [
-		LIBATOMIC_LIBS="-latomic"
-		AC_MSG_RESULT([yes])
-	], [
-		LIBATOMIC_LIBS=""
-		AC_MSG_RESULT([no])
+		LIBS="$saved_libs"
+		saved_cflags="$CFLAGS"
+		CFLAGS="$CFLAGS -isystem lib/libspl/include"
+		AC_LINK_IFELSE([AC_LANG_PROGRAM([#include "lib/libspl/atomic.c"], [])], [], [LIBATOMIC_LIBS="-latomic"])
+		CFLAGS="$saved_cflags"
 	])
 
+	if test -n "$LIBATOMIC_LIBS"; then
+		AC_MSG_RESULT([yes])
+	else
+		AC_MSG_RESULT([no])
+	fi
+
 	LIBS="$saved_libs"
 	AC_SUBST([LIBATOMIC_LIBS])
 ])

From 41eba770616c36126cba3468a1781d212d3afb9d Mon Sep 17 00:00:00 2001
From: Jorgen Lundman <lundman@lundman.net>
Date: Fri, 16 Jul 2021 03:31:00 +0900
Subject: [PATCH 04/20] pass handle to do_unmount()

The same change has already been made for do_mount(). On the macOS
platform we need access to zhp to handle devdisks and snapshots.
Also, symmetry is pleasing.

In addition, the code in zpool_disable_datasets() which sorts the
mountpoints did not sort the related handles along with them, which
meant that the pairing between a mountpoint and its handle was lost.
You'd get a random handle with the mountpoint.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Signed-off-by: Jorgen Lundman <lundman@lundman.net>
Closes #12296
---
 lib/libzfs/libzfs_impl.h              |  3 +-
 lib/libzfs/libzfs_mount.c             | 73 +++++++++++++--------------
 lib/libzfs/os/freebsd/libzfs_zmount.c |  2 +-
 lib/libzfs/os/linux/libzfs_mount_os.c |  2 +-
 4 files changed, 39 insertions(+), 41 deletions(-)

diff --git a/lib/libzfs/libzfs_impl.h b/lib/libzfs/libzfs_impl.h
index ce7373582f0..b1cf4f825f8 100644
--- a/lib/libzfs/libzfs_impl.h
+++ b/lib/libzfs/libzfs_impl.h
@@ -243,7 +243,8 @@ extern proto_table_t proto_table[PROTO_END];
 
 extern int do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts,
     int flags);
-extern int do_unmount(const char *mntpt, int flags);
+extern int do_unmount(zfs_handle_t *zhp, const char *mntpt, int flags);
+extern int zfs_mount_delegation_check(void);
 extern int zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto);
 extern int zfs_unshare_proto(zfs_handle_t *, const char *, zfs_share_proto_t *);
 extern int unshare_one(libzfs_handle_t *hdl, const char *name,
diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c
index b074a6e6f37..5729f120e9d 100644
--- a/lib/libzfs/libzfs_mount.c
+++ b/lib/libzfs/libzfs_mount.c
@@ -568,11 +568,11 @@ zfs_mount_at(zfs_handle_t *zhp, const char *options, int flags,
  * Unmount a single filesystem.
  */
 static int
-unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags)
+unmount_one(zfs_handle_t *zhp, const char *mountpoint, int flags)
 {
 	int error;
 
-	error = do_unmount(mountpoint, flags);
+	error = do_unmount(zhp, mountpoint, flags);
 	if (error != 0) {
 		int libzfs_err;
 
@@ -595,7 +595,7 @@ unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags)
 		default:
 			libzfs_err = EZFS_UMOUNTFAILED;
 		}
-		return (zfs_error_fmt(hdl, libzfs_err,
+		return (zfs_error_fmt(zhp->zfs_hdl, libzfs_err,
 		    dgettext(TEXT_DOMAIN, "cannot unmount '%s'"),
 		    mountpoint));
 	}
@@ -637,7 +637,7 @@ zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags)
 		}
 		zfs_commit_all_shares();
 
-		if (unmount_one(hdl, mntpt, flags) != 0) {
+		if (unmount_one(zhp, mntpt, flags) != 0) {
 			free(mntpt);
 			(void) zfs_shareall(zhp);
 			zfs_commit_all_shares();
@@ -1503,13 +1503,18 @@ out:
 	return (ret);
 }
 
+struct sets_s {
+	char *mountpoint;
+	zfs_handle_t *dataset;
+};
+
 static int
 mountpoint_compare(const void *a, const void *b)
 {
-	const char *mounta = *((char **)a);
-	const char *mountb = *((char **)b);
+	const struct sets_s *mounta = (struct sets_s *)a;
+	const struct sets_s *mountb = (struct sets_s *)b;
 
-	return (strcmp(mountb, mounta));
+	return (strcmp(mountb->mountpoint, mounta->mountpoint));
 }
 
 /*
@@ -1526,8 +1531,7 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
 	FILE *mnttab;
 	struct mnttab entry;
 	size_t namelen;
-	char **mountpoints = NULL;
-	zfs_handle_t **datasets = NULL;
+	struct sets_s *sets = NULL;
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
 	int i;
 	int ret = -1;
@@ -1562,35 +1566,27 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
 		 */
 		if (used == alloc) {
 			if (alloc == 0) {
-				if ((mountpoints = zfs_alloc(hdl,
-				    8 * sizeof (void *))) == NULL)
-					goto out;
 
-				if ((datasets = zfs_alloc(hdl,
-				    8 * sizeof (void *))) == NULL)
+				if ((sets = zfs_alloc(hdl,
+				    8 * sizeof (struct sets_s))) == NULL)
 					goto out;
 
 				alloc = 8;
 			} else {
 				void *ptr;
 
-				if ((ptr = zfs_realloc(hdl, mountpoints,
-				    alloc * sizeof (void *),
-				    alloc * 2 * sizeof (void *))) == NULL)
+				if ((ptr = zfs_realloc(hdl, sets,
+				    alloc * sizeof (struct sets_s),
+				    alloc * 2 * sizeof (struct sets_s)))
+				    == NULL)
 					goto out;
-				mountpoints = ptr;
-
-				if ((ptr = zfs_realloc(hdl, datasets,
-				    alloc * sizeof (void *),
-				    alloc * 2 * sizeof (void *))) == NULL)
-					goto out;
-				datasets = ptr;
+				sets = ptr;
 
 				alloc *= 2;
 			}
 		}
 
-		if ((mountpoints[used] = zfs_strdup(hdl,
+		if ((sets[used].mountpoint = zfs_strdup(hdl,
 		    entry.mnt_mountp)) == NULL)
 			goto out;
 
@@ -1599,7 +1595,8 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
 		 * is only used to determine if we need to remove the underlying
 		 * mountpoint, so failure is not fatal.
 		 */
-		datasets[used] = make_dataset_handle(hdl, entry.mnt_special);
+		sets[used].dataset = make_dataset_handle(hdl,
+		    entry.mnt_special);
 
 		used++;
 	}
@@ -1608,7 +1605,7 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
 	 * At this point, we have the entire list of filesystems, so sort it by
 	 * mountpoint.
 	 */
-	qsort(mountpoints, used, sizeof (char *), mountpoint_compare);
+	qsort(sets, used, sizeof (struct sets_s), mountpoint_compare);
 
 	/*
 	 * Walk through and first unshare everything.
@@ -1617,9 +1614,9 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
 		zfs_share_proto_t *curr_proto;
 		for (curr_proto = share_all_proto; *curr_proto != PROTO_END;
 		    curr_proto++) {
-			if (is_shared(mountpoints[i], *curr_proto) &&
-			    unshare_one(hdl, mountpoints[i],
-			    mountpoints[i], *curr_proto) != 0)
+			if (is_shared(sets[i].mountpoint, *curr_proto) &&
+			    unshare_one(hdl, sets[i].mountpoint,
+			    sets[i].mountpoint, *curr_proto) != 0)
 				goto out;
 		}
 	}
@@ -1630,25 +1627,25 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
 	 * appropriate.
 	 */
 	for (i = 0; i < used; i++) {
-		if (unmount_one(hdl, mountpoints[i], flags) != 0)
+		if (unmount_one(sets[i].dataset, sets[i].mountpoint,
+		    flags) != 0)
 			goto out;
 	}
 
 	for (i = 0; i < used; i++) {
-		if (datasets[i])
-			remove_mountpoint(datasets[i]);
+		if (sets[i].dataset)
+			remove_mountpoint(sets[i].dataset);
 	}
 
 	ret = 0;
 out:
 	(void) fclose(mnttab);
 	for (i = 0; i < used; i++) {
-		if (datasets[i])
-			zfs_close(datasets[i]);
-		free(mountpoints[i]);
+		if (sets[i].dataset)
+			zfs_close(sets[i].dataset);
+		free(sets[i].mountpoint);
 	}
-	free(datasets);
-	free(mountpoints);
+	free(sets);
 
 	return (ret);
 }
diff --git a/lib/libzfs/os/freebsd/libzfs_zmount.c b/lib/libzfs/os/freebsd/libzfs_zmount.c
index e1febe6a2d9..6bc073cb03b 100644
--- a/lib/libzfs/os/freebsd/libzfs_zmount.c
+++ b/lib/libzfs/os/freebsd/libzfs_zmount.c
@@ -121,7 +121,7 @@ do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags)
 }
 
 int
-do_unmount(const char *mntpt, int flags)
+do_unmount(zfs_handle_t *zhp, const char *mntpt, int flags)
 {
 	if (unmount(mntpt, flags) < 0)
 		return (errno);
diff --git a/lib/libzfs/os/linux/libzfs_mount_os.c b/lib/libzfs/os/linux/libzfs_mount_os.c
index 547895d7e37..42f300b36c9 100644
--- a/lib/libzfs/os/linux/libzfs_mount_os.c
+++ b/lib/libzfs/os/linux/libzfs_mount_os.c
@@ -374,7 +374,7 @@ do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags)
 }
 
 int
-do_unmount(const char *mntpt, int flags)
+do_unmount(zfs_handle_t *zhp, const char *mntpt, int flags)
 {
 	if (!libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
 		int rv = umount2(mntpt, flags);

From c1b5869bab987711ed00c3b5b43ee7145d332003 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 16 Jul 2021 15:39:24 -0400
Subject: [PATCH 05/20] Introduce dsl_dir_diduse_transfer_space()

Most of the CPU time of dsl_dir_diduse_space() and
dsl_dir_transfer_space() is dd_lock overhead and time spent in
dmu_buf_will_dirty().  Calling them one after another is a waste of
time and creates even more contention.  Doing that twice for each
rewritten block within dbuf_write_done() via dsl_dataset_block_kill()
and dsl_dataset_block_born() created one of the biggest CPU overheads
when rewriting small blocks.

dsl_dir_diduse_transfer_space() combines functionality of these two
functions for cases where it is needed, but without double overhead,
practically for the cost of dsl_dir_diduse_space() or even cheaper.

While there, optimize dsl_dir_phys() calls in dsl_dir_diduse_space()
and dsl_dir_transfer_space().  It seems Clang detects some aliasing
there, repeating dd->dd_dbuf->db_data dereference multiple times,
increasing dd_lock scope and contention.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Author: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12300
---
 include/sys/dsl_dir.h    |   3 ++
 module/zfs/dsl_dataset.c |  10 ++--
 module/zfs/dsl_dir.c     | 110 +++++++++++++++++++++++++++------------
 3 files changed, 85 insertions(+), 38 deletions(-)

diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h
index 7cf5093c2c3..d635b314042 100644
--- a/include/sys/dsl_dir.h
+++ b/include/sys/dsl_dir.h
@@ -174,6 +174,9 @@ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
 void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
+void dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
+    int64_t compressed, int64_t uncompressed, int64_t tonew,
+    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
 int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
     uint64_t quota);
 int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 1c03216ef6d..f99964511aa 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -192,9 +192,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 	}
 
 	mutex_exit(&ds->ds_lock);
-	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
-	    compressed, uncompressed, tx);
-	dsl_dir_transfer_space(ds->ds_dir, used - delta,
+	dsl_dir_diduse_transfer_space(ds->ds_dir, delta,
+	    compressed, uncompressed, used,
 	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
 }
 
@@ -291,9 +290,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 		delta = parent_delta(ds, -used);
 		dsl_dataset_phys(ds)->ds_unique_bytes -= used;
 		mutex_exit(&ds->ds_lock);
-		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
-		    delta, -compressed, -uncompressed, tx);
-		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
+		dsl_dir_diduse_transfer_space(ds->ds_dir,
+		    delta, -compressed, -uncompressed, -used,
 		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
 	} else {
 		dprintf_bp(bp, "putting on dead list: %s", "");
diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c
index df2c3d8f063..84caace4dba 100644
--- a/module/zfs/dsl_dir.c
+++ b/module/zfs/dsl_dir.c
@@ -1517,6 +1517,11 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 {
 	int64_t accounted_delta;
 
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(type < DD_USED_NUM);
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
 	/*
 	 * dsl_dataset_set_refreservation_sync_impl() calls this with
 	 * dd_lock held, so that it can atomically update
@@ -1525,36 +1530,28 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 	 * consistently.
 	 */
 	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
-
-	ASSERT(dmu_tx_is_syncing(tx));
-	ASSERT(type < DD_USED_NUM);
-
-	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-
 	if (needlock)
 		mutex_enter(&dd->dd_lock);
-	accounted_delta =
-	    parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
-	ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
-	ASSERT(compressed >= 0 ||
-	    dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
+	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
+	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
+	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
+	ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
 	ASSERT(uncompressed >= 0 ||
-	    dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
-	dsl_dir_phys(dd)->dd_used_bytes += used;
-	dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
-	dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
+	    ddp->dd_uncompressed_bytes >= -uncompressed);
+	ddp->dd_used_bytes += used;
+	ddp->dd_uncompressed_bytes += uncompressed;
+	ddp->dd_compressed_bytes += compressed;
 
-	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
-		ASSERT(used > 0 ||
-		    dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
-		dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
+	if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+		ASSERT(used >= 0 || ddp->dd_used_breakdown[type] >= -used);
+		ddp->dd_used_breakdown[type] += used;
 #ifdef ZFS_DEBUG
 		{
 			dd_used_t t;
 			uint64_t u = 0;
 			for (t = 0; t < DD_USED_NUM; t++)
-				u += dsl_dir_phys(dd)->dd_used_breakdown[t];
-			ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
+				u += ddp->dd_used_breakdown[t];
+			ASSERT3U(u, ==, ddp->dd_used_bytes);
 		}
 #endif
 	}
@@ -1562,11 +1559,9 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 		mutex_exit(&dd->dd_lock);
 
 	if (dd->dd_parent != NULL) {
-		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
-		    accounted_delta, compressed, uncompressed, tx);
-		dsl_dir_transfer_space(dd->dd_parent,
-		    used - accounted_delta,
-		    DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
+		dsl_dir_diduse_transfer_space(dd->dd_parent,
+		    accounted_delta, compressed, uncompressed,
+		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
 	}
 }
 
@@ -1578,21 +1573,72 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
 	ASSERT(oldtype < DD_USED_NUM);
 	ASSERT(newtype < DD_USED_NUM);
 
+	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
 	if (delta == 0 ||
-	    !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
+	    !(ddp->dd_flags & DD_FLAG_USED_BREAKDOWN))
 		return;
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	mutex_enter(&dd->dd_lock);
 	ASSERT(delta > 0 ?
-	    dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
-	    dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
-	ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
-	dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
-	dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
+	    ddp->dd_used_breakdown[oldtype] >= delta :
+	    ddp->dd_used_breakdown[newtype] >= -delta);
+	ASSERT(ddp->dd_used_bytes >= ABS(delta));
+	ddp->dd_used_breakdown[oldtype] -= delta;
+	ddp->dd_used_breakdown[newtype] += delta;
 	mutex_exit(&dd->dd_lock);
 }
 
+void
+dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
+    int64_t compressed, int64_t uncompressed, int64_t tonew,
+    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
+{
+	int64_t accounted_delta;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(oldtype < DD_USED_NUM);
+	ASSERT(newtype < DD_USED_NUM);
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	mutex_enter(&dd->dd_lock);
+	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
+	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
+	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
+	ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
+	ASSERT(uncompressed >= 0 ||
+	    ddp->dd_uncompressed_bytes >= -uncompressed);
+	ddp->dd_used_bytes += used;
+	ddp->dd_uncompressed_bytes += uncompressed;
+	ddp->dd_compressed_bytes += compressed;
+
+	if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+		ASSERT(tonew - used <= 0 ||
+		    ddp->dd_used_breakdown[oldtype] >= tonew - used);
+		ASSERT(tonew >= 0 ||
+		    ddp->dd_used_breakdown[newtype] >= -tonew);
+		ddp->dd_used_breakdown[oldtype] -= tonew - used;
+		ddp->dd_used_breakdown[newtype] += tonew;
+#ifdef ZFS_DEBUG
+		{
+			dd_used_t t;
+			uint64_t u = 0;
+			for (t = 0; t < DD_USED_NUM; t++)
+				u += ddp->dd_used_breakdown[t];
+			ASSERT3U(u, ==, ddp->dd_used_bytes);
+		}
+#endif
+	}
+	mutex_exit(&dd->dd_lock);
+
+	if (dd->dd_parent != NULL) {
+		dsl_dir_diduse_transfer_space(dd->dd_parent,
+		    accounted_delta, compressed, uncompressed,
+		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
+	}
+}
+
 typedef struct dsl_dir_set_qr_arg {
 	const char *ddsqra_name;
 	zprop_source_t ddsqra_source;

From b7ec530233ec1bf16a330443848a23a93a056624 Mon Sep 17 00:00:00 2001
From: Rich Ercolani <214141+rincebrain@users.noreply.github.com>
Date: Fri, 16 Jul 2021 15:58:01 -0400
Subject: [PATCH 06/20] Correct zfs-send(8) on readonly sends

zfs-send(8) claimed in the flags list you could use -pR when sending
a readonly filesystem or volume. You cannot.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Signed-off-by: Rich Ercolani <rincebrain@gmail.com>
Closes #12336
---
 cmd/zfs/zfs_main.c  |  2 +-
 man/man8/zfs-send.8 | 10 +++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index 38bfdc91aea..a8d68cd03cf 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -317,7 +317,7 @@ get_usage(zfs_help_t idx)
 	case HELP_SEND:
 		return (gettext("\tsend [-DnPpRvLecwhb] [-[i|I] snapshot] "
 		    "<snapshot>\n"
-		    "\tsend [-nvPLecw] [-i snapshot|bookmark] "
+		    "\tsend [-DnvPLecw] [-i snapshot|bookmark] "
 		    "<filesystem|volume|snapshot>\n"
 		    "\tsend [-DnPpvLec] [-i bookmark|snapshot] "
 		    "--redact <bookmark> <snapshot>\n"
diff --git a/man/man8/zfs-send.8 b/man/man8/zfs-send.8
index a3d08fbf6e2..688bd033979 100644
--- a/man/man8/zfs-send.8
+++ b/man/man8/zfs-send.8
@@ -44,7 +44,7 @@
 .Ar snapshot
 .Nm zfs
 .Cm send
-.Op Fl DLPRcenpsvw
+.Op Fl DLPcensvw
 .Op Fl i Ar snapshot Ns | Ns Ar bookmark
 .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
 .Nm zfs
@@ -285,7 +285,7 @@ You will be able to receive your streams on future versions of ZFS.
 .It Xo
 .Nm zfs
 .Cm send
-.Op Fl DLPRcenpvw
+.Op Fl DLPcenvw
 .Op Fl i Ar snapshot Ns | Ns Ar bookmark
 .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
 .Xc
@@ -296,7 +296,11 @@ filesystem must not be mounted.
 When the stream generated from a filesystem or volume is received, the default
 snapshot name will be
 .Qq --head-- .
-.Bl -tag -width "-L"
+.Bl -tag -width "-D"
+.It Fl D , -dedup
+Deduplicated send is no longer supported.
+This flag is accepted for backwards compatibility, but a regular,
+non-deduplicated stream will be generated.
 .It Fl L , -large-block
 Generate a stream which may contain blocks larger than 128KB.
 This flag has no effect if the

From b17b19943edfe734211388c6ebd626109232cf17 Mon Sep 17 00:00:00 2001
From: George Melikov <mail@gmelikov.ru>
Date: Fri, 16 Jul 2021 23:04:00 +0300
Subject: [PATCH 07/20] zpool_influxdb: fix -Werror=stringop-truncation

Use strlcpy instead of problematic strncpy

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: George Melikov <mail@gmelikov.ru>
Closes #12344
---
 cmd/zpool_influxdb/zpool_influxdb.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cmd/zpool_influxdb/zpool_influxdb.c b/cmd/zpool_influxdb/zpool_influxdb.c
index 5dc39afe830..b60d18ee955 100644
--- a/cmd/zpool_influxdb/zpool_influxdb.c
+++ b/cmd/zpool_influxdb/zpool_influxdb.c
@@ -684,9 +684,8 @@ print_recursive_stats(stat_printer_f func, nvlist_t *nvroot,
 
 	if (descend && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 	    &child, &children) == 0) {
-		(void) strncpy(vdev_name, get_vdev_name(nvroot, parent_name),
+		(void) strlcpy(vdev_name, get_vdev_name(nvroot, parent_name),
 		    sizeof (vdev_name));
-		vdev_name[sizeof (vdev_name) - 1] = '\0';
 
 		for (c = 0; c < children; c++) {
 			print_recursive_stats(func, child[c], pool_name,

From ca14e08cbff36cadd26928cb01222707930973cb Mon Sep 17 00:00:00 2001
From: Kevin Bowling <kevin.bowling@kev009.com>
Date: Fri, 16 Jul 2021 13:28:55 -0700
Subject: [PATCH 08/20] Detect HAVE_LARGE_STACKS at compile time

Move HAVE_LARGE_STACKS definitions to header and set when appropriate.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Kevin Bowling <kbowling@FreeBSD.org>
Closes #12350
---
 config/Rules.am                               |  1 -
 config/kernel-config-defined.m4               | 31 -------------------
 include/os/freebsd/zfs/sys/zfs_context_os.h   |  4 +++
 include/os/linux/zfs/sys/zfs_context_os.h     |  5 +++
 .../include/os/freebsd/sys/zfs_context_os.h   |  1 +
 .../include/os/linux/sys/zfs_context_os.h     |  3 ++
 6 files changed, 13 insertions(+), 32 deletions(-)

diff --git a/config/Rules.am b/config/Rules.am
index 8fe2fa9ca8d..be80c1e9c7c 100644
--- a/config/Rules.am
+++ b/config/Rules.am
@@ -39,7 +39,6 @@ AM_CPPFLAGS  = -D_GNU_SOURCE
 AM_CPPFLAGS += -D_REENTRANT
 AM_CPPFLAGS += -D_FILE_OFFSET_BITS=64
 AM_CPPFLAGS += -D_LARGEFILE64_SOURCE
-AM_CPPFLAGS += -DHAVE_LARGE_STACKS=1
 AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\"
 AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\"
 AM_CPPFLAGS += -DSBINDIR=\"$(sbindir)\"
diff --git a/config/kernel-config-defined.m4 b/config/kernel-config-defined.m4
index 9b9468269ca..c7d18b49b14 100644
--- a/config/kernel-config-defined.m4
+++ b/config/kernel-config-defined.m4
@@ -19,7 +19,6 @@ AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEFINED], [
 		])
 	])
 
-	ZFS_AC_KERNEL_SRC_CONFIG_THREAD_SIZE
 	ZFS_AC_KERNEL_SRC_CONFIG_DEBUG_LOCK_ALLOC
 	ZFS_AC_KERNEL_SRC_CONFIG_TRIM_UNUSED_KSYMS
 	ZFS_AC_KERNEL_SRC_CONFIG_ZLIB_INFLATE
@@ -29,42 +28,12 @@ AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEFINED], [
 	ZFS_LINUX_TEST_COMPILE_ALL([config])
 	AC_MSG_RESULT([done])
 
-	ZFS_AC_KERNEL_CONFIG_THREAD_SIZE
 	ZFS_AC_KERNEL_CONFIG_DEBUG_LOCK_ALLOC
 	ZFS_AC_KERNEL_CONFIG_TRIM_UNUSED_KSYMS
 	ZFS_AC_KERNEL_CONFIG_ZLIB_INFLATE
 	ZFS_AC_KERNEL_CONFIG_ZLIB_DEFLATE
 ])
 
-dnl #
-dnl # Check configured THREAD_SIZE
-dnl #
-dnl # The stack size will vary by architecture, but as of Linux 3.15 on x86_64
-dnl # the default thread stack size was increased to 16K from 8K.  Therefore,
-dnl # on newer kernels and some architectures stack usage optimizations can be
-dnl # conditionally applied to improve performance without negatively impacting
-dnl # stability.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_SRC_CONFIG_THREAD_SIZE], [
-	ZFS_LINUX_TEST_SRC([config_thread_size], [
-		#include <linux/module.h>
-	],[
-		#if (THREAD_SIZE < 16384)
-		#error "THREAD_SIZE is less than 16K"
-		#endif
-	])
-])
-
-AC_DEFUN([ZFS_AC_KERNEL_CONFIG_THREAD_SIZE], [
-	AC_MSG_CHECKING([whether kernel was built with 16K or larger stacks])
-	ZFS_LINUX_TEST_RESULT([config_thread_size], [
-		AC_MSG_RESULT([yes])
-		AC_DEFINE(HAVE_LARGE_STACKS, 1, [kernel has large stacks])
-	],[
-		AC_MSG_RESULT([no])
-	])
-])
-
 dnl #
 dnl # Check CONFIG_DEBUG_LOCK_ALLOC
 dnl #
diff --git a/include/os/freebsd/zfs/sys/zfs_context_os.h b/include/os/freebsd/zfs/sys/zfs_context_os.h
index 8dbe907d098..a32eb52c53c 100644
--- a/include/os/freebsd/zfs/sys/zfs_context_os.h
+++ b/include/os/freebsd/zfs/sys/zfs_context_os.h
@@ -41,6 +41,10 @@
 #include <sys/ccompat.h>
 #include <linux/types.h>
 
+#if KSTACK_PAGES * PAGE_SIZE >= 16384
+#define	HAVE_LARGE_STACKS	1
+#endif
+
 #define	cond_resched()		kern_yield(PRI_USER)
 
 #define	taskq_create_sysdc(a, b, d, e, p, dc, f) \
diff --git a/include/os/linux/zfs/sys/zfs_context_os.h b/include/os/linux/zfs/sys/zfs_context_os.h
index de7015b929b..981a6b8a63e 100644
--- a/include/os/linux/zfs/sys/zfs_context_os.h
+++ b/include/os/linux/zfs/sys/zfs_context_os.h
@@ -25,5 +25,10 @@
 
 #include <linux/dcache_compat.h>
 #include <linux/utsname_compat.h>
+#include <linux/module.h>
+
+#if THREAD_SIZE >= 16384
+#define	HAVE_LARGE_STACKS	1
+#endif
 
 #endif
diff --git a/lib/libspl/include/os/freebsd/sys/zfs_context_os.h b/lib/libspl/include/os/freebsd/sys/zfs_context_os.h
index f5a136d2212..b9bf487c2ae 100644
--- a/lib/libspl/include/os/freebsd/sys/zfs_context_os.h
+++ b/lib/libspl/include/os/freebsd/sys/zfs_context_os.h
@@ -29,6 +29,7 @@
 #ifndef ZFS_CONTEXT_OS_H_
 #define	ZFS_CONTEXT_OS_H_
 
+#define	HAVE_LARGE_STACKS	1
 #define	ZFS_EXPORTS_PATH	"/etc/zfs/exports"
 
 #endif
diff --git a/lib/libspl/include/os/linux/sys/zfs_context_os.h b/lib/libspl/include/os/linux/sys/zfs_context_os.h
index 008e57df4ea..81ced520774 100644
--- a/lib/libspl/include/os/linux/sys/zfs_context_os.h
+++ b/lib/libspl/include/os/linux/sys/zfs_context_os.h
@@ -22,4 +22,7 @@
 
 #ifndef ZFS_CONTEXT_OS_H
 #define	ZFS_CONTEXT_OS_H
+
+#define	HAVE_LARGE_STACKS	1
+
 #endif

From eecceeae9feee7f7398c423e81b276a394c8ffae Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Mon, 19 Jul 2021 11:56:58 -0400
Subject: [PATCH 09/20] FreeBSD: Switch from MAXPHYS to maxphys on FreeBSD 13+

Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12378
---
 module/os/freebsd/zfs/vdev_geom.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c
index 2353c460023..6ac37da1c58 100644
--- a/module/os/freebsd/zfs/vdev_geom.c
+++ b/module/os/freebsd/zfs/vdev_geom.c
@@ -381,7 +381,11 @@ vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
 	int i, n_bios, j;
 	size_t bios_size;
 
+#if __FreeBSD_version > 1300130
+	maxio = maxphys - (maxphys % cp->provider->sectorsize);
+#else
 	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
+#endif
 	n_bios = 0;
 
 	/* How many bios are required for all commands ? */

From de12cd251105d4842a8b749b7386b8bb74e8f8cc Mon Sep 17 00:00:00 2001
From: Ryan Moeller <ryan@iXsystems.com>
Date: Mon, 19 Jul 2021 12:02:35 -0400
Subject: [PATCH 10/20] Remove unused fields from zvol_task_t

We don't use or need the pool name or value source in the zvol tasks.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Ryan Moeller <ryan@iXsystems.com>
Closes #12361
---
 module/zfs/zvol.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 23df0e1541a..e7b84fa815a 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -106,10 +106,8 @@ typedef enum {
 
 typedef struct {
 	zvol_async_op_t op;
-	char pool[MAXNAMELEN];
 	char name1[MAXNAMELEN];
 	char name2[MAXNAMELEN];
-	zprop_source_t source;
 	uint64_t value;
 } zvol_task_t;
 
@@ -1435,7 +1433,6 @@ zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
     uint64_t value)
 {
 	zvol_task_t *task;
-	char *delim;
 
 	/* Never allow tasks on hidden names. */
 	if (name1[0] == '$')
@@ -1444,8 +1441,6 @@ zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
 	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
 	task->op = op;
 	task->value = value;
-	delim = strchr(name1, '/');
-	strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN);
 
 	strlcpy(task->name1, name1, MAXNAMELEN);
 	if (name2 != NULL)

From 65b929364162418337ae563fa523d480411790f9 Mon Sep 17 00:00:00 2001
From: Ryan Moeller <ryan@iXsystems.com>
Date: Mon, 19 Jul 2021 12:52:50 -0400
Subject: [PATCH 11/20] Use SET_ERROR for more errors in FreeBSD vnops

We should use SET_ERROR when we first get an error.

Add it in the FreeBSD xattr implementations where missing.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Ryan Moeller <ryan@iXsystems.com>
Closes #12356
---
 module/os/freebsd/zfs/zfs_vnops_os.c | 45 ++++++++++++++++++----------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index 46a632b0385..846b4b60531 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -5343,7 +5343,7 @@ zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname)
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0)
-		return (error);
+		return (SET_ERROR(error));
 
 	if (ap->a_size != NULL) {
 		error = VOP_GETATTR(vp, &va, ap->a_cred);
@@ -5374,15 +5374,17 @@ zfs_getextattr_sa(struct vop_getextattr_args *ap, const char *attrname)
 
 	error = nvlist_lookup_byte_array(zp->z_xattr_cached, attrname,
 	    &nv_value, &nv_size);
-	if (error)
-		return (error);
+	if (error != 0)
+		return (SET_ERROR(error));
 
 	if (ap->a_size != NULL)
 		*ap->a_size = nv_size;
 	else if (ap->a_uio != NULL)
 		error = uiomove(nv_value, nv_size, ap->a_uio);
+	if (error != 0)
+		return (SET_ERROR(error));
 
-	return (error);
+	return (0);
 }
 
 /*
@@ -5405,7 +5407,7 @@ zfs_getextattr(struct vop_getextattr_args *ap)
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error != 0)
-		return (error);
+		return (SET_ERROR(error));
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof (attrname));
@@ -5456,7 +5458,7 @@ zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname)
 	vp = nd.ni_vp;
 	if (error != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
-		return (error);
+		return (SET_ERROR(error));
 	}
 
 	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
@@ -5487,7 +5489,9 @@ zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname)
 
 	nvl = zp->z_xattr_cached;
 	error = nvlist_remove(nvl, attrname, DATA_TYPE_BYTE_ARRAY);
-	if (error == 0)
+	if (error != 0)
+		error = SET_ERROR(error);
+	else
 		error = zfs_sa_set_xattr(zp);
 	if (error != 0) {
 		zp->z_xattr_cached = NULL;
@@ -5516,7 +5520,7 @@ zfs_deleteextattr(struct vop_deleteextattr_args *ap)
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error != 0)
-		return (error);
+		return (SET_ERROR(error));
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof (attrname));
@@ -5583,7 +5587,7 @@ zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname)
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0)
-		return (error);
+		return (SET_ERROR(error));
 
 	VATTR_NULL(&va);
 	va.va_size = 0;
@@ -5617,13 +5621,18 @@ zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname)
 		return (SET_ERROR(EFBIG));
 	error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
 	if (error != 0)
-		return (error);
+		return (SET_ERROR(error));
 	if (sa_size > DXATTR_MAX_SA_SIZE)
 		return (SET_ERROR(EFBIG));
 	uchar_t *buf = kmem_alloc(entry_size, KM_SLEEP);
 	error = uiomove(buf, entry_size, ap->a_uio);
-	if (error == 0)
+	if (error != 0) {
+		error = SET_ERROR(error);
+	} else {
 		error = nvlist_add_byte_array(nvl, attrname, buf, entry_size);
+		if (error != 0)
+			error = SET_ERROR(error);
+	}
 	kmem_free(buf, entry_size);
 	if (error == 0)
 		error = zfs_sa_set_xattr(zp);
@@ -5654,7 +5663,7 @@ zfs_setextattr(struct vop_setextattr_args *ap)
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error != 0)
-		return (error);
+		return (SET_ERROR(error));
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof (attrname));
@@ -5733,7 +5742,7 @@ zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix)
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0)
-		return (error);
+		return (SET_ERROR(error));
 
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
@@ -5779,8 +5788,10 @@ zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix)
 					char *namep = dp->d_name + plen;
 					error = uiomove(namep, nlen, ap->a_uio);
 				}
-				if (error != 0)
+				if (error != 0) {
+					error = SET_ERROR(error);
 					break;
+				}
 			}
 		}
 	} while (!eof && error == 0);
@@ -5825,8 +5836,10 @@ zfs_listextattr_sa(struct vop_listextattr_args *ap, const char *attrprefix)
 				char *namep = __DECONST(char *, name) + plen;
 				error = uiomove(namep, nlen, ap->a_uio);
 			}
-			if (error != 0)
+			if (error != 0) {
+				error = SET_ERROR(error);
 				break;
+			}
 		}
 	}
 
@@ -5856,7 +5869,7 @@ zfs_listextattr(struct vop_listextattr_args *ap)
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error != 0)
-		return (error);
+		return (SET_ERROR(error));
 
 	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
 	    sizeof (attrprefix));

From 23c13c7e807ec8abb368e00699a34ffe0bd50885 Mon Sep 17 00:00:00 2001
From: Alexander <solbjorn@users.noreply.github.com>
Date: Tue, 20 Jul 2021 16:03:33 +0200
Subject: [PATCH 12/20] A few fixes of callback typecasting (for the upcoming
 ClangCFI)

* zio: avoid callback typecasting
* zil: avoid zil_itxg_clean() callback typecasting
* zpl: decouple zpl_readpage() into two separate callbacks
* nvpair: explicitly declare callbacks for xdr_array()
* linux/zfs_vnops: don't use external iput() as a callback
* zcp_synctask: don't use fnvlist_free() as a callback
* zvol: don't use ops->zv_free() as a callback for taskq_dispatch()

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Closes #12260
---
 include/sys/zio.h                  |  4 +-
 module/nvpair/nvpair.c             | 64 ++++++++++++++++++++++++++----
 module/os/linux/zfs/zfs_vnops_os.c |  8 +++-
 module/os/linux/zfs/zpl_file.c     | 19 +++++++--
 module/zfs/zcp_synctask.c          | 15 ++++---
 module/zfs/zil.c                   |  5 ++-
 module/zfs/zio.c                   | 18 ++++-----
 module/zfs/zvol.c                  | 10 ++++-
 8 files changed, 111 insertions(+), 32 deletions(-)

diff --git a/include/sys/zio.h b/include/sys/zio.h
index c792cb65b67..2d34481f6be 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -572,8 +572,8 @@ extern void zio_shrink(zio_t *zio, uint64_t size);
 
 extern int zio_wait(zio_t *zio);
 extern void zio_nowait(zio_t *zio);
-extern void zio_execute(zio_t *zio);
-extern void zio_interrupt(zio_t *zio);
+extern void zio_execute(void *zio);
+extern void zio_interrupt(void *zio);
 extern void zio_delay_init(zio_t *zio);
 extern void zio_delay_interrupt(zio_t *zio);
 extern void zio_deadman(zio_t *zio, char *tag);
diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c
index 990a4482c99..5f427c8cf2e 100644
--- a/module/nvpair/nvpair.c
+++ b/module/nvpair/nvpair.c
@@ -3213,6 +3213,56 @@ nvs_xdr_nvl_fini(nvstream_t *nvs)
 	return (0);
 }
 
+/*
+ * xdrproc_t-compatible callbacks for xdr_array()
+ */
+
+#if defined(_KERNEL) && defined(__linux__) /* Linux kernel */
+
+#define	NVS_BUILD_XDRPROC_T(type)		\
+static bool_t					\
+nvs_xdr_nvp_##type(XDR *xdrs, void *ptr)	\
+{						\
+	return (xdr_##type(xdrs, ptr));		\
+}
+
+#elif !defined(_KERNEL) && defined(XDR_CONTROL) /* tirpc */
+
+#define	NVS_BUILD_XDRPROC_T(type)		\
+static bool_t					\
+nvs_xdr_nvp_##type(XDR *xdrs, ...)		\
+{						\
+	va_list args;				\
+	void *ptr;				\
+						\
+	va_start(args, xdrs);			\
+	ptr = va_arg(args, void *);		\
+	va_end(args);				\
+						\
+	return (xdr_##type(xdrs, ptr));		\
+}
+
+#else /* FreeBSD, sunrpc */
+
+#define	NVS_BUILD_XDRPROC_T(type)		\
+static bool_t					\
+nvs_xdr_nvp_##type(XDR *xdrs, void *ptr, ...)	\
+{						\
+	return (xdr_##type(xdrs, ptr));		\
+}
+
+#endif
+
+/* BEGIN CSTYLED */
+NVS_BUILD_XDRPROC_T(char);
+NVS_BUILD_XDRPROC_T(short);
+NVS_BUILD_XDRPROC_T(u_short);
+NVS_BUILD_XDRPROC_T(int);
+NVS_BUILD_XDRPROC_T(u_int);
+NVS_BUILD_XDRPROC_T(longlong_t);
+NVS_BUILD_XDRPROC_T(u_longlong_t);
+/* END CSTYLED */
+
 /*
  * The format of xdr encoded nvpair is:
  * encode_size, decode_size, name string, data type, nelem, data
@@ -3335,38 +3385,38 @@ nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
 	case DATA_TYPE_INT8_ARRAY:
 	case DATA_TYPE_UINT8_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t),
-		    (xdrproc_t)xdr_char);
+		    nvs_xdr_nvp_char);
 		break;
 
 	case DATA_TYPE_INT16_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t),
-		    sizeof (int16_t), (xdrproc_t)xdr_short);
+		    sizeof (int16_t), nvs_xdr_nvp_short);
 		break;
 
 	case DATA_TYPE_UINT16_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t),
-		    sizeof (uint16_t), (xdrproc_t)xdr_u_short);
+		    sizeof (uint16_t), nvs_xdr_nvp_u_short);
 		break;
 
 	case DATA_TYPE_BOOLEAN_ARRAY:
 	case DATA_TYPE_INT32_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t),
-		    sizeof (int32_t), (xdrproc_t)xdr_int);
+		    sizeof (int32_t), nvs_xdr_nvp_int);
 		break;
 
 	case DATA_TYPE_UINT32_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t),
-		    sizeof (uint32_t), (xdrproc_t)xdr_u_int);
+		    sizeof (uint32_t), nvs_xdr_nvp_u_int);
 		break;
 
 	case DATA_TYPE_INT64_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t),
-		    sizeof (int64_t), (xdrproc_t)xdr_longlong_t);
+		    sizeof (int64_t), nvs_xdr_nvp_longlong_t);
 		break;
 
 	case DATA_TYPE_UINT64_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t),
-		    sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t);
+		    sizeof (uint64_t), nvs_xdr_nvp_u_longlong_t);
 		break;
 
 	case DATA_TYPE_STRING_ARRAY: {
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index 24c016c5fcf..e0dc6ed9574 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -367,6 +367,12 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
 	return (error);
 }
 
+static void
+zfs_rele_async_task(void *arg)
+{
+	iput(arg);
+}
+
 void
 zfs_zrele_async(znode_t *zp)
 {
@@ -386,7 +392,7 @@ zfs_zrele_async(znode_t *zp)
 	 */
 	if (!atomic_add_unless(&ip->i_count, -1, 1)) {
 		VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
-		    (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID);
+		    zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
 	}
 }
 
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index 524c43dcded..0319148b983 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -591,8 +591,8 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
  * only used to support mmap(2).  There will be an identical copy of the
  * data in the ARC which is kept up to date via .write() and .writepage().
  */
-static int
-zpl_readpage(struct file *filp, struct page *pp)
+static inline int
+zpl_readpage_common(struct page *pp)
 {
 	struct inode *ip;
 	struct page *pl[1];
@@ -620,6 +620,18 @@ zpl_readpage(struct file *filp, struct page *pp)
 	return (error);
 }
 
+static int
+zpl_readpage(struct file *filp, struct page *pp)
+{
+	return (zpl_readpage_common(pp));
+}
+
+static int
+zpl_readpage_filler(void *data, struct page *pp)
+{
+	return (zpl_readpage_common(pp));
+}
+
 /*
  * Populate a set of pages with data for the Linux page cache.  This
  * function will only be called for read ahead and never for demand
@@ -630,8 +642,7 @@ static int
 zpl_readpages(struct file *filp, struct address_space *mapping,
     struct list_head *pages, unsigned nr_pages)
 {
-	return (read_cache_pages(mapping, pages,
-	    (filler_t *)zpl_readpage, filp));
+	return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL));
 }
 
 static int
diff --git a/module/zfs/zcp_synctask.c b/module/zfs/zcp_synctask.c
index 4e0fa0d85cb..c6ade59b9ce 100644
--- a/module/zfs/zcp_synctask.c
+++ b/module/zfs/zcp_synctask.c
@@ -54,6 +54,12 @@ typedef struct zcp_synctask_info {
 	int blocks_modified;
 } zcp_synctask_info_t;
 
+static void
+zcp_synctask_cleanup(void *arg)
+{
+	fnvlist_free(arg);
+}
+
 /*
  * Generic synctask interface for channel program syncfuncs.
  *
@@ -275,7 +281,7 @@ zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details)
 	fnvlist_add_boolean(ddsa.ddsa_snaps, dsname);
 
 	zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
-	    (zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps);
+	    zcp_synctask_cleanup, ddsa.ddsa_snaps);
 
 	err = zcp_sync_task(state, dsl_dataset_snapshot_check,
 	    dsl_dataset_snapshot_sync, &ddsa, sync, dsname);
@@ -363,7 +369,7 @@ zcp_synctask_inherit_prop(lua_State *state, boolean_t sync,
 	fnvlist_add_boolean(dpsa->dpsa_props, prop);
 
 	zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
-	    (zcp_cleanup_t *)&fnvlist_free, dpsa->dpsa_props);
+	    zcp_synctask_cleanup, dpsa->dpsa_props);
 
 	err = zcp_sync_task(state, zcp_synctask_inherit_prop_check,
 	    zcp_synctask_inherit_prop_sync, &zipa, sync, dsname);
@@ -402,7 +408,7 @@ zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details)
 	fnvlist_add_string(bmarks, new, source);
 
 	zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
-	    (zcp_cleanup_t *)&fnvlist_free, bmarks);
+	    zcp_synctask_cleanup, bmarks);
 
 	dsl_bookmark_create_arg_t dbca = {
 		.dbca_bmarks = bmarks,
@@ -467,8 +473,7 @@ zcp_synctask_wrapper(lua_State *state)
 	 * Make sure err_details is properly freed, even if a fatal error is
 	 * thrown during the synctask.
 	 */
-	zch = zcp_register_cleanup(state,
-	    (zcp_cleanup_t *)&fnvlist_free, err_details);
+	zch = zcp_register_cleanup(state, zcp_synctask_cleanup, err_details);
 
 	zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1));
 	boolean_t sync = lua_toboolean(state, lua_upvalueindex(2));
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 78d0711cce4..d8d39f861c7 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1822,12 +1822,13 @@ zil_itx_destroy(itx_t *itx)
  * so no locks are needed.
  */
 static void
-zil_itxg_clean(itxs_t *itxs)
+zil_itxg_clean(void *arg)
 {
 	itx_t *itx;
 	list_t *list;
 	avl_tree_t *t;
 	void *cookie;
+	itxs_t *itxs = arg;
 	itx_async_node_t *ian;
 
 	list = &itxs->i_sync_list;
@@ -2047,7 +2048,7 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
 	ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
 	ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
 	taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
-	    (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP);
+	    zil_itxg_clean, clean_me, TQ_NOSLEEP);
 	if (id == TASKQID_INVALID)
 		zil_itxg_clean(clean_me);
 }
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index e33d36dab5f..6030b3813f2 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1891,8 +1891,8 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
 	 * to dispatch the zio to another taskq at the same time.
 	 */
 	ASSERT(taskq_empty_ent(&zio->io_tqent));
-	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
-	    flags, &zio->io_tqent);
+	spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags,
+	    &zio->io_tqent);
 }
 
 static boolean_t
@@ -1923,7 +1923,7 @@ zio_issue_async(zio_t *zio)
 }
 
 void
-zio_interrupt(zio_t *zio)
+zio_interrupt(void *zio)
 {
 	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
 }
@@ -1981,8 +1981,8 @@ zio_delay_interrupt(zio_t *zio)
 				 * OpenZFS's timeout_generic().
 				 */
 				tid = taskq_dispatch_delay(system_taskq,
-				    (task_func_t *)zio_interrupt,
-				    zio, TQ_NOSLEEP, expire_at_tick);
+				    zio_interrupt, zio, TQ_NOSLEEP,
+				    expire_at_tick);
 				if (tid == TASKQID_INVALID) {
 					/*
 					 * Couldn't allocate a task.  Just
@@ -2103,7 +2103,7 @@ static zio_pipe_stage_t *zio_pipeline[];
  * it is externally visible.
  */
 void
-zio_execute(zio_t *zio)
+zio_execute(void *zio)
 {
 	fstrans_cookie_t cookie;
 
@@ -2292,8 +2292,9 @@ zio_nowait(zio_t *zio)
  */
 
 static void
-zio_reexecute(zio_t *pio)
+zio_reexecute(void *arg)
 {
+	zio_t *pio = arg;
 	zio_t *cio, *cio_next;
 
 	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
@@ -4788,8 +4789,7 @@ zio_done(zio_t *zio)
 			ASSERT(taskq_empty_ent(&zio->io_tqent));
 			spa_taskq_dispatch_ent(zio->io_spa,
 			    ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
-			    (task_func_t *)zio_reexecute, zio, 0,
-			    &zio->io_tqent);
+			    zio_reexecute, zio, 0, &zio->io_tqent);
 		}
 		return (NULL);
 	}
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index e7b84fa815a..c4ecf14df6d 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -1195,6 +1195,12 @@ zvol_create_minor(const char *name)
  * Remove minors for specified dataset including children and snapshots.
  */
 
+static void
+zvol_free_task(void *arg)
+{
+	ops->zv_free(arg);
+}
+
 void
 zvol_remove_minors_impl(const char *name)
 {
@@ -1243,8 +1249,8 @@ zvol_remove_minors_impl(const char *name)
 			mutex_exit(&zv->zv_state_lock);
 
 			/* Try parallel zv_free, if failed do it in place */
-			t = taskq_dispatch(system_taskq,
-			    (task_func_t *)ops->zv_free, zv, TQ_SLEEP);
+			t = taskq_dispatch(system_taskq, zvol_free_task, zv,
+			    TQ_SLEEP);
 			if (t == TASKQID_INVALID)
 				list_insert_head(&free_list, zv);
 		} else {

From e04210035eba31c40978658b8d2fa7f823a9f7de Mon Sep 17 00:00:00 2001
From: Jorgen Lundman <lundman@lundman.net>
Date: Tue, 20 Jul 2021 23:08:45 +0900
Subject: [PATCH 13/20] dmu_redact.c does not call bqueue_destroy

Ensure every call to bqueue_init() has a corresponding call to bqueue_destroy().

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Jorgen Lundman <lundman@lundman.net>
Closes #12118
---
 module/zfs/dmu_redact.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c
index 62c7d01d4bd..fdbdf7d6e86 100644
--- a/module/zfs/dmu_redact.c
+++ b/module/zfs/dmu_redact.c
@@ -816,6 +816,7 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads,
 		avl_remove(&end_tree, &redact_nodes[i]);
 		kmem_free(redact_nodes[i].record,
 		    sizeof (struct redact_record));
+		bqueue_destroy(&thread_args[i].q);
 	}
 
 	avl_destroy(&start_tree);
@@ -1164,6 +1165,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
 	(void) thread_create(NULL, 0, redact_merge_thread, rmta, 0, curproc,
 	    TS_RUN, minclsyspri);
 	err = perform_redaction(os, new_rl, rmta);
+	bqueue_destroy(&rmta->q);
 	kmem_free(rmta, sizeof (struct redact_merge_thread_arg));
 
 out:

From 8172df643b6cdc7fe233b18c8d8e9b29455ae9f1 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Tue, 20 Jul 2021 10:13:21 -0400
Subject: [PATCH 14/20] Minor ARC optimizations

Remove unneeded global, practically constant, state pointer variables
(arc_anon, arc_mru, etc.), replacing them with macros of real state
variables addresses (&ARC_anon, &ARC_mru, etc.).

Change ARC_EVICT_ALL from -1ULL to UINT64_MAX, not requiring special
handling in inner loop of ARC reclamation.  Respectively change bytes
argument of arc_evict_state() from int64_t to uint64_t.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Closes #12348
---
 include/sys/arc.h      |  2 +-
 include/sys/arc_impl.h | 11 +++++++++--
 module/zfs/arc.c       | 40 +++++++++-------------------------------
 3 files changed, 19 insertions(+), 34 deletions(-)

diff --git a/include/sys/arc.h b/include/sys/arc.h
index f58fa53b600..ef07a657f53 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -44,7 +44,7 @@ extern "C" {
  * Used by arc_flush() to inform arc_evict_state() that it should evict
  * all available buffers from the arc state being passed in.
  */
-#define	ARC_EVICT_ALL	-1ULL
+#define	ARC_EVICT_ALL	UINT64_MAX
 
 #define	HDR_SET_LSIZE(hdr, x) do { \
 	ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index ddfa28c15d1..747100a2206 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -964,6 +964,13 @@ typedef struct arc_evict_waiter {
 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
 #define	arc_sys_free	ARCSTAT(arcstat_sys_free) /* target system free bytes */
 
+#define	arc_anon	(&ARC_anon)
+#define	arc_mru		(&ARC_mru)
+#define	arc_mru_ghost	(&ARC_mru_ghost)
+#define	arc_mfu		(&ARC_mfu)
+#define	arc_mfu_ghost	(&ARC_mfu_ghost)
+#define	arc_l2c_only	(&ARC_l2c_only)
+
 extern taskq_t *arc_prune_taskq;
 extern arc_stats_t arc_stats;
 extern arc_sums_t arc_sums;
@@ -974,8 +981,8 @@ extern int arc_no_grow_shift;
 extern int arc_shrink_shift;
 extern kmutex_t arc_prune_mtx;
 extern list_t arc_prune_list;
-extern arc_state_t	*arc_mfu;
-extern arc_state_t	*arc_mru;
+extern arc_state_t	ARC_mfu;
+extern arc_state_t	ARC_mru;
 extern uint_t zfs_arc_pc_percent;
 extern int arc_lotsfree_percent;
 extern unsigned long zfs_arc_min;
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index f1cd482e990..bf76c8523f4 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -648,13 +648,6 @@ arc_sums_t arc_sums;
 	} while (0)
 
 kstat_t			*arc_ksp;
-static arc_state_t	*arc_anon;
-static arc_state_t	*arc_mru_ghost;
-static arc_state_t	*arc_mfu_ghost;
-static arc_state_t	*arc_l2c_only;
-
-arc_state_t	*arc_mru;
-arc_state_t	*arc_mfu;
 
 /*
  * There are several ARC variables that are critical to export as kstats --
@@ -2203,7 +2196,6 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
 		return;
 	}
 
-	ASSERT(!GHOST_STATE(state));
 	if (hdr->b_l1hdr.b_pabd != NULL) {
 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
 		    arc_hdr_size(hdr), hdr);
@@ -2244,7 +2236,6 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
 		return;
 	}
 
-	ASSERT(!GHOST_STATE(state));
 	if (hdr->b_l1hdr.b_pabd != NULL) {
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    arc_hdr_size(hdr), hdr);
@@ -4031,23 +4022,21 @@ arc_set_need_free(void)
 
 static uint64_t
 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
-    uint64_t spa, int64_t bytes)
+    uint64_t spa, uint64_t bytes)
 {
 	multilist_sublist_t *mls;
 	uint64_t bytes_evicted = 0, real_evicted = 0;
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
-	int evict_count = 0;
+	int evict_count = zfs_arc_evict_batch_limit;
 
 	ASSERT3P(marker, !=, NULL);
-	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
 
 	mls = multilist_sublist_lock(ml, idx);
 
-	for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+	for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
 	    hdr = multilist_sublist_prev(mls, marker)) {
-		if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
-		    (evict_count >= zfs_arc_evict_batch_limit))
+		if ((evict_count <= 0) || (bytes_evicted >= bytes))
 			break;
 
 		/*
@@ -4109,7 +4098,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 			 * evict_count in this case.
 			 */
 			if (evicted != 0)
-				evict_count++;
+				evict_count--;
 
 		} else {
 			ARCSTAT_BUMP(arcstat_mutex_miss);
@@ -4170,7 +4159,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
  * the given arc state; which is used by arc_flush().
  */
 static uint64_t
-arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
+arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes,
     arc_buf_contents_t type)
 {
 	uint64_t total_evicted = 0;
@@ -4178,8 +4167,6 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
 	int num_sublists;
 	arc_buf_hdr_t **markers;
 
-	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
-
 	num_sublists = multilist_get_num_sublists(ml);
 
 	/*
@@ -4211,7 +4198,7 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
 	 * While we haven't hit our target number of bytes to evict, or
 	 * we're evicting all available buffers.
 	 */
-	while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
+	while (total_evicted < bytes) {
 		int sublist_idx = multilist_get_random_index(ml);
 		uint64_t scan_evicted = 0;
 
@@ -4239,9 +4226,7 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
 			uint64_t bytes_remaining;
 			uint64_t bytes_evicted;
 
-			if (bytes == ARC_EVICT_ALL)
-				bytes_remaining = ARC_EVICT_ALL;
-			else if (total_evicted < bytes)
+			if (total_evicted < bytes)
 				bytes_remaining = bytes - total_evicted;
 			else
 				break;
@@ -4336,7 +4321,7 @@ static uint64_t
 arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
     arc_buf_contents_t type)
 {
-	int64_t delta;
+	uint64_t delta;
 
 	if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
 		delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
@@ -7596,13 +7581,6 @@ arc_tuning_update(boolean_t verbose)
 static void
 arc_state_init(void)
 {
-	arc_anon = &ARC_anon;
-	arc_mru = &ARC_mru;
-	arc_mru_ghost = &ARC_mru_ghost;
-	arc_mfu = &ARC_mfu;
-	arc_mfu_ghost = &ARC_mfu_ghost;
-	arc_l2c_only = &ARC_l2c_only;
-
 	multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
 	    sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),

From a7bd20e309a4b45b18b1da8e379f5826debe4870 Mon Sep 17 00:00:00 2001
From: Kevin Jin <33590050+jxdking@users.noreply.github.com>
Date: Tue, 20 Jul 2021 11:40:24 -0400
Subject: [PATCH 15/20] Add Module Parameter Regarding Log Size Limit

* Add Module Parameters Regarding Log Size Limit

zfs_wrlog_data_max
The upper limit of TX_WRITE log data. Once it is reached,
write operation is blocked, until log data is cleared out
after txg sync. It only counts TX_WRITE log with WR_COPIED
or WR_NEED_COPY.

Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: jxdking <lostking2008@hotmail.com>
Closes #12284
---
 include/sys/dmu_tx.h   |  1 +
 include/sys/dsl_pool.h |  7 ++++++
 man/man4/zfs.4         | 12 +++++++++
 module/zfs/arc.c       | 12 +++++++++
 module/zfs/dmu_tx.c    |  7 ++++++
 module/zfs/dsl_pool.c  | 57 ++++++++++++++++++++++++++++++++++++++++++
 module/zfs/zfs_log.c   |  5 ++++
 module/zfs/zvol.c      |  7 ++++--
 8 files changed, 106 insertions(+), 2 deletions(-)

diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h
index 60e9ed6e26f..71a9ac7ca7b 100644
--- a/include/sys/dmu_tx.h
+++ b/include/sys/dmu_tx.h
@@ -124,6 +124,7 @@ typedef struct dmu_tx_stats {
 	kstat_named_t dmu_tx_dirty_throttle;
 	kstat_named_t dmu_tx_dirty_delay;
 	kstat_named_t dmu_tx_dirty_over_max;
+	kstat_named_t dmu_tx_wrlog_over_max;
 	kstat_named_t dmu_tx_dirty_frees_delay;
 	kstat_named_t dmu_tx_quota;
 } dmu_tx_stats_t;
diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index 8249bb8fc63..44900f8ceb2 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -40,6 +40,7 @@
 #include <sys/rrwlock.h>
 #include <sys/dsl_synctask.h>
 #include <sys/mmp.h>
+#include <sys/aggsum.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -58,6 +59,7 @@ struct dsl_deadlist;
 
 extern unsigned long zfs_dirty_data_max;
 extern unsigned long zfs_dirty_data_max_max;
+extern unsigned long zfs_wrlog_data_max;
 extern int zfs_dirty_data_sync_percent;
 extern int zfs_dirty_data_max_percent;
 extern int zfs_dirty_data_max_max_percent;
@@ -119,6 +121,9 @@ typedef struct dsl_pool {
 	uint64_t dp_mos_compressed_delta;
 	uint64_t dp_mos_uncompressed_delta;
 
+	aggsum_t dp_wrlog_pertxg[TXG_SIZE];
+	aggsum_t dp_wrlog_total;
+
 	/*
 	 * Time of most recently scheduled (furthest in the future)
 	 * wakeup for delayed transactions.
@@ -158,6 +163,8 @@ int dsl_pool_sync_context(dsl_pool_t *dp);
 uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy);
 uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
     zfs_space_check_t slop_policy);
+void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg);
+boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp);
 void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
 void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
 void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 346e83a9eb8..9a1dec3e67c 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1068,6 +1068,18 @@ Start syncing out a transaction group if there's at least this much dirty data
 This should be less than
 .Sy zfs_vdev_async_write_active_min_dirty_percent .
 .
+.It Sy zfs_wrlog_data_max Ns = Pq int
+The upper limit of write-transaction ZIL log data size in bytes.
+Once it is reached, write operations are blocked until log data is cleared out
+after the transaction group sync.
+Because of some overhead, it should be set to at least 2 times the size of
+.Sy zfs_dirty_data_max
+.No to prevent harming normal write throughput.
+It also should be smaller than the size of the slog device if slog is present.
+.Pp
+Defaults to
+.Sy zfs_dirty_data_max*2
+.
 .It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint
 Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be
 preallocated for a file in order to guarantee that later writes will not
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index bf76c8523f4..02663e8e2e5 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -7980,6 +7980,18 @@ arc_init(void)
 		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
 		    zfs_dirty_data_max_max);
 	}
+
+	if (zfs_wrlog_data_max == 0) {
+
+		/*
+		 * dp_wrlog_total is reduced for each txg at the end of
+		 * spa_sync(). However, dp_dirty_total is reduced every time
+		 * a block is written out. Thus under normal operation,
+		 * dp_wrlog_total could grow 2 times as big as
+		 * zfs_dirty_data_max.
+		 */
+		zfs_wrlog_data_max = zfs_dirty_data_max * 2;
+	}
 }
 
 void
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 0beb983f992..5fa51686666 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -53,6 +53,7 @@ dmu_tx_stats_t dmu_tx_stats = {
 	{ "dmu_tx_dirty_throttle",	KSTAT_DATA_UINT64 },
 	{ "dmu_tx_dirty_delay",		KSTAT_DATA_UINT64 },
 	{ "dmu_tx_dirty_over_max",	KSTAT_DATA_UINT64 },
+	{ "dmu_tx_wrlog_over_max",	KSTAT_DATA_UINT64 },
 	{ "dmu_tx_dirty_frees_delay",	KSTAT_DATA_UINT64 },
 	{ "dmu_tx_quota",		KSTAT_DATA_UINT64 },
 };
@@ -884,6 +885,12 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 		return (SET_ERROR(ERESTART));
 	}
 
+	if (!tx->tx_dirty_delayed &&
+	    dsl_pool_wrlog_over_max(tx->tx_pool)) {
+		DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max);
+		return (SET_ERROR(ERESTART));
+	}
+
 	if (!tx->tx_dirty_delayed &&
 	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
 		tx->tx_wait_dirty = B_TRUE;
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 72f4b86d772..1350f132956 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -104,6 +104,14 @@ unsigned long zfs_dirty_data_max_max = 0;
 int zfs_dirty_data_max_percent = 10;
 int zfs_dirty_data_max_max_percent = 25;
 
+/*
+ * zfs_wrlog_data_max, the upper limit of TX_WRITE log data.
+ * Once it is reached, write operation is blocked,
+ * until log data is cleared out after txg sync.
+ * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY.
+ */
+unsigned long zfs_wrlog_data_max = 0;
+
 /*
  * If there's at least this much dirty data (as a percentage of
  * zfs_dirty_data_max), push out a txg.  This should be less than
@@ -220,6 +228,11 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
+	aggsum_init(&dp->dp_wrlog_total, 0);
+	for (int i = 0; i < TXG_SIZE; i++) {
+		aggsum_init(&dp->dp_wrlog_pertxg[i], 0);
+	}
+
 	dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
 	    boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
 	    TASKQ_THREADS_CPU_PCT);
@@ -416,6 +429,14 @@ dsl_pool_close(dsl_pool_t *dp)
 	rrw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	cv_destroy(&dp->dp_spaceavail_cv);
+
+	ASSERT0(aggsum_value(&dp->dp_wrlog_total));
+	aggsum_fini(&dp->dp_wrlog_total);
+	for (int i = 0; i < TXG_SIZE; i++) {
+		ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i]));
+		aggsum_fini(&dp->dp_wrlog_pertxg[i]);
+	}
+
 	taskq_destroy(dp->dp_unlinked_drain_taskq);
 	taskq_destroy(dp->dp_zrele_taskq);
 	if (dp->dp_blkstats != NULL) {
@@ -592,6 +613,36 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
 		cv_signal(&dp->dp_spaceavail_cv);
 }
 
+void
+dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg)
+{
+	ASSERT3S(size, >=, 0);
+
+	aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size);
+	aggsum_add(&dp->dp_wrlog_total, size);
+
+	/* Choose a value slightly bigger than min dirty sync bytes */
+	uint64_t sync_min =
+	    zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100;
+	if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0)
+		txg_kick(dp, txg);
+}
+
+boolean_t
+dsl_pool_wrlog_over_max(dsl_pool_t *dp)
+{
+	return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0);
+}
+
+static void
+dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg)
+{
+	int64_t delta;
+	delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
+	aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta);
+	aggsum_add(&dp->dp_wrlog_total, delta);
+}
+
 #ifdef ZFS_DEBUG
 static boolean_t
 dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
@@ -816,6 +867,9 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 		dmu_buf_rele(ds->ds_dbuf, zilog);
 	}
+
+	dsl_pool_wrlog_clear(dp, txg);
+
 	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }
 
@@ -1405,6 +1459,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW,
 	"Determines the dirty space limit");
 
+ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW,
+	"The size limit of write-transaction zil log data");
+
 /* zfs_dirty_data_max_max only applied at module load in arc_init(). */
 ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD,
 	"zfs_dirty_data_max upper bound in bytes");
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 30d5c4821ae..0f330ec933a 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -541,6 +541,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	itx_wr_state_t write_state;
 	uintptr_t fsync_cnt;
 	uint64_t gen = 0;
+	ssize_t size = resid;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked ||
 	    zfs_xattr_owner_unlinked(zp)) {
@@ -626,6 +627,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 		off += len;
 		resid -= len;
 	}
+
+	if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
+		dsl_pool_wrlog_count(zilog->zl_dmu_pool, size, tx->tx_txg);
+	}
 }
 
 /*
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index c4ecf14df6d..b7bc587cf62 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -84,10 +84,8 @@
 #include <sys/zfs_rlock.h>
 #include <sys/spa_impl.h>
 #include <sys/zvol.h>
-
 #include <sys/zvol_impl.h>
 
-
 unsigned int zvol_inhibit_dev = 0;
 unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
 
@@ -577,6 +575,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
 	uint32_t blocksize = zv->zv_volblocksize;
 	zilog_t *zilog = zv->zv_zilog;
 	itx_wr_state_t write_state;
+	uint64_t sz = size;
 
 	if (zil_replaying(zilog, tx))
 		return;
@@ -628,6 +627,10 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
 		offset += len;
 		size -= len;
 	}
+
+	if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
+		dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg);
+	}
 }
 
 /*

From bc93935ef0294e559bd0b46809a33c717070a8ce Mon Sep 17 00:00:00 2001
From: George Melikov <mail@gmelikov.ru>
Date: Wed, 21 Jul 2021 01:21:00 +0300
Subject: [PATCH 16/20] CI: generate ABI files if changed

So commit author can just download them as
artifacts and commit.

Reviewed-by: Ryan Moeller <ryan@ixsystems.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Signed-off-by: George Melikov <mail@gmelikov.ru>
Closes #12379
---
 .github/workflows/checkstyle.yaml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/workflows/checkstyle.yaml b/.github/workflows/checkstyle.yaml
index 8dcd5047a74..33276d26909 100644
--- a/.github/workflows/checkstyle.yaml
+++ b/.github/workflows/checkstyle.yaml
@@ -32,5 +32,19 @@ jobs:
       run: |
         make lint
     - name: CheckABI
+      id: CheckABI
       run: |
         make checkabi
+    - name: StoreABI
+      if: failure() && steps.CheckABI.outcome == 'failure'
+      run: |
+        make storeabi
+    - name: Prepare artifacts
+      if: failure() && steps.CheckABI.outcome == 'failure'
+      run: |
+        find -name '*.abi' | tar -cf abi_files.tar -T -  # quoted: find, not the shell, expands the glob
+    - uses: actions/upload-artifact@v2
+      if: failure() && steps.CheckABI.outcome == 'failure'
+      with:
+        name: New ABI files (use only if you're sure about interface changes)
+        path: abi_files.tar

From 1b50749ce9757030ddd4dba9f03ee18182cda82e Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 21 Jul 2021 08:40:36 -0400
Subject: [PATCH 17/20] Optimize allocation throttling

Remove mc_lock use from metaslab_class_throttle_*().  The math there
is based on refcounts and so atomic, so the only race possible there
is between zfs_refcount_count() and zfs_refcount_add().  But in most
cases metaslab_class_throttle_reserve() is called with the allocator
lock held, which covers the race.  In cases where the lock is not
held, GANG_ALLOCATION() or METASLAB_MUST_RESERVE are set, and so we
do not use zfs_refcount_count().  And even if we assume some other
non-existing scenario, the worst that may happen from this race is that a
few more I/Os get to allocation earlier, which is not a problem.

Move locks and data of different allocators into different cache
lines to avoid false sharing.  Group spa_alloc_* arrays together
into single array of aligned struct spa_alloc spa_allocs.  Align
struct metaslab_class_allocator.

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Don Brady <don.brady@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12314
---
 include/sys/metaslab_impl.h |  2 +-
 include/sys/spa_impl.h      | 15 +++++++++------
 module/zfs/metaslab.c       | 20 ++++----------------
 module/zfs/spa.c            | 12 ++++++------
 module/zfs/spa_misc.c       | 21 +++++++++------------
 module/zfs/zio.c            | 33 ++++++++++++++++-----------------
 6 files changed, 45 insertions(+), 58 deletions(-)

diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 9924c3ba0ea..adf4c03a20d 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -157,7 +157,7 @@ typedef struct metaslab_class_allocator {
 	 */
 	uint64_t		mca_alloc_max_slots;
 	zfs_refcount_t		mca_alloc_slots;
-} metaslab_class_allocator_t;
+} ____cacheline_aligned metaslab_class_allocator_t;
 
 /*
  * A metaslab class encompasses a category of allocatable top-level vdevs.
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 280f8cf1695..21729e617ac 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -57,6 +57,11 @@
 extern "C" {
 #endif
 
+typedef struct spa_alloc {
+	kmutex_t	spaa_lock;
+	avl_tree_t	spaa_tree;
+} ____cacheline_aligned spa_alloc_t;
+
 typedef struct spa_error_entry {
 	zbookmark_phys_t	se_bookmark;
 	char			*se_name;
@@ -250,13 +255,11 @@ struct spa {
 	list_t		spa_config_dirty_list;	/* vdevs with dirty config */
 	list_t		spa_state_dirty_list;	/* vdevs with dirty state */
 	/*
-	 * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are
-	 * stored in spa_alloc_count. There is one tree and one lock for each
-	 * allocator, to help improve allocation performance in write-heavy
-	 * workloads.
+	 * spa_allocs is an array, whose length is stored in spa_alloc_count.
+	 * There is one tree and one lock for each allocator, to help improve
+	 * allocation performance in write-heavy workloads.
 	 */
-	kmutex_t	*spa_alloc_locks;
-	avl_tree_t	*spa_alloc_trees;
+	spa_alloc_t	*spa_allocs;
 	int		spa_alloc_count;
 
 	spa_aux_vdev_t	spa_spares;		/* hot spares */
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 23f3e2989ae..93d409ceb43 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -5611,19 +5611,11 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
     zio_t *zio, int flags)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
-	uint64_t available_slots = 0;
-	boolean_t slot_reserved = B_FALSE;
 	uint64_t max = mca->mca_alloc_max_slots;
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
-	mutex_enter(&mc->mc_lock);
-
-	uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots);
-	if (reserved_slots < max)
-		available_slots = max - reserved_slots;
-
-	if (slots <= available_slots || GANG_ALLOCATION(flags) ||
-	    flags & METASLAB_MUST_RESERVE) {
+	if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) ||
+	    zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
 		/*
 		 * We reserve the slots individually so that we can unreserve
 		 * them individually when an I/O completes.
@@ -5631,11 +5623,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
 		for (int d = 0; d < slots; d++)
 			zfs_refcount_add(&mca->mca_alloc_slots, zio);
 		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
-		slot_reserved = B_TRUE;
+		return (B_TRUE);
 	}
-
-	mutex_exit(&mc->mc_lock);
-	return (slot_reserved);
+	return (B_FALSE);
 }
 
 void
@@ -5645,10 +5635,8 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
-	mutex_enter(&mc->mc_lock);
 	for (int d = 0; d < slots; d++)
 		zfs_refcount_remove(&mca->mca_alloc_slots, zio);
-	mutex_exit(&mc->mc_lock);
 }
 
 static int
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index f6dce076d13..2a4db7d562b 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -9197,9 +9197,9 @@ spa_sync(spa_t *spa, uint64_t txg)
 	spa->spa_sync_pass = 0;
 
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
-		mutex_enter(&spa->spa_alloc_locks[i]);
-		VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
-		mutex_exit(&spa->spa_alloc_locks[i]);
+		mutex_enter(&spa->spa_allocs[i].spaa_lock);
+		VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
+		mutex_exit(&spa->spa_allocs[i].spaa_lock);
 	}
 
 	/*
@@ -9309,9 +9309,9 @@ spa_sync(spa_t *spa, uint64_t txg)
 	dsl_pool_sync_done(dp, txg);
 
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
-		mutex_enter(&spa->spa_alloc_locks[i]);
-		VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
-		mutex_exit(&spa->spa_alloc_locks[i]);
+		mutex_enter(&spa->spa_allocs[i].spaa_lock);
+		VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
+		mutex_exit(&spa->spa_allocs[i].spaa_lock);
 	}
 
 	/*
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 29a5381e4b4..58039f3d103 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -700,13 +700,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 		spa->spa_root = spa_strdup(altroot);
 
 	spa->spa_alloc_count = spa_allocators;
-	spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
-	    sizeof (kmutex_t), KM_SLEEP);
-	spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
-	    sizeof (avl_tree_t), KM_SLEEP);
+	spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
+	    sizeof (spa_alloc_t), KM_SLEEP);
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
-		mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
-		avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
+		mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
+		    NULL);
+		avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
 		    sizeof (zio_t), offsetof(zio_t, io_alloc_node));
 	}
 	avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
@@ -799,13 +798,11 @@ spa_remove(spa_t *spa)
 	}
 
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
-		avl_destroy(&spa->spa_alloc_trees[i]);
-		mutex_destroy(&spa->spa_alloc_locks[i]);
+		avl_destroy(&spa->spa_allocs[i].spaa_tree);
+		mutex_destroy(&spa->spa_allocs[i].spaa_lock);
 	}
-	kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
-	    sizeof (kmutex_t));
-	kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
-	    sizeof (avl_tree_t));
+	kmem_free(spa->spa_allocs, spa->spa_alloc_count *
+	    sizeof (spa_alloc_t));
 
 	avl_destroy(&spa->spa_metaslabs_by_flushed);
 	avl_destroy(&spa->spa_sm_logs_by_txg);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 6030b3813f2..76ed4fad430 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -877,8 +877,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 		zio->io_bookmark = *zb;
 
 	if (pio != NULL) {
-		if (zio->io_metaslab_class == NULL)
-			zio->io_metaslab_class = pio->io_metaslab_class;
+		zio->io_metaslab_class = pio->io_metaslab_class;
 		if (zio->io_logical == NULL)
 			zio->io_logical = pio->io_logical;
 		if (zio->io_child_type == ZIO_CHILD_GANG)
@@ -3380,9 +3379,9 @@ zio_io_to_allocate(spa_t *spa, int allocator)
 {
 	zio_t *zio;
 
-	ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
+	ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock));
 
-	zio = avl_first(&spa->spa_alloc_trees[allocator]);
+	zio = avl_first(&spa->spa_allocs[allocator].spaa_tree);
 	if (zio == NULL)
 		return (NULL);
 
@@ -3394,11 +3393,11 @@ zio_io_to_allocate(spa_t *spa, int allocator)
 	 */
 	ASSERT3U(zio->io_allocator, ==, allocator);
 	if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
-	    zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
+	    zio->io_prop.zp_copies, allocator, zio, 0)) {
 		return (NULL);
 	}
 
-	avl_remove(&spa->spa_alloc_trees[allocator], zio);
+	avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio);
 	ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
 
 	return (zio);
@@ -3422,8 +3421,8 @@ zio_dva_throttle(zio_t *zio)
 		return (zio);
 	}
 
+	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
-
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
 
@@ -3435,14 +3434,14 @@ zio_dva_throttle(zio_t *zio)
 	 * into 2^20 block regions, and then hash based on the objset, object,
 	 * level, and region to accomplish both of these goals.
 	 */
-	zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
+	int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object,
 	    bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
-	mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
-	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	zio->io_allocator = allocator;
 	zio->io_metaslab_class = mc;
-	avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
-	nio = zio_io_to_allocate(spa, zio->io_allocator);
-	mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
+	mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
+	avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
+	nio = zio_io_to_allocate(spa, allocator);
+	mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
 	return (nio);
 }
 
@@ -3451,9 +3450,9 @@ zio_allocate_dispatch(spa_t *spa, int allocator)
 {
 	zio_t *zio;
 
-	mutex_enter(&spa->spa_alloc_locks[allocator]);
+	mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
 	zio = zio_io_to_allocate(spa, allocator);
-	mutex_exit(&spa->spa_alloc_locks[allocator]);
+	mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
 	if (zio == NULL)
 		return;
 
@@ -3643,8 +3642,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 	 * some parallelism.
 	 */
 	int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
-	int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
-	    spa->spa_alloc_count;
+	int allocator = (uint_t)cityhash4(0, 0, 0,
+	    os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
 	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
 	    txg, NULL, flags, &io_alloc_list, NULL, allocator);
 	*slog = (error == 0);

From c14ad80fcbcfc011686f01a89644eea7c028a879 Mon Sep 17 00:00:00 2001
From: Jorgen Lundman <lundman@lundman.net>
Date: Thu, 22 Jul 2021 11:22:27 +0900
Subject: [PATCH 18/20] Remove old orig_fd variable from zfs send

Possibly required in the past, but it currently serves no purpose.
Ordinarily such a tiny cleanup would not be worth it; however,
on the macOS port, in a future commit, we do unspeakable things to the
"fd" for send/recv, and it would be easier to only have to deal with
one "fd" instead of two.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Signed-off-by: Jorgen Lundman <lundman@lundman.net>
Closes #12404
---
 lib/libzfs/libzfs_sendrecv.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index 5c57028c401..36a480d36a0 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -2391,7 +2391,6 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags,
 	int err;
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char *name = zhp->zfs_name;
-	int orig_fd = fd;
 	pthread_t ptid;
 	progress_arg_t pa = { 0 };
 
@@ -2523,7 +2522,7 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags,
 
 	if (flags->props || flags->holds || flags->backup) {
 		/* Write the final end record. */
-		err = send_conclusion_record(orig_fd, NULL);
+		err = send_conclusion_record(fd, NULL);
 		if (err != 0)
 			return (zfs_standard_error(hdl, err, errbuf));
 	}

From 46197dc85892fc15a21b409dc78054a28ac85d6e Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 22 Jul 2021 12:22:14 -0400
Subject: [PATCH 19/20] FreeBSD: Ignore make_dev_s() errors

Since errors returned by zvol_create_minor_impl() are ignored by the
common code, it is more convenient to ignore make_dev_s() errors there.
This allows, for example, the device for the zvol to be created by a
later rename instead of leaving it stuck in a half-created state.
zvol_rename_minor() already ignores those errors.

While there, switch from MAXPHYS to maxphys in FreeBSD 13+.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12375
---
 module/os/freebsd/zfs/zvol_os.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c
index 34aad72fbbf..45036919256 100644
--- a/module/os/freebsd/zfs/zvol_os.c
+++ b/module/os/freebsd/zfs/zvol_os.c
@@ -1241,7 +1241,11 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname)
 		args.mda_si_drv2 = zv;
 		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
 		    == 0) {
+#if __FreeBSD_version > 1300130
+			dev->si_iosize_max = maxphys;
+#else
 			dev->si_iosize_max = MAXPHYS;
+#endif
 			zsd->zsd_cdev = dev;
 		}
 	}
@@ -1277,9 +1281,10 @@ zvol_free(zvol_state_t *zv)
 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
 		struct cdev *dev = zsd->zsd_cdev;
 
-		ASSERT3P(dev->si_drv2, ==, NULL);
-
-		destroy_dev(dev);
+		if (dev != NULL) {
+			ASSERT3P(dev->si_drv2, ==, NULL);
+			destroy_dev(dev);
+		}
 	}
 
 	mutex_destroy(&zv->zv_state_lock);
@@ -1374,16 +1379,15 @@ zvol_create_minor_impl(const char *name)
 		args.mda_gid = GID_OPERATOR;
 		args.mda_mode = 0640;
 		args.mda_si_drv2 = zv;
-		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
-		if (error) {
-			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
-			mutex_destroy(&zv->zv_state_lock);
-			kmem_free(zv, sizeof (*zv));
-			dmu_objset_disown(os, B_TRUE, FTAG);
-			goto out_doi;
+		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
+		    == 0) {
+#if __FreeBSD_version > 1300130
+			dev->si_iosize_max = maxphys;
+#else
+			dev->si_iosize_max = MAXPHYS;
+#endif
+			zsd->zsd_cdev = dev;
 		}
-		dev->si_iosize_max = MAXPHYS;
-		zsd->zsd_cdev = dev;
 	}
 	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
 	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
@@ -1456,7 +1460,8 @@ zvol_clear_private(zvol_state_t *zv)
 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
 		struct cdev *dev = zsd->zsd_cdev;
 
-		dev->si_drv2 = NULL;
+		if (dev != NULL)
+			dev->si_drv2 = NULL;
 	}
 }
 

From 14b43fbd9c13d802409ed886bb6b66fd528fb209 Mon Sep 17 00:00:00 2001
From: Ryan Moeller <ryan@iXsystems.com>
Date: Thu, 22 Jul 2021 17:29:27 -0400
Subject: [PATCH 20/20] zloop: Add a max iterations option, use default
 run/pass times

It is useful to have control over the number of iterations of zloop so
we can easily produce "x core dumps found *in y iterations*" metrics.

Using random values for run/pass times doesn't improve coverage in a
meaningful way.

Randomizing run time could be seen as a compromise between running a
greater variety of shorter tests versus a smaller variety of longer
tests within a fixed time span.  However, it is not desirable when
running a fixed number of iterations.

Pass time already incorporates randomness within ztest.

Either parameter can be passed to ztest explicitly if the defaults are
not satisfactory.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Signed-off-by: Ryan Moeller <ryan@iXsystems.com>
Closes #12411
---
 .github/workflows/zloop.yml |  2 +-
 scripts/zloop.sh            | 60 ++++++++++++++++++++-----------------
 2 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/zloop.yml b/.github/workflows/zloop.yml
index b3679e7f7f2..cf81ad4bcaf 100644
--- a/.github/workflows/zloop.yml
+++ b/.github/workflows/zloop.yml
@@ -45,7 +45,7 @@ jobs:
       run: |
         sudo mkdir -p $TEST_DIR
         # run for 20 minutes to have a total runner time of 30 minutes
-        sudo /usr/share/zfs/zloop.sh -t 1200 -l -m1
+        sudo /usr/share/zfs/zloop.sh -t 1200 -l -m1 -- -T 120 -P 60
     - name: Prepare artifacts
       if: failure()
       run: |
diff --git a/scripts/zloop.sh b/scripts/zloop.sh
index 546e7001776..4a572ebab1f 100755
--- a/scripts/zloop.sh
+++ b/scripts/zloop.sh
@@ -38,25 +38,30 @@ DEFAULTCOREDIR=/var/tmp/zloop
 
 function usage
 {
-	echo -e "\n$0 [-t <timeout>] [ -s <vdev size> ] [-c <dump directory>]" \
-	    "[ -- [extra ztest parameters]]\n" \
-	    "\n" \
-	    "  This script runs ztest repeatedly with randomized arguments.\n" \
-	    "  If a crash is encountered, the ztest logs, any associated\n" \
-	    "  vdev files, and core file (if one exists) are moved to the\n" \
-	    "  output directory ($DEFAULTCOREDIR by default). Any options\n" \
-	    "  after the -- end-of-options marker will be passed to ztest.\n" \
-	    "\n" \
-	    "  Options:\n" \
-	    "    -t  Total time to loop for, in seconds. If not provided,\n" \
-	    "        zloop runs forever.\n" \
-	    "    -s  Size of vdev devices.\n" \
-	    "    -f  Specify working directory for ztest vdev files.\n" \
-	    "    -c  Specify a core dump directory to use.\n" \
-	    "    -m  Max number of core dumps to allow before exiting.\n" \
-	    "    -l  Create 'ztest.core.N' symlink to core directory.\n" \
-	    "    -h  Print this help message.\n" \
-	    "" >&2
+	cat >&2 <<EOF
+
+$0 [-hl] [-c <dump directory>] [-f <vdev directory>]
+  [-m <max core dumps>] [-s <vdev size>] [-t <timeout>]
+  [-I <max iterations>] [-- [extra ztest parameters]]
+
+  This script runs ztest repeatedly with randomized arguments.
+  If a crash is encountered, the ztest logs, any associated
+  vdev files, and core file (if one exists) are moved to the
+  output directory ($DEFAULTCOREDIR by default). Any options
+  after the -- end-of-options marker will be passed to ztest.
+
+  Options:
+    -c  Specify a core dump directory to use.
+    -f  Specify working directory for ztest vdev files.
+    -h  Print this help message.
+    -l  Create 'ztest.core.N' symlink to core directory.
+    -m  Max number of core dumps to allow before exiting.
+    -s  Size of vdev devices.
+    -t  Total time to loop for, in seconds. If not provided,
+        zloop runs forever.
+    -I  Max number of iterations to loop before exiting.
+
+EOF
 }
 
 function or_die
@@ -185,10 +190,12 @@ timeout=0
 size="512m"
 coremax=0
 symlink=0
-while getopts ":ht:m:s:c:f:l" opt; do
+iterations=0
+while getopts ":ht:m:I:s:c:f:l" opt; do
 	case $opt in
 		t ) [[ $OPTARG -gt 0 ]] && timeout=$OPTARG ;;
 		m ) [[ $OPTARG -gt 0 ]] && coremax=$OPTARG ;;
+		I ) [[ $OPTARG ]] && iterations=$OPTARG ;;
 		s ) [[ $OPTARG ]] && size=$OPTARG ;;
 		c ) [[ $OPTARG ]] && coredir=$OPTARG ;;
 		f ) [[ $OPTARG ]] && basedir=$(readlink -f "$OPTARG") ;;
@@ -233,9 +240,14 @@ ztrc=0		# ztest return value
 foundcrashes=0	# number of crashes found so far
 starttime=$(date +%s)
 curtime=$starttime
+iteration=0
 
 # if no timeout was specified, loop forever.
-while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do
+while (( timeout == 0 )) || (( curtime <= (starttime + timeout) )); do
+	if (( iterations > 0 )) && (( iteration++ == iterations )); then
+		break
+	fi
+
 	zopt="-G -VVVVV"
 
 	# start each run with an empty directory
@@ -284,10 +296,6 @@ while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do
 		raid_type="draid"
 	fi
 
-	# run from 30 to 120 seconds
-	runtime=$(((RANDOM % 90) + 30))
-	passtime=$((RANDOM % (runtime / 3 + 1) + 10))
-
 	zopt="$zopt -K $raid_type"
 	zopt="$zopt -m $mirrors"
 	zopt="$zopt -r $raid_children"
@@ -297,8 +305,6 @@ while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do
 	zopt="$zopt -v $vdevs"
 	zopt="$zopt -a $align"
 	zopt="$zopt -C $class"
-	zopt="$zopt -T $runtime"
-	zopt="$zopt -P $passtime"
 	zopt="$zopt -s $size"
 	zopt="$zopt -f $workdir"