From 8af1104f83eb44501b83218ed456e2d4b0ac3521 Mon Sep 17 00:00:00 2001
From: George Amanakis <gamanakis@gmail.com>
Date: Wed, 14 Jun 2023 17:01:17 +0200
Subject: [PATCH 1/5] Store the L2ARC device ashift in the vdev label

If this is not done, and the pool has an ashift other than the default
(at the moment 9) then the following happens:

1) vdev_alloc() assigns the ashift of the pool to L2ARC device, but
   upon export it is not stored anywhere
2) at the first import, vdev_open() sees an vdev_ashift() of 0 and
   assigns the logical_ashift, which is 9
3) reading the contents of L2ARC, including the header fails
4) L2ARC buffers are not restored in ARC.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
Closes #14313
Closes #14963
---
 module/zfs/vdev_label.c                       |  3 +++
 .../l2arc/persist_l2arc_001_pos.ksh           | 19 ++++++++-----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 85c7134ca4c..a5c76808f2d 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -486,6 +486,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 	if (vd->vdev_isspare)
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
 
+	if (flags & VDEV_CONFIG_L2CACHE)
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
+
 	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
 	    vd == vd->vdev_top) {
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh
index 6f7b9aff7c3..a9968723c3c 100755
--- a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh
@@ -27,15 +27,14 @@
 #
 # STRATEGY:
 #	1. Create pool with a cache device.
-#	2. Export and re-import pool without writing any data.
-#	3. Create a random file in that pool and random read for 10 sec.
-#	4. Export pool.
-#	5. Read the amount of log blocks written from the header of the
+#	2. Create a random file in that pool and random read for 10 sec.
+#	3. Export pool.
+#	4. Read the amount of log blocks written from the header of the
 #		L2ARC device.
-#	6. Import pool.
-#	7. Read the amount of log blocks rebuilt in arcstats and compare to
+#	5. Import pool.
+#	6. Read the amount of log blocks rebuilt in arcstats and compare to
 #		(5).
-#	8. Check if the labels of the L2ARC device are intact.
+#	7. Check if the labels of the L2ARC device are intact.
 #
 #	* We can predict the minimum bytes of L2ARC restored if we subtract
 #	from the effective size of the cache device the bytes l2arc_evict()
@@ -77,10 +76,8 @@ export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M
 
 log_must truncate -s ${cache_sz}M $VDEV_CACHE
 
-log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
-
-log_must zpool export $TESTPOOL
-log_must zpool import -d $VDIR $TESTPOOL
+log_must zpool create -f -o ashift=12 $TESTPOOL $VDEV
+log_must zpool add $TESTPOOL cache $VDEV_CACHE
 
 log_must fio $FIO_SCRIPTS/mkfiles.fio
 log_must fio $FIO_SCRIPTS/random_reads.fio

From d057807ede05ce809e9ba1e2b47b12ada0d3b2ed Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 14 Jun 2023 11:02:27 -0400
Subject: [PATCH 2/5] Switch refcount tracking from lists to AVL-trees.

With large number of tracked references list searches under the lock
become too expensive, creating enormous lock contention.

On my tests with ZFS_DEBUG enabled this increases write throughput
with 32KB blocks from ~1.2GB/s to ~7.5GB/s.

Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #14970
---
 include/sys/zfs_refcount.h |  18 ++--
 module/zfs/refcount.c      | 189 +++++++++++++++++++------------------
 2 files changed, 110 insertions(+), 97 deletions(-)

diff --git a/include/sys/zfs_refcount.h b/include/sys/zfs_refcount.h
index 4efa266a53c..77965a0aa58 100644
--- a/include/sys/zfs_refcount.h
+++ b/include/sys/zfs_refcount.h
@@ -27,6 +27,7 @@
 #define	_SYS_ZFS_REFCOUNT_H
 
 #include <sys/inttypes.h>
+#include <sys/avl.h>
 #include <sys/list.h>
 #include <sys/zfs_context.h>
 
@@ -43,19 +44,22 @@ extern "C" {
 
 #ifdef	ZFS_DEBUG
 typedef struct reference {
-	list_node_t ref_link;
+	union {
+		avl_node_t a;
+		list_node_t l;
+	} ref_link;
 	const void *ref_holder;
 	uint64_t ref_number;
-	uint8_t *ref_removed;
+	boolean_t ref_search;
 } reference_t;
 
 typedef struct refcount {
-	kmutex_t rc_mtx;
-	boolean_t rc_tracked;
-	list_t rc_list;
-	list_t rc_removed;
 	uint64_t rc_count;
-	uint64_t rc_removed_count;
+	kmutex_t rc_mtx;
+	avl_tree_t rc_tree;
+	list_t rc_removed;
+	uint_t rc_removed_count;
+	boolean_t rc_tracked;
 } zfs_refcount_t;
 
 /*
diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c
index 601d27f8c47..718bbb34a8d 100644
--- a/module/zfs/refcount.c
+++ b/module/zfs/refcount.c
@@ -36,33 +36,40 @@ int reference_tracking_enable = B_FALSE;
 static uint_t reference_history = 3; /* tunable */
 
 static kmem_cache_t *reference_cache;
-static kmem_cache_t *reference_history_cache;
 
 void
 zfs_refcount_init(void)
 {
 	reference_cache = kmem_cache_create("reference_cache",
 	    sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-
-	reference_history_cache = kmem_cache_create("reference_history_cache",
-	    sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 }
 
 void
 zfs_refcount_fini(void)
 {
 	kmem_cache_destroy(reference_cache);
-	kmem_cache_destroy(reference_history_cache);
+}
+
+static int
+zfs_refcount_compare(const void *x1, const void *x2)
+{
+	const reference_t *r1 = (const reference_t *)x1;
+	const reference_t *r2 = (const reference_t *)x2;
+
+	int cmp1 = TREE_CMP(r1->ref_holder, r2->ref_holder);
+	int cmp2 = TREE_CMP(r1->ref_number, r2->ref_number);
+	int cmp = cmp1 ? cmp1 : cmp2;
+	return ((cmp || r1->ref_search) ? cmp : TREE_PCMP(r1, r2));
 }
 
 void
 zfs_refcount_create(zfs_refcount_t *rc)
 {
 	mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
-	list_create(&rc->rc_list, sizeof (reference_t),
-	    offsetof(reference_t, ref_link));
+	avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t),
+	    offsetof(reference_t, ref_link.a));
 	list_create(&rc->rc_removed, sizeof (reference_t),
-	    offsetof(reference_t, ref_link));
+	    offsetof(reference_t, ref_link.l));
 	rc->rc_count = 0;
 	rc->rc_removed_count = 0;
 	rc->rc_tracked = reference_tracking_enable;
@@ -86,16 +93,15 @@ void
 zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number)
 {
 	reference_t *ref;
+	void *cookie = NULL;
 
 	ASSERT3U(rc->rc_count, ==, number);
-	while ((ref = list_remove_head(&rc->rc_list)))
+	while ((ref = avl_destroy_nodes(&rc->rc_tree, &cookie)) != NULL)
 		kmem_cache_free(reference_cache, ref);
-	list_destroy(&rc->rc_list);
+	avl_destroy(&rc->rc_tree);
 
-	while ((ref = list_remove_head(&rc->rc_removed))) {
-		kmem_cache_free(reference_history_cache, ref->ref_removed);
+	while ((ref = list_remove_head(&rc->rc_removed)))
 		kmem_cache_free(reference_cache, ref);
-	}
 	list_destroy(&rc->rc_removed);
 	mutex_destroy(&rc->rc_mtx);
 }
@@ -121,10 +127,10 @@ zfs_refcount_count(zfs_refcount_t *rc)
 int64_t
 zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder)
 {
-	reference_t *ref = NULL;
+	reference_t *ref;
 	int64_t count;
 
-	if (!rc->rc_tracked) {
+	if (likely(!rc->rc_tracked)) {
 		count = atomic_add_64_nv(&(rc)->rc_count, number);
 		ASSERT3U(count, >=, number);
 		return (count);
@@ -133,8 +139,9 @@ zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder)
 	ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
 	ref->ref_holder = holder;
 	ref->ref_number = number;
+	ref->ref_search = B_FALSE;
 	mutex_enter(&rc->rc_mtx);
-	list_insert_head(&rc->rc_list, ref);
+	avl_add(&rc->rc_tree, ref);
 	rc->rc_count += number;
 	count = rc->rc_count;
 	mutex_exit(&rc->rc_mtx);
@@ -151,7 +158,7 @@ zfs_refcount_add(zfs_refcount_t *rc, const void *holder)
 void
 zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
 {
-	if (!rc->rc_tracked)
+	if (likely(!rc->rc_tracked))
 		(void) zfs_refcount_add_many(rc, number, holder);
 	else for (; number > 0; number--)
 		(void) zfs_refcount_add(rc, holder);
@@ -161,47 +168,42 @@ int64_t
 zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number,
     const void *holder)
 {
-	reference_t *ref;
+	reference_t *ref, s;
 	int64_t count;
 
-	if (!rc->rc_tracked) {
+	if (likely(!rc->rc_tracked)) {
 		count = atomic_add_64_nv(&(rc)->rc_count, -number);
 		ASSERT3S(count, >=, 0);
 		return (count);
 	}
 
+	s.ref_holder = holder;
+	s.ref_number = number;
+	s.ref_search = B_TRUE;
 	mutex_enter(&rc->rc_mtx);
 	ASSERT3U(rc->rc_count, >=, number);
-	for (ref = list_head(&rc->rc_list); ref;
-	    ref = list_next(&rc->rc_list, ref)) {
-		if (ref->ref_holder == holder && ref->ref_number == number) {
-			list_remove(&rc->rc_list, ref);
-			if (reference_history > 0) {
-				ref->ref_removed =
-				    kmem_cache_alloc(reference_history_cache,
-				    KM_SLEEP);
-				list_insert_head(&rc->rc_removed, ref);
-				rc->rc_removed_count++;
-				if (rc->rc_removed_count > reference_history) {
-					ref = list_tail(&rc->rc_removed);
-					list_remove(&rc->rc_removed, ref);
-					kmem_cache_free(reference_history_cache,
-					    ref->ref_removed);
-					kmem_cache_free(reference_cache, ref);
-					rc->rc_removed_count--;
-				}
-			} else {
-				kmem_cache_free(reference_cache, ref);
-			}
-			rc->rc_count -= number;
-			count = rc->rc_count;
-			mutex_exit(&rc->rc_mtx);
-			return (count);
-		}
+	ref = avl_find(&rc->rc_tree, &s, NULL);
+	if (unlikely(ref == NULL)) {
+		panic("No such hold %p on refcount %llx", holder,
+		    (u_longlong_t)(uintptr_t)rc);
+		return (-1);
 	}
-	panic("No such hold %p on refcount %llx", holder,
-	    (u_longlong_t)(uintptr_t)rc);
-	return (-1);
+	avl_remove(&rc->rc_tree, ref);
+	if (reference_history > 0) {
+		list_insert_head(&rc->rc_removed, ref);
+		if (rc->rc_removed_count >= reference_history) {
+			ref = list_remove_tail(&rc->rc_removed);
+			kmem_cache_free(reference_cache, ref);
+		} else {
+			rc->rc_removed_count++;
+		}
+	} else {
+		kmem_cache_free(reference_cache, ref);
+	}
+	rc->rc_count -= number;
+	count = rc->rc_count;
+	mutex_exit(&rc->rc_mtx);
+	return (count);
 }
 
 int64_t
@@ -213,7 +215,7 @@ zfs_refcount_remove(zfs_refcount_t *rc, const void *holder)
 void
 zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
 {
-	if (!rc->rc_tracked)
+	if (likely(!rc->rc_tracked))
 		(void) zfs_refcount_remove_many(rc, number, holder);
 	else for (; number > 0; number--)
 		(void) zfs_refcount_remove(rc, holder);
@@ -222,31 +224,38 @@ zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
 void
 zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src)
 {
-	int64_t count, removed_count;
-	list_t list, removed;
+	avl_tree_t tree;
+	list_t removed;
+	reference_t *ref;
+	void *cookie = NULL;
+	uint64_t count;
+	uint_t removed_count;
 
-	list_create(&list, sizeof (reference_t),
-	    offsetof(reference_t, ref_link));
+	avl_create(&tree, zfs_refcount_compare, sizeof (reference_t),
+	    offsetof(reference_t, ref_link.a));
 	list_create(&removed, sizeof (reference_t),
-	    offsetof(reference_t, ref_link));
+	    offsetof(reference_t, ref_link.l));
 
 	mutex_enter(&src->rc_mtx);
 	count = src->rc_count;
 	removed_count = src->rc_removed_count;
 	src->rc_count = 0;
 	src->rc_removed_count = 0;
-	list_move_tail(&list, &src->rc_list);
+	avl_swap(&tree, &src->rc_tree);
 	list_move_tail(&removed, &src->rc_removed);
 	mutex_exit(&src->rc_mtx);
 
 	mutex_enter(&dst->rc_mtx);
 	dst->rc_count += count;
 	dst->rc_removed_count += removed_count;
-	list_move_tail(&dst->rc_list, &list);
+	if (avl_is_empty(&dst->rc_tree))
+		avl_swap(&dst->rc_tree, &tree);
+	else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL)
+		avl_add(&dst->rc_tree, ref);
 	list_move_tail(&dst->rc_removed, &removed);
 	mutex_exit(&dst->rc_mtx);
 
-	list_destroy(&list);
+	avl_destroy(&tree);
 	list_destroy(&removed);
 }
 
@@ -254,23 +263,19 @@ void
 zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number,
     const void *current_holder, const void *new_holder)
 {
-	reference_t *ref;
-	boolean_t found = B_FALSE;
+	reference_t *ref, s;
 
-	if (!rc->rc_tracked)
+	if (likely(!rc->rc_tracked))
 		return;
 
+	s.ref_holder = current_holder;
+	s.ref_number = number;
+	s.ref_search = B_TRUE;
 	mutex_enter(&rc->rc_mtx);
-	for (ref = list_head(&rc->rc_list); ref;
-	    ref = list_next(&rc->rc_list, ref)) {
-		if (ref->ref_holder == current_holder &&
-		    ref->ref_number == number) {
-			ref->ref_holder = new_holder;
-			found = B_TRUE;
-			break;
-		}
-	}
-	ASSERT(found);
+	ref = avl_find(&rc->rc_tree, &s, NULL);
+	ASSERT(ref);
+	ref->ref_holder = new_holder;
+	avl_update(&rc->rc_tree, ref);
 	mutex_exit(&rc->rc_mtx);
 }
 
@@ -290,21 +295,23 @@ zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder,
 boolean_t
 zfs_refcount_held(zfs_refcount_t *rc, const void *holder)
 {
-	reference_t *ref;
+	reference_t *ref, s;
+	avl_index_t idx;
+	boolean_t res;
 
-	if (!rc->rc_tracked)
+	if (likely(!rc->rc_tracked))
 		return (zfs_refcount_count(rc) > 0);
 
+	s.ref_holder = holder;
+	s.ref_number = 0;
+	s.ref_search = B_TRUE;
 	mutex_enter(&rc->rc_mtx);
-	for (ref = list_head(&rc->rc_list); ref;
-	    ref = list_next(&rc->rc_list, ref)) {
-		if (ref->ref_holder == holder) {
-			mutex_exit(&rc->rc_mtx);
-			return (B_TRUE);
-		}
-	}
+	ref = avl_find(&rc->rc_tree, &s, &idx);
+	if (likely(ref == NULL))
+		ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER);
+	res = ref && ref->ref_holder == holder;
 	mutex_exit(&rc->rc_mtx);
-	return (B_FALSE);
+	return (res);
 }
 
 /*
@@ -315,21 +322,23 @@ zfs_refcount_held(zfs_refcount_t *rc, const void *holder)
 boolean_t
 zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder)
 {
-	reference_t *ref;
+	reference_t *ref, s;
+	avl_index_t idx;
+	boolean_t res;
 
-	if (!rc->rc_tracked)
+	if (likely(!rc->rc_tracked))
 		return (B_TRUE);
 
 	mutex_enter(&rc->rc_mtx);
-	for (ref = list_head(&rc->rc_list); ref;
-	    ref = list_next(&rc->rc_list, ref)) {
-		if (ref->ref_holder == holder) {
-			mutex_exit(&rc->rc_mtx);
-			return (B_FALSE);
-		}
-	}
+	s.ref_holder = holder;
+	s.ref_number = 0;
+	s.ref_search = B_TRUE;
+	ref = avl_find(&rc->rc_tree, &s, &idx);
+	if (likely(ref == NULL))
+		ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER);
+	res = ref == NULL || ref->ref_holder != holder;
 	mutex_exit(&rc->rc_mtx);
-	return (B_TRUE);
+	return (res);
 }
 
 EXPORT_SYMBOL(zfs_refcount_create);

From e32e326c5b3c6ba4a7632abcf86f394233041148 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Wed, 14 Jun 2023 10:04:05 -0500
Subject: [PATCH 3/5] ZTS: Skip send_raw_ashift on FreeBSD

On FreeBSD 14 this test runs slowly in the CI environment
and is killed by the 10 minute timeout.  Skip the test on
FreeBSD until the slow down is resolved.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #14961
---
 tests/test-runner/bin/zts-report.py.in                     | 1 +
 tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 9517ce8073a..cf438e0e649 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -173,6 +173,7 @@ if sys.platform.startswith('freebsd'):
         'link_count/link_count_001': ['SKIP', na_reason],
         'casenorm/mixed_create_failure': ['FAIL', 13215],
         'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
+        'rsend/send_raw_ashift': ['SKIP', 14961],
     })
 elif sys.platform.startswith('linux'):
     known.update({
diff --git a/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh b/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh
index 3cea334495d..f238c361134 100755
--- a/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh
+++ b/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh
@@ -37,6 +37,10 @@ verify_runnable "both"
 
 log_assert "Verify raw sending to pools with greater ashift succeeds"
 
+if is_freebsd; then
+	log_unsupported "Runs too long on FreeBSD 14 (Issue #14961)"
+fi
+
 function cleanup
 {
 	rm -f $BACKDIR/fs@*

From ccec7fbe1c66c5b63a3af9d152403ce43344f4ab Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 15 Jun 2023 13:49:03 -0400
Subject: [PATCH 4/5] Remove ARC/ZIO physdone callbacks.

Those callbacks were introduced many years ago as part of a bigger
patch to smoothen the write throttling within a txg. They allow to
account completion of individual physical writes within a logical
one, improving cases when some of physical writes complete much
sooner than others, gradually opening the write throttle.

Few years after that ZFS got allocation throttling, working on a
level of logical writes and limiting number of writes queued to
vdevs at any point, and so limiting latency distribution between
the physical writes and especially writes of multiple copies.
The addition of scheduling deadline I proposed in #14925 should
further reduce the latency distribution.  Grown memory sizes over
the past 10 years should also reduce importance of the smoothing.

While the use of physdone callback may still in theory provide
some smoother throttling, there are cases where we simply can not
afford it.  Since dirty data accounting is protected by pool-wide
lock, in case of 6-wide RAIDZ, for example, it requires us to take
it 8 times per logical block write, creating huge lock contention.

My tests of this patch show radical reduction of the lock spinning
time on workloads when smaller blocks are written to RAIDZ pools,
when each of the disks receives 8-16KB chunks, but the total rate
reaching 100K+ blocks per second.  Same time attempts to measure
any write time fluctuations didn't show anything noticeable.

While there, remove also io_child_count/io_parent_count counters.
They are used only for couple assertions that can be avoided.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #14948
---
 include/sys/arc.h       |  5 +--
 include/sys/arc_impl.h  |  1 -
 include/sys/zio.h       |  9 +---
 module/zfs/arc.c        | 22 ++--------
 module/zfs/dbuf.c       | 94 ++++-------------------------------------
 module/zfs/dmu.c        |  4 +-
 module/zfs/dmu_objset.c |  2 +-
 module/zfs/zio.c        | 32 +++-----------
 8 files changed, 26 insertions(+), 143 deletions(-)

diff --git a/include/sys/arc.h b/include/sys/arc.h
index 836ed679dba..9d67dab06ca 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -304,9 +304,8 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp,
     arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
-    arc_write_done_func_t *physdone, arc_write_done_func_t *done,
-    void *priv, zio_priority_t priority, int zio_flags,
-    const zbookmark_phys_t *zb);
+    arc_write_done_func_t *done, void *priv, zio_priority_t priority,
+    int zio_flags, const zbookmark_phys_t *zb);
 
 arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv);
 void arc_remove_prune_callback(arc_prune_t *p);
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index fd24d2f3c8b..78774792f36 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -123,7 +123,6 @@ struct arc_write_callback {
 	void			*awcb_private;
 	arc_write_done_func_t	*awcb_ready;
 	arc_write_done_func_t	*awcb_children_ready;
-	arc_write_done_func_t	*awcb_physdone;
 	arc_write_done_func_t	*awcb_done;
 	arc_buf_t		*awcb_buf;
 };
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 6b1352a72b9..ec32211f690 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -460,7 +460,6 @@ struct zio {
 	/* Callback info */
 	zio_done_func_t	*io_ready;
 	zio_done_func_t	*io_children_ready;
-	zio_done_func_t	*io_physdone;
 	zio_done_func_t	*io_done;
 	void		*io_private;
 	int64_t		io_prev_space_delta;	/* DMU private */
@@ -503,9 +502,6 @@ struct zio {
 	int		io_error;
 	int		io_child_error[ZIO_CHILD_TYPES];
 	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
-	uint64_t	io_child_count;
-	uint64_t	io_phys_children;
-	uint64_t	io_parent_count;
 	uint64_t	*io_stall;
 	zio_t		*io_gang_leader;
 	zio_gang_node_t	*io_gang_tree;
@@ -553,9 +549,8 @@ extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
-    zio_done_func_t *physdone, zio_done_func_t *done,
-    void *priv, zio_priority_t priority, zio_flag_t flags,
-    const zbookmark_phys_t *zb);
+    zio_done_func_t *done, void *priv, zio_priority_t priority,
+    zio_flag_t flags, const zbookmark_phys_t *zb);
 
 extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index a23715309f2..7023f448182 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -6675,18 +6675,6 @@ arc_write_children_ready(zio_t *zio)
 	callback->awcb_children_ready(zio, buf, callback->awcb_private);
 }
 
-/*
- * The SPA calls this callback for each physical write that happens on behalf
- * of a logical write.  See the comment in dbuf_write_physdone() for details.
- */
-static void
-arc_write_physdone(zio_t *zio)
-{
-	arc_write_callback_t *cb = zio->io_private;
-	if (cb->awcb_physdone != NULL)
-		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
-}
-
 static void
 arc_write_done(zio_t *zio)
 {
@@ -6776,9 +6764,9 @@ zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
     blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc,
     const zio_prop_t *zp, arc_write_done_func_t *ready,
-    arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
-    arc_write_done_func_t *done, void *private, zio_priority_t priority,
-    int zio_flags, const zbookmark_phys_t *zb)
+    arc_write_done_func_t *children_ready, arc_write_done_func_t *done,
+    void *private, zio_priority_t priority, int zio_flags,
+    const zbookmark_phys_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
@@ -6825,7 +6813,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
 	callback->awcb_ready = ready;
 	callback->awcb_children_ready = children_ready;
-	callback->awcb_physdone = physdone;
 	callback->awcb_done = done;
 	callback->awcb_private = private;
 	callback->awcb_buf = buf;
@@ -6862,8 +6849,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
 	    abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
 	    HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
 	    (children_ready != NULL) ? arc_write_children_ready : NULL,
-	    arc_write_physdone, arc_write_done, callback,
-	    priority, zio_flags, zb);
+	    arc_write_done, callback, priority, zio_flags, zb);
 
 	return (zio);
 }
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 272e712586f..1ea075217fb 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -4369,22 +4369,6 @@ dbuf_lightweight_ready(zio_t *zio)
 	rw_exit(&parent_db->db_rwlock);
 }
 
-static void
-dbuf_lightweight_physdone(zio_t *zio)
-{
-	dbuf_dirty_record_t *dr = zio->io_private;
-	dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
-	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
-	/*
-	 * The callback will be called io_phys_children times.  Retire one
-	 * portion of our dirty space each time we are called.  Any rounding
-	 * error will be cleaned up by dbuf_lightweight_done().
-	 */
-	int delta = dr->dr_accounted / zio->io_phys_children;
-	dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
-
 static void
 dbuf_lightweight_done(zio_t *zio)
 {
@@ -4403,16 +4387,8 @@ dbuf_lightweight_done(zio_t *zio)
 		dsl_dataset_block_born(ds, zio->io_bp, tx);
 	}
 
-	/*
-	 * See comment in dbuf_write_done().
-	 */
-	if (zio->io_phys_children == 0) {
-		dsl_pool_undirty_space(dmu_objset_pool(os),
-		    dr->dr_accounted, zio->io_txg);
-	} else {
-		dsl_pool_undirty_space(dmu_objset_pool(os),
-		    dr->dr_accounted % zio->io_phys_children, zio->io_txg);
-	}
+	dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+	    zio->io_txg);
 
 	abd_free(dr->dt.dll.dr_abd);
 	kmem_free(dr, sizeof (*dr));
@@ -4446,8 +4422,7 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	    dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
 	    dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
 	    &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
-	    dbuf_lightweight_physdone, dbuf_lightweight_done, dr,
-	    ZIO_PRIORITY_ASYNC_WRITE,
+	    dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
 	    ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
 
 	zio_nowait(dr->dr_zio);
@@ -4789,37 +4764,6 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 	DB_DNODE_EXIT(db);
 }
 
-/*
- * The SPA will call this callback several times for each zio - once
- * for every physical child i/o (zio->io_phys_children times).  This
- * allows the DMU to monitor the progress of each logical i/o.  For example,
- * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
- * block.  There may be a long delay before all copies/fragments are completed,
- * so this callback allows us to retire dirty space gradually, as the physical
- * i/os complete.
- */
-static void
-dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
-{
-	(void) buf;
-	dmu_buf_impl_t *db = arg;
-	objset_t *os = db->db_objset;
-	dsl_pool_t *dp = dmu_objset_pool(os);
-	dbuf_dirty_record_t *dr;
-	int delta = 0;
-
-	dr = db->db_data_pending;
-	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
-	/*
-	 * The callback will be called io_phys_children times.  Retire one
-	 * portion of our dirty space each time we are called.  Any rounding
-	 * error will be cleaned up by dbuf_write_done().
-	 */
-	delta = dr->dr_accounted / zio->io_phys_children;
-	dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
-
 static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
@@ -4894,27 +4838,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 	db->db_data_pending = NULL;
 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
 
-	/*
-	 * If we didn't do a physical write in this ZIO and we
-	 * still ended up here, it means that the space of the
-	 * dbuf that we just released (and undirtied) above hasn't
-	 * been marked as undirtied in the pool's accounting.
-	 *
-	 * Thus, we undirty that space in the pool's view of the
-	 * world here. For physical writes this type of update
-	 * happens in dbuf_write_physdone().
-	 *
-	 * If we did a physical write, cleanup any rounding errors
-	 * that came up due to writing multiple copies of a block
-	 * on disk [see dbuf_write_physdone()].
-	 */
-	if (zio->io_phys_children == 0) {
-		dsl_pool_undirty_space(dmu_objset_pool(os),
-		    dr->dr_accounted, zio->io_txg);
-	} else {
-		dsl_pool_undirty_space(dmu_objset_pool(os),
-		    dr->dr_accounted % zio->io_phys_children, zio->io_txg);
-	}
+	dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+	    zio->io_txg);
 
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 }
@@ -5162,7 +5087,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 
 		dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
 		    contents, db->db.db_size, db->db.db_size, &zp,
-		    dbuf_write_override_ready, NULL, NULL,
+		    dbuf_write_override_ready, NULL,
 		    dbuf_write_override_done,
 		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		mutex_enter(&db->db_mtx);
@@ -5176,7 +5101,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(pio, os->os_spa, txg,
 		    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
-		    dbuf_write_nofill_ready, NULL, NULL,
+		    dbuf_write_nofill_ready, NULL,
 		    dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
@@ -5195,9 +5120,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 		dr->dr_zio = arc_write(pio, os->os_spa, txg,
 		    &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
 		    dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
-		    children_ready_cb, dbuf_write_physdone,
-		    dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
-		    ZIO_FLAG_MUSTSUCCEED, &zb);
+		    children_ready_cb, dbuf_write_done, db,
+		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 	}
 }
 
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 8a13b8f410a..dda869287c7 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1698,7 +1698,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
 	    abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
 	    zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
-	    dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
+	    dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
 	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
 
 	return (0);
@@ -1864,7 +1864,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 
 	zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
 	    dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
-	    &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
+	    &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
 
 	return (0);
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 778b18817ee..d134d4958f7 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1698,7 +1698,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 
 	zio = arc_write(pio, os->os_spa, tx->tx_txg,
 	    blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os),
-	    &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
+	    &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done,
 	    os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 
 	/*
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index d7b2217623e..fb8164f0aea 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -650,9 +650,6 @@ zio_add_child(zio_t *pio, zio_t *cio)
 	list_insert_head(&pio->io_child_list, zl);
 	list_insert_head(&cio->io_parent_list, zl);
 
-	pio->io_child_count++;
-	cio->io_parent_count++;
-
 	mutex_exit(&cio->io_lock);
 	mutex_exit(&pio->io_lock);
 }
@@ -669,9 +666,6 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 	list_remove(&pio->io_child_list, zl);
 	list_remove(&cio->io_parent_list, zl);
 
-	pio->io_child_count--;
-	cio->io_parent_count--;
-
 	mutex_exit(&cio->io_lock);
 	mutex_exit(&pio->io_lock);
 	kmem_cache_free(zio_link_cache, zl);
@@ -1162,9 +1156,8 @@ zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
-    zio_done_func_t *physdone, zio_done_func_t *done,
-    void *private, zio_priority_t priority, zio_flag_t flags,
-    const zbookmark_phys_t *zb)
+    zio_done_func_t *done, void *private, zio_priority_t priority,
+    zio_flag_t flags, const zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
@@ -1184,7 +1177,6 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 
 	zio->io_ready = ready;
 	zio->io_children_ready = children_ready;
-	zio->io_physdone = physdone;
 	zio->io_prop = *zp;
 
 	/*
@@ -1517,16 +1509,11 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 		flags &= ~ZIO_FLAG_IO_ALLOCATING;
 	}
 
-
 	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
 	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 
-	zio->io_physdone = pio->io_physdone;
-	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
-		zio->io_logical->io_phys_children++;
-
 	return (zio);
 }
 
@@ -2711,7 +2698,7 @@ zio_gang_tree_assemble_done(zio_t *zio)
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(gio == zio_unique_parent(zio));
-	ASSERT(zio->io_child_count == 0);
+	ASSERT(list_is_empty(&zio->io_child_list));
 
 	if (zio->io_error)
 		return;
@@ -2969,7 +2956,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 		zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
 		    has_data ? abd_get_offset(pio->io_abd, pio->io_size -
 		    resid) : NULL, lsize, lsize, &zp,
-		    zio_write_gang_member_ready, NULL, NULL,
+		    zio_write_gang_member_ready, NULL,
 		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
@@ -3431,7 +3418,7 @@ zio_ddt_write(zio_t *zio)
 	} else {
 		cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
 		    zio->io_orig_size, zio->io_orig_size, zp,
-		    zio_ddt_child_write_ready, NULL, NULL,
+		    zio_ddt_child_write_ready, NULL,
 		    zio_ddt_child_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
@@ -4134,13 +4121,6 @@ zio_vdev_io_assess(zio_t *zio)
 	if (zio->io_error)
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
-	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
-	    zio->io_physdone != NULL) {
-		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
-		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
-		zio->io_physdone(zio->io_logical);
-	}
-
 	return (zio);
 }
 
@@ -4890,7 +4870,7 @@ zio_done(zio_t *zio)
 		return (NULL);
 	}
 
-	ASSERT(zio->io_child_count == 0);
+	ASSERT(list_is_empty(&zio->io_child_list));
 	ASSERT(zio->io_reexecute == 0);
 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
 

From 10e36e17612ba9c634b140ae76847bb62b5be68f Mon Sep 17 00:00:00 2001
From: George Amanakis <gamanakis@gmail.com>
Date: Thu, 15 Jun 2023 21:45:36 +0200
Subject: [PATCH 5/5] Shorten arcstat_quiescence sleep time

With the latest L2ARC fixes, 2 seconds is too long to wait for
quiescence of arcstats like l2_size. Shorten this interval to avoid
having the persistent L2ARC tests in ZTS prematurely terminated.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
Closes #14981
---
 tests/zfs-tests/include/libtest.shlib | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index 133f8387dda..844caa17d8e 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -3706,7 +3706,7 @@ function arcstat_quiescence # stat echo
 
 	while $do_once || [ $stat1 -ne $stat2 ] || [ $stat2 -eq 0 ]; do
 		typeset stat1=$(get_arcstat $stat)
-		sleep 2
+		sleep 0.5
 		typeset stat2=$(get_arcstat $stat)
 		do_once=false
 	done