From cfc564f9b11eed1421789fa018f7b3d141cc18d0 Mon Sep 17 00:00:00 2001
From: Ryan Moeller <ryan@iXsystems.com>
Date: Wed, 30 Jun 2021 10:37:20 -0400
Subject: [PATCH 01/15] ZED: Match added disk by pool/vdev GUID if found
 (#12217)

This enables ZED to auto-online vdevs that are not wholedisk managed by
ZFS.

Signed-off-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Don Brady <don.brady@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
---
 cmd/zed/agents/zfs_mod.c                      | 40 ++++++--
 cmd/zed/zed_disk_event.c                      |  2 +
 tests/runfiles/linux.run                      |  9 +-
 tests/test-runner/bin/zts-report.py.in        |  1 +
 .../tests/functional/fault/Makefile.am        |  1 +
 .../functional/fault/auto_online_002_pos.ksh  | 94 +++++++++++++++++++
 6 files changed, 137 insertions(+), 10 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/fault/auto_online_002_pos.ksh

diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c
index b564f2b12a2..3bcdf6e1d71 100644
--- a/cmd/zed/agents/zfs_mod.c
+++ b/cmd/zed/agents/zfs_mod.c
@@ -640,6 +640,27 @@ devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice)
 	return (data.dd_found);
 }
 
+/*
+ * Given a device guid, find any vdevs with a matching guid.
+ */
+static boolean_t
+guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid,
+    zfs_process_func_t func, boolean_t is_slice)
+{
+	dev_data_t data = { 0 };
+
+	data.dd_func = func;
+	data.dd_found = B_FALSE;
+	data.dd_pool_guid = pool_guid;
+	data.dd_vdev_guid = vdev_guid;
+	data.dd_islabeled = is_slice;
+	data.dd_new_devid = devid;
+
+	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
+
+	return (data.dd_found);
+}
+
 /*
  * Handle a EC_DEV_ADD.ESC_DISK event.
  *
@@ -663,15 +684,18 @@ static int
 zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
 {
 	char *devpath = NULL, *devid;
+	uint64_t pool_guid = 0, vdev_guid = 0;
 	boolean_t is_slice;
 
 	/*
-	 * Expecting a devid string and an optional physical location
+	 * Expecting a devid string and an optional physical location and guid
 	 */
 	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0)
 		return (-1);
 
 	(void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);
+	(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
+	(void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);
 
 	is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0);
 
@@ -682,12 +706,16 @@ zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
 	 * Iterate over all vdevs looking for a match in the following order:
 	 * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
 	 * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
-	 *
-	 * For disks, we only want to pay attention to vdevs marked as whole
-	 * disks or are a multipath device.
+	 * 3. ZPOOL_CONFIG_GUID (identifies unique vdev).
 	 */
-	if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL)
-		(void) devphys_iter(devpath, devid, zfs_process_add, is_slice);
+	if (devid_iter(devid, zfs_process_add, is_slice))
+		return (0);
+	if (devpath != NULL && devphys_iter(devpath, devid, zfs_process_add,
+	    is_slice))
+		return (0);
+	if (vdev_guid != 0)
+		(void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add,
+		    is_slice);
 
 	return (0);
 }
diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c
index 6ec566cff3c..94e24236063 100644
--- a/cmd/zed/zed_disk_event.c
+++ b/cmd/zed/zed_disk_event.c
@@ -72,6 +72,8 @@ zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl)
 		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval);
 	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0)
 		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval);
+	if (nvlist_lookup_boolean(nvl, DEV_IS_PART) == B_TRUE)
+		zed_log_msg(LOG_INFO, "\t%s: B_TRUE", DEV_IS_PART);
 	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0)
 		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval);
 	if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0)
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index c6d4f5f6d34..642ed824d46 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -98,10 +98,11 @@ tests = ['fallocate_prealloc', 'fallocate_punch-hole']
 tags = ['functional', 'fallocate']
 
 [tests/functional/fault:Linux]
-tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_replace_001_pos',
-    'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_multiple',
-    'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault',
-    'decompress_fault', 'scrub_after_resilver', 'zpool_status_-s']
+tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
+    'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos',
+    'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared',
+    'decrypt_fault', 'decompress_fault', 'scrub_after_resilver',
+    'zpool_status_-s']
 tags = ['functional', 'fault']
 
 [tests/functional/features/large_dnode:Linux]
diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 27c865ed5c7..8c3bce13491 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -323,6 +323,7 @@ if os.environ.get('CI') == 'true':
         'cli_root/zpool_split/zpool_split_wholedisk': ['SKIP', ci_reason],
         'fault/auto_offline_001_pos': ['SKIP', ci_reason],
         'fault/auto_online_001_pos': ['SKIP', ci_reason],
+        'fault/auto_online_002_pos': ['SKIP', ci_reason],
         'fault/auto_replace_001_pos': ['SKIP', ci_reason],
         'fault/auto_spare_ashift': ['SKIP', ci_reason],
         'fault/auto_spare_shared': ['SKIP', ci_reason],
diff --git a/tests/zfs-tests/tests/functional/fault/Makefile.am b/tests/zfs-tests/tests/functional/fault/Makefile.am
index f2fc06877d3..ba0d7d6992c 100644
--- a/tests/zfs-tests/tests/functional/fault/Makefile.am
+++ b/tests/zfs-tests/tests/functional/fault/Makefile.am
@@ -4,6 +4,7 @@ dist_pkgdata_SCRIPTS = \
 	cleanup.ksh \
 	auto_offline_001_pos.ksh \
 	auto_online_001_pos.ksh \
+	auto_online_002_pos.ksh \
 	auto_replace_001_pos.ksh \
 	auto_spare_001_pos.ksh \
 	auto_spare_002_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/fault/auto_online_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_online_002_pos.ksh
new file mode 100755
index 00000000000..60185ace34b
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/fault/auto_online_002_pos.ksh
@@ -0,0 +1,94 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved.
+# Copyright (c) 2019 by Delphix. All rights reserved.
+# Portions Copyright 2021 iXsystems, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/fault/fault.cfg
+
+#
+# DESCRIPTION:
+# Testing Fault Management Agent ZED Logic - Automated Auto-Online Test.
+# Now with partitioned vdevs.
+#
+# STRATEGY:
+# 1. Partition a scsi_debug device for simulating removal
+# 2. Create a pool
+# 3. Offline disk
+# 4. ZED polls for an event change for online disk to be automatically
+#    added back to the pool.
+#
+verify_runnable "both"
+
+function cleanup
+{
+	poolexists ${TESTPOOL} && destroy_pool ${TESTPOOL}
+	unload_scsi_debug
+}
+
+log_assert "Testing automated auto-online FMA test with partitioned vdev"
+
+log_onexit cleanup
+
+load_scsi_debug ${SDSIZE} ${SDHOSTS} ${SDTGTS} ${SDLUNS} '512b'
+SDDEVICE=$(get_debug_device)
+zpool labelclear -f ${SDDEVICE}
+partition_disk ${SDSIZE} ${SDDEVICE} 1
+part=${SDDEVICE}1
+host=$(get_scsi_host ${SDDEVICE})
+
+block_device_wait /dev/${part}
+log_must zpool create -f ${TESTPOOL} raidz1 ${part} ${DISKS}
+
+# Add some data to the pool
+log_must mkfile ${FSIZE} /${TESTPOOL}/data
+
+remove_disk ${SDDEVICE}
+check_state ${TESTPOOL} "" "degraded" || \
+    log_fail "${TESTPOOL} is not degraded"
+
+# Clear zpool events
+log_must zpool events -c
+
+# Online disk
+insert_disk ${SDDEVICE} ${host}
+
+log_note "Delay for ZED auto-online"
+typeset -i timeout=0
+until is_pool_resilvered ${TESTPOOL}; do
+	if ((timeout++ == MAXTIMEOUT)); then
+		log_fail "Timeout occurred"
+	fi
+	sleep 1
+done
+log_note "Auto-online of ${SDDEVICE} is complete"
+
+# Validate auto-online was successful
+sleep 1
+check_state ${TESTPOOL} "" "online" || \
+    log_fail "${TESTPOOL} is not back online"
+
+log_must zpool destroy ${TESTPOOL}
+
+log_pass "Auto-online with partitioned vdev test successful"

From 42afb12da70f1fd73c7fcf653d5e2bf42f05b42b Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 1 Jul 2021 11:16:54 -0400
Subject: [PATCH 02/15] Remove refcount from spa_config_*()

The only reason for spa_config_*() to use refcount instead of simple
non-atomic (thanks to scl_lock) variable for scl_count is tracking,
hard disabled for the last 8 years.  Switch to simple int scl_count
reduces the lock hold time by avoiding atomic, plus makes structure
fit into single cache line, reducing the locks contention.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12287
---
 include/sys/spa_impl.h |  4 ++--
 module/zfs/spa_misc.c  | 19 +++++++++----------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index bc88cfa15e8..280f8cf1695 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -141,9 +141,9 @@ typedef struct spa_config_lock {
 	kmutex_t	scl_lock;
 	kthread_t	*scl_writer;
 	int		scl_write_wanted;
+	int		scl_count;
 	kcondvar_t	scl_cv;
-	zfs_refcount_t	scl_count;
-} spa_config_lock_t;
+} ____cacheline_aligned spa_config_lock_t;
 
 typedef struct spa_config_dirent {
 	list_node_t	scd_link;
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index e2523231d28..157dede93cf 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -444,9 +444,9 @@ spa_config_lock_init(spa_t *spa)
 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
 		mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
 		cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
-		zfs_refcount_create_untracked(&scl->scl_count);
 		scl->scl_writer = NULL;
 		scl->scl_write_wanted = 0;
+		scl->scl_count = 0;
 	}
 }
 
@@ -457,9 +457,9 @@ spa_config_lock_destroy(spa_t *spa)
 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
 		mutex_destroy(&scl->scl_lock);
 		cv_destroy(&scl->scl_cv);
-		zfs_refcount_destroy(&scl->scl_count);
 		ASSERT(scl->scl_writer == NULL);
 		ASSERT(scl->scl_write_wanted == 0);
+		ASSERT(scl->scl_count == 0);
 	}
 }
 
@@ -480,7 +480,7 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
 			}
 		} else {
 			ASSERT(scl->scl_writer != curthread);
-			if (!zfs_refcount_is_zero(&scl->scl_count)) {
+			if (scl->scl_count != 0) {
 				mutex_exit(&scl->scl_lock);
 				spa_config_exit(spa, locks & ((1 << i) - 1),
 				    tag);
@@ -488,7 +488,7 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
 			}
 			scl->scl_writer = curthread;
 		}
-		(void) zfs_refcount_add(&scl->scl_count, tag);
+		scl->scl_count++;
 		mutex_exit(&scl->scl_lock);
 	}
 	return (1);
@@ -514,14 +514,14 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
 			}
 		} else {
 			ASSERT(scl->scl_writer != curthread);
-			while (!zfs_refcount_is_zero(&scl->scl_count)) {
+			while (scl->scl_count != 0) {
 				scl->scl_write_wanted++;
 				cv_wait(&scl->scl_cv, &scl->scl_lock);
 				scl->scl_write_wanted--;
 			}
 			scl->scl_writer = curthread;
 		}
-		(void) zfs_refcount_add(&scl->scl_count, tag);
+		scl->scl_count++;
 		mutex_exit(&scl->scl_lock);
 	}
 	ASSERT3U(wlocks_held, <=, locks);
@@ -535,8 +535,8 @@ spa_config_exit(spa_t *spa, int locks, const void *tag)
 		if (!(locks & (1 << i)))
 			continue;
 		mutex_enter(&scl->scl_lock);
-		ASSERT(!zfs_refcount_is_zero(&scl->scl_count));
-		if (zfs_refcount_remove(&scl->scl_count, tag) == 0) {
+		ASSERT(scl->scl_count > 0);
+		if (--scl->scl_count == 0) {
 			ASSERT(scl->scl_writer == NULL ||
 			    scl->scl_writer == curthread);
 			scl->scl_writer = NULL;	/* OK in either case */
@@ -555,8 +555,7 @@ spa_config_held(spa_t *spa, int locks, krw_t rw)
 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
 		if (!(locks & (1 << i)))
 			continue;
-		if ((rw == RW_READER &&
-		    !zfs_refcount_is_zero(&scl->scl_count)) ||
+		if ((rw == RW_READER && scl->scl_count != 0) ||
 		    (rw == RW_WRITER && scl->scl_writer == curthread))
 			locks_held |= 1 << i;
 	}

From 50e09eddd0c556ace90357b081d23df3f00c2d83 Mon Sep 17 00:00:00 2001
From: Kevin Jin <33590050+jxdking@users.noreply.github.com>
Date: Thu, 1 Jul 2021 11:20:27 -0400
Subject: [PATCH 03/15] Optimize txg_kick() process (#12274)

Use dp_dirty_pertxg[] for txg_kick(), instead of dp_dirty_total in
original code. Extra parameter "txg" is added for txg_kick(), thus it
knows which txg to kick. Also txg_kick() call is moved from
dsl_pool_need_dirty_delay() to dsl_pool_dirty_space() so that we can
know the txg number assigned for txg_kick().

Some unnecessary code regarding dp_dirty_total in txg_sync_thread() is
also cleaned up.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: jxdking <lostking2008@hotmail.com>
Closes #12274
---
 include/sys/txg.h     |  2 +-
 module/zfs/dsl_pool.c | 25 +++++++++++++++++++------
 module/zfs/txg.c      | 36 ++++++++++++++----------------------
 3 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/include/sys/txg.h b/include/sys/txg.h
index 22158bd1a5e..f38f0006c04 100644
--- a/include/sys/txg.h
+++ b/include/sys/txg.h
@@ -78,7 +78,7 @@ extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
 
 extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta,
     hrtime_t resolution);
-extern void txg_kick(struct dsl_pool *dp);
+extern void txg_kick(struct dsl_pool *dp, uint64_t txg);
 
 /*
  * Wait until the given transaction group has finished syncing.
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index e66c136a9e0..72f4b86d772 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -898,18 +898,26 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
 	uint64_t delay_min_bytes =
 	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
-	uint64_t dirty_min_bytes =
-	    zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
-	uint64_t dirty;
 
 	mutex_enter(&dp->dp_lock);
-	dirty = dp->dp_dirty_total;
+	uint64_t dirty = dp->dp_dirty_total;
 	mutex_exit(&dp->dp_lock);
-	if (dirty > dirty_min_bytes)
-		txg_kick(dp);
+
 	return (dirty > delay_min_bytes);
 }
 
+static boolean_t
+dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg)
+{
+	ASSERT(MUTEX_HELD(&dp->dp_lock));
+
+	uint64_t dirty_min_bytes =
+	    zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
+	uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
+
+	return (dirty > dirty_min_bytes);
+}
+
 void
 dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 {
@@ -917,7 +925,12 @@ dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 		mutex_enter(&dp->dp_lock);
 		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
 		dsl_pool_dirty_delta(dp, space);
+		boolean_t needsync = !dmu_tx_is_syncing(tx) &&
+		    dsl_pool_need_dirty_sync(dp, tx->tx_txg);
 		mutex_exit(&dp->dp_lock);
+
+		if (needsync)
+			txg_kick(dp, tx->tx_txg);
 	}
 }
 
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index c55b1d8f960..c9eb84bbdb1 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -498,14 +498,6 @@ txg_wait_callbacks(dsl_pool_t *dp)
 		taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0);
 }
 
-static boolean_t
-txg_is_syncing(dsl_pool_t *dp)
-{
-	tx_state_t *tx = &dp->dp_tx;
-	ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
-	return (tx->tx_syncing_txg != 0);
-}
-
 static boolean_t
 txg_is_quiescing(dsl_pool_t *dp)
 {
@@ -539,8 +531,6 @@ txg_sync_thread(void *arg)
 		clock_t timeout = zfs_txg_timeout * hz;
 		clock_t timer;
 		uint64_t txg;
-		uint64_t dirty_min_bytes =
-		    zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
 
 		/*
 		 * We sync when we're scanning, there's someone waiting
@@ -551,8 +541,7 @@ txg_sync_thread(void *arg)
 		while (!dsl_scan_active(dp->dp_scan) &&
 		    !tx->tx_exiting && timer > 0 &&
 		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
-		    !txg_has_quiesced_to_sync(dp) &&
-		    dp->dp_dirty_total < dirty_min_bytes) {
+		    !txg_has_quiesced_to_sync(dp)) {
 			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
 			    (u_longlong_t)tx->tx_synced_txg,
 			    (u_longlong_t)tx->tx_sync_txg_waiting, dp);
@@ -566,6 +555,11 @@ txg_sync_thread(void *arg)
 		 * prompting it to do so if necessary.
 		 */
 		while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
+			if (txg_is_quiescing(dp)) {
+				txg_thread_wait(tx, &cpr,
+				    &tx->tx_quiesce_done_cv, 0);
+				continue;
+			}
 			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
 				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
 			cv_broadcast(&tx->tx_quiesce_more_cv);
@@ -791,24 +785,22 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
 }
 
 /*
- * If there isn't a txg syncing or in the pipeline, push another txg through
- * the pipeline by quiescing the open txg.
+ * Pass in the txg number that should be synced.
  */
 void
-txg_kick(dsl_pool_t *dp)
+txg_kick(dsl_pool_t *dp, uint64_t txg)
 {
 	tx_state_t *tx = &dp->dp_tx;
 
 	ASSERT(!dsl_pool_config_held(dp));
 
+	if (tx->tx_sync_txg_waiting >= txg)
+		return;
+
 	mutex_enter(&tx->tx_sync_lock);
-	if (!txg_is_syncing(dp) &&
-	    !txg_is_quiescing(dp) &&
-	    tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
-	    tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
-	    tx->tx_quiesced_txg <= tx->tx_synced_txg) {
-		tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
-		cv_broadcast(&tx->tx_quiesce_more_cv);
+	if (tx->tx_sync_txg_waiting < txg) {
+		tx->tx_sync_txg_waiting = txg;
+		cv_broadcast(&tx->tx_sync_more_cv);
 	}
 	mutex_exit(&tx->tx_sync_lock);
 }

From eca174527e0b8416550e6ce87c405702fd379ada Mon Sep 17 00:00:00 2001
From: Jorgen Lundman <lundman@lundman.net>
Date: Fri, 2 Jul 2021 00:22:16 +0900
Subject: [PATCH 04/15] Upstream: dmu_zfetch_stream_fini leaks refcount

dmu_zfetch_stream_fini() is missing calls to destroy the refcounts,
leaking them and the mutex inside.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Jorgen Lundman <lundman@lundman.net>
Closes #12294
---
 module/zfs/dmu_zfetch.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
index 4a323fa990f..a26b0d73992 100644
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -159,6 +159,8 @@ static void
 dmu_zfetch_stream_fini(zstream_t *zs)
 {
 	ASSERT(!list_link_active(&zs->zs_node));
+	zfs_refcount_destroy(&zs->zs_callers);
+	zfs_refcount_destroy(&zs->zs_refs);
 	kmem_free(zs, sizeof (*zs));
 }
 

From c6d1112bf4125e5a22eb47ceb7b8cee01f0df9a1 Mon Sep 17 00:00:00 2001
From: Jorgen Lundman <lundman@lundman.net>
Date: Fri, 2 Jul 2021 00:28:15 +0900
Subject: [PATCH 05/15] Fix abd leak, kmem_free correct size of abd_t

Fix a leak of abd_t that manifested mostly when using
raidzN with at least as many columns as N (e.g. a
four-disk raidz2 but not a three-disk raidz2).
Sufficiently heavy raidz use would eventually run a system
out of memory.

Additionally:

* Switch abd_cache arena to FIRSTFIT, which empirically
improves perofrmance.

* Make abd_chunk_cache more performant and debuggable.

* Allocate the abd_zero_buf from abd_chunk_cache rather
than the heap.

* Don't try to reap non-existent qcaches in abd_cache arena.

* KM_PUSHPAGE->KM_SLEEP when allocating chunks from their
own arena

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Jorgen Lundman <lundman@lundman.net>
Co-authored-by: Sean Doran <smd@use.net>
Closes #12295
---
 include/sys/abd_impl.h         | 2 +-
 module/os/freebsd/zfs/abd_os.c | 9 ++++++---
 module/os/linux/zfs/abd_os.c   | 3 ++-
 module/zfs/abd.c               | 2 +-
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h
index 6bce08cfa34..113700cd72b 100644
--- a/include/sys/abd_impl.h
+++ b/include/sys/abd_impl.h
@@ -64,7 +64,7 @@ void abd_free_struct(abd_t *);
  */
 
 abd_t *abd_alloc_struct_impl(size_t);
-abd_t *abd_get_offset_scatter(abd_t *, abd_t *, size_t);
+abd_t *abd_get_offset_scatter(abd_t *, abd_t *, size_t, size_t);
 void abd_free_struct_impl(abd_t *);
 void abd_alloc_chunks(abd_t *, size_t);
 void abd_free_chunks(abd_t *);
diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
index 47adc2278df..4f5b33d9451 100644
--- a/module/os/freebsd/zfs/abd_os.c
+++ b/module/os/freebsd/zfs/abd_os.c
@@ -374,14 +374,17 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata)
 }
 
 abd_t *
-abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
+abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
+    size_t size)
 {
 	abd_verify(sabd);
 	ASSERT3U(off, <=, sabd->abd_size);
 
 	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
-	uint_t chunkcnt = abd_scatter_chunkcnt(sabd) -
-	    (new_offset / zfs_abd_chunk_size);
+	size_t chunkcnt = abd_chunkcnt_for_bytes(
+	    (new_offset % zfs_abd_chunk_size) + size);
+
+	ASSERT3U(chunkcnt, <=, abd_scatter_chunkcnt(sabd));
 
 	/*
 	 * If an abd struct is provided, it is only the minimum size.  If we
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index af543d6e3f7..d1d238a4e30 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -835,7 +835,8 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata)
 }
 
 abd_t *
-abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
+abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
+    size_t size)
 {
 	int i = 0;
 	struct scatterlist *sg = NULL;
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index d5fafccd08a..cc2d3575db6 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -531,7 +531,7 @@ abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
 		}
 		ASSERT3U(left, ==, 0);
 	} else {
-		abd = abd_get_offset_scatter(abd, sabd, off);
+		abd = abd_get_offset_scatter(abd, sabd, off, size);
 	}
 
 	ASSERT3P(abd, !=, NULL);

From 490c845efe3ca29eaa8aa6ea1e3f45eda72895fe Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 1 Jul 2021 11:30:31 -0400
Subject: [PATCH 06/15] Compact dbuf/buf hashes and lock arrays

With default dbuf cache size of 1/32 of ARC, it makes no sense to have
hash table of the same size (or even bigger on Linux).  Reduce it to
1/8 of ARC's one, still leaving some slack, assuming higher I/O rate
via dbuf cache than via ARC.

Remove padding from ARC hash locks array.  The idea behind padding
is to avoid false sharing between locks.  It would have sense if
there would be a limited number of very busy locks.  But since we
have no limit on the number, using the same memory for more locks we
can achieve even lower lock contention with the same false sharing,
or we can use less memory for the same contention level.

Reduce number of hash locks from 8192 to 2048.  The number is still
big enough to not cause contention, but reduced memory size improves
cache hit rate for mutex_tryenter() in ARC eviction thread, saving
about 1% of the thread time.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12289
---
 include/sys/dbuf.h |  4 ++--
 module/zfs/arc.c   | 25 ++++++-------------------
 module/zfs/dbuf.c  |  6 +++---
 3 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index d221eac4c81..6ae079c6a64 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -322,12 +322,12 @@ typedef struct dmu_buf_impl {
 } dmu_buf_impl_t;
 
 /* Note: the dbuf hash table is exposed only for the mdb module */
-#define	DBUF_MUTEXES 8192
+#define	DBUF_MUTEXES 2048
 #define	DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
 typedef struct dbuf_hash_table {
 	uint64_t hash_table_mask;
 	dmu_buf_impl_t **hash_table;
-	kmutex_t hash_mutexes[DBUF_MUTEXES];
+	kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned;
 } dbuf_hash_table_t;
 
 typedef void (*dbuf_prefetch_fn)(void *, boolean_t);
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 3484fff3b4d..394ca1bfe42 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -740,29 +740,18 @@ taskq_t *arc_prune_taskq;
  * Hash table routines
  */
 
-#define	HT_LOCK_ALIGN	64
-#define	HT_LOCK_PAD	(P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
-
-struct ht_lock {
-	kmutex_t	ht_lock;
-#ifdef _KERNEL
-	unsigned char	pad[HT_LOCK_PAD];
-#endif
-};
-
-#define	BUF_LOCKS 8192
+#define	BUF_LOCKS 2048
 typedef struct buf_hash_table {
 	uint64_t ht_mask;
 	arc_buf_hdr_t **ht_table;
-	struct ht_lock ht_locks[BUF_LOCKS];
+	kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned;
 } buf_hash_table_t;
 
 static buf_hash_table_t buf_hash_table;
 
 #define	BUF_HASH_INDEX(spa, dva, birth) \
 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
-#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
-#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define	BUF_HASH_LOCK(idx)	(&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 #define	HDR_LOCK(hdr) \
 	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 
@@ -1111,7 +1100,7 @@ buf_fini(void)
 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
 #endif
 	for (i = 0; i < BUF_LOCKS; i++)
-		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+		mutex_destroy(BUF_HASH_LOCK(i));
 	kmem_cache_destroy(hdr_full_cache);
 	kmem_cache_destroy(hdr_full_crypt_cache);
 	kmem_cache_destroy(hdr_l2only_cache);
@@ -1276,10 +1265,8 @@ retry:
 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 
-	for (i = 0; i < BUF_LOCKS; i++) {
-		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
-		    NULL, MUTEX_DEFAULT, NULL);
-	}
+	for (i = 0; i < BUF_LOCKS; i++)
+		mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL);
 }
 
 #define	ARC_MINTIME	(hz>>4) /* 62 ms */
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 9ce091b80dc..289247c6ed6 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -826,12 +826,12 @@ dbuf_init(void)
 	int i;
 
 	/*
-	 * The hash table is big enough to fill all of physical memory
+	 * The hash table is big enough to fill one eighth of physical memory
 	 * with an average block size of zfs_arc_average_blocksize (default 8K).
 	 * By default, the table will take up
 	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
 	 */
-	while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
+	while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
 		hsize <<= 1;
 
 retry:
@@ -3055,8 +3055,8 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
 	db->db_state = DB_EVICTING; /* not worth logging this state change */
 	if ((odb = dbuf_hash_insert(db)) != NULL) {
 		/* someone else inserted it first */
-		kmem_cache_free(dbuf_kmem_cache, db);
 		mutex_exit(&dn->dn_dbufs_mtx);
+		kmem_cache_free(dbuf_kmem_cache, db);
 		DBUF_STAT_BUMP(hash_insert_race);
 		return (odb);
 	}

From b192a2c0a19133234972b491e9c06093b3d21358 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 1 Jul 2021 11:32:31 -0400
Subject: [PATCH 07/15] Remove avl_size field from struct avl_tree

This field is used only by illumos mdb.  On other platforms it only
increases the struct size from 32 to 40 bytes.  For struct vdev_queue
including 13 instances of avl_tree_t size means active cache lines.

Keep the padding in user-space for now to not break the ABI.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12290
---
 include/sys/avl_impl.h | 4 +++-
 module/avl/avl.c       | 2 --
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/sys/avl_impl.h b/include/sys/avl_impl.h
index f577ecd42f7..c464a62a1ca 100644
--- a/include/sys/avl_impl.h
+++ b/include/sys/avl_impl.h
@@ -147,7 +147,9 @@ struct avl_tree {
 	int (*avl_compar)(const void *, const void *);
 	size_t avl_offset;		/* offsetof(type, avl_link_t field) */
 	ulong_t avl_numnodes;		/* number of nodes in the tree */
-	size_t avl_size;		/* sizeof user type struct */
+#ifndef _KERNEL
+	size_t avl_pad;			/* For backwards ABI compatibility. */
+#endif
 };
 
 
diff --git a/module/avl/avl.c b/module/avl/avl.c
index 1a95092bc2b..3d36d4c87e0 100644
--- a/module/avl/avl.c
+++ b/module/avl/avl.c
@@ -875,7 +875,6 @@ avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
 
 	ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
 	ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
-	ASSERT3U(tree1->avl_size, ==, tree2->avl_size);
 
 	temp_node = tree1->avl_root;
 	temp_numnodes = tree1->avl_numnodes;
@@ -903,7 +902,6 @@ avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *),
 	tree->avl_compar = compar;
 	tree->avl_root = NULL;
 	tree->avl_numnodes = 0;
-	tree->avl_size = size;
 	tree->avl_offset = offset;
 }
 

From a5398f8782ebc22930cfbad846decaeb17570e86 Mon Sep 17 00:00:00 2001
From: Justin Gottula <justin@jgottula.com>
Date: Tue, 29 Jun 2021 18:50:13 -0700
Subject: [PATCH 08/15] Udev rules: use non-ancient comma syntax

This file is old as dirt. It's entirely possible that commas were
optional in udev back at that time. But they're definitely supposed to
be there nowadays.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Signed-off-by: Justin Gottula <justin@jgottula.com>
Closes #12302
---
 udev/rules.d/60-zvol.rules.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udev/rules.d/60-zvol.rules.in b/udev/rules.d/60-zvol.rules.in
index 60bbff8d2df..4447a253850 100644
--- a/udev/rules.d/60-zvol.rules.in
+++ b/udev/rules.d/60-zvol.rules.in
@@ -3,4 +3,4 @@
 # persistent disk links: /dev/zvol/dataset_name
 # also creates compatibility symlink of /dev/dataset_name
 
-KERNEL=="zd*" SUBSYSTEM=="block" ACTION=="add|change" PROGRAM="@udevdir@/zvol_id $tempnode" SYMLINK+="zvol/%c %c"
+KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM="@udevdir@/zvol_id $tempnode", SYMLINK+="zvol/%c %c"

From 17c794e7b0723b1f90229771ddf1dc2ac227f224 Mon Sep 17 00:00:00 2001
From: Justin Gottula <justin@jgottula.com>
Date: Tue, 29 Jun 2021 18:52:33 -0700
Subject: [PATCH 09/15] Udev rules: replace deprecated $tempnode with $devnode

The $tempnode substitution is so old that it's not even mentioned in the
man page anymore. It is still technically supported by udev, but with
plenty of "deprecated" comments surrounding it.

The preferred modern equivalent of $tempnode is $devnode (or
alternatively, %N).

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Signed-off-by: Justin Gottula <justin@jgottula.com>
Closes #12302
---
 udev/rules.d/60-zvol.rules.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udev/rules.d/60-zvol.rules.in b/udev/rules.d/60-zvol.rules.in
index 4447a253850..5e1829029c6 100644
--- a/udev/rules.d/60-zvol.rules.in
+++ b/udev/rules.d/60-zvol.rules.in
@@ -3,4 +3,4 @@
 # persistent disk links: /dev/zvol/dataset_name
 # also creates compatibility symlink of /dev/dataset_name
 
-KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM="@udevdir@/zvol_id $tempnode", SYMLINK+="zvol/%c %c"
+KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM="@udevdir@/zvol_id $devnode", SYMLINK+="zvol/%c %c"

From f1aef8d4df83c8dc27e366fa1c720f91661d89c2 Mon Sep 17 00:00:00 2001
From: Justin Gottula <justin@jgottula.com>
Date: Tue, 29 Jun 2021 18:54:38 -0700
Subject: [PATCH 10/15] Udev rules: use match (==) rather than assign (=) for
 PROGRAM

Assignment syntax (=) can be used for the PROGRAM key. But the PROGRAM
key is really a match key, not an assign key. The internal logic used by
udev to decide whether a PROGRAM key "matched" or not (which determines
whether the remainder of the rule is evaluated) depends on whether the
operator was OP_MATCH (==) or OP_NOMATCH (!=). [1]

The man page claims that '"=", ":=", and "+=" have the same effect as
"=="' for PROGRAM keys. And, after a brief perusal, the udev source code
does seem to confirm that operators other than OP_MATCH (==) or
OP_NOMATCH (!=) are implicitly converted to OP_MATCH (==). [2] But it's
not entirely clear that this is definitely the case: anecdotal testing
seems to indicate that when OP_ASSIGN (=) is used, the program's exit
status is disregarded and the remainder of the rule is processed
regardless of whether it was, in fact, a successful exit.

The bottom line here is that, if zvol_id hits some snag and returns a
nonzero exit status, then we almost certainly do NOT want to continue on
with the rule and use whatever the stdout contents may have been to
mindlessly create /dev/zvol/* symlinks. Therefore, let's be extra-sure
and use the match (==) operator explicitly, to eliminate any possibility
that udev might do the wrong thing, and ensure that a nonzero exit
status will definitely short-circuit the rest of the rule, bypassing the
SYMLINK+= assignments.

[1]
udev,
 file src/udev/udev-rules.c,
  func udev_rule_apply_token_to_event,
   switch case TK_M_PROGRAM if r != 0 (nonzero exit status):
      return token->op == OP_NOMATCH;
   switch case TK_M_PROGRAM if r == 0 (zero exit status):
      return token->op == OP_MATCH;
   func retval 0 => key is considered to have matched
   func retval 1 => key is considered to have NOT matched

[2]
udev,
 file src/udev/udev-rules.c,
  func parse_token,
   at func start:
      bool is_match = IN_SET(op, OP_MATCH, OP_NOMATCH);
   in else-if case streq(key, "PROGRAM"):
      if (!is_match) op = OP_MATCH;

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Signed-off-by: Justin Gottula <justin@jgottula.com>
Closes #12302
---
 udev/rules.d/60-zvol.rules.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udev/rules.d/60-zvol.rules.in b/udev/rules.d/60-zvol.rules.in
index 5e1829029c6..a3c7d2acf75 100644
--- a/udev/rules.d/60-zvol.rules.in
+++ b/udev/rules.d/60-zvol.rules.in
@@ -3,4 +3,4 @@
 # persistent disk links: /dev/zvol/dataset_name
 # also creates compatibility symlink of /dev/dataset_name
 
-KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM="@udevdir@/zvol_id $devnode", SYMLINK+="zvol/%c %c"
+KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM=="@udevdir@/zvol_id $devnode", SYMLINK+="zvol/%c %c"

From b19e2bdfb5a2f0b652e12f73d8e01e80773aaadd Mon Sep 17 00:00:00 2001
From: Justin Gottula <justin@jgottula.com>
Date: Tue, 29 Jun 2021 20:14:18 -0700
Subject: [PATCH 11/15] Print zvol_id error messages to stderr rather than
 stdout

The zvol_id program is invoked by udev, via a PROGRAM key in the
60-zvol.rules.in rule file, to determine the "pretty" /dev/zvol/*
symlink paths paths that should be generated for each opaquely named
/dev/zd* dev node.

The udev rule uses the PROGRAM key, followed by a SYMLINK+= assignment
containing the %c substitution, to collect the program's stdout and then
"paste" it directly into the name of the symlink(s) to be created.

Unfortunately, as currently written, zvol_id outputs both its intended
output (a single string representing the symlink path that should be
created to refer to the name of the dataset whose /dev/zd* path is
given) AND its error messages (if any) to stdout.

When processing PROGRAM keys (and others, such as IMPORT{program}), udev
uses only the data written to stdout for functional purposes. Any data
written to stderr is used solely for the purposes of logging (if udev's
log_level is set to debug).

The unintended consequence of this is as follows: if zvol_id encounters
an error condition; and then udev fails to halt processing of the
current rule (either because zvol_id didn't return a nonzero exit
status, or because the PROGRAM key in the rule wasn't written properly
to result in a "non-match" condition that would stop the current rule on
a nonzero exit); then udev will create a space-delimited list of symlink
names derived directly from the words of the error message string!

I've observed this exact behavior on my own system, in a situation where
the open() syscall on /dev/zd* dev nodes was failing sporadically (for
reasons that aren't especially relevant here). Because the open() call
failed, zvol_id printed "Unable to open device file: /dev/zd736\n" to
stdout and then exited.

The udev rule finished with SYMLINK+="zvol/%c %c". Assuming a volume
name like pool/foo/bar, this would ordinarily expand to
   SYMLINK+="zvol/pool/foo/bar pool/foo/bar"
and would cause symlinks to be created like this:
   /dev/zvol/pool/foo/bar -> /dev/zd736
   /dev/pool/foo/bar      -> /dev/zd736

But because of the combination of error messages being printed to
stdout, and the udev syntax freely accepting a space-delimited sequence
of names in this context, the error message string
   "Unable to open device file: /dev/zd736\n"
in reality expanded to
   SYMLINK+="zvol/Unable to open device file: /dev/zd736"
which caused the following symlinks to actually be created:
   /dev/zvol/Unable -> /dev/zd736
   /dev/to          -> /dev/zd736
   /dev/open        -> /dev/zd736
   /dev/device      -> /dev/zd736
   /dev/file:       -> /dev/zd736
   /dev//dev/zd736  -> /dev/zd736

(And, because multiple zvols had open() syscall errors, multiple zvols
attempted to claim several of those symlink names, resulting in numerous
udev errors and timeouts and general chaos.)

This commit rectifies all this silliness by simply printing error
messages to stderr, as Dennis Ritchie originally intended.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Signed-off-by: Justin Gottula <justin@jgottula.com>
Closes #12302
---
 cmd/zvol_id/zvol_id_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmd/zvol_id/zvol_id_main.c b/cmd/zvol_id/zvol_id_main.c
index 4a2d74cc203..7e01028562b 100644
--- a/cmd/zvol_id/zvol_id_main.c
+++ b/cmd/zvol_id/zvol_id_main.c
@@ -63,14 +63,14 @@ main(int argc, char **argv)
 	int rc;
 
 	if (argc < 2) {
-		printf("Usage: %s /dev/zvol_device_node\n", argv[0]);
+		fprintf(stderr, "Usage: %s /dev/zvol_device_node\n", argv[0]);
 		return (EINVAL);
 	}
 
 	dev_name = argv[1];
 	error = stat64(dev_name, &statbuf);
 	if (error != 0) {
-		printf("Unable to access device file: %s\n", dev_name);
+		fprintf(stderr, "Unable to access device file: %s\n", dev_name);
 		return (errno);
 	}
 
@@ -79,13 +79,13 @@ main(int argc, char **argv)
 
 	fd = open(dev_name, O_RDONLY);
 	if (fd < 0) {
-		printf("Unable to open device file: %s\n", dev_name);
+		fprintf(stderr, "Unable to open device file: %s\n", dev_name);
 		return (errno);
 	}
 
 	error = ioctl_get_msg(zvol_name, fd);
 	if (error < 0) {
-		printf("ioctl_get_msg failed:%s\n", strerror(errno));
+		fprintf(stderr, "ioctl_get_msg failed: %s\n", strerror(errno));
 		return (errno);
 	}
 	if (dev_part > 0)

From f24c7c359ea486438ecadf1496e7c9a77b132a0c Mon Sep 17 00:00:00 2001
From: Justin Gottula <justin@jgottula.com>
Date: Tue, 29 Jun 2021 21:29:09 -0700
Subject: [PATCH 12/15] Use substantially more robust program exit status logic
 in zvol_id

Currently, there are several places in zvol_id where the program logic
returns particular errno values, or even particular ioctl return values,
as the program exit status, rather than a straightforward system of
explicit zero on success and explicit nonzero value(s) on failure.

This is problematic for multiple reasons. One particularly interesting
problem that can arise, is that if any of these values happens to have
all 8 least significant bits unset (i.e., it is a positive or negative
multiple of 256), then although the C program sees a nonzero int value
(presumed to be a failure exit status), the actual exit status as seen
by the system is only the bottom 8 bits of that integer: zero.

This can happen in practice, and I have encountered it myself. In a
particularly weird situation, the zvol_open code in the zfs kernel
module was behaving in such a manner that it caused the open() syscall
to fail and for errno to be set to a kernel-private value (ERESTARTSYS,
which happens to be defined as 512). It turns out that 512 is evenly
divisible by 256; or, in other words, its least significant 8 bits are
all-zero. So even though zvol_id believed it was returning a nonzero
(failure) exit status of 512, the system modulo'd that value by 256,
resulting in the actual exit status visible by other programs being 0!
This actually-zero (non-failure) exit status caused problems: udev
believed that the program was operating successfully, when in fact it
was attempting to indicate failure via a nonzero exit status integer.
Combined with another problem, this led to the creation of nonsense
symlinks for zvol dev nodes by udev.

Let's get rid of all this problematic logic, and simply return
EXIT_SUCCESS (0) is everything went fine, and EXIT_FAILURE (1) if
anything went wrong.

Additionally, let's clarify some of the variable names (error is similar
to errno, etc) and clean up the overall program flow a bit.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Signed-off-by: Justin Gottula <justin@jgottula.com>
Closes #12302
---
 cmd/zvol_id/zvol_id_main.c | 50 ++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/cmd/zvol_id/zvol_id_main.c b/cmd/zvol_id/zvol_id_main.c
index 7e01028562b..22f2e848cba 100644
--- a/cmd/zvol_id/zvol_id_main.c
+++ b/cmd/zvol_id/zvol_id_main.c
@@ -38,40 +38,39 @@
 static int
 ioctl_get_msg(char *var, int fd)
 {
-	int error = 0;
+	int ret;
 	char msg[ZFS_MAX_DATASET_NAME_LEN];
 
-	error = ioctl(fd, BLKZNAME, msg);
-	if (error < 0) {
-		return (error);
+	ret = ioctl(fd, BLKZNAME, msg);
+	if (ret < 0) {
+		return (ret);
 	}
 
 	snprintf(var, ZFS_MAX_DATASET_NAME_LEN, "%s", msg);
-	return (error);
+	return (ret);
 }
 
 int
 main(int argc, char **argv)
 {
-	int fd, error = 0;
+	int fd = -1, ret = 0, status = EXIT_FAILURE;
 	char zvol_name[ZFS_MAX_DATASET_NAME_LEN];
 	char *zvol_name_part = NULL;
 	char *dev_name;
 	struct stat64 statbuf;
 	int dev_minor, dev_part;
 	int i;
-	int rc;
 
 	if (argc < 2) {
 		fprintf(stderr, "Usage: %s /dev/zvol_device_node\n", argv[0]);
-		return (EINVAL);
+		goto fail;
 	}
 
 	dev_name = argv[1];
-	error = stat64(dev_name, &statbuf);
-	if (error != 0) {
+	ret = stat64(dev_name, &statbuf);
+	if (ret != 0) {
 		fprintf(stderr, "Unable to access device file: %s\n", dev_name);
-		return (errno);
+		goto fail;
 	}
 
 	dev_minor = minor(statbuf.st_rdev);
@@ -80,22 +79,22 @@ main(int argc, char **argv)
 	fd = open(dev_name, O_RDONLY);
 	if (fd < 0) {
 		fprintf(stderr, "Unable to open device file: %s\n", dev_name);
-		return (errno);
+		goto fail;
 	}
 
-	error = ioctl_get_msg(zvol_name, fd);
-	if (error < 0) {
+	ret = ioctl_get_msg(zvol_name, fd);
+	if (ret < 0) {
 		fprintf(stderr, "ioctl_get_msg failed: %s\n", strerror(errno));
-		return (errno);
+		goto fail;
 	}
 	if (dev_part > 0)
-		rc = asprintf(&zvol_name_part, "%s-part%d", zvol_name,
+		ret = asprintf(&zvol_name_part, "%s-part%d", zvol_name,
 		    dev_part);
 	else
-		rc = asprintf(&zvol_name_part, "%s", zvol_name);
+		ret = asprintf(&zvol_name_part, "%s", zvol_name);
 
-	if (rc == -1 || zvol_name_part == NULL)
-		goto error;
+	if (ret == -1 || zvol_name_part == NULL)
+		goto fail;
 
 	for (i = 0; i < strlen(zvol_name_part); i++) {
 		if (isblank(zvol_name_part[i]))
@@ -103,8 +102,13 @@ main(int argc, char **argv)
 	}
 
 	printf("%s\n", zvol_name_part);
-	free(zvol_name_part);
-error:
-	close(fd);
-	return (error);
+	status = EXIT_SUCCESS;
+
+fail:
+	if (zvol_name_part)
+		free(zvol_name_part);
+	if (fd >= 0)
+		close(fd);
+
+	return (status);
 }

From 6e4e3c3ab67d4ad050a5e704287ea3577fe45b17 Mon Sep 17 00:00:00 2001
From: Justin Gottula <justin@jgottula.com>
Date: Tue, 6 Jul 2021 13:41:17 -0700
Subject: [PATCH 13/15] Udev rules: remove zvol compat symlinks (without the
 leading zvol/)

This is a potentially arguable change, because it removes some
compatibility cruft that certain systems or people may have come to rely
on (either a very long time ago, or unwisely in recent times).

On the other hand, it's been literally over a decade since OpenZFS
switched to the strategy of using opaque numbered /dev/zd* device nodes,
with the canonical zvol access path being a directory tree of symlinks
created by udev rules inside /dev/zvol/*. (See #102.) Even at the time,
the /dev/* scheme was labeled as being for "compatibility".

This commit removes the second tree of symlinks located directly at
/dev/*, under the assumption that anybody with any sense has been using
the intended /dev/zvol/* path for a very very long time now.

(The more I think about this, the more I anticipate that some large
fraction of people will have been blissfully unaware that the intention
has been for them to use the /dev/zvol/* tree all along, and they will
have come to rely upon the /dev/* tree simply because it's been there
this whole time despite being a compat thing.)

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Neal Gompa <ngompa@datto.com>
Signed-off-by: Justin Gottula <justin@jgottula.com>
Closes #12303
---
 udev/rules.d/60-zvol.rules.in | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/udev/rules.d/60-zvol.rules.in b/udev/rules.d/60-zvol.rules.in
index a3c7d2acf75..9a2a473a841 100644
--- a/udev/rules.d/60-zvol.rules.in
+++ b/udev/rules.d/60-zvol.rules.in
@@ -1,6 +1,11 @@
 # Persistent links for zvol
 #
 # persistent disk links: /dev/zvol/dataset_name
-# also creates compatibility symlink of /dev/dataset_name
+#
+# NOTE: We used to also create an additional tree of zvol symlinks located at
+#       /dev/dataset_name (i.e. without the 'zvol' path component) for
+#       compatibility reasons. These are no longer created anymore, and should
+#       not be relied upon.
+#       
 
-KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM=="@udevdir@/zvol_id $devnode", SYMLINK+="zvol/%c %c"
+KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM=="@udevdir@/zvol_id $devnode", SYMLINK+="zvol/%c"

From 97752ba22a4a036f9bc54556d114c77f68796b8f Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Tue, 6 Jul 2021 17:38:00 -0400
Subject: [PATCH 14/15] Move gethrtime() calls out of vdev queue lock

This dramatically reduces the lock contention on systems with slower
(non-TSC) timecounters.  With TSC the difference is minimal, but since
this lock is pretty congested, any improvement counts.  Plus I don't
see any reason to do it under the lock other than the latency of the
lock itself, which this change actually reduces.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12281
---
 module/zfs/vdev_queue.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 198861edb81..06d22f6df4c 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -912,9 +912,9 @@ vdev_queue_io(zio_t *zio)
 	}
 
 	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+	zio->io_timestamp = gethrtime();
 
 	mutex_enter(&vq->vq_lock);
-	zio->io_timestamp = gethrtime();
 	vdev_queue_io_add(vq, zio);
 	nio = vdev_queue_io_to_issue(vq);
 	mutex_exit(&vq->vq_lock);
@@ -936,14 +936,13 @@ vdev_queue_io_done(zio_t *zio)
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 	zio_t *nio;
 
+	hrtime_t now = gethrtime();
+	vq->vq_io_complete_ts = now;
+	vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;
+
 	mutex_enter(&vq->vq_lock);
-
 	vdev_queue_pending_remove(vq, zio);
 
-	zio->io_delta = gethrtime() - zio->io_timestamp;
-	vq->vq_io_complete_ts = gethrtime();
-	vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
-
 	while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
 		mutex_exit(&vq->vq_lock);
 		if (nio->io_done == vdev_queue_agg_io_done) {

From bdd11cbb90a2afa54fd00935ac0d34b4ddf2515c Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Tue, 6 Jul 2021 20:39:23 -0400
Subject: [PATCH 15/15] FreeBSD: Hardcode abd_chunk_size to PAGE_SIZE

It makes no sense to set it below PAGE_SIZE, since it increases all
overheads and makes returning memory to OS problematic.  It makes no
sense to set it above PAGE_SIZE, since such allocations and especially
frees are too expensive and cause KVA fragmentation to benefit from
fewer chunks.  After that it makes no sense to keep more complicated
math here.

What may have sense though is just a tunable border between linear and
scatter ABDs, previously also controlled by this tunable.  Retain that
functionality by taking abd_scatter_min_size tunable from Linux, just
with different default value.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Closes #12328
---
 include/sys/abd.h              |   1 -
 module/os/freebsd/zfs/abd_os.c | 129 +++++++++++++--------------------
 2 files changed, 51 insertions(+), 79 deletions(-)

diff --git a/include/sys/abd.h b/include/sys/abd.h
index a7eee89ca91..6903e0c0e71 100644
--- a/include/sys/abd.h
+++ b/include/sys/abd.h
@@ -61,7 +61,6 @@ typedef struct abd {
 		struct abd_scatter {
 			uint_t		abd_offset;
 #if defined(__FreeBSD__) && defined(_KERNEL)
-			uint_t  abd_chunk_size;
 			void    *abd_chunks[1]; /* actually variable-length */
 #else
 			uint_t		abd_nents;
diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
index 4f5b33d9451..95a83542fad 100644
--- a/module/os/freebsd/zfs/abd_os.c
+++ b/module/os/freebsd/zfs/abd_os.c
@@ -79,22 +79,29 @@ struct {
 } abd_sums;
 
 /*
- * The size of the chunks ABD allocates. Because the sizes allocated from the
- * kmem_cache can't change, this tunable can only be modified at boot. Changing
- * it at runtime would cause ABD iteration to work incorrectly for ABDs which
- * were allocated with the old size, so a safeguard has been put in place which
- * will cause the machine to panic if you change it and try to access the data
- * within a scattered ABD.
+ * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
+ * ABD's for.  Smaller allocations will use linear ABD's which use
+ * zio_[data_]buf_alloc().
+ *
+ * Scatter ABD's use at least one page each, so sub-page allocations waste
+ * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
+ * half of each page).  Using linear ABD's for small allocations means that
+ * they will be put on slabs which contain many allocations.
+ *
+ * Linear ABDs for multi-page allocations are easier to use, and in some cases
+ * it allows to avoid buffer copying.  But allocation and especially free
+ * of multi-page linear ABDs are expensive operations due to KVA mapping and
+ * unmapping, and with time they cause KVA fragmentations.
  */
-size_t zfs_abd_chunk_size = 4096;
+size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1;
 
 #if defined(_KERNEL)
 SYSCTL_DECL(_vfs_zfs);
 
 SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
 	&zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
-	&zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
+SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN,
+	&zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations.");
 #endif
 
 kmem_cache_t *abd_chunk_cache;
@@ -102,23 +109,16 @@ static kstat_t *abd_ksp;
 
 /*
  * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks are
- * just a single zero'd sized zfs_abd_chunk_size buffer. This
- * allows us to conserve memory by only using a single zero buffer
- * for the scatter chunks.
+ * just a single zero'd page-sized buffer. This allows us to conserve
+ * memory by only using a single zero buffer for the scatter chunks.
  */
 abd_t *abd_zero_scatter = NULL;
 static char *abd_zero_buf = NULL;
 
-static void
-abd_free_chunk(void *c)
-{
-	kmem_cache_free(abd_chunk_cache, c);
-}
-
 static uint_t
 abd_chunkcnt_for_bytes(size_t size)
 {
-	return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
+	return ((size + PAGE_MASK) >> PAGE_SHIFT);
 }
 
 static inline uint_t
@@ -132,7 +132,7 @@ abd_scatter_chunkcnt(abd_t *abd)
 boolean_t
 abd_size_alloc_linear(size_t size)
 {
-	return (size <= zfs_abd_chunk_size ? B_TRUE : B_FALSE);
+	return (size < zfs_abd_scatter_min_size ? B_TRUE : B_FALSE);
 }
 
 void
@@ -140,7 +140,7 @@ abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
 {
 	uint_t n = abd_scatter_chunkcnt(abd);
 	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
-	int waste = n * zfs_abd_chunk_size - abd->abd_size;
+	int waste = (n << PAGE_SHIFT) - abd->abd_size;
 	if (op == ABDSTAT_INCR) {
 		ABDSTAT_BUMP(abdstat_scatter_cnt);
 		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
@@ -173,11 +173,11 @@ abd_verify_scatter(abd_t *abd)
 	uint_t i, n;
 
 	/*
-	 * There is no scatter linear pages in FreeBSD so there is an
-	 * if an error if the ABD has been marked as a linear page.
+	 * There is no scatter linear pages in FreeBSD so there is
+	 * an error if the ABD has been marked as a linear page.
 	 */
 	ASSERT(!abd_is_linear_page(abd));
-	ASSERT3U(ABD_SCATTER(abd).abd_offset, <, zfs_abd_chunk_size);
+	ASSERT3U(ABD_SCATTER(abd).abd_offset, <, PAGE_SIZE);
 	n = abd_scatter_chunkcnt(abd);
 	for (i = 0; i < n; i++) {
 		ASSERT3P(ABD_SCATTER(abd).abd_chunks[i], !=, NULL);
@@ -191,11 +191,9 @@ abd_alloc_chunks(abd_t *abd, size_t size)
 
 	n = abd_chunkcnt_for_bytes(size);
 	for (i = 0; i < n; i++) {
-		void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
-		ASSERT3P(c, !=, NULL);
-		ABD_SCATTER(abd).abd_chunks[i] = c;
+		ABD_SCATTER(abd).abd_chunks[i] =
+		    kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
 	}
-	ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
 }
 
 void
@@ -205,7 +203,8 @@ abd_free_chunks(abd_t *abd)
 
 	n = abd_scatter_chunkcnt(abd);
 	for (i = 0; i < n; i++) {
-		abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]);
+		kmem_cache_free(abd_chunk_cache,
+		    ABD_SCATTER(abd).abd_chunks[i]);
 	}
 }
 
@@ -250,15 +249,13 @@ abd_alloc_zero_scatter(void)
 	uint_t i, n;
 
 	n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
-	abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP);
+	abd_zero_buf = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
 	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
 
 	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
 	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
 
 	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
-	ABD_SCATTER(abd_zero_scatter).abd_chunk_size =
-	    zfs_abd_chunk_size;
 
 	for (i = 0; i < n; i++) {
 		ABD_SCATTER(abd_zero_scatter).abd_chunks[i] =
@@ -266,18 +263,18 @@ abd_alloc_zero_scatter(void)
 	}
 
 	ABDSTAT_BUMP(abdstat_scatter_cnt);
-	ABDSTAT_INCR(abdstat_scatter_data_size, zfs_abd_chunk_size);
+	ABDSTAT_INCR(abdstat_scatter_data_size, PAGE_SIZE);
 }
 
 static void
 abd_free_zero_scatter(void)
 {
 	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
-	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)zfs_abd_chunk_size);
+	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGE_SIZE);
 
 	abd_free_struct(abd_zero_scatter);
 	abd_zero_scatter = NULL;
-	kmem_free(abd_zero_buf, zfs_abd_chunk_size);
+	kmem_cache_free(abd_chunk_cache, abd_zero_buf);
 }
 
 static int
@@ -305,7 +302,7 @@ abd_kstats_update(kstat_t *ksp, int rw)
 void
 abd_init(void)
 {
-	abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+	abd_chunk_cache = kmem_cache_create("abd_chunk", PAGE_SIZE, 0,
 	    NULL, NULL, NULL, NULL, 0, KMC_NODEBUG);
 
 	wmsum_init(&abd_sums.abdstat_struct_size, 0);
@@ -382,7 +379,7 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
 
 	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
 	size_t chunkcnt = abd_chunkcnt_for_bytes(
-	    (new_offset % zfs_abd_chunk_size) + size);
+	    (new_offset & PAGE_MASK) + size);
 
 	ASSERT3U(chunkcnt, <=, abd_scatter_chunkcnt(sabd));
 
@@ -397,7 +394,7 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
 	}
 
 	if (abd == NULL)
-		abd = abd_alloc_struct(chunkcnt * zfs_abd_chunk_size);
+		abd = abd_alloc_struct(chunkcnt << PAGE_SHIFT);
 
 	/*
 	 * Even if this buf is filesystem metadata, we only track that
@@ -405,34 +402,16 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
 	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
 	 */
 
-	ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size;
-	ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
+	ABD_SCATTER(abd).abd_offset = new_offset & PAGE_MASK;
 
 	/* Copy the scatterlist starting at the correct offset */
 	(void) memcpy(&ABD_SCATTER(abd).abd_chunks,
-	    &ABD_SCATTER(sabd).abd_chunks[new_offset /
-	    zfs_abd_chunk_size],
+	    &ABD_SCATTER(sabd).abd_chunks[new_offset >> PAGE_SHIFT],
 	    chunkcnt * sizeof (void *));
 
 	return (abd);
 }
 
-static inline size_t
-abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
-{
-	ASSERT(!abd_is_linear(aiter->iter_abd));
-	return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
-	    aiter->iter_pos) % zfs_abd_chunk_size);
-}
-
-static inline size_t
-abd_iter_scatter_chunk_index(struct abd_iter *aiter)
-{
-	ASSERT(!abd_is_linear(aiter->iter_abd));
-	return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
-	    aiter->iter_pos) / zfs_abd_chunk_size);
-}
-
 /*
  * Initialize the abd_iter.
  */
@@ -483,29 +462,25 @@ void
 abd_iter_map(struct abd_iter *aiter)
 {
 	void *paddr;
-	size_t offset = 0;
 
 	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
 	ASSERT0(aiter->iter_mapsize);
 
-	/* Panic if someone has changed zfs_abd_chunk_size */
-	IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
-	    ABD_SCATTER(aiter->iter_abd).abd_chunk_size);
-
 	/* There's nothing left to iterate over, so do nothing */
 	if (abd_iter_at_end(aiter))
 		return;
 
-	if (abd_is_linear(aiter->iter_abd)) {
-		offset = aiter->iter_pos;
-		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
-		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
+	abd_t *abd = aiter->iter_abd;
+	size_t offset = aiter->iter_pos;
+	if (abd_is_linear(abd)) {
+		aiter->iter_mapsize = abd->abd_size - offset;
+		paddr = ABD_LINEAR_BUF(abd);
 	} else {
-		size_t index = abd_iter_scatter_chunk_index(aiter);
-		offset = abd_iter_scatter_chunk_offset(aiter);
-		aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
-		    aiter->iter_abd->abd_size - aiter->iter_pos);
-		paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index];
+		offset += ABD_SCATTER(abd).abd_offset;
+		paddr = ABD_SCATTER(abd).abd_chunks[offset >> PAGE_SHIFT];
+		offset &= PAGE_MASK;
+		aiter->iter_mapsize = MIN(PAGE_SIZE - offset,
+		    abd->abd_size - aiter->iter_pos);
 	}
 	aiter->iter_mapaddr = (char *)paddr + offset;
 }
@@ -517,12 +492,10 @@ abd_iter_map(struct abd_iter *aiter)
 void
 abd_iter_unmap(struct abd_iter *aiter)
 {
-	/* There's nothing left to unmap, so do nothing */
-	if (abd_iter_at_end(aiter))
-		return;
-
-	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
-	ASSERT3U(aiter->iter_mapsize, >, 0);
+	if (!abd_iter_at_end(aiter)) {
+		ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+		ASSERT3U(aiter->iter_mapsize, >, 0);
+	}
 
 	aiter->iter_mapaddr = NULL;
 	aiter->iter_mapsize = 0;