From 5b3b6e95c0f3aeea55932d91f469e8edd3c9cd0f Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 12 May 2023 09:07:58 -0700
Subject: [PATCH 01/11] ZTS: Add auto_replace_001_pos to exceptions

The auto_replace_001_pos test case does not reliably pass on
Fedora 37 and newer.  Until the test case can be updated to make
it reliable, add it to the list of "maybe" exceptions on Linux.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #14851
Closes #14852
---
 tests/test-runner/bin/zts-report.py.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 63470bc041c..3f7498f5c6b 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -264,6 +264,7 @@ elif sys.platform.startswith('linux'):
         'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
         'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason],
         'fault/auto_online_002_pos': ['FAIL', 11889],
+        'fault/auto_replace_001_pos': ['FAIL', 14851],
         'fault/auto_spare_002_pos': ['FAIL', 11889],
         'fault/auto_spare_multiple': ['FAIL', 11889],
         'fault/auto_spare_shared': ['FAIL', 11889],

From da211a4a337cce2917fa597d6930cff75f6cca2e Mon Sep 17 00:00:00 2001
From: Don Brady <dev.fs.zfs@gmail.com>
Date: Fri, 12 May 2023 10:12:28 -0600
Subject: [PATCH 02/11] Refine special_small_blocks property validation

When the special_small_blocks property is being set during a pool
create it enforces a limit of 128KiB even if the pool's record size
is larger.

If the recordsize property is being set during a pool create, then
use that value instead of the default SPA_OLD_MAXBLOCKSIZE value.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Don Brady <dev.fs.zfs@gmail.com>
Closes #13815
Closes #14811
---
 lib/libzfs/libzfs_dataset.c                   |  8 +++-
 tests/runfiles/common.run                     |  2 +-
 tests/zfs-tests/tests/Makefile.am             |  2 +
 .../alloc_class/alloc_class_014_neg.ksh       | 38 ++++++++++++++++
 .../alloc_class/alloc_class_015_pos.ksh       | 45 +++++++++++++++++++
 5 files changed, 93 insertions(+), 2 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh

diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index 138eca19acc..fe9f3268d33 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -1034,6 +1034,7 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
 	nvlist_t *ret;
 	int chosen_normal = -1;
 	int chosen_utf = -1;
+	int set_maxbs = 0;
 
 	if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) {
 		(void) no_memory(hdl);
@@ -1252,12 +1253,17 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
+			/* save the ZFS_PROP_RECORDSIZE during create op */
+			if (zpool_hdl == NULL && prop == ZFS_PROP_RECORDSIZE) {
+				set_maxbs = intval;
+			}
 			break;
 		}
 
 		case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
 		{
-			int maxbs = SPA_OLD_MAXBLOCKSIZE;
+			int maxbs =
+			    set_maxbs == 0 ? SPA_OLD_MAXBLOCKSIZE : set_maxbs;
 			char buf[64];
 
 			if (zpool_hdl != NULL) {
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index e2137ac596d..1665e20e0e3 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -37,7 +37,7 @@ tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos',
     'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos',
     'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos',
     'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos',
-    'alloc_class_013_pos']
+    'alloc_class_013_pos', 'alloc_class_014_neg', 'alloc_class_015_pos']
 tags = ['functional', 'alloc_class']
 
 [tests/functional/append]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 9299a4ca9b4..a4932fc988a 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -415,6 +415,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/alloc_class/alloc_class_011_neg.ksh \
 	functional/alloc_class/alloc_class_012_pos.ksh \
 	functional/alloc_class/alloc_class_013_pos.ksh \
+	functional/alloc_class/alloc_class_014_neg.ksh \
+	functional/alloc_class/alloc_class_015_pos.ksh \
 	functional/alloc_class/cleanup.ksh \
 	functional/alloc_class/setup.ksh \
 	functional/append/file_append.ksh \
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh
new file mode 100755
index 00000000000..1b52014fd2d
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh
@@ -0,0 +1,38 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+#	Setting the special_small_blocks property greater than recordsize fails.
+#
+
+verify_runnable "global"
+
+claim="Setting the special_small_blocks property greater than recordsize fails"
+
+log_assert $claim
+log_onexit cleanup
+log_must disk_setup
+
+for size in 512 4096 32768 131072 524288 1048576
+do
+	let bigger=$size*2
+	log_mustnot zpool create -O recordsize=$size \
+		-O special_small_blocks=$bigger \
+		$TESTPOOL raidz $ZPOOL_DISKS special mirror \
+		$CLASS_DISK0 $CLASS_DISK1
+done
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh
new file mode 100755
index 00000000000..49c468af670
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh
@@ -0,0 +1,45 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# 	Can set special_small_blocks property less than or equal to recordsize.
+#
+
+verify_runnable "global"
+
+claim="Can set special_small_blocks property less than or equal to recordsize"
+
+log_assert $claim
+log_onexit cleanup
+log_must disk_setup
+
+for size in 8192 32768 131072 524288 1048576
+do
+	let smaller=$size/2
+	log_must zpool create -O recordsize=$size \
+		-O special_small_blocks=$smaller \
+		$TESTPOOL raidz $ZPOOL_DISKS special mirror \
+		$CLASS_DISK0 $CLASS_DISK1
+	log_must zpool destroy -f "$TESTPOOL"
+
+	log_must zpool create -O recordsize=$size \
+		-O special_small_blocks=$size \
+		$TESTPOOL raidz $ZPOOL_DISKS special mirror \
+		$CLASS_DISK0 $CLASS_DISK1
+	log_must zpool destroy -f "$TESTPOOL"
+done
+
+log_pass $claim

From 895e03135e4251be0872d96ce38f387bdc13faa2 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 12 May 2023 12:14:29 -0400
Subject: [PATCH 03/11] zil: Some micro-optimizations.

Should not cause functional changes.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:  Alexander Motin <mav@FreeBSD.org>
Sponsored by:   iXsystems, Inc.
Closes #14854
---
 module/zfs/zil.c | 75 +++++++++++++++---------------------------------
 1 file changed, 23 insertions(+), 52 deletions(-)

diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index c37da89dd43..81e1c3be108 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -146,9 +146,6 @@ static uint64_t zil_slog_bulk = 768 * 1024;
 static kmem_cache_t *zil_lwb_cache;
 static kmem_cache_t *zil_zcw_cache;
 
-#define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
-    sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
-
 static int
 zil_bp_compare(const void *x1, const void *x2)
 {
@@ -769,11 +766,6 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg,
 	list_insert_tail(&zilog->zl_lwb_list, lwb);
 	mutex_exit(&zilog->zl_lock);
 
-	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
-	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
-	VERIFY(list_is_empty(&lwb->lwb_waiters));
-	VERIFY(list_is_empty(&lwb->lwb_itxs));
-
 	return (lwb);
 }
 
@@ -782,8 +774,8 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
-	VERIFY(list_is_empty(&lwb->lwb_waiters));
-	VERIFY(list_is_empty(&lwb->lwb_itxs));
+	ASSERT(list_is_empty(&lwb->lwb_waiters));
+	ASSERT(list_is_empty(&lwb->lwb_itxs));
 	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
 	ASSERT3P(lwb->lwb_root_zio, ==, NULL);
@@ -1026,12 +1018,10 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
 	if (!list_is_empty(&zilog->zl_lwb_list)) {
 		ASSERT(zh->zh_claim_txg == 0);
 		VERIFY(!keep_first);
-		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+		while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) {
 			if (lwb->lwb_fastwrite)
 				metaslab_fastwrite_unmark(zilog->zl_spa,
 				    &lwb->lwb_blk);
-
-			list_remove(&zilog->zl_lwb_list, lwb);
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 			zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
@@ -1387,6 +1377,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
 
 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+	hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp;
 
 	mutex_enter(&zilog->zl_lock);
 
@@ -1399,9 +1390,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 	 */
 	lwb->lwb_buf = NULL;
 
-	ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
-	zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 3 +
-	    gethrtime() - lwb->lwb_issued_timestamp) / 4;
+	zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8;
 
 	lwb->lwb_root_zio = NULL;
 
@@ -1418,17 +1407,12 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
 	}
 
-	while ((itx = list_head(&lwb->lwb_itxs)) != NULL) {
-		list_remove(&lwb->lwb_itxs, itx);
+	while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
 		zil_itx_destroy(itx);
-	}
 
-	while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) {
+	while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
 		mutex_enter(&zcw->zcw_lock);
 
-		ASSERT(list_link_active(&zcw->zcw_node));
-		list_remove(&lwb->lwb_waiters, zcw);
-
 		ASSERT3P(zcw->zcw_lwb, ==, lwb);
 		zcw->zcw_lwb = NULL;
 		/*
@@ -1581,7 +1565,7 @@ zil_lwb_write_done(zio_t *zio)
 	 * write and/or fsync activity, as it has the potential to
 	 * coalesce multiple flush commands to a vdev into one.
 	 */
-	if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) {
+	if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) {
 		zil_lwb_flush_defer(lwb, nlwb);
 		ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 		return;
@@ -1589,7 +1573,7 @@ zil_lwb_write_done(zio_t *zio)
 
 	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
 		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
-		if (vd != NULL) {
+		if (vd != NULL && !vd->vdev_nowritecache) {
 			/*
 			 * The "ZIO_FLAG_DONT_PROPAGATE" is currently
 			 * always used within "zio_flush". This means,
@@ -1980,8 +1964,6 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 	zilog->zl_cur_used += (reclen + dlen);
 	txg = lrc->lrc_txg;
 
-	ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen));
-
 cont:
 	/*
 	 * If this record won't fit in the current log block, start a new one.
@@ -1997,7 +1979,6 @@ cont:
 		if (lwb == NULL)
 			return (NULL);
 		zil_lwb_write_open(zilog, lwb);
-		ASSERT(LWB_EMPTY(lwb));
 		lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
 
 		/*
@@ -2184,7 +2165,7 @@ zil_itxg_clean(void *arg)
 	itx_async_node_t *ian;
 
 	list = &itxs->i_sync_list;
-	while ((itx = list_head(list)) != NULL) {
+	while ((itx = list_remove_head(list)) != NULL) {
 		/*
 		 * In the general case, commit itxs will not be found
 		 * here, as they'll be committed to an lwb via
@@ -2207,7 +2188,6 @@ zil_itxg_clean(void *arg)
 		if (itx->itx_lr.lrc_txtype == TX_COMMIT)
 			zil_commit_waiter_skip(itx->itx_private);
 
-		list_remove(list, itx);
 		zil_itx_destroy(itx);
 	}
 
@@ -2215,8 +2195,7 @@ zil_itxg_clean(void *arg)
 	t = &itxs->i_async_tree;
 	while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
 		list = &ian->ia_list;
-		while ((itx = list_head(list)) != NULL) {
-			list_remove(list, itx);
+		while ((itx = list_remove_head(list)) != NULL) {
 			/* commit itxs should never be on the async lists. */
 			ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 			zil_itx_destroy(itx);
@@ -2277,8 +2256,7 @@ zil_remove_async(zilog_t *zilog, uint64_t oid)
 			list_move_tail(&clean_list, &ian->ia_list);
 		mutex_exit(&itxg->itxg_lock);
 	}
-	while ((itx = list_head(&clean_list)) != NULL) {
-		list_remove(&clean_list, itx);
+	while ((itx = list_remove_head(&clean_list)) != NULL) {
 		/* commit itxs should never be on the async lists. */
 		ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 		zil_itx_destroy(itx);
@@ -2580,7 +2558,7 @@ zil_commit_writer_stall(zilog_t *zilog)
 	 */
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
-	ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 }
 
 /*
@@ -2605,7 +2583,7 @@ zil_process_commit_list(zilog_t *zilog)
 	 * Return if there's nothing to commit before we dirty the fs by
 	 * calling zil_create().
 	 */
-	if (list_head(&zilog->zl_itx_commit_list) == NULL)
+	if (list_is_empty(&zilog->zl_itx_commit_list))
 		return;
 
 	list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
@@ -2629,7 +2607,7 @@ zil_process_commit_list(zilog_t *zilog)
 		    plwb->lwb_state == LWB_STATE_FLUSH_DONE);
 	}
 
-	while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
+	while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
 		lr_t *lrc = &itx->itx_lr;
 		uint64_t txg = lrc->lrc_txg;
 
@@ -2643,8 +2621,6 @@ zil_process_commit_list(zilog_t *zilog)
 			    zilog_t *, zilog, itx_t *, itx);
 		}
 
-		list_remove(&zilog->zl_itx_commit_list, itx);
-
 		boolean_t synced = txg <= spa_last_synced_txg(spa);
 		boolean_t frozen = txg > spa_freeze_txg(spa);
 
@@ -2730,20 +2706,16 @@ zil_process_commit_list(zilog_t *zilog)
 		 * normal.
 		 */
 		zil_commit_waiter_t *zcw;
-		while ((zcw = list_head(&nolwb_waiters)) != NULL) {
+		while ((zcw = list_remove_head(&nolwb_waiters)) != NULL)
 			zil_commit_waiter_skip(zcw);
-			list_remove(&nolwb_waiters, zcw);
-		}
 
 		/*
 		 * And finally, we have to destroy the itx's that
 		 * couldn't be committed to an lwb; this will also call
 		 * the itx's callback if one exists for the itx.
 		 */
-		while ((itx = list_head(&nolwb_itxs)) != NULL) {
-			list_remove(&nolwb_itxs, itx);
+		while ((itx = list_remove_head(&nolwb_itxs)) != NULL)
 			zil_itx_destroy(itx);
-		}
 	} else {
 		ASSERT(list_is_empty(&nolwb_waiters));
 		ASSERT3P(lwb, !=, NULL);
@@ -2951,7 +2923,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 	 */
 	lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
 
-	IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED);
+	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
 
 	/*
 	 * Since the lwb's zio hadn't been issued by the time this thread
@@ -3429,7 +3401,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 		blkptr_t blk = zh->zh_log;
 		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
-		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
+		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 		memset(zh, 0, sizeof (zil_header_t));
 		memset(zilog->zl_replayed_seq, 0,
@@ -3473,7 +3445,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 		 * out the zil_header blkptr so that we don't end
 		 * up freeing the same block twice.
 		 */
-		if (list_head(&zilog->zl_lwb_list) == NULL)
+		if (list_is_empty(&zilog->zl_lwb_list))
 			BP_ZERO(&zh->zh_log);
 	}
 
@@ -3674,7 +3646,7 @@ zil_close(zilog_t *zilog)
 	if (!dmu_objset_is_snapshot(zilog->zl_os)) {
 		zil_commit(zilog, 0);
 	} else {
-		ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT0(zilog->zl_dirty_max_txg);
 		ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
 	}
@@ -3716,15 +3688,14 @@ zil_close(zilog_t *zilog)
 	 * We should have only one lwb left on the list; remove it now.
 	 */
 	mutex_enter(&zilog->zl_lock);
-	lwb = list_head(&zilog->zl_lwb_list);
+	lwb = list_remove_head(&zilog->zl_lwb_list);
 	if (lwb != NULL) {
-		ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list));
+		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
 
 		if (lwb->lwb_fastwrite)
 			metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
 
-		list_remove(&zilog->zl_lwb_list, lwb);
 		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 		zil_free_lwb(zilog, lwb);
 	}

From 7381ddf1abd16152646c921384c094ffbcae2271 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 12 May 2023 12:49:26 -0400
Subject: [PATCH 04/11] zil: Free lwb_buf after write completion.

There is no sense in keeping that memory allocated during the flush.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Signed-off-by:  Alexander Motin <mav@FreeBSD.org>
Sponsored by:   iXsystems, Inc.
Closes #14855
---
 module/zfs/zil.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 81e1c3be108..d887e4900d1 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1376,20 +1376,10 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 
 	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
 
-	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 	hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp;
 
 	mutex_enter(&zilog->zl_lock);
 
-	/*
-	 * If we have had an allocation failure and the txg is
-	 * waiting to sync then we want zil_sync() to remove the lwb so
-	 * that it's not picked up as the next new one in
-	 * zil_process_commit_list(). zil_sync() will only remove the
-	 * lwb if lwb_buf is null.
-	 */
-	lwb->lwb_buf = NULL;
-
 	zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8;
 
 	lwb->lwb_root_zio = NULL;
@@ -1475,7 +1465,8 @@ zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg)
 			IMPLY(lwb->lwb_issued_txg > 0,
 			    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 		}
-		IMPLY(lwb->lwb_state == LWB_STATE_FLUSH_DONE,
+		IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+		    lwb->lwb_state == LWB_STATE_FLUSH_DONE,
 		    lwb->lwb_buf == NULL);
 		lwb = list_next(&zilog->zl_lwb_list, lwb);
 	}
@@ -1519,6 +1510,8 @@ zil_lwb_write_done(zio_t *zio)
 	ASSERT(BP_GET_FILL(zio->io_bp) == 0);
 
 	abd_free(zio->io_abd);
+	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+	lwb->lwb_buf = NULL;
 
 	mutex_enter(&zilog->zl_lock);
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
@@ -3433,7 +3426,8 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 
 	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
 		zh->zh_log = lwb->lwb_blk;
-		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
+		if (lwb->lwb_state != LWB_STATE_FLUSH_DONE ||
+		    lwb->lwb_max_txg > txg)
 			break;
 		list_remove(&zilog->zl_lwb_list, lwb);
 		zio_free(spa, txg, &lwb->lwb_blk);

From c87798d8ff6a63158e80acbbce8b034518a1656e Mon Sep 17 00:00:00 2001
From: Richard Yao <richard.yao@alumni.stonybrook.edu>
Date: Fri, 12 May 2023 16:47:56 -0400
Subject: [PATCH 05/11] Fix use after free regression in
 spa_remove_healed_errors()

6839ec6f1098c28ff7b772f1b31b832d05e6b567 placed code in
spa_remove_healed_errors() that uses a pointer after the kmem_free()
call that frees it.

Reported-by: Coverity (CID-1562375)
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Amanakis <gamanakis@gmail.com>
Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Closes #14860
---
 module/zfs/spa_errlog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c
index 31719063a22..5fe35278683 100644
--- a/module/zfs/spa_errlog.c
+++ b/module/zfs/spa_errlog.c
@@ -683,7 +683,6 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx)
 	    &cookie)) != NULL) {
 		remove_error_from_list(spa, s, &se->se_bookmark);
 		remove_error_from_list(spa, l, &se->se_bookmark);
-		kmem_free(se, sizeof (spa_error_entry_t));
 
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 			bookmark_to_name(&se->se_bookmark, name, sizeof (name));
@@ -713,6 +712,7 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx)
 			}
 			zap_cursor_fini(&zc);
 		}
+		kmem_free(se, sizeof (spa_error_entry_t));
 	}
 }
 

From ee7b71dbc919439b1db6352bcd95f121127b42dd Mon Sep 17 00:00:00 2001
From: Richard Yao <richard.yao@alumni.stonybrook.edu>
Date: Fri, 12 May 2023 17:10:14 -0400
Subject: [PATCH 06/11] Fix undefined behavior in spa_sync_props()

8eae2d214cfa53862833eeeda9a5c1e9d5ded47d caused Coverity to begin
complaining about "Improper use of negative value" in two places in
spa_sync_props() because Coverity correctly inferred from `prop ==
ZPOOL_PROP_INVAL` that prop could be -1 while both zpool_prop_to_name()
and zpool_prop_get_type() use it as an array index, which is undefined
behavior.

Assuming that the system does not panic from an attempt to read invalid
memory, the case statement for ZPOOL_PROP_INVAL will ensure that only
user properties will reach this code when prop is ZPOOL_PROP_INVAL, such
that execution will continue safely. However, if we are unlucky enough
to read invalid memory, then the system will panic.

This issue predates the patch that caused coverity to begin complaining.
Thankfully, our userland tools do not pass nonsense to us, so this bug
should not be triggered unless a future userland tool attempts to set a
property that we do not understand.

Reported-by: Coverity (CID-1561129)
Reported-by: Coverity (CID-1561130)
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Amanakis <gamanakis@gmail.com>
Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Closes #14860
---
 module/zfs/spa.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 16396170273..1ca114783ce 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -8942,12 +8942,12 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
 			}
 
 			/* normalize the property name */
-			propname = zpool_prop_to_name(prop);
-			proptype = zpool_prop_get_type(prop);
-			if (prop == ZPOOL_PROP_INVAL &&
-			    zfs_prop_user(elemname)) {
+			if (prop == ZPOOL_PROP_INVAL) {
 				propname = elemname;
 				proptype = PROP_TYPE_STRING;
+			} else {
+				propname = zpool_prop_to_name(prop);
+				proptype = zpool_prop_get_type(prop);
 			}
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {

From e0d5007bcf7e4425d43ba2ad56489c7db5c4a4c5 Mon Sep 17 00:00:00 2001
From: Antonio Russo <aerusso@aerusso.net>
Date: Mon, 15 May 2023 17:11:33 -0600
Subject: [PATCH 07/11] test-runner: pass kmemleak and kmsg to Cmd.run

test-runner.py orchestrates all of the ZTS executions. The `Cmd` object
manages these processes, and its `run` method specifically invokes these
possibly long-running processes, possibly retrying in the event of a
timeout. Since its inception, memory leak detection using the kmemleak
infrastructure [1], and kernel logging [2] have been added to this run
mechanism.

However, the callback to cull a process beyond its timeout threshold,
`kill_cmd`, has evaded modernization by both of these changes. As a
result, this function fails to properly invoke `run`, leading to an
untrapped exception and unreported test failure.

This patch extends `kill_cmd` to receive these kernel devices through
the `options` parameter, and regularizes all the `.run` calls from
`Cmd`, and its subclasses, to accept that parameter.

[1] Commit a69765ea5b563e0cd4d15fac4b1ac08c6ccf12d1
[2] Commit fc2c0256c55a2859d1988671b0896d22b75c8aba

Reviewed-by: John Wren Kennedy <john.kennedy@delphix.com>
Signed-off-by: Antonio Russo <aerusso@aerusso.net>
Closes #14849
---
 tests/test-runner/bin/test-runner.py.in | 38 ++++++++++++++-----------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in
index c454bf8d7c6..422ebd7bc8b 100755
--- a/tests/test-runner/bin/test-runner.py.in
+++ b/tests/test-runner/bin/test-runner.py.in
@@ -181,7 +181,7 @@ Timeout: %d
 User: %s
 ''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user)
 
-    def kill_cmd(self, proc, keyboard_interrupt=False):
+    def kill_cmd(self, proc, options, kmemleak, keyboard_interrupt=False):
         """
         Kill a running command due to timeout, or ^C from the keyboard. If
         sudo is required, this user was verified previously.
@@ -211,7 +211,7 @@ User: %s
             if int(self.timeout) > runtime:
                 self.killed = False
                 self.reran = False
-                self.run(False)
+                self.run(options, dryrun=False, kmemleak=kmemleak)
                 self.reran = True
 
     def update_cmd_privs(self, cmd, user):
@@ -257,15 +257,19 @@ User: %s
 
         return out.lines, err.lines
 
-    def run(self, dryrun, kmemleak, kmsg):
+    def run(self, options, dryrun=None, kmemleak=None):
         """
         This is the main function that runs each individual test.
         Determine whether or not the command requires sudo, and modify it
         if needed. Run the command, and update the result object.
         """
+        if dryrun is None:
+            dryrun = options.dryrun
         if dryrun is True:
             print(self)
             return
+        if kmemleak is None:
+            kmemleak = options.kmemleak
 
         privcmd = self.update_cmd_privs(self.pathname, self.user)
         try:
@@ -280,7 +284,7 @@ User: %s
         Log each test we run to /dev/kmsg (on Linux), so if there's a kernel
         warning we'll be able to match it up to a particular test.
         """
-        if kmsg is True and exists("/dev/kmsg"):
+        if options.kmsg is True and exists("/dev/kmsg"):
             try:
                 kp = Popen([SUDO, "sh", "-c",
                             f"echo ZTS run {self.pathname} > /dev/kmsg"])
@@ -298,7 +302,9 @@ User: %s
         # Allow a special timeout value of 0 to mean infinity
         if int(self.timeout) == 0:
             self.timeout = sys.maxsize / (10 ** 9)
-        t = Timer(int(self.timeout), self.kill_cmd, [proc])
+        t = Timer(
+            int(self.timeout), self.kill_cmd, [proc, options, kmemleak]
+        )
 
         try:
             t.start()
@@ -310,7 +316,7 @@ User: %s
                 cmd = f'{SUDO} cat {KMEMLEAK_FILE}'
                 self.result.kmemleak = check_output(cmd, shell=True)
         except KeyboardInterrupt:
-            self.kill_cmd(proc, True)
+            self.kill_cmd(proc, options, kmemleak, True)
             fail('\nRun terminated at user request.')
         finally:
             t.cancel()
@@ -450,7 +456,7 @@ Tags: %s
 
         return True
 
-    def run(self, options):
+    def run(self, options, dryrun=None, kmemleak=None):
         """
         Create Cmd instances for the pre/post/failsafe scripts. If the pre
         script doesn't pass, skip this Test. Run the post script regardless.
@@ -472,14 +478,14 @@ Tags: %s
 
         cont = True
         if len(pretest.pathname):
-            pretest.run(options.dryrun, False, options.kmsg)
+            pretest.run(options, kmemleak=False)
             cont = pretest.result.result == 'PASS'
             pretest.log(options)
 
         if cont:
-            test.run(options.dryrun, options.kmemleak, options.kmsg)
+            test.run(options, kmemleak=kmemleak)
             if test.result.result == 'KILLED' and len(failsafe.pathname):
-                failsafe.run(options.dryrun, False, options.kmsg)
+                failsafe.run(options, kmemleak=False)
                 failsafe.log(options, suppress_console=True)
         else:
             test.skip()
@@ -487,7 +493,7 @@ Tags: %s
         test.log(options)
 
         if len(posttest.pathname):
-            posttest.run(options.dryrun, False, options.kmsg)
+            posttest.run(options, kmemleak=False)
             posttest.log(options)
 
 
@@ -571,7 +577,7 @@ Tags: %s
 
         return len(self.tests) != 0
 
-    def run(self, options):
+    def run(self, options, dryrun=None, kmemleak=None):
         """
         Create Cmd instances for the pre/post/failsafe scripts. If the pre
         script doesn't pass, skip all the tests in this TestGroup. Run the
@@ -590,7 +596,7 @@ Tags: %s
 
         cont = True
         if len(pretest.pathname):
-            pretest.run(options.dryrun, False, options.kmsg)
+            pretest.run(options, dryrun=dryrun, kmemleak=False)
             cont = pretest.result.result == 'PASS'
             pretest.log(options)
 
@@ -603,9 +609,9 @@ Tags: %s
             failsafe = Cmd(self.failsafe, outputdir=odir, timeout=self.timeout,
                            user=self.failsafe_user, identifier=self.identifier)
             if cont:
-                test.run(options.dryrun, options.kmemleak, options.kmsg)
+                test.run(options, dryrun=dryrun, kmemleak=kmemleak)
                 if test.result.result == 'KILLED' and len(failsafe.pathname):
-                    failsafe.run(options.dryrun, False, options.kmsg)
+                    failsafe.run(options, dryrun=dryrun, kmemleak=False)
                     failsafe.log(options, suppress_console=True)
             else:
                 test.skip()
@@ -613,7 +619,7 @@ Tags: %s
             test.log(options)
 
         if len(posttest.pathname):
-            posttest.run(options.dryrun, False, options.kmsg)
+            posttest.run(options, dryrun=dryrun, kmemleak=False)
             posttest.log(options)
 
 

From e34e15ed6d1882d29e314321b7642305d99f1b78 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 18 May 2023 10:02:20 -0700
Subject: [PATCH 08/11] Add the ability to uninitialize

zpool initialize functions well for touching every free byte...once.
But if we want to do it again, we're currently out of luck.

So let's add zpool initialize -u to clear it.

Co-authored-by: Rich Ercolani <rincebrain@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rich Ercolani <rincebrain@gmail.com>
Closes #12451
Closes #14873
---
 cmd/zpool/zpool_main.c                        |  22 ++-
 include/sys/fs/zfs.h                          |   1 +
 include/sys/vdev_initialize.h                 |   1 +
 lib/libzfs/libzfs.abi                         |   3 +-
 lib/libzfs/libzfs_pool.c                      |  15 +-
 lib/libzfs_core/libzfs_core.abi               |   3 +-
 man/man8/zpool-initialize.8                   |  10 +-
 module/zfs/spa.c                              |   7 +
 module/zfs/vdev_initialize.c                  |  66 +++++++-
 module/zfs/zfs_ioctl.c                        |   3 +-
 tests/runfiles/common.run                     |   1 +
 tests/zfs-tests/tests/Makefile.am             |   1 +
 .../zpool_initialize_uninit.ksh               | 141 ++++++++++++++++++
 13 files changed, 258 insertions(+), 16 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 301c5f4bfc6..3e08e031414 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -398,7 +398,7 @@ get_usage(zpool_help_t idx)
 	case HELP_REOPEN:
 		return (gettext("\treopen [-n] <pool>\n"));
 	case HELP_INITIALIZE:
-		return (gettext("\tinitialize [-c | -s] [-w] <pool> "
+		return (gettext("\tinitialize [-c | -s | -u] [-w] <pool> "
 		    "[<device> ...]\n"));
 	case HELP_SCRUB:
 		return (gettext("\tscrub [-s | -p] [-w] <pool> ...\n"));
@@ -585,12 +585,13 @@ usage(boolean_t requested)
 }
 
 /*
- * zpool initialize [-c | -s] [-w] <pool> [<vdev> ...]
+ * zpool initialize [-c | -s | -u] [-w] <pool> [<vdev> ...]
  * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool
  * if none specified.
  *
  *	-c	Cancel. Ends active initializing.
  *	-s	Suspend. Initializing can then be restarted with no flags.
+ *	-u	Uninitialize. Clears initialization state.
  *	-w	Wait. Blocks until initializing has completed.
  */
 int
@@ -606,12 +607,14 @@ zpool_do_initialize(int argc, char **argv)
 	struct option long_options[] = {
 		{"cancel",	no_argument,		NULL, 'c'},
 		{"suspend",	no_argument,		NULL, 's'},
+		{"uninit",	no_argument,		NULL, 'u'},
 		{"wait",	no_argument,		NULL, 'w'},
 		{0, 0, 0, 0}
 	};
 
 	pool_initialize_func_t cmd_type = POOL_INITIALIZE_START;
-	while ((c = getopt_long(argc, argv, "csw", long_options, NULL)) != -1) {
+	while ((c = getopt_long(argc, argv, "csuw", long_options,
+	    NULL)) != -1) {
 		switch (c) {
 		case 'c':
 			if (cmd_type != POOL_INITIALIZE_START &&
@@ -631,6 +634,15 @@ zpool_do_initialize(int argc, char **argv)
 			}
 			cmd_type = POOL_INITIALIZE_SUSPEND;
 			break;
+		case 'u':
+			if (cmd_type != POOL_INITIALIZE_START &&
+			    cmd_type != POOL_INITIALIZE_UNINIT) {
+				(void) fprintf(stderr, gettext("-u cannot be "
+				    "combined with other options\n"));
+				usage(B_FALSE);
+			}
+			cmd_type = POOL_INITIALIZE_UNINIT;
+			break;
 		case 'w':
 			wait = B_TRUE;
 			break;
@@ -657,8 +669,8 @@ zpool_do_initialize(int argc, char **argv)
 	}
 
 	if (wait && (cmd_type != POOL_INITIALIZE_START)) {
-		(void) fprintf(stderr, gettext("-w cannot be used with -c or "
-		    "-s\n"));
+		(void) fprintf(stderr, gettext("-w cannot be used with -c, -s "
+		    "or -u\n"));
 		usage(B_FALSE);
 	}
 
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 0734ff12280..4c2097fb830 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -1265,6 +1265,7 @@ typedef enum pool_initialize_func {
 	POOL_INITIALIZE_START,
 	POOL_INITIALIZE_CANCEL,
 	POOL_INITIALIZE_SUSPEND,
+	POOL_INITIALIZE_UNINIT,
 	POOL_INITIALIZE_FUNCS
 } pool_initialize_func_t;
 
diff --git a/include/sys/vdev_initialize.h b/include/sys/vdev_initialize.h
index 4e63f063cb6..78702b7325a 100644
--- a/include/sys/vdev_initialize.h
+++ b/include/sys/vdev_initialize.h
@@ -33,6 +33,7 @@ extern "C" {
 #endif
 
 extern void vdev_initialize(vdev_t *vd);
+extern void vdev_uninitialize(vdev_t *vd);
 extern void vdev_initialize_stop(vdev_t *vd,
     vdev_initializing_state_t tgt_state, list_t *vd_list);
 extern void vdev_initialize_stop_all(vdev_t *vd,
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 732863dcffc..57b096ca6e9 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -5741,7 +5741,8 @@
       <enumerator name='POOL_INITIALIZE_START' value='0'/>
       <enumerator name='POOL_INITIALIZE_CANCEL' value='1'/>
       <enumerator name='POOL_INITIALIZE_SUSPEND' value='2'/>
-      <enumerator name='POOL_INITIALIZE_FUNCS' value='3'/>
+      <enumerator name='POOL_INITIALIZE_UNINIT' value='3'/>
+      <enumerator name='POOL_INITIALIZE_FUNCS' value='4'/>
     </enum-decl>
     <typedef-decl name='pool_initialize_func_t' type-id='5c246ad4' id='7063e1ab'/>
     <enum-decl name='pool_trim_func' id='54ed608a'>
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 4fb71b4e0dc..a71cb24736a 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -2387,8 +2387,8 @@ xlate_init_err(int err)
 }
 
 /*
- * Begin, suspend, or cancel the initialization (initializing of all free
- * blocks) for the given vdevs in the given pool.
+ * Begin, suspend, cancel, or uninit (clear) the initialization (initializing
+ * of all free blocks) for the given vdevs in the given pool.
  */
 static int
 zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
@@ -2414,11 +2414,16 @@ zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
 	    vdev_guids, &errlist);
 
 	if (err != 0) {
-		if (errlist != NULL) {
-			vd_errlist = fnvlist_lookup_nvlist(errlist,
-			    ZPOOL_INITIALIZE_VDEVS);
+		if (errlist != NULL && nvlist_lookup_nvlist(errlist,
+		    ZPOOL_INITIALIZE_VDEVS, &vd_errlist) == 0) {
 			goto list_errors;
 		}
+
+		if (err == EINVAL && cmd_type == POOL_INITIALIZE_UNINIT) {
+			zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN,
+			    "uninitialize is not supported by kernel"));
+		}
+
 		(void) zpool_standard_error(zhp->zpool_hdl, err,
 		    dgettext(TEXT_DOMAIN, "operation failed"));
 		goto out;
diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi
index ec94a465055..33d794e3f80 100644
--- a/lib/libzfs_core/libzfs_core.abi
+++ b/lib/libzfs_core/libzfs_core.abi
@@ -1249,7 +1249,8 @@
       <enumerator name='POOL_INITIALIZE_START' value='0'/>
       <enumerator name='POOL_INITIALIZE_CANCEL' value='1'/>
       <enumerator name='POOL_INITIALIZE_SUSPEND' value='2'/>
-      <enumerator name='POOL_INITIALIZE_FUNCS' value='3'/>
+      <enumerator name='POOL_INITIALIZE_UNINIT' value='3'/>
+      <enumerator name='POOL_INITIALIZE_FUNCS' value='4'/>
     </enum-decl>
     <typedef-decl name='pool_initialize_func_t' type-id='5c246ad4' id='7063e1ab'/>
     <enum-decl name='pool_trim_func' id='54ed608a'>
diff --git a/man/man8/zpool-initialize.8 b/man/man8/zpool-initialize.8
index eae711bff42..a9c8fd35aec 100644
--- a/man/man8/zpool-initialize.8
+++ b/man/man8/zpool-initialize.8
@@ -36,7 +36,7 @@
 .Sh SYNOPSIS
 .Nm zpool
 .Cm initialize
-.Op Fl c Ns | Ns Fl s
+.Op Fl c Ns | Ns Fl s Ns | Ns Fl u
 .Op Fl w
 .Ar pool
 .Oo Ar device Oc Ns …
@@ -60,6 +60,14 @@ initialized, the command will fail and no suspension will occur on any device.
 Initializing can then be resumed by running
 .Nm zpool Cm initialize
 with no flags on the relevant target devices.
+.It Fl u , -uninit
+Clears the initialization state on the specified devices, or all eligible
+devices if none are specified.
+If the devices are being actively initialized the command will fail.
+After being cleared
+.Nm zpool Cm initialize
+with no flags can be used to re-initialize all unallocated regions on
+the relevant target devices.
 .It Fl w , -wait
 Wait until the devices have finished initializing before returning.
 .El
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 1ca114783ce..51d6de9105f 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -7421,6 +7421,10 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
 	    vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
 		mutex_exit(&vd->vdev_initialize_lock);
 		return (SET_ERROR(ESRCH));
+	} else if (cmd_type == POOL_INITIALIZE_UNINIT &&
+	    vd->vdev_initialize_thread != NULL) {
+		mutex_exit(&vd->vdev_initialize_lock);
+		return (SET_ERROR(EBUSY));
 	}
 
 	switch (cmd_type) {
@@ -7433,6 +7437,9 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
 	case POOL_INITIALIZE_SUSPEND:
 		vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
 		break;
+	case POOL_INITIALIZE_UNINIT:
+		vdev_uninitialize(vd);
+		break;
 	default:
 		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
 	}
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
index 75beb0cc3d1..ffdcef1972c 100644
--- a/module/zfs/vdev_initialize.c
+++ b/module/zfs/vdev_initialize.c
@@ -96,6 +96,39 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
 	    &initialize_state, tx));
 }
 
+static void
+vdev_initialize_zap_remove_sync(void *arg, dmu_tx_t *tx)
+{
+	uint64_t guid = *(uint64_t *)arg;
+
+	kmem_free(arg, sizeof (uint64_t));
+
+	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+		return;
+
+	ASSERT3S(vd->vdev_initialize_state, ==, VDEV_INITIALIZE_NONE);
+	ASSERT3U(vd->vdev_leaf_zap, !=, 0);
+
+	vd->vdev_initialize_last_offset = 0;
+	vd->vdev_initialize_action_time = 0;
+
+	objset_t *mos = vd->vdev_spa->spa_meta_objset;
+	int error;
+
+	error = zap_remove(mos, vd->vdev_leaf_zap,
+	    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, tx);
+	VERIFY(error == 0 || error == ENOENT);
+
+	error = zap_remove(mos, vd->vdev_leaf_zap,
+	    VDEV_LEAF_ZAP_INITIALIZE_STATE, tx);
+	VERIFY(error == 0 || error == ENOENT);
+
+	error = zap_remove(mos, vd->vdev_leaf_zap,
+	    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, tx);
+	VERIFY(error == 0 || error == ENOENT);
+}
+
 static void
 vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
 {
@@ -123,8 +156,14 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
 
 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
-	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
-	    guid, tx);
+
+	if (new_state != VDEV_INITIALIZE_NONE) {
+		dsl_sync_task_nowait(spa_get_dsl(spa),
+		    vdev_initialize_zap_update_sync, guid, tx);
+	} else {
+		dsl_sync_task_nowait(spa_get_dsl(spa),
+		    vdev_initialize_zap_remove_sync, guid, tx);
+	}
 
 	switch (new_state) {
 	case VDEV_INITIALIZE_ACTIVE:
@@ -145,6 +184,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
 		spa_history_log_internal(spa, "initialize", tx,
 		    "vdev=%s complete", vd->vdev_path);
 		break;
+	case VDEV_INITIALIZE_NONE:
+		spa_history_log_internal(spa, "uninitialize", tx,
+		    "vdev=%s", vd->vdev_path);
+		break;
 	default:
 		panic("invalid state %llu", (unsigned long long)new_state);
 	}
@@ -594,6 +637,24 @@ vdev_initialize(vdev_t *vd)
 	    vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
 }
 
+/*
+ * Uninitializes a device. Caller must hold vdev_initialize_lock.
+ * Device must be a leaf and not already be initializing.
+ */
+void
+vdev_uninitialize(vdev_t *vd)
+{
+	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+	ASSERT(vdev_is_concrete(vd));
+	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+	ASSERT(!vd->vdev_detached);
+	ASSERT(!vd->vdev_initialize_exit_wanted);
+	ASSERT(!vd->vdev_top->vdev_removing);
+
+	vdev_initialize_change_state(vd, VDEV_INITIALIZE_NONE);
+}
+
 /*
  * Wait for the initialize thread to be terminated (cancelled or stopped).
  */
@@ -750,6 +811,7 @@ vdev_initialize_restart(vdev_t *vd)
 }
 
 EXPORT_SYMBOL(vdev_initialize);
+EXPORT_SYMBOL(vdev_uninitialize);
 EXPORT_SYMBOL(vdev_initialize_stop);
 EXPORT_SYMBOL(vdev_initialize_stop_all);
 EXPORT_SYMBOL(vdev_initialize_stop_wait);
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 3b1e2ae5fb5..efaf6f9b390 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -4070,7 +4070,8 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 
 	if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
 	    cmd_type == POOL_INITIALIZE_START ||
-	    cmd_type == POOL_INITIALIZE_SUSPEND)) {
+	    cmd_type == POOL_INITIALIZE_SUSPEND ||
+	    cmd_type == POOL_INITIALIZE_UNINIT)) {
 		return (SET_ERROR(EINVAL));
 	}
 
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 1665e20e0e3..62d9cbeb6d9 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -446,6 +446,7 @@ tests = ['zpool_initialize_attach_detach_add_remove',
     'zpool_initialize_start_and_cancel_neg',
     'zpool_initialize_start_and_cancel_pos',
     'zpool_initialize_suspend_resume',
+    'zpool_initialize_uninit',
     'zpool_initialize_unsupported_vdevs',
     'zpool_initialize_verify_checksums',
     'zpool_initialize_verify_initialized']
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index a4932fc988a..3e4120f52ca 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1102,6 +1102,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh \
+	functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh
new file mode 100755
index 00000000000..17f776cfbc2
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh
@@ -0,0 +1,141 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+# Copyright (C) 2023 Lawrence Livermore National Security, LLC.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Starting, stopping, uninitializing, and restarting an initialize works.
+#
+# STRATEGY:
+# 1. Create a three-disk pool.
+# 2. Verify uninitialize succeeds for uninitialized pool.
+# 3. Verify pool wide cancel|suspend + uninit
+#   a. Start initializing and verify that initializing is active.
+#   b. Verify uninitialize fails when actively initializing.
+#   c. Cancel or suspend initializing and verify that initializing is not active.
+#   d. Verify uninitialize succeeds after being cancelled.
+# 4. Verify per-disk cancel|suspend + uninit
+#
+
+DISK1="$(echo $DISKS | cut -d' ' -f1)"
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+DISK3="$(echo $DISKS | cut -d' ' -f3)"
+
+function status_check # pool disk1-state disk2-state disk3-state
+{
+        typeset pool="$1"
+        typeset disk1_state="$2"
+        typeset disk2_state="$3"
+        typeset disk3_state="$4"
+
+	state=$(zpool status -i "$pool" | grep "$DISK1" | grep "$disk1_state")
+        if [[ -z "$state" ]]; then
+		log_fail "DISK1 state; expected='$disk1_state' got '$state'"
+	fi
+
+	state=$(zpool status -i "$pool" | grep "$DISK2" | grep "$disk2_state")
+        if [[ -z "$state" ]]; then
+		log_fail "DISK2 state; expected='$disk2_state' got '$state'"
+	fi
+
+	state=$(zpool status -i "$pool" | grep "$DISK3" | grep "$disk3_state")
+        if [[ -z "$state" ]]; then
+		log_fail "DISK3 state; expected='$disk3_state' got '$state'"
+	fi
+}
+
+function status_check_all # pool disk-state
+{
+        typeset pool="$1"
+        typeset disk_state="$2"
+
+	status_check "$pool" "$disk_state" "$disk_state" "$disk_state"
+}
+
+# 1. Create a three-disk pool.
+log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3
+status_check_all $TESTPOOL "uninitialized"
+
+# 2. Verify uninitialize succeeds for uninitialized pool.
+log_must zpool initialize -u $TESTPOOL
+status_check_all $TESTPOOL "uninitialized"
+
+# 3. Verify pool wide cancel + uninit
+log_must zpool initialize $TESTPOOL
+status_check_all $TESTPOOL "[[:digit:]]* initialized"
+
+log_mustnot zpool initialize -u $TESTPOOL
+status_check_all $TESTPOOL "[[:digit:]]* initialized"
+
+log_must zpool initialize -c $TESTPOOL
+status_check_all $TESTPOOL "uninitialized"
+
+log_must zpool initialize -u $TESTPOOL
+status_check_all $TESTPOOL "uninitialized"
+
+# 3. Verify pool wide suspend + uninit
+log_must zpool initialize $TESTPOOL
+status_check_all $TESTPOOL "[[:digit:]]* initialized"
+
+log_mustnot zpool initialize -u $TESTPOOL
+status_check_all $TESTPOOL "[[:digit:]]* initialized"
+
+log_must zpool initialize -s $TESTPOOL
+status_check_all $TESTPOOL "suspended"
+
+log_must zpool initialize -u $TESTPOOL
+status_check_all $TESTPOOL "uninitialized"
+
+# 4. Verify per-disk cancel|suspend + uninit
+log_must zpool initialize $TESTPOOL
+status_check_all $TESTPOOL "[[:digit:]]* initialized"
+
+log_must zpool initialize -c $TESTPOOL $DISK1
+log_must zpool initialize -s $TESTPOOL $DISK2
+log_mustnot zpool initialize -u $TESTPOOL $DISK3
+status_check $TESTPOOL "uninitialized" "suspended" "[[:digit:]]* initialized"
+
+log_must zpool initialize -u $TESTPOOL $DISK1
+status_check $TESTPOOL "uninitialized" "suspended" "[[:digit:]]* initialized"
+
+log_must zpool initialize -u $TESTPOOL $DISK2
+status_check $TESTPOOL "uninitialized" "uninitialized" "[[:digit:]]* initialized"
+
+log_must zpool initialize $TESTPOOL $DISK1
+status_check $TESTPOOL "[[:digit:]]* initialized" "uninitialized" "[[:digit:]]* initialized"
+
+log_must zpool initialize $TESTPOOL $DISK2
+status_check_all $TESTPOOL "[[:digit:]]* initialized"
+
+log_must zpool initialize -s $TESTPOOL
+status_check_all $TESTPOOL "suspended"
+
+log_must zpool initialize -u $TESTPOOL $DISK1 $DISK2 $DISK3
+status_check_all $TESTPOOL "uninitialized"
+
+log_pass "Initialize start + cancel/suspend + uninit + start works"

From 482eeef804f0f325faddb102f112c0f1ec86a1b6 Mon Sep 17 00:00:00 2001
From: George Amanakis <gamanakis@gmail.com>
Date: Fri, 17 Dec 2021 21:35:28 +0100
Subject: [PATCH 09/11] Teach zpool scrub to scrub only blocks in error log

Added a flag '-e' in zpool scrub to scrub only blocks in error log. A
user can pause, resume and cancel the error scrub by passing additional
command line arguments -p -s just like a regular scrub. This involves
adding a new flag, creating new libzfs interfaces, a new ioctl, and the
actual iteration and read-issuing logic. Error scrubbing is executed in
multiple txg to make sure pool performance is not affected.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Co-authored-by: TulsiJain <tulsi.jain@delphix.com>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
Closes #8995
Closes #12355
---
 cmd/zpool/zpool_main.c                        | 111 ++-
 include/libzfs.h                              |   3 +
 include/libzfs_core.h                         |   2 +
 include/sys/dmu.h                             |   1 +
 include/sys/dsl_scan.h                        |  27 +-
 include/sys/fs/zfs.h                          |  19 +-
 include/sys/spa.h                             |   8 +
 include/sys/spa_impl.h                        |   4 +
 include/sys/sysevent/eventdefs.h              |   5 +
 lib/libzfs/libzfs.abi                         |   3 +-
 lib/libzfs/libzfs_pool.c                      | 105 ++-
 lib/libzfs/libzfs_util.c                      |  14 +-
 lib/libzfs_core/libzfs_core.abi               | 105 +++
 lib/libzfs_core/libzfs_core.c                 |   7 +
 man/man4/zfs.4                                |   3 +
 man/man8/zpool-scrub.8                        |  19 +
 module/zfs/dsl_scan.c                         | 696 +++++++++++++++++-
 module/zfs/spa.c                              |   6 +
 module/zfs/spa_errlog.c                       |  82 ++-
 module/zfs/spa_misc.c                         |  25 +-
 module/zfs/zfs_ioctl.c                        |  46 ++
 tests/runfiles/common.run                     |   4 +-
 tests/zfs-tests/cmd/libzfs_input_check.c      |  15 +
 tests/zfs-tests/include/libtest.shlib         |  18 +
 tests/zfs-tests/tests/Makefile.am             |   4 +
 .../zpool_scrub/zpool_error_scrub_001_pos.ksh |  79 ++
 .../zpool_scrub/zpool_error_scrub_002_pos.ksh |  99 +++
 .../zpool_scrub/zpool_error_scrub_003_pos.ksh | 109 +++
 .../zpool_scrub/zpool_error_scrub_004_pos.ksh |  54 ++
 29 files changed, 1602 insertions(+), 71 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 3e08e031414..013dd4a2338 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -401,7 +401,7 @@ get_usage(zpool_help_t idx)
 		return (gettext("\tinitialize [-c | -s | -u] [-w] <pool> "
 		    "[<device> ...]\n"));
 	case HELP_SCRUB:
-		return (gettext("\tscrub [-s | -p] [-w] <pool> ...\n"));
+		return (gettext("\tscrub [-s | -p] [-w] [-e] <pool> ...\n"));
 	case HELP_RESILVER:
 		return (gettext("\tresilver <pool> ...\n"));
 	case HELP_TRIM:
@@ -7309,8 +7309,9 @@ wait_callback(zpool_handle_t *zhp, void *data)
 }
 
 /*
- * zpool scrub [-s | -p] [-w] <pool> ...
+ * zpool scrub [-s | -p] [-w] [-e] <pool> ...
  *
+ *	-e	Only scrub blocks in the error log.
  *	-s	Stop.  Stops any in-progress scrub.
  *	-p	Pause. Pause in-progress scrub.
  *	-w	Wait.  Blocks until scrub has completed.
@@ -7326,14 +7327,21 @@ zpool_do_scrub(int argc, char **argv)
 	cb.cb_type = POOL_SCAN_SCRUB;
 	cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
 
+	boolean_t is_error_scrub = B_FALSE;
+	boolean_t is_pause = B_FALSE;
+	boolean_t is_stop = B_FALSE;
+
 	/* check options */
-	while ((c = getopt(argc, argv, "spw")) != -1) {
+	while ((c = getopt(argc, argv, "spwe")) != -1) {
 		switch (c) {
+		case 'e':
+			is_error_scrub = B_TRUE;
+			break;
 		case 's':
-			cb.cb_type = POOL_SCAN_NONE;
+			is_stop = B_TRUE;
 			break;
 		case 'p':
-			cb.cb_scrub_cmd = POOL_SCRUB_PAUSE;
+			is_pause = B_TRUE;
 			break;
 		case 'w':
 			wait = B_TRUE;
@@ -7345,11 +7353,21 @@ zpool_do_scrub(int argc, char **argv)
 		}
 	}
 
-	if (cb.cb_type == POOL_SCAN_NONE &&
-	    cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) {
-		(void) fprintf(stderr, gettext("invalid option combination: "
-		    "-s and -p are mutually exclusive\n"));
+	if (is_pause && is_stop) {
+		(void) fprintf(stderr, gettext("invalid option "
+		    "combination: -s and -p are mutually exclusive\n"));
 		usage(B_FALSE);
+	} else {
+		if (is_error_scrub)
+			cb.cb_type = POOL_SCAN_ERRORSCRUB;
+
+		if (is_pause) {
+			cb.cb_scrub_cmd = POOL_SCRUB_PAUSE;
+		} else if (is_stop) {
+			cb.cb_type = POOL_SCAN_NONE;
+		} else {
+			cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
+		}
 	}
 
 	if (wait && (cb.cb_type == POOL_SCAN_NONE ||
@@ -7573,6 +7591,70 @@ secs_to_dhms(uint64_t total, char *buf)
 	}
 }
 
+/*
+ * Print out detailed error scrub status.
+ */
+static void
+print_err_scrub_status(pool_scan_stat_t *ps)
+{
+	time_t start, end, pause;
+	uint64_t total_secs_left;
+	uint64_t secs_left, mins_left, hours_left, days_left;
+	uint64_t examined, to_be_examined;
+
+	if (ps == NULL || ps->pss_error_scrub_func != POOL_SCAN_ERRORSCRUB) {
+		return;
+	}
+
+	(void) printf(gettext(" scrub: "));
+
+	start = ps->pss_error_scrub_start;
+	end = ps->pss_error_scrub_end;
+	pause = ps->pss_pass_error_scrub_pause;
+	examined = ps->pss_error_scrub_examined;
+	to_be_examined = ps->pss_error_scrub_to_be_examined;
+
+	assert(ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB);
+
+	if (ps->pss_error_scrub_state == DSS_FINISHED) {
+		total_secs_left = end - start;
+		days_left = total_secs_left / 60 / 60 / 24;
+		hours_left = (total_secs_left / 60 / 60) % 24;
+		mins_left = (total_secs_left / 60) % 60;
+		secs_left = (total_secs_left % 60);
+
+		(void) printf(gettext("scrubbed %llu error blocks in %llu days "
+		    "%02llu:%02llu:%02llu on %s"), (u_longlong_t)examined,
+		    (u_longlong_t)days_left, (u_longlong_t)hours_left,
+		    (u_longlong_t)mins_left, (u_longlong_t)secs_left,
+		    ctime(&end));
+
+		return;
+	} else if (ps->pss_error_scrub_state == DSS_CANCELED) {
+		(void) printf(gettext("error scrub canceled on %s"),
+		    ctime(&end));
+		return;
+	}
+	assert(ps->pss_error_scrub_state == DSS_ERRORSCRUBBING);
+
+	/* Error scrub is in progress. */
+	if (pause == 0) {
+		(void) printf(gettext("error scrub in progress since %s"),
+		    ctime(&start));
+	} else {
+		(void) printf(gettext("error scrub paused since %s"),
+		    ctime(&pause));
+		(void) printf(gettext("\terror scrub started on %s"),
+		    ctime(&start));
+	}
+
+	double fraction_done = (double)examined / (to_be_examined + examined);
+	(void) printf(gettext("\t%.2f%% done, issued I/O for %llu error"
+	    " blocks"), 100 * fraction_done, (u_longlong_t)examined);
+
+	(void) printf("\n");
+}
+
 /*
  * Print out detailed scrub status.
  */
@@ -7909,10 +7991,12 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot)
 {
 	uint64_t rebuild_end_time = 0, resilver_end_time = 0;
 	boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE;
+	boolean_t have_errorscrub = B_FALSE;
 	boolean_t active_resilver = B_FALSE;
 	pool_checkpoint_stat_t *pcs = NULL;
 	pool_scan_stat_t *ps = NULL;
 	uint_t c;
+	time_t scrub_start = 0, errorscrub_start = 0;
 
 	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
 	    (uint64_t **)&ps, &c) == 0) {
@@ -7921,16 +8005,23 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot)
 			active_resilver = (ps->pss_state == DSS_SCANNING);
 		}
 
+
 		have_resilver = (ps->pss_func == POOL_SCAN_RESILVER);
 		have_scrub = (ps->pss_func == POOL_SCAN_SCRUB);
+		scrub_start = ps->pss_start_time;
+		have_errorscrub = (ps->pss_error_scrub_func ==
+		    POOL_SCAN_ERRORSCRUB);
+		errorscrub_start = ps->pss_error_scrub_start;
 	}
 
 	boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time);
 	boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0));
 
 	/* Always print the scrub status when available. */
-	if (have_scrub)
+	if (have_scrub && scrub_start > errorscrub_start)
 		print_scan_scrub_resilver_status(ps);
+	else if (have_errorscrub && errorscrub_start >= scrub_start)
+		print_err_scrub_status(ps);
 
 	/*
 	 * When there is an active resilver or rebuild print its status.
diff --git a/include/libzfs.h b/include/libzfs.h
index 87d1ed738f2..a7037e3e626 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -125,11 +125,14 @@ typedef enum zfs_error {
 	EZFS_THREADCREATEFAILED, /* thread create failed */
 	EZFS_POSTSPLIT_ONLINE,	/* onlining a disk after splitting it */
 	EZFS_SCRUBBING,		/* currently scrubbing */
+	EZFS_ERRORSCRUBBING,	/* currently error scrubbing */
+	EZFS_ERRORSCRUB_PAUSED,	/* error scrub currently paused */
 	EZFS_NO_SCRUB,		/* no active scrub */
 	EZFS_DIFF,		/* general failure of zfs diff */
 	EZFS_DIFFDATA,		/* bad zfs diff data */
 	EZFS_POOLREADONLY,	/* pool is in read-only mode */
 	EZFS_SCRUB_PAUSED,	/* scrub currently paused */
+	EZFS_SCRUB_PAUSED_TO_CANCEL,	/* scrub currently paused */
 	EZFS_ACTIVE_POOL,	/* pool is imported on a different system */
 	EZFS_CRYPTOFAILED,	/* failed to setup encryption */
 	EZFS_NO_PENDING,	/* cannot cancel, no operation is pending */
diff --git a/include/libzfs_core.h b/include/libzfs_core.h
index 14a4857c35d..867c18b9c22 100644
--- a/include/libzfs_core.h
+++ b/include/libzfs_core.h
@@ -155,6 +155,8 @@ _LIBZFS_CORE_H int lzc_get_bootenv(const char *, nvlist_t **);
 _LIBZFS_CORE_H int lzc_get_vdev_prop(const char *, nvlist_t *, nvlist_t **);
 _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **);
 
+_LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **);
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 5ee6704668a..7e57d133c2e 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -378,6 +378,7 @@ typedef struct dmu_buf {
 #define	DMU_POOL_DDT_STATS		"DDT-statistics"
 #define	DMU_POOL_CREATION_VERSION	"creation_version"
 #define	DMU_POOL_SCAN			"scan"
+#define	DMU_POOL_ERRORSCRUB		"error_scrub"
 #define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
 #define	DMU_POOL_BPTREE_OBJ		"bptree_obj"
 #define	DMU_POOL_EMPTY_BPOBJ		"empty_bpobj"
diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h
index 8925b5815a3..6753b4a8f35 100644
--- a/include/sys/dsl_scan.h
+++ b/include/sys/dsl_scan.h
@@ -29,6 +29,7 @@
 
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
+#include <sys/zap.h>
 #include <sys/ddt.h>
 #include <sys/bplist.h>
 
@@ -78,6 +79,21 @@ typedef enum dsl_scan_flags {
 
 #define	DSL_SCAN_FLAGS_MASK (DSF_VISIT_DS_AGAIN)
 
+typedef struct dsl_errorscrub_phys {
+	uint64_t dep_func; /* pool_scan_func_t */
+	uint64_t dep_state; /* dsl_scan_state_t */
+	uint64_t dep_cursor; /* serialized zap cursor for tracing progress */
+	uint64_t dep_start_time; /* error scrub start time, unix timestamp */
+	uint64_t dep_end_time; /* error scrub end time, unix timestamp */
+	uint64_t dep_to_examine; /* total error blocks to be scrubbed */
+	uint64_t dep_examined; /* blocks scrubbed so far */
+	uint64_t dep_errors;	/* error scrub I/O error count */
+	uint64_t dep_paused_flags; /* flag for paused */
+} dsl_errorscrub_phys_t;
+
+#define	ERRORSCRUB_PHYS_NUMINTS (sizeof (dsl_errorscrub_phys_t) \
+	/ sizeof (uint64_t))
+
 /*
  * Every pool will have one dsl_scan_t and this structure will contain
  * in-memory information about the scan and a pointer to the on-disk
@@ -151,11 +167,15 @@ typedef struct dsl_scan {
 	uint64_t scn_avg_zio_size_this_txg;
 	uint64_t scn_zios_this_txg;
 
+	/* zap cursor for tracing error scrub progress */
+	zap_cursor_t errorscrub_cursor;
 	/* members needed for syncing scan status to disk */
 	dsl_scan_phys_t scn_phys;	/* on disk representation of scan */
 	dsl_scan_phys_t scn_phys_cached;
 	avl_tree_t scn_queue;		/* queue of datasets to scan */
 	uint64_t scn_queues_pending;	/* outstanding data to issue */
+	/* members needed for syncing error scrub status to disk */
+	dsl_errorscrub_phys_t errorscrub_phys;
 } dsl_scan_t;
 
 typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;
@@ -171,8 +191,12 @@ int dsl_scan_cancel(struct dsl_pool *);
 int dsl_scan(struct dsl_pool *, pool_scan_func_t);
 void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd);
 boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp);
-int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd);
+boolean_t dsl_errorscrubbing(const struct dsl_pool *dp);
+boolean_t dsl_errorscrub_active(dsl_scan_t *scn);
 void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg);
+int dsl_scrub_set_pause_resume(const struct dsl_pool *dp,
+    pool_scrub_cmd_t cmd);
+void dsl_errorscrub_sync(struct dsl_pool *, dmu_tx_t *);
 boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
 boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
 boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
@@ -184,6 +208,7 @@ void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
     struct dmu_tx *tx);
 boolean_t dsl_scan_active(dsl_scan_t *scn);
 boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
+boolean_t dsl_errorscrub_is_paused(const dsl_scan_t *scn);
 void dsl_scan_freed(spa_t *spa, const blkptr_t *bp);
 void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue);
 void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd);
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 4c2097fb830..93193fa142d 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -1036,6 +1036,7 @@ typedef enum pool_scan_func {
 	POOL_SCAN_NONE,
 	POOL_SCAN_SCRUB,
 	POOL_SCAN_RESILVER,
+	POOL_SCAN_ERRORSCRUB,
 	POOL_SCAN_FUNCS
 } pool_scan_func_t;
 
@@ -1099,6 +1100,20 @@ typedef struct pool_scan_stat {
 	uint64_t	pss_pass_scrub_spent_paused;
 	uint64_t	pss_pass_issued; /* issued bytes per scan pass */
 	uint64_t	pss_issued;	/* total bytes checked by scanner */
+
+	/* error scrub values stored on disk */
+	uint64_t	pss_error_scrub_func;	/* pool_scan_func_t */
+	uint64_t	pss_error_scrub_state;	/* dsl_scan_state_t */
+	uint64_t	pss_error_scrub_start;	/* error scrub start time */
+	uint64_t	pss_error_scrub_end;	/* error scrub end time */
+	uint64_t	pss_error_scrub_examined; /* error blocks issued I/O */
+	/* error blocks to be issued I/O */
+	uint64_t	pss_error_scrub_to_be_examined;
+
+	/* error scrub values not stored on disk */
+	/* error scrub pause time in milliseconds */
+	uint64_t	pss_pass_error_scrub_pause;
+
 } pool_scan_stat_t;
 
 typedef struct pool_removal_stat {
@@ -1120,6 +1135,7 @@ typedef enum dsl_scan_state {
 	DSS_SCANNING,
 	DSS_FINISHED,
 	DSS_CANCELED,
+	DSS_ERRORSCRUBBING,
 	DSS_NUM_STATES
 } dsl_scan_state_t;
 
@@ -1360,7 +1376,7 @@ typedef enum {
  */
 typedef enum zfs_ioc {
 	/*
-	 * Core features - 81/128 numbers reserved.
+	 * Core features - 88/128 numbers reserved.
 	 */
 #ifdef __FreeBSD__
 	ZFS_IOC_FIRST =	0,
@@ -1455,6 +1471,7 @@ typedef enum zfs_ioc {
 	ZFS_IOC_WAIT_FS,			/* 0x5a54 */
 	ZFS_IOC_VDEV_GET_PROPS,			/* 0x5a55 */
 	ZFS_IOC_VDEV_SET_PROPS,			/* 0x5a56 */
+	ZFS_IOC_POOL_SCRUB,			/* 0x5a57 */
 
 	/*
 	 * Per-platform (Optional) - 8/128 numbers reserved.
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 460ea2bfee4..ed752967cca 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1155,6 +1155,7 @@ extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate);
 extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
 extern uint64_t spa_approx_errlog_size(spa_t *spa);
 extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count);
+extern uint64_t spa_get_last_errlog_size(spa_t *spa);
 extern void spa_errlog_rotate(spa_t *spa);
 extern void spa_errlog_drain(spa_t *spa);
 extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
@@ -1165,6 +1166,13 @@ extern void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds,
 extern void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj,
     dmu_tx_t *tx);
 extern void spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx);
+extern int find_top_affected_fs(spa_t *spa, uint64_t head_ds,
+    zbookmark_err_phys_t *zep, uint64_t *top_affected_fs);
+extern int find_birth_txg(struct dsl_dataset *ds, zbookmark_err_phys_t *zep,
+    uint64_t *birth_txg);
+extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep,
+    zbookmark_phys_t *zb);
+extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep);
 
 /* vdev cache */
 extern void vdev_cache_stat_init(void);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 5782c54bd78..44afa763283 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -295,6 +295,10 @@ struct spa {
 	uint64_t	spa_scan_pass_exam;	/* examined bytes per pass */
 	uint64_t	spa_scan_pass_issued;	/* issued bytes per pass */
 
+	/* error scrub pause start time in seconds */
+	uint64_t	spa_scan_pass_errorscrub_pause;
+	/* total error scrub paused time in seconds */
+	uint64_t	spa_scan_pass_errorscrub_spent_paused;
 	/*
 	 * We are in the middle of a resilver, and another resilver
 	 * is needed once this one completes. This is set iff any
diff --git a/include/sys/sysevent/eventdefs.h b/include/sys/sysevent/eventdefs.h
index eb1dfd16c0f..a2108525796 100644
--- a/include/sys/sysevent/eventdefs.h
+++ b/include/sys/sysevent/eventdefs.h
@@ -123,6 +123,11 @@ extern "C" {
 #define	ESC_ZFS_TRIM_CANCEL		"trim_cancel"
 #define	ESC_ZFS_TRIM_RESUME		"trim_resume"
 #define	ESC_ZFS_TRIM_SUSPEND		"trim_suspend"
+#define	ESC_ZFS_ERRORSCRUB_START	"errorscrub_start"
+#define	ESC_ZFS_ERRORSCRUB_FINISH	"errorscrub_finish"
+#define	ESC_ZFS_ERRORSCRUB_ABORT	"errorscrub_abort"
+#define	ESC_ZFS_ERRORSCRUB_RESUME	"errorscrub_resume"
+#define	ESC_ZFS_ERRORSCRUB_PAUSED	"errorscrub_paused"
 
 /*
  * datalink subclass definitions.
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 57b096ca6e9..6e53bcb41a8 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -5717,7 +5717,8 @@
       <enumerator name='POOL_SCAN_NONE' value='0'/>
       <enumerator name='POOL_SCAN_SCRUB' value='1'/>
       <enumerator name='POOL_SCAN_RESILVER' value='2'/>
-      <enumerator name='POOL_SCAN_FUNCS' value='3'/>
+      <enumerator name='POOL_SCAN_ERRORSCRUB' value='3'/>
+      <enumerator name='POOL_SCAN_FUNCS' value='4'/>
     </enum-decl>
     <typedef-decl name='pool_scan_func_t' type-id='1b092565' id='7313fbe2'/>
     <enum-decl name='pool_scrub_cmd' id='a1474cbd'>
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index a71cb24736a..d4af31c50cf 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -2648,50 +2648,84 @@ out:
 int
 zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
 {
-	zfs_cmd_t zc = {"\0"};
 	char errbuf[ERRBUFLEN];
 	int err;
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
 
-	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
-	zc.zc_cookie = func;
-	zc.zc_flags = cmd;
+	nvlist_t *args = fnvlist_alloc();
+	fnvlist_add_uint64(args, "scan_type", (uint64_t)func);
+	fnvlist_add_uint64(args, "scan_command", (uint64_t)cmd);
 
-	if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0)
+	err = lzc_scrub(ZFS_IOC_POOL_SCRUB, zhp->zpool_name, args, NULL);
+	fnvlist_free(args);
+
+	if (err == 0) {
 		return (0);
+	} else if (err == ZFS_ERR_IOC_CMD_UNAVAIL) {
+		zfs_cmd_t zc = {"\0"};
+		(void) strlcpy(zc.zc_name, zhp->zpool_name,
+		    sizeof (zc.zc_name));
+		zc.zc_cookie = func;
+		zc.zc_flags = cmd;
 
-	err = errno;
+		if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0)
+			return (0);
+	}
 
-	/* ECANCELED on a scrub means we resumed a paused scrub */
-	if (err == ECANCELED && func == POOL_SCAN_SCRUB &&
-	    cmd == POOL_SCRUB_NORMAL)
+	/*
+	 * An ECANCELED on a scrub means one of the following:
+	 * 1. we resumed a paused scrub.
+	 * 2. we resumed a paused error scrub.
+	 * 3. Error scrub is not run because of no error log.
+	 */
+	if (err == ECANCELED && (func == POOL_SCAN_SCRUB ||
+	    func == POOL_SCAN_ERRORSCRUB) && cmd == POOL_SCRUB_NORMAL)
 		return (0);
-
-	if (err == ENOENT && func != POOL_SCAN_NONE && cmd == POOL_SCRUB_NORMAL)
+	/*
+	 * The following cases have been handled here:
+	 * 1. Paused a scrub/error scrub if there is none in progress.
+	 */
+	if (err == ENOENT && func != POOL_SCAN_NONE && cmd ==
+	    POOL_SCRUB_PAUSE) {
 		return (0);
+	}
 
-	if (func == POOL_SCAN_SCRUB) {
+	ASSERT3U(func, >=, POOL_SCAN_NONE);
+	ASSERT3U(func, <, POOL_SCAN_FUNCS);
+
+	if (func == POOL_SCAN_SCRUB || func == POOL_SCAN_ERRORSCRUB) {
 		if (cmd == POOL_SCRUB_PAUSE) {
 			(void) snprintf(errbuf, sizeof (errbuf),
 			    dgettext(TEXT_DOMAIN, "cannot pause scrubbing %s"),
-			    zc.zc_name);
+			    zhp->zpool_name);
 		} else {
 			assert(cmd == POOL_SCRUB_NORMAL);
 			(void) snprintf(errbuf, sizeof (errbuf),
 			    dgettext(TEXT_DOMAIN, "cannot scrub %s"),
-			    zc.zc_name);
+			    zhp->zpool_name);
 		}
 	} else if (func == POOL_SCAN_RESILVER) {
 		assert(cmd == POOL_SCRUB_NORMAL);
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-		    "cannot restart resilver on %s"), zc.zc_name);
+		    "cannot restart resilver on %s"), zhp->zpool_name);
 	} else if (func == POOL_SCAN_NONE) {
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-		    "cannot cancel scrubbing %s"), zc.zc_name);
+		    "cannot cancel scrubbing %s"), zhp->zpool_name);
 	} else {
 		assert(!"unexpected result");
 	}
 
+	/*
+	 * With EBUSY, six cases are possible:
+	 *
+	 * Current state		Requested
+	 * 1. Normal Scrub Running	Normal Scrub or Error Scrub
+	 * 2. Normal Scrub Paused	Error Scrub
+	 * 3. Normal Scrub Paused	Pause Normal Scrub
+	 * 4. Error Scrub Running	Normal Scrub or Error Scrub
+	 * 5. Error Scrub Paused	Pause Error Scrub
+	 * 6. Resilvering		Anything else
+	 */
 	if (err == EBUSY) {
 		nvlist_t *nvroot;
 		pool_scan_stat_t *ps = NULL;
@@ -2703,12 +2737,43 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
 		    ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
 		if (ps && ps->pss_func == POOL_SCAN_SCRUB &&
 		    ps->pss_state == DSS_SCANNING) {
-			if (cmd == POOL_SCRUB_PAUSE)
-				return (zfs_error(hdl, EZFS_SCRUB_PAUSED,
+			if (ps->pss_pass_scrub_pause == 0) {
+				/* handles case 1 */
+				assert(cmd == POOL_SCRUB_NORMAL);
+				return (zfs_error(hdl, EZFS_SCRUBBING,
 				    errbuf));
-			else
-				return (zfs_error(hdl, EZFS_SCRUBBING, errbuf));
+			} else {
+				if (func == POOL_SCAN_ERRORSCRUB) {
+					/* handles case 2 */
+					ASSERT3U(cmd, ==, POOL_SCRUB_NORMAL);
+					return (zfs_error(hdl,
+					    EZFS_SCRUB_PAUSED_TO_CANCEL,
+					    errbuf));
+				} else {
+					/* handles case 3 */
+					ASSERT3U(func, ==, POOL_SCAN_SCRUB);
+					ASSERT3U(cmd, ==, POOL_SCRUB_PAUSE);
+					return (zfs_error(hdl,
+					    EZFS_SCRUB_PAUSED, errbuf));
+				}
+			}
+		} else if (ps &&
+		    ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB &&
+		    ps->pss_error_scrub_state == DSS_ERRORSCRUBBING) {
+			if (ps->pss_pass_error_scrub_pause == 0) {
+				/* handles case 4 */
+				ASSERT3U(cmd, ==, POOL_SCRUB_NORMAL);
+				return (zfs_error(hdl, EZFS_ERRORSCRUBBING,
+				    errbuf));
+			} else {
+				/* handles case 5 */
+				ASSERT3U(func, ==, POOL_SCAN_ERRORSCRUB);
+				ASSERT3U(cmd, ==, POOL_SCRUB_PAUSE);
+				return (zfs_error(hdl, EZFS_ERRORSCRUB_PAUSED,
+				    errbuf));
+			}
 		} else {
+			/* handles case 6 */
 			return (zfs_error(hdl, EZFS_RESILVERING, errbuf));
 		}
 	} else if (err == ENOENT) {
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index 4b8a20160e0..b94abea3d58 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -243,10 +243,20 @@ libzfs_error_description(libzfs_handle_t *hdl)
 		    "into a new one"));
 	case EZFS_SCRUB_PAUSED:
 		return (dgettext(TEXT_DOMAIN, "scrub is paused; "
-		    "use 'zpool scrub' to resume"));
+		    "use 'zpool scrub' to resume scrub"));
+	case EZFS_SCRUB_PAUSED_TO_CANCEL:
+		return (dgettext(TEXT_DOMAIN, "scrub is paused; "
+		    "use 'zpool scrub' to resume or 'zpool scrub -s' to "
+		    "cancel scrub"));
 	case EZFS_SCRUBBING:
 		return (dgettext(TEXT_DOMAIN, "currently scrubbing; "
-		    "use 'zpool scrub -s' to cancel current scrub"));
+		    "use 'zpool scrub -s' to cancel scrub"));
+	case EZFS_ERRORSCRUBBING:
+		return (dgettext(TEXT_DOMAIN, "currently error scrubbing; "
+		    "use 'zpool scrub -s' to cancel error scrub"));
+	case EZFS_ERRORSCRUB_PAUSED:
+		return (dgettext(TEXT_DOMAIN, "error scrub is paused; "
+		    "use 'zpool scrub -e' to resume error scrub"));
 	case EZFS_NO_SCRUB:
 		return (dgettext(TEXT_DOMAIN, "there is no active scrub"));
 	case EZFS_DIFF:
diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi
index 33d794e3f80..f2087186aa4 100644
--- a/lib/libzfs_core/libzfs_core.abi
+++ b/lib/libzfs_core/libzfs_core.abi
@@ -187,6 +187,7 @@
     <elf-symbol name='lzc_reopen' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_rollback' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_rollback_to' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='lzc_scrub' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_send' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_send_redacted' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_send_resume' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -1261,6 +1262,110 @@
       <enumerator name='POOL_TRIM_FUNCS' value='3'/>
     </enum-decl>
     <typedef-decl name='pool_trim_func_t' type-id='54ed608a' id='b1146b8d'/>
+    <enum-decl name='zfs_ioc' id='12033f13'>
+      <underlying-type type-id='9cac1fee'/>
+      <enumerator name='ZFS_IOC_FIRST' value='23040'/>
+      <enumerator name='ZFS_IOC' value='23040'/>
+      <enumerator name='ZFS_IOC_POOL_CREATE' value='23040'/>
+      <enumerator name='ZFS_IOC_POOL_DESTROY' value='23041'/>
+      <enumerator name='ZFS_IOC_POOL_IMPORT' value='23042'/>
+      <enumerator name='ZFS_IOC_POOL_EXPORT' value='23043'/>
+      <enumerator name='ZFS_IOC_POOL_CONFIGS' value='23044'/>
+      <enumerator name='ZFS_IOC_POOL_STATS' value='23045'/>
+      <enumerator name='ZFS_IOC_POOL_TRYIMPORT' value='23046'/>
+      <enumerator name='ZFS_IOC_POOL_SCAN' value='23047'/>
+      <enumerator name='ZFS_IOC_POOL_FREEZE' value='23048'/>
+      <enumerator name='ZFS_IOC_POOL_UPGRADE' value='23049'/>
+      <enumerator name='ZFS_IOC_POOL_GET_HISTORY' value='23050'/>
+      <enumerator name='ZFS_IOC_VDEV_ADD' value='23051'/>
+      <enumerator name='ZFS_IOC_VDEV_REMOVE' value='23052'/>
+      <enumerator name='ZFS_IOC_VDEV_SET_STATE' value='23053'/>
+      <enumerator name='ZFS_IOC_VDEV_ATTACH' value='23054'/>
+      <enumerator name='ZFS_IOC_VDEV_DETACH' value='23055'/>
+      <enumerator name='ZFS_IOC_VDEV_SETPATH' value='23056'/>
+      <enumerator name='ZFS_IOC_VDEV_SETFRU' value='23057'/>
+      <enumerator name='ZFS_IOC_OBJSET_STATS' value='23058'/>
+      <enumerator name='ZFS_IOC_OBJSET_ZPLPROPS' value='23059'/>
+      <enumerator name='ZFS_IOC_DATASET_LIST_NEXT' value='23060'/>
+      <enumerator name='ZFS_IOC_SNAPSHOT_LIST_NEXT' value='23061'/>
+      <enumerator name='ZFS_IOC_SET_PROP' value='23062'/>
+      <enumerator name='ZFS_IOC_CREATE' value='23063'/>
+      <enumerator name='ZFS_IOC_DESTROY' value='23064'/>
+      <enumerator name='ZFS_IOC_ROLLBACK' value='23065'/>
+      <enumerator name='ZFS_IOC_RENAME' value='23066'/>
+      <enumerator name='ZFS_IOC_RECV' value='23067'/>
+      <enumerator name='ZFS_IOC_SEND' value='23068'/>
+      <enumerator name='ZFS_IOC_INJECT_FAULT' value='23069'/>
+      <enumerator name='ZFS_IOC_CLEAR_FAULT' value='23070'/>
+      <enumerator name='ZFS_IOC_INJECT_LIST_NEXT' value='23071'/>
+      <enumerator name='ZFS_IOC_ERROR_LOG' value='23072'/>
+      <enumerator name='ZFS_IOC_CLEAR' value='23073'/>
+      <enumerator name='ZFS_IOC_PROMOTE' value='23074'/>
+      <enumerator name='ZFS_IOC_SNAPSHOT' value='23075'/>
+      <enumerator name='ZFS_IOC_DSOBJ_TO_DSNAME' value='23076'/>
+      <enumerator name='ZFS_IOC_OBJ_TO_PATH' value='23077'/>
+      <enumerator name='ZFS_IOC_POOL_SET_PROPS' value='23078'/>
+      <enumerator name='ZFS_IOC_POOL_GET_PROPS' value='23079'/>
+      <enumerator name='ZFS_IOC_SET_FSACL' value='23080'/>
+      <enumerator name='ZFS_IOC_GET_FSACL' value='23081'/>
+      <enumerator name='ZFS_IOC_SHARE' value='23082'/>
+      <enumerator name='ZFS_IOC_INHERIT_PROP' value='23083'/>
+      <enumerator name='ZFS_IOC_SMB_ACL' value='23084'/>
+      <enumerator name='ZFS_IOC_USERSPACE_ONE' value='23085'/>
+      <enumerator name='ZFS_IOC_USERSPACE_MANY' value='23086'/>
+      <enumerator name='ZFS_IOC_USERSPACE_UPGRADE' value='23087'/>
+      <enumerator name='ZFS_IOC_HOLD' value='23088'/>
+      <enumerator name='ZFS_IOC_RELEASE' value='23089'/>
+      <enumerator name='ZFS_IOC_GET_HOLDS' value='23090'/>
+      <enumerator name='ZFS_IOC_OBJSET_RECVD_PROPS' value='23091'/>
+      <enumerator name='ZFS_IOC_VDEV_SPLIT' value='23092'/>
+      <enumerator name='ZFS_IOC_NEXT_OBJ' value='23093'/>
+      <enumerator name='ZFS_IOC_DIFF' value='23094'/>
+      <enumerator name='ZFS_IOC_TMP_SNAPSHOT' value='23095'/>
+      <enumerator name='ZFS_IOC_OBJ_TO_STATS' value='23096'/>
+      <enumerator name='ZFS_IOC_SPACE_WRITTEN' value='23097'/>
+      <enumerator name='ZFS_IOC_SPACE_SNAPS' value='23098'/>
+      <enumerator name='ZFS_IOC_DESTROY_SNAPS' value='23099'/>
+      <enumerator name='ZFS_IOC_POOL_REGUID' value='23100'/>
+      <enumerator name='ZFS_IOC_POOL_REOPEN' value='23101'/>
+      <enumerator name='ZFS_IOC_SEND_PROGRESS' value='23102'/>
+      <enumerator name='ZFS_IOC_LOG_HISTORY' value='23103'/>
+      <enumerator name='ZFS_IOC_SEND_NEW' value='23104'/>
+      <enumerator name='ZFS_IOC_SEND_SPACE' value='23105'/>
+      <enumerator name='ZFS_IOC_CLONE' value='23106'/>
+      <enumerator name='ZFS_IOC_BOOKMARK' value='23107'/>
+      <enumerator name='ZFS_IOC_GET_BOOKMARKS' value='23108'/>
+      <enumerator name='ZFS_IOC_DESTROY_BOOKMARKS' value='23109'/>
+      <enumerator name='ZFS_IOC_RECV_NEW' value='23110'/>
+      <enumerator name='ZFS_IOC_POOL_SYNC' value='23111'/>
+      <enumerator name='ZFS_IOC_CHANNEL_PROGRAM' value='23112'/>
+      <enumerator name='ZFS_IOC_LOAD_KEY' value='23113'/>
+      <enumerator name='ZFS_IOC_UNLOAD_KEY' value='23114'/>
+      <enumerator name='ZFS_IOC_CHANGE_KEY' value='23115'/>
+      <enumerator name='ZFS_IOC_REMAP' value='23116'/>
+      <enumerator name='ZFS_IOC_POOL_CHECKPOINT' value='23117'/>
+      <enumerator name='ZFS_IOC_POOL_DISCARD_CHECKPOINT' value='23118'/>
+      <enumerator name='ZFS_IOC_POOL_INITIALIZE' value='23119'/>
+      <enumerator name='ZFS_IOC_POOL_TRIM' value='23120'/>
+      <enumerator name='ZFS_IOC_REDACT' value='23121'/>
+      <enumerator name='ZFS_IOC_GET_BOOKMARK_PROPS' value='23122'/>
+      <enumerator name='ZFS_IOC_WAIT' value='23123'/>
+      <enumerator name='ZFS_IOC_WAIT_FS' value='23124'/>
+      <enumerator name='ZFS_IOC_VDEV_GET_PROPS' value='23125'/>
+      <enumerator name='ZFS_IOC_VDEV_SET_PROPS' value='23126'/>
+      <enumerator name='ZFS_IOC_POOL_SCRUB' value='23127'/>
+      <enumerator name='ZFS_IOC_PLATFORM' value='23168'/>
+      <enumerator name='ZFS_IOC_EVENTS_NEXT' value='23169'/>
+      <enumerator name='ZFS_IOC_EVENTS_CLEAR' value='23170'/>
+      <enumerator name='ZFS_IOC_EVENTS_SEEK' value='23171'/>
+      <enumerator name='ZFS_IOC_NEXTBOOT' value='23172'/>
+      <enumerator name='ZFS_IOC_JAIL' value='23173'/>
+      <enumerator name='ZFS_IOC_UNJAIL' value='23174'/>
+      <enumerator name='ZFS_IOC_SET_BOOTENV' value='23175'/>
+      <enumerator name='ZFS_IOC_GET_BOOTENV' value='23176'/>
+      <enumerator name='ZFS_IOC_LAST' value='23177'/>
+    </enum-decl>
+    <typedef-decl name='zfs_ioc_t' type-id='12033f13' id='5b35941c'/>
     <enum-decl name='zpool_wait_activity_t' naming-typedef-id='73446457' id='849338e3'>
       <underlying-type type-id='9cac1fee'/>
       <enumerator name='ZPOOL_WAIT_CKPT_DISCARD' value='0'/>
diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c
index 254f14e0432..c63a16de5ab 100644
--- a/lib/libzfs_core/libzfs_core.c
+++ b/lib/libzfs_core/libzfs_core.c
@@ -247,6 +247,13 @@ out:
 	return (error);
 }
 
+int
+lzc_scrub(zfs_ioc_t ioc, const char *name,
+    nvlist_t *source, nvlist_t **resultp)
+{
+	return (lzc_ioctl(ioc, name, source, resultp));
+}
+
 int
 lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props,
     uint8_t *wkeydata, uint_t wkeylen)
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index d529147464f..9ec940a9448 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1764,6 +1764,9 @@ Scrubs are processed by the sync thread.
 While scrubbing, it will spend at least this much time
 working on a scrub between TXG flushes.
 .
+.It Sy zfs_scrub_error_blocks_per_txg Ns = Ns Sy 4096 Pq uint
+Error blocks to be scrubbed in one txg.
+.
 .It Sy zfs_scan_checkpoint_intval Ns = Ns Sy 7200 Ns s Po 2 hour Pc Pq uint
 To preserve progress across reboots, the sequential scan algorithm periodically
 needs to stop metadata scanning and issue all the verification I/O to disk.
diff --git a/man/man8/zpool-scrub.8 b/man/man8/zpool-scrub.8
index 1fdbb8a5d56..138226e4562 100644
--- a/man/man8/zpool-scrub.8
+++ b/man/man8/zpool-scrub.8
@@ -38,6 +38,7 @@
 .Cm scrub
 .Op Fl s Ns | Ns Fl p
 .Op Fl w
+.Op Fl e
 .Ar pool Ns …
 .
 .Sh DESCRIPTION
@@ -62,6 +63,13 @@ device
 whereas scrubbing examines all data to discover silent errors due to hardware
 faults or disk failure.
 .Pp
+When scrubbing a pool with encrypted filesystems the keys do not need to be
+loaded.
+However, if the keys are not loaded and an unrepairable checksum error is
+detected the file name cannot be included in the
+.Nm zpool Cm status Fl v
+verbose error report.
+.Pp
 Because scrubbing and resilvering are I/O-intensive operations, ZFS only allows
 one at a time.
 .Pp
@@ -92,9 +100,20 @@ Once resumed the scrub will pick up from the place where it was last
 checkpointed to disk.
 To resume a paused scrub issue
 .Nm zpool Cm scrub
+or
+.Nm zpool Cm scrub
+.Fl e
 again.
 .It Fl w
 Wait until scrub has completed before returning.
+.It Fl e
+Only scrub files with known data errors as reported by
+.Nm zpool Cm status Fl v .
+The pool must have been scrubbed at least once with the
+.Sy head_errlog
+feature enabled to use this option.
+Error scrubbing cannot be run simultaneously with regular scrubbing or
+resilvering, nor can it be run when a regular scrub is paused.
 .El
 .Sh EXAMPLES
 .Ss Example 1
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index d398b670555..5e3559b251e 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -54,6 +54,7 @@
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/range_tree.h>
+#include <sys/dbuf.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
@@ -129,6 +130,7 @@ static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
 static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
 static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
 static uint64_t dsl_scan_count_data_disks(spa_t *spa);
+static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb);
 
 extern uint_t zfs_vdev_async_write_active_min_dirty_percent;
 static int zfs_scan_blkstats = 0;
@@ -231,6 +233,9 @@ static int zfs_resilver_disable_defer = B_FALSE;
  */
 static int zfs_free_bpobj_enabled = 1;
 
+/* Error blocks to be scrubbed in one txg. */
+unsigned long zfs_scrub_error_blocks_per_txg = 1 << 12;
+
 /* the order has to match pool_scan_type */
 static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
 	NULL,
@@ -511,9 +516,17 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 		    "scrub_queue", sizeof (uint64_t), 1,
 		    &scn->scn_phys.scn_queue_obj);
 	} else {
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_ERRORSCRUB, sizeof (uint64_t),
+		    ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys);
+
+		if (err != 0 && err != ENOENT)
+			return (err);
+
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 		    &scn->scn_phys);
+
 		/*
 		 * Detect if the pool contains the signature of #2094.  If it
 		 * does properly update the scn->scn_phys structure and notify
@@ -663,6 +676,22 @@ dsl_scan_scrubbing(const dsl_pool_t *dp)
 	    scn_phys->scn_func == POOL_SCAN_SCRUB);
 }
 
+/* Reports whether an error scrub is in progress (it may be paused). */
+boolean_t
+dsl_errorscrubbing(const dsl_pool_t *dp)
+{
+	const dsl_scan_t *scn = dp->dp_scan;
+	return (scn->errorscrub_phys.dep_func == POOL_SCAN_ERRORSCRUB &&
+	    scn->errorscrub_phys.dep_state == DSS_ERRORSCRUBBING);
+}
+
+boolean_t
+dsl_errorscrub_is_paused(const dsl_scan_t *scn)
+{
+	return (dsl_errorscrubbing(scn->scn_dp) &&
+	    scn->errorscrub_phys.dep_paused_flags);
+}
+
 boolean_t
 dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
 {
@@ -670,6 +699,68 @@ dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
 	    scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED);
 }
 
+static void
+dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	scn->errorscrub_phys.dep_cursor =
+	    zap_cursor_serialize(&scn->errorscrub_cursor);
+
+	VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS,
+	    &scn->errorscrub_phys, tx));
+}
+
+static void
+dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+	pool_scan_func_t *funcp = arg;
+	dsl_pool_t *dp = scn->scn_dp;
+	spa_t *spa = dp->dp_spa;
+
+	ASSERT(!dsl_scan_is_running(scn));
+	ASSERT(!dsl_errorscrubbing(scn->scn_dp));
+	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+
+	memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys));
+	scn->errorscrub_phys.dep_func = *funcp;
+	scn->errorscrub_phys.dep_state = DSS_ERRORSCRUBBING;
+	scn->errorscrub_phys.dep_start_time = gethrestime_sec();
+	scn->errorscrub_phys.dep_to_examine = spa_get_last_errlog_size(spa);
+	scn->errorscrub_phys.dep_examined = 0;
+	scn->errorscrub_phys.dep_errors = 0;
+	scn->errorscrub_phys.dep_cursor = 0;
+	zap_cursor_init_serialized(&scn->errorscrub_cursor,
+	    spa->spa_meta_objset, spa->spa_errlog_last,
+	    scn->errorscrub_phys.dep_cursor);
+
+	vdev_config_dirty(spa->spa_root_vdev);
+	spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START);
+
+	dsl_errorscrub_sync_state(scn, tx);
+
+	spa_history_log_internal(spa, "error scrub setup", tx,
+	    "func=%u mintxg=%u maxtxg=%llu",
+	    *funcp, 0, (u_longlong_t)tx->tx_txg);
+}
+
+/* Sync-task check callback run before an error scrub is set up. */
+static int
+dsl_errorscrub_setup_check(void *arg, dmu_tx_t *tx)
+{
+	(void) arg;
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+	/* EBUSY: a scan or another error scrub is already in progress */
+	if (dsl_scan_is_running(scn) || dsl_errorscrubbing(scn->scn_dp))
+		return (SET_ERROR(EBUSY));
+	/* ECANCELED: the last error log is empty, so nothing to scrub */
+	if (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0)
+		return (SET_ERROR(ECANCELED));
+	return (0);
+}
+
 /*
  * Writes out a persistent dsl_scan_phys_t record to the pool directory.
  * Because we can be running in the block sorting algorithm, we do not always
@@ -745,7 +836,8 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
 
-	if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd))
+	if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd) ||
+	    dsl_errorscrubbing(scn->scn_dp))
 		return (SET_ERROR(EBUSY));
 
 	return (0);
@@ -754,6 +846,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
 void
 dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 {
+	(void) arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	pool_scan_func_t *funcp = arg;
 	dmu_object_type_t ot = 0;
@@ -763,6 +856,14 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 	ASSERT(!dsl_scan_is_running(scn));
 	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
 	memset(&scn->scn_phys, 0, sizeof (scn->scn_phys));
+
+	/*
+	 * If we are starting a fresh scrub, we erase the error scrub
+	 * information from disk.
+	 */
+	memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys));
+	dsl_errorscrub_sync_state(scn, tx);
+
 	scn->scn_phys.scn_func = *funcp;
 	scn->scn_phys.scn_state = DSS_SCANNING;
 	scn->scn_phys.scn_min_txg = 0;
@@ -856,8 +957,9 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 }
 
 /*
- * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
- * Can also be called to resume a paused scrub.
+ * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub,
+ * error scrub or resilver. Can also be called to resume a paused scrub or
+ * error scrub.
  */
 int
 dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
@@ -883,6 +985,26 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
 		return (0);
 	}
 
+	if (func == POOL_SCAN_ERRORSCRUB) {
+		if (dsl_errorscrub_is_paused(dp->dp_scan)) {
+			/*
+			 * got error scrub start cmd, resume paused error scrub.
+			 */
+			int err = dsl_scrub_set_pause_resume(scn->scn_dp,
+			    POOL_SCRUB_NORMAL);
+			if (err == 0) {
+				spa_event_notify(spa, NULL, NULL,
+				    ESC_ZFS_ERRORSCRUB_RESUME);
+				return (ECANCELED);
+			}
+			return (SET_ERROR(err));
+		}
+
+		return (dsl_sync_task(spa_name(dp->dp_spa),
+		    dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync,
+		    &func, 0, ZFS_SPACE_CHECK_RESERVED));
+	}
+
 	if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
 		/* got scrub start cmd, resume paused scrub */
 		int err = dsl_scrub_set_pause_resume(scn->scn_dp,
@@ -891,7 +1013,6 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
 			spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
 			return (SET_ERROR(ECANCELED));
 		}
-
 		return (SET_ERROR(err));
 	}
 
@@ -899,6 +1020,33 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
 	    dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
+static void
+dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	spa_t *spa = dp->dp_spa;
+
+	if (complete) {
+		spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH);
+		spa_history_log_internal(spa, "error scrub done", tx,
+		    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
+	} else {
+		spa_history_log_internal(spa, "error scrub canceled", tx,
+		    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
+	}
+
+	scn->errorscrub_phys.dep_state = complete ? DSS_FINISHED : DSS_CANCELED;
+	spa->spa_scrub_active = B_FALSE;
+	spa_errlog_rotate(spa);
+	scn->errorscrub_phys.dep_end_time = gethrestime_sec();
+	zap_cursor_fini(&scn->errorscrub_cursor);
+
+	if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
+		spa->spa_errata = 0;
+
+	ASSERT(!dsl_errorscrubbing(scn->scn_dp));
+}
+
 static void
 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 {
@@ -1045,6 +1193,92 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 	ASSERT(!dsl_scan_is_running(scn));
 }
 
+static int
+dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx)
+{
+	pool_scrub_cmd_t *cmd = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_scan_t *scn = dp->dp_scan;
+
+	if (*cmd == POOL_SCRUB_PAUSE) {
+		/*
+		 * can't pause an error scrub when there is no in-progress
+		 * error scrub.
+		 */
+		if (!dsl_errorscrubbing(dp))
+			return (SET_ERROR(ENOENT));
+
+		/* can't pause a paused error scrub */
+		if (dsl_errorscrub_is_paused(scn))
+			return (SET_ERROR(EBUSY));
+	} else if (*cmd != POOL_SCRUB_NORMAL) {
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	return (0);
+}
+
+static void
+dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
+{
+	pool_scrub_cmd_t *cmd = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	spa_t *spa = dp->dp_spa;
+	dsl_scan_t *scn = dp->dp_scan;
+
+	if (*cmd == POOL_SCRUB_PAUSE) {
+		spa->spa_scan_pass_errorscrub_pause = gethrestime_sec();
+		scn->errorscrub_phys.dep_paused_flags = B_TRUE;
+		dsl_errorscrub_sync_state(scn, tx);
+		spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED);
+	} else {
+		ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
+		if (dsl_errorscrub_is_paused(scn)) {
+			/*
+			 * We need to keep track of how much time we spend
+			 * paused per pass so that we can adjust the error scrub
+			 * rate shown in the output of 'zpool status'.
+			 */
+			spa->spa_scan_pass_errorscrub_spent_paused +=
+			    gethrestime_sec() -
+			    spa->spa_scan_pass_errorscrub_pause;
+
+			spa->spa_scan_pass_errorscrub_pause = 0;
+			scn->errorscrub_phys.dep_paused_flags = B_FALSE;
+
+			zap_cursor_init_serialized(
+			    &scn->errorscrub_cursor,
+			    spa->spa_meta_objset, spa->spa_errlog_last,
+			    scn->errorscrub_phys.dep_cursor);
+
+			dsl_errorscrub_sync_state(scn, tx);
+		}
+	}
+}
+
+static int
+dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx)
+{
+	(void) arg;
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+	/* can't cancel an error scrub when none is in progress */
+	if (!dsl_errorscrubbing(scn->scn_dp))
+		return (SET_ERROR(ENOENT));
+	return (0);
+}
+
+static void
+dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+	(void) arg;
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+	dsl_errorscrub_done(scn, B_FALSE, tx);
+	dsl_errorscrub_sync_state(scn, tx);
+	spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL,
+	    ESC_ZFS_ERRORSCRUB_ABORT);
+}
+
 static int
 dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
 {
@@ -1070,6 +1304,11 @@ dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
 int
 dsl_scan_cancel(dsl_pool_t *dp)
 {
+	if (dsl_errorscrubbing(dp)) {
+		return (dsl_sync_task(spa_name(dp->dp_spa),
+		    dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync,
+		    NULL, 3, ZFS_SPACE_CHECK_RESERVED));
+	}
 	return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
 	    dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 }
@@ -1136,6 +1375,12 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
 int
 dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
 {
+	if (dsl_errorscrubbing(dp)) {
+		return (dsl_sync_task(spa_name(dp->dp_spa),
+		    dsl_errorscrub_pause_resume_check,
+		    dsl_errorscrub_pause_resume_sync, &cmd, 3,
+		    ZFS_SPACE_CHECK_RESERVED));
+	}
 	return (dsl_sync_task(spa_name(dp->dp_spa),
 	    dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
 	    ZFS_SPACE_CHECK_RESERVED));
@@ -1422,6 +1667,42 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 	return (B_FALSE);
 }
 
+static boolean_t
+dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
+{
+	/*
+	 * We suspend if:
+	 *  - we have scrubbed for longer than the minimum time (default 1 sec
+	 *    for error scrub) and either someone is explicitly waiting for
+	 *    this txg to complete or we have used up all of the time in the
+	 *    txg timeout (default 5 sec).
+	 *  or
+	 *  - the spa is shutting down because this pool is being exported
+	 *    or the machine is rebooting.
+	 */
+	uint64_t curr_time_ns = gethrtime();
+	uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time;
+	uint64_t sync_time_ns = curr_time_ns -
+	    scn->scn_dp->dp_spa->spa_sync_starttime;
+	int mintime = zfs_scrub_min_time_ms;
+
+	if ((NSEC2MSEC(error_scrub_time_ns) > mintime &&
+	    (txg_sync_waiting(scn->scn_dp) ||
+	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
+	    spa_shutting_down(scn->scn_dp->dp_spa)) {
+		if (zb) {
+			dprintf("error scrub suspending at bookmark "
+			    "%llx/%llx/%llx/%llx\n",
+			    (longlong_t)zb->zb_objset,
+			    (longlong_t)zb->zb_object,
+			    (longlong_t)zb->zb_level,
+			    (longlong_t)zb->zb_blkid);
+		}
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
 typedef struct zil_scan_arg {
 	dsl_pool_t	*zsa_dp;
 	zil_header_t	*zsa_zh;
@@ -3352,6 +3633,19 @@ dsl_scan_active(dsl_scan_t *scn)
 	return ((used != 0) || (clones_left));
 }
 
+boolean_t
+dsl_errorscrub_active(dsl_scan_t *scn)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+
+	/* Inactive unless spa_load_state is SPA_LOAD_NONE and not shutting down. */
+	if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE ||
+	    spa_shutting_down(dp->dp_spa))
+		return (B_FALSE);
+
+	return (dsl_errorscrubbing(dp) ? B_TRUE : B_FALSE);
+}
+
 static boolean_t
 dsl_scan_check_deferred(vdev_t *vd)
 {
@@ -3568,6 +3862,387 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
 	return (0);
 }
 
+static void
+name_to_bookmark(char *buf, zbookmark_phys_t *zb)
+{
+	zb->zb_objset = zfs_strtonum(buf, &buf);
+	ASSERT(*buf == ':');
+	zb->zb_object = zfs_strtonum(buf + 1, &buf);
+	ASSERT(*buf == ':');
+	zb->zb_level = (int)zfs_strtonum(buf + 1, &buf);
+	ASSERT(*buf == ':');
+	zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
+	ASSERT(*buf == '\0');
+}
+
+static void
+name_to_object(char *buf, uint64_t *obj)
+{
+	*obj = zfs_strtonum(buf, &buf);
+	ASSERT(*buf == '\0');
+}
+
+/*
+ * Issue a scrub read for the single block that the bookmark zb points at.
+ * Nothing is issued if the dataset, dnode or block pointer cannot be looked
+ * up, if the block is a hole, or if the dataset's key is unavailable. No
+ * status is returned to the caller; the read is issued via scan_exec_io().
+ */
+static void
+read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	dsl_dataset_t *ds;
+	objset_t *os;
+	dnode_t *dn;
+	blkptr_t bp;
+	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW |
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB;
+
+	if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0)
+		return;
+
+	if (dmu_objset_from_ds(ds, &os) != 0)
+		goto out_ds;
+
+	/*
+	 * If the key is not loaded dbuf_dnode_findbp() will error out with
+	 * EACCES. However in that case dnode_hold() will eventually call
+	 * dbuf_read()->zio_wait() which may call spa_log_error(). This will
+	 * lead to a deadlock due to us holding the mutex spa_errlist_lock.
+	 * Avoid this by checking here if the keys are loaded, if not return.
+	 * If the keys are not loaded the head_errlog feature is meaningless
+	 * as we cannot figure out the birth txg of the block pointer.
+	 */
+	if (dsl_dataset_get_keystatus(ds->ds_dir) ==
+	    ZFS_KEYSTATUS_UNAVAILABLE)
+		goto out_ds;
+
+	if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0)
+		goto out_ds;
+
+	/*
+	 * dn_struct_rwlock is held as reader across both the block pointer
+	 * lookup and the issuing of the read.
+	 */
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL,
+	    NULL) != 0)
+		goto out_dn;
+
+	/* There is nothing to read for a hole. */
+	if (BP_IS_HOLE(&bp))
+		goto out_dn;
+
+	/* If it's an intent log block, failure is expected. */
+	if (zb.zb_level == ZB_ZIL_LEVEL)
+		zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+	ASSERT(!BP_IS_EMBEDDED(&bp));
+	scan_exec_io(dp, &bp, zio_flags, &zb, NULL);
+
+out_dn:
+	/* Drop the dnode lock and hold taken above. */
+	rw_exit(&dn->dn_struct_rwlock);
+	dnode_rele(dn, FTAG);
+
+out_ds:
+	/* Every path that held the dataset releases it here. */
+	dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Scrub error block zep in fs and its snapshots (cf. check_filesystem()).
+ * *count tracks scrubbed blocks against zfs_scrub_error_blocks_per_txg;
+ * EFAULT is returned when that limit is hit or the scrub must suspend.
+ */
+static int
+scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep,
+    int *count)
+{
+	dsl_dataset_t *ds;
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+
+	int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	uint64_t latest_txg;
+	uint64_t txg_to_consider = spa->spa_syncing_txg;
+	boolean_t check_snapshot = B_TRUE;
+
+	error = find_birth_txg(ds, zep, &latest_txg);
+
+	/*
+	 * If find_birth_txg() errors out, then err on the side of caution and
+	 * proceed. In worst case scenario scrub all objects. If zep->zb_birth
+	 * is 0 (e.g. in case of encryption with unloaded keys) also proceed to
+	 * scrub all objects.
+	 */
+	if (error == 0 && zep->zb_birth == latest_txg) {
+		/* Block neither freed nor rewritten. */
+		zbookmark_phys_t zb;
+		zep_to_zb(fs, zep, &zb);
+		scn->scn_zio_root = zio_root(spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL);
+		/* We have already acquired the config lock for spa */
+		read_by_block_level(scn, zb);
+
+		(void) zio_wait(scn->scn_zio_root);
+		scn->scn_zio_root = NULL;
+
+		scn->errorscrub_phys.dep_examined++;
+		scn->errorscrub_phys.dep_to_examine--;
+		(*count)++;
+		if ((*count) == zfs_scrub_error_blocks_per_txg ||
+		    dsl_error_scrub_check_suspend(scn, &zb)) {
+			dsl_dataset_rele(ds, FTAG);
+			return (SET_ERROR(EFAULT));
+		}
+
+		check_snapshot = B_FALSE;
+	} else if (error == 0) {
+		txg_to_consider = latest_txg;
+	}
+
+	/*
+	 * Retrieve the number of snapshots if the dataset is not a snapshot.
+	 */
+	uint64_t snap_count = 0;
+	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
+
+		error = zap_count(spa->spa_meta_objset,
+		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+
+		if (error != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			return (error);
+		}
+	}
+
+	if (snap_count == 0) {
+		/* Filesystem without snapshots. */
+		dsl_dataset_rele(ds, FTAG);
+		return (0);
+	}
+
+	uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+	uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+
+	dsl_dataset_rele(ds, FTAG);
+
+	/* Check only snapshots created from this file system. */
+	while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
+	    snap_obj_txg <= txg_to_consider) {
+
+		error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds);
+		if (error != 0)
+			return (error);
+
+		if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) {
+			snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+			snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+			dsl_dataset_rele(ds, FTAG);
+			continue;
+		}
+
+		boolean_t affected = B_TRUE;
+		if (check_snapshot) {
+			uint64_t blk_txg;
+			error = find_birth_txg(ds, zep, &blk_txg);
+
+			/*
+			 * Scrub the snapshot also when zb_birth == 0 or when
+			 * find_birth_txg() returns an error.
+			 */
+			affected = (error == 0 && zep->zb_birth == blk_txg) ||
+			    (error != 0) || (zep->zb_birth == 0);
+		}
+
+		/* Scrub snapshots. */
+		if (affected) {
+			zbookmark_phys_t zb;
+			zep_to_zb(snap_obj, zep, &zb);
+			scn->scn_zio_root = zio_root(spa, NULL, NULL,
+			    ZIO_FLAG_CANFAIL);
+			/* We have already acquired the config lock for spa */
+			read_by_block_level(scn, zb);
+
+			(void) zio_wait(scn->scn_zio_root);
+			scn->scn_zio_root = NULL;
+
+			scn->errorscrub_phys.dep_examined++;
+			scn->errorscrub_phys.dep_to_examine--;
+			(*count)++;
+			if ((*count) == zfs_scrub_error_blocks_per_txg ||
+			    dsl_error_scrub_check_suspend(scn, &zb)) {
+				dsl_dataset_rele(ds, FTAG);
+				return (SET_ERROR(EFAULT));
+			}
+		}
+		snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+		snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+		dsl_dataset_rele(ds, FTAG);
+	}
+	return (0);
+}
+
+/* Syncing-context entry point that advances an in-progress error scrub. */
+void
+dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	spa_t *spa = dp->dp_spa;
+	dsl_scan_t *scn = dp->dp_scan;
+
+	/* Only process scans in sync pass 1. */
+	if (spa_sync_pass(spa) > 1)
+		return;
+
+	/*
+	 * If the spa is shutting down, then stop scanning. This will
+	 * ensure that the scan does not dirty any new data during the
+	 * shutdown phase.
+	 */
+	if (spa_shutting_down(spa))
+		return;
+
+	if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) {
+		return;
+	}
+
+	if (dsl_scan_resilvering(scn->scn_dp)) {
+		/* cancel the error scrub if resilver started */
+		dsl_scan_cancel(scn->scn_dp);
+		return;
+	}
+
+	spa->spa_scrub_active = B_TRUE;
+	scn->scn_sync_start_time = gethrtime();
+
+	/*
+	 * zfs_scan_suspend_progress can be set to disable scrub progress.
+	 * See more detailed comment in dsl_scan_sync().
+	 */
+	if (zfs_scan_suspend_progress) {
+		uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+		int mintime = zfs_scrub_min_time_ms;
+
+		while (zfs_scan_suspend_progress &&
+		    !txg_sync_waiting(scn->scn_dp) &&
+		    !spa_shutting_down(scn->scn_dp->dp_spa) &&
+		    NSEC2MSEC(scan_time_ns) < mintime) {
+			delay(hz);
+			scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+		}
+		return;
+	}
+
+	int i = 0;
+	zap_attribute_t *za;
+	zbookmark_phys_t *zb;
+	boolean_t limit_exceeded = B_FALSE;
+
+	za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
+	zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP);
+
+	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+		for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
+		    zap_cursor_advance(&scn->errorscrub_cursor)) {
+			name_to_bookmark(za->za_name, zb);
+
+			scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+			    NULL, ZIO_FLAG_CANFAIL);
+			dsl_pool_config_enter(dp, FTAG);
+			read_by_block_level(scn, *zb);
+			dsl_pool_config_exit(dp, FTAG);
+
+			(void) zio_wait(scn->scn_zio_root);
+			scn->scn_zio_root = NULL;
+
+			scn->errorscrub_phys.dep_examined += 1;
+			scn->errorscrub_phys.dep_to_examine -= 1;
+			i++;
+			if (i == zfs_scrub_error_blocks_per_txg ||
+			    dsl_error_scrub_check_suspend(scn, zb)) {
+				limit_exceeded = B_TRUE;
+				break;
+			}
+		}
+
+		if (!limit_exceeded)
+			dsl_errorscrub_done(scn, B_TRUE, tx);
+
+		dsl_errorscrub_sync_state(scn, tx);
+		kmem_free(za, sizeof (*za));
+		kmem_free(zb, sizeof (*zb));
+		return;
+	}
+
+	int error = 0;
+	/* Stop visiting head datasets once this txg's limit is exceeded. */
+	for (; !limit_exceeded &&
+	    zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
+	    zap_cursor_advance(&scn->errorscrub_cursor)) {
+		zap_cursor_t *head_ds_cursor;
+		zap_attribute_t *head_ds_attr;
+		zbookmark_err_phys_t head_ds_block;
+
+		head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
+		head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+		uint64_t head_ds_err_obj = za->za_first_integer;
+		uint64_t head_ds;
+		name_to_object(za->za_name, &head_ds);
+		boolean_t config_held = B_FALSE;
+		uint64_t top_affected_fs;
+
+		for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset,
+		    head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor,
+		    head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) {
+
+			name_to_errphys(head_ds_attr->za_name, &head_ds_block);
+
+			/*
+			 * In case we are called from spa_sync the pool
+			 * config is already held.
+			 */
+			if (!dsl_pool_config_held(dp)) {
+				dsl_pool_config_enter(dp, FTAG);
+				config_held = B_TRUE;
+			}
+
+			error = find_top_affected_fs(spa,
+			    head_ds, &head_ds_block, &top_affected_fs);
+			if (error)
+				break;
+
+			error = scrub_filesystem(spa, top_affected_fs,
+			    &head_ds_block, &i);
+
+			/* EFAULT from scrub_filesystem() means "suspend". */
+			if (error == EFAULT) {
+				limit_exceeded = B_TRUE;
+				break;
+			}
+		}
+
+		zap_cursor_fini(head_ds_cursor);
+		kmem_free(head_ds_cursor, sizeof (*head_ds_cursor));
+		kmem_free(head_ds_attr, sizeof (*head_ds_attr));
+
+		if (config_held)
+			dsl_pool_config_exit(dp, FTAG);
+	}
+
+	kmem_free(za, sizeof (*za));
+	kmem_free(zb, sizeof (*zb));
+	if (!limit_exceeded)
+		dsl_errorscrub_done(scn, B_TRUE, tx);
+
+	dsl_errorscrub_sync_state(scn, tx);
+}
+
 /*
  * This is the primary entry point for scans that is called from syncing
  * context. Scans must happen entirely during syncing context so that we
@@ -4109,7 +4784,14 @@ dsl_scan_scrub_done(zio_t *zio)
 
 	if (zio->io_error && (zio->io_error != ECKSUM ||
 	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
-		atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors);
+		if (dsl_errorscrubbing(spa->spa_dsl_pool) &&
+		    !dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) {
+			atomic_inc_64(&spa->spa_dsl_pool->dp_scan
+			    ->errorscrub_phys.dep_errors);
+		} else {
+			atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys
+			    .scn_errors);
+		}
 	}
 }
 
@@ -4559,3 +5241,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,
 	"Process all resilvers immediately");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, U64, ZMOD_RW,
+	"Error blocks to be scrubbed in one txg");
+/* END CSTYLED */
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 51d6de9105f..1fc2c5e8c55 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -8173,6 +8173,7 @@ spa_scan_stop(spa_t *spa)
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 	if (dsl_scan_resilvering(spa->spa_dsl_pool))
 		return (SET_ERROR(EBUSY));
+
 	return (dsl_scan_cancel(spa->spa_dsl_pool));
 }
 
@@ -8198,6 +8199,10 @@ spa_scan(spa_t *spa, pool_scan_func_t func)
 		return (0);
 	}
 
+	if (func == POOL_SCAN_ERRORSCRUB &&
+	    !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG))
+		return (SET_ERROR(ENOTSUP));
+
 	return (dsl_scan(spa->spa_dsl_pool, func));
 }
 
@@ -9249,6 +9254,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
 		brt_sync(spa, txg);
 		ddt_sync(spa, txg);
 		dsl_scan_sync(dp, tx);
+		dsl_errorscrub_sync(dp, tx);
 		svr_sync(spa, tx);
 		spa_sync_upgrades(spa, tx);
 
diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c
index 5fe35278683..2e5c22c1149 100644
--- a/module/zfs/spa_errlog.c
+++ b/module/zfs/spa_errlog.c
@@ -110,7 +110,7 @@ errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len)
 /*
  * Convert a string to a err_phys.
  */
-static void
+void
 name_to_errphys(char *buf, zbookmark_err_phys_t *zep)
 {
 	zep->zb_object = zfs_strtonum(buf, &buf);
@@ -139,8 +139,7 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb)
 	ASSERT(*buf == '\0');
 }
 
-#ifdef _KERNEL
-static void
+void
 zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb)
 {
 	zb->zb_objset = dataset;
@@ -148,7 +147,6 @@ zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb)
 	zb->zb_level = zep->zb_level;
 	zb->zb_blkid = zep->zb_blkid;
 }
-#endif
 
 static void
 name_to_object(char *buf, uint64_t *obj)
@@ -238,8 +236,7 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth)
 	mutex_exit(&spa->spa_errlist_lock);
 }
 
-#ifdef _KERNEL
-static int
+int
 find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
     uint64_t *birth_txg)
 {
@@ -267,6 +264,34 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
 	return (error);
 }
 
+/*
+ * This function finds the oldest affected filesystem containing an error
+ * block.
+ */
+int
+find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
+    uint64_t *top_affected_fs)
+{
+	uint64_t oldest_dsobj;
+	int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth,
+	    &oldest_dsobj);
+	if (error != 0)
+		return (error);
+
+	dsl_dataset_t *ds;
+	error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj,
+	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	*top_affected_fs =
+	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
+	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+	return (0);
+}
+
+
+#ifdef _KERNEL
 /*
  * Copy the bookmark to the end of the user-space buffer which starts at
  * uaddr and has *count unused entries, and decrement *count by 1.
@@ -288,7 +313,8 @@ copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count)
  * Each time the error block is referenced by a snapshot or clone, add a
  * zbookmark_phys_t entry to the userspace array at uaddr. The array is
  * filled from the back and the in-out parameter *count is modified to be the
- * number of unused entries at the beginning of the array.
+ * number of unused entries at the beginning of the array. The function
+ * scrub_filesystem() is modelled after this one.
  */
 static int
 check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
@@ -449,28 +475,6 @@ out:
 	return (error);
 }
 
-static int
-find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
-    uint64_t *top_affected_fs)
-{
-	uint64_t oldest_dsobj;
-	int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth,
-	    &oldest_dsobj);
-	if (error != 0)
-		return (error);
-
-	dsl_dataset_t *ds;
-	error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj,
-	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
-	if (error != 0)
-		return (error);
-
-	*top_affected_fs =
-	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
-	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
-	return (0);
-}
-
 static int
 process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
     void *uaddr, uint64_t *count)
@@ -536,6 +540,21 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
 }
 #endif
 
+/* Return the number of entries in the last error log (spa_errlog_last) */
+uint64_t
+spa_get_last_errlog_size(spa_t *spa)
+{
+	uint64_t total = 0, count;
+	mutex_enter(&spa->spa_errlog_lock);
+
+	if (spa->spa_errlog_last != 0 &&
+	    zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+	    &count) == 0)
+		total += count;
+	mutex_exit(&spa->spa_errlog_lock);
+	return (total);
+}
+
 /*
  * If a healed bookmark matches an entry in the error log we stash it in a tree
  * so that we can later remove the related log entries in sync context.
@@ -1447,6 +1466,7 @@ spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds,
 /* error handling */
 EXPORT_SYMBOL(spa_log_error);
 EXPORT_SYMBOL(spa_approx_errlog_size);
+EXPORT_SYMBOL(spa_get_last_errlog_size);
 EXPORT_SYMBOL(spa_get_errlog);
 EXPORT_SYMBOL(spa_errlog_rotate);
 EXPORT_SYMBOL(spa_errlog_drain);
@@ -1456,6 +1476,10 @@ EXPORT_SYMBOL(spa_delete_dataset_errlog);
 EXPORT_SYMBOL(spa_swap_errlog);
 EXPORT_SYMBOL(sync_error_list);
 EXPORT_SYMBOL(spa_upgrade_errlog);
+EXPORT_SYMBOL(find_top_affected_fs);
+EXPORT_SYMBOL(find_birth_txg);
+EXPORT_SYMBOL(zep_to_zb);
+EXPORT_SYMBOL(name_to_errphys);
 #endif
 
 /* BEGIN CSTYLED */
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 54a0eeccf27..89e1ce7165d 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -2579,9 +2579,18 @@ spa_scan_stat_init(spa_t *spa)
 		spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
 	else
 		spa->spa_scan_pass_scrub_pause = 0;
+
+	if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan))
+		spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start;
+	else
+		spa->spa_scan_pass_errorscrub_pause = 0;
+
 	spa->spa_scan_pass_scrub_spent_paused = 0;
 	spa->spa_scan_pass_exam = 0;
 	spa->spa_scan_pass_issued = 0;
+
+	/* error scrub stats */
+	spa->spa_scan_pass_errorscrub_spent_paused = 0;
 }
 
 /*
@@ -2592,8 +2601,10 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
 {
 	dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
 
-	if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
+	if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE &&
+	    scn->errorscrub_phys.dep_func == POOL_SCAN_NONE))
 		return (SET_ERROR(ENOENT));
+
 	memset(ps, 0, sizeof (pool_scan_stat_t));
 
 	/* data stored on disk */
@@ -2616,6 +2627,18 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
 	ps->pss_issued =
 	    scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
 
+	/* error scrub data stored on disk */
+	ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func;
+	ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state;
+	ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time;
+	ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time;
+	ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined;
+	ps->pss_error_scrub_to_be_examined =
+	    scn->errorscrub_phys.dep_to_examine;
+
+	/* error scrub data not stored on disk */
+	ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause;
+
 	return (0);
 }
 
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index efaf6f9b390..f91a2f3bbca 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -1685,6 +1685,47 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc)
 	return (error);
 }
 
+/*
+ * inputs:
+ * poolname             name of the pool
+ * scan_type            scan func (pool_scan_func_t)
+ * scan_command         scrub pause/resume flag (pool_scrub_cmd_t)
+ */
+static const zfs_ioc_key_t zfs_keys_pool_scrub[] = {
+	{"scan_type",		DATA_TYPE_UINT64,	0},
+	{"scan_command",	DATA_TYPE_UINT64,	0},
+};
+
+static int
+zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	spa_t *spa;
+	int error;
+	uint64_t scan_type, scan_cmd;
+
+	if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0 ||
+	    nvlist_lookup_uint64(innvl, "scan_command", &scan_cmd) != 0)
+		return (SET_ERROR(EINVAL));
+
+	/* Reject out-of-range values before the pool is opened. */
+	if (scan_type >= POOL_SCAN_FUNCS || scan_cmd >= POOL_SCRUB_FLAGS_END)
+		return (SET_ERROR(EINVAL));
+
+	if ((error = spa_open(poolname, &spa, FTAG)) != 0)
+		return (error);
+
+	if (scan_cmd == POOL_SCRUB_PAUSE) {
+		error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE);
+	} else if (scan_type == POOL_SCAN_NONE) {
+		error = spa_scan_stop(spa);
+	} else {
+		error = spa_scan(spa, scan_type);
+	}
+
+	spa_close(spa, FTAG);
+	return (error);
+}
+
 static int
 zfs_ioc_pool_freeze(zfs_cmd_t *zc)
 {
@@ -7218,6 +7259,11 @@ zfs_ioctl_init(void)
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
 	    zfs_keys_vdev_set_props, ARRAY_SIZE(zfs_keys_vdev_set_props));
 
+	zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB,
+	    zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME,
+	    POOL_CHECK_NONE, B_TRUE, B_TRUE,
+	    zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub));
+
 	/* IOCTLS that use the legacy function signature */
 
 	zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 62d9cbeb6d9..9ed1a6d37a9 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -479,7 +479,9 @@ tags = ['functional', 'cli_root', 'zpool_resilver']
 tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos',
     'zpool_scrub_004_pos', 'zpool_scrub_005_pos',
     'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing',
-    'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies']
+    'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies',
+    'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos',
+    'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos']
 tags = ['functional', 'cli_root', 'zpool_scrub']
 
 [tests/functional/cli_root/zpool_set]
diff --git a/tests/zfs-tests/cmd/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check.c
index a1dfaefd710..c661718a296 100644
--- a/tests/zfs-tests/cmd/libzfs_input_check.c
+++ b/tests/zfs-tests/cmd/libzfs_input_check.c
@@ -27,6 +27,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_bootenv.h>
+#include <sys/fs/zfs.h>
 
 /*
  * Test the nvpair inputs for the non-legacy zfs ioctl commands.
@@ -688,6 +689,17 @@ test_vdev_trim(const char *pool)
 	nvlist_free(required);
 }
 
+/* Test with invalid values */
+static void
+test_scrub(const char *pool)
+{
+	nvlist_t *required = fnvlist_alloc();
+	fnvlist_add_uint64(required, "scan_type", POOL_SCAN_FUNCS + 1);
+	fnvlist_add_uint64(required, "scan_command", POOL_SCRUB_FLAGS_END + 1);
+	IOC_INPUT_TEST(ZFS_IOC_POOL_SCRUB, pool, required, NULL, EINVAL);
+	nvlist_free(required);
+}
+
 static int
 zfs_destroy(const char *dataset)
 {
@@ -868,6 +880,8 @@ zfs_ioc_input_tests(const char *pool)
 	test_set_bootenv(pool);
 	test_get_bootenv(pool);
 
+	test_scrub(pool);
+
 	/*
 	 * cleanup
 	 */
@@ -1022,6 +1036,7 @@ validate_ioc_values(void)
 	CHECK(ZFS_IOC_BASE + 82 == ZFS_IOC_GET_BOOKMARK_PROPS);
 	CHECK(ZFS_IOC_BASE + 83 == ZFS_IOC_WAIT);
 	CHECK(ZFS_IOC_BASE + 84 == ZFS_IOC_WAIT_FS);
+	CHECK(ZFS_IOC_BASE + 87 == ZFS_IOC_POOL_SCRUB);
 	CHECK(ZFS_IOC_PLATFORM_BASE + 1 == ZFS_IOC_EVENTS_NEXT);
 	CHECK(ZFS_IOC_PLATFORM_BASE + 2 == ZFS_IOC_EVENTS_CLEAR);
 	CHECK(ZFS_IOC_PLATFORM_BASE + 3 == ZFS_IOC_EVENTS_SEEK);
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index 8521f271be5..133f8387dda 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -1969,6 +1969,12 @@ function is_pool_scrubbing #pool <verbose>
 	check_pool_status "$1" "scan" "scrub in progress since " $2
 }
 
+function is_pool_error_scrubbing #pool <verbose>
+{
+	check_pool_status "$1" "scrub" "error scrub in progress since " $2
+	return $?
+}
+
 function is_pool_scrubbed #pool <verbose>
 {
 	check_pool_status "$1" "scan" "scrub repaired" $2
@@ -1979,11 +1985,23 @@ function is_pool_scrub_stopped #pool <verbose>
 	check_pool_status "$1" "scan" "scrub canceled" $2
 }
 
+function is_pool_error_scrub_stopped #pool <verbose>
+{
+	check_pool_status "$1" "scrub" "error scrub canceled on " $2
+	return $?
+}
+
 function is_pool_scrub_paused #pool <verbose>
 {
 	check_pool_status "$1" "scan" "scrub paused since " $2
 }
 
+function is_pool_error_scrub_paused #pool <verbose>
+{
+	check_pool_status "$1" "scrub" "error scrub paused since " $2
+	return $?
+}
+
 function is_pool_removing #pool
 {
 	check_pool_status "$1" "remove" "in progress since "
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 3e4120f52ca..ad4aec54329 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1153,6 +1153,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \
+	functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \
+	functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \
+	functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \
+	functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh \
 	functional/cli_root/zpool_set/cleanup.ksh \
 	functional/cli_root/zpool_set/setup.ksh \
 	functional/cli_root/zpool/setup.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh
new file mode 100755
index 00000000000..e414cd1beaa
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh
@@ -0,0 +1,79 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2019, Delphix. All rights reserved.
+# Copyright (c) 2023, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
+
+#
+# DESCRIPTION:
+#	Verify scrub -e, -p, and -s show the right status.
+#
+# STRATEGY:
+#	1. Create a pool and create a 10MB file in it.
+#	2. Start an error scrub (-e) and verify it's doing a scrub.
+#	3. Pause error scrub (-p) and verify it's paused.
+#	4. Try to pause a paused error scrub (-p) and make sure that fails.
+#	5. Resume the paused error scrub and verify again it's doing a scrub.
+#	6. Verify zpool scrub -s succeeds when the system is error scrubbing.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0
+	log_must zinject -c all
+	rm -f /$TESTPOOL/10m_file
+}
+
+log_onexit cleanup
+
+log_assert "Verify scrub -e, -p, and -s show the right status."
+
+log_must fio --rw=write --name=job --size=10M --filename=/$TESTPOOL/10m_file
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+log_must zinject -t data -e checksum -f 100 -am /$TESTPOOL/10m_file
+
+# create some error blocks
+dd if=/$TESTPOOL/10m_file bs=1M count=1 || true
+
+# sync error blocks to disk
+log_must sync_pool $TESTPOOL
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+log_must zpool scrub -e $TESTPOOL
+log_must is_pool_error_scrubbing $TESTPOOL true
+log_must zpool scrub -p $TESTPOOL
+log_must is_pool_error_scrub_paused $TESTPOOL true
+log_mustnot zpool scrub -p $TESTPOOL
+log_must is_pool_error_scrub_paused $TESTPOOL true
+log_must zpool scrub -e $TESTPOOL
+log_must is_pool_error_scrubbing $TESTPOOL true
+log_must zpool scrub -s $TESTPOOL
+log_must is_pool_error_scrub_stopped $TESTPOOL true
+
+log_pass "Verified scrub -e, -p, and -s show expected status."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh
new file mode 100755
index 00000000000..daa11c3949c
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh
@@ -0,0 +1,99 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2019, Delphix. All rights reserved.
+# Copyright (c) 2023, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
+
+#
+# DESCRIPTION:
+#	Verify regular scrub and error scrub can't run at the same time.
+#
+# STRATEGY:
+#	1. Create a pool and create a 10MB file in it.
+#	2. Start a scrub and verify it's doing a scrub.
+#	3. Start an error scrub (-e) and verify it fails.
+#	4. Pause scrub (-p) and verify it's paused.
+#	5. Start an error scrub (-e) and verify it fails again.
+#	6. Resume the paused scrub, verify it and cancel it.
+#	7. Start an error scrub (-e) and verify it's doing an error scrub.
+#	8. Start a scrub and verify it fails.
+#	9. Cancel the error scrub (-s) and verify it is canceled.
+#	10. Start scrub, verify it, cancel it and verify it.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0	# undo the test's =1
+	log_must zinject -c all		# cancel the injected checksum fault
+	rm -f /$TESTPOOL/10m_file	# file written by fio in the test
+}
+
+log_onexit cleanup
+
+log_assert "Verify regular scrub and error scrub can't run at the same time."
+
+log_must fio --rw=write --name=job --size=10M --filename=/$TESTPOOL/10m_file
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+log_must zinject -t data -e checksum -f 100 -am /$TESTPOOL/10m_file
+
+# create some error blocks; the read may fail and its data is not needed
+dd if=/$TESTPOOL/10m_file of=/dev/null bs=1M count=1 || true
+# sync error blocks to disk
+log_must sync_pool $TESTPOOL
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+
+log_must zpool scrub $TESTPOOL
+log_must is_pool_scrubbing $TESTPOOL true
+log_mustnot zpool scrub -e $TESTPOOL	# regular scrub is running
+log_must zpool scrub -p $TESTPOOL
+log_must is_pool_scrub_paused $TESTPOOL true
+log_mustnot zpool scrub -e $TESTPOOL	# regular scrub is only paused
+log_must zpool scrub $TESTPOOL		# resumes the paused scrub
+log_must is_pool_scrubbing $TESTPOOL true
+log_must zpool scrub -s $TESTPOOL
+log_must is_pool_scrub_stopped $TESTPOOL true
+
+# create some error blocks before error scrub is requested.
+dd if=/$TESTPOOL/10m_file of=/dev/null bs=1M count=1 || true
+# sync error blocks to disk
+log_must sync_pool $TESTPOOL
+
+log_must zpool scrub -e $TESTPOOL
+log_must is_pool_error_scrubbing $TESTPOOL true
+log_mustnot zpool scrub $TESTPOOL	# error scrub is running
+log_must zpool scrub -s $TESTPOOL
+log_must is_pool_error_scrub_stopped $TESTPOOL true
+
+log_must zpool scrub $TESTPOOL
+log_must is_pool_scrubbing $TESTPOOL true
+log_must zpool scrub -s $TESTPOOL
+log_must is_pool_scrub_stopped $TESTPOOL true
+
+log_pass "Verified regular scrub and error scrub can't run at the same time."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh
new file mode 100755
index 00000000000..d0066fdbb4a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh
@@ -0,0 +1,109 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2019, Delphix. All rights reserved.
+# Copyright (c) 2023, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
+
+#
+# DESCRIPTION:
+#	Verify error scrub clears the errorlog, if errors no longer exist.
+#
+# STRATEGY:
+#	1. Create a pool and create file in it.
+#	2. Zinject errors and read using dd to log errors to disk.
+#	3. Make sure file name is mentioned in the list of error files.
+#	4. Start error scrub and wait for it to finish.
+#	5. Check scrub ran and errors are still reported.
+#	6. Clear corruption and error scrub again.
+#	7. Check scrub ran and errors are cleared.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	zinject -c all			# cancel the injected checksum fault
+	destroy_pool $TESTPOOL2		# pool created by this test
+	rm -f $TESTDIR/vdev_a		# backing file created by this test
+}
+
+log_onexit cleanup
+
+log_assert "Verify error scrub clears the errorlog, if errors no longer exist."
+
+truncate -s $MINVDEVSIZE $TESTDIR/vdev_a
+log_must zpool create -f -O primarycache=none $TESTPOOL2 $TESTDIR/vdev_a
+log_must zfs create $TESTPOOL2/$TESTFS1
+typeset file=/$TESTPOOL2/$TESTFS1/$TESTFILE0
+log_must dd if=/dev/urandom of=$file bs=2M count=10
+
+lastfs="$(zfs list -r $TESTPOOL2 | tail -1 | awk '{print $1}')"
+for i in {1..3}; do	# fs@snap1 -> clone1@snap2 -> clone2@snap3 -> clone3
+	log_must zfs snap $lastfs@snap$i
+	log_must zfs clone $lastfs@snap$i $TESTPOOL2/clone$i
+	lastfs="$(zfs list -r $TESTPOOL2/clone$i | tail -1 | awk '{print $1}')"
+done
+
+log_must zinject -t data -e checksum -f 100 -a $file
+dd if=$file of=/dev/null bs=2M count=10	# read the file so errors get logged
+
+# Important: sync error log to disk
+log_must sync_pool $TESTPOOL2
+
+# Check errors are reported for the file in the fs, each snapshot and clone
+log_must zpool status -v $TESTPOOL2
+log_must eval "zpool status -v $TESTPOOL2 | \
+    grep \"Permanent errors have been detected\""
+log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1@snap1:/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone1/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone1@snap2:/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone2/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'"
+
+# Error scrub with the fault still injected: the same errors must remain
+log_must zpool scrub -e -w $TESTPOOL2
+log_must eval "zpool status -v | grep 'error blocks'"
+log_must zpool status -v $TESTPOOL2
+log_must eval "zpool status -v $TESTPOOL2 | \
+    grep \"Permanent errors have been detected\""
+log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1@snap1:/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone1/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone1@snap2:/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone2/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'"
+
+# Cancel the fault and error scrub again: no file may be listed any more
+log_must zinject -c all
+log_must zpool scrub -e -w $TESTPOOL2
+log_must zpool status -v $TESTPOOL2
+log_must eval "zpool status -v | grep 'error blocks'"
+log_mustnot eval "zpool status -v | grep '$TESTFILE0'"
+
+
+log_pass "Verify error scrub clears the errorlog, if errors no longer exist."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh
new file mode 100755
index 00000000000..c88b9b0c8d3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh
@@ -0,0 +1,54 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2023, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
+
+#
+# DESCRIPTION:
+#	Verify error scrub cannot run without the head_errlog feature.
+#
+# STRATEGY:
+#	1. Create a pool with head_errlog disabled.
+#	2. Run an error scrub and verify it is not supported.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	destroy_pool $TESTPOOL2		# pool created by this test
+	rm -f $TESTDIR/vdev_a		# backing file created by this test
+}
+
+log_onexit cleanup
+
+log_assert "Verify error scrub cannot run without the head_errlog feature."
+
+truncate -s $MINVDEVSIZE $TESTDIR/vdev_a
+log_must zpool create -f -o feature@head_errlog=disabled $TESTPOOL2 $TESTDIR/vdev_a
+log_mustnot zpool scrub -ew $TESTPOOL2	# must fail: head_errlog is disabled
+
+log_pass "Verify error scrub cannot run without the head_errlog feature."
+

From 577e835f30c9b92ed8126eb4e8fb17cb0e411c04 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 19 May 2023 13:05:09 -0700
Subject: [PATCH 10/11] Probe vdevs before marking removed

Before allowing the ZED to mark a vdev as REMOVED due to a
hotplug event confirm that it is non-responsive with probe.
Any device which can be successfully probed should be left
ONLINE to prevent a healthy pool from being incorrectly
SUSPENDED.  This may occur for at least the following two
scenarios.

1) Drive expansion (zpool online -e) in VMware environments.
   If, during the partition resize operation, a partition is
   removed and re-created then udev will send a removed event.

2) Re-scanning the namespaces of an NVMe device (nvme ns-rescan)
   may result in a udev remove and add event being delivered.

Finally, update the ZED to only kick in a spare when the
removal was successful.

Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #14859
Closes #14861
---
 cmd/zed/agents/zfs_retire.c |  8 +++++---
 module/zfs/vdev.c           | 11 +++++++++--
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index 28714ec295b..f83ae09259a 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -445,14 +445,16 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 			return;
 
 		/* Remove the vdev since device is unplugged */
+		int remove_status = 0;
 		if (l2arc || (strcmp(class, "resource.fs.zfs.removed") == 0)) {
-			int status = zpool_vdev_remove_wanted(zhp, devname);
+			remove_status = zpool_vdev_remove_wanted(zhp, devname);
 			fmd_hdl_debug(hdl, "zpool_vdev_remove_wanted '%s'"
-			    ", ret:%d", devname, status);
+			    ", err:%d", devname, libzfs_errno(zhdl));
 		}
 
 		/* Replace the vdev with a spare if its not a l2arc */
-		if (!l2arc && (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
+		if (!l2arc && !remove_status &&
+		    (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
 		    replace_with_spare(hdl, zhp, vdev) == B_FALSE)) {
 			/* Could not handle with spare */
 			fmd_hdl_debug(hdl, "no spare for '%s'", devname);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 4bfd95861e0..c243dddb7e6 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -4097,11 +4097,18 @@ vdev_remove_wanted(spa_t *spa, uint64_t guid)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	/*
-	 * If the vdev is already removed, then don't do anything.
+	 * If the vdev is already removed, or expanding which can trigger
+	 * repartition add/remove events, then don't do anything.
 	 */
-	if (vd->vdev_removed)
+	if (vd->vdev_removed || vd->vdev_expanding)
 		return (spa_vdev_state_exit(spa, NULL, 0));
 
+	/*
+	 * Confirm the vdev has been removed, otherwise don't do anything.
+	 */
+	if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL)))
+		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST)));
+
 	vd->vdev_remove_wanted = B_TRUE;
 	spa_async_request(spa, SPA_ASYNC_REMOVE);
 

From ad0a554614b096698d9969340c4c593690042d5b Mon Sep 17 00:00:00 2001
From: Brian Atkinson <batkinson@lanl.gov>
Date: Fri, 19 May 2023 16:05:53 -0400
Subject: [PATCH 11/11] Hold db_mtx when updating db_state

Commit 555ef90 did some general code refactoring for
dmu_buf_will_not_fill() and dmu_buf_will_fill(). However, the db_mtx was
not held when update db->db_state in those code block. The rest of the
dbuf code always holds the db_mtx when updating db_state. This is
important because cv_wait() db_changed is used to check for db_state
changes.

Update dmu_buf_will_not_fill() and dmu_buf_will_fill() to hold the
db_mtx when updating db_state.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Brian Atkinson <batkinson@lanl.gov>
Closes #14875
---
 module/zfs/dbuf.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 049a62c1c17..272e712586f 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -2716,8 +2716,10 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
+	mutex_enter(&db->db_mtx);
 	db->db_state = DB_NOFILL;
 	DTRACE_SET_STATE(db, "allocating NOFILL buffer");
+	mutex_exit(&db->db_mtx);
 
 	dbuf_noread(db);
 	(void) dbuf_dirty(db, tx);
@@ -2736,6 +2738,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
 	    dmu_tx_private_ok(tx));
 
+	mutex_enter(&db->db_mtx);
 	if (db->db_state == DB_NOFILL) {
 		/*
 		 * Block cloning: We will be completely overwriting a block
@@ -2743,11 +2746,10 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 		 * pending clone and mark the block as uncached. This will be
 		 * as if the clone was never done.
 		 */
-		mutex_enter(&db->db_mtx);
 		VERIFY(!dbuf_undirty(db, tx));
-		mutex_exit(&db->db_mtx);
 		db->db_state = DB_UNCACHED;
 	}
+	mutex_exit(&db->db_mtx);
 
 	dbuf_noread(db);
 	(void) dbuf_dirty(db, tx);