From 380c25f6402d099310f1cf6f6d1e17e65531f958 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 13 Oct 2023 13:41:11 -0400 Subject: [PATCH 01/11] FreeBSD: Improve taskq wrapper - Group tqent_task and tqent_timeout_task into a union. They are never used same time. This shrinks taskq_ent_t from 192 to 160 bytes. - Remove tqent_registered. Use tqent_id != 0 instead. - Remove tqent_cancelled. Use taskqueue pending counter instead. - Change tqent_type into uint_t. We don't need to pack it any more. - Change tqent_rc into uint_t, matching refcount(9). - Take shared locks in taskq_lookup(). - Call proper taskqueue_drain_timeout() for TIMEOUT_TASK in taskq_cancel_id() and taskq_wait_id(). - Switch from CK_LIST to regular LIST. Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Reviewed-by: Mateusz Guzik Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15356 --- include/os/freebsd/spl/sys/taskq.h | 18 ++++---- module/os/freebsd/spl/spl_taskq.c | 74 +++++++++++++++--------------- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h index 30579b39171..b23a939b3aa 100644 --- a/include/os/freebsd/spl/sys/taskq.h +++ b/include/os/freebsd/spl/sys/taskq.h @@ -30,9 +30,9 @@ #include #include +#include #include #include -#include #ifdef __cplusplus extern "C" { @@ -48,16 +48,16 @@ typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); typedef struct taskq_ent { - struct task tqent_task; - struct timeout_task tqent_timeout_task; + union { + struct task tqent_task; + struct timeout_task tqent_timeout_task; + }; task_func_t *tqent_func; void *tqent_arg; - taskqid_t tqent_id; - CK_LIST_ENTRY(taskq_ent) tqent_hash; - uint8_t tqent_type; - uint8_t tqent_registered; - uint8_t tqent_cancelled; - volatile uint32_t tqent_rc; + taskqid_t tqent_id; + LIST_ENTRY(taskq_ent) tqent_hash; + uint_t tqent_type; + volatile uint_t tqent_rc; } taskq_ent_t; /* diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c index ba22c77b69c..daefe355953 100644 --- a/module/os/freebsd/spl/spl_taskq.c +++ b/module/os/freebsd/spl/spl_taskq.c @@ -30,8 +30,6 @@ __FBSDID("$FreeBSD$"); #include -#include -#include #include #include #include @@ -70,7 +68,7 @@ extern int uma_align_cache; static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures"); -static CK_LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl; +static LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl; static unsigned long tqenthash; static unsigned long tqenthashlock; static struct sx *tqenthashtbl_lock; @@ -80,8 +78,8 @@ static taskqid_t tqidnext; #define TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash]) #define TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)]) +#define NORMAL_TASK 0 #define TIMEOUT_TASK 1 -#define NORMAL_TASK 2 static void system_taskq_init(void *arg) @@ -121,7 +119,7 @@ system_taskq_fini(void *arg) for (i = 0; i < tqenthashlock + 1; i++) sx_destroy(&tqenthashtbl_lock[i]); for (i = 0; i < tqenthash + 1; i++) - VERIFY(CK_LIST_EMPTY(&tqenthashtbl[i])); + VERIFY(LIST_EMPTY(&tqenthashtbl[i])); free(tqenthashtbl_lock, M_TASKQ); free(tqenthashtbl, M_TASKQ); } @@ -162,27 +160,27 @@ taskq_lookup(taskqid_t tqid) { taskq_ent_t *ent = NULL; - sx_xlock(TQIDHASHLOCK(tqid)); - CK_LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) { + if (tqid == 0) + return (NULL); + sx_slock(TQIDHASHLOCK(tqid)); + LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) { if (ent->tqent_id == tqid) break; } if (ent != NULL) refcount_acquire(&ent->tqent_rc); - sx_xunlock(TQIDHASHLOCK(tqid)); + sx_sunlock(TQIDHASHLOCK(tqid)); return (ent); } static taskqid_t taskq_insert(taskq_ent_t *ent) { - taskqid_t tqid; + taskqid_t tqid = __taskq_genid(); - tqid = __taskq_genid(); ent->tqent_id = tqid; - ent->tqent_registered = B_TRUE; sx_xlock(TQIDHASHLOCK(tqid)); - CK_LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash); + LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash); sx_xunlock(TQIDHASHLOCK(tqid)); return (tqid); } @@ -192,13 +190,14 @@ taskq_remove(taskq_ent_t *ent) { taskqid_t tqid = ent->tqent_id; - if (!ent->tqent_registered) + if (tqid == 0) return; - sx_xlock(TQIDHASHLOCK(tqid)); - CK_LIST_REMOVE(ent, tqent_hash); + if (ent->tqent_id != 0) { + LIST_REMOVE(ent, tqent_hash); + ent->tqent_id = 0; + } sx_xunlock(TQIDHASHLOCK(tqid)); - ent->tqent_registered = B_FALSE; } static void @@ -285,21 +284,22 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid) int rc; taskq_ent_t *ent; - if (tid == 0) - return (0); - if ((ent = taskq_lookup(tid)) == NULL) return (0); - ent->tqent_cancelled = B_TRUE; - if (ent->tqent_type == TIMEOUT_TASK) { + if (ent->tqent_type == NORMAL_TASK) { + rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); + if (rc == EBUSY) + taskqueue_drain(tq->tq_queue, &ent->tqent_task); + } else { rc = taskqueue_cancel_timeout(tq->tq_queue, &ent->tqent_timeout_task, &pend); - } else - rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); - if (rc == EBUSY) { - taskqueue_drain(tq->tq_queue, &ent->tqent_task); - } else if (pend) { + if (rc == EBUSY) { + taskqueue_drain_timeout(tq->tq_queue, + &ent->tqent_timeout_task); + } + } + if (pend) { /* * Tasks normally free themselves when run, but here the task * was cancelled so it did not free itself. @@ -312,12 +312,13 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid) } static void -taskq_run(void *arg, int pending __unused) +taskq_run(void *arg, int pending) { taskq_ent_t *task = arg; - if (!task->tqent_cancelled) - task->tqent_func(task->tqent_arg); + if (pending == 0) + return; + task->tqent_func(task->tqent_arg); taskq_free(task); } @@ -345,7 +346,6 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, task->tqent_func = func; task->tqent_arg = arg; task->tqent_type = TIMEOUT_TASK; - task->tqent_cancelled = B_FALSE; refcount_init(&task->tqent_rc, 1); tqid = taskq_insert(task); TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0, @@ -379,7 +379,6 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) refcount_init(&task->tqent_rc, 1); task->tqent_func = func; task->tqent_arg = arg; - task->tqent_cancelled = B_FALSE; task->tqent_type = NORMAL_TASK; tqid = taskq_insert(task); TASK_INIT(&task->tqent_task, prio, taskq_run, task); @@ -388,10 +387,12 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) } static void -taskq_run_ent(void *arg, int pending __unused) +taskq_run_ent(void *arg, int pending) { taskq_ent_t *task = arg; + if (pending == 0) + return; task->tqent_func(task->tqent_arg); } @@ -406,8 +407,6 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags, * can go at the front of the queue. */ prio = !!(flags & TQ_FRONT); - task->tqent_cancelled = B_FALSE; - task->tqent_registered = B_FALSE; task->tqent_id = 0; task->tqent_func = func; task->tqent_arg = arg; @@ -427,12 +426,13 @@ taskq_wait_id(taskq_t *tq, taskqid_t tid) { taskq_ent_t *ent; - if (tid == 0) - return; if ((ent = taskq_lookup(tid)) == NULL) return; - taskqueue_drain(tq->tq_queue, &ent->tqent_task); + if (ent->tqent_type == NORMAL_TASK) + taskqueue_drain(tq->tq_queue, &ent->tqent_task); + else + taskqueue_drain_timeout(tq->tq_queue, &ent->tqent_timeout_task); taskq_free(ent); } From c0e58995e33479a9c1d97fb2a19f8f507cc954b7 Mon Sep 17 00:00:00 2001 From: John Wren Kennedy Date: Fri, 13 Oct 2023 12:15:09 -0600 Subject: [PATCH 02/11] Large sync writes perform worse with slog For synchronous write workloads with large IO sizes, a pool configured with a slog performs worse than one with an embedded zil: sequential_writes 1m sync ios, 16 threads Write IOPS: 1292 438 -66.10% Write Bandwidth: 1323570 448910 -66.08% Write Latency: 12128400 36330970 3.0x sequential_writes 1m sync ios, 32 threads Write IOPS: 1293 430 -66.74% Write Bandwidth: 1324184 441188 -66.68% Write Latency: 24486278 74028536 3.0x The reason is the `zil_slog_bulk` variable. In `zil_lwb_write_open`, if a zil block is greater than 768K, the priority of the write is downgraded from sync to async. Increasing the value allows greater throughput. To select a value for this PR, I ran an fio workload with the following values for `zil_slog_bulk`: zil_slog_bulk KiB/s 1048576 422132 2097152 478935 4194304 533645 8388608 623031 12582912 827158 16777216 1038359 25165824 1142210 33554432 1211472 50331648 1292847 67108864 1308506 100663296 1306821 134217728 1304998 At 64M, the results with a slog are now improved to parity with an embedded zil: sequential_writes 1m sync ios, 16 threads Write IOPS: 438 1288 2.9x Write Bandwidth: 448910 1319062 2.9x Write Latency: 36330970 12163408 -66.52% sequential_writes 1m sync ios, 32 threads Write IOPS: 430 1290 3.0x Write Bandwidth: 441188 1321693 3.0x Write Latency: 74028536 24519698 -66.88% None of the other tests in the performance suite (run with a zil or slog) had a significant change, including the random_write_zil tests, which use multiple datasets. Reviewed-by: Alexander Motin Reviewed-by: Tony Nguyen Signed-off-by: John Wren Kennedy Closes #14378 --- man/man4/zfs.4 | 2 +- module/zfs/zil.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 71a3e67ee67..5f89f6adf1e 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2172,7 +2172,7 @@ if a volatile out-of-order write cache is enabled. Disable intent logging replay. Can be disabled for recovery from corrupted ZIL. . -.It Sy zil_slog_bulk Ns = Ns Sy 786432 Ns B Po 768 KiB Pc Pq u64 +.It Sy zil_slog_bulk Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64 Limit SLOG write size per commit executed with synchronous priority. Any writes above that will be executed with lower (asynchronous) priority to limit potential SLOG device abuse by single active ZIL writer. diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 18c6cbf028b..a1188613699 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -145,7 +145,7 @@ static int zil_nocacheflush = 0; * Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. */ -static uint64_t zil_slog_bulk = 768 * 1024; +static uint64_t zil_slog_bulk = 64 * 1024 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; From f0f330e1212ee3924a3eadbdb40e7d71e0fb0a69 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Fri, 20 Oct 2023 10:29:02 -0600 Subject: [PATCH 03/11] Fix ZED auto-replace for VDEVs using by-id paths The change is simple -- restore the original code so that the VDEV path is updated when using by-id paths. The more challenging part was to devise a second ZTS test, that would test auto-replace for 'by-id' and help prevent a future regression. With that new test, we can now do an A|B test with , and without, the fix to confirm that auto-replace for by-id paths works. The existing auto-replace test, functional/fault/auto_replace_001_pos, will confirm that we didn't break auto-replace for 'by-vdev' paths. In the original functional/fault/auto_replace_001_pos test, the disk wipe (using dd) was not effective in removing the partitioning since the kernel was never informed of the wipe. Added a call to wipefs(8) so that the kernel is informed and ZED will re-partition the device. Added a validation step that the re-partitioning occurred by confirming that the GPT partition UUID changes. Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Reviewed-by: Rob Norris Reviewed-by: Tony Hutter Signed-off-by: Don Brady Closes #15363 --- cmd/zed/agents/zfs_mod.c | 55 +++-- include/libzutil.h | 2 +- lib/libzutil/os/linux/zutil_import_os.c | 5 +- tests/runfiles/linux.run | 8 +- tests/test-runner/bin/zts-report.py.in | 1 + tests/zfs-tests/include/commands.cfg | 9 +- tests/zfs-tests/tests/Makefile.am | 1 + .../functional/fault/auto_replace_001_pos.ksh | 41 +++- .../functional/fault/auto_replace_002_pos.ksh | 192 ++++++++++++++++++ 9 files changed, 279 insertions(+), 35 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index b2c008ad1d0..9636c99fc85 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -24,6 +24,7 @@ * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016, 2017, Intel Corporation. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2023, Klara Inc. */ /* @@ -204,7 +205,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) uint64_t is_spare = 0; const char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL; char rawpath[PATH_MAX], fullpath[PATH_MAX]; - char devpath[PATH_MAX]; + char pathbuf[PATH_MAX]; int ret; int online_flag = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE; boolean_t is_sd = B_FALSE; @@ -214,6 +215,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) char **lines = NULL; int lines_cnt = 0; + /* + * Get the persistent path, typically under the '/dev/disk/by-id' or + * '/dev/disk/by-vdev' directories. Note that this path can change + * when a vdev is replaced with a new disk. + */ if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) return; @@ -370,15 +376,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) (void) snprintf(rawpath, sizeof (rawpath), "%s%s", is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath); - if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) { + if (realpath(rawpath, pathbuf) == NULL && !is_mpath_wholedisk) { zed_log_msg(LOG_INFO, " realpath: %s failed (%s)", rawpath, strerror(errno)); - (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, - &newstate); + int err = zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_FORCEFAULT, &newstate); - zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)", - fullpath, libzfs_error_description(g_zfshdl)); + zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s) " + "err %d, new state %d", + fullpath, libzfs_error_description(g_zfshdl), err, + err ? (int)newstate : 0); return; } @@ -428,7 +436,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) * to trigger a ZFS fault for the device (and any hot spare * replacement). */ - leafname = strrchr(devpath, '/') + 1; + leafname = strrchr(pathbuf, '/') + 1; /* * If this is a request to label a whole disk, then attempt to @@ -436,7 +444,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) */ if (zpool_prepare_and_label_disk(g_zfshdl, zhp, leafname, vdev, "autoreplace", &lines, &lines_cnt) != 0) { - zed_log_msg(LOG_INFO, + zed_log_msg(LOG_WARNING, " zpool_prepare_and_label_disk: could not " "label '%s' (%s)", leafname, libzfs_error_description(g_zfshdl)); @@ -468,7 +476,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) sizeof (device->pd_physpath)); list_insert_tail(&g_device_list, device); - zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)", + zed_log_msg(LOG_NOTICE, " zpool_label_disk: async '%s' (%llu)", leafname, (u_longlong_t)guid); return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */ @@ -491,8 +499,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) } if (!found) { /* unexpected partition slice encountered */ - zed_log_msg(LOG_INFO, "labeled disk %s unexpected here", - fullpath); + zed_log_msg(LOG_WARNING, "labeled disk %s was " + "unexpected here", fullpath); (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); return; @@ -501,8 +509,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)", physpath, (u_longlong_t)guid); - (void) snprintf(devpath, sizeof (devpath), "%s%s", - DEV_BYID_PATH, new_devid); + /* + * Paths that begin with '/dev/disk/by-id/' will change and so + * they must be updated before calling zpool_vdev_attach(). + */ + if (strncmp(path, DEV_BYID_PATH, strlen(DEV_BYID_PATH)) == 0) { + (void) snprintf(pathbuf, sizeof (pathbuf), "%s%s", + DEV_BYID_PATH, new_devid); + zed_log_msg(LOG_INFO, " zpool_label_disk: path '%s' " + "replaced by '%s'", path, pathbuf); + path = pathbuf; + } } libzfs_free_str_array(lines, lines_cnt); @@ -545,9 +562,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) * Wait for udev to verify the links exist, then auto-replace * the leaf disk at same physical location. */ - if (zpool_label_disk_wait(path, 3000) != 0) { - zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement " - "disk %s is missing", path); + if (zpool_label_disk_wait(path, DISK_LABEL_WAIT) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: pool '%s', after labeling " + "replacement disk, the expected disk partition link '%s' " + "is missing after waiting %u ms", + zpool_get_name(zhp), path, DISK_LABEL_WAIT); nvlist_free(nvroot); return; } @@ -562,7 +581,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) B_TRUE, B_FALSE); } - zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", + zed_log_msg(LOG_WARNING, " zpool_vdev_replace: %s with %s (%s)", fullpath, path, (ret == 0) ? "no errors" : libzfs_error_description(g_zfshdl)); @@ -660,7 +679,7 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) dp->dd_prop, path); dp->dd_found = B_TRUE; - /* pass the new devid for use by replacing code */ + /* pass the new devid for use by auto-replacing code */ if (dp->dd_new_devid != NULL) { (void) nvlist_add_string(nvl, "new_devid", dp->dd_new_devid); diff --git a/include/libzutil.h b/include/libzutil.h index 237ff976ba6..053b1ed4b52 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -34,7 +34,7 @@ extern "C" { #endif /* - * Default wait time for a device name to be created. + * Default wait time in milliseconds for a device name to be created. */ #define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */ diff --git a/lib/libzutil/os/linux/zutil_import_os.c b/lib/libzutil/os/linux/zutil_import_os.c index 8b64369dc29..44ed697dd49 100644 --- a/lib/libzutil/os/linux/zutil_import_os.c +++ b/lib/libzutil/os/linux/zutil_import_os.c @@ -582,9 +582,8 @@ zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) * Wait up to timeout_ms for udev to set up the device node. The device is * considered ready when libudev determines it has been initialized, all of * the device links have been verified to exist, and it has been allowed to - * settle. At this point the device the device can be accessed reliably. - * Depending on the complexity of the udev rules this process could take - * several seconds. + * settle. At this point the device can be accessed reliably. Depending on + * the complexity of the udev rules this process could take several seconds. */ int zpool_label_disk_wait(const char *path, int timeout_ms) diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 2252e46df3a..8bc55a1b4b4 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -122,10 +122,10 @@ tags = ['functional', 'fallocate'] [tests/functional/fault:Linux] tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', - 'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', - 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', - 'decrypt_fault', 'decompress_fault', 'scrub_after_resilver', - 'zpool_status_-s'] + 'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos', + 'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift', + 'auto_spare_shared', 'decrypt_fault', 'decompress_fault', + 'scrub_after_resilver', 'zpool_status_-s'] tags = ['functional', 'fault'] [tests/functional/features/large_dnode:Linux] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 5d1360380de..4608e87522a 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -328,6 +328,7 @@ if os.environ.get('CI') == 'true': 'fault/auto_online_001_pos': ['SKIP', ci_reason], 'fault/auto_online_002_pos': ['SKIP', ci_reason], 'fault/auto_replace_001_pos': ['SKIP', ci_reason], + 'fault/auto_replace_002_pos': ['SKIP', ci_reason], 'fault/auto_spare_ashift': ['SKIP', ci_reason], 'fault/auto_spare_shared': ['SKIP', ci_reason], 'procfs/pool_state': ['SKIP', ci_reason], diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index fa545e06bbf..648f2203dfb 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -130,12 +130,14 @@ export SYSTEM_FILES_LINUX='attr chattr exportfs fallocate + flock free getfattr groupadd groupdel groupmod hostid + logger losetup lsattr lsblk @@ -145,21 +147,20 @@ export SYSTEM_FILES_LINUX='attr md5sum mkswap modprobe + mountpoint mpstat nsenter parted perf setfattr + setpriv sha256sum udevadm unshare useradd userdel usermod - setpriv - mountpoint - flock - logger' + wipefs' export ZFS_FILES='zdb zfs diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 158401e078a..87b50f59ca7 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1431,6 +1431,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/fault/auto_online_001_pos.ksh \ functional/fault/auto_online_002_pos.ksh \ functional/fault/auto_replace_001_pos.ksh \ + functional/fault/auto_replace_002_pos.ksh \ functional/fault/auto_spare_001_pos.ksh \ functional/fault/auto_spare_002_pos.ksh \ functional/fault/auto_spare_ashift.ksh \ diff --git a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh index 081e6c18430..ae56ee9919b 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh @@ -34,13 +34,14 @@ # 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path. # This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug". # 2. Create a pool and set autoreplace=on (auto-replace is opt-in) -# 3. Export a pool +# 3. Export the pool # 4. Wipe and offline the scsi_debug disk -# 5. Import pool with missing disk +# 5. Import the pool with missing disk # 6. Re-online the wiped scsi_debug disk -# 7. Verify the ZED detects the new unused disk and adds it back to the pool +# 7. Verify ZED detects the new blank disk and replaces the missing vdev +# 8. Verify that the scsi_debug disk was re-partitioned # -# Creates a raidz1 zpool using persistent disk path names +# Creates a raidz1 zpool using persistent /dev/disk/by-vdev path names # (ie not /dev/sdc) # # Auto-replace is opt in, and matches by phys_path. @@ -83,11 +84,27 @@ log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE $DISK1 $DISK2 $DISK3 log_must zpool set autoreplace=on $TESTPOOL # Add some data to the pool -log_must mkfile $FSIZE /$TESTPOOL/data +log_must zfs create $TESTPOOL/fs +log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z log_must zpool export $TESTPOOL +# Record the partition UUID for later comparison +part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) +[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid} + +# # Wipe and offline the disk +# +# Note that it is not enough to zero the disk to expunge the partitions. +# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe'). +# +# Using partprobe is overkill and hdparm is not as common as wipefs. So +# we use wipefs which lets the kernel know the partition was removed +# from the device (i.e., calls BLKRRPART ioctl). +# log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE +log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID remove_disk $SD block_device_wait @@ -106,4 +123,18 @@ log_must wait_replacing $TESTPOOL 60 # Validate auto-replace was successful log_must check_state $TESTPOOL "" "ONLINE" +# +# Confirm the partition UUID changed so we know the new disk was relabeled +# +# Note: some older versions of udevadm don't support "--property" option so +# we'll # skip this test when it is not supported +# +if [ ! -z "$part_uuid" ]; then + new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) + log_note new disk GPT uuid ${new_uuid} + [[ "$part_uuid" = "$new_uuid" ]] && \ + log_fail "The new disk was not relabeled as expected" +fi + log_pass "Auto-replace test successful" diff --git a/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh new file mode 100755 index 00000000000..2259e604317 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh @@ -0,0 +1,192 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2017 by Intel Corporation. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/fault/fault.cfg + +# +# DESCRIPTION: +# Testing Fault Management Agent ZED Logic - Automated Auto-Replace Test. +# Verifys that auto-replace works with by-id paths. +# +# STRATEGY: +# 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path. +# This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug". +# 2. Create a pool and set autoreplace=on (auto-replace is opt-in) +# 3. Export the pool +# 4. Wipe and offline the scsi_debug disk +# 5. Import the pool with missing disk +# 6. Re-online the wiped scsi_debug disk with a new serial number +# 7. Verify ZED detects the new blank disk and replaces the missing vdev +# 8. Verify that the scsi_debug disk was re-partitioned +# +# Creates a raidz1 zpool using persistent /dev/disk/by-id path names +# +# Auto-replace is opt in, and matches by phys_path. +# + +verify_runnable "both" + +if ! is_physical_device $DISKS; then + log_unsupported "Unsupported disks for this test." +fi + +function cleanup +{ + zpool status $TESTPOOL + destroy_pool $TESTPOOL + sed -i '/alias scsidebug/d' $VDEVID_CONF + unload_scsi_debug +} + +# +# Wait until a vdev transitions to its replacement vdev +# +# Return 0 when vdev reaches expected state, 1 on timeout. +# +# Note: index +2 is to skip over root and raidz-0 vdevs +# +function wait_vdev_online # pool index oldguid timeout +{ + typeset pool=$1 + typeset -i index=$2+2 + typeset guid=$3 + typeset timeout=${4:-60} + typeset -i i=0 + + while [[ $i -lt $timeout ]]; do + vdev_guids=( $(zpool get -H -o value guid $pool all-vdevs) ) + + if [ "${vdev_guids[$index]}" != "${guid}" ]; then + log_note "new vdev[$((index-2))]: ${vdev_guids[$index]}, replacing ${guid}" + return 0 + fi + + i=$((i+1)) + sleep 1 + done + + return 1 +} +log_assert "automated auto-replace with by-id paths" +log_onexit cleanup + +load_scsi_debug $SDSIZE $SDHOSTS $SDTGTS $SDLUNS '512b' +SD=$(get_debug_device) +SD_DEVICE_ID=$(get_persistent_disk_name $SD) +SD_HOST=$(get_scsi_host $SD) + +# Register vdev_id alias for scsi_debug device to create a persistent path +echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF +block_device_wait + +SD_DEVICE=$(udevadm info -q all -n $DEV_DSKDIR/$SD | \ + awk -F'=' '/ID_VDEV=/ {print $2; exit}') +[ -z $SD_DEVICE ] && log_fail "vdev rule was not registered properly" + +log_must zpool events -c +log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE_ID $DISK1 $DISK2 $DISK3 + +vdev_guid=$(zpool get guid -H -o value $TESTPOOL $SD_DEVICE_ID) +log_note original vdev guid ${vdev_guid} + +# Auto-replace is opt-in so need to set property +log_must zpool set autoreplace=on $TESTPOOL + +# Add some data to the pool +log_must zfs create $TESTPOOL/fs +log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z +log_must zpool export $TESTPOOL + +# Record the partition UUID for later comparison +part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) +[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid} + +# +# Wipe and offline the disk +# +# Note that it is not enough to zero the disk to expunge the partitions. +# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe'). +# +# Using partprobe is overkill and hdparm is not as common as wipefs. So +# we use wipefs which lets the kernel know the partition was removed +# from the device (i.e., calls BLKRRPART ioctl). +# +log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE +log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID +remove_disk $SD +block_device_wait + +# Re-import pool with drive missing +log_must zpool import $TESTPOOL +log_must check_state $TESTPOOL "" "DEGRADED" +block_device_wait + +# +# Online an empty disk in the same physical location, with a different by-id +# symlink. We use vpd_use_hostno to make sure the underlying serial number +# changes for the new disk which in turn gives us a different by-id path. +# +# The original names were something like: +# /dev/disk/by-id/scsi-SLinux_scsi_debug_16000-part1 +# /dev/disk/by-id/wwn-0x33333330000007d0-part1 +# +# This new inserted disk, will have different links like: +# /dev/disk/by-id/scsi-SLinux_scsi_debug_2000-part1 +# /dev/disk/by-id/wwn-0x0x3333333000003e80 -part1 +# +echo '0' > /sys/bus/pseudo/drivers/scsi_debug/vpd_use_hostno + +insert_disk $SD $SD_HOST + +# make sure the physical path points to the same scsi-debug device +SD_DEVICE_ID=$(get_persistent_disk_name $SD) +echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF +block_device_wait + +# Wait for the new disk to be online and replaced +log_must wait_vdev_online $TESTPOOL 0 $vdev_guid 45 +log_must wait_replacing $TESTPOOL 45 + +# Validate auto-replace was successful +log_must check_state $TESTPOOL "" "ONLINE" + +# +# Confirm the partition UUID changed so we know the new disk was relabeled +# +# Note: some older versions of udevadm don't support "--property" option so +# we'll # skip this test when it is not supported +# +if [ ! -z "$part_uuid" ]; then + new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) + log_note new disk GPT uuid ${new_uuid} + [[ "$part_uuid" = "$new_uuid" ]] && \ + log_fail "The new disk was not relabeled as expected" +fi + +log_pass "automated auto-replace with by-id paths" From ea30b5a9e0d266baa13398ed8f9435de050f4b25 Mon Sep 17 00:00:00 2001 From: Colin Percival Date: Fri, 20 Oct 2023 10:30:32 -0700 Subject: [PATCH 04/11] Set spa_ccw_fail_time=0 when expanding a vdev. When a vdev is to be expanded -- either via `zpool online -e` or via the autoexpand option -- a SPA_ASYNC_CONFIG_UPDATE request is queued to be handled via an asynchronous worker thread (spa_async_thread). This normally happens almost immediately; but will be delayed up to zfs_ccw_retry_interval seconds (default 5 minutes) if an attempt to write the zpool configuration cache failed. When FreeBSD boots ZFS-root VM images generated using `makefs -t zfs`, the zpoolupgrade rc.d script runs `zpool upgrade`, which modifies the pool configuration and triggers an attempt to write to the cache file. This attempted write fails because the filesystem is still mounted read-only at this point in the boot process, triggering a 5-minute cooldown before SPA_ASYNC_CONFIG_UPDATE requests will be handled by the asynchronous worker thread. When expanding a vdev, reset the "when did a configuration cache write last fail" value so that the SPA_ASYNC_CONFIG_UPDATE request will be handled promptly. A cleaner but more intrusive option would be to use separate SPA_ASYNC_ flags for "configuration changed" and "try writing the configuration cache again", but with FreeBSD 14.0 coming very soon I'd prefer to leave such refactoring for a later date. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Colin Percival Closes #15405 --- module/zfs/vdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 87c14559323..afb01c0ef7f 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4215,6 +4215,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); + spa->spa_ccw_fail_time = 0; spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } From b9384b94988cbcd697199e3284abd5f151978fe6 Mon Sep 17 00:00:00 2001 From: Olivier Certner Date: Fri, 20 Oct 2023 20:49:56 +0200 Subject: [PATCH 05/11] FreeBSD: taskq: Remove unused declaration Variable 'uma_align_cache' has not been used since commit "FreeBSD: Use a hash table for taskqid lookups" (3933305ea). Moreover, it is soon going to become private to FreeBSD's UMA in 15.0-CURRENT (main), 14.0-STABLE (stable/14) and 13.2-STABLE (stable/13). Should accessing this information become necessary again, one will have to use the new accessors for recent versions. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Olivier Certner Closes #15416 --- module/os/freebsd/spl/spl_taskq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c index daefe355953..842b80ade1f 100644 --- a/module/os/freebsd/spl/spl_taskq.c +++ b/module/os/freebsd/spl/spl_taskq.c @@ -64,8 +64,6 @@ taskq_t *dynamic_taskq = NULL; proc_t *system_proc; -extern int uma_align_cache; - static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures"); static LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl; From 0d6cec418e10de077d5252e8bc5d1b0b490da54f Mon Sep 17 00:00:00 2001 From: dennisfriedrichsen <31087738+dennisfriedrichsen@users.noreply.github.com> Date: Fri, 20 Oct 2023 13:52:13 -0500 Subject: [PATCH 06/11] Fix typo in tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg Reviewed-by: Rob N Reviewed-by: Brian Behlendorf Signed-off-by: Dennis R. Friedrichsen Closes #15417 --- tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg b/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg index e98b5e8b221..9c76a8780b4 100644 --- a/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg +++ b/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg @@ -29,7 +29,7 @@ # if is_linux; then - # these are the set of setable ZFS properties + # these are the set of settable ZFS properties PROP_NAMES="\ acltype atime \ checksum compression devices \ @@ -81,7 +81,7 @@ elif is_freebsd; then hidden" else - # these are the set of setable ZFS properties + # these are the set of settable ZFS properties PROP_NAMES="\ aclinherit aclmode atime \ checksum compression devices \ From b29e98fa8dfcee3b5362a9a2cb479a2379fb16da Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 20 Oct 2023 14:54:05 -0400 Subject: [PATCH 07/11] Properly pad struct tx_cpu to cache line We already use ____cacheline_aligned in many places, so add one more instead of seems arbitrary char tc_pad[8]. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15402 --- include/sys/txg_impl.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/sys/txg_impl.h b/include/sys/txg_impl.h index 45fde2e1f35..8ab7969b25b 100644 --- a/include/sys/txg_impl.h +++ b/include/sys/txg_impl.h @@ -73,8 +73,7 @@ struct tx_cpu { kcondvar_t tc_cv[TXG_SIZE]; uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ - char tc_pad[8]; /* pad to fill 3 cache lines */ -}; +} ____cacheline_aligned; /* * The tx_state structure maintains the state information about the different From de7b1ae30ab83c37978902dfd53c1ac783ddbc6e Mon Sep 17 00:00:00 2001 From: VaibhavB <88050553+vaibhav-delphix@users.noreply.github.com> Date: Sat, 21 Oct 2023 00:27:39 +0530 Subject: [PATCH 08/11] run-zts test procfs/pool_state failed with uncorrectable I/O failure Once we trigger the zpool scrub, all zpool/zfs command gets stuck for 180 seconds. Post 180 seconds zpool/zfs commands gets start executing however few more seconds(10s) it take to update the status. hence sleeping for 200 seconds so that we get the correct status. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: vaibhav.bhanawat Closes #15364 --- tests/zfs-tests/tests/functional/procfs/pool_state.ksh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/procfs/pool_state.ksh b/tests/zfs-tests/tests/functional/procfs/pool_state.ksh index 7a02eb68abd..bae87637917 100755 --- a/tests/zfs-tests/tests/functional/procfs/pool_state.ksh +++ b/tests/zfs-tests/tests/functional/procfs/pool_state.ksh @@ -141,7 +141,11 @@ remove_disk $SDISK # background since the command will hang when the pool gets suspended. The # command will resume and exit after we restore the missing disk later on. zpool scrub $TESTPOOL2 & -sleep 3 # Give the scrub some time to run before we check if it fails +# Once we trigger the zpool scrub, all zpool/zfs command gets stuck for 180 seconds. +# Post 180 seconds zpool/zfs commands gets start executing however few more seconds(10s) +# it take to update the status. +# hence sleeping for 200 seconds so that we get the correct status. +sleep 200 # Give the scrub some time to run before we check if it fails log_must check_all $TESTPOOL2 "SUSPENDED" From 4fbc52495552b7e8b3337c94d0b7080db65657b8 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 20 Oct 2023 15:37:16 -0400 Subject: [PATCH 09/11] Remove lock from dsl_pool_need_dirty_delay() Torn reads/writes of dp_dirty_total are unlikely: on 64-bit systems due to register size, while on 32-bit due to memory constraints. And even if we hit some race, the code implementing the delay takes the lock any way. Removal of the poll-wide lock acquisition saves ~1% of CPU time on 8-thread 8KB write workload. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15390 --- module/zfs/dsl_pool.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 9120fef93c7..17b97124828 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -965,18 +965,18 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - mutex_enter(&dp->dp_lock); - uint64_t dirty = dp->dp_dirty_total; - mutex_exit(&dp->dp_lock); - - return (dirty > delay_min_bytes); + /* + * We are not taking the dp_lock here and few other places, since torn + * reads are unlikely: on 64-bit systems due to register size and on + * 32-bit due to memory constraints. Pool-wide locks in hot path may + * be too expensive, while we do not need a precise result here. + */ + return (dp->dp_dirty_total > delay_min_bytes); } static boolean_t dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) { - ASSERT(MUTEX_HELD(&dp->dp_lock)); - uint64_t dirty_min_bytes = zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; From 57b409856265a53f93dd2497e5c046506cf0fc70 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 20 Oct 2023 15:38:37 -0400 Subject: [PATCH 10/11] Trust ARC_BUF_SHARED() more In my understanding ARC_BUF_SHARED() and arc_buf_is_shared() should return identical results, except the second also asserts it deeper. The first is much cheaper though, saving few pointer dereferences. Replace production arc_buf_is_shared() calls with ARC_BUF_SHARED(), and call arc_buf_is_shared() in random assertions, while making it even more strict. On my tests this in half reduces arc_buf_destroy_impl() time, that noticeably reduces hash_lock congestion under heavy dbuf eviction. Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15397 --- module/zfs/arc.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index b5946e7604c..5d4a52fa069 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1364,7 +1364,7 @@ arc_buf_is_shared(arc_buf_t *buf) abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); - IMPLY(shared, ARC_BUF_SHARED(buf)); + EQUIV(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* @@ -1998,7 +1998,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); - IMPLY(encrypted, !ARC_BUF_SHARED(buf)); + IMPLY(encrypted, !arc_buf_is_shared(buf)); /* * If the caller wanted encrypted data we just need to copy it from @@ -2066,7 +2066,9 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, } if (hdr_compressed == compressed) { - if (!arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { + ASSERT(arc_buf_is_shared(buf)); + } else { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } @@ -2078,7 +2080,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. */ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); /* We need to give the buf its own b_data */ @@ -2090,6 +2092,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(!arc_buf_is_shared(buf)); + /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); @@ -2217,7 +2221,7 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); @@ -2256,7 +2260,7 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); @@ -2481,7 +2485,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * add to the refcount if the arc_buf_t is * not shared. */ - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many( @@ -2537,7 +2541,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * add to the refcount if the arc_buf_t is * not shared. */ - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many( @@ -3061,9 +3065,10 @@ arc_buf_destroy_impl(arc_buf_t *buf) arc_cksum_verify(buf); arc_buf_unwatch(buf); - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { + ASSERT(!arc_buf_is_shared(buf)); uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); @@ -3104,9 +3109,9 @@ arc_buf_destroy_impl(arc_buf_t *buf) */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ - VERIFY(!arc_buf_is_shared(lastbuf)); + ASSERT(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ - VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); + ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); @@ -6189,7 +6194,7 @@ arc_release(arc_buf_t *buf, const void *tag) ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); VERIFY3S(remove_reference(hdr, tag), >, 0); - if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { + if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } @@ -6206,9 +6211,9 @@ arc_release(arc_buf_t *buf, const void *tag) * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - VERIFY(!arc_buf_is_shared(lastbuf)); + ASSERT(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between @@ -6241,7 +6246,7 @@ arc_release(arc_buf_t *buf, const void *tag) */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); - ASSERT(!ARC_BUF_SHARED(buf)); + ASSERT(!arc_buf_is_shared(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); @@ -6335,9 +6340,10 @@ arc_write_ready(zio_t *zio) arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { + ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } } @@ -6636,9 +6642,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { + ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); From 797f55ef12d752d2a7fb04fae5d24e019adf2a1d Mon Sep 17 00:00:00 2001 From: Sam Atkinson Date: Fri, 20 Oct 2023 17:22:04 -0400 Subject: [PATCH 11/11] Do not persist user/group/project quota zap objects when unneeded In the zfs_id_over*quota functions, there is a short-circuit to skip the zap_lookup when the quota zap does not exist. If quotas are never used in a zpool, then the quota zap will never exist. But if user/group/project quotas are ever used, the zap objects will be created and will persist even if the quotas are deleted. The quota zap_lookup in the write path can become a bottleneck for write-heavy small I/O workloads. Before this commit, it was not possible to remove this lookup without creating a new zpool. Reviewed-by: Brian Behlendorf Signed-off-by: Sam Atkinson Closes #14721 --- module/zfs/zfs_quota.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/module/zfs/zfs_quota.c b/module/zfs/zfs_quota.c index 9b351eefc04..56f9d22ed0e 100644 --- a/module/zfs/zfs_quota.c +++ b/module/zfs/zfs_quota.c @@ -347,18 +347,32 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, if (*objp == 0) { *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, DMU_OT_NONE, 0, tx); - VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + VERIFY0(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); } - mutex_exit(&zfsvfs->z_lock); if (quota == 0) { err = zap_remove(zfsvfs->z_os, *objp, buf, tx); if (err == ENOENT) err = 0; + /* + * If the quota contains no more entries after the entry + * was removed, destroy the quota zap and remove the + * reference from zfsvfs. This will save us unnecessary + * zap_lookups for the quota during writes. + */ + uint64_t zap_nentries; + VERIFY0(zap_count(zfsvfs->z_os, *objp, &zap_nentries)); + if (zap_nentries == 0) { + VERIFY0(zap_remove(zfsvfs->z_os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[type], tx)); + VERIFY0(zap_destroy(zfsvfs->z_os, *objp, tx)); + *objp = 0; + } } else { err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); } + mutex_exit(&zfsvfs->z_lock); ASSERT(err == 0); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx);