From cfc564f9b11eed1421789fa018f7b3d141cc18d0 Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Wed, 30 Jun 2021 10:37:20 -0400 Subject: [PATCH 01/15] ZED: Match added disk by pool/vdev GUID if found (#12217) This enables ZED to auto-online vdevs that are not wholedisk managed by ZFS. Signed-off-by: Ryan Moeller Reviewed-by: Don Brady Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter --- cmd/zed/agents/zfs_mod.c | 40 ++++++-- cmd/zed/zed_disk_event.c | 2 + tests/runfiles/linux.run | 9 +- tests/test-runner/bin/zts-report.py.in | 1 + .../tests/functional/fault/Makefile.am | 1 + .../functional/fault/auto_online_002_pos.ksh | 94 +++++++++++++++++++ 6 files changed, 137 insertions(+), 10 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/fault/auto_online_002_pos.ksh diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index b564f2b12a2..3bcdf6e1d71 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -640,6 +640,27 @@ devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice) return (data.dd_found); } +/* + * Given a device guid, find any vdevs with a matching guid. + */ +static boolean_t +guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid, + zfs_process_func_t func, boolean_t is_slice) +{ + dev_data_t data = { 0 }; + + data.dd_func = func; + data.dd_found = B_FALSE; + data.dd_pool_guid = pool_guid; + data.dd_vdev_guid = vdev_guid; + data.dd_islabeled = is_slice; + data.dd_new_devid = devid; + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (data.dd_found); +} + /* * Handle a EC_DEV_ADD.ESC_DISK event. * @@ -663,15 +684,18 @@ static int zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi) { char *devpath = NULL, *devid; + uint64_t pool_guid = 0, vdev_guid = 0; boolean_t is_slice; /* - * Expecting a devid string and an optional physical location + * Expecting a devid string and an optional physical location and guid */ if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) return (-1); (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid); is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0); @@ -682,12 +706,16 @@ zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi) * Iterate over all vdevs looking for a match in the following order: * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk) * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location). - * - * For disks, we only want to pay attention to vdevs marked as whole - * disks or are a multipath device. + * 3. ZPOOL_CONFIG_GUID (identifies unique vdev). */ - if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL) - (void) devphys_iter(devpath, devid, zfs_process_add, is_slice); + if (devid_iter(devid, zfs_process_add, is_slice)) + return (0); + if (devpath != NULL && devphys_iter(devpath, devid, zfs_process_add, + is_slice)) + return (0); + if (vdev_guid != 0) + (void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add, + is_slice); return (0); } diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index 6ec566cff3c..94e24236063 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -72,6 +72,8 @@ zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval); if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0) zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval); + if (nvlist_lookup_boolean(nvl, DEV_IS_PART) == B_TRUE) + zed_log_msg(LOG_INFO, "\t%s: B_TRUE", DEV_IS_PART); if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0) zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval); if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0) diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index c6d4f5f6d34..642ed824d46 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -98,10 +98,11 @@ tests = ['fallocate_prealloc', 'fallocate_punch-hole'] tags = ['functional', 'fallocate'] [tests/functional/fault:Linux] -tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_replace_001_pos', - 'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_multiple', - 'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault', - 'decompress_fault', 'scrub_after_resilver', 'zpool_status_-s'] +tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', + 'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', + 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', + 'decrypt_fault', 'decompress_fault', 'scrub_after_resilver', + 'zpool_status_-s'] tags = ['functional', 'fault'] [tests/functional/features/large_dnode:Linux] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 27c865ed5c7..8c3bce13491 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -323,6 +323,7 @@ if os.environ.get('CI') == 'true': 'cli_root/zpool_split/zpool_split_wholedisk': ['SKIP', ci_reason], 'fault/auto_offline_001_pos': ['SKIP', ci_reason], 'fault/auto_online_001_pos': ['SKIP', ci_reason], + 'fault/auto_online_002_pos': ['SKIP', ci_reason], 'fault/auto_replace_001_pos': ['SKIP', ci_reason], 'fault/auto_spare_ashift': ['SKIP', ci_reason], 'fault/auto_spare_shared': ['SKIP', ci_reason], diff --git a/tests/zfs-tests/tests/functional/fault/Makefile.am b/tests/zfs-tests/tests/functional/fault/Makefile.am index f2fc06877d3..ba0d7d6992c 100644 --- a/tests/zfs-tests/tests/functional/fault/Makefile.am +++ b/tests/zfs-tests/tests/functional/fault/Makefile.am @@ -4,6 +4,7 @@ dist_pkgdata_SCRIPTS = \ cleanup.ksh \ auto_offline_001_pos.ksh \ auto_online_001_pos.ksh \ + auto_online_002_pos.ksh \ auto_replace_001_pos.ksh \ auto_spare_001_pos.ksh \ auto_spare_002_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/fault/auto_online_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_online_002_pos.ksh new file mode 100755 index 00000000000..60185ace34b --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/auto_online_002_pos.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved. +# Copyright (c) 2019 by Delphix. All rights reserved. +# Portions Copyright 2021 iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/fault/fault.cfg + +# +# DESCRIPTION: +# Testing Fault Management Agent ZED Logic - Automated Auto-Online Test. +# Now with partitioned vdevs. +# +# STRATEGY: +# 1. Partition a scsi_debug device for simulating removal +# 2. Create a pool +# 3. Offline disk +# 4. ZED polls for an event change for online disk to be automatically +# added back to the pool. +# +verify_runnable "both" + +function cleanup +{ + poolexists ${TESTPOOL} && destroy_pool ${TESTPOOL} + unload_scsi_debug +} + +log_assert "Testing automated auto-online FMA test with partitioned vdev" + +log_onexit cleanup + +load_scsi_debug ${SDSIZE} ${SDHOSTS} ${SDTGTS} ${SDLUNS} '512b' +SDDEVICE=$(get_debug_device) +zpool labelclear -f ${SDDEVICE} +partition_disk ${SDSIZE} ${SDDEVICE} 1 +part=${SDDEVICE}1 +host=$(get_scsi_host ${SDDEVICE}) + +block_device_wait /dev/${part} +log_must zpool create -f ${TESTPOOL} raidz1 ${part} ${DISKS} + +# Add some data to the pool +log_must mkfile ${FSIZE} /${TESTPOOL}/data + +remove_disk ${SDDEVICE} +check_state ${TESTPOOL} "" "degraded" || \ + log_fail "${TESTPOOL} is not degraded" + +# Clear zpool events +log_must zpool events -c + +# Online disk +insert_disk ${SDDEVICE} ${host} + +log_note "Delay for ZED auto-online" +typeset -i timeout=0 +until is_pool_resilvered ${TESTPOOL}; do + if ((timeout++ == MAXTIMEOUT)); then + log_fail "Timeout occurred" + fi + sleep 1 +done +log_note "Auto-online of ${SDDEVICE} is complete" + +# Validate auto-online was successful +sleep 1 +check_state ${TESTPOOL} "" "online" || \ + log_fail "${TESTPOOL} is not back online" + +log_must zpool destroy ${TESTPOOL} + +log_pass "Auto-online with partitioned vdev test successful" From 42afb12da70f1fd73c7fcf653d5e2bf42f05b42b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 1 Jul 2021 11:16:54 -0400 Subject: [PATCH 02/15] Remove refcount from spa_config_*() The only reason for spa_config_*() to use refcount instead of simple non-atomic (thanks to scl_lock) variable for scl_count is tracking, hard disabled for the last 8 years. Switch to simple int scl_count reduces the lock hold time by avoiding atomic, plus makes structure fit into single cache line, reducing the locks contention. Reviewed-by: Brian Behlendorf Reviewed-by: Matthew Ahrens Reviewed-by: Mark Maybee Signed-off-by: Alexander Motin Sponsored-By: iXsystems, Inc. Closes #12287 --- include/sys/spa_impl.h | 4 ++-- module/zfs/spa_misc.c | 19 +++++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index bc88cfa15e8..280f8cf1695 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -141,9 +141,9 @@ typedef struct spa_config_lock { kmutex_t scl_lock; kthread_t *scl_writer; int scl_write_wanted; + int scl_count; kcondvar_t scl_cv; - zfs_refcount_t scl_count; -} spa_config_lock_t; +} ____cacheline_aligned spa_config_lock_t; typedef struct spa_config_dirent { list_node_t scd_link; diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index e2523231d28..157dede93cf 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -444,9 +444,9 @@ spa_config_lock_init(spa_t *spa) spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); - zfs_refcount_create_untracked(&scl->scl_count); scl->scl_writer = NULL; scl->scl_write_wanted = 0; + scl->scl_count = 0; } } @@ -457,9 +457,9 @@ spa_config_lock_destroy(spa_t *spa) spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_destroy(&scl->scl_lock); cv_destroy(&scl->scl_cv); - zfs_refcount_destroy(&scl->scl_count); ASSERT(scl->scl_writer == NULL); ASSERT(scl->scl_write_wanted == 0); + ASSERT(scl->scl_count == 0); } } @@ -480,7 +480,7 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) } } else { ASSERT(scl->scl_writer != curthread); - if (!zfs_refcount_is_zero(&scl->scl_count)) { + if (scl->scl_count != 0) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); @@ -488,7 +488,7 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) } scl->scl_writer = curthread; } - (void) zfs_refcount_add(&scl->scl_count, tag); + scl->scl_count++; mutex_exit(&scl->scl_lock); } return (1); @@ -514,14 +514,14 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) } } else { ASSERT(scl->scl_writer != curthread); - while (!zfs_refcount_is_zero(&scl->scl_count)) { + while (scl->scl_count != 0) { scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } scl->scl_writer = curthread; } - (void) zfs_refcount_add(&scl->scl_count, tag); + scl->scl_count++; mutex_exit(&scl->scl_lock); } ASSERT3U(wlocks_held, <=, locks); @@ -535,8 +535,8 @@ spa_config_exit(spa_t *spa, int locks, const void *tag) if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); - ASSERT(!zfs_refcount_is_zero(&scl->scl_count)); - if (zfs_refcount_remove(&scl->scl_count, tag) == 0) { + ASSERT(scl->scl_count > 0); + if (--scl->scl_count == 0) { ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); scl->scl_writer = NULL; /* OK in either case */ @@ -555,8 +555,7 @@ spa_config_held(spa_t *spa, int locks, krw_t rw) spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; - if ((rw == RW_READER && - !zfs_refcount_is_zero(&scl->scl_count)) || + if ((rw == RW_READER && scl->scl_count != 0) || (rw == RW_WRITER && scl->scl_writer == curthread)) locks_held |= 1 << i; } From 50e09eddd0c556ace90357b081d23df3f00c2d83 Mon Sep 17 00:00:00 2001 From: Kevin Jin <33590050+jxdking@users.noreply.github.com> Date: Thu, 1 Jul 2021 11:20:27 -0400 Subject: [PATCH 03/15] Optimize txg_kick() process (#12274) Use dp_dirty_pertxg[] for txg_kick(), instead of dp_dirty_total in original code. Extra parameter "txg" is added for txg_kick(), thus it knows which txg to kick. Also txg_kick() call is moved from dsl_pool_need_dirty_delay() to dsl_pool_dirty_space() so that we can know the txg number assigned for txg_kick(). Some unnecessary code regarding dp_dirty_total in txg_sync_thread() is also cleaned up. Reviewed-by: Brian Behlendorf Reviewed-by: Matthew Ahrens Reviewed-by: Alexander Motin Signed-off-by: jxdking Closes #12274 --- include/sys/txg.h | 2 +- module/zfs/dsl_pool.c | 25 +++++++++++++++++++------ module/zfs/txg.c | 36 ++++++++++++++---------------------- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/include/sys/txg.h b/include/sys/txg.h index 22158bd1a5e..f38f0006c04 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -78,7 +78,7 @@ extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta, hrtime_t resolution); -extern void txg_kick(struct dsl_pool *dp); +extern void txg_kick(struct dsl_pool *dp, uint64_t txg); /* * Wait until the given transaction group has finished syncing. diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index e66c136a9e0..72f4b86d772 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -898,18 +898,26 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; - uint64_t dirty; mutex_enter(&dp->dp_lock); - dirty = dp->dp_dirty_total; + uint64_t dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); - if (dirty > dirty_min_bytes) - txg_kick(dp); + return (dirty > delay_min_bytes); } +static boolean_t +dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) +{ + ASSERT(MUTEX_HELD(&dp->dp_lock)); + + uint64_t dirty_min_bytes = + zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; + uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; + + return (dirty > dirty_min_bytes); +} + void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) { @@ -917,7 +925,12 @@ dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) mutex_enter(&dp->dp_lock); dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; dsl_pool_dirty_delta(dp, space); + boolean_t needsync = !dmu_tx_is_syncing(tx) && + dsl_pool_need_dirty_sync(dp, tx->tx_txg); mutex_exit(&dp->dp_lock); + + if (needsync) + txg_kick(dp, tx->tx_txg); } } diff --git a/module/zfs/txg.c b/module/zfs/txg.c index c55b1d8f960..c9eb84bbdb1 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -498,14 +498,6 @@ txg_wait_callbacks(dsl_pool_t *dp) taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0); } -static boolean_t -txg_is_syncing(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); - return (tx->tx_syncing_txg != 0); -} - static boolean_t txg_is_quiescing(dsl_pool_t *dp) { @@ -539,8 +531,6 @@ txg_sync_thread(void *arg) clock_t timeout = zfs_txg_timeout * hz; clock_t timer; uint64_t txg; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; /* * We sync when we're scanning, there's someone waiting @@ -551,8 +541,7 @@ txg_sync_thread(void *arg) while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - !txg_has_quiesced_to_sync(dp) && - dp->dp_dirty_total < dirty_min_bytes) { + !txg_has_quiesced_to_sync(dp)) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", (u_longlong_t)tx->tx_synced_txg, (u_longlong_t)tx->tx_sync_txg_waiting, dp); @@ -566,6 +555,11 @@ txg_sync_thread(void *arg) * prompting it to do so if necessary. */ while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { + if (txg_is_quiescing(dp)) { + txg_thread_wait(tx, &cpr, + &tx->tx_quiesce_done_cv, 0); + continue; + } if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; cv_broadcast(&tx->tx_quiesce_more_cv); @@ -791,24 +785,22 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce) } /* - * If there isn't a txg syncing or in the pipeline, push another txg through - * the pipeline by quiescing the open txg. + * Pass in the txg number that should be synced. */ void -txg_kick(dsl_pool_t *dp) +txg_kick(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; ASSERT(!dsl_pool_config_held(dp)); + if (tx->tx_sync_txg_waiting >= txg) + return; + mutex_enter(&tx->tx_sync_lock); - if (!txg_is_syncing(dp) && - !txg_is_quiescing(dp) && - tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && - tx->tx_sync_txg_waiting <= tx->tx_synced_txg && - tx->tx_quiesced_txg <= tx->tx_synced_txg) { - tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; - cv_broadcast(&tx->tx_quiesce_more_cv); + if (tx->tx_sync_txg_waiting < txg) { + tx->tx_sync_txg_waiting = txg; + cv_broadcast(&tx->tx_sync_more_cv); } mutex_exit(&tx->tx_sync_lock); } From eca174527e0b8416550e6ce87c405702fd379ada Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Fri, 2 Jul 2021 00:22:16 +0900 Subject: [PATCH 04/15] Upstream: dmu_zfetch_stream_fini leaks refcount dmu_zfetch_stream_fini() is missing calls to destroy the refcounts, leaking them and the mutex inside. Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Jorgen Lundman Closes #12294 --- module/zfs/dmu_zfetch.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index 4a323fa990f..a26b0d73992 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -159,6 +159,8 @@ static void dmu_zfetch_stream_fini(zstream_t *zs) { ASSERT(!list_link_active(&zs->zs_node)); + zfs_refcount_destroy(&zs->zs_callers); + zfs_refcount_destroy(&zs->zs_refs); kmem_free(zs, sizeof (*zs)); } From c6d1112bf4125e5a22eb47ceb7b8cee01f0df9a1 Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Fri, 2 Jul 2021 00:28:15 +0900 Subject: [PATCH 05/15] Fix abd leak, kmem_free correct size of abd_t Fix a leak of abd_t that manifested mostly when using raidzN with at least as many columns as N (e.g. a four-disk raidz2 but not a three-disk raidz2). Sufficiently heavy raidz use would eventually run a system out of memory. Additionally: * Switch abd_cache arena to FIRSTFIT, which empirically improves perofrmance. * Make abd_chunk_cache more performant and debuggable. * Allocate the abd_zero_buf from abd_chunk_cache rather than the heap. * Don't try to reap non-existent qcaches in abd_cache arena. * KM_PUSHPAGE->KM_SLEEP when allocating chunks from their own arena Reviewed-by: Matthew Ahrens Reviewed-by: Alexander Motin Signed-off-by: Jorgen Lundman Co-authored-by: Sean Doran Closes #12295 --- include/sys/abd_impl.h | 2 +- module/os/freebsd/zfs/abd_os.c | 9 ++++++--- module/os/linux/zfs/abd_os.c | 3 ++- module/zfs/abd.c | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 6bce08cfa34..113700cd72b 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -64,7 +64,7 @@ void abd_free_struct(abd_t *); */ abd_t *abd_alloc_struct_impl(size_t); -abd_t *abd_get_offset_scatter(abd_t *, abd_t *, size_t); +abd_t *abd_get_offset_scatter(abd_t *, abd_t *, size_t, size_t); void abd_free_struct_impl(abd_t *); void abd_alloc_chunks(abd_t *, size_t); void abd_free_chunks(abd_t *); diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index 47adc2278df..4f5b33d9451 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -374,14 +374,17 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata) } abd_t * -abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off) +abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, + size_t size) { abd_verify(sabd); ASSERT3U(off, <=, sabd->abd_size); size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; - uint_t chunkcnt = abd_scatter_chunkcnt(sabd) - - (new_offset / zfs_abd_chunk_size); + size_t chunkcnt = abd_chunkcnt_for_bytes( + (new_offset % zfs_abd_chunk_size) + size); + + ASSERT3U(chunkcnt, <=, abd_scatter_chunkcnt(sabd)); /* * If an abd struct is provided, it is only the minimum size. If we diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index af543d6e3f7..d1d238a4e30 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -835,7 +835,8 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata) } abd_t * -abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off) +abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, + size_t size) { int i = 0; struct scatterlist *sg = NULL; diff --git a/module/zfs/abd.c b/module/zfs/abd.c index d5fafccd08a..cc2d3575db6 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -531,7 +531,7 @@ abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size) } ASSERT3U(left, ==, 0); } else { - abd = abd_get_offset_scatter(abd, sabd, off); + abd = abd_get_offset_scatter(abd, sabd, off, size); } ASSERT3P(abd, !=, NULL); From 490c845efe3ca29eaa8aa6ea1e3f45eda72895fe Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 1 Jul 2021 11:30:31 -0400 Subject: [PATCH 06/15] Compact dbuf/buf hashes and lock arrays With default dbuf cache size of 1/32 of ARC, it makes no sense to have hash table of the same size (or even bigger on Linux). Reduce it to 1/8 of ARC's one, still leaving some slack, assuming higher I/O rate via dbuf cache than via ARC. Remove padding from ARC hash locks array. The idea behind padding is to avoid false sharing between locks. It would have sense if there would be a limited number of very busy locks. But since we have no limit on the number, using the same memory for more locks we can achieve even lower lock contention with the same false sharing, or we can use less memory for the same contention level. Reduce number of hash locks from 8192 to 2048. The number is still big enough to not cause contention, but reduced memory size improves cache hit rate for mutex_tryenter() in ARC eviction thread, saving about 1% of the thread time. Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored-By: iXsystems, Inc. Closes #12289 --- include/sys/dbuf.h | 4 ++-- module/zfs/arc.c | 25 ++++++------------------- module/zfs/dbuf.c | 6 +++--- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index d221eac4c81..6ae079c6a64 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -322,12 +322,12 @@ typedef struct dmu_buf_impl { } dmu_buf_impl_t; /* Note: the dbuf hash table is exposed only for the mdb module */ -#define DBUF_MUTEXES 8192 +#define DBUF_MUTEXES 2048 #define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) typedef struct dbuf_hash_table { uint64_t hash_table_mask; dmu_buf_impl_t **hash_table; - kmutex_t hash_mutexes[DBUF_MUTEXES]; + kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned; } dbuf_hash_table_t; typedef void (*dbuf_prefetch_fn)(void *, boolean_t); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 3484fff3b4d..394ca1bfe42 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -740,29 +740,18 @@ taskq_t *arc_prune_taskq; * Hash table routines */ -#define HT_LOCK_ALIGN 64 -#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN))) - -struct ht_lock { - kmutex_t ht_lock; -#ifdef _KERNEL - unsigned char pad[HT_LOCK_PAD]; -#endif -}; - -#define BUF_LOCKS 8192 +#define BUF_LOCKS 2048 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; - struct ht_lock ht_locks[BUF_LOCKS]; + kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) -#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) -#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) +#define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) @@ -1111,7 +1100,7 @@ buf_fini(void) (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif for (i = 0; i < BUF_LOCKS; i++) - mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); + mutex_destroy(BUF_HASH_LOCK(i)); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); @@ -1276,10 +1265,8 @@ retry: for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); - for (i = 0; i < BUF_LOCKS; i++) { - mutex_init(&buf_hash_table.ht_locks[i].ht_lock, - NULL, MUTEX_DEFAULT, NULL); - } + for (i = 0; i < BUF_LOCKS; i++) + mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL); } #define ARC_MINTIME (hz>>4) /* 62 ms */ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 9ce091b80dc..289247c6ed6 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -826,12 +826,12 @@ dbuf_init(void) int i; /* - * The hash table is big enough to fill all of physical memory + * The hash table is big enough to fill one eighth of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ - while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE) + while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8) hsize <<= 1; retry: @@ -3055,8 +3055,8 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_state = DB_EVICTING; /* not worth logging this state change */ if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ - kmem_cache_free(dbuf_kmem_cache, db); mutex_exit(&dn->dn_dbufs_mtx); + kmem_cache_free(dbuf_kmem_cache, db); DBUF_STAT_BUMP(hash_insert_race); return (odb); } From b192a2c0a19133234972b491e9c06093b3d21358 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 1 Jul 2021 11:32:31 -0400 Subject: [PATCH 07/15] Remove avl_size field from struct avl_tree This field is used only by illumos mdb. On other platforms it only increases the struct size from 32 to 40 bytes. For struct vdev_queue including 13 instances of avl_tree_t size means active cache lines. Keep the padding in user-space for now to not break the ABI. Reviewed-by: Brian Behlendorf Reviewed-by: Matthew Ahrens Signed-off-by: Alexander Motin Sponsored-By: iXsystems, Inc. Closes #12290 --- include/sys/avl_impl.h | 4 +++- module/avl/avl.c | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/sys/avl_impl.h b/include/sys/avl_impl.h index f577ecd42f7..c464a62a1ca 100644 --- a/include/sys/avl_impl.h +++ b/include/sys/avl_impl.h @@ -147,7 +147,9 @@ struct avl_tree { int (*avl_compar)(const void *, const void *); size_t avl_offset; /* offsetof(type, avl_link_t field) */ ulong_t avl_numnodes; /* number of nodes in the tree */ - size_t avl_size; /* sizeof user type struct */ +#ifndef _KERNEL + size_t avl_pad; /* For backwards ABI compatibility. */ +#endif }; diff --git a/module/avl/avl.c b/module/avl/avl.c index 1a95092bc2b..3d36d4c87e0 100644 --- a/module/avl/avl.c +++ b/module/avl/avl.c @@ -875,7 +875,6 @@ avl_swap(avl_tree_t *tree1, avl_tree_t *tree2) ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar); ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset); - ASSERT3U(tree1->avl_size, ==, tree2->avl_size); temp_node = tree1->avl_root; temp_numnodes = tree1->avl_numnodes; @@ -903,7 +902,6 @@ avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *), tree->avl_compar = compar; tree->avl_root = NULL; tree->avl_numnodes = 0; - tree->avl_size = size; tree->avl_offset = offset; } From a5398f8782ebc22930cfbad846decaeb17570e86 Mon Sep 17 00:00:00 2001 From: Justin Gottula Date: Tue, 29 Jun 2021 18:50:13 -0700 Subject: [PATCH 08/15] Udev rules: use non-ancient comma syntax This file is old as dirt. It's entirely possible that commas were optional in udev back at that time. But they're definitely supposed to be there nowadays. Reviewed-by: Brian Behlendorf Reviewed-by: Pavel Zakharov Signed-off-by: Justin Gottula Closes #12302 --- udev/rules.d/60-zvol.rules.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udev/rules.d/60-zvol.rules.in b/udev/rules.d/60-zvol.rules.in index 60bbff8d2df..4447a253850 100644 --- a/udev/rules.d/60-zvol.rules.in +++ b/udev/rules.d/60-zvol.rules.in @@ -3,4 +3,4 @@ # persistent disk links: /dev/zvol/dataset_name # also creates compatibility symlink of /dev/dataset_name -KERNEL=="zd*" SUBSYSTEM=="block" ACTION=="add|change" PROGRAM="@udevdir@/zvol_id $tempnode" SYMLINK+="zvol/%c %c" +KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM="@udevdir@/zvol_id $tempnode", SYMLINK+="zvol/%c %c" From 17c794e7b0723b1f90229771ddf1dc2ac227f224 Mon Sep 17 00:00:00 2001 From: Justin Gottula Date: Tue, 29 Jun 2021 18:52:33 -0700 Subject: [PATCH 09/15] Udev rules: replace deprecated $tempnode with $devnode The $tempnode substitution is so old that it's not even mentioned in the man page anymore. It is still technically supported by udev, but with plenty of "deprecated" comments surrounding it. The preferred modern equivalent of $tempnode is $devnode (or alternatively, %N). Reviewed-by: Brian Behlendorf Reviewed-by: Pavel Zakharov Signed-off-by: Justin Gottula Closes #12302 --- udev/rules.d/60-zvol.rules.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udev/rules.d/60-zvol.rules.in b/udev/rules.d/60-zvol.rules.in index 4447a253850..5e1829029c6 100644 --- a/udev/rules.d/60-zvol.rules.in +++ b/udev/rules.d/60-zvol.rules.in @@ -3,4 +3,4 @@ # persistent disk links: /dev/zvol/dataset_name # also creates compatibility symlink of /dev/dataset_name -KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM="@udevdir@/zvol_id $tempnode", SYMLINK+="zvol/%c %c" +KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM="@udevdir@/zvol_id $devnode", SYMLINK+="zvol/%c %c" From f1aef8d4df83c8dc27e366fa1c720f91661d89c2 Mon Sep 17 00:00:00 2001 From: Justin Gottula Date: Tue, 29 Jun 2021 18:54:38 -0700 Subject: [PATCH 10/15] Udev rules: use match (==) rather than assign (=) for PROGRAM Assignment syntax (=) can be used for the PROGRAM key. But the PROGRAM key is really a match key, not an assign key. The internal logic used by udev to decide whether a PROGRAM key "matched" or not (which determines whether the remainder of the rule is evaluated) depends on whether the operator was OP_MATCH (==) or OP_NOMATCH (!=). [1] The man page claims that '"=", ":=", and "+=" have the same effect as "=="' for PROGRAM keys. And, after a brief perusal, the udev source code does seem to confirm that operators other than OP_MATCH (==) or OP_NOMATCH (!=) are implicitly converted to OP_MATCH (==). [2] But it's not entirely clear that this is definitely the case: anecdotal testing seems to indicate that when OP_ASSIGN (=) is used, the program's exit status is disregarded and the remainder of the rule is processed regardless of whether it was, in fact, a successful exit. The bottom line here is that, if zvol_id hits some snag and returns a nonzero exit status, then we almost certainly do NOT want to continue on with the rule and use whatever the stdout contents may have been to mindlessly create /dev/zvol/* symlinks. Therefore, let's be extra-sure and use the match (==) operator explicitly, to eliminate any possibility that udev might do the wrong thing, and ensure that a nonzero exit status will definitely short-circuit the rest of the rule, bypassing the SYMLINK+= assignments. [1] udev, file src/udev/udev-rules.c, func udev_rule_apply_token_to_event, switch case TK_M_PROGRAM if r != 0 (nonzero exit status): return token->op == OP_NOMATCH; switch case TK_M_PROGRAM if r == 0 (zero exit status): return token->op == OP_MATCH; func retval 0 => key is considered to have matched func retval 1 => key is considered to have NOT matched [2] udev, file src/udev/udev-rules.c, func parse_token, at func start: bool is_match = IN_SET(op, OP_MATCH, OP_NOMATCH); in else-if case streq(key, "PROGRAM"): if (!is_match) op = OP_MATCH; Reviewed-by: Brian Behlendorf Reviewed-by: Pavel Zakharov Signed-off-by: Justin Gottula Closes #12302 --- udev/rules.d/60-zvol.rules.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udev/rules.d/60-zvol.rules.in b/udev/rules.d/60-zvol.rules.in index 5e1829029c6..a3c7d2acf75 100644 --- a/udev/rules.d/60-zvol.rules.in +++ b/udev/rules.d/60-zvol.rules.in @@ -3,4 +3,4 @@ # persistent disk links: /dev/zvol/dataset_name # also creates compatibility symlink of /dev/dataset_name -KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM="@udevdir@/zvol_id $devnode", SYMLINK+="zvol/%c %c" +KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM=="@udevdir@/zvol_id $devnode", SYMLINK+="zvol/%c %c" From b19e2bdfb5a2f0b652e12f73d8e01e80773aaadd Mon Sep 17 00:00:00 2001 From: Justin Gottula Date: Tue, 29 Jun 2021 20:14:18 -0700 Subject: [PATCH 11/15] Print zvol_id error messages to stderr rather than stdout The zvol_id program is invoked by udev, via a PROGRAM key in the 60-zvol.rules.in rule file, to determine the "pretty" /dev/zvol/* symlink paths paths that should be generated for each opaquely named /dev/zd* dev node. The udev rule uses the PROGRAM key, followed by a SYMLINK+= assignment containing the %c substitution, to collect the program's stdout and then "paste" it directly into the name of the symlink(s) to be created. Unfortunately, as currently written, zvol_id outputs both its intended output (a single string representing the symlink path that should be created to refer to the name of the dataset whose /dev/zd* path is given) AND its error messages (if any) to stdout. When processing PROGRAM keys (and others, such as IMPORT{program}), udev uses only the data written to stdout for functional purposes. Any data written to stderr is used solely for the purposes of logging (if udev's log_level is set to debug). The unintended consequence of this is as follows: if zvol_id encounters an error condition; and then udev fails to halt processing of the current rule (either because zvol_id didn't return a nonzero exit status, or because the PROGRAM key in the rule wasn't written properly to result in a "non-match" condition that would stop the current rule on a nonzero exit); then udev will create a space-delimited list of symlink names derived directly from the words of the error message string! I've observed this exact behavior on my own system, in a situation where the open() syscall on /dev/zd* dev nodes was failing sporadically (for reasons that aren't especially relevant here). Because the open() call failed, zvol_id printed "Unable to open device file: /dev/zd736\n" to stdout and then exited. The udev rule finished with SYMLINK+="zvol/%c %c". Assuming a volume name like pool/foo/bar, this would ordinarily expand to SYMLINK+="zvol/pool/foo/bar pool/foo/bar" and would cause symlinks to be created like this: /dev/zvol/pool/foo/bar -> /dev/zd736 /dev/pool/foo/bar -> /dev/zd736 But because of the combination of error messages being printed to stdout, and the udev syntax freely accepting a space-delimited sequence of names in this context, the error message string "Unable to open device file: /dev/zd736\n" in reality expanded to SYMLINK+="zvol/Unable to open device file: /dev/zd736" which caused the following symlinks to actually be created: /dev/zvol/Unable -> /dev/zd736 /dev/to -> /dev/zd736 /dev/open -> /dev/zd736 /dev/device -> /dev/zd736 /dev/file: -> /dev/zd736 /dev//dev/zd736 -> /dev/zd736 (And, because multiple zvols had open() syscall errors, multiple zvols attempted to claim several of those symlink names, resulting in numerous udev errors and timeouts and general chaos.) This commit rectifies all this silliness by simply printing error messages to stderr, as Dennis Ritchie originally intended. Reviewed-by: Brian Behlendorf Reviewed-by: Pavel Zakharov Signed-off-by: Justin Gottula Closes #12302 --- cmd/zvol_id/zvol_id_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmd/zvol_id/zvol_id_main.c b/cmd/zvol_id/zvol_id_main.c index 4a2d74cc203..7e01028562b 100644 --- a/cmd/zvol_id/zvol_id_main.c +++ b/cmd/zvol_id/zvol_id_main.c @@ -63,14 +63,14 @@ main(int argc, char **argv) int rc; if (argc < 2) { - printf("Usage: %s /dev/zvol_device_node\n", argv[0]); + fprintf(stderr, "Usage: %s /dev/zvol_device_node\n", argv[0]); return (EINVAL); } dev_name = argv[1]; error = stat64(dev_name, &statbuf); if (error != 0) { - printf("Unable to access device file: %s\n", dev_name); + fprintf(stderr, "Unable to access device file: %s\n", dev_name); return (errno); } @@ -79,13 +79,13 @@ main(int argc, char **argv) fd = open(dev_name, O_RDONLY); if (fd < 0) { - printf("Unable to open device file: %s\n", dev_name); + fprintf(stderr, "Unable to open device file: %s\n", dev_name); return (errno); } error = ioctl_get_msg(zvol_name, fd); if (error < 0) { - printf("ioctl_get_msg failed:%s\n", strerror(errno)); + fprintf(stderr, "ioctl_get_msg failed: %s\n", strerror(errno)); return (errno); } if (dev_part > 0) From f24c7c359ea486438ecadf1496e7c9a77b132a0c Mon Sep 17 00:00:00 2001 From: Justin Gottula Date: Tue, 29 Jun 2021 21:29:09 -0700 Subject: [PATCH 12/15] Use substantially more robust program exit status logic in zvol_id Currently, there are several places in zvol_id where the program logic returns particular errno values, or even particular ioctl return values, as the program exit status, rather than a straightforward system of explicit zero on success and explicit nonzero value(s) on failure. This is problematic for multiple reasons. One particularly interesting problem that can arise, is that if any of these values happens to have all 8 least significant bits unset (i.e., it is a positive or negative multiple of 256), then although the C program sees a nonzero int value (presumed to be a failure exit status), the actual exit status as seen by the system is only the bottom 8 bits of that integer: zero. This can happen in practice, and I have encountered it myself. In a particularly weird situation, the zvol_open code in the zfs kernel module was behaving in such a manner that it caused the open() syscall to fail and for errno to be set to a kernel-private value (ERESTARTSYS, which happens to be defined as 512). It turns out that 512 is evenly divisible by 256; or, in other words, its least significant 8 bits are all-zero. So even though zvol_id believed it was returning a nonzero (failure) exit status of 512, the system modulo'd that value by 256, resulting in the actual exit status visible by other programs being 0! This actually-zero (non-failure) exit status caused problems: udev believed that the program was operating successfully, when in fact it was attempting to indicate failure via a nonzero exit status integer. Combined with another problem, this led to the creation of nonsense symlinks for zvol dev nodes by udev. Let's get rid of all this problematic logic, and simply return EXIT_SUCCESS (0) is everything went fine, and EXIT_FAILURE (1) if anything went wrong. Additionally, let's clarify some of the variable names (error is similar to errno, etc) and clean up the overall program flow a bit. Reviewed-by: Brian Behlendorf Reviewed-by: Pavel Zakharov Signed-off-by: Justin Gottula Closes #12302 --- cmd/zvol_id/zvol_id_main.c | 50 ++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/cmd/zvol_id/zvol_id_main.c b/cmd/zvol_id/zvol_id_main.c index 7e01028562b..22f2e848cba 100644 --- a/cmd/zvol_id/zvol_id_main.c +++ b/cmd/zvol_id/zvol_id_main.c @@ -38,40 +38,39 @@ static int ioctl_get_msg(char *var, int fd) { - int error = 0; + int ret; char msg[ZFS_MAX_DATASET_NAME_LEN]; - error = ioctl(fd, BLKZNAME, msg); - if (error < 0) { - return (error); + ret = ioctl(fd, BLKZNAME, msg); + if (ret < 0) { + return (ret); } snprintf(var, ZFS_MAX_DATASET_NAME_LEN, "%s", msg); - return (error); + return (ret); } int main(int argc, char **argv) { - int fd, error = 0; + int fd = -1, ret = 0, status = EXIT_FAILURE; char zvol_name[ZFS_MAX_DATASET_NAME_LEN]; char *zvol_name_part = NULL; char *dev_name; struct stat64 statbuf; int dev_minor, dev_part; int i; - int rc; if (argc < 2) { fprintf(stderr, "Usage: %s /dev/zvol_device_node\n", argv[0]); - return (EINVAL); + goto fail; } dev_name = argv[1]; - error = stat64(dev_name, &statbuf); - if (error != 0) { + ret = stat64(dev_name, &statbuf); + if (ret != 0) { fprintf(stderr, "Unable to access device file: %s\n", dev_name); - return (errno); + goto fail; } dev_minor = minor(statbuf.st_rdev); @@ -80,22 +79,22 @@ main(int argc, char **argv) fd = open(dev_name, O_RDONLY); if (fd < 0) { fprintf(stderr, "Unable to open device file: %s\n", dev_name); - return (errno); + goto fail; } - error = ioctl_get_msg(zvol_name, fd); - if (error < 0) { + ret = ioctl_get_msg(zvol_name, fd); + if (ret < 0) { fprintf(stderr, "ioctl_get_msg failed: %s\n", strerror(errno)); - return (errno); + goto fail; } if (dev_part > 0) - rc = asprintf(&zvol_name_part, "%s-part%d", zvol_name, + ret = asprintf(&zvol_name_part, "%s-part%d", zvol_name, dev_part); else - rc = asprintf(&zvol_name_part, "%s", zvol_name); + ret = asprintf(&zvol_name_part, "%s", zvol_name); - if (rc == -1 || zvol_name_part == NULL) - goto error; + if (ret == -1 || zvol_name_part == NULL) + goto fail; for (i = 0; i < strlen(zvol_name_part); i++) { if (isblank(zvol_name_part[i])) @@ -103,8 +102,13 @@ main(int argc, char **argv) } printf("%s\n", zvol_name_part); - free(zvol_name_part); -error: - close(fd); - return (error); + status = EXIT_SUCCESS; + +fail: + if (zvol_name_part) + free(zvol_name_part); + if (fd >= 0) + close(fd); + + return (status); } From 6e4e3c3ab67d4ad050a5e704287ea3577fe45b17 Mon Sep 17 00:00:00 2001 From: Justin Gottula Date: Tue, 6 Jul 2021 13:41:17 -0700 Subject: [PATCH 13/15] Udev rules: remove zvol compat symlinks (without the leading zvol/) This is a potentially arguable change, because it removes some compatibility cruft that certain systems or people may have come to rely on (either a very long time ago, or unwisely in recent times). On the other hand, it's been literally over a decade since OpenZFS switched to the strategy of using opaque numbered /dev/zd* device nodes, with the canonical zvol access path being a directory tree of symlinks created by udev rules inside /dev/zvol/*. (See #102.) Even at the time, the /dev/* scheme was labeled as being for "compatibility". This commit removes the second tree of symlinks located directly at /dev/*, under the assumption that anybody with any sense has been using the intended /dev/zvol/* path for a very very long time now. (The more I think about this, the more I anticipate that some large fraction of people will have been blissfully unaware that the intention has been for them to use the /dev/zvol/* tree all along, and they will have come to rely upon the /dev/* tree simply because it's been there this whole time despite being a compat thing.) Reviewed-by: Brian Behlendorf Reviewed-by: Pavel Zakharov Reviewed-by: Neal Gompa Signed-off-by: Justin Gottula Closes #12303 --- udev/rules.d/60-zvol.rules.in | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/udev/rules.d/60-zvol.rules.in b/udev/rules.d/60-zvol.rules.in index a3c7d2acf75..9a2a473a841 100644 --- a/udev/rules.d/60-zvol.rules.in +++ b/udev/rules.d/60-zvol.rules.in @@ -1,6 +1,11 @@ # Persistent links for zvol # # persistent disk links: /dev/zvol/dataset_name -# also creates compatibility symlink of /dev/dataset_name +# +# NOTE: We used to also create an additional tree of zvol symlinks located at +# /dev/dataset_name (i.e. without the 'zvol' path component) for +# compatibility reasons. These are no longer created anymore, and should +# not be relied upon. +# -KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM=="@udevdir@/zvol_id $devnode", SYMLINK+="zvol/%c %c" +KERNEL=="zd*", SUBSYSTEM=="block", ACTION=="add|change", PROGRAM=="@udevdir@/zvol_id $devnode", SYMLINK+="zvol/%c" From 97752ba22a4a036f9bc54556d114c77f68796b8f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 6 Jul 2021 17:38:00 -0400 Subject: [PATCH 14/15] Move gethrtime() calls out of vdev queue lock This dramatically reduces the lock contention on systems with slower (non-TSC) timecounters. With TSC the difference is minimal, but since this lock is pretty congested, any improvement counts. Plus I don't see any reason to do it under the lock other than the latency of the lock itself, which this change actually reduces. Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored-By: iXsystems, Inc. Closes #12281 --- module/zfs/vdev_queue.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 198861edb81..06d22f6df4c 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -912,9 +912,9 @@ vdev_queue_io(zio_t *zio) } zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; + zio->io_timestamp = gethrtime(); mutex_enter(&vq->vq_lock); - zio->io_timestamp = gethrtime(); vdev_queue_io_add(vq, zio); nio = vdev_queue_io_to_issue(vq); mutex_exit(&vq->vq_lock); @@ -936,14 +936,13 @@ vdev_queue_io_done(zio_t *zio) vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; + hrtime_t now = gethrtime(); + vq->vq_io_complete_ts = now; + vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp; + mutex_enter(&vq->vq_lock); - vdev_queue_pending_remove(vq, zio); - zio->io_delta = gethrtime() - zio->io_timestamp; - vq->vq_io_complete_ts = gethrtime(); - vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp; - while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { mutex_exit(&vq->vq_lock); if (nio->io_done == vdev_queue_agg_io_done) { From bdd11cbb90a2afa54fd00935ac0d34b4ddf2515c Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 6 Jul 2021 20:39:23 -0400 Subject: [PATCH 15/15] FreeBSD: Hardcode abd_chunk_size to PAGE_SIZE It makes no sense to set it below PAGE_SIZE, since it increases all overheads and makes returning memory to OS problematic. It makes no sense to set it above PAGE_SIZE, since such allocations and especially frees are too expensive and cause KVA fragmentation to benefit from fewer chunks. After that it makes no sense to keep more complicated math here. What may have sense though is just a tunable border between linear and scatter ABDs, previously also controlled by this tunable. Retain that functionality by taking abd_scatter_min_size tunable from Linux, just with different default value. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Reviewed-by: Ryan Moeller Signed-off-by: Alexander Motin Closes #12328 --- include/sys/abd.h | 1 - module/os/freebsd/zfs/abd_os.c | 129 +++++++++++++-------------------- 2 files changed, 51 insertions(+), 79 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index a7eee89ca91..6903e0c0e71 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -61,7 +61,6 @@ typedef struct abd { struct abd_scatter { uint_t abd_offset; #if defined(__FreeBSD__) && defined(_KERNEL) - uint_t abd_chunk_size; void *abd_chunks[1]; /* actually variable-length */ #else uint_t abd_nents; diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index 4f5b33d9451..95a83542fad 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -79,22 +79,29 @@ struct { } abd_sums; /* - * The size of the chunks ABD allocates. Because the sizes allocated from the - * kmem_cache can't change, this tunable can only be modified at boot. Changing - * it at runtime would cause ABD iteration to work incorrectly for ABDs which - * were allocated with the old size, so a safeguard has been put in place which - * will cause the machine to panic if you change it and try to access the data - * within a scattered ABD. + * zfs_abd_scatter_min_size is the minimum allocation size to use scatter + * ABD's for. Smaller allocations will use linear ABD's which use + * zio_[data_]buf_alloc(). + * + * Scatter ABD's use at least one page each, so sub-page allocations waste + * some space when allocated as scatter (e.g. 2KB scatter allocation wastes + * half of each page). Using linear ABD's for small allocations means that + * they will be put on slabs which contain many allocations. + * + * Linear ABDs for multi-page allocations are easier to use, and in some cases + * it allows to avoid buffer copying. But allocation and especially free + * of multi-page linear ABDs are expensive operations due to KVA mapping and + * unmapping, and with time they cause KVA fragmentations. */ -size_t zfs_abd_chunk_size = 4096; +size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1; #if defined(_KERNEL) SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN, - &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates"); +SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN, + &zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations."); #endif kmem_cache_t *abd_chunk_cache; @@ -102,23 +109,16 @@ static kstat_t *abd_ksp; /* * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks are - * just a single zero'd sized zfs_abd_chunk_size buffer. This - * allows us to conserve memory by only using a single zero buffer - * for the scatter chunks. + * just a single zero'd page-sized buffer. This allows us to conserve + * memory by only using a single zero buffer for the scatter chunks. */ abd_t *abd_zero_scatter = NULL; static char *abd_zero_buf = NULL; -static void -abd_free_chunk(void *c) -{ - kmem_cache_free(abd_chunk_cache, c); -} - static uint_t abd_chunkcnt_for_bytes(size_t size) { - return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); + return ((size + PAGE_MASK) >> PAGE_SHIFT); } static inline uint_t @@ -132,7 +132,7 @@ abd_scatter_chunkcnt(abd_t *abd) boolean_t abd_size_alloc_linear(size_t size) { - return (size <= zfs_abd_chunk_size ? B_TRUE : B_FALSE); + return (size < zfs_abd_scatter_min_size ? B_TRUE : B_FALSE); } void @@ -140,7 +140,7 @@ abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) { uint_t n = abd_scatter_chunkcnt(abd); ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); - int waste = n * zfs_abd_chunk_size - abd->abd_size; + int waste = (n << PAGE_SHIFT) - abd->abd_size; if (op == ABDSTAT_INCR) { ABDSTAT_BUMP(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); @@ -173,11 +173,11 @@ abd_verify_scatter(abd_t *abd) uint_t i, n; /* - * There is no scatter linear pages in FreeBSD so there is an - * if an error if the ABD has been marked as a linear page. + * There is no scatter linear pages in FreeBSD so there is + * an error if the ABD has been marked as a linear page. */ ASSERT(!abd_is_linear_page(abd)); - ASSERT3U(ABD_SCATTER(abd).abd_offset, <, zfs_abd_chunk_size); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, PAGE_SIZE); n = abd_scatter_chunkcnt(abd); for (i = 0; i < n; i++) { ASSERT3P(ABD_SCATTER(abd).abd_chunks[i], !=, NULL); @@ -191,11 +191,9 @@ abd_alloc_chunks(abd_t *abd, size_t size) n = abd_chunkcnt_for_bytes(size); for (i = 0; i < n; i++) { - void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); - ASSERT3P(c, !=, NULL); - ABD_SCATTER(abd).abd_chunks[i] = c; + ABD_SCATTER(abd).abd_chunks[i] = + kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); } - ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; } void @@ -205,7 +203,8 @@ abd_free_chunks(abd_t *abd) n = abd_scatter_chunkcnt(abd); for (i = 0; i < n; i++) { - abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]); + kmem_cache_free(abd_chunk_cache, + ABD_SCATTER(abd).abd_chunks[i]); } } @@ -250,15 +249,13 @@ abd_alloc_zero_scatter(void) uint_t i, n; n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); - abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP); + abd_zero_buf = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS; abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; ABD_SCATTER(abd_zero_scatter).abd_offset = 0; - ABD_SCATTER(abd_zero_scatter).abd_chunk_size = - zfs_abd_chunk_size; for (i = 0; i < n; i++) { ABD_SCATTER(abd_zero_scatter).abd_chunks[i] = @@ -266,18 +263,18 @@ abd_alloc_zero_scatter(void) } ABDSTAT_BUMP(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, zfs_abd_chunk_size); + ABDSTAT_INCR(abdstat_scatter_data_size, PAGE_SIZE); } static void abd_free_zero_scatter(void) { ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, -(int)zfs_abd_chunk_size); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGE_SIZE); abd_free_struct(abd_zero_scatter); abd_zero_scatter = NULL; - kmem_free(abd_zero_buf, zfs_abd_chunk_size); + kmem_cache_free(abd_chunk_cache, abd_zero_buf); } static int @@ -305,7 +302,7 @@ abd_kstats_update(kstat_t *ksp, int rw) void abd_init(void) { - abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, + abd_chunk_cache = kmem_cache_create("abd_chunk", PAGE_SIZE, 0, NULL, NULL, NULL, NULL, 0, KMC_NODEBUG); wmsum_init(&abd_sums.abdstat_struct_size, 0); @@ -382,7 +379,7 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; size_t chunkcnt = abd_chunkcnt_for_bytes( - (new_offset % zfs_abd_chunk_size) + size); + (new_offset & PAGE_MASK) + size); ASSERT3U(chunkcnt, <=, abd_scatter_chunkcnt(sabd)); @@ -397,7 +394,7 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, } if (abd == NULL) - abd = abd_alloc_struct(chunkcnt * zfs_abd_chunk_size); + abd = abd_alloc_struct(chunkcnt << PAGE_SHIFT); /* * Even if this buf is filesystem metadata, we only track that @@ -405,34 +402,16 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, * this case. Therefore, we don't ever use ABD_FLAG_META here. */ - ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size; - ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; + ABD_SCATTER(abd).abd_offset = new_offset & PAGE_MASK; /* Copy the scatterlist starting at the correct offset */ (void) memcpy(&ABD_SCATTER(abd).abd_chunks, - &ABD_SCATTER(sabd).abd_chunks[new_offset / - zfs_abd_chunk_size], + &ABD_SCATTER(sabd).abd_chunks[new_offset >> PAGE_SHIFT], chunkcnt * sizeof (void *)); return (abd); } -static inline size_t -abd_iter_scatter_chunk_offset(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((ABD_SCATTER(aiter->iter_abd).abd_offset + - aiter->iter_pos) % zfs_abd_chunk_size); -} - -static inline size_t -abd_iter_scatter_chunk_index(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((ABD_SCATTER(aiter->iter_abd).abd_offset + - aiter->iter_pos) / zfs_abd_chunk_size); -} - /* * Initialize the abd_iter. */ @@ -483,29 +462,25 @@ void abd_iter_map(struct abd_iter *aiter) { void *paddr; - size_t offset = 0; ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); - /* Panic if someone has changed zfs_abd_chunk_size */ - IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == - ABD_SCATTER(aiter->iter_abd).abd_chunk_size); - /* There's nothing left to iterate over, so do nothing */ if (abd_iter_at_end(aiter)) return; - if (abd_is_linear(aiter->iter_abd)) { - offset = aiter->iter_pos; - aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = ABD_LINEAR_BUF(aiter->iter_abd); + abd_t *abd = aiter->iter_abd; + size_t offset = aiter->iter_pos; + if (abd_is_linear(abd)) { + aiter->iter_mapsize = abd->abd_size - offset; + paddr = ABD_LINEAR_BUF(abd); } else { - size_t index = abd_iter_scatter_chunk_index(aiter); - offset = abd_iter_scatter_chunk_offset(aiter); - aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset, - aiter->iter_abd->abd_size - aiter->iter_pos); - paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index]; + offset += ABD_SCATTER(abd).abd_offset; + paddr = ABD_SCATTER(abd).abd_chunks[offset >> PAGE_SHIFT]; + offset &= PAGE_MASK; + aiter->iter_mapsize = MIN(PAGE_SIZE - offset, + abd->abd_size - aiter->iter_pos); } aiter->iter_mapaddr = (char *)paddr + offset; } @@ -517,12 +492,10 @@ abd_iter_map(struct abd_iter *aiter) void abd_iter_unmap(struct abd_iter *aiter) { - /* There's nothing left to unmap, so do nothing */ - if (abd_iter_at_end(aiter)) - return; - - ASSERT3P(aiter->iter_mapaddr, !=, NULL); - ASSERT3U(aiter->iter_mapsize, >, 0); + if (!abd_iter_at_end(aiter)) { + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + } aiter->iter_mapaddr = NULL; aiter->iter_mapsize = 0;