From 052777406601a4049c28e25fe0b4df57160b5a58 Mon Sep 17 00:00:00 2001 From: Andrew Innes Date: Thu, 2 Nov 2023 06:19:44 +0800 Subject: [PATCH 01/13] Use env var for sed Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Andrew Innes Closes #15470 --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 4c75616e429..99db7a1e35f 100644 --- a/configure.ac +++ b/configure.ac @@ -44,7 +44,7 @@ AM_INIT_AUTOMAKE([subdir-objects foreign]) # Remove default macros from config.h: # PACKAGE, PACKAGE_{BUGREPORT,NAME,STRING,TARNAME,VERSION}, STDC_HEADERS, VERSION AC_CONFIG_HEADERS([zfs_config.h], [ - sed -nri~ -e '/^$/be' -e 'N;N;/#define (PACKAGE|VERSION|STDC_HEADERS)/d' -e ':e' -e 'p' zfs_config.h && rm zfs_config.h~ || exit]) + $SED -nri~ -e '/^$/be' -e 'N;N;/#define (PACKAGE|VERSION|STDC_HEADERS)/d' -e ':e' -e 'p' zfs_config.h && rm zfs_config.h~ || exit]) LT_INIT AC_PROG_INSTALL From 3bd4df3841529316e5145590cc67076467b6abb7 Mon Sep 17 00:00:00 2001 From: ednadolski-ix <137826107+ednadolski-ix@users.noreply.github.com> Date: Mon, 6 Nov 2023 11:38:42 -0700 Subject: [PATCH 02/13] Improve ZFS objset sync parallelism As part of transaction group commit, dsl_pool_sync() sequentially calls dsl_dataset_sync() for each dirty dataset, which subsequently calls dmu_objset_sync(). dmu_objset_sync() in turn uses up to 75% of CPU cores to run sync_dnodes_task() in taskq threads to sync the dirty dnodes (files). There are two problems: 1. Each ZVOL in a pool is a separate dataset/objset having a single dnode. This means the objsets are synchronized serially, which leads to a bottleneck of ~330K blocks written per second per pool. 2. In the case of multiple dirty dnodes/files on a dataset/objset on a big system they will be sync'd in parallel taskq threads. However, it is inefficient to to use 75% of CPU cores of a big system to do that, because of (a) bottlenecks on a single write issue taskq, and (b) allocation throttling. In addition, if not for the allocation throttling sorting write requests by bookmarks (logical address), writes for different files may reach space allocators interleaved, leading to unwanted fragmentation. The solution to both problems is to always sync no more and (if possible) no fewer dnodes at the same time than there are allocators the pool. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Edmund Nadolski Closes #15197 --- include/os/freebsd/spl/sys/taskq.h | 3 + include/os/linux/spl/sys/taskq.h | 2 + include/sys/spa.h | 5 + include/sys/spa_impl.h | 12 +- include/sys/zfs_context.h | 2 + include/sys/zio.h | 6 + lib/libzpool/taskq.c | 30 +++++ man/man4/zfs.4 | 22 ++-- module/os/freebsd/spl/spl_taskq.c | 82 +++++++++++++ module/os/linux/spl/spl-taskq.c | 36 ++++++ module/zfs/dbuf.c | 9 +- module/zfs/dmu_objset.c | 138 +++++++++++++++------ module/zfs/dnode_sync.c | 1 + module/zfs/dsl_dataset.c | 5 +- module/zfs/dsl_pool.c | 31 ++--- module/zfs/spa.c | 190 +++++++++++++++++++++++------ module/zfs/spa_misc.c | 14 ++- module/zfs/zio.c | 43 ++++--- 18 files changed, 513 insertions(+), 118 deletions(-) diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h index b23a939b3aa..0f23eafe3d4 100644 --- a/include/os/freebsd/spl/sys/taskq.h +++ b/include/os/freebsd/spl/sys/taskq.h @@ -42,6 +42,7 @@ extern "C" { typedef struct taskq { struct taskqueue *tq_queue; + int tq_nthreads; } taskq_t; typedef uintptr_t taskqid_t; @@ -93,6 +94,8 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t, + kthread_t ***); taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, struct proc *, uint_t); diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index 6c1b4377a98..aa5860c56e8 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -150,6 +150,8 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, extern int taskq_empty_ent(taskq_ent_t *); extern void taskq_init_ent(taskq_ent_t *); extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t, + kthread_t ***); extern void taskq_destroy(taskq_t *); extern void taskq_wait_id(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t); diff --git a/include/sys/spa.h b/include/sys/spa.h index 88ef510b744..cef7933df44 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -825,6 +825,11 @@ extern void spa_sync_allpools(void); extern uint_t zfs_sync_pass_deferred_free; +/* spa sync taskqueues */ +taskq_t *spa_sync_tq_create(spa_t *spa, const char *name); +void spa_sync_tq_destroy(spa_t *spa); +void spa_select_allocator(zio_t *zio); + /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 094258d47a4..b1eb06f94fc 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -188,6 +188,12 @@ typedef struct spa_taskqs { taskq_t **stqs_taskq; } spa_taskqs_t; +/* one for each thread in the spa sync taskq */ +typedef struct spa_syncthread_info { + kthread_t *sti_thread; + taskq_t *sti_wr_iss_tq; /* assigned wr_iss taskq */ +} spa_syncthread_info_t; + typedef enum spa_all_vdev_zap_action { AVZ_ACTION_NONE = 0, AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */ @@ -265,6 +271,10 @@ struct spa { int spa_alloc_count; int spa_active_allocator; /* selectable allocator */ + /* per-allocator sync thread taskqs */ + taskq_t *spa_sync_tq; + spa_syncthread_info_t *spa_syncthreads; + spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ nvlist_t *spa_label_features; /* Features for reading MOS */ @@ -456,7 +466,7 @@ extern char *spa_config_path; extern const char *zfs_deadman_failmode; extern uint_t spa_slop_shift; extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio); extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags); extern void spa_load_spares(spa_t *spa); diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 750ca612b96..9ec2f73b366 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -496,6 +496,8 @@ extern taskq_t *system_taskq; extern taskq_t *system_delay_taskq; extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t, + kthread_t ***); #define taskq_create_proc(a, b, c, d, e, p, f) \ (taskq_create(a, b, c, d, e, f)) #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ diff --git a/include/sys/zio.h b/include/sys/zio.h index e1f4d5c0449..25a4b221f05 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -223,6 +223,9 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_REEXECUTED (1ULL << 29) #define ZIO_FLAG_DELEGATED (1ULL << 30) +#define ZIO_ALLOCATOR_NONE (-1) +#define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) + #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) @@ -526,6 +529,9 @@ struct zio { /* Taskq dispatching state */ taskq_ent_t io_tqent; + + /* write issue taskq selection, based upon sync thread */ + taskq_t *io_wr_iss_tq; }; enum blk_verify_flag { diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index a2e457ef9e6..99a181ec3c9 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -337,6 +337,36 @@ taskq_destroy(taskq_t *tq) kmem_free(tq, sizeof (taskq_t)); } +/* + * Create a taskq with a specified number of pool threads. Allocate + * and return an array of nthreads kthread_t pointers, one for each + * thread in the pool. The array is not ordered and must be freed + * by the caller. + */ +taskq_t * +taskq_create_synced(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) +{ + taskq_t *tq; + kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, + KM_SLEEP); + + (void) pri; (void) minalloc; (void) maxalloc; + + flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); + + tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, + flags | TASKQ_PREPOPULATE); + VERIFY(tq != NULL); + VERIFY(tq->tq_nthreads == nthreads); + + for (int i = 0; i < nthreads; i++) { + kthreads[i] = tq->tq_threadlist[i]; + } + *ktpp = kthreads; + return (tq); +} + int taskq_member(taskq_t *tq, kthread_t *t) { diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index ddad00be412..f9824ac170e 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -496,6 +496,13 @@ If we have less than this amount of free space, most ZPL operations (e.g. write, create) will return .Sy ENOSPC . . +.It Sy spa_num_allocators Ns = Ns Sy 4 Pq int +Determines the number of block alloctators to use per spa instance. +Capped by the number of actual CPUs in the system. +.Pp +Note that setting this value too high could result in performance +degredation and/or excess fragmentation. +. .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint Limits the number of on-disk error log entries that will be converted to the new format when enabling the @@ -1974,13 +1981,6 @@ and may need to load new metaslabs to satisfy these allocations. .It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint Rewrite new block pointers starting in this pass. . -.It Sy zfs_sync_taskq_batch_pct Ns = Ns Sy 75 Ns % Pq int -This controls the number of threads used by -.Sy dp_sync_taskq . -The default value of -.Sy 75% -will create a maximum of one thread per CPU. -. .It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Maximum size of TRIM command. Larger ranges will be split into chunks no larger than this value before @@ -2265,6 +2265,14 @@ If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. . +.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint +Determines the number of CPUs to run write issue taskqs. +.Pp +When 0 (the default), the value to use is computed internally +as the number of actual CPUs in the system divided by the +.Sy spa_num_allocators +value. +. .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. This may slightly improve startup time on diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c index 842b80ade1f..6912b220a94 100644 --- a/module/os/freebsd/spl/spl_taskq.c +++ b/module/os/freebsd/spl/spl_taskq.c @@ -220,6 +220,7 @@ taskq_create_impl(const char *name, int nthreads, pri_t pri, nthreads = MAX((mp_ncpus * nthreads) / 100, 1); tq = kmem_alloc(sizeof (*tq), KM_SLEEP); + tq->tq_nthreads = nthreads; tq->tq_queue = taskqueue_create(name, M_WAITOK, taskqueue_thread_enqueue, &tq->tq_queue); taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT, @@ -254,6 +255,87 @@ taskq_destroy(taskq_t *tq) kmem_free(tq, sizeof (*tq)); } +static void taskq_sync_assign(void *arg); + +typedef struct taskq_sync_arg { + kthread_t *tqa_thread; + kcondvar_t tqa_cv; + kmutex_t tqa_lock; + int tqa_ready; +} taskq_sync_arg_t; + +static void +taskq_sync_assign(void *arg) +{ + taskq_sync_arg_t *tqa = arg; + + mutex_enter(&tqa->tqa_lock); + tqa->tqa_thread = curthread; + tqa->tqa_ready = 1; + cv_signal(&tqa->tqa_cv); + while (tqa->tqa_ready == 1) + cv_wait(&tqa->tqa_cv, &tqa->tqa_lock); + mutex_exit(&tqa->tqa_lock); +} + +/* + * Create a taskq with a specified number of pool threads. Allocate + * and return an array of nthreads kthread_t pointers, one for each + * thread in the pool. The array is not ordered and must be freed + * by the caller. + */ +taskq_t * +taskq_create_synced(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) +{ + taskq_t *tq; + taskq_sync_arg_t *tqs = kmem_zalloc(sizeof (*tqs) * nthreads, KM_SLEEP); + kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, + KM_SLEEP); + + flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); + + tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, + flags | TASKQ_PREPOPULATE); + VERIFY(tq != NULL); + VERIFY(tq->tq_nthreads == nthreads); + + /* spawn all syncthreads */ + for (int i = 0; i < nthreads; i++) { + cv_init(&tqs[i].tqa_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&tqs[i].tqa_lock, NULL, MUTEX_DEFAULT, NULL); + (void) taskq_dispatch(tq, taskq_sync_assign, + &tqs[i], TQ_FRONT); + } + + /* wait on all syncthreads to start */ + for (int i = 0; i < nthreads; i++) { + mutex_enter(&tqs[i].tqa_lock); + while (tqs[i].tqa_ready == 0) + cv_wait(&tqs[i].tqa_cv, &tqs[i].tqa_lock); + mutex_exit(&tqs[i].tqa_lock); + } + + /* let all syncthreads resume, finish */ + for (int i = 0; i < nthreads; i++) { + mutex_enter(&tqs[i].tqa_lock); + tqs[i].tqa_ready = 2; + cv_broadcast(&tqs[i].tqa_cv); + mutex_exit(&tqs[i].tqa_lock); + } + taskq_wait(tq); + + for (int i = 0; i < nthreads; i++) { + kthreads[i] = tqs[i].tqa_thread; + mutex_destroy(&tqs[i].tqa_lock); + cv_destroy(&tqs[i].tqa_cv); + } + kmem_free(tqs, sizeof (*tqs) * nthreads); + + *ktpp = kthreads; + return (tq); +} + int taskq_member(taskq_t *tq, kthread_t *thread) { diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index d18f935b167..79a1a8e5a5a 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -1262,6 +1262,42 @@ taskq_destroy(taskq_t *tq) } EXPORT_SYMBOL(taskq_destroy); +/* + * Create a taskq with a specified number of pool threads. Allocate + * and return an array of nthreads kthread_t pointers, one for each + * thread in the pool. The array is not ordered and must be freed + * by the caller. + */ +taskq_t * +taskq_create_synced(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) +{ + taskq_t *tq; + taskq_thread_t *tqt; + int i = 0; + kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, + KM_SLEEP); + + flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); + + /* taskq_create spawns all the threads before returning */ + tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, + flags | TASKQ_PREPOPULATE); + VERIFY(tq != NULL); + VERIFY(tq->tq_nthreads == nthreads); + + list_for_each_entry(tqt, &tq->tq_thread_list, tqt_thread_list) { + kthreads[i] = tqt->tqt_thread; + i++; + } + + ASSERT3S(i, ==, nthreads); + *ktpp = kthreads; + + return (tq); +} +EXPORT_SYMBOL(taskq_create_synced); + static unsigned int spl_taskq_kick = 0; /* diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index c0c2692c113..0a179fffb16 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4587,6 +4587,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } } +/* + * Syncs out a range of dirty records for indirect or leaf dbufs. May be + * called recursively from dbuf_sync_indirect(). + */ void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { @@ -5005,7 +5009,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) } -/* Issue I/O to commit a dirty buffer to disk. */ +/* + * Populate dr->dr_zio with a zio to commit a dirty buffer to disk. + * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio). + */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 76e65b5506a..f098e1daa44 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1639,28 +1639,90 @@ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) kmem_free(bp, sizeof (*bp)); } +typedef struct sync_objset_arg { + zio_t *soa_zio; + objset_t *soa_os; + dmu_tx_t *soa_tx; + kmutex_t soa_mutex; + int soa_count; + taskq_ent_t soa_tq_ent; +} sync_objset_arg_t; + typedef struct sync_dnodes_arg { - multilist_t *sda_list; - int sda_sublist_idx; - multilist_t *sda_newlist; - dmu_tx_t *sda_tx; + multilist_t *sda_list; + int sda_sublist_idx; + multilist_t *sda_newlist; + sync_objset_arg_t *sda_soa; } sync_dnodes_arg_t; +static void sync_meta_dnode_task(void *arg); + static void sync_dnodes_task(void *arg) { sync_dnodes_arg_t *sda = arg; + sync_objset_arg_t *soa = sda->sda_soa; + objset_t *os = soa->soa_os; multilist_sublist_t *ms = multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); - dmu_objset_sync_dnodes(ms, sda->sda_tx); + dmu_objset_sync_dnodes(ms, soa->soa_tx); multilist_sublist_unlock(ms); kmem_free(sda, sizeof (*sda)); + + mutex_enter(&soa->soa_mutex); + ASSERT(soa->soa_count != 0); + if (--soa->soa_count != 0) { + mutex_exit(&soa->soa_mutex); + return; + } + mutex_exit(&soa->soa_mutex); + + taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq, + sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent); } +/* + * Issue the zio_nowait() for all dirty record zios on the meta dnode, + * then trigger the callback for the zil_sync. This runs once for each + * objset, only after any/all sublists in the objset have been synced. + */ +static void +sync_meta_dnode_task(void *arg) +{ + sync_objset_arg_t *soa = arg; + objset_t *os = soa->soa_os; + dmu_tx_t *tx = soa->soa_tx; + int txgoff = tx->tx_txg & TXG_MASK; + dbuf_dirty_record_t *dr; + + ASSERT0(soa->soa_count); + + list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; + while ((dr = list_remove_head(list)) != NULL) { + ASSERT0(dr->dr_dbuf->db_level); + zio_nowait(dr->dr_zio); + } + + /* Enable dnode backfill if enough objects have been freed. */ + if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { + os->os_rescan_dnodes = B_TRUE; + os->os_freed_dnodes = 0; + } + + /* + * Free intent log blocks up to this tx. + */ + zil_sync(os->os_zil, tx); + os->os_phys->os_zil_header = os->os_zil_header; + zio_nowait(soa->soa_zio); + + mutex_destroy(&soa->soa_mutex); + kmem_free(soa, sizeof (*soa)); +} /* called from dsl */ void @@ -1670,8 +1732,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; - list_t *list; - dbuf_dirty_record_t *dr; int num_sublists; multilist_t *ml; blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); @@ -1758,39 +1818,49 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) offsetof(dnode_t, dn_dirty_link[txgoff])); } + /* + * zio_nowait(zio) is done after any/all sublist and meta dnode + * zios have been nowaited, and the zil_sync() has been performed. + * The soa is freed at the end of sync_meta_dnode_task. + */ + sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP); + soa->soa_zio = zio; + soa->soa_os = os; + soa->soa_tx = tx; + taskq_init_ent(&soa->soa_tq_ent); + mutex_init(&soa->soa_mutex, NULL, MUTEX_DEFAULT, NULL); + ml = &os->os_dirty_dnodes[txgoff]; - num_sublists = multilist_get_num_sublists(ml); + soa->soa_count = num_sublists = multilist_get_num_sublists(ml); + for (int i = 0; i < num_sublists; i++) { if (multilist_sublist_is_empty_idx(ml, i)) - continue; - sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); - sda->sda_list = ml; - sda->sda_sublist_idx = i; - sda->sda_tx = tx; - (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - sync_dnodes_task, sda, 0); - /* callback frees sda */ - } - taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); - - list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; - while ((dr = list_remove_head(list)) != NULL) { - ASSERT0(dr->dr_dbuf->db_level); - zio_nowait(dr->dr_zio); + soa->soa_count--; } - /* Enable dnode backfill if enough objects have been freed. */ - if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { - os->os_rescan_dnodes = B_TRUE; - os->os_freed_dnodes = 0; + if (soa->soa_count == 0) { + taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq, + sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent); + } else { + /* + * Sync sublists in parallel. The last to finish + * (i.e., when soa->soa_count reaches zero) must + * dispatch sync_meta_dnode_task. + */ + for (int i = 0; i < num_sublists; i++) { + if (multilist_sublist_is_empty_idx(ml, i)) + continue; + sync_dnodes_arg_t *sda = + kmem_alloc(sizeof (*sda), KM_SLEEP); + sda->sda_list = ml; + sda->sda_sublist_idx = i; + sda->sda_soa = soa; + (void) taskq_dispatch( + dmu_objset_pool(os)->dp_sync_taskq, + sync_dnodes_task, sda, 0); + /* sync_dnodes_task frees sda */ + } } - - /* - * Free intent log blocks up to this tx. - */ - zil_sync(os->os_zil, tx); - os->os_phys->os_zil_header = os->os_zil_header; - zio_nowait(zio); } boolean_t diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 8e39af83bb0..8cffbdb9d20 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -627,6 +627,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) /* * Write out the dnode's dirty buffers. + * Does not wait for zio completions. */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx) diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index d6db6172922..62a1649d378 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -2069,8 +2069,9 @@ dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, return (error); } +/* Nonblocking dataset sync. Assumes dataset:objset is always 1:1 */ void -dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +dsl_dataset_sync(dsl_dataset_t *ds, zio_t *rio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_objset != NULL); @@ -2098,7 +2099,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; } - dmu_objset_sync(ds->ds_objset, zio, tx); + dmu_objset_sync(ds->ds_objset, rio, tx); } /* diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 17b97124828..370c6a010dc 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -140,11 +140,6 @@ uint_t zfs_delay_min_dirty_percent = 60; */ uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; -/* - * This determines the number of threads used by the dp_sync_taskq. - */ -static int zfs_sync_taskq_batch_pct = 75; - /* * These tunables determine the behavior of how zil_itxg_clean() is * called via zil_clean() in the context of spa_sync(). When an itxg @@ -214,9 +209,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) txg_list_create(&dp->dp_early_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); - dp->dp_sync_taskq = taskq_create("dp_sync_taskq", - zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, - TASKQ_THREADS_CPU_PCT); + dp->dp_sync_taskq = spa_sync_tq_create(spa, "dp_sync_taskq"); dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", zfs_zil_clean_taskq_nthr_pct, minclsyspri, @@ -409,7 +402,7 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_dirty_dirs); taskq_destroy(dp->dp_zil_clean_taskq); - taskq_destroy(dp->dp_sync_taskq); + spa_sync_tq_destroy(dp->dp_spa); /* * We can't set retry to TRUE since we're explicitly specifying @@ -674,7 +667,7 @@ dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { - zio_t *zio; + zio_t *rio; /* root zio for all dirty dataset syncs */ dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; @@ -704,9 +697,10 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) } /* - * Write out all dirty blocks of dirty datasets. + * Write out all dirty blocks of dirty datasets. Note, this could + * create a very large (+10k) zio tree. */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { /* * We must not sync any non-MOS datasets twice, because @@ -715,9 +709,9 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) */ ASSERT(!list_link_active(&ds->ds_synced_link)); list_insert_tail(&synced_datasets, ds); - dsl_dataset_sync(ds, zio, tx); + dsl_dataset_sync(ds, rio, tx); } - VERIFY0(zio_wait(zio)); + VERIFY0(zio_wait(rio)); /* * Update the long range free counter after @@ -748,13 +742,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * user accounting information (and we won't get confused * about which blocks are part of the snapshot). */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { objset_t *os = ds->ds_objset; ASSERT(list_link_active(&ds->ds_synced_link)); dmu_buf_rele(ds->ds_dbuf, ds); - dsl_dataset_sync(ds, zio, tx); + dsl_dataset_sync(ds, rio, tx); /* * Release any key mappings created by calls to @@ -767,7 +761,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } } - VERIFY0(zio_wait(zio)); + VERIFY0(zio_wait(rio)); /* * Now that the datasets have been completely synced, we can @@ -1481,9 +1475,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW, "How quickly delay approaches infinity"); -ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, - "Max percent of CPUs that are used to sync dirty data"); - ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW, "Max percent of CPUs that are used per dp_sync_taskq"); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index aa97144f16e..68f367c1c74 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -99,6 +99,7 @@ #include "zfs_prop.h" #include "zfs_comutil.h" +#include /* * spa_thread() existed on Illumos as a parent thread for the various worker @@ -128,16 +129,16 @@ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ - ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ + ZTI_MODE_SYNC, /* sync thread assigned */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } -#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } +#define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) @@ -158,14 +159,14 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE - * macros. Other operations process a large amount of data; the ZTI_BATCH + * macros. Other operations process a large amount of data; the ZTI_SCALE * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the - * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, - * but with number of taskqs also scaling with number of CPUs. + * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs + * that scales with the number of CPUs. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that @@ -175,7 +176,7 @@ static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ - { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ + { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ @@ -206,6 +207,8 @@ static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ #endif +static uint_t zio_taskq_wr_iss_ncpus = 0; + /* * Report any spa_load_verify errors found, but do not fail spa_load. * This is used by zdb to analyze non-idle pools. @@ -1054,21 +1057,34 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t cpus, flags = TASKQ_DYNAMIC; -#ifdef HAVE_SYSDC - boolean_t batch = B_FALSE; -#endif switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >, 0); break; - case ZTI_MODE_BATCH: -#ifdef HAVE_SYSDC - batch = B_TRUE; -#endif + case ZTI_MODE_SYNC: + + /* + * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus', + * not to exceed the number of spa allocators. + */ + if (zio_taskq_wr_iss_ncpus == 0) { + count = MAX(boot_ncpus / spa->spa_alloc_count, 1); + } else { + count = MAX(1, + boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus)); + } + count = MAX(count, (zio_taskq_batch_pct + 99) / 100); + count = MIN(count, spa->spa_alloc_count); + + /* + * zio_taskq_batch_pct is unbounded and may exceed 100%, but no + * single taskq may have more threads than 100% of online cpus. + */ + value = (zio_taskq_batch_pct + count / 2) / count; + value = MIN(value, 100); flags |= TASKQ_THREADS_CPU_PCT; - value = MIN(zio_taskq_batch_pct, 100); break; case ZTI_MODE_SCALE: @@ -1115,7 +1131,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " - "spa_activate()", + "spa_taskqs_init()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } @@ -1137,9 +1153,6 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) #ifdef HAVE_SYSDC if (zio_taskq_sysdc && spa->spa_proc != &p0) { - if (batch) - flags |= TASKQ_DC_BATCH; - (void) zio_taskq_basedc; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); @@ -1200,12 +1213,11 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention - * on the taskq itself. In that case we choose which taskq at random by using - * the low bits of gethrtime(). + * on the taskq itself. */ -void -spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) +static taskq_t * +spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + zio_t *zio) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; @@ -1213,12 +1225,27 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); + if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && + (zio != NULL) && (zio->io_wr_iss_tq != NULL)) { + /* dispatch to assigned write issue taskq */ + tq = zio->io_wr_iss_tq; + return (tq); + } + if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } + return (tq); +} +void +spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, + zio_t *zio) +{ + taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio); taskq_dispatch_ent(tq, func, arg, flags, ent); } @@ -1229,20 +1256,8 @@ void spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags) { - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - taskq_t *tq; - taskqid_t id; - - ASSERT3P(tqs->stqs_taskq, !=, NULL); - ASSERT3U(tqs->stqs_count, !=, 0); - - if (tqs->stqs_count == 1) { - tq = tqs->stqs_taskq[0]; - } else { - tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; - } - - id = taskq_dispatch(tq, func, arg, flags); + taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL); + taskqid_t id = taskq_dispatch(tq, func, arg, flags); if (id) taskq_wait_id(tq, id); } @@ -9649,6 +9664,104 @@ spa_sync_allpools(void) mutex_exit(&spa_namespace_lock); } +taskq_t * +spa_sync_tq_create(spa_t *spa, const char *name) +{ + kthread_t **kthreads; + + ASSERT(spa->spa_sync_tq == NULL); + ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); + + /* + * - do not allow more allocators than cpus. + * - there may be more cpus than allocators. + * - do not allow more sync taskq threads than allocators or cpus. + */ + int nthreads = spa->spa_alloc_count; + spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * + nthreads, KM_SLEEP); + + spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri, + nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads); + VERIFY(spa->spa_sync_tq != NULL); + VERIFY(kthreads != NULL); + + spa_taskqs_t *tqs = + &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE]; + + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) { + ti->sti_thread = kthreads[i]; + if (w == tqs->stqs_count) { + w = 0; + } + ti->sti_wr_iss_tq = tqs->stqs_taskq[w]; + } + + kmem_free(kthreads, sizeof (*kthreads) * nthreads); + return (spa->spa_sync_tq); +} + +void +spa_sync_tq_destroy(spa_t *spa) +{ + ASSERT(spa->spa_sync_tq != NULL); + + taskq_wait(spa->spa_sync_tq); + taskq_destroy(spa->spa_sync_tq); + kmem_free(spa->spa_syncthreads, + sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); + spa->spa_sync_tq = NULL; +} + +void +spa_select_allocator(zio_t *zio) +{ + zbookmark_phys_t *bm = &zio->io_bookmark; + spa_t *spa = zio->io_spa; + + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + + /* + * A gang block (for example) may have inherited its parent's + * allocator, in which case there is nothing further to do here. + */ + if (ZIO_HAS_ALLOCATOR(zio)) + return; + + ASSERT(spa != NULL); + ASSERT(bm != NULL); + + /* + * First try to use an allocator assigned to the syncthread, and set + * the corresponding write issue taskq for the allocator. + * Note, we must have an open pool to do this. + */ + if (spa->spa_sync_tq != NULL) { + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { + if (ti->sti_thread == curthread) { + zio->io_allocator = i; + zio->io_wr_iss_tq = ti->sti_wr_iss_tq; + return; + } + } + } + + /* + * We want to try to use as many allocators as possible to help improve + * performance, but we also want logically adjacent IOs to be physically + * adjacent to improve sequential read performance. We chunk each object + * into 2^20 block regions, and then hash based on the objset, object, + * level, and region to accomplish both of these goals. + */ + uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, + bm->zb_blkid >> 20); + + zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; + zio->io_wr_iss_tq = NULL; +} + /* * ========================================================================== * Miscellaneous routines @@ -10242,3 +10355,6 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); /* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW, + "Number of CPUs to run write issue taskqs"); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index c7472f972cc..3990af98c73 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -388,7 +388,11 @@ uint_t spa_asize_inflation = 24; uint_t spa_slop_shift = 5; static const uint64_t spa_min_slop = 128ULL * 1024 * 1024; static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; -static const int spa_allocators = 4; + +/* + * Number of allocators to use, per spa instance + */ +static int spa_num_allocators = 4; /* * Spa active allocator. @@ -730,7 +734,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) if (altroot) spa->spa_root = spa_strdup(altroot); - spa->spa_alloc_count = spa_allocators; + /* Do not allow more allocators than CPUs. */ + spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus); + spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * sizeof (spa_alloc_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { @@ -739,6 +745,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } + avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, @@ -3009,3 +3016,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW, ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, param_get_uint, ZMOD_RW, "Reserved free space in pool"); + +ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, + "Number of allocators per spa, capped by ncpus"); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 4eb276352a2..2f5b423ee72 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -899,6 +899,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; + zio->io_allocator = ZIO_ALLOCATOR_NONE; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) || (pipeline & ZIO_STAGE_READY) == 0; @@ -2007,7 +2008,7 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, - &zio->io_tqent); + &zio->io_tqent, zio); } static boolean_t @@ -2032,8 +2033,8 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q) static zio_t * zio_issue_async(zio_t *zio) { + ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio)); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (NULL); } @@ -2347,6 +2348,9 @@ zio_wait(zio_t *zio) ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); + if (zio->io_type == ZIO_TYPE_WRITE) { + spa_select_allocator(zio); + } __zio_execute(zio); mutex_enter(&zio->io_lock); @@ -2399,6 +2403,9 @@ zio_nowait(zio_t *zio) ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); + if (zio->io_type == ZIO_TYPE_WRITE) { + spa_select_allocator(zio); + } __zio_execute(zio); } @@ -2863,6 +2870,13 @@ zio_gang_issue(zio_t *zio) return (zio); } +static void +zio_gang_inherit_allocator(zio_t *pio, zio_t *cio) +{ + cio->io_allocator = pio->io_allocator; + cio->io_wr_iss_tq = pio->io_wr_iss_tq; +} + static void zio_write_gang_member_ready(zio_t *zio) { @@ -2934,6 +2948,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) gbh_copies = MIN(2, spa_max_replication(spa)); } + ASSERT(ZIO_HAS_ALLOCATOR(pio)); int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); @@ -2997,6 +3012,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio_gang_inherit_allocator(pio, zio); + /* * Create and nowait the gang children. */ @@ -3027,6 +3044,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio_gang_inherit_allocator(zio, cio); + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); @@ -3539,6 +3558,7 @@ zio_io_to_allocate(spa_t *spa, int allocator) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * Try to place a reservation for this zio. If we're unable to @@ -3575,21 +3595,12 @@ zio_dva_throttle(zio_t *zio) } ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); - zbookmark_phys_t *bm = &zio->io_bookmark; - /* - * We want to try to use as many allocators as possible to help improve - * performance, but we also want logically adjacent IOs to be physically - * adjacent to improve sequential read performance. We chunk each object - * into 2^20 block regions, and then hash based on the objset, object, - * level, and region to accomplish both of these goals. - */ - int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object, - bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; - zio->io_allocator = allocator; + int allocator = zio->io_allocator; zio->io_metaslab_class = mc; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); @@ -3663,6 +3674,7 @@ zio_dva_allocate(zio_t *zio) * sync write performance. If a log allocation fails, we will fall * back to spa_sync() which is abysmal for performance. */ + ASSERT(ZIO_HAS_ALLOCATOR(zio)); error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); @@ -4515,6 +4527,7 @@ zio_ready(zio_t *zio) ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * We were unable to allocate anything, unreserve and @@ -4601,6 +4614,7 @@ zio_dva_throttle_done(zio_t *zio) } ASSERT(IO_IS_ALLOCATING(pio)); + ASSERT(ZIO_HAS_ALLOCATOR(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); @@ -4663,6 +4677,7 @@ zio_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); @@ -4928,7 +4943,7 @@ zio_done(zio_t *zio) ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, - zio_reexecute, zio, 0, &zio->io_tqent); + zio_reexecute, zio, 0, &zio->io_tqent, NULL); } return (NULL); } From 2a154b84847d9895b76bfb41a6e30e0a627ee62c Mon Sep 17 00:00:00 2001 From: MigeljanImeri <78048439+MigeljanImeri@users.noreply.github.com> Date: Tue, 7 Nov 2023 10:06:14 -0700 Subject: [PATCH 03/13] Fix accounting error for pending sync IO ops in zpool iostat Currently vdev_queue_class_length is responsible for checking how long the queue length is, however, it doesn't check the length when a list is used, rather it just returns whether it is empty or not. To fix this I added a counter variable to vdev_queue_class to keep track of the sync IO ops, and changed vdev_queue_class_length to reference this variable instead. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: MigeljanImeri Closes #15478 --- include/sys/vdev_impl.h | 5 ++++- module/zfs/vdev_queue.c | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index ad9dc3aefd8..3f2312c2343 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -131,7 +131,10 @@ typedef const struct vdev_ops { * Virtual device properties */ typedef union vdev_queue_class { - list_t vqc_list; + struct { + ulong_t vqc_list_numnodes; + list_t vqc_list; + }; avl_tree_t vqc_tree; } vdev_queue_class_t; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 08d918467d0..092b3f375be 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -273,8 +273,10 @@ vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) { zio_priority_t p = zio->io_priority; vq->vq_cqueued |= 1U << p; - if (vdev_queue_class_fifo(p)) + if (vdev_queue_class_fifo(p)) { list_insert_tail(&vq->vq_class[p].vqc_list, zio); + vq->vq_class[p].vqc_list_numnodes++; + } else avl_add(&vq->vq_class[p].vqc_tree, zio); } @@ -288,6 +290,7 @@ vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) list_t *list = &vq->vq_class[p].vqc_list; list_remove(list, zio); empty = list_is_empty(list); + vq->vq_class[p].vqc_list_numnodes--; } else { avl_tree_t *tree = &vq->vq_class[p].vqc_tree; avl_remove(tree, zio); @@ -1069,7 +1072,7 @@ vdev_queue_class_length(vdev_t *vd, zio_priority_t p) { vdev_queue_t *vq = &vd->vdev_queue; if (vdev_queue_class_fifo(p)) - return (list_is_empty(&vq->vq_class[p].vqc_list) == 0); + return (vq->vq_class[p].vqc_list_numnodes); else return (avl_numnodes(&vq->vq_class[p].vqc_tree)); } From 358ce2cf28a99095915deb883cbacd2d2e1f7b63 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 7 Nov 2023 09:09:24 -0800 Subject: [PATCH 04/13] zed: misc vdev_enc_sysfs_path fixes There have been rare cases where the VDEV_ENC_SYSFS_PATH value that zed gets passed is stale. To mitigate this, dynamically check the sysfs path at the time of zed event processing, and use the dynamic value if possible. Note that there will be other times when we can not dynamically detect the sysfs path (like if a disk disappears) and have to rely on the old value for things like turning on the fault LED. That is to say, we can't just blindly use the dynamic path in every case. Also: - Add enclosure sysfs entry when running 'zpool add' - Fix 'slot' and 'enc' zpool.d scripts for nvme Reviewed-by: Don Brady Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #15462 --- cmd/zed/agents/zfs_mod.c | 4 +++ cmd/zed/zed_event.c | 31 +++++++++++++++++++++++ cmd/zpool/zpool.d/ses | 12 +++++++-- cmd/zpool/zpool_vdev.c | 4 +++ include/libzutil.h | 2 ++ lib/libzfs/libzfs.abi | 7 +++++ lib/libzutil/os/freebsd/zutil_import_os.c | 9 +++++++ lib/libzutil/os/linux/zutil_import_os.c | 17 ++++++++----- 8 files changed, 78 insertions(+), 8 deletions(-) diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 9636c99fc85..69163b80bd5 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -233,8 +233,12 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) } (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); + + update_vdev_config_dev_sysfs_path(vdev, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &enc_sysfs_path); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted); diff --git a/cmd/zed/zed_event.c b/cmd/zed/zed_event.c index c60d5a4bc22..7e586769223 100644 --- a/cmd/zed/zed_event.c +++ b/cmd/zed/zed_event.c @@ -35,6 +35,7 @@ #include "zed_strings.h" #include "agents/zfs_agents.h" +#include #define MAXBUF 4096 @@ -922,6 +923,25 @@ _zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[]) } } + +static void +_zed_event_update_enc_sysfs_path(nvlist_t *nvl) +{ + const char *vdev_path; + + if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, + &vdev_path) != 0) { + return; /* some other kind of event, ignore it */ + } + + if (vdev_path == NULL) { + return; + } + + update_vdev_config_dev_sysfs_path(nvl, vdev_path, + FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH); +} + /* * Service the next zevent, blocking until one is available. */ @@ -969,6 +989,17 @@ zed_event_service(struct zed_conf *zcp) zed_log_msg(LOG_WARNING, "Failed to lookup zevent class (eid=%llu)", eid); } else { + /* + * Special case: If we can dynamically detect an enclosure sysfs + * path, then use that value rather than the one stored in the + * vd->vdev_enc_sysfs_path. There have been rare cases where + * vd->vdev_enc_sysfs_path becomes outdated. However, there + * will be other times when we can not dynamically detect the + * sysfs path (like if a disk disappears) and have to rely on + * the old value for things like turning on the fault LED. + */ + _zed_event_update_enc_sysfs_path(nvl); + /* let internal modules see this event first */ zfs_agent_post_event(class, NULL, nvl); diff --git a/cmd/zpool/zpool.d/ses b/cmd/zpool/zpool.d/ses index 638145c95d4..19ef92ad67b 100755 --- a/cmd/zpool/zpool.d/ses +++ b/cmd/zpool/zpool.d/ses @@ -33,10 +33,18 @@ for i in $scripts ; do val="" case $i in enc) - val=$(ls "$VDEV_ENC_SYSFS_PATH/../../" 2>/dev/null) + if echo "$VDEV_ENC_SYSFS_PATH" | grep -q '/sys/bus/pci/slots' ; then + val="$VDEV_ENC_SYSFS_PATH" + else + val="$(ls """$VDEV_ENC_SYSFS_PATH/../../""" 2>/dev/null)" + fi ;; slot) - val=$(cat "$VDEV_ENC_SYSFS_PATH/slot" 2>/dev/null) + if echo "$VDEV_ENC_SYSFS_PATH" | grep -q '/sys/bus/pci/slots' ; then + val="$(basename """$VDEV_ENC_SYSFS_PATH""")" + else + val="$(cat """$VDEV_ENC_SYSFS_PATH/slot""" 2>/dev/null)" + fi ;; encdev) val=$(ls "$VDEV_ENC_SYSFS_PATH/../device/scsi_generic" 2>/dev/null) diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 3d0fc089c32..fbd4b81dfac 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -372,6 +372,10 @@ make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); + /* Lookup and add the enclosure sysfs path (if exists) */ + update_vdev_config_dev_sysfs_path(vdev, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); diff --git a/include/libzutil.h b/include/libzutil.h index 053b1ed4b52..9842c225b6f 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -208,6 +208,8 @@ int for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func, int for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, void *data); void update_vdevs_config_dev_sysfs_path(nvlist_t *config); +_LIBZUTIL_H void update_vdev_config_dev_sysfs_path(nvlist_t *nv, + const char *path, const char *key); #ifdef __cplusplus } #endif diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index f8b0d395161..bcdcff97666 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -260,6 +260,7 @@ + @@ -8332,6 +8333,12 @@ + + + + + + diff --git a/lib/libzutil/os/freebsd/zutil_import_os.c b/lib/libzutil/os/freebsd/zutil_import_os.c index 19ba58e79a0..a134c173bc8 100644 --- a/lib/libzutil/os/freebsd/zutil_import_os.c +++ b/lib/libzutil/os/freebsd/zutil_import_os.c @@ -249,6 +249,15 @@ zfs_dev_flush(int fd) return (0); } +void +update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path, + const char *key) +{ + (void) nv; + (void) path; + (void) key; +} + void update_vdevs_config_dev_sysfs_path(nvlist_t *config) { diff --git a/lib/libzutil/os/linux/zutil_import_os.c b/lib/libzutil/os/linux/zutil_import_os.c index 44ed697dd49..fbfae4f7e68 100644 --- a/lib/libzutil/os/linux/zutil_import_os.c +++ b/lib/libzutil/os/linux/zutil_import_os.c @@ -766,9 +766,12 @@ no_dev: * Rescan the enclosure sysfs path for turning on enclosure LEDs and store it * in the nvlist * (if applicable). Like: * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' + * + * key: The nvlist_t name (like ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH) */ -static void -update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path) +void +update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path, + const char *key) { char *upath, *spath; @@ -777,9 +780,9 @@ update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path) spath = zfs_get_enclosure_sysfs_path(upath); if (spath) { - nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, spath); + (void) nvlist_add_string(nv, key, spath); } else { - nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + (void) nvlist_remove_all(nv, key); } free(upath); @@ -799,7 +802,8 @@ sysfs_path_pool_vdev_iter_f(void *hdl_data, nvlist_t *nv, void *data) return (1); /* Rescan our enclosure sysfs path for this vdev */ - update_vdev_config_dev_sysfs_path(nv, path); + update_vdev_config_dev_sysfs_path(nv, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); return (0); } @@ -888,7 +892,8 @@ update_vdev_config_dev_strs(nvlist_t *nv) (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vds.vds_devphys); } - update_vdev_config_dev_sysfs_path(nv, path); + update_vdev_config_dev_sysfs_path(nv, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); } else { /* Clear out any stale entries. */ (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); From f4cd1bac723633a22adafc87c2a9f874fbcbc9d6 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 7 Nov 2023 13:24:15 -0500 Subject: [PATCH 05/13] Make abd_raidz_gen_iterate() pass an initialized pointer to the callback Otherwise callbacks may trigger KMSAN violations in the dlen == 0 case. For example, raidz_syn_pq_abd() will compare an uninitialized pointer with itself before returning. This seems harmless, but let's maintain good hygiene and avoid passing uninitialized variables, if only to placate KMSAN. Reviewed-by: Alexander Motin Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Signed-off-by: Mark Johnston Closes #15491 --- module/zfs/abd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/module/zfs/abd.c b/module/zfs/abd.c index bcc6ddd5e81..0a2411a2d57 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -1025,7 +1025,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off, size_t len, dlen; struct abd_iter caiters[3]; struct abd_iter daiter; - void *caddrs[3]; + void *caddrs[3], *daddr; unsigned long flags __maybe_unused = 0; abd_t *c_cabds[3]; abd_t *c_dabd = NULL; @@ -1057,10 +1057,13 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off, if (dsize > 0) { IMPLY(abd_is_gang(dabd), c_dabd != NULL); abd_iter_map(&daiter); + daddr = daiter.iter_mapaddr; len = MIN(daiter.iter_mapsize, len); dlen = len; - } else + } else { + daddr = NULL; dlen = 0; + } /* must be progressive */ ASSERT3U(len, >, 0); @@ -1070,7 +1073,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off, */ ASSERT3U(((uint64_t)len & 511ULL), ==, 0); - func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); + func_raidz_gen(caddrs, daddr, len, dlen); for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&caiters[i]); From 9ce567c6fffd39e4bdf0cfe93f04a008de876b48 Mon Sep 17 00:00:00 2001 From: AllKind Date: Tue, 7 Nov 2023 20:27:29 +0100 Subject: [PATCH 06/13] Fix dkms installation of deb packages created with Alien. Alien does not honour the %posttrans hook. So move the dkms uninstall/install scripts to the %pre/%post hooks in case of package install/upgrade. In case of package removal, handle that in %preun. Add removal of all old dkms modules. Add checking for broken 'dkms status'. Handle that as good as possible and warn the user about it. Also add more verbose messages about what we are doing. Reviewed-by: Brian Behlendorf Signed-off-by: Mart Frauenlob Closes #15415 --- rpm/generic/zfs-dkms.spec.in | 90 ++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in index 23c3ed6ff40..d56967d7a8b 100644 --- a/rpm/generic/zfs-dkms.spec.in +++ b/rpm/generic/zfs-dkms.spec.in @@ -24,6 +24,7 @@ BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) BuildArch: noarch Requires: dkms >= 2.2.0.3 +Requires(pre): dkms >= 2.2.0.3 Requires(post): dkms >= 2.2.0.3 Requires(preun): dkms >= 2.2.0.3 Requires: gcc, make, perl, diffutils @@ -68,9 +69,92 @@ fi %defattr(-,root,root) /usr/src/%{module}-%{version} +%pre +echo "Running pre installation script: $0. Parameters: $*" +# We don't want any other versions lingering around in dkms. +# Tests with 'dnf' showed that in case of reinstall, or upgrade +# the preun scriptlet removed the version we are trying to install. +# Because of this, find all zfs dkms sources in /var/lib/dkms and +# remove them, if we find a matching version in dkms. + +dkms_root=/var/lib/dkms +if [ -d ${dkms_root}/%{module} ]; then + cd ${dkms_root}/%{module} + for x in [[:digit:]]*; do + [ -d "$x" ] || continue + otherver="$x" + opath="${dkms_root}/%{module}/${otherver}" + if [ "$otherver" != %{version} ]; then + # This is a workaround for a broken 'dkms status', we caused in a previous version. + # One day it might be not needed anymore, but it does not hurt to keep it. + if dkms status -m %{module} -v "$otherver" 2>&1 | grep "${opath}/source/dkms.conf does not exist" + then + echo "ERROR: dkms status is broken!" >&2 + if [ -L "${opath}/source" -a ! -d "${opath}/source" ] + then + echo "Trying to fix it by removing the symlink: ${opath}/source" >&2 + echo "You should manually remove ${opath}" >&2 + rm -f "${opath}/source" || echo "Removal failed!" >&2 + fi + fi + if [ `dkms status -m %{module} -v "$otherver" | grep -c %{module}` -gt 0 ]; then + echo "Removing old %{module} dkms modules version $otherver from all kernels." + dkms remove -m %{module} -v "$otherver" --all ||: + fi + fi + done +fi + +# Uninstall this version of zfs dkms modules before installation of the package. +if [ `dkms status -m %{module} -v %{version} | grep -c %{module}` -gt 0 ]; then + echo "Removing %{module} dkms modules version %{version} from all kernels." + dkms remove -m %{module} -v %{version} --all ||: +fi + +%post +echo "Running post installation script: $0. Parameters: $*" +# Add the module to dkms, as reccommended in the dkms man page. +# This is generally rpm specfic. +# But this also may help, if we have a broken 'dkms status'. +# Because, if the sources are available and only the symlink pointing +# to them is missing, this will resolve the situation +echo "Adding %{module} dkms modules version %{version} to dkms." +dkms add -m %{module} -v %{version} %{!?not_rpm:--rpm_safe_upgrade} ||: + +# After installing the package, dkms install this zfs version for the current kernel. +# Force the overwriting of old modules to avoid diff warnings in dkms status. +# Or in case of a downgrade to overwrite newer versions. +# Or if some other backed up versions have been restored before. +echo "Installing %{module} dkms modules version %{version} for the current kernel." +dkms install --force -m %{module} -v %{version} ||: + %preun -dkms remove -m %{module} -v %{version} --all +dkms_root="/var/lib/dkms/%{module}/%{version}" +echo "Running pre uninstall script: $0. Parameters: $*" +# In case of upgrade we do nothing. See above comment in pre hook. +if [ "$1" = "1" -o "$1" = "upgrade" ] ; then + echo "This is an upgrade. Skipping pre uninstall action." + exit 0 +fi -%posttrans -/usr/lib/dkms/common.postinst %{module} %{version} +# Check if we uninstall the package. In that case remove the dkms modules. +# '0' is the value for the first parameter for rpm packages. +# 'remove' or 'purge' are the possible names for deb packages. +if [ "$1" = "0" -o "$1" = "remove" -o "$1" = "purge" ] ; then + if [ `dkms status -m %{module} -v %{version} | grep -c %{module}` -gt 0 ]; then + echo "Removing %{module} dkms modules version %{version} from all kernels." + dkms remove -m %{module} -v %{version} --all %{!?not_rpm:--rpm_safe_upgrade} && exit 0 + fi + # If removing the modules failed, it might be because of the broken 'dkms status'. + if dkms status -m %{module} -v %{version} 2>&1 | grep "${dkms_root}/source/dkms.conf does not exist" + then + echo "ERROR: dkms status is broken!" >&2 + echo "You should manually remove ${dkms_root}" >&2 + echo "WARNING: installed modules in /lib/modules/`uname -r`/extra could not be removed automatically!" >&2 + fi +else + echo "Script parameter $1 did not match any removal condition." +fi + +exit 0 From e36ff84c338d2f7b15aef2538f6a9507115bbf4a Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Tue, 7 Nov 2023 12:34:50 -0700 Subject: [PATCH 07/13] Update the kstat dataset_name when renaming a zvol Add a dataset_kstats_rename function, and call it when renaming a zvol on FreeBSD and Linux. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Alan Somers Sponsored-by: Axcient Closes #15482 Closes #15486 --- include/sys/dataset_kstats.h | 1 + module/os/freebsd/zfs/zvol_os.c | 1 + module/os/linux/zfs/zvol_os.c | 2 ++ module/zfs/dataset_kstats.c | 12 ++++++++++++ 4 files changed, 16 insertions(+) diff --git a/include/sys/dataset_kstats.h b/include/sys/dataset_kstats.h index 40cf5258a2e..c81a07f0c11 100644 --- a/include/sys/dataset_kstats.h +++ b/include/sys/dataset_kstats.h @@ -71,6 +71,7 @@ typedef struct dataset_kstats { int dataset_kstats_create(dataset_kstats_t *, objset_t *); void dataset_kstats_destroy(dataset_kstats_t *); +void dataset_kstats_rename(dataset_kstats_t *dk, const char *); void dataset_kstats_update_write_kstats(dataset_kstats_t *, int64_t); void dataset_kstats_update_read_kstats(dataset_kstats_t *, int64_t); diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 0830e1c2654..6a7c2d2811b 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -1319,6 +1319,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) } } strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); + dataset_kstats_rename(&zv->zv_kstat, newname); } /* diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index fd0fd2c36dc..384f5785a49 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1528,6 +1528,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) */ set_disk_ro(zv->zv_zso->zvo_disk, !readonly); set_disk_ro(zv->zv_zso->zvo_disk, readonly); + + dataset_kstats_rename(&zv->zv_kstat, newname); } void diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c index 767a461e002..2ac058fd2c9 100644 --- a/module/zfs/dataset_kstats.c +++ b/module/zfs/dataset_kstats.c @@ -198,6 +198,18 @@ dataset_kstats_destroy(dataset_kstats_t *dk) zil_sums_fini(&dk->dk_zil_sums); } +void +dataset_kstats_rename(dataset_kstats_t *dk, const char *name) +{ + dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; + char *ds_name; + + ds_name = KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name); + ASSERT3S(ds_name, !=, NULL); + (void) strlcpy(ds_name, name, + KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name)); +} + void dataset_kstats_update_write_kstats(dataset_kstats_t *dk, int64_t nwritten) From 58398cbd035116ba1d5756bae338664573e60d21 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 7 Nov 2023 14:35:40 -0500 Subject: [PATCH 08/13] FreeBSD: Optimize large kstat outputs - Use sbuf_new_for_sysctl() to reduce double-buffering on sysctl output. - Use much faster sbuf_cat() instead of sbuf_printf("%s"). Together it reduces `sysctl kstat.zfs.misc.dbufs` time from minutes to seconds, making dbufstat almost usable. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15495 --- module/os/freebsd/spl/spl_kstat.c | 38 +++++++++++++------------------ 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/module/os/freebsd/spl/spl_kstat.c b/module/os/freebsd/spl/spl_kstat.c index 9f5f92e194e..43cd4da02e3 100644 --- a/module/os/freebsd/spl/spl_kstat.c +++ b/module/os/freebsd/spl/spl_kstat.c @@ -187,19 +187,18 @@ kstat_sysctl_dataset_string(SYSCTL_HANDLER_ARGS) static int kstat_sysctl_io(SYSCTL_HANDLER_ARGS) { - struct sbuf *sb; + struct sbuf sb; kstat_t *ksp = arg1; kstat_io_t *kip = ksp->ks_data; int rc; - sb = sbuf_new_auto(); - if (sb == NULL) - return (ENOMEM); + sbuf_new_for_sysctl(&sb, NULL, 0, req); + /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); /* though wlentime & friends are signed, they will never be negative */ - sbuf_printf(sb, + sbuf_printf(&sb, "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", kip->nread, kip->nwritten, @@ -207,25 +206,21 @@ kstat_sysctl_io(SYSCTL_HANDLER_ARGS) kip->wtime, kip->wlentime, kip->wlastupdate, kip->rtime, kip->rlentime, kip->rlastupdate, kip->wcnt, kip->rcnt); - rc = sbuf_finish(sb); - if (rc == 0) - rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); - sbuf_delete(sb); + rc = sbuf_finish(&sb); + sbuf_delete(&sb); return (rc); } static int kstat_sysctl_raw(SYSCTL_HANDLER_ARGS) { - struct sbuf *sb; + struct sbuf sb; void *data; kstat_t *ksp = arg1; void *(*addr_op)(kstat_t *ksp, loff_t index); int n, has_header, rc = 0; - sb = sbuf_new_auto(); - if (sb == NULL) - return (ENOMEM); + sbuf_new_for_sysctl(&sb, NULL, PAGE_SIZE, req); if (ksp->ks_raw_ops.addr) addr_op = ksp->ks_raw_ops.addr; @@ -258,8 +253,10 @@ restart_headers: if (has_header) { if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart_headers; - if (rc == 0) - sbuf_printf(sb, "\n%s", ksp->ks_raw_buf); + if (rc == 0) { + sbuf_cat(&sb, "\n"); + sbuf_cat(&sb, ksp->ks_raw_buf); + } } while ((data = addr_op(ksp, n)) != NULL) { @@ -270,22 +267,19 @@ restart: if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart; if (rc == 0) - sbuf_printf(sb, "%s", ksp->ks_raw_buf); + sbuf_cat(&sb, ksp->ks_raw_buf); } else { ASSERT3U(ksp->ks_ndata, ==, 1); - sbuf_hexdump(sb, ksp->ks_data, + sbuf_hexdump(&sb, ksp->ks_data, ksp->ks_data_size, NULL, 0); } n++; } free(ksp->ks_raw_buf, M_TEMP); mutex_exit(ksp->ks_lock); - sbuf_trim(sb); - rc = sbuf_finish(sb); - if (rc == 0) - rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); - sbuf_delete(sb); + rc = sbuf_finish(&sb); + sbuf_delete(&sb); return (rc); } From 020f6fd093b628af5a34eb1cea9a3d800aa17ffb Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 7 Nov 2023 14:37:18 -0500 Subject: [PATCH 09/13] FreeBSD: Implement taskq_init_ent() Previously taskq_init_ent() was an empty macro, while actual init was done by taskq_dispatch_ent(). It could be slightly faster in case taskq never enqueued. But without it taskq_empty_ent() relied on the structure being zeroed by somebody else, that is not good. As a side effect this allows the same task to be queued several times, that is normal on FreeBSD, that may or may not get useful here also one day. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15455 --- include/os/freebsd/spl/sys/taskq.h | 2 +- module/os/freebsd/spl/spl_taskq.c | 30 ++++++++++++++++++------------ 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h index 0f23eafe3d4..d84136ed4b9 100644 --- a/include/os/freebsd/spl/sys/taskq.h +++ b/include/os/freebsd/spl/sys/taskq.h @@ -82,7 +82,6 @@ typedef struct taskq_ent { #define TASKQID_INVALID ((taskqid_t)0) -#define taskq_init_ent(x) extern taskq_t *system_taskq; /* Global dynamic task queue for long delay */ extern taskq_t *system_delay_taskq; @@ -93,6 +92,7 @@ extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); +extern void taskq_init_ent(taskq_ent_t *); taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t, kthread_t ***); diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c index 6912b220a94..cc276e56832 100644 --- a/module/os/freebsd/spl/spl_taskq.c +++ b/module/os/freebsd/spl/spl_taskq.c @@ -480,21 +480,33 @@ void taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags, taskq_ent_t *task) { - int prio; - /* * If TQ_FRONT is given, we want higher priority for this task, so it * can go at the front of the queue. */ - prio = !!(flags & TQ_FRONT); - task->tqent_id = 0; + task->tqent_task.ta_priority = !!(flags & TQ_FRONT); task->tqent_func = func; task->tqent_arg = arg; - - TASK_INIT(&task->tqent_task, prio, taskq_run_ent, task); taskqueue_enqueue(tq->tq_queue, &task->tqent_task); } +void +taskq_init_ent(taskq_ent_t *task) +{ + TASK_INIT(&task->tqent_task, 0, taskq_run_ent, task); + task->tqent_func = NULL; + task->tqent_arg = NULL; + task->tqent_id = 0; + task->tqent_type = NORMAL_TASK; + task->tqent_rc = 0; +} + +int +taskq_empty_ent(taskq_ent_t *task) +{ + return (task->tqent_task.ta_pending == 0); +} + void taskq_wait(taskq_t *tq) { @@ -521,9 +533,3 @@ taskq_wait_outstanding(taskq_t *tq, taskqid_t id __unused) { taskqueue_drain_all(tq->tq_queue); } - -int -taskq_empty_ent(taskq_ent_t *t) -{ - return (t->tqent_task.ta_pending == 0); -} From 78ac86882484dc66c920c8bd3aedb64d5f6f4c0c Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Wed, 8 Nov 2023 01:04:56 +0500 Subject: [PATCH 10/13] Remove obsolete_counts from grub2 compatibility list PR#15459 add all read-only compatible zpool features to grub2 compatibility list. 'obsolete_counts' is a read-only features that depends on 'device_removal' feature which is not read-only and is marked as ZFEATURE_FLAG_MOS. Creating a pool with grub2 compatibility enables 'device_removal' feature as well, which is not desired. This commit removes the 'obsolete_counts' feature from grub2 compatibility list, as GRUB only supports read-only compatible features. Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #15499 --- cmd/zpool/compatibility.d/grub2 | 1 - 1 file changed, 1 deletion(-) diff --git a/cmd/zpool/compatibility.d/grub2 b/cmd/zpool/compatibility.d/grub2 index 8c3a3ce11d9..6d60e643593 100644 --- a/cmd/zpool/compatibility.d/grub2 +++ b/cmd/zpool/compatibility.d/grub2 @@ -14,7 +14,6 @@ large_blocks livelist log_spacemap lz4_compress -obsolete_counts project_quota resilver_defer spacemap_histogram From 3d86999c759f720ed14dda8c50de0ad8029f2d23 Mon Sep 17 00:00:00 2001 From: Jason King Date: Tue, 7 Nov 2023 14:11:48 -0600 Subject: [PATCH 11/13] sa_lookup() ignores buffer size. When retrieving a system attribute, the size of the supplied buffer is ignored. If the buffer is too small to hold the attribute, sa_attr_op() will write past the end of the buffer. Reviewed-by: Brian Behlendorf Signed-off-by: Jason King Closes #15476 --- module/zfs/sa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/zfs/sa.c b/module/zfs/sa.c index f9daaabbed3..0ae4c331dd3 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -23,6 +23,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2023 RackTop Systems, Inc. */ #include @@ -369,7 +370,7 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, if (bulk[i].sa_data) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_addr, bulk[i].sa_data, - bulk[i].sa_size); + MIN(bulk[i].sa_size, bulk[i].sa_length)); } continue; From dc45a00eac35a2e66c3ddba6e52e2b2280bd2be0 Mon Sep 17 00:00:00 2001 From: Gordon Tetlow Date: Tue, 7 Nov 2023 13:21:56 -0800 Subject: [PATCH 12/13] Add kern.features.zfs Add a ZFS feature flag to indicate OpenZFS availability. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Gordon Tetlow Closes #15484 --- module/os/freebsd/zfs/kmod_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c index 9a268573528..00c1acf5771 100644 --- a/module/os/freebsd/zfs/kmod_core.c +++ b/module/os/freebsd/zfs/kmod_core.c @@ -65,6 +65,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -336,6 +337,8 @@ static moduledata_t zfs_mod = { EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); #endif +FEATURE(zfs, "OpenZFS support"); + DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_CLOCKS, SI_ORDER_ANY); MODULE_VERSION(zfsctrl, 1); #if __FreeBSD_version > 1300092 From 9198de8f1079a8bbb837de3e3f8e236777b1375d Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Wed, 8 Nov 2023 02:24:16 +0500 Subject: [PATCH 13/13] Linux 6.6 compat: fix implicit conversion error with debug build With Linux v6.6.0 and GCC 12, when debug build is configured, implicit conversion error is raised while converting 'enum ' to 'boolean_t'. Use 'B_TRUE' instead of 'true' to fix the issue. Reviewed-by: Brian Behlendorf Reviewed-by: Pavel Snajdr Reviewed-by: Brian Atkinson Signed-off-by: Umer Saleem Closes #15489 --- module/os/linux/zfs/zfs_vfsops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index a1db5c57c18..2792bc02721 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1488,7 +1488,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) * read-only flag, pretend it was set, as done for snapshots. */ if (!canwrite) - vfs->vfs_readonly = true; + vfs->vfs_readonly = B_TRUE; error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs); if (error) {