From 9edf6af4aed14948925588eb4d2171d229c9713a Mon Sep 17 00:00:00 2001 From: chenqiuhao1997 Date: Fri, 10 May 2024 23:47:21 +0800 Subject: [PATCH 01/48] Replace P2ALIGN with P2ALIGN_TYPED and delete P2ALIGN. In P2ALIGN, the result would be incorrect when align is unsigned integer and x is larger than max value of the type of align. In that case, -(align) would be a positive integer, which means high bits would be zero and finally stay zero after '&' when align is converted to a larger integer type. Reviewed-by: Brian Behlendorf Reviewed-by: Youzhong Yang Signed-off-by: Qiuhao Chen Closes #15940 --- cmd/zdb/zdb.c | 2 +- cmd/ztest.c | 12 +++++++----- include/os/freebsd/spl/sys/ccompile.h | 3 ++- include/os/freebsd/spl/sys/sysmacros.h | 3 ++- include/os/linux/spl/sys/sysmacros.h | 3 ++- lib/libefi/rdwr_efi.c | 4 ++-- lib/libspl/include/os/linux/sys/sysmacros.h | 3 ++- lib/libzfs/os/linux/libzfs_pool_os.c | 3 ++- module/os/freebsd/zfs/vdev_geom.c | 2 +- module/os/linux/zfs/zvol_os.c | 2 +- module/zcommon/zfs_fletcher.c | 8 +++++--- module/zfs/btree.c | 2 +- module/zfs/dmu.c | 5 +++-- module/zfs/dmu_object.c | 2 +- module/zfs/metaslab.c | 4 ++-- module/zfs/vdev.c | 11 ++++++----- 16 files changed, 40 insertions(+), 29 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index d81199765c6..1eeb6e9116a 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -5217,7 +5217,7 @@ dump_label(const char *dev) sizeof (cksum_record_t), offsetof(cksum_record_t, link)); psize = statbuf.st_size; - psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t); ashift = SPA_MINBLOCKSHIFT; /* diff --git a/cmd/ztest.c b/cmd/ztest.c index 34744d12b59..ca3105ffbbb 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -2448,7 +2448,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, ASSERT3P(zio, !=, NULL); size = doi.doi_data_block_size; if (ISP2(size)) { - offset = P2ALIGN(offset, size); + offset = P2ALIGN_TYPED(offset, size, uint64_t); } else { ASSERT3U(offset, <, size); offset = 0; @@ -4668,7 +4668,8 @@ ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) */ mutex_enter(&os->os_obj_lock); object = ztest_random(os->os_obj_next_chunk); - os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); + os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk, + uint64_t); mutex_exit(&os->os_obj_lock); } @@ -6284,7 +6285,8 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * the end of the disk (vdev_psize) is aligned to * sizeof (vdev_label_t). */ - uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); + uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t), + uint64_t); if ((leaf & 1) == 1 && offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) continue; @@ -6600,8 +6602,8 @@ ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) size_t inc = 64 * ztest_random(size / 67); /* sometimes add few bytes to test non-simd */ if (ztest_random(100) < 10) - inc += P2ALIGN(ztest_random(64), - sizeof (uint32_t)); + inc += P2ALIGN_TYPED(ztest_random(64), + sizeof (uint32_t), uint64_t); if (inc > (size - pos)) inc = size - pos; diff --git a/include/os/freebsd/spl/sys/ccompile.h b/include/os/freebsd/spl/sys/ccompile.h index 26cf4db87ae..bebeb0db245 100644 --- a/include/os/freebsd/spl/sys/ccompile.h +++ b/include/os/freebsd/spl/sys/ccompile.h @@ -138,7 +138,8 @@ typedef int enum_t; #define readdir64 readdir #define dirent64 dirent #endif -#define P2ALIGN(x, align) ((x) & -(align)) +// Deprecated. Use P2ALIGN_TYPED instead. +// #define P2ALIGN(x, align) ((x) & -(align)) #define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) #define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) #define P2PHASE(x, align) ((x) & ((align) - 1)) diff --git a/include/os/freebsd/spl/sys/sysmacros.h b/include/os/freebsd/spl/sys/sysmacros.h index 3e8841ae66b..2c9f4438d76 100644 --- a/include/os/freebsd/spl/sys/sysmacros.h +++ b/include/os/freebsd/spl/sys/sysmacros.h @@ -191,7 +191,8 @@ extern unsigned char bcd_to_byte[256]; * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align) * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align) */ -#define P2ALIGN(x, align) ((x) & -(align)) +// Deprecated. Use P2ALIGN_TYPED instead. +// #define P2ALIGN(x, align) ((x) & -(align)) /* * return x % (mod) align diff --git a/include/os/linux/spl/sys/sysmacros.h b/include/os/linux/spl/sys/sysmacros.h index 99e3a6fb41c..0e839073630 100644 --- a/include/os/linux/spl/sys/sysmacros.h +++ b/include/os/linux/spl/sys/sysmacros.h @@ -159,7 +159,8 @@ makedev(unsigned int major, unsigned int minor) /* * Compatibility macros/typedefs needed for Solaris -> Linux port */ -#define P2ALIGN(x, align) ((x) & -(align)) +// Deprecated. Use P2ALIGN_TYPED instead. +// #define P2ALIGN(x, align) ((x) & -(align)) #define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) #define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) #define P2PHASE(x, align) ((x) & ((align) - 1)) diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c index 739219e0410..63c91059ae5 100644 --- a/lib/libefi/rdwr_efi.c +++ b/lib/libefi/rdwr_efi.c @@ -1175,8 +1175,8 @@ efi_use_whole_disk(int fd) * (for performance reasons). The alignment should match the * alignment used by the "zpool_label_disk" function. */ - limit = P2ALIGN(efi_label->efi_last_lba - nblocks - EFI_MIN_RESV_SIZE, - PARTITION_END_ALIGNMENT); + limit = P2ALIGN_TYPED(efi_label->efi_last_lba - nblocks - + EFI_MIN_RESV_SIZE, PARTITION_END_ALIGNMENT, diskaddr_t); if (data_start + data_size != limit || resv_start != limit) sync_needed = B_TRUE; diff --git a/lib/libspl/include/os/linux/sys/sysmacros.h b/lib/libspl/include/os/linux/sys/sysmacros.h index 5765ee25c6c..26e1c87a35a 100644 --- a/lib/libspl/include/os/linux/sys/sysmacros.h +++ b/lib/libspl/include/os/linux/sys/sysmacros.h @@ -52,7 +52,8 @@ /* * Compatibility macros/typedefs needed for Solaris -> Linux port */ -#define P2ALIGN(x, align) ((x) & -(align)) +// Deprecated. Use P2ALIGN_TYPED instead. +// #define P2ALIGN(x, align) ((x) & -(align)) #define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) #define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) #define P2BOUNDARY(off, len, align) \ diff --git a/lib/libzfs/os/linux/libzfs_pool_os.c b/lib/libzfs/os/linux/libzfs_pool_os.c index 86eef3255bc..7b18e31c867 100644 --- a/lib/libzfs/os/linux/libzfs_pool_os.c +++ b/lib/libzfs/os/linux/libzfs_pool_os.c @@ -268,7 +268,8 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) if (start_block == MAXOFFSET_T) start_block = NEW_START_BLOCK; slice_size -= start_block; - slice_size = P2ALIGN(slice_size, PARTITION_END_ALIGNMENT); + slice_size = P2ALIGN_TYPED(slice_size, PARTITION_END_ALIGNMENT, + uint64_t); vtoc->efi_parts[0].p_start = start_block; vtoc->efi_parts[0].p_size = slice_size; diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index 196d67b4b59..2df13e7e187 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -457,7 +457,7 @@ vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) ZFS_LOG(1, "Reading config from %s...", pp->name); psize = pp->mediasize; - psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t); size = sizeof (*vdev_lists[0]) + pp->sectorsize - ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1; diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 61c8db5d817..41a9b1c53a2 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -384,7 +384,7 @@ zvol_discard(zv_request_t *zvr) */ if (!io_is_secure_erase(bio, rq)) { start = P2ROUNDUP(start, zv->zv_volblocksize); - end = P2ALIGN(end, zv->zv_volblocksize); + end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); size = end - start; } diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index 619ddef0243..74b8c2a475a 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -471,7 +471,8 @@ fletcher_4_native(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; - const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); + const uint64_t p2size = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE, + uint64_t); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); @@ -519,7 +520,8 @@ fletcher_4_byteswap(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; - const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); + const uint64_t p2size = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE, + uint64_t); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); @@ -878,7 +880,7 @@ abd_fletcher_4_iter(void *data, size_t size, void *private) fletcher_4_ctx_t *ctx = cdp->acd_ctx; fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private; boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE; - uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); + uint64_t asize = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE, uint64_t); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); diff --git a/module/zfs/btree.c b/module/zfs/btree.c index af2b94a850b..9c52083603f 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -218,7 +218,7 @@ zfs_btree_create_custom(zfs_btree_t *tree, zfs_btree_find_in_buf : bt_find_in_buf; tree->bt_elem_size = size; tree->bt_leaf_size = lsize; - tree->bt_leaf_cap = P2ALIGN(esize / size, 2); + tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t); tree->bt_height = -1; tree->bt_bulk = NULL; } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index d8d5cfdbd23..1d10ceebfb9 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -537,7 +537,8 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; + P2ALIGN_TYPED(offset, 1ULL << blkshift, uint64_t)) + >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " @@ -854,7 +855,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) } /* set start to the beginning of this L1 indirect */ - *start = P2ALIGN(*start, iblkrange); + *start = P2ALIGN_TYPED(*start, iblkrange, uint64_t); } if (*start < minimum) *start = minimum; diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index d0e39a423bb..56986ea4344 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -160,7 +160,7 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, * is not suitably aligned. */ os->os_obj_next_chunk = - P2ALIGN(object, dnodes_per_chunk) + + P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) + dnodes_per_chunk; (void) atomic_swap_64(cpuobj, object); mutex_exit(&os->os_obj_lock); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index dbfc00362ff..d50225fc3f6 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -629,8 +629,8 @@ metaslab_class_expandable_space(metaslab_class_t *mc) * metaslabs. We report the expandable space in terms * of the metaslab size since that's the unit of expansion. */ - space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize, - 1ULL << tvd->vdev_ms_shift); + space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize, + 1ULL << tvd->vdev_ms_shift, uint64_t); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 981da4e986c..c9ee39b468f 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -347,7 +347,8 @@ vdev_get_min_asize(vdev_t *vd) * to the nearest metaslab. */ if (vd == vd->vdev_top) - return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); + return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, + uint64_t)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } @@ -2107,8 +2108,8 @@ vdev_open(vdev_t *vd) } } - osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); - max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); + osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t); + max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { @@ -4730,9 +4731,9 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { - vs->vs_esize = P2ALIGN( + vs->vs_esize = P2ALIGN_TYPED( vd->vdev_max_asize - vd->vdev_asize, - 1ULL << tvd->vdev_ms_shift); + 1ULL << tvd->vdev_ms_shift, uint64_t); } vs->vs_configured_ashift = vd->vdev_top != NULL From b474dfad0d9dfe708eea0a14108e17374dd9b093 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 22 Apr 2024 14:41:03 -0400 Subject: [PATCH 02/48] Refactor dbuf_read() for safer decryption In dbuf_read_verify_dnode_crypt(): - We don't need original dbuf locked there. Instead take a lock on a dnode dbuf, that is actually manipulated. - Block decryption for a dnode dbuf if it is currently being written. ARC hash lock does not protect anonymous buffers, so arc_untransform() is unsafe when used on buffers being written, that may happen in case of encrypted dnode buffers, since they are not copied by dbuf_dirty()/dbuf_hold_copy(). In dbuf_read(): - If the buffer is in flight, recheck its compression/encryption status after it is cached, since it may need arc_untransform(). Tested-by: Rich Ercolani Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16104 --- module/zfs/dbuf.c | 222 ++++++++++++++++++++++------------------------ 1 file changed, 108 insertions(+), 114 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 8bd9dd9a88c..d56117cde01 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -161,13 +161,13 @@ struct { } dbuf_sums; #define DBUF_STAT_INCR(stat, val) \ - wmsum_add(&dbuf_sums.stat, val); + wmsum_add(&dbuf_sums.stat, val) #define DBUF_STAT_DECR(stat, val) \ - DBUF_STAT_INCR(stat, -(val)); + DBUF_STAT_INCR(stat, -(val)) #define DBUF_STAT_BUMP(stat) \ - DBUF_STAT_INCR(stat, 1); + DBUF_STAT_INCR(stat, 1) #define DBUF_STAT_BUMPDOWN(stat) \ - DBUF_STAT_INCR(stat, -1); + DBUF_STAT_INCR(stat, -1) #define DBUF_STAT_MAX(stat, v) { \ uint64_t _m; \ while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ @@ -177,7 +177,6 @@ struct { static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); -static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags); /* * Global data structures and functions for the dbuf cache. @@ -1403,13 +1402,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, * a decrypted block. Otherwise success. */ static int -dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) +dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn) { - int bonuslen, max_bonuslen, err; - - err = dbuf_read_verify_dnode_crypt(db, flags); - if (err) - return (err); + int bonuslen, max_bonuslen; bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); @@ -1494,32 +1489,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp) * decrypt / authenticate them when we need to read an encrypted bonus buffer. */ static int -dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) +dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) { - int err = 0; objset_t *os = db->db_objset; - arc_buf_t *dnode_abuf; - dnode_t *dn; + dmu_buf_impl_t *dndb; + arc_buf_t *dnbuf; zbookmark_phys_t zb; - - ASSERT(MUTEX_HELD(&db->db_mtx)); + int err; if ((flags & DB_RF_NO_DECRYPT) != 0 || - !os->os_encrypted || os->os_raw_receive) + !os->os_encrypted || os->os_raw_receive || + (dndb = dn->dn_dbuf) == NULL) return (0); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL; - - if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) { - DB_DNODE_EXIT(db); + dnbuf = dndb->db_buf; + if (!arc_is_encrypted(dnbuf)) return (0); - } + + mutex_enter(&dndb->db_mtx); + + /* + * Since dnode buffer is modified by sync process, there can be only + * one copy of it. It means we can not modify (decrypt) it while it + * is being written. I don't see how this may happen now, since + * encrypted dnode writes by receive should be completed before any + * plain-text reads due to txg wait, but better be safe than sorry. + */ + while (1) { + if (!arc_is_encrypted(dnbuf)) { + mutex_exit(&dndb->db_mtx); + return (0); + } + dbuf_dirty_record_t *dr = dndb->db_data_pending; + if (dr == NULL || dr->dt.dl.dr_data != dnbuf) + break; + cv_wait(&dndb->db_changed, &dndb->db_mtx); + }; SET_BOOKMARK(&zb, dmu_objset_id(os), - DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid); - err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE); + DMU_META_DNODE_OBJECT, 0, dndb->db_blkid); + err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE); /* * An error code of EACCES tells us that the key is still not @@ -1532,7 +1541,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)))) err = 0; - DB_DNODE_EXIT(db); + mutex_exit(&dndb->db_mtx); return (err); } @@ -1558,7 +1567,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, RW_LOCK_HELD(&db->db_parent->db_rwlock)); if (db->db_blkid == DMU_BONUS_BLKID) { - err = dbuf_read_bonus(db, dn, flags); + err = dbuf_read_bonus(db, dn); goto early_unlock; } @@ -1619,10 +1628,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, goto early_unlock; } - err = dbuf_read_verify_dnode_crypt(db, flags); - if (err != 0) - goto early_unlock; - db->db_state = DB_READ; DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); @@ -1738,19 +1743,23 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) int dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) { - int err = 0; - boolean_t prefetch; dnode_t *dn; + boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch; + int err; - /* - * We don't have to hold the mutex to check db_state because it - * can't be freed while we have a hold on the buffer. - */ ASSERT(!zfs_refcount_is_zero(&db->db_holds)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); + /* + * Ensure that this block's dnode has been decrypted if the caller + * has requested decrypted data. + */ + err = dbuf_read_verify_dnode_crypt(db, dn, flags); + if (err != 0) + goto done; + prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0; @@ -1759,13 +1768,38 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) db->db_partial_read = B_TRUE; else if (!(flags & DB_RF_PARTIAL_MORE)) db->db_partial_read = B_FALSE; - if (db->db_state == DB_CACHED) { - /* - * Ensure that this block's dnode has been decrypted if - * the caller has requested decrypted data. - */ - err = dbuf_read_verify_dnode_crypt(db, flags); + miss = (db->db_state != DB_CACHED); + if (db->db_state == DB_READ || db->db_state == DB_FILL) { + /* + * Another reader came in while the dbuf was in flight between + * UNCACHED and CACHED. Either a writer will finish filling + * the buffer, sending the dbuf to CACHED, or the first reader's + * request will reach the read_done callback and send the dbuf + * to CACHED. Otherwise, a failure occurred and the dbuf will + * be sent to UNCACHED. + */ + if (flags & DB_RF_NEVERWAIT) { + mutex_exit(&db->db_mtx); + DB_DNODE_EXIT(db); + goto done; + } + do { + ASSERT(db->db_state == DB_READ || + (flags & DB_RF_HAVESTRUCT) == 0); + DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, + zio_t *, pio); + cv_wait(&db->db_changed, &db->db_mtx); + } while (db->db_state == DB_READ || db->db_state == DB_FILL); + if (db->db_state == DB_UNCACHED) { + err = SET_ERROR(EIO); + mutex_exit(&db->db_mtx); + DB_DNODE_EXIT(db); + goto done; + } + } + + if (db->db_state == DB_CACHED) { /* * If the arc buf is compressed or encrypted and the caller * requested uncompressed data, we need to untransform it @@ -1773,8 +1807,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) * unauthenticated blocks, which will verify their MAC if * the key is now available. */ - if (err == 0 && db->db_buf != NULL && - (flags & DB_RF_NO_DECRYPT) == 0 && + if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL && (arc_is_encrypted(db->db_buf) || arc_is_unauthenticated(db->db_buf) || arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { @@ -1788,17 +1821,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); - if (err == 0 && prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - B_FALSE, flags & DB_RF_HAVESTRUCT); - } - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_hits); - } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) { - boolean_t need_wait = B_FALSE; - + } else { + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); - if (pio == NULL && (db->db_state == DB_NOFILL || (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { spa_t *spa = dn->dn_objset->os_spa; @@ -1806,65 +1832,33 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) need_wait = B_TRUE; } err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG); - /* - * dbuf_read_impl has dropped db_mtx and our parent's rwlock - * for us - */ - if (!err && prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - db->db_state != DB_CACHED, - flags & DB_RF_HAVESTRUCT); - } - - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); - - /* - * If we created a zio_root we must execute it to avoid - * leaking it, even if it isn't attached to any work due - * to an error in dbuf_read_impl(). - */ - if (need_wait) { - if (err == 0) - err = zio_wait(pio); - else - (void) zio_wait(pio); - pio = NULL; - } - } else { - /* - * Another reader came in while the dbuf was in flight - * between UNCACHED and CACHED. Either a writer will finish - * writing the buffer (sending the dbuf to CACHED) or the - * first reader's request will reach the read_done callback - * and send the dbuf to CACHED. Otherwise, a failure - * occurred and the dbuf went to UNCACHED. - */ - mutex_exit(&db->db_mtx); - if (prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - B_TRUE, flags & DB_RF_HAVESTRUCT); - } - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); - - /* Skip the wait per the caller's request. */ - if ((flags & DB_RF_NEVERWAIT) == 0) { - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || - db->db_state == DB_FILL) { - ASSERT(db->db_state == DB_READ || - (flags & DB_RF_HAVESTRUCT) == 0); - DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, - db, zio_t *, pio); - cv_wait(&db->db_changed, &db->db_mtx); - } - if (db->db_state == DB_UNCACHED) - err = SET_ERROR(EIO); - mutex_exit(&db->db_mtx); - } + /* dbuf_read_impl drops db_mtx and parent's rwlock. */ + miss = (db->db_state != DB_CACHED); } + if (err == 0 && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss, + flags & DB_RF_HAVESTRUCT); + } + DB_DNODE_EXIT(db); + + /* + * If we created a zio we must execute it to avoid leaking it, even if + * it isn't attached to any work due to an error in dbuf_read_impl(). + */ + if (need_wait) { + if (err == 0) + err = zio_wait(pio); + else + (void) zio_wait(pio); + pio = NULL; + } + +done: + if (miss) + DBUF_STAT_BUMP(hash_misses); + else + DBUF_STAT_BUMP(hash_hits); if (pio && err != 0) { zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, ZIO_FLAG_CANFAIL); From 0f1e8ba2f8260bd36dce010c0adb4dda80a9d442 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 23 Apr 2024 12:06:00 -0400 Subject: [PATCH 03/48] L2ARC: Cleanup buffer re-compression When compressed ARC is disabled, we may have to re-compress when writing into L2ARC. If doing so we can't fit it into the original physical size, we should just fail immediately, since even if it may still fit into allocation size, its checksum will never match. While there, refactor the code similar to other compression places without using abd_return_buf_copy(). Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16038 --- module/zfs/arc.c | 59 ++++++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 39 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 1953640139b..4bcf9f1522b 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8895,7 +8895,6 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; - void *tmp = NULL; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); @@ -8916,12 +8915,11 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ - if (HDR_HAS_RABD(hdr) && asize != psize) { - ASSERT3U(asize, >=, psize); + if (HDR_HAS_RABD(hdr)) { + ASSERT3U(asize, >, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); - if (psize != asize) - abd_zero_off(to_write, psize, asize - psize); + abd_zero_off(to_write, psize, asize - psize); goto out; } @@ -8930,48 +8928,31 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); - if (size != asize) + if (asize > size) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - /* - * In some cases, we can wind up with size > asize, so - * we need to opt for the larger allocation option here. - * - * (We also need abd_return_buf_copy in all cases because - * it's an ASSERT() to modify the buffer before returning it - * with arc_return_buf(), and all the compressors - * write things before deciding to fail compression in nearly - * every case.) - */ - uint64_t bufsize = MAX(size, asize); - cabd = abd_alloc_for_io(bufsize, ismd); - tmp = abd_borrow_buf(cabd, bufsize); - - psize = zio_compress_data(compress, to_write, &tmp, size, - hdr->b_complevel); - - if (psize >= asize) { - psize = HDR_GET_PSIZE(hdr); - abd_return_buf_copy(cabd, tmp, bufsize); - HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); - to_write = cabd; - abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); - if (psize != asize) - abd_zero_off(to_write, psize, asize - psize); - goto encrypt; + size_t bufsize = MAX(size, asize); + void *buf = zio_buf_alloc(bufsize); + uint64_t csize = zio_compress_data(compress, to_write, &buf, + size, hdr->b_complevel); + if (csize > psize) { + /* + * We can't re-compress the block into the original + * psize. Even if it fits into asize, it does not + * matter, since checksum will never match on read. + */ + zio_buf_free(buf, bufsize); + return (SET_ERROR(EIO)); } - ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); - if (psize < asize) - memset((char *)tmp + psize, 0, bufsize - psize); - psize = HDR_GET_PSIZE(hdr); - abd_return_buf_copy(cabd, tmp, bufsize); - to_write = cabd; + if (asize > csize) + memset((char *)buf + csize, 0, asize - csize); + to_write = cabd = abd_get_from_buf(buf, bufsize); + abd_take_ownership_of_buf(cabd, B_TRUE); } -encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); From 938d1588ebc7fb41e24500b2f21839f5a53b4ab5 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 24 Apr 2024 17:38:48 -0400 Subject: [PATCH 04/48] Make more taskq parameters writable There is no reason for these module parameters to be read-only. Being modified they just apply on next pool import/creation, that is useful for testing different values. Reviewed-by: Rich Ercolani Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16118 --- man/man4/zfs.4 | 8 ++++++-- module/zfs/spa.c | 8 ++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 1191cc96249..717cf8e6eae 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2324,8 +2324,8 @@ Prioritize requeued I/O. . .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint Percentage of online CPUs which will run a worker thread for I/O. -These workers are responsible for I/O work such as compression and -checksum calculations. +These workers are responsible for I/O work such as compression, encryption, +checksum and parity calculations. Fractional number of CPUs will be rounded down. .Pp The default value of @@ -2333,6 +2333,7 @@ The default value of was chosen to avoid using all CPUs which can result in latency issues and inconsistent application performance, especially when slower compression and/or checksumming is enabled. +Set value only applies to pools imported/created after that. . .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint Number of worker threads per taskq. @@ -2342,16 +2343,19 @@ while higher reduces lock contention. If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. +Set value only applies to pools imported/created after that. . .It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp Set the queue and thread configuration for the IO read queues. This is an advanced debugging parameter. Don't change this unless you understand what it does. +Set values only apply to pools imported/created after that. . .It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp Set the queue and thread configuration for the IO write queues. This is an advanced debugging parameter. Don't change this unless you understand what it does. +Set values only apply to pools imported/created after that. . .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 251dd8a4d1c..fc7cf000f0c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -10561,10 +10561,10 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, "Print vdev tree to zfs_dbgmsg during pool import"); -ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW, "Percentage of CPUs to run an IO worker thread"); -ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW, "Number of threads per IO worker taskqueue"); /* BEGIN CSTYLED */ @@ -10595,10 +10595,10 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, #ifdef _KERNEL ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, - spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD, + spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, "Configure IO queues for read IO"); ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, - spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD, + spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW, "Configure IO queues for write IO"); #endif /* END CSTYLED */ From 672474659687115fc10f56f5473cd16eb307a99f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 1 May 2024 13:59:32 -0400 Subject: [PATCH 05/48] Slightly improve dnode hash As I understand just for being less predictable dnode hash includes 8 bits of objset pointer, starting at 6. But since objset_t is more than 1KB in size, its allocations are likely aligned to 2KB, that means 11 lower bits provide no entropy. Just take the 8 bits starting from 11. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16131 --- module/zfs/dmu_objset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index ef966d6703c..c9729fd8853 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -400,10 +400,10 @@ dnode_hash(const objset_t *os, uint64_t obj) ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); /* - * The low 6 bits of the pointer don't have much entropy, because - * the objset_t is larger than 2^6 bytes long. + * The lower 11 bits of the pointer don't have much entropy, because + * the objset_t is more than 1KB long and so likely aligned to 2KB. */ - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 11)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF]; From 41f2a9c81f6151cd01fc455f13b8330aa72718a6 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 9 May 2024 10:32:59 -0400 Subject: [PATCH 06/48] Fix scn_queue races on very old pools Code for pools before version 11 uses dmu_objset_find_dp() to scan for children datasets/clones. It calls enqueue_clones_cb() and enqueue_cb() callbacks in parallel from multiple taskq threads. It ends up bad for scan_ds_queue_insert(), corrupting scn_queue AVL-tree. Fix it by introducing a mutex to protect those two scan_ds_queue_insert() calls. All other calls are done from the sync thread and so serialized. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16162 --- include/sys/dsl_scan.h | 1 + module/zfs/dsl_scan.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 2e3452e5eba..f32f59a2bed 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -173,6 +173,7 @@ typedef struct dsl_scan { dsl_scan_phys_t scn_phys; /* on disk representation of scan */ dsl_scan_phys_t scn_phys_cached; avl_tree_t scn_queue; /* queue of datasets to scan */ + kmutex_t scn_queue_lock; /* serializes scn_queue inserts */ uint64_t scn_queues_pending; /* outstanding data to issue */ /* members needed for syncing error scrub status to disk */ dsl_errorscrub_phys_t errorscrub_phys; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 34012db82de..c509f402c44 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -491,6 +491,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), offsetof(scan_ds_t, sds_node)); + mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, sizeof (scan_prefetch_issue_ctx_t), offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); @@ -646,6 +647,7 @@ dsl_scan_fini(dsl_pool_t *dp) scan_ds_queue_clear(scn); avl_destroy(&scn->scn_queue); + mutex_destroy(&scn->scn_queue_lock); scan_ds_prefetch_queue_clear(scn); avl_destroy(&scn->scn_prefetch_queue); @@ -2727,8 +2729,10 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) return (err); ds = prev; } + mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); + mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } @@ -2919,8 +2923,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) ds = prev; } + mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); + mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } From 4c484d66b7f8fe595fdb89d91797e3e8fa686dab Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 9 May 2024 10:39:57 -0400 Subject: [PATCH 07/48] Fix ZIL clone records for legacy holes Previous code overengineered cloned range calculation by using BP_GET_LSIZE(). The problem is that legacy holes don't have the logical size, so result will be wrong. But we also don't need to look on every block size, since they all must be identical. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16165 --- module/zfs/zfs_log.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 50325907b0d..006b9119816 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -903,7 +903,7 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, itx_t *itx; lr_clone_range_t *lr; uint64_t partlen, max_log_data; - size_t i, partnbps; + size_t partnbps; if (zil_replaying(zilog, tx) || zp->z_unlinked) return; @@ -912,10 +912,8 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, while (nbps > 0) { partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); - partlen = 0; - for (i = 0; i < partnbps; i++) { - partlen += BP_GET_LSIZE(&bps[i]); - } + partlen = partnbps * blksz; + ASSERT3U(partlen, <, len + blksz); partlen = MIN(partlen, len); itx = zil_itx_create(txtype, From fa4b1a404e12d3b883c86d8ea13d8c1203addce0 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 10 May 2024 15:35:20 -0400 Subject: [PATCH 08/48] ZAP: Fix leaf references on zap_expand_leaf() errors Depending on kind of error zap_expand_leaf() may return with or without valid leaf reference held. Make sure it returns NULL if due to error it has no leaf to return. Make its callers to check the returned leaf pointer, and release the leaf if it is not NULL. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #12366 Closes #16159 --- module/zfs/zap.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/module/zfs/zap.c b/module/zfs/zap.c index da86defb445..e8e02bf0c26 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -635,6 +635,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, uint64_t object = zap->zap_object; zap_put_leaf(l); + *lp = l = NULL; zap_unlockdir(zap, tag); err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); @@ -844,21 +845,17 @@ retry: } else if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) { + if (err == 0) goto retry; - } else if (err == ENOSPC) { - /* - * If we failed to expand the leaf, then bailout - * as there is no point trying - * zap_put_leaf_maybe_grow_ptrtbl(). - */ - return (err); - } } out: - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + } return (err); } @@ -915,8 +912,12 @@ retry: goto retry; } - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + } return (err); } From 4c0fbd8d6d1565ba2dd3d403a2bb42e2b6c64ec0 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 16 May 2024 20:56:55 -0400 Subject: [PATCH 09/48] FreeBSD: Add zfs_link_create() error handling Originally Solaris didn't expect errors there, but they may happen if we fail to add entry into ZAP. Linux fixed it in #7421, but it was never fully ported to FreeBSD. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored-By: iXsystems, Inc. Closes #13215 Closes #16138 --- module/os/freebsd/zfs/zfs_dir.c | 1 + module/os/freebsd/zfs/zfs_vnops_os.c | 54 ++++++++++++++++++++------ module/os/freebsd/zfs/zfs_znode.c | 1 - tests/test-runner/bin/zts-report.py.in | 1 - 4 files changed, 43 insertions(+), 14 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_dir.c b/module/os/freebsd/zfs/zfs_dir.c index 948df8e50de..3cdb94d6cd5 100644 --- a/module/os/freebsd/zfs/zfs_dir.c +++ b/module/os/freebsd/zfs/zfs_dir.c @@ -543,6 +543,7 @@ zfs_rmnode(znode_t *zp) dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); zfs_znode_delete(zp, tx); + zfs_znode_free(zp); dmu_tx_commit(tx); diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 1ba25bce619..5962baf23af 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1169,10 +1169,25 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + error = zfs_link_create(dzp, name, zp, tx, ZNEW); + if (error != 0) { + /* + * Since, we failed to add the directory entry for it, + * delete the newly created dnode. + */ + zfs_znode_delete(zp, tx); + VOP_UNLOCK1(ZTOV(zp)); + zrele(zp); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + getnewvnode_drop_reserve(); + goto out; + } + if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - (void) zfs_link_create(dzp, name, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); @@ -1520,13 +1535,19 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - /* * Now put new name in parent dir. */ - (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); + error = zfs_link_create(dzp, dirname, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + VOP_UNLOCK1(ZTOV(zp)); + zrele(zp); + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); *zpp = zp; @@ -1534,6 +1555,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); +out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); @@ -1544,7 +1566,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); - return (0); + return (error); } #if __FreeBSD_version < 1300124 @@ -3578,10 +3600,14 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, /* * Insert the new object into the directory. */ - (void) zfs_link_create(dzp, name, zp, tx, ZNEW); - - zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - *zpp = zp; + error = zfs_link_create(dzp, name, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + VOP_UNLOCK1(ZTOV(zp)); + zrele(zp); + } else { + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); + } zfs_acl_ids_free(&acl_ids); @@ -3589,8 +3615,12 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, getnewvnode_drop_reserve(); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0) { + *zpp = zp; + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + } zfs_exit(zfsvfs, FTAG); return (error); diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c index 0d4c94555c6..0eea2a84941 100644 --- a/module/os/freebsd/zfs/zfs_znode.c +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -1234,7 +1234,6 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) VERIFY0(dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); - zfs_znode_free(zp); } void diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index ecc50f48715..5ca13093136 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -182,7 +182,6 @@ if sys.platform.startswith('freebsd'): 'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason], 'cp_files/cp_files_002_pos': ['SKIP', na_reason], 'link_count/link_count_001': ['SKIP', na_reason], - 'casenorm/mixed_create_failure': ['FAIL', 13215], 'mmap/mmap_sync_001_pos': ['SKIP', na_reason], 'rsend/send_raw_ashift': ['SKIP', 14961], }) From 2eab4f7b396a1abb9adf7cab82d6ff47f9d78e9b Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Sat, 25 May 2024 04:02:58 +0200 Subject: [PATCH 10/48] Fix assertion in Persistent L2ARC At the end of l2arc_evict() fix an assertion in the case that l2ad_hand + distance == l2ad_end. Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #16202 Closes #16207 --- module/zfs/arc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 4bcf9f1522b..f25afa312cb 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8879,7 +8879,7 @@ out: * assertions may be violated without functional consequences * as the device is about to be removed. */ - ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end); if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } From 54ef0fdf60a8e7633c38cb46e1f5bcfcec792f4e Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Mon, 15 Jul 2024 18:07:33 +0200 Subject: [PATCH 11/48] head_errlog: fix use-after-free In the commit of the head_errlog feature we introduced a bug in dsl_dataset_promote_sync(): we may dereference origin_head and hds, both dereferencing ddpa after calling promote_sync() on ddpa. Reviewed-by: Brian Behlendorf Reviewed-by: Chunwei Chen Reviewed-by: Rob Norris Reviewed-by: Tony Hutter Signed-off-by: George Amanakis Closes #16272 Closes #16273 --- module/zfs/dsl_dataset.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index d6db6172922..94c25e67487 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -3710,16 +3710,19 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) spa_history_log_internal_ds(hds, "promote", tx, " "); dsl_dir_rele(odd, FTAG); - promote_rele(ddpa, FTAG); /* - * Transfer common error blocks from old head to new head. + * Transfer common error blocks from old head to new head, before + * calling promote_rele() on ddpa since we need to dereference + * origin_head and hds. */ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) { uint64_t old_head = origin_head->ds_object; uint64_t new_head = hds->ds_object; spa_swap_errlog(dp->dp_spa, new_head, old_head, tx); } + + promote_rele(ddpa, FTAG); } /* From f4e2aed42a704e2f238886b0dac50c06671b2d70 Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 25 May 2024 11:54:24 +1000 Subject: [PATCH 12/48] Linux 6.7 compat: detect if kernel defines intptr_t Since Linux 6.7 the kernel has defined intptr_t. Clang has -Wtypedef-redefinition by default, which causes the build to fail because we also have a typedef for intptr_t. Since its better to use the kernel's if it exists, detect it and skip our own. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16201 --- config/kernel-types.m4 | 40 ++++++++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ include/os/linux/spl/sys/types.h | 2 ++ 3 files changed, 44 insertions(+) create mode 100644 config/kernel-types.m4 diff --git a/config/kernel-types.m4 b/config/kernel-types.m4 new file mode 100644 index 00000000000..ed76af28337 --- /dev/null +++ b/config/kernel-types.m4 @@ -0,0 +1,40 @@ +dnl # +dnl # check if kernel provides definitions for given types +dnl # + +dnl _ZFS_AC_KERNEL_SRC_TYPE(type) +AC_DEFUN([_ZFS_AC_KERNEL_SRC_TYPE], [ + ZFS_LINUX_TEST_SRC([type_$1], [ + #include + ],[ + const $1 __attribute__((unused)) x = ($1) 0; + ]) +]) + +dnl _ZFS_AC_KERNEL_TYPE(type) +AC_DEFUN([_ZFS_AC_KERNEL_TYPE], [ + AC_MSG_CHECKING([whether kernel defines $1]) + ZFS_LINUX_TEST_RESULT([type_$1], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_KERNEL_]m4_quote(m4_translit([$1], [a-z], [A-Z])), + 1, [kernel defines $1]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl ZFS_AC_KERNEL_TYPES([types...]) +AC_DEFUN([ZFS_AC_KERNEL_TYPES], [ + AC_DEFUN([ZFS_AC_KERNEL_SRC_TYPES], [ + m4_foreach_w([type], [$1], [ + _ZFS_AC_KERNEL_SRC_TYPE(type) + ]) + ]) + AC_DEFUN([ZFS_AC_KERNEL_TYPES], [ + m4_foreach_w([type], [$1], [ + _ZFS_AC_KERNEL_TYPE(type) + ]) + ]) +]) + +ZFS_AC_KERNEL_TYPES([intptr_t]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 548905ccd04..b51477b6a95 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -37,6 +37,7 @@ dnl # only once the compilation can be done in parallel significantly dnl # speeding up the process. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ + ZFS_AC_KERNEL_SRC_TYPES ZFS_AC_KERNEL_SRC_OBJTOOL ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE @@ -187,6 +188,7 @@ dnl # dnl # Check results of kernel interface tests. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ + ZFS_AC_KERNEL_TYPES ZFS_AC_KERNEL_ACCESS_OK_TYPE ZFS_AC_KERNEL_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_OBJTOOL diff --git a/include/os/linux/spl/sys/types.h b/include/os/linux/spl/sys/types.h index 20ba457f7ef..94ba7b6ad32 100644 --- a/include/os/linux/spl/sys/types.h +++ b/include/os/linux/spl/sys/types.h @@ -38,7 +38,9 @@ typedef unsigned long ulong_t; typedef unsigned long long u_longlong_t; typedef long long longlong_t; +#ifndef HAVE_KERNEL_INTPTR_T typedef long intptr_t; +#endif typedef unsigned long long rlim64_t; typedef struct task_struct kthread_t; From c24a039042ef846940adae8e1dd1fc33cbb1f147 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 28 Jun 2024 09:52:03 -0700 Subject: [PATCH 13/48] Linux 6.9: Call add_disk() from workqueue to fix zfs_allow_010_pos (#16282) The 6.9 kernel behaves differently in how it releases block devices. In the common case it will async release the device only after the return to userspace. This is different from the 6.8 and older kernels which release the block devices synchronously. To get around this, call add_disk() from a workqueue so that the kernel uses a different codepath to release our zvols in the way we expect. This stops zfs_allow_010_pos from hanging. Fixes: #16089 Signed-off-by: Tony Hutter Reviewed-by: Tino Reichardt Reviewed-by: Rob Norris --- module/os/linux/zfs/zvol_os.c | 102 ++++++++++++++++++++++++++++++++-- 1 file changed, 97 insertions(+), 5 deletions(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 41a9b1c53a2..85d2bebc4bc 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -41,6 +41,7 @@ #include #include +#include #ifdef HAVE_BLK_MQ #include @@ -1337,6 +1338,101 @@ zvol_wait_close(zvol_state_t *zv) { } +struct add_disk_work { + struct delayed_work work; + struct gendisk *disk; + int error; +}; + +static int +__zvol_os_add_disk(struct gendisk *disk) +{ + int error = 0; +#ifdef HAVE_ADD_DISK_RET + error = add_disk(disk); +#else + add_disk(disk); +#endif + return (error); +} + +#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) +static void +zvol_os_add_disk_work(struct work_struct *work) +{ + struct add_disk_work *add_disk_work; + add_disk_work = container_of(work, struct add_disk_work, work.work); + add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk); +} +#endif + +/* + * SPECIAL CASE: + * + * This function basically calls add_disk() from a workqueue. You may be + * thinking: why not just call add_disk() directly? + * + * When you call add_disk(), the zvol appears to the world. When this happens, + * the kernel calls disk_scan_partitions() on the zvol, which behaves + * differently on the 6.9+ kernels: + * + * - 6.8 and older kernels - + * disk_scan_partitions() + * handle = bdev_open_by_dev( + * zvol_open() + * bdev_release(handle); + * zvol_release() + * + * + * - 6.9+ kernels - + * disk_scan_partitions() + * file = bdev_file_open_by_dev() + * zvol_open() + * fput(file) + * < wait for return to userspace > + * zvol_release() + * + * The difference is that the bdev_release() from the 6.8 kernel is synchronous + * while the fput() from the 6.9 kernel is async. Or more specifically it's + * async that has to wait until we return to userspace (since it adds the fput + * into the caller's work queue with the TWA_RESUME flag set). This is not the + * behavior we want, since we want do things like create+destroy a zvol within + * a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the + * reference to the zvol while we're in the IOCTL, which can't wait until we + * return to userspace. + * + * We can get around this since fput() has a special codepath for when it's + * running in a kernel thread or interrupt. In those cases, it just puts the + * fput into the system workqueue, which we can force to run with + * __flush_workqueue(). That is why we call add_disk() from a workqueue - so it + * run from a kernel thread and "tricks" the fput() codepaths. + * + * Note that __flush_workqueue() is slowly getting deprecated. This may be ok + * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via + * fput) to happen, which it eventually, naturally, will from the system_wq + * without us explicitly calling __flush_workqueue(). + */ +static int +zvol_os_add_disk(struct gendisk *disk) +{ +#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */ + struct add_disk_work add_disk_work; + + INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work); + add_disk_work.disk = disk; + add_disk_work.error = 0; + + /* Use *_delayed_work functions since they're not GPL'd */ + schedule_delayed_work(&add_disk_work.work, 0); + flush_delayed_work(&add_disk_work.work); + + __flush_workqueue(system_wq); + return (add_disk_work.error); +#else /* <= 6.8 kernel */ + return (__zvol_os_add_disk(disk)); +#endif +} + /* * Create a block device minor node and setup the linkage between it * and the specified volume. Once this function returns the block @@ -1541,11 +1637,7 @@ out_doi: rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); rw_exit(&zvol_state_lock); -#ifdef HAVE_ADD_DISK_RET - error = add_disk(zv->zv_zso->zvo_disk); -#else - add_disk(zv->zv_zso->zvo_disk); -#endif + error = zvol_os_add_disk(zv->zv_zso->zvo_disk); } else { ida_simple_remove(&zvol_ida, idx); } From d7bf0e5259f311dbe6466cc3655e8cdb27392fb9 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 11 Jul 2024 16:41:26 -0700 Subject: [PATCH 14/48] Linux 6.9: Fix UBSAN errors in zap_micro.c You can use the UBSAN_SANITIZE_* Kbuild options to exclude certain kernel objects from the UBSAN checks. We previously excluded zap_micro.o with: UBSAN_SANITIZE_zap_micro.o := n For some reason that didn't work for the 6.9 kernel, which wants us to use: UBSAN_SANITIZE_zfs/zap_micro.o := n Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #16278 Closes #16330 --- module/Kbuild.in | 1 + 1 file changed, 1 insertion(+) diff --git a/module/Kbuild.in b/module/Kbuild.in index f1a145779dd..d13e1cf1143 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -492,6 +492,7 @@ zfs-$(CONFIG_PPC64) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) UBSAN_SANITIZE_zap_leaf.o := n UBSAN_SANITIZE_zap_micro.o := n UBSAN_SANITIZE_sa.o := n +UBSAN_SANITIZE_zfs/zap_micro.o := n # Suppress incorrect warnings from versions of objtool which are not # aware of x86 EVEX prefix instructions used for AVX512. From 0342c4a6b22d872beb286a09004706f521061c24 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 27 May 2024 21:32:07 -0400 Subject: [PATCH 15/48] Linux 6.10: rework queue limits setup Linux has started moving to a model where instead of applying block queue limits through individual modification functions, a complete limits structure is built up and applied atomically, either when the block device or open, or some time afterwards. As of 6.10 this transition appears only partly completed. This commit matches that model within OpenZFS in a way that should work for past and future kernels. We set up a queue limits structure with any limits that have had their modification functions removed. For newer kernels that can have limits applied at block device open (HAVE_BLK_ALLOC_DISK_2ARG), we have a conversion function to turn the OpenZFS queue limits structure into Linux's queue_limits structure, which can then be passed in. For older kernels, we provide an application function that just calls the old functions for each limit in the structure. Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf --- config/kernel-blk-queue.m4 | 4 +- module/os/linux/zfs/zvol_os.c | 186 +++++++++++++++++++++------------- 2 files changed, 118 insertions(+), 72 deletions(-) diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4 index 15dbe1c7dff..2f0b386e663 100644 --- a/config/kernel-blk-queue.m4 +++ b/config/kernel-blk-queue.m4 @@ -332,7 +332,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS], [ ZFS_LINUX_TEST_RESULT([blk_queue_max_hw_sectors], [ AC_MSG_RESULT(yes) ],[ - ZFS_LINUX_TEST_ERROR([blk_queue_max_hw_sectors]) + AC_MSG_RESULT(no) ]) ]) @@ -355,7 +355,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [ ZFS_LINUX_TEST_RESULT([blk_queue_max_segments], [ AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blk_queue_max_segments]) + AC_MSG_RESULT(no) ]) ]) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 85d2bebc4bc..129e3606bb8 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1074,8 +1074,106 @@ static const struct block_device_operations zvol_ops = { #endif }; +typedef struct zvol_queue_limits { + unsigned int zql_max_hw_sectors; + unsigned short zql_max_segments; + unsigned int zql_max_segment_size; + unsigned int zql_io_opt; +} zvol_queue_limits_t; + +static void +zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, + boolean_t use_blk_mq) +{ + limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9; + + if (use_blk_mq) { + /* + * IO requests can be really big (1MB). When an IO request + * comes in, it is passed off to zvol_read() or zvol_write() + * in a new thread, where it is chunked up into 'volblocksize' + * sized pieces and processed. So for example, if the request + * is a 1MB write and your volblocksize is 128k, one zvol_write + * thread will take that request and sequentially do ten 128k + * IOs. This is due to the fact that the thread needs to lock + * each volblocksize sized block. So you might be wondering: + * "instead of passing the whole 1MB request to one thread, + * why not pass ten individual 128k chunks to ten threads and + * process the whole write in parallel?" The short answer is + * that there's a sweet spot number of chunks that balances + * the greater parallelism with the added overhead of more + * threads. The sweet spot can be different depending on if you + * have a read or write heavy workload. Writes typically want + * high chunk counts while reads typically want lower ones. On + * a test pool with 6 NVMe drives in a 3x 2-disk mirror + * configuration, with volblocksize=8k, the sweet spot for good + * sequential reads and writes was at 8 chunks. + */ + + /* + * Below we tell the kernel how big we want our requests + * to be. You would think that blk_queue_io_opt() would be + * used to do this since it is used to "set optimal request + * size for the queue", but that doesn't seem to do + * anything - the kernel still gives you huge requests + * with tons of little PAGE_SIZE segments contained within it. + * + * Knowing that the kernel will just give you PAGE_SIZE segments + * no matter what, you can say "ok, I want PAGE_SIZE byte + * segments, and I want 'N' of them per request", where N is + * the correct number of segments for the volblocksize and + * number of chunks you want. + */ +#ifdef HAVE_BLK_MQ + if (zvol_blk_mq_blocks_per_thread != 0) { + unsigned int chunks; + chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); + + limits->zql_max_segment_size = PAGE_SIZE; + limits->zql_max_segments = + (zv->zv_volblocksize * chunks) / PAGE_SIZE; + } else { + /* + * Special case: zvol_blk_mq_blocks_per_thread = 0 + * Max everything out. + */ + limits->zql_max_segments = UINT16_MAX; + limits->zql_max_segment_size = UINT_MAX; + } + } else { +#endif + limits->zql_max_segments = UINT16_MAX; + limits->zql_max_segment_size = UINT_MAX; + } + + limits->zql_io_opt = zv->zv_volblocksize; +} + +#ifdef HAVE_BLK_ALLOC_DISK_2ARG +static void +zvol_queue_limits_convert(zvol_queue_limits_t *limits, + struct queue_limits *qlimits) +{ + memset(qlimits, 0, sizeof (struct queue_limits)); + qlimits->max_hw_sectors = limits->zql_max_hw_sectors; + qlimits->max_segments = limits->zql_max_segments; + qlimits->max_segment_size = limits->zql_max_segment_size; + qlimits->io_opt = limits->zql_io_opt; +} +#else +static void +zvol_queue_limits_apply(zvol_queue_limits_t *limits, + struct request_queue *queue) +{ + blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); + blk_queue_max_segments(queue, limits->zql_max_segments); + blk_queue_max_segment_size(queue, limits->zql_max_segment_size); + blk_queue_io_opt(queue, limits->zql_io_opt); +} +#endif + static int -zvol_alloc_non_blk_mq(struct zvol_state_os *zso) +zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) { #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) #if defined(HAVE_BLK_ALLOC_DISK) @@ -1085,8 +1183,11 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso) zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; + zvol_queue_limits_apply(limits, zso->zvo_queue); #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) - struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + struct queue_limits qlimits; + zvol_queue_limits_convert(limits, &qlimits); + struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE); if (IS_ERR(disk)) { zso->zvo_disk = NULL; return (1); @@ -1107,6 +1208,7 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso) } zso->zvo_disk->queue = zso->zvo_queue; + zvol_queue_limits_apply(limits, zso->zvo_queue); #endif /* HAVE_BLK_ALLOC_DISK */ #else zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); @@ -1120,13 +1222,14 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso) } zso->zvo_disk->queue = zso->zvo_queue; + zvol_queue_limits_apply(limits, zso->zvo_queue); #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ return (0); } static int -zvol_alloc_blk_mq(zvol_state_t *zv) +zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) { #ifdef HAVE_BLK_MQ struct zvol_state_os *zso = zv->zv_zso; @@ -1142,9 +1245,12 @@ zvol_alloc_blk_mq(zvol_state_t *zv) return (1); } zso->zvo_queue = zso->zvo_disk->queue; + zvol_queue_limits_apply(limits, zso->zvo_queue); zso->zvo_disk->minors = ZVOL_MINORS; #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) - struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv); + struct queue_limits qlimits; + zvol_queue_limits_convert(limits, &qlimits); + struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv); if (IS_ERR(disk)) { zso->zvo_disk = NULL; blk_mq_free_tag_set(&zso->tag_set); @@ -1170,6 +1276,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv) /* Our queue is now created, assign it to our disk */ zso->zvo_disk->queue = zso->zvo_queue; + zvol_queue_limits_apply(limits, zso->zvo_queue); #endif #endif @@ -1209,6 +1316,9 @@ zvol_alloc(dev_t dev, const char *name) zv->zv_zso->use_blk_mq = zvol_use_blk_mq; #endif + zvol_queue_limits_t limits; + zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq); + /* * The block layer has 3 interfaces for getting BIOs: * @@ -1225,10 +1335,10 @@ zvol_alloc(dev_t dev, const char *name) * disk and the queue separately. (5.13 kernel or older) */ if (zv->zv_zso->use_blk_mq) { - ret = zvol_alloc_blk_mq(zv); + ret = zvol_alloc_blk_mq(zv, &limits); zso->zvo_disk->fops = &zvol_ops_blk_mq; } else { - ret = zvol_alloc_non_blk_mq(zso); + ret = zvol_alloc_non_blk_mq(zso, &limits); zso->zvo_disk->fops = &zvol_ops; } if (ret != 0) @@ -1505,74 +1615,10 @@ zvol_os_create_minor(const char *name) set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); - blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, - (DMU_MAX_ACCESS / 4) >> 9); - if (zv->zv_zso->use_blk_mq) { - /* - * IO requests can be really big (1MB). When an IO request - * comes in, it is passed off to zvol_read() or zvol_write() - * in a new thread, where it is chunked up into 'volblocksize' - * sized pieces and processed. So for example, if the request - * is a 1MB write and your volblocksize is 128k, one zvol_write - * thread will take that request and sequentially do ten 128k - * IOs. This is due to the fact that the thread needs to lock - * each volblocksize sized block. So you might be wondering: - * "instead of passing the whole 1MB request to one thread, - * why not pass ten individual 128k chunks to ten threads and - * process the whole write in parallel?" The short answer is - * that there's a sweet spot number of chunks that balances - * the greater parallelism with the added overhead of more - * threads. The sweet spot can be different depending on if you - * have a read or write heavy workload. Writes typically want - * high chunk counts while reads typically want lower ones. On - * a test pool with 6 NVMe drives in a 3x 2-disk mirror - * configuration, with volblocksize=8k, the sweet spot for good - * sequential reads and writes was at 8 chunks. - */ - - /* - * Below we tell the kernel how big we want our requests - * to be. You would think that blk_queue_io_opt() would be - * used to do this since it is used to "set optimal request - * size for the queue", but that doesn't seem to do - * anything - the kernel still gives you huge requests - * with tons of little PAGE_SIZE segments contained within it. - * - * Knowing that the kernel will just give you PAGE_SIZE segments - * no matter what, you can say "ok, I want PAGE_SIZE byte - * segments, and I want 'N' of them per request", where N is - * the correct number of segments for the volblocksize and - * number of chunks you want. - */ -#ifdef HAVE_BLK_MQ - if (zvol_blk_mq_blocks_per_thread != 0) { - unsigned int chunks; - chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); - - blk_queue_max_segment_size(zv->zv_zso->zvo_queue, - PAGE_SIZE); - blk_queue_max_segments(zv->zv_zso->zvo_queue, - (zv->zv_volblocksize * chunks) / PAGE_SIZE); - } else { - /* - * Special case: zvol_blk_mq_blocks_per_thread = 0 - * Max everything out. - */ - blk_queue_max_segments(zv->zv_zso->zvo_queue, - UINT16_MAX); - blk_queue_max_segment_size(zv->zv_zso->zvo_queue, - UINT_MAX); - } -#endif - } else { - blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); - blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); - } blk_queue_physical_block_size(zv->zv_zso->zvo_queue, zv->zv_volblocksize); - blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); blk_queue_discard_granularity(zv->zv_zso->zvo_queue, From 3ea3649755d4a684a7e04a0c6a5eb3df8f87b27b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 28 May 2024 11:56:41 -0400 Subject: [PATCH 16/48] Linux 6.10: work harder to avoid kmem_cache_alloc reuse Linux 6.10 change kmem_cache_alloc to be a macro, rather than a function, such that the old #undef for it in spl-kmem-cache.c would remove its definition completely, breaking the build. This inverts the model used before. Rather than always defining the kmem_cache_* macro, then undefining then inside spl-kmem-cache.c, instead we make a special tag to indicate we're currently inside spl-kmem-cache.c, and not defining those in macros in the first place, so we can use the kernel-supplied kmem_cache_* functions to implement spl_kmem_cache_*, as we expect. For all other callers, we create the macros as normal and remove access to the kernel's own conflicting names. Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf --- include/os/linux/spl/sys/kmem_cache.h | 19 +++++++++++-------- module/os/linux/spl/spl-kmem-cache.c | 12 ++---------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h index b159bb52d11..905ff57a143 100644 --- a/include/os/linux/spl/sys/kmem_cache.h +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -192,22 +192,25 @@ extern void spl_kmem_reap(void); extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); +#ifndef SPL_KMEM_CACHE_IMPLEMENTING +/* + * Macros for the kmem_cache_* API expected by ZFS and SPL clients. We don't + * define them inside spl-kmem-cache.c, as that uses the kernel's incompatible + * kmem_cache_* facilities to implement ours. + */ + +/* Avoid conflicts with kernel names that might be implemented as macros. */ +#undef kmem_cache_alloc + #define kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) \ spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) #define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) #define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) -/* - * This is necessary to be compatible with other kernel modules - * or in-tree filesystem that may define kmem_cache_alloc, - * like bcachefs does it now. - */ -#ifdef kmem_cache_alloc -#undef kmem_cache_alloc -#endif #define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) #define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) #define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc) #define kmem_reap() spl_kmem_reap() +#endif /* * The following functions are only available for internal use. diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 42821ad6025..737c2e063f7 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -21,6 +21,8 @@ * with the SPL. If not, see . */ +#define SPL_KMEM_CACHE_IMPLEMENTING + #include #include #include @@ -33,16 +35,6 @@ #include #include -/* - * Within the scope of spl-kmem.c file the kmem_cache_* definitions - * are removed to allow access to the real Linux slab allocator. - */ -#undef kmem_cache_destroy -#undef kmem_cache_create -#undef kmem_cache_alloc -#undef kmem_cache_free - - /* * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() * with smp_mb__{before,after}_atomic() because they were redundant. This is From 7d8e2a7f730374cb39fefa25e02702c03a41aa85 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 28 May 2024 16:16:28 -0400 Subject: [PATCH 17/48] Linux 5.16: use bdev_nr_bytes() to get device capacity This helper was introduced long ago, in 5.16. Since 6.10, bd_inode no longer exists, but the helper has been updated, so detect it and use it in all versions where it is available. Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf --- config/kernel-blkdev.m4 | 26 ++++++++++++++++++++++++++ module/os/linux/zfs/vdev_disk.c | 14 +++++++++----- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index b6ce1e1cf08..4f60f96acb5 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -534,6 +534,30 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE], [ ]) ]) +dnl # +dnl # 5.16 API change +dnl # Added bdev_nr_bytes() helper. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_NR_BYTES], [ + ZFS_LINUX_TEST_SRC([bdev_nr_bytes], [ + #include + ],[ + struct block_device *bdev = NULL; + loff_t nr_bytes __attribute__ ((unused)) = 0; + nr_bytes = bdev_nr_bytes(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_NR_BYTES], [ + AC_MSG_CHECKING([whether bdev_nr_bytes() is available]) + ZFS_LINUX_TEST_RESULT([bdev_nr_bytes], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BDEV_NR_BYTES, 1, [bdev_nr_bytes() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + dnl # dnl # 5.20 API change, dnl # Removed bdevname(), snprintf(.., %pg) should be used. @@ -747,6 +771,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE + ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_NR_BYTES ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ @@ -767,6 +792,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE + ZFS_AC_KERNEL_BLKDEV_BDEV_NR_BYTES ZFS_AC_KERNEL_BLKDEV_BDEVNAME ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 943e534ef5b..67704093d71 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -150,7 +150,11 @@ vdev_bdev_mode(spa_mode_t smode) static uint64_t bdev_capacity(struct block_device *bdev) { +#ifdef HAVE_BDEV_NR_BYTES + return (bdev_nr_bytes(bdev)); +#else return (i_size_read(bdev->bd_inode)); +#endif } #if !defined(HAVE_BDEV_WHOLE) @@ -209,7 +213,7 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) * "reserved" EFI partition: in such cases return the device * usable capacity. */ - available = i_size_read(bdev_whole(bdev)->bd_inode) - + available = bdev_capacity(bdev_whole(bdev)) - ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + PARTITION_END_ALIGNMENT) << SECTOR_BITS); psize = MAX(available, bdev_capacity(bdev)); @@ -916,12 +920,12 @@ vdev_disk_io_rw(zio_t *zio) /* * Accessing outside the block device is never allowed. */ - if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) { + if (zio->io_offset + zio->io_size > bdev_capacity(bdev)) { vdev_dbgmsg(zio->io_vd, "Illegal access %llu size %llu, device size %llu", (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, - (u_longlong_t)i_size_read(bdev->bd_inode)); + (u_longlong_t)bdev_capacity(bdev)); return (SET_ERROR(EIO)); } @@ -1116,12 +1120,12 @@ vdev_classic_physio(zio_t *zio) /* * Accessing outside the block device is never allowed. */ - if (io_offset + io_size > bdev->bd_inode->i_size) { + if (io_offset + io_size > bdev_capacity(bdev)) { vdev_dbgmsg(zio->io_vd, "Illegal access %llu size %llu, device size %llu", (u_longlong_t)io_offset, (u_longlong_t)io_size, - (u_longlong_t)i_size_read(bdev->bd_inode)); + (u_longlong_t)bdev_capacity(bdev)); return (SET_ERROR(EIO)); } From 97f1eb80522532b65bf5b529d765c454cfbbc6b8 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 31 May 2024 15:11:00 -0700 Subject: [PATCH 18/48] ZTS: Fix redacted_send failures on FreeBSD We're seeing failures for redacted_deleted and redacted_mount on FreeBSD 13-15: 09:58:34.74 diff: /dev/fd/3: No such file or directory 09:58:34.74 ERROR: diff /dev/fd/3 /dev/fd/4 exited 2 The test was trying to diff the file listings between two directories to see if they are the same. The workaround is to do a string comparison of the directory listings instead of using `diff`. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #16224 --- .../tests/functional/redacted_send/redacted_deleted.ksh | 2 +- .../tests/functional/redacted_send/redacted_mounts.ksh | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh index 3e2aeb73354..ec11610742f 100755 --- a/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh @@ -96,7 +96,7 @@ log_must zfs destroy -R $clone2 log_must eval "zfs send -i $sendfs#book2 --redact book3 $sendfs@snap2 >$stream" log_must eval "zfs recv $recvfs <$stream" log_must mount_redacted -f $recvfs -log_must diff <(ls $send_mnt) <(ls $recv_mnt) +log_must [ "$(ls $send_mnt)" == "$(ls $recv_mnt)" ] log_must zfs destroy -R $recvfs log_must zfs rollback -R $sendfs@snap diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh index 0bc4bf46174..c041469163c 100755 --- a/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh @@ -71,8 +71,7 @@ log_must ismounted $recvfs # deleted. contents=$(log_must find $recv_mnt) contents_orig=$(log_must find $send_mnt) -log_must diff <(echo ${contents//$recv_mnt/}) \ - <(echo ${contents_orig//$send_mnt/}) +log_must [ "${contents//$recv_mnt/}" == "${contents_orig//$send_mnt/}" ] log_must zfs redact $sendvol@snap book2 $clonevol@snap log_must eval "zfs send --redact book2 $sendvol@snap >$stream" log_must eval "zfs receive $recvvol <$stream" @@ -103,7 +102,6 @@ log_must mount_redacted -f $recvfs log_must ismounted $recvfs contents=$(log_must find $recv_mnt) contents_orig=$(log_must find $send_mnt) -log_must diff <(echo ${contents//$recv_mnt/}) \ - <(echo ${contents_orig//$send_mnt/}) +log_must [ "${contents//$recv_mnt/}" == "${contents_orig//$send_mnt/}" ] log_pass "Received redacted streams can be mounted." From da9da6aea6b925c91b8e46dab0e9cd7488cf1ddc Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 13 Jul 2024 03:58:03 +1000 Subject: [PATCH 19/48] ZTS: handle FreeBSD version numbers correctly (#16340) FreeBSD patchlevel versions are optional and, if present, in a different location in the version string. Sponsored-by: https://despairlabs.com/sponsor/ Signed-off-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Tino Reichardt Reviewed-by: Tony Hutter --- tests/zfs-tests/include/libtest.shlib | 30 ++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index dfab48d2cda..a2f42999a31 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -62,11 +62,39 @@ function compare_version_gte } # Helper function used by linux_version() and freebsd_version() +# $1, if provided, should be a MAJOR, MAJOR.MINOR or MAJOR.MINOR.PATCH +# version number function kernel_version { typeset ver="$1" - [ -z "$ver" ] && ver=$(uname -r | grep -Eo "^[0-9]+\.[0-9]+\.[0-9]+") + [ -z "$ver" ] && case "$UNAME" in + Linux) + # Linux version numbers are X.Y.Z followed by optional + # vendor/distro specific stuff + # RHEL7: 3.10.0-1160.108.1.el7.x86_64 + # Fedora 37: 6.5.12-100.fc37.x86_64 + # Debian 12.6: 6.1.0-22-amd64 + ver=$(uname -r | grep -Eo "^[0-9]+\.[0-9]+\.[0-9]+") + ;; + FreeBSD) + # FreeBSD version numbers are X.Y-BRANCH-pZ. Depending on + # branch, -pZ may not be present, but this is typically only + # on pre-release or true .0 releases, so can be assumed 0 + # if not present. + # eg: + # 13.2-RELEASE-p4 + # 14.1-RELEASE + # 15.0-CURRENT + ver=$(uname -r | \ + grep -Eo "[0-9]+\.[0-9]+(-[A-Z0-9]+-p[0-9]+)?" | \ + sed -E "s/-[^-]+-p/./") + ;; + *) + # Unknown system + log_fail "Don't know how to get kernel version for '$UNAME'" + ;; + esac typeset version major minor _ IFS='.' read -r version major minor _ <<<"$ver" From bb401c02fca54e0877d33ee6e8012edb38ce1a27 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 16 Jul 2024 16:27:54 -0700 Subject: [PATCH 20/48] Linux 6.9 compat: META (#16358) Update the META file to reflect compatibility with the 6.9 kernel. Signed-off-by: Tony Hutter Reviewed-by: Brian Behlendorf --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 383fa37fd42..0f32d8b4fe1 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.8 +Linux-Maximum: 6.9 Linux-Minimum: 3.10 From 08da05400556a778105156640a6f0dbee81e32ee Mon Sep 17 00:00:00 2001 From: a1ea321 Date: Sat, 13 Jul 2024 01:27:12 +0200 Subject: [PATCH 21/48] one-word manpage correction: snapshot->rollback (#16294) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes what is probably a copy-paste mistake. The `dracut.zfs` manpage claims that the `bootfs.rollback` option executes `zfs snapshot -Rf`. `zfs snapshot` does not have a `-R` option. `zfs rollback` does. Signed-off-by: Alphan Yılmaz Reviewed-by: Rob Norris Reviewed-by: George Melikov Reviewed-by: Tony Hutter --- man/man7/dracut.zfs.7 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/man7/dracut.zfs.7 b/man/man7/dracut.zfs.7 index c1475c695e8..b67e1cecb24 100644 --- a/man/man7/dracut.zfs.7 +++ b/man/man7/dracut.zfs.7 @@ -154,7 +154,7 @@ defaults to the current kernel release. . .It Sy bootfs.rollback Ns Op Sy = Ns Ar snapshot-name Execute -.Nm zfs Cm snapshot Fl Rf Ar boot-dataset Ns Sy @ Ns Ar snapshot-name +.Nm zfs Cm rollback Fl Rf Ar boot-dataset Ns Sy @ Ns Ar snapshot-name before pivoting to the real root. .Ar snapshot-name defaults to the current kernel release. From dfdac38afb1f25de0b35c4b522a839fe78052aed Mon Sep 17 00:00:00 2001 From: Daniel Berlin Date: Fri, 12 Jul 2024 20:44:10 -0400 Subject: [PATCH 22/48] Fix missing semicolon in trace_dbuf.h (#16281) On fedora 40, on the 6.9.4 kernel (in updates-testing), assign_str expands to a "do { } while(0)" loop. Without this semicolon, the while(0) is unterminated, causing a cascade of useless errors. With this semicolon, it compiles fine. It also compiles fine on 6.8.11 (the previous kernel). I have not tested earlier kernels than that, but at worst it should add a pointless semicolon. All other instances in the source tree are already terminated with semicolons. Signed-off-by: Daniel Berlin Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- include/os/linux/zfs/sys/trace_dbuf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/os/linux/zfs/sys/trace_dbuf.h b/include/os/linux/zfs/sys/trace_dbuf.h index 0f6a98b47d6..0e9cbdd725b 100644 --- a/include/os/linux/zfs/sys/trace_dbuf.h +++ b/include/os/linux/zfs/sys/trace_dbuf.h @@ -80,7 +80,7 @@ snprintf(__get_str(msg), TRACE_DBUF_MSG_MAX, \ DBUF_TP_PRINTK_FMT, DBUF_TP_PRINTK_ARGS); \ } else { \ - __assign_str(os_spa, "NULL") \ + __assign_str(os_spa, "NULL"); \ __entry->ds_object = 0; \ __entry->db_object = 0; \ __entry->db_level = 0; \ From f14a62ebbe87f52993dc52b6e0f3c0aee1f6d6d2 Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 16 Apr 2024 06:44:12 +1000 Subject: [PATCH 23/48] zts: allow running a single test by name only Specifying a single test is kind of a hassle, because the full relative path under the test suite dir has to be included, but it's not always clear what that path even is. This change allows `-t` to take the name of a single test instead of a full path. If the value has no `/` characters, we search for a file of that name under the test root, and if found, use that as the full test path instead. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Akash B Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #16088 --- scripts/zfs-tests.sh | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh index 179e24d7a0e..b5b3e4ab351 100755 --- a/scripts/zfs-tests.sh +++ b/scripts/zfs-tests.sh @@ -326,7 +326,8 @@ OPTIONS: -d DIR Use world-writable DIR for files and loopback devices -s SIZE Use vdevs of SIZE (default: 4G) -r RUNFILES Run tests in RUNFILES (default: ${DEFAULT_RUNFILES}) - -t PATH Run single test at PATH relative to test suite + -t PATH|NAME Run single test at PATH relative to test suite, + or search for test by NAME -T TAGS Comma separated list of tags (default: 'functional') -u USER Run single test as USER (default: root) @@ -340,6 +341,9 @@ $0 -r linux-fast # Run a single test $0 -t tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh +# Run a single test by name +$0 -t zfs_bookmark_cliargs + # Cleanup a previous run of the test suite prior to testing, run the # default ($(echo "${DEFAULT_RUNFILES}" | sed 's/\.run//')) suite of tests and perform no cleanup on exit. $0 -x @@ -450,8 +454,15 @@ post_user = root post = outputdir = /var/tmp/test_results EOF - SINGLETESTDIR="${SINGLETEST%/*}" + if [ "$SINGLETEST" = "${SINGLETEST%/*}" ] ; then + NEWSINGLETEST=$(find "$STF_SUITE" -name "$SINGLETEST*" -print -quit) + if [ -z "$NEWSINGLETEST" ] ; then + fail "couldn't find test matching '$SINGLETEST'" + fi + SINGLETEST=$NEWSINGLETEST + fi + SINGLETESTDIR="${SINGLETEST%/*}" SETUPDIR="$SINGLETESTDIR" [ "${SETUPDIR#/}" = "$SETUPDIR" ] && SETUPDIR="$STF_SUITE/$SINGLETESTDIR" [ -x "$SETUPDIR/setup.ksh" ] && SETUPSCRIPT="setup" || SETUPSCRIPT= From ad8c8c1e31f94a96f904c59e6aef2972dd10670c Mon Sep 17 00:00:00 2001 From: Rob N Date: Wed, 17 Apr 2024 02:13:01 +1000 Subject: [PATCH 24/48] zts: add a debug option to get full test output The test runner accumulates output from individual tests, then writes it to the log at the end. If a test hangs or crashes the system half way through, we get no insight into how it got to where it did. This adds a -D option for "debug". When set, all test output is written to stdout. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Akash B Signed-off-by: Rob Norris Closes #16096 --- scripts/zfs-tests.sh | 10 +++++++++- tests/test-runner/bin/test-runner.py.in | 23 ++++++++++++++++------- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh index b5b3e4ab351..c25903ea1be 100755 --- a/scripts/zfs-tests.sh +++ b/scripts/zfs-tests.sh @@ -32,6 +32,7 @@ SCRIPT_COMMON=${SCRIPT_COMMON:-${0%/*}/common.sh} PROG=zfs-tests.sh VERBOSE="no" QUIET="" +DEBUG="" CLEANUP="yes" CLEANUPALL="no" KMSG="" @@ -313,6 +314,7 @@ OPTIONS: -h Show this message -v Verbose zfs-tests.sh output -q Quiet test-runner output + -D Debug; show all test output immediately (noisy) -x Remove all testpools, dm, lo, and files (unsafe) -k Disable cleanup after test failure -K Log test names to /dev/kmsg @@ -351,7 +353,7 @@ $0 -x EOF } -while getopts 'hvqxkKfScRmn:d:s:r:?t:T:u:I:' OPTION; do +while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do case $OPTION in h) usage @@ -397,6 +399,9 @@ while getopts 'hvqxkKfScRmn:d:s:r:?t:T:u:I:' OPTION; do d) FILEDIR="$OPTARG" ;; + D) + DEBUG="yes" + ;; I) ITERATIONS="$OPTARG" if [ "$ITERATIONS" -le 0 ]; then @@ -691,6 +696,7 @@ REPORT_FILE=$(mktemp_file zts-report) # msg "${TEST_RUNNER}" \ "${QUIET:+-q}" \ + "${DEBUG:+-D}" \ "${KMEMLEAK:+-m}" \ "${KMSG:+-K}" \ "-c \"${RUNFILES}\"" \ @@ -700,6 +706,7 @@ msg "${TEST_RUNNER}" \ { PATH=$STF_PATH \ ${TEST_RUNNER} \ ${QUIET:+-q} \ + ${DEBUG:+-D} \ ${KMEMLEAK:+-m} \ ${KMSG:+-K} \ -c "${RUNFILES}" \ @@ -726,6 +733,7 @@ if [ "$RESULT" -eq "2" ] && [ -n "$RERUN" ]; then { PATH=$STF_PATH \ ${TEST_RUNNER} \ ${QUIET:+-q} \ + ${DEBUG:+-D} \ ${KMEMLEAK:+-m} \ -c "${RUNFILES}" \ -T "${TAGS}" \ diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in index 422ebd7bc8b..65247f4f06f 100755 --- a/tests/test-runner/bin/test-runner.py.in +++ b/tests/test-runner/bin/test-runner.py.in @@ -113,8 +113,9 @@ class Output(object): This class is a slightly modified version of the 'Stream' class found here: http://goo.gl/aSGfv """ - def __init__(self, stream): + def __init__(self, stream, debug=False): self.stream = stream + self.debug = debug self._buf = b'' self.lines = [] @@ -140,6 +141,8 @@ class Output(object): buf = os.read(fd, 4096) if not buf: return None + if self.debug: + os.write(sys.stderr.fileno(), buf) if b'\n' not in buf: self._buf += buf return [] @@ -238,14 +241,14 @@ User: %s ret = '%s -E -u %s %s' % (SUDO, user, cmd) return ret.split(' ') - def collect_output(self, proc): + def collect_output(self, proc, debug=False): """ Read from stdout/stderr as data becomes available, until the process is no longer running. Return the lines from the stdout and stderr Output objects. """ - out = Output(proc.stdout) - err = Output(proc.stderr) + out = Output(proc.stdout, debug) + err = Output(proc.stderr, debug) res = [] while proc.returncode is None: proc.poll() @@ -308,7 +311,10 @@ User: %s try: t.start() - self.result.stdout, self.result.stderr = self.collect_output(proc) + + out, err = self.collect_output(proc, options.debug) + self.result.stdout = out + self.result.stderr = err if kmemleak: cmd = f'{SUDO} sh -c "echo scan > {KMEMLEAK_FILE}"' @@ -624,7 +630,7 @@ Tags: %s class TestRun(object): - props = ['quiet', 'outputdir'] + props = ['quiet', 'outputdir', 'debug'] def __init__(self, options): self.tests = {} @@ -644,7 +650,8 @@ class TestRun(object): ('post_user', ''), ('failsafe', ''), ('failsafe_user', ''), - ('tags', []) + ('tags', []), + ('debug', False) ] def __str__(self): @@ -1067,6 +1074,8 @@ def parse_args(): help='Specify tests to run via config files.') parser.add_option('-d', action='store_true', default=False, dest='dryrun', help='Dry run. Print tests, but take no other action.') + parser.add_option('-D', action='store_true', default=False, dest='debug', + help='Write all test output to stdout as it arrives.') parser.add_option('-l', action='callback', callback=options_cb, default=None, dest='logfile', metavar='logfile', type='string', From fa2480f5b3bc1d265304a950c1b5ab05497853cd Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 20 Apr 2024 09:41:31 +1000 Subject: [PATCH 25/48] abd_iter_page: rework to handle multipage scatterlists Previously, abd_iter_page() would assume that every scatterlist would contain a single page (compound or no), because that's all we ever create in abd_alloc_chunks(). However, scatterlists can contain multiple pages of arbitrary provenance, and if we get one of those, we'd get all the math wrong. This reworks things to handle multiple pages in a scatterlist, by properly finding the right page within it for the given offset, and understanding better where the end of the page is and not crossing it. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reported-by: Brian Atkinson Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Rob Norris Closes #16108 --- module/os/linux/zfs/abd_os.c | 120 +++++++++++++++++++++-------------- 1 file changed, 74 insertions(+), 46 deletions(-) diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index d3255dcbc0f..cee7410c883 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -1015,10 +1015,50 @@ abd_cache_reap_now(void) } #if defined(_KERNEL) + /* - * Yield the next page struct and data offset and size within it, without + * This is abd_iter_page(), the function underneath abd_iterate_page_func(). + * It yields the next page struct and data offset and size within it, without * mapping it into the address space. */ + +/* + * "Compound pages" are a group of pages that can be referenced from a single + * struct page *. Its organised as a "head" page, followed by a series of + * "tail" pages. + * + * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we + * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a + * great many of the IO buffers we get are going to be of this type. + * + * The tail pages are just regular PAGESIZE pages, and can be safely used + * as-is. However, the head page has length covering itself and all the tail + * pages. If the ABD chunk spans multiple pages, then we can use the head page + * and a >PAGESIZE length, which is far more efficient. + * + * Before kernel 4.5 however, compound page heads were refcounted separately + * from tail pages, such that moving back to the head page would require us to + * take a reference to it and releasing it once we're completely finished with + * it. In practice, that means when our caller is done with the ABD, which we + * have no insight into from here. Rather than contort this API to track head + * page references on such ancient kernels, we disable this special compound + * page handling on 4.5, instead just using treating each page within it as a + * regular PAGESIZE page (which it is). This is slightly less efficient, but + * makes everything far simpler. + * + * The below test sets/clears ABD_ITER_COMPOUND_PAGES to enable/disable the + * special handling, and also defines the ABD_ITER_PAGE_SIZE(page) macro to + * understand compound pages, or not, as required. + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) +#define ABD_ITER_COMPOUND_PAGES 1 +#define ABD_ITER_PAGE_SIZE(page) \ + (PageCompound(page) ? page_size(page) : PAGESIZE) +#else +#undef ABD_ITER_COMPOUND_PAGES +#define ABD_ITER_PAGE_SIZE(page) (PAGESIZE) +#endif + void abd_iter_page(struct abd_iter *aiter) { @@ -1032,6 +1072,12 @@ abd_iter_page(struct abd_iter *aiter) struct page *page; size_t doff, dsize; + /* + * Find the page, and the start of the data within it. This is computed + * differently for linear and scatter ABDs; linear is referenced by + * virtual memory location, while scatter is referenced by page + * pointer. + */ if (abd_is_linear(aiter->iter_abd)) { ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); @@ -1044,57 +1090,24 @@ abd_iter_page(struct abd_iter *aiter) /* offset of address within the page */ doff = offset_in_page(paddr); - - /* total data remaining in abd from this position */ - dsize = aiter->iter_abd->abd_size - aiter->iter_offset; } else { ASSERT(!abd_is_gang(aiter->iter_abd)); /* current scatter page */ - page = sg_page(aiter->iter_sg); + page = nth_page(sg_page(aiter->iter_sg), + aiter->iter_offset >> PAGE_SHIFT); /* position within page */ - doff = aiter->iter_offset; - - /* remaining data in scatterlist */ - dsize = MIN(aiter->iter_sg->length - aiter->iter_offset, - aiter->iter_abd->abd_size - aiter->iter_pos); + doff = aiter->iter_offset & (PAGESIZE - 1); } - ASSERT(page); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) +#ifdef ABD_ITER_COMPOUND_PAGES if (PageTail(page)) { /* - * This page is part of a "compound page", which is a group of - * pages that can be referenced from a single struct page *. - * Its organised as a "head" page, followed by a series of - * "tail" pages. - * - * In OpenZFS, compound pages are allocated using the - * __GFP_COMP flag, which we get from scatter ABDs and SPL - * vmalloc slabs (ie >16K allocations). So a great many of the - * IO buffers we get are going to be of this type. - * - * The tail pages are just regular PAGE_SIZE pages, and can be - * safely used as-is. However, the head page has length - * covering itself and all the tail pages. If this ABD chunk - * spans multiple pages, then we can use the head page and a - * >PAGE_SIZE length, which is far more efficient. - * - * To do this, we need to adjust the offset to be counted from - * the head page. struct page for compound pages are stored - * contiguously, so we can just adjust by a simple offset. - * - * Before kernel 4.5, compound page heads were refcounted - * separately, such that moving back to the head page would - * require us to take a reference to it and releasing it once - * we're completely finished with it. In practice, that means - * when our caller is done with the ABD, which we have no - * insight into from here. Rather than contort this API to - * track head page references on such ancient kernels, we just - * compile this block out and use the tail pages directly. This - * is slightly less efficient, but makes everything far - * simpler. + * If this is a compound tail page, move back to the head, and + * adjust the offset to match. This may let us yield a much + * larger amount of data from a single logical page, and so + * leave our caller with fewer pages to process. */ struct page *head = compound_head(page); doff += ((page - head) * PAGESIZE); @@ -1102,12 +1115,27 @@ abd_iter_page(struct abd_iter *aiter) } #endif - /* final page and position within it */ + ASSERT(page); + + /* + * Compute the maximum amount of data we can take from this page. This + * is the smaller of: + * - the remaining space in the page + * - the remaining space in this scatterlist entry (which may not cover + * the entire page) + * - the remaining space in the abd (which may not cover the entire + * scatterlist entry) + */ + dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff, + aiter->iter_abd->abd_size - aiter->iter_pos); + if (!abd_is_linear(aiter->iter_abd)) + dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset); + ASSERT3U(dsize, >, 0); + + /* final iterator outputs */ aiter->iter_page = page; aiter->iter_page_doff = doff; - - /* amount of data in the chunk, up to the end of the page */ - aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); + aiter->iter_page_dsize = dsize; } /* From 32cd2da551c952298c11b5f17efd5cbb18b98d8f Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 30 Apr 2024 12:35:30 +1000 Subject: [PATCH 26/48] find_system_library: fix var cleanup when library not found The "not found" path is attempting to clear SOMELIB_CFLAGS and SOMELIB_LIBS by resetting them in AC_SUBST(). However, the second arg to AC_SUBST is expanded in autoconf with `m4_ifvaln([$2], [[$1]=$2])`, which is defined as "if the first arg is non-empty". The m4 "empty" construction is [], therefore, the existing AC_SUBST calls never modify the variables at all. The effect of this is that leftovers from the library test can leak out. At least, if a library header is found in the first stage, but the library itself is not, -lsomelib is added to SOMELIB_LIBS and further tests done. If that library is not found, SOMELIB_LIBS will not be cleared. For most of our library tests this hasn't been a problem, as they're either always found properly via pkg-config or set directly, or the calling test immediately aborts configure. For an optional dependency however, an apparent "partial" result where the header is found but no corresponding library causes link errors later. I think a complete fix should probably not be setting SOMELIB_xxx until the final result is known, but for now, adjusting the AC_SUBST calls to explictly set the empty shell string (which is not "empty" to m4) at least restores the intent. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16140 --- config/find_system_library.m4 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/find_system_library.m4 b/config/find_system_library.m4 index 310b44112ae..8b98bd67d2e 100644 --- a/config/find_system_library.m4 +++ b/config/find_system_library.m4 @@ -90,8 +90,8 @@ AC_DEFUN([ZFS_AC_FIND_SYSTEM_LIBRARY], [ AC_DEFINE([HAVE_][$1], [1], [Define if you have [$5]]) $7 ],[dnl ELSE - AC_SUBST([$1]_CFLAGS, []) - AC_SUBST([$1]_LIBS, []) + AC_SUBST([$1]_CFLAGS, [""]) + AC_SUBST([$1]_LIBS, [""]) AC_MSG_WARN([cannot find [$5] via pkg-config or in the standard locations]) $8 ]) From 56684117133943f176f73fdbf0b9535a2bcf251d Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Thu, 30 Nov 2023 16:04:18 -0800 Subject: [PATCH 27/48] Only provide execvpe(3) when needed Check for the existence of execvpe(3) and only provide the FreeBSD compat version if required. Reviewed-by: Brian Behlendorf Signed-off-by: Brooks Davis Closes #15609 --- config/user.m4 | 2 +- lib/libspl/include/os/freebsd/sys/param.h | 2 ++ lib/libzfs/os/freebsd/libzfs_compat.c | 4 +++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/config/user.m4 b/config/user.m4 index 6ec27a5b2cf..87df8c7ccab 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -31,7 +31,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV ZFS_AC_CONFIG_USER_ZFSEXEC - AC_CHECK_FUNCS([issetugid mlockall strlcat strlcpy]) + AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy]) AC_SUBST(RM) ]) diff --git a/lib/libspl/include/os/freebsd/sys/param.h b/lib/libspl/include/os/freebsd/sys/param.h index 15d3ff0dcb5..1ff3ca8025f 100644 --- a/lib/libspl/include/os/freebsd/sys/param.h +++ b/lib/libspl/include/os/freebsd/sys/param.h @@ -57,6 +57,8 @@ extern size_t spl_pagesize(void); #define PAGESIZE (spl_pagesize()) +#ifndef HAVE_EXECVPE extern int execvpe(const char *name, char * const argv[], char * const envp[]); +#endif #endif diff --git a/lib/libzfs/os/freebsd/libzfs_compat.c b/lib/libzfs/os/freebsd/libzfs_compat.c index d1c1fea7fb6..aef2abf6256 100644 --- a/lib/libzfs/os/freebsd/libzfs_compat.c +++ b/lib/libzfs/os/freebsd/libzfs_compat.c @@ -38,7 +38,8 @@ #define ZFS_KMOD "openzfs" #endif - +#ifndef HAVE_EXECVPE +/* FreeBSD prior to 15 lacks execvpe */ static int execvPe(const char *name, const char *path, char * const *argv, char * const *envp) @@ -192,6 +193,7 @@ execvpe(const char *name, char * const argv[], char * const envp[]) return (execvPe(name, path, argv, envp)); } +#endif /* !HAVE_EXECVPE */ static __thread char errbuf[ERRBUFLEN]; From 96cad4ca4c1f78fd57a8e3ae1193457606be9eb4 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 21 Apr 2024 21:43:53 +1000 Subject: [PATCH 28/48] libspl/assert: show process/task details in assert output Makes it much easier to see what thing complained. Getting thread id, program name and thread name vary wildly between Linux and FreeBSD, so those are set up in macros. pthread_getname_np() did not appear in musl until very recently, but the same info has always been available via prctl(PR_GET_NAME), so we use that instead. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16140 --- config/user.m4 | 2 +- lib/libspl/assert.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/config/user.m4 b/config/user.m4 index 87df8c7ccab..3a69086a9d9 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -31,7 +31,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV ZFS_AC_CONFIG_USER_ZFSEXEC - AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy]) + AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy gettid]) AC_SUBST(RM) ]) diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index 9d44740d4e3..185ec65cb89 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -22,9 +22,32 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2024, Rob Norris + */ #include +#if defined(__linux__) +#include +#include +#ifdef HAVE_GETTID +#define libspl_gettid() gettid() +#else +#include +#define libspl_gettid() ((pid_t)syscall(__NR_gettid)) +#endif +#define libspl_getprogname() (program_invocation_short_name) +#define libspl_getthreadname(buf, len) \ + prctl(PR_GET_NAME, (unsigned long)(buf), 0, 0, 0) +#elif defined(__FreeBSD__) +#include +#define libspl_gettid() pthread_getthreadid_np() +#define libspl_getprogname() getprogname() +#define libspl_getthreadname(buf, len) \ + pthread_getname_np(pthread_self(), buf, len); +#endif + static boolean_t libspl_assert_ok = B_FALSE; void @@ -39,13 +62,22 @@ libspl_assertf(const char *file, const char *func, int line, const char *format, ...) { va_list args; + char tname[64]; + + libspl_getthreadname(tname, sizeof (tname)); + + fprintf(stderr, "ASSERT at %s:%d:%s()\n", file, line, func); va_start(args, format); vfprintf(stderr, format, args); - fprintf(stderr, "\n"); - fprintf(stderr, "ASSERT at %s:%d:%s()", file, line, func); va_end(args); + fprintf(stderr, "\n" + " PID: %-8u COMM: %s\n" + " TID: %-8u NAME: %s\n", + getpid(), libspl_getprogname(), + libspl_gettid(), tname); + #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__) if (libspl_assert_ok) { return; From 3ca305f87377afeeb3006ee6fe053358dc2f1ad8 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 28 Apr 2024 12:49:58 +1000 Subject: [PATCH 29/48] libspl/assert: add lock around assertion output If multiple threads trip an assertion at the same moment (quite common), they can be printing at the same time, and their output gets messy. This adds a simple lock around the whole thing, to prevent a second task printing assert output before the first has finished. Additionally, if libspl_assert_ok is not set, abort() is called without dropping the lock, so that any other asserting tasks will be killed before starting any output, rather than only getting part-way through. This is a tradeoff; it's assumed that multiple threads asserting at the same moment are likely the same fault in different instances of a thread, and so there won't be any more useful information from the other tasks anyway. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16140 --- lib/libspl/assert.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index 185ec65cb89..d402462531b 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -27,6 +27,7 @@ */ #include +#include #if defined(__linux__) #include @@ -56,11 +57,15 @@ libspl_set_assert_ok(boolean_t val) libspl_assert_ok = val; } +static pthread_mutex_t assert_lock = PTHREAD_MUTEX_INITIALIZER; + /* printf version of libspl_assert */ void libspl_assertf(const char *file, const char *func, int line, const char *format, ...) { + pthread_mutex_lock(&assert_lock); + va_list args; char tname[64]; @@ -80,6 +85,7 @@ libspl_assertf(const char *file, const char *func, int line, #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__) if (libspl_assert_ok) { + pthread_mutex_unlock(&assert_lock); return; } #endif From 21f66db67432ad9f6363d0e69fcabb53976a45e4 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 27 Apr 2024 21:35:05 +1000 Subject: [PATCH 30/48] libspl/assert: dump backtrace in assert Adds a check for the backtrace() function. If available, uses it to show a stack backtrace in the assertion output. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16140 --- config/user-backtrace.m4 | 14 ++++++++++++++ config/user.m4 | 1 + lib/libspl/Makefile.am | 2 ++ lib/libspl/assert.c | 20 ++++++++++++++++++++ 4 files changed, 37 insertions(+) create mode 100644 config/user-backtrace.m4 diff --git a/config/user-backtrace.m4 b/config/user-backtrace.m4 new file mode 100644 index 00000000000..25706767cdc --- /dev/null +++ b/config/user-backtrace.m4 @@ -0,0 +1,14 @@ +dnl +dnl backtrace(), for userspace assertions. glibc has this directly in libc. +dnl FreeBSD and (sometimes) musl have it in a separate -lexecinfo. It's assumed +dnl that this will also get the companion function backtrace_symbols(). +dnl +AC_DEFUN([ZFS_AC_CONFIG_USER_BACKTRACE], [ + AX_SAVE_FLAGS + LIBS="" + AC_SEARCH_LIBS([backtrace], [execinfo], [ + AC_DEFINE(HAVE_BACKTRACE, 1, [backtrace() is available]) + AC_SUBST([BACKTRACE_LIBS], ["$LIBS"]) + ]) + AX_RESTORE_FLAGS +]) diff --git a/config/user.m4 b/config/user.m4 index 3a69086a9d9..8d11e031ba2 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -26,6 +26,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_AIO_H ZFS_AC_CONFIG_USER_CLOCK_GETTIME ZFS_AC_CONFIG_USER_PAM + ZFS_AC_CONFIG_USER_BACKTRACE ZFS_AC_CONFIG_USER_RUNSTATEDIR ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index 822bef7e7a8..9f413b08c16 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -43,3 +43,5 @@ libspl_la_LIBADD = \ libspl_assert.la libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME) + +libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index d402462531b..4acf687f4b2 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -49,6 +49,24 @@ pthread_getname_np(pthread_self(), buf, len); #endif +#if defined(HAVE_BACKTRACE) +#include + +static inline void +libspl_dump_backtrace(void) +{ + void *btptrs[100]; + size_t nptrs = backtrace(btptrs, 100); + char **bt = backtrace_symbols(btptrs, nptrs); + fprintf(stderr, "Call trace:\n"); + for (size_t i = 0; i < nptrs; i++) + fprintf(stderr, " %s\n", bt[i]); + free(bt); +} +#else +#define libspl_dump_backtrace() +#endif + static boolean_t libspl_assert_ok = B_FALSE; void @@ -83,6 +101,8 @@ libspl_assertf(const char *file, const char *func, int line, getpid(), libspl_getprogname(), libspl_gettid(), tname); + libspl_dump_backtrace(); + #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__) if (libspl_assert_ok) { pthread_mutex_unlock(&assert_lock); From 88686213c38068492f0831fca0931f949b79ab4b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 30 Apr 2024 10:37:29 +1000 Subject: [PATCH 31/48] libspl/assert: use libunwind for backtrace when available libunwind seems to do a better job of resolving a symbols than backtrace(), and is also useful on platforms that don't have backtrace() (eg musl). If it's available, use it. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16140 --- config/user-libunwind.m4 | 44 ++++++++++++++++++++++++++++++++++++++++ config/user.m4 | 1 + lib/libspl/Makefile.am | 4 ++-- lib/libspl/assert.c | 33 +++++++++++++++++++++++++++++- 4 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 config/user-libunwind.m4 diff --git a/config/user-libunwind.m4 b/config/user-libunwind.m4 new file mode 100644 index 00000000000..99ba3dcf452 --- /dev/null +++ b/config/user-libunwind.m4 @@ -0,0 +1,44 @@ +dnl +dnl Checks for libunwind, which usually does a better job than backtrace() when +dnl resolving symbols in the stack backtrace. Newer versions have support for +dnl getting info about the object file the function came from, so we look for +dnl that too and use it if found. +dnl +AC_DEFUN([ZFS_AC_CONFIG_USER_LIBUNWIND], [ + AC_ARG_WITH([libunwind], + AS_HELP_STRING([--with-libunwind], + [use libunwind for backtraces in userspace assertions]), + [], + [with_libunwind=auto]) + + AS_IF([test "x$with_libunwind" != "xno"], [ + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBUNWIND, [libunwind], [libunwind.h], [], [unwind], [], [ + dnl unw_get_elf_filename() is sometimes a macro, other + dnl times a proper symbol, so we can't just do a link + dnl check; we need to include the header properly. + AX_SAVE_FLAGS + CFLAGS="$CFLAGS $LIBUNWIND_CFLAGS" + LIBS="$LIBS $LIBUNWIND_LIBS" + AC_MSG_CHECKING([for unw_get_elf_filename in libunwind]) + AC_LINK_IFELSE([ + AC_LANG_PROGRAM([ + #define UNW_LOCAL_ONLY + #include + ], [ + unw_get_elf_filename(0, 0, 0, 0); + ]) + ], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_LIBUNWIND_ELF, 1, + [libunwind has unw_get_elf_filename]) + ], [ + AC_MSG_RESULT([no]) + ]) + AX_RESTORE_FLAGS + ], [ + AS_IF([test "x$with_libunwind" = "xyes"], [ + AC_MSG_FAILURE([--with-libunwind was given, but libunwind is not available, try installing libunwind-devel]) + ]) + ]) + ]) +]) diff --git a/config/user.m4 b/config/user.m4 index 8d11e031ba2..badd920d2b8 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -27,6 +27,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_CLOCK_GETTIME ZFS_AC_CONFIG_USER_PAM ZFS_AC_CONFIG_USER_BACKTRACE + ZFS_AC_CONFIG_USER_LIBUNWIND ZFS_AC_CONFIG_USER_RUNSTATEDIR ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index 9f413b08c16..eb2377305ac 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -1,6 +1,6 @@ include $(srcdir)/%D%/include/Makefile.am -libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) +libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) $(LIBUNWIND_CFLAGS) libspl_la_CFLAGS = $(libspl_assert_la_CFLAGS) noinst_LTLIBRARIES += libspl_assert.la libspl.la @@ -44,4 +44,4 @@ libspl_la_LIBADD = \ libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME) -libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) +libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS) diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index 4acf687f4b2..e6e3008f0aa 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -49,7 +49,38 @@ pthread_getname_np(pthread_self(), buf, len); #endif -#if defined(HAVE_BACKTRACE) +#if defined(HAVE_LIBUNWIND) +#define UNW_LOCAL_ONLY +#include + +static inline void +libspl_dump_backtrace(void) +{ + unw_context_t uc; + unw_cursor_t cp; + unw_word_t ip, off; + char funcname[128]; +#ifdef HAVE_LIBUNWIND_ELF + char objname[128]; + unw_word_t objoff; +#endif + + fprintf(stderr, "Call trace:\n"); + unw_getcontext(&uc); + unw_init_local(&cp, &uc); + while (unw_step(&cp) > 0) { + unw_get_reg(&cp, UNW_REG_IP, &ip); + unw_get_proc_name(&cp, funcname, sizeof (funcname), &off); +#ifdef HAVE_LIBUNWIND_ELF + unw_get_elf_filename(&cp, objname, sizeof (objname), &objoff); + fprintf(stderr, " [0x%08lx] %s+0x%2lx (in %s +0x%2lx)\n", + ip, funcname, off, objname, objoff); +#else + fprintf(stderr, " [0x%08lx] %s+0x%2lx\n", ip, funcname, off); +#endif + } +} +#elif defined(HAVE_BACKTRACE) #include static inline void From bc42d96d66d91cbfba98c0cbc34530c1156d6120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= Date: Thu, 9 May 2024 16:42:51 +0200 Subject: [PATCH 32/48] Unbreak FreeBSD cross-build on MacOS broken in 051460b8b MacOS used FreeBSD-compatible getprogname() and pthread_getname_np(). But pthread_getthreadid_np() does not exist on MacOS. This implements libspl_gettid() using pthread_threadid_np() to get the thread id of the current thread. Tested with FreeBSD GitHub actions freebsd-src/.github/workflows/cross-bootstrap-tools.yml Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Martin Matuska Closes #16167 --- lib/libspl/assert.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index e6e3008f0aa..5b12c14acd6 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -41,9 +41,11 @@ #define libspl_getprogname() (program_invocation_short_name) #define libspl_getthreadname(buf, len) \ prctl(PR_GET_NAME, (unsigned long)(buf), 0, 0, 0) -#elif defined(__FreeBSD__) +#elif defined(__FreeBSD__) || defined(__APPLE__) +#if !defined(__APPLE__) #include #define libspl_gettid() pthread_getthreadid_np() +#endif #define libspl_getprogname() getprogname() #define libspl_getthreadname(buf, len) \ pthread_getname_np(pthread_self(), buf, len); @@ -98,6 +100,19 @@ libspl_dump_backtrace(void) #define libspl_dump_backtrace() #endif +#if defined(__APPLE__) +static inline uint64_t +libspl_gettid(void) +{ + uint64_t tid; + + if (pthread_threadid_np(NULL, &tid) != 0) + tid = 0; + + return (tid); +} +#endif + static boolean_t libspl_assert_ok = B_FALSE; void @@ -128,7 +143,11 @@ libspl_assertf(const char *file, const char *func, int line, fprintf(stderr, "\n" " PID: %-8u COMM: %s\n" +#if defined(__APPLE__) + " TID: %-8" PRIu64 " NAME: %s\n", +#else " TID: %-8u NAME: %s\n", +#endif getpid(), libspl_getprogname(), libspl_gettid(), tname); From 2a2e3584752a172d9b3c406835f1e2057c498ef7 Mon Sep 17 00:00:00 2001 From: Rob N Date: Fri, 10 May 2024 00:43:48 +1000 Subject: [PATCH 33/48] libspl_assert: always link -lpthread on FreeBSD The pthread_* functions are in -lpthread on FreeBSD. Some of them are implicitly linked through libc, but on FreeBSD 13 at least pthread_getname_np() is not. Just be explicit, since -lpthread is the documented interface anyway. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16168 --- lib/libspl/Makefile.am | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index eb2377305ac..94be416d46a 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -45,3 +45,7 @@ libspl_la_LIBADD = \ libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME) libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS) + +if BUILD_FREEBSD +libspl_assert_la_LIBADD += -lpthread +endif From d06c8de7484930de6bb932f0d739ebbd9e255cb5 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 10 May 2024 09:56:48 +1000 Subject: [PATCH 34/48] zdb: bring crash handling over from ztest ztest has a very nice ability to show a backtrace when there's an unexpected crash. zdb is used often enough on corrupted data and can blow up too, so nice output is useful there too. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16181 --- cmd/zdb/zdb.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 1eeb6e9116a..ad88aa43aa6 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -84,6 +84,9 @@ #include #include #include +#if (__GLIBC__ && !__UCLIBC__) +#include /* for backtrace() */ +#endif #include #include @@ -926,11 +929,41 @@ usage(void) static void dump_debug_buffer(void) { - if (dump_opt['G']) { - (void) printf("\n"); - (void) fflush(stdout); - zfs_dbgmsg_print("zdb"); - } + ssize_t ret __attribute__((unused)); + + if (!dump_opt['G']) + return; + /* + * We use write() instead of printf() so that this function + * is safe to call from a signal handler. + */ + ret = write(STDOUT_FILENO, "\n", 1); + zfs_dbgmsg_print("zdb"); +} + +#define BACKTRACE_SZ 100 + +static void sig_handler(int signo) +{ + struct sigaction action; +#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ + int nptrs; + void *buffer[BACKTRACE_SZ]; + + nptrs = backtrace(buffer, BACKTRACE_SZ); + backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); +#endif + dump_debug_buffer(); + + /* + * Restore default action and re-raise signal so SIGSEGV and + * SIGABRT can trigger a core dump. + */ + action.sa_handler = SIG_DFL; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + (void) sigaction(signo, &action, NULL); + raise(signo); } /* @@ -8934,9 +8967,27 @@ main(int argc, char **argv) char *spa_config_path_env, *objset_str; boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE; nvlist_t *cfg = NULL; + struct sigaction action; dprintf_setup(&argc, argv); + /* + * Set up signal handlers, so if we crash due to bad on-disk data we + * can get more info. Unlike ztest, we don't bail out if we can't set + * up signal handlers, because zdb is very useful without them. + */ + action.sa_handler = sig_handler; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + if (sigaction(SIGSEGV, &action, NULL) < 0) { + (void) fprintf(stderr, "zdb: cannot catch SIGSEGV: %s\n", + strerror(errno)); + } + if (sigaction(SIGABRT, &action, NULL) < 0) { + (void) fprintf(stderr, "zdb: cannot catch SIGABRT: %s\n", + strerror(errno)); + } + /* * If there is an environment variable SPA_CONFIG_PATH it overrides * default spa_config_path setting. If -U flag is specified it will From 27cc6df760edb26bf3e55e795f83b92d40fcae8f Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 25 May 2024 12:00:29 +1000 Subject: [PATCH 35/48] Use memset to zero stack allocations containing unions C99 6.7.8.17 says that when an undesignated initialiser is used, only the first element of a union is initialised. If the first element is not the largest within the union, how the remaining space is initialised is up to the compiler. GCC extends the initialiser to the entire union, while Clang treats the remainder as padding, and so initialises according to whatever automatic/implicit initialisation rules are currently active. When Linux is compiled with CONFIG_INIT_STACK_ALL_PATTERN, -ftrivial-auto-var-init=pattern is added to the kernel CFLAGS. This flag sets the policy for automatic/implicit initialisation of variables on the stack. Taken together, this means that when compiling under CONFIG_INIT_STACK_ALL_PATTERN on Clang, the "zero" initialiser will only zero the first element in a union, and the rest will be filled with a pattern. This is significant for aes_ctx_t, which in aes_encrypt_atomic() and aes_decrypt_atomic() is initialised to zero, but then used as a gcm_ctx_t, which is the fifth element in the union, and thus gets pattern initialisation. Later, it's assumed to be zero, resulting in a hang. As confusing and undiscoverable as it is, by the spec, we are at fault when we initialise a structure containing a union with the zero initializer. As such, this commit replaces these uses with an explicit memset(0). Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16135 Closes #16206 --- cmd/zstream/zstream_redup.c | 4 +++- lib/libzfs/libzfs_sendrecv.c | 6 ++++-- module/icp/io/aes.c | 8 ++++++-- tests/zfs-tests/cmd/libzfs_input_check.c | 4 +++- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/cmd/zstream/zstream_redup.c b/cmd/zstream/zstream_redup.c index c56a09cee75..6866639fe46 100644 --- a/cmd/zstream/zstream_redup.c +++ b/cmd/zstream/zstream_redup.c @@ -186,7 +186,7 @@ static void zfs_redup_stream(int infd, int outfd, boolean_t verbose) { int bufsz = SPA_MAXBLOCKSIZE; - dmu_replay_record_t thedrr = { 0 }; + dmu_replay_record_t thedrr; dmu_replay_record_t *drr = &thedrr; redup_table_t rdt; zio_cksum_t stream_cksum; @@ -194,6 +194,8 @@ zfs_redup_stream(int infd, int outfd, boolean_t verbose) uint64_t num_records = 0; uint64_t num_write_byref_records = 0; + memset(&thedrr, 0, sizeof (dmu_replay_record_t)); + #ifdef _ILP32 uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20; #else diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 143aecb9459..c6ee7e5660a 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -2170,7 +2170,8 @@ out: static int send_conclusion_record(int fd, zio_cksum_t *zc) { - dmu_replay_record_t drr = { 0 }; + dmu_replay_record_t drr; + memset(&drr, 0, sizeof (dmu_replay_record_t)); drr.drr_type = DRR_END; if (zc != NULL) drr.drr_u.drr_end.drr_checksum = *zc; @@ -2272,7 +2273,8 @@ send_prelim_records(zfs_handle_t *zhp, const char *from, int fd, } if (!dryrun) { - dmu_replay_record_t drr = { 0 }; + dmu_replay_record_t drr; + memset(&drr, 0, sizeof (dmu_replay_record_t)); /* write first begin record */ drr.drr_type = DRR_BEGIN; drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c index d6f01304f56..522c436497b 100644 --- a/module/icp/io/aes.c +++ b/module/icp/io/aes.c @@ -832,12 +832,14 @@ aes_encrypt_atomic(crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_data_t *plaintext, crypto_data_t *ciphertext, crypto_spi_ctx_template_t template) { - aes_ctx_t aes_ctx = {{{{0}}}}; + aes_ctx_t aes_ctx; off_t saved_offset; size_t saved_length; size_t length_needed; int ret; + memset(&aes_ctx, 0, sizeof (aes_ctx_t)); + ASSERT(ciphertext != NULL); /* @@ -956,12 +958,14 @@ aes_decrypt_atomic(crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_data_t *ciphertext, crypto_data_t *plaintext, crypto_spi_ctx_template_t template) { - aes_ctx_t aes_ctx = {{{{0}}}}; + aes_ctx_t aes_ctx; off_t saved_offset; size_t saved_length; size_t length_needed; int ret; + memset(&aes_ctx, 0, sizeof (aes_ctx_t)); + ASSERT(plaintext != NULL); /* diff --git a/tests/zfs-tests/cmd/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check.c index c661718a296..7d9ce4fada1 100644 --- a/tests/zfs-tests/cmd/libzfs_input_check.c +++ b/tests/zfs-tests/cmd/libzfs_input_check.c @@ -521,13 +521,15 @@ test_send_new(const char *snapshot, int fd) static void test_recv_new(const char *dataset, int fd) { - dmu_replay_record_t drr = { 0 }; + dmu_replay_record_t drr; nvlist_t *required = fnvlist_alloc(); nvlist_t *optional = fnvlist_alloc(); nvlist_t *props = fnvlist_alloc(); char snapshot[MAXNAMELEN + 32]; ssize_t count; + memset(&drr, 0, sizeof (dmu_replay_record_t)); + int cleanup_fd = open(ZFS_DEV, O_RDWR); if (cleanup_fd == -1) { (void) fprintf(stderr, "open(%s) failed: %s\n", ZFS_DEV, From ba3c7692cd73291b16abdf5ce7e51c058f73fca7 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 24 May 2024 22:11:18 -0400 Subject: [PATCH 36/48] Destroy ARC buffer in case of fill error In case of error dmu_buf_fill_done() returns the buffer back into DB_UNCACHED state. Since during transition from DB_UNCACHED into DB_FILL state dbuf_noread() allocates an ARC buffer, we must free it here, otherwise it will be leaked. Reviewed-by: Brian Behlendorf Reviewed-by: Jorgen Lundman Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15665 Closes #15802 Closes #16216 --- module/zfs/dbuf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index d56117cde01..f81ae96f2e0 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2834,6 +2834,7 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed) failed = B_FALSE; } else if (failed) { VERIFY(!dbuf_undirty(db, tx)); + arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; dbuf_clear_data(db); DTRACE_SET_STATE(db, "fill failed"); From 13ccbbb47a1be93f2c44e4fa51f6ae1a5a327f5d Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 29 May 2024 11:53:31 -0400 Subject: [PATCH 37/48] Some improvements to metaslabs eviction - Add old eviction for special and dedup metaslab classes. Those vdevs may be potentially big and fragmented with large metaslabs, while their asynchronous write pattern is not really different from normal class. It seems an omission to not evict old metaslabs from them. - If we have metaslab preload enabled, which means we are not too low on memory, do not evict active metaslabs even if they are not used for some time. Eviction of active metaslabs means we won't be able to write anything until we load them, that may take some time, that is straight opposite to metaslab preload goals. For small systems the memory saving should be less important after recent reduction in number of allocators and so open metaslabs. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16214 --- module/zfs/metaslab.c | 7 +++++-- module/zfs/spa.c | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index d50225fc3f6..3876b101497 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -640,6 +640,7 @@ void metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; + hrtime_t now = gethrtime(); for (int i = 0; i < multilist_get_num_sublists(ml); i++) { multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); @@ -663,8 +664,10 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) multilist_sublist_unlock(mls); if (txg > msp->ms_selected_txg + metaslab_unload_delay && - gethrtime() > msp->ms_selected_time + - (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) { + now > msp->ms_selected_time + + MSEC2NSEC(metaslab_unload_delay_ms) && + (msp->ms_allocator == -1 || + !metaslab_preload_enabled)) { metaslab_evict(msp, txg); } else { /* diff --git a/module/zfs/spa.c b/module/zfs/spa.c index fc7cf000f0c..886867b739f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -9939,6 +9939,9 @@ spa_sync(spa_t *spa, uint64_t txg) metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); + /* spa_embedded_log_class has only one metaslab per vdev. */ + metaslab_class_evict_old(spa->spa_special_class, txg); + metaslab_class_evict_old(spa->spa_dedup_class, txg); spa_sync_close_syncing_log_sm(spa); From c950c5d3695b35ba4f02de167bdd3441862db210 Mon Sep 17 00:00:00 2001 From: Martin Wagner Date: Fri, 14 Jun 2024 03:08:49 +0200 Subject: [PATCH 38/48] disable automatic dependency tracking for dkms builds Previously the dkms build left some unwanted files in `/usr/lib/modules` which could cause package managers to not properly clean up old kernels. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Martin Wagner Closes #16221 Closes #16241 --- scripts/dkms.mkconf | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/dkms.mkconf b/scripts/dkms.mkconf index 0bd38342043..046ce9edcef 100755 --- a/scripts/dkms.mkconf +++ b/scripts/dkms.mkconf @@ -26,6 +26,7 @@ PACKAGE_VERSION="${pkgver}" PACKAGE_CONFIG="${pkgcfg}" NO_WEAK_MODULES="yes" PRE_BUILD="configure + --disable-dependency-tracking --prefix=/usr --with-config=kernel --with-linux=\$( From 25c4271d2f3bf36025abc5c80f6a2f80b84b4a88 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 9 May 2024 20:22:21 +1000 Subject: [PATCH 39/48] zts: test single-disk pool resumes properly after disk pull A single disk pool should suspend when its disk fails and hold the IO. When the disk is returned, the pool should return and the IO be reissued, leaving everything in good shape. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris Reviewed-by: Jorgen Lundman Reviewed-by: Tony Hutter Reviewed-by: Don Brady --- tests/runfiles/linux.run | 2 +- tests/test-runner/bin/zts-report.py.in | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../fault/suspend_resume_single.ksh | 102 ++++++++++++++++++ 4 files changed, 105 insertions(+), 1 deletion(-) create mode 100755 tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 92ce09ec6fc..bd6cc56f358 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -121,7 +121,7 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', 'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault', 'decompress_fault', - 'scrub_after_resilver', 'zpool_status_-s'] + 'scrub_after_resilver', 'suspend_resume_single', 'zpool_status_-s'] tags = ['functional', 'fault'] [tests/functional/features/large_dnode:Linux] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 5ca13093136..de06c7c6e2c 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -379,6 +379,7 @@ if os.environ.get('CI') == 'true': 'fault/auto_replace_002_pos': ['SKIP', ci_reason], 'fault/auto_spare_ashift': ['SKIP', ci_reason], 'fault/auto_spare_shared': ['SKIP', ci_reason], + 'fault/suspend_resume_single': ['SKIP', ci_reason], 'procfs/pool_state': ['SKIP', ci_reason], }) diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index cc66d762f3c..b8eed952b90 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1476,6 +1476,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/fault/decompress_fault.ksh \ functional/fault/decrypt_fault.ksh \ functional/fault/scrub_after_resilver.ksh \ + functional/fault/suspend_resume_single.ksh \ functional/fault/setup.ksh \ functional/fault/zpool_status_-s.ksh \ functional/features/async_destroy/async_destroy_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh b/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh new file mode 100755 index 00000000000..041dadb1ead --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh @@ -0,0 +1,102 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +set -x + +DATAFILE="$TMPDIR/datafile" + +function cleanup +{ + destroy_pool $TESTPOOL + unload_scsi_debug + rm -f $DATA_FILE +} + +log_onexit cleanup + +log_assert "ensure single-disk pool resumes properly after suspend and clear" + +# create a file, and take a checksum, so we can compare later +log_must dd if=/dev/random of=$DATAFILE bs=128K count=1 +typeset sum1=$(cat $DATAFILE | md5sum) + +# make a debug device that we can "unplug" +load_scsi_debug 100 1 1 1 '512b' +sd=$(get_debug_device) + +# create a single-device pool +log_must zpool create $TESTPOOL $sd +log_must zpool sync + +# "pull" the disk +log_must eval "echo offline > /sys/block/$sd/device/state" + +# copy data onto the pool. it'll appear to succeed, but only be in memory +log_must cp $DATAFILE /$TESTPOOL/file + +# wait until sync starts, and the pool suspends +log_note "waiting for pool to suspend" +typeset -i tries=10 +until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + log_fail "pool didn't suspend" + fi + sleep 1 +done + +# return the disk +log_must eval "echo running > /sys/block/$sd/device/state" + +# clear the error states, which should reopen the vdev, get the pool back +# online, and replay the failed IO +log_must zpool clear $TESTPOOL + +# wait a while for everything to sync out. if something is going to go wrong, +# this is where it will happen +log_note "giving pool time to settle and complete txg" +sleep 7 + +# if the pool suspended, then everything is bad +if [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; then + log_fail "pool suspended" +fi + +# export the pool, to make sure it exports clean, and also to clear the file +# out of the cache +log_must zpool export $TESTPOOL + +# import the pool +log_must zpool import $TESTPOOL + +# sum the file we wrote earlier +typeset sum2=$(cat /$TESTPOOL/file | md5sum) + +# make sure the checksums match +log_must test "$sum1" = "$sum2" + +log_pass "single-disk pool resumes properly after disk suspend and clear" From 4d2f7f9839d12708417457cd57cf43d15cae5e92 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 11 Jun 2024 20:49:10 +1000 Subject: [PATCH 40/48] vdev_open: clear async fault flag after reopen After c3f2f1aa2, vdev_fault_wanted is set on a vdev after a probe fails. An end-of-txg async task is charged with actually faulting the vdev. In a single-disk pool, the probe failure will degrade the last disk, and then suspend the pool. However, vdev_fault_wanted is not cleared. After the pool returns, the transaction finishes and the async task runs and faults the vdev, which suspends the pool again. The fix is simple: when reopening a vdev, clear the async fault flag. If the vdev is still failed, the startup probe will quickly notice and degrade/suspend it again. If not, all is well! Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Co-authored-by: Don Brady Signed-off-by: Rob Norris Reviewed-by: Jorgen Lundman Reviewed-by: Tony Hutter Reviewed-by: Don Brady --- module/zfs/vdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index c9ee39b468f..224493a86e7 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2008,6 +2008,7 @@ vdev_open(vdev_t *vd) vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; + vd->vdev_fault_wanted = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* From 9835255f5dcf52b664ef6272f2db17886b702e10 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 18 Jul 2024 05:02:28 +1000 Subject: [PATCH 41/48] zdb: dump ZAP_FLAG_UINT64_KEY ZAPs properly (#16334) These are used for DDT and BRT stores. There's limited information available to produce meaningful output, but at least we can put something on screen rather than crashing. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- cmd/zdb/zdb.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index ad88aa43aa6..2caa83f5c63 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -1232,16 +1233,33 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size) for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { - (void) printf("\t\t%s = ", attr.za_name); + boolean_t key64 = + !!(zap_getflags(zc.zc_zap) & ZAP_FLAG_UINT64_KEY); + + if (key64) + (void) printf("\t\t0x%010lx = ", + *(uint64_t *)attr.za_name); + else + (void) printf("\t\t%s = ", attr.za_name); + if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } prop = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); - (void) zap_lookup(os, object, attr.za_name, - attr.za_integer_length, attr.za_num_integers, prop); - if (attr.za_integer_length == 1) { + + if (key64) + (void) zap_lookup_uint64(os, object, + (const uint64_t *)attr.za_name, 1, + attr.za_integer_length, attr.za_num_integers, + prop); + else + (void) zap_lookup(os, object, attr.za_name, + attr.za_integer_length, attr.za_num_integers, + prop); + + if (attr.za_integer_length == 1 && !key64) { if (strcmp(attr.za_name, DSL_CRYPTO_KEY_MASTER_KEY) == 0 || strcmp(attr.za_name, @@ -1260,6 +1278,10 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size) } else { for (i = 0; i < attr.za_num_integers; i++) { switch (attr.za_integer_length) { + case 1: + (void) printf("%u ", + ((uint8_t *)prop)[i]); + break; case 2: (void) printf("%u ", ((uint16_t *)prop)[i]); From 14cce09a65e5921fd33439ae804039c913565a43 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 8 Jul 2024 19:59:08 -0500 Subject: [PATCH 42/48] FreeBSD: Use a statement expression to implement SET_ERROR() (#16284) This way we can avoid making assumptions about the SDT probe implementation. No functional change intended. Signed-off-by: Mark Johnston Reviewed-by: Alexander Motin Reviewed-by: Allan Jude Reviewed-by: Rob Norris Reviewed-by: Tino Reichardt Reviewed-by: Tony Hutter --- include/os/freebsd/spl/sys/sdt.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/os/freebsd/spl/sys/sdt.h b/include/os/freebsd/spl/sys/sdt.h index 2daa6de1af0..e2c4830cb96 100644 --- a/include/os/freebsd/spl/sys/sdt.h +++ b/include/os/freebsd/spl/sys/sdt.h @@ -31,13 +31,14 @@ #include_next #ifdef KDTRACE_HOOKS -/* CSTYLED */ +/* BEGIN CSTYLED */ SDT_PROBE_DECLARE(sdt, , , set__error); -#define SET_ERROR(err) \ - ((sdt_sdt___set__error->id ? \ - (*sdt_probe_func)(sdt_sdt___set__error->id, \ - (uintptr_t)err, 0, 0, 0, 0) : 0), err) +#define SET_ERROR(err) ({ \ + SDT_PROBE1(sdt, , , set__error, (uintptr_t)err); \ + err; \ +}) +/* END CSTYLED */ #else #define SET_ERROR(err) (err) #endif From 9ad205ecded2fc24ee1d3bf38439fbb3c0df7e6c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 24 Jul 2024 04:47:04 +1000 Subject: [PATCH 43/48] AUTHORS: refresh with recent new contributors (#16362) Sponsored-by: https://despairlabs.com/sponsor/ Signed-off-by: Rob Norris Reviewed-by: Tony Hutter Reviewed-by: Tino Reichardt Reviewed-by: George Melikov --- .mailmap | 6 ++++++ AUTHORS | 13 +++++++++++++ 2 files changed, 19 insertions(+) diff --git a/.mailmap b/.mailmap index 32bdb520961..7e17d82d735 100644 --- a/.mailmap +++ b/.mailmap @@ -77,7 +77,10 @@ Yanping Gao Youzhong Yang # Signed-off-by: overriding Author: +Ryan +Qiuhao Chen Yuxin Wang +Zhenlei Huang # Commits from strange places, long ago Brian Behlendorf @@ -95,6 +98,7 @@ Alek Pinchuk Alexander Lobakin Alexey Smirnoff Allen Holl <65494904+allen-4@users.noreply.github.com> +Alphan Yılmaz Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Andrew J. Hesford <48421688+ahesford@users.noreply.github.com>> Andrew Sun @@ -102,6 +106,7 @@ Aron Xu Arun KV <65647132+arun-kv@users.noreply.github.com> Ben Wolsieffer bernie1995 <42413912+bernie1995@users.noreply.github.com> +Bojan Novković <72801811+bnovkov@users.noreply.github.com> Boris Protopopov Brad Forschinger Brandon Thetford @@ -193,6 +198,7 @@ Stefan Lendl <1321542+stfl@users.noreply.github.com> Thomas Bertschinger <101425190+bertschinger@users.noreply.github.com> Thomas Geppert Tim Crawford +Todd Seidelmann <18294602+seidelma@users.noreply.github.com> Tom Matthews Tony Perkins <62951051+tony-zfs@users.noreply.github.com> Torsten Wörtwein diff --git a/AUTHORS b/AUTHORS index d7d55f42d2e..09814b43531 100644 --- a/AUTHORS +++ b/AUTHORS @@ -46,6 +46,7 @@ CONTRIBUTORS: Alex Zhuravlev Allan Jude Allen Holl + Alphan Yılmaz alteriks Alyssa Ross Ameer Hamza @@ -99,6 +100,7 @@ CONTRIBUTORS: bernie1995 Bill McGonigle Bill Pijewski + Bojan Novković Boris Protopopov Brad Forschinger Brad Lewis @@ -168,6 +170,7 @@ CONTRIBUTORS: Daniel Hoffman Daniel Kobras Daniel Kolesa + Daniel Perry Daniel Reichelt Daniel Stevenson Daniel Verite @@ -187,6 +190,7 @@ CONTRIBUTORS: Dennis R. Friedrichsen Denys Rtveliashvili Derek Dai + Derek Schrock Dex Wood DHE Didier Roche @@ -245,6 +249,7 @@ CONTRIBUTORS: Gionatan Danti Giuseppe Di Natale Glenn Washburn + glibg10b gofaster Gordan Bobic Gordon Bergling @@ -410,6 +415,7 @@ CONTRIBUTORS: Mart Frauenlob Martin Matuska Martin Rüegg + Martin Wagner Massimo Maggi Mateusz Guzik Mateusz Piotrowski <0mp@FreeBSD.org> @@ -488,6 +494,7 @@ CONTRIBUTORS: Peng Peter Ashford Peter Dave Hello + Peter Doherty Peter Levine Peter Wirdemo Petros Koutoupis @@ -501,6 +508,7 @@ CONTRIBUTORS: Prasad Joshi privb0x23 P.SCH + Qiuhao Chen Quartz Quentin Zdanis Rafael Kitover @@ -532,6 +540,7 @@ CONTRIBUTORS: Roman Strashkin Ross Williams Ruben Kerkhof + Ryan Ryan Hirasaki Ryan Lahfa Ryan Libby @@ -556,6 +565,7 @@ CONTRIBUTORS: Sen Haerens Serapheim Dimitropoulos Seth Forshee + Seth Troisi Shaan Nobee Shampavman Shaun Tancheff @@ -602,6 +612,7 @@ CONTRIBUTORS: Tim Schumacher Tino Reichardt Tobin Harding + Todd Seidelmann Tom Caputi Tom Matthews Tomohiro Kusumi @@ -653,6 +664,8 @@ CONTRIBUTORS: Zachary Bedell Zach Dykstra zgock + Zhao Yongming + Zhenlei Huang Zhu Chuang Érico Nogueira Đoàn Trần Công Danh From ef08cb26dae6b3c2e930e66852f13329babb7c2e Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Tue, 23 Jul 2024 11:34:19 -0700 Subject: [PATCH 44/48] Fix long_free_dirty accounting for small files (#16264) For files smaller than recordsize, it's most likely that they don't have L1 blocks. However, current calculation will always return at least 1 L1 block. In this change, we check dnode level to figure out if it has L1 blocks or not, and return 0 if it doesn't. This will reduce the chance of unnecessary throttling when deleting a large number of small files. Signed-off-by: Chunwei Chen Co-authored-by: Chunwei Chen Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf --- module/zfs/dmu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 1d10ceebfb9..24c5ab07df5 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -815,6 +815,13 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) ASSERT3U(minimum, <=, *start); + /* dn_nlevels == 1 means we don't have any L1 blocks */ + if (dn->dn_nlevels <= 1) { + *l1blks = 0; + *start = minimum; + return (0); + } + /* * Check if we can free the entire range assuming that all of the * L1 blocks in this range have data. If we can, we use this From b5835ed137f7c954efc27630283b4c08d38ceff2 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 23 Jul 2024 17:13:04 -0700 Subject: [PATCH 45/48] Linux 6.9: Fix UBSAN errors in sa.c (#16380) This is a follow-on to 156a64161b4f9da35f2e0484106173344cf78317 that ignores UBSAN errors in sa.c. Thank you @thwalker3 for the fix. Original-patch-by: @thwalker3 Signed-off-by: Tony Hutter Closes #16278 Closes #16330 Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf --- module/Kbuild.in | 1 + 1 file changed, 1 insertion(+) diff --git a/module/Kbuild.in b/module/Kbuild.in index d13e1cf1143..87bd17cf5a6 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -493,6 +493,7 @@ UBSAN_SANITIZE_zap_leaf.o := n UBSAN_SANITIZE_zap_micro.o := n UBSAN_SANITIZE_sa.o := n UBSAN_SANITIZE_zfs/zap_micro.o := n +UBSAN_SANITIZE_zfs/sa.o := n # Suppress incorrect warnings from versions of objtool which are not # aware of x86 EVEX prefix instructions used for AVX512. From dd5de55ebaa60ffa3ad1a4b004cf5c068471ba78 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 24 Jul 2024 12:33:30 -0400 Subject: [PATCH 46/48] ZTS: Make do_vol_test() more deterministic (#16379) - Explicitly disable compression since mkfile uses a zero buffer. - Explicitly sync file systems instead of waiting for timeout. Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Reviewed-by: George Melikov Reviewed-by: Tony Hutter --- .../cli_root/zfs_copies/zfs_copies.kshlib | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib index 5617bd01ba4..9911ccdf536 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib @@ -84,7 +84,8 @@ function do_vol_test vol=$TESTPOOL/$TESTVOL1 vol_b_path=$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL1 - log_must zfs create -V $VOLSIZE -o copies=$copies $vol + log_must zfs create -V $VOLSIZE -o compression=off -o copies=$copies \ + $vol log_must zfs set refreservation=none $vol block_device_wait $vol_b_path @@ -116,31 +117,30 @@ function do_vol_test else log_must zpool create $TESTPOOL1 $vol_b_path fi - log_must zfs create $TESTPOOL1/$TESTFS1 + log_must zfs create -o compression=off $TESTPOOL1/$TESTFS1 + sync_pool $TESTPOOL1 ;; *) log_unsupported "$type test not implemented" ;; esac - ((nfilesize = copies * ${FILESIZE%m})) + sync_pool $TESTPOOL pre_used=$(get_prop used $vol) - ((target_size = pre_used + nfilesize)) if [[ $type == "zfs" ]]; then log_must mkfile $FILESIZE /$TESTPOOL1/$TESTFS1/$FILE + sync_pool $TESTPOOL1 else log_must mkfile $FILESIZE $mntp/$FILE + log_must sync fi + sync_pool $TESTPOOL post_used=$(get_prop used $vol) - ((retries = 0)) - while ((post_used < target_size && retries++ < 42)); do - sleep 1 - post_used=$(get_prop used $vol) - done ((used = post_used - pre_used)) + ((nfilesize = copies * ${FILESIZE%m})) if ((used < nfilesize)); then log_fail "The space is not charged correctly while setting" \ "copies as $copies ($used < $nfilesize)" \ From 6f27c4cadd29eb9b850c1c66bf71ef9ba119b955 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 2 Aug 2024 17:45:00 -0700 Subject: [PATCH 47/48] [2.2.5-only] Make 'rmmod zfs' work after zfs-2.2.4 (#16406) db65272ae was added to zfs-2.2.4 to stub in the VDEV_PROP_RAIDZ_EXPANDING enum without adding the RAIDz expansion feature. This was needed to provide the right enum count for when the VDEV_PROP_SLOW_IO proprieties got added. This had the unfortunate side effect of breaking module removal though. Specifically, with the VDEV_PROP_RAIDZ_EXPANDING stub added, the module would correctly omit making kobjects for the RAIDz expansion vdev property, but then would try to blindly remove its non-existent kobjects during module unload. This commit fixes the issue by checking for an uninitialized kobject. Fixes: #16249 Signed-off-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Ameer Hamza Reviewed-by: Tino Reichardt --- module/os/linux/zfs/zfs_sysfs.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/module/os/linux/zfs/zfs_sysfs.c b/module/os/linux/zfs/zfs_sysfs.c index e2431fe8a80..b8cc682ad55 100644 --- a/module/os/linux/zfs/zfs_sysfs.c +++ b/module/os/linux/zfs/zfs_sysfs.c @@ -110,8 +110,17 @@ zfs_kobj_fini(zfs_mod_kobj_t *zkobj) } /* kobject_put() will call zfs_kobj_release() to release memory */ - kobject_del(&zkobj->zko_kobj); - kobject_put(&zkobj->zko_kobj); + /* + * Special case note: + * + * We have to check for 'zkobj->zko_kobj.name != NULL' as + * a workaround for #16249 which was added to zfs-2.2.4 + * and fixed (with this change) in zfs-2.2.5. + */ + if (zkobj->zko_kobj.name != NULL) { + kobject_del(&zkobj->zko_kobj); + kobject_put(&zkobj->zko_kobj); + } } static void From 33174af15112ed5c53299da2d28e763b0163f428 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 16 Jul 2024 16:30:25 -0700 Subject: [PATCH 48/48] Tag zfs-2.2.5 META file and changelog updated. Signed-off-by: Tony Hutter --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 0f32d8b4fe1..14cfc5f00a4 100644 --- a/META +++ b/META @@ -1,7 +1,7 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.2.4 +Version: 2.2.5 Release: 1 Release-Tags: relext License: CDDL